upend/cli/src/extractors/web.rs

173 lines
6.4 KiB
Rust

use std::sync::Arc;
use super::Extractor;
use crate::common::REQWEST_CLIENT;
use anyhow::anyhow;
use anyhow::Result;
use upend_base::addressing::Address;
use upend_base::constants::ATTR_LABEL;
use upend_base::constants::ATTR_OF;
use upend_base::constants::TYPE_URL_ADDRESS;
use upend_base::entry::Entry;
use upend_base::entry::EntryValue;
use upend_db::jobs::JobContainer;
use upend_db::jobs::JobState;
use upend_db::stores::UpStore;
use upend_db::{OperationContext, UpEndConnection};
use webpage::HTML;
pub struct WebExtractor;
impl Extractor for WebExtractor {
fn get(
&self,
address: &Address,
_connection: &UpEndConnection,
_store: Arc<Box<dyn UpStore + Send + Sync>>,
mut job_container: JobContainer,
context: OperationContext,
) -> Result<Vec<Entry>> {
if let Address::Url(url) = address {
let mut job_handle =
job_container.add_job(None, &format!("Getting info about {url:?}"))?;
let response = REQWEST_CLIENT.get(url.clone()).send()?;
let html = HTML::from_string(response.text()?, Some(url.to_string()));
if let Ok(html) = html {
let _ = job_handle.update_progress(50.0);
let mut entries = vec![
html.title.as_ref().map(|html_title| Entry {
entity: address.clone(),
attribute: "HTML_TITLE".parse().unwrap(),
value: html_title.clone().into(),
provenance: context.provenance.clone() + "EXTRACTOR",
user: context.user.clone(),
timestamp: chrono::Utc::now().naive_utc(),
}),
html.title.map(|html_title| Entry {
entity: address.clone(),
attribute: ATTR_LABEL.parse().unwrap(),
value: html_title.into(),
provenance: context.provenance.clone() + "EXTRACTOR",
user: context.user.clone(),
timestamp: chrono::Utc::now().naive_utc(),
}),
html.description.map(|html_desc| Entry {
entity: address.clone(),
attribute: "HTML_DESCRIPTION".parse().unwrap(),
value: html_desc.into(),
provenance: context.provenance.clone() + "EXTRACTOR",
user: context.user.clone(),
timestamp: chrono::Utc::now().naive_utc(),
}),
];
for (key, value) in html.opengraph.properties {
let attribute = format!("OG_{}", key.to_uppercase());
if attribute == "OG_TITLE" {
entries.push(Some(Entry {
entity: address.clone(),
attribute: ATTR_LABEL.parse()?,
value: value.clone().into(),
provenance: context.provenance.clone() + "EXTRACTOR",
user: context.user.clone(),
timestamp: chrono::Utc::now().naive_utc(),
}));
}
entries.push(Some(Entry {
entity: address.clone(),
attribute: attribute.parse()?,
value: value.into(),
provenance: context.provenance.clone() + "EXTRACTOR",
user: context.user.clone(),
timestamp: chrono::Utc::now().naive_utc(),
}));
}
for image in html.opengraph.images {
entries.push(Some(Entry {
entity: address.clone(),
attribute: "OG_IMAGE".parse()?,
value: image.url.into(),
provenance: context.provenance.clone() + "EXTRACTOR",
user: context.user.clone(),
timestamp: chrono::Utc::now().naive_utc(),
}))
}
let _ = job_handle.update_state(JobState::Done);
return Ok(entries
.into_iter()
.flatten()
.flat_map(|e| {
vec![
Entry {
entity: Address::Attribute(e.attribute.clone()),
attribute: ATTR_OF.parse().unwrap(),
value: EntryValue::Address(TYPE_URL_ADDRESS.clone()),
provenance: context.provenance.clone() + "EXTRACTOR",
user: context.user.clone(),
timestamp: chrono::Utc::now().naive_utc(),
},
e,
]
})
.collect());
}
Err(anyhow!("Failed for unknown reason."))
} else {
Ok(vec![])
}
}
fn is_needed(&self, address: &Address, connection: &UpEndConnection) -> Result<bool> {
Ok(connection
.query(
format!(r#"(matches @{address} (in "HTML_TITLE" "HTML_DESCRIPTION") ?)"#)
.parse()?,
)?
.is_empty())
}
}
#[cfg(test)]
mod test {
use upend_db::jobs::JobContainer;
use upend_db::stores::fs::FsStore;
use url::Url;
use super::*;
use anyhow::Result;
use std::sync::Arc;
use tempfile::TempDir;
#[test]
fn test_extract() -> Result<()> {
let temp_dir = TempDir::new().unwrap();
let open_result = upend_db::UpEndDatabase::open(&temp_dir, true)?;
let connection = open_result.db.connection()?;
let store =
Arc::new(Box::new(FsStore::from_path(&temp_dir)?) as Box<dyn UpStore + Sync + Send>);
let job_container = JobContainer::new();
let address = Address::Url(Url::parse("https://upend.dev").unwrap());
assert!(WebExtractor.is_needed(&address, &connection)?);
WebExtractor.insert_info(
&address,
&connection,
store,
job_container,
OperationContext::default(),
)?;
assert!(!WebExtractor.is_needed(&address, &connection)?);
Ok(())
}
}