173 lines
6.4 KiB
Rust
173 lines
6.4 KiB
Rust
use std::sync::Arc;
|
|
|
|
use super::Extractor;
|
|
use crate::common::REQWEST_CLIENT;
|
|
use anyhow::anyhow;
|
|
use anyhow::Result;
|
|
|
|
use upend_base::addressing::Address;
|
|
use upend_base::constants::ATTR_LABEL;
|
|
use upend_base::constants::ATTR_OF;
|
|
use upend_base::constants::TYPE_URL_ADDRESS;
|
|
use upend_base::entry::Entry;
|
|
use upend_base::entry::EntryValue;
|
|
use upend_db::jobs::JobContainer;
|
|
use upend_db::jobs::JobState;
|
|
use upend_db::stores::UpStore;
|
|
use upend_db::{OperationContext, UpEndConnection};
|
|
use webpage::HTML;
|
|
|
|
/// Extractor that derives metadata entries (HTML title/description,
/// OpenGraph properties and images) from URL-type addresses by fetching
/// and parsing the page.
pub struct WebExtractor;
|
|
|
|
impl Extractor for WebExtractor {
|
|
fn get(
|
|
&self,
|
|
address: &Address,
|
|
_connection: &UpEndConnection,
|
|
_store: Arc<Box<dyn UpStore + Send + Sync>>,
|
|
mut job_container: JobContainer,
|
|
context: OperationContext,
|
|
) -> Result<Vec<Entry>> {
|
|
if let Address::Url(url) = address {
|
|
let mut job_handle =
|
|
job_container.add_job(None, &format!("Getting info about {url:?}"))?;
|
|
|
|
let response = REQWEST_CLIENT.get(url.clone()).send()?;
|
|
let html = HTML::from_string(response.text()?, Some(url.to_string()));
|
|
|
|
if let Ok(html) = html {
|
|
let _ = job_handle.update_progress(50.0);
|
|
|
|
let mut entries = vec![
|
|
html.title.as_ref().map(|html_title| Entry {
|
|
entity: address.clone(),
|
|
attribute: "HTML_TITLE".parse().unwrap(),
|
|
value: html_title.clone().into(),
|
|
provenance: context.provenance.clone() + "EXTRACTOR",
|
|
user: context.user.clone(),
|
|
timestamp: chrono::Utc::now().naive_utc(),
|
|
}),
|
|
html.title.map(|html_title| Entry {
|
|
entity: address.clone(),
|
|
attribute: ATTR_LABEL.parse().unwrap(),
|
|
value: html_title.into(),
|
|
provenance: context.provenance.clone() + "EXTRACTOR",
|
|
user: context.user.clone(),
|
|
timestamp: chrono::Utc::now().naive_utc(),
|
|
}),
|
|
html.description.map(|html_desc| Entry {
|
|
entity: address.clone(),
|
|
attribute: "HTML_DESCRIPTION".parse().unwrap(),
|
|
value: html_desc.into(),
|
|
provenance: context.provenance.clone() + "EXTRACTOR",
|
|
user: context.user.clone(),
|
|
timestamp: chrono::Utc::now().naive_utc(),
|
|
}),
|
|
];
|
|
for (key, value) in html.opengraph.properties {
|
|
let attribute = format!("OG_{}", key.to_uppercase());
|
|
if attribute == "OG_TITLE" {
|
|
entries.push(Some(Entry {
|
|
entity: address.clone(),
|
|
attribute: ATTR_LABEL.parse()?,
|
|
value: value.clone().into(),
|
|
provenance: context.provenance.clone() + "EXTRACTOR",
|
|
user: context.user.clone(),
|
|
timestamp: chrono::Utc::now().naive_utc(),
|
|
}));
|
|
}
|
|
|
|
entries.push(Some(Entry {
|
|
entity: address.clone(),
|
|
attribute: attribute.parse()?,
|
|
value: value.into(),
|
|
provenance: context.provenance.clone() + "EXTRACTOR",
|
|
user: context.user.clone(),
|
|
timestamp: chrono::Utc::now().naive_utc(),
|
|
}));
|
|
}
|
|
for image in html.opengraph.images {
|
|
entries.push(Some(Entry {
|
|
entity: address.clone(),
|
|
attribute: "OG_IMAGE".parse()?,
|
|
value: image.url.into(),
|
|
provenance: context.provenance.clone() + "EXTRACTOR",
|
|
user: context.user.clone(),
|
|
timestamp: chrono::Utc::now().naive_utc(),
|
|
}))
|
|
}
|
|
|
|
let _ = job_handle.update_state(JobState::Done);
|
|
|
|
return Ok(entries
|
|
.into_iter()
|
|
.flatten()
|
|
.flat_map(|e| {
|
|
vec![
|
|
Entry {
|
|
entity: Address::Attribute(e.attribute.clone()),
|
|
attribute: ATTR_OF.parse().unwrap(),
|
|
value: EntryValue::Address(TYPE_URL_ADDRESS.clone()),
|
|
provenance: context.provenance.clone() + "EXTRACTOR",
|
|
user: context.user.clone(),
|
|
timestamp: chrono::Utc::now().naive_utc(),
|
|
},
|
|
e,
|
|
]
|
|
})
|
|
.collect());
|
|
}
|
|
Err(anyhow!("Failed for unknown reason."))
|
|
} else {
|
|
Ok(vec![])
|
|
}
|
|
}
|
|
|
|
fn is_needed(&self, address: &Address, connection: &UpEndConnection) -> Result<bool> {
|
|
Ok(connection
|
|
.query(
|
|
format!(r#"(matches @{address} (in "HTML_TITLE" "HTML_DESCRIPTION") ?)"#)
|
|
.parse()?,
|
|
)?
|
|
.is_empty())
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod test {

    use upend_db::jobs::JobContainer;
    use upend_db::stores::fs::FsStore;
    use url::Url;

    use super::*;
    use anyhow::Result;
    use std::sync::Arc;
    use tempfile::TempDir;

    /// End-to-end check: `is_needed` is true for a fresh URL address,
    /// extraction inserts entries, and `is_needed` flips to false.
    // NOTE(review): this test performs a live HTTP request to
    // https://upend.dev, so it requires network access and will fail or
    // hang when offline — consider marking it #[ignore] or serving a
    // fixture page locally.
    #[test]
    fn test_extract() -> Result<()> {
        // Fresh, empty database and store in a temporary directory.
        let temp_dir = TempDir::new().unwrap();
        let open_result = upend_db::UpEndDatabase::open(&temp_dir, true)?;
        let connection = open_result.db.connection()?;
        let store =
            Arc::new(Box::new(FsStore::from_path(&temp_dir)?) as Box<dyn UpStore + Sync + Send>);
        let job_container = JobContainer::new();

        let address = Address::Url(Url::parse("https://upend.dev").unwrap());
        // No HTML_TITLE/HTML_DESCRIPTION entries exist yet.
        assert!(WebExtractor.is_needed(&address, &connection)?);

        WebExtractor.insert_info(
            &address,
            &connection,
            store,
            job_container,
            OperationContext::default(),
        )?;

        // After extraction the metadata is present, so no further work
        // is needed for this address.
        assert!(!WebExtractor.is_needed(&address, &connection)?);

        Ok(())
    }
}
|