upend/cli/src/extractors/web.rs

120 lines
4.1 KiB
Rust

use std::sync::Arc;
use super::Extractor;
use anyhow::anyhow;
use anyhow::Result;
use upend::{
addressing::Address,
database::{entry::Entry, stores::UpStore, UpEndConnection},
util::jobs::{JobContainer, JobState},
};
use webpage::{Webpage, WebpageOptions};
/// Extractor that fetches a web page for a URL-typed address and turns its
/// HTML metadata (title, description, OpenGraph properties/images) into entries.
pub struct WebExtractor;
impl Extractor for WebExtractor {
    /// Fetches the page behind a URL address and extracts its HTML title,
    /// description, and OpenGraph metadata as entries.
    ///
    /// Non-URL addresses yield an empty entry list. Progress is reported
    /// through `job_container`; progress/state update failures are ignored
    /// on purpose (extraction should not fail because job bookkeeping did).
    ///
    /// # Errors
    /// Returns an error if registering the job fails or if the page cannot
    /// be retrieved/parsed.
    fn get(
        &self,
        address: &Address,
        _connection: &UpEndConnection,
        _store: Arc<Box<dyn UpStore + Send + Sync>>,
        mut job_container: JobContainer,
    ) -> Result<Vec<Entry>> {
        if let Address::Url(url) = address {
            let mut job_handle =
                job_container.add_job(None, &format!("Getting info about {url:?}"))?;

            match Webpage::from_url(url.as_ref(), WebpageOptions::default()) {
                Ok(webpage) => {
                    let _ = job_handle.update_progress(50.0);

                    let mut entries: Vec<Entry> = Vec::new();

                    if let Some(html_title) = webpage.html.title {
                        entries.push(Entry {
                            entity: address.clone(),
                            attribute: "HTML_TITLE".to_string(),
                            value: html_title.into(),
                            provenance: "SYSTEM EXTRACTOR".to_string(),
                            timestamp: chrono::Utc::now().naive_utc(),
                        });
                    }

                    if let Some(html_desc) = webpage.html.description {
                        entries.push(Entry {
                            entity: address.clone(),
                            attribute: "HTML_DESCRIPTION".to_string(),
                            value: html_desc.into(),
                            provenance: "SYSTEM EXTRACTOR".to_string(),
                            timestamp: chrono::Utc::now().naive_utc(),
                        });
                    }

                    // One entry per OpenGraph property, e.g. "og:type" -> OG_TYPE.
                    for (key, value) in webpage.html.opengraph.properties {
                        entries.push(Entry {
                            entity: address.clone(),
                            attribute: format!("OG_{}", key.to_uppercase()),
                            value: value.into(),
                            provenance: "SYSTEM EXTRACTOR".to_string(),
                            timestamp: chrono::Utc::now().naive_utc(),
                        });
                    }

                    // Images are listed separately by the webpage crate; emit
                    // one OG_IMAGE entry per image URL.
                    for image in webpage.html.opengraph.images {
                        entries.push(Entry {
                            entity: address.clone(),
                            attribute: "OG_IMAGE".to_string(),
                            value: image.url.into(),
                            provenance: "SYSTEM EXTRACTOR".to_string(),
                            timestamp: chrono::Utc::now().naive_utc(),
                        });
                    }

                    let _ = job_handle.update_state(JobState::Done);
                    Ok(entries)
                }
                // Previously the underlying error was discarded and replaced by
                // "Failed for unknown reason."; propagate the actual cause so
                // failures are diagnosable.
                Err(err) => Err(anyhow!("Failed to get webpage info for {url}: {err}")),
            }
        } else {
            Ok(vec![])
        }
    }

    /// Extraction is needed only while the address has no HTML_TITLE or
    /// HTML_DESCRIPTION entries in the database.
    fn is_needed(&self, address: &Address, connection: &UpEndConnection) -> Result<bool> {
        Ok(connection
            .query(
                format!(r#"(matches @{address} (in "HTML_TITLE" "HTML_DESCRIPTION") ?)"#)
                    .parse()?,
            )?
            .is_empty())
    }
}
#[cfg(test)]
mod test {
    use super::*;
    use anyhow::Result;
    use std::sync::Arc;
    use tempfile::TempDir;
    use upend::database::stores::fs::FsStore;
    use upend::util::jobs::JobContainer;
    use url::Url;

    /// End-to-end check: extraction is needed before `insert_info`, and no
    /// longer needed afterwards. NOTE: performs a real network fetch.
    #[test]
    fn test_extract() -> Result<()> {
        let tmp = TempDir::new().unwrap();

        // Fresh database and file store rooted in the temp directory.
        let db = upend::database::UpEndDatabase::open(&tmp, true)?;
        let conn = db.db.connection()?;
        let store: Arc<Box<dyn UpStore + Send + Sync>> =
            Arc::new(Box::new(FsStore::from_path(&tmp)?));
        let jobs = JobContainer::new();

        let address = Address::Url(Url::parse("https://upend.dev").unwrap());

        assert!(WebExtractor.is_needed(&address, &conn)?);
        WebExtractor.insert_info(&address, &conn, store, jobs)?;
        assert!(!WebExtractor.is_needed(&address, &conn)?);

        Ok(())
    }
}