120 lines
4.1 KiB
Rust
120 lines
4.1 KiB
Rust
use std::sync::Arc;
|
|
|
|
use super::Extractor;
|
|
use anyhow::anyhow;
|
|
use anyhow::Result;
|
|
use upend::{
|
|
addressing::Address,
|
|
database::{entry::Entry, stores::UpStore, UpEndConnection},
|
|
util::jobs::{JobContainer, JobState},
|
|
};
|
|
|
|
use webpage::{Webpage, WebpageOptions};
|
|
|
|
/// Stateless extractor that derives metadata entries from web pages:
/// HTML title, HTML description, and OpenGraph properties/images.
pub struct WebExtractor;
|
|
|
|
impl Extractor for WebExtractor {
|
|
fn get(
|
|
&self,
|
|
address: &Address,
|
|
_connection: &UpEndConnection,
|
|
_store: Arc<Box<dyn UpStore + Send + Sync>>,
|
|
mut job_container: JobContainer,
|
|
) -> Result<Vec<Entry>> {
|
|
if let Address::Url(url) = address {
|
|
let mut job_handle =
|
|
job_container.add_job(None, &format!("Getting info about {url:?}"))?;
|
|
|
|
let webpage_url = url.clone();
|
|
let webpage_get = Webpage::from_url(webpage_url.as_ref(), WebpageOptions::default());
|
|
|
|
if let Ok(webpage) = webpage_get {
|
|
let _ = job_handle.update_progress(50.0);
|
|
|
|
let mut entries = vec![
|
|
webpage.html.title.map(|html_title| Entry {
|
|
entity: address.clone(),
|
|
attribute: "HTML_TITLE".to_string(),
|
|
value: html_title.into(),
|
|
provenance: "SYSTEM EXTRACTOR".to_string(),
|
|
timestamp: chrono::Utc::now().naive_utc(),
|
|
}),
|
|
webpage.html.description.map(|html_desc| Entry {
|
|
entity: address.clone(),
|
|
attribute: "HTML_DESCRIPTION".to_string(),
|
|
value: html_desc.into(),
|
|
provenance: "SYSTEM EXTRACTOR".to_string(),
|
|
timestamp: chrono::Utc::now().naive_utc(),
|
|
}),
|
|
];
|
|
for (key, value) in webpage.html.opengraph.properties {
|
|
entries.push(Some(Entry {
|
|
entity: address.clone(),
|
|
attribute: format!("OG_{}", key.to_uppercase()),
|
|
value: value.into(),
|
|
provenance: "SYSTEM EXTRACTOR".to_string(),
|
|
timestamp: chrono::Utc::now().naive_utc(),
|
|
}))
|
|
}
|
|
for image in webpage.html.opengraph.images {
|
|
entries.push(Some(Entry {
|
|
entity: address.clone(),
|
|
attribute: "OG_IMAGE".to_string(),
|
|
value: image.url.into(),
|
|
provenance: "SYSTEM EXTRACTOR".to_string(),
|
|
timestamp: chrono::Utc::now().naive_utc(),
|
|
}))
|
|
}
|
|
|
|
let _ = job_handle.update_state(JobState::Done);
|
|
|
|
return Ok(entries.into_iter().flatten().collect());
|
|
}
|
|
Err(anyhow!("Failed for unknown reason."))
|
|
} else {
|
|
Ok(vec![])
|
|
}
|
|
}
|
|
|
|
fn is_needed(&self, address: &Address, connection: &UpEndConnection) -> Result<bool> {
|
|
Ok(connection
|
|
.query(
|
|
format!(r#"(matches @{address} (in "HTML_TITLE" "HTML_DESCRIPTION") ?)"#)
|
|
.parse()?,
|
|
)?
|
|
.is_empty())
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod test {
    use std::sync::Arc;

    use anyhow::Result;
    use tempfile::TempDir;
    use upend::database::stores::fs::FsStore;
    use upend::util::jobs::JobContainer;
    use url::Url;

    use super::*;

    /// End-to-end smoke test: a fresh URL address needs extraction, and
    /// after `insert_info` has run, it no longer does.
    /// NOTE(review): hits the live network (https://upend.dev).
    #[test]
    fn test_extract() -> Result<()> {
        // Throwaway database + file store, cleaned up when `tmp` drops.
        let tmp = TempDir::new().unwrap();
        let opened = upend::database::UpEndDatabase::open(&tmp, true)?;
        let conn = opened.db.connection()?;
        let store =
            Arc::new(Box::new(FsStore::from_path(&tmp)?) as Box<dyn UpStore + Sync + Send>);
        let jobs = JobContainer::new();

        let address = Address::Url(Url::parse("https://upend.dev").unwrap());

        // Nothing stored yet, so extraction should be required...
        assert!(WebExtractor.is_needed(&address, &conn)?);

        WebExtractor.insert_info(&address, &conn, store, jobs)?;

        // ...and satisfied once the extractor has run.
        assert!(!WebExtractor.is_needed(&address, &conn)?);

        Ok(())
    }
}
|