use std::sync::Arc; use super::Extractor; use anyhow::anyhow; use anyhow::Result; use upend::{ addressing::Address, database::{entry::Entry, stores::UpStore, UpEndConnection}, util::jobs::{JobContainer, JobState}, }; use webpage::{Webpage, WebpageOptions}; pub struct WebExtractor; impl Extractor for WebExtractor { fn get( &self, address: &Address, _connection: &UpEndConnection, _store: Arc>, mut job_container: JobContainer, ) -> Result> { if let Address::Url(url) = address { let mut job_handle = job_container.add_job(None, &format!("Getting info about {url:?}"))?; let webpage_url = url.clone(); let webpage_get = Webpage::from_url(webpage_url.as_ref(), WebpageOptions::default()); if let Ok(webpage) = webpage_get { let _ = job_handle.update_progress(50.0); let mut entries = vec![ webpage.html.title.map(|html_title| Entry { entity: address.clone(), attribute: "HTML_TITLE".to_string(), value: html_title.into(), provenance: "SYSTEM EXTRACTOR".to_string(), timestamp: chrono::Utc::now().naive_utc(), }), webpage.html.description.map(|html_desc| Entry { entity: address.clone(), attribute: "HTML_DESCRIPTION".to_string(), value: html_desc.into(), provenance: "SYSTEM EXTRACTOR".to_string(), timestamp: chrono::Utc::now().naive_utc(), }), ]; for (key, value) in webpage.html.opengraph.properties { entries.push(Some(Entry { entity: address.clone(), attribute: format!("OG_{}", key.to_uppercase()), value: value.into(), provenance: "SYSTEM EXTRACTOR".to_string(), timestamp: chrono::Utc::now().naive_utc(), })) } for image in webpage.html.opengraph.images { entries.push(Some(Entry { entity: address.clone(), attribute: "OG_IMAGE".to_string(), value: image.url.into(), provenance: "SYSTEM EXTRACTOR".to_string(), timestamp: chrono::Utc::now().naive_utc(), })) } let _ = job_handle.update_state(JobState::Done); return Ok(entries.into_iter().flatten().collect()); } Err(anyhow!("Failed for unknown reason.")) } else { Ok(vec![]) } } fn is_needed(&self, address: &Address, connection: &UpEndConnection) -> Result { Ok(connection .query( format!(r#"(matches @{address} (in "HTML_TITLE" "HTML_DESCRIPTION") ?)"#) .parse()?, )? .is_empty()) } } #[cfg(test)] mod test { use upend::database::stores::fs::FsStore; use upend::util::jobs::JobContainer; use url::Url; use super::*; use anyhow::Result; use std::sync::Arc; use tempfile::TempDir; #[test] fn test_extract() -> Result<()> { let temp_dir = TempDir::new().unwrap(); let open_result = upend::database::UpEndDatabase::open(&temp_dir, true)?; let connection = open_result.db.connection()?; let store = Arc::new(Box::new(FsStore::from_path(&temp_dir)?) as Box); let job_container = JobContainer::new(); let address = Address::Url(Url::parse("https://upend.dev").unwrap()); assert!(WebExtractor.is_needed(&address, &connection)?); WebExtractor.insert_info(&address, &connection, store, job_container)?; assert!(!WebExtractor.is_needed(&address, &connection)?); Ok(()) } }