use std::sync::Arc; use super::Extractor; use crate::common::REQWEST_CLIENT; use eyre::eyre; use eyre::Result; use upend_base::addressing::Address; use upend_base::constants::ATTR_LABEL; use upend_base::constants::ATTR_OF; use upend_base::constants::TYPE_URL_ADDRESS; use upend_base::entry::Entry; use upend_base::entry::EntryValue; use upend_db::jobs::JobContainer; use upend_db::jobs::JobState; use upend_db::stores::UpStore; use upend_db::UpEndConnection; use webpage::HTML; pub struct WebExtractor; impl Extractor for WebExtractor { fn get( &self, address: &Address, _connection: &UpEndConnection, _store: Arc>, mut job_container: JobContainer, ) -> Result> { if let Address::Url(url) = address { let mut job_handle = job_container.add_job(None, &format!("Getting info about {url:?}"))?; let response = REQWEST_CLIENT.get(url.clone()).send()?; let html = HTML::from_string(response.text()?, Some(url.to_string())); if let Ok(html) = html { let _ = job_handle.update_progress(50.0); let mut entries = vec![ html.title.as_ref().map(|html_title| Entry { entity: address.clone(), attribute: "HTML_TITLE".to_string(), value: html_title.clone().into(), provenance: "SYSTEM EXTRACTOR".to_string(), timestamp: chrono::Utc::now().naive_utc(), }), html.title.map(|html_title| Entry { entity: address.clone(), attribute: ATTR_LABEL.to_string(), value: html_title.into(), provenance: "SYSTEM EXTRACTOR".to_string(), timestamp: chrono::Utc::now().naive_utc(), }), html.description.map(|html_desc| Entry { entity: address.clone(), attribute: "HTML_DESCRIPTION".to_string(), value: html_desc.into(), provenance: "SYSTEM EXTRACTOR".to_string(), timestamp: chrono::Utc::now().naive_utc(), }), ]; for (key, value) in html.opengraph.properties { let attribute = format!("OG_{}", key.to_uppercase()); if attribute == "OG_TITLE" { entries.push(Some(Entry { entity: address.clone(), attribute: ATTR_LABEL.to_string(), value: value.clone().into(), provenance: "SYSTEM EXTRACTOR".to_string(), timestamp: chrono::Utc::now().naive_utc(), })); } entries.push(Some(Entry { entity: address.clone(), attribute, value: value.into(), provenance: "SYSTEM EXTRACTOR".to_string(), timestamp: chrono::Utc::now().naive_utc(), })); } for image in html.opengraph.images { entries.push(Some(Entry { entity: address.clone(), attribute: "OG_IMAGE".to_string(), value: image.url.into(), provenance: "SYSTEM EXTRACTOR".to_string(), timestamp: chrono::Utc::now().naive_utc(), })) } let _ = job_handle.update_state(JobState::Done); return Ok(entries .into_iter() .flatten() .flat_map(|e| { vec![ Entry { entity: Address::Attribute(e.attribute.clone()), attribute: ATTR_OF.to_string(), value: EntryValue::Address(TYPE_URL_ADDRESS.clone()), provenance: "SYSTEM EXTRACTOR".to_string(), ..Default::default() }, e, ] }) .collect()); } Err(eyre!("Failed for unknown reason.")) } else { Ok(vec![]) } } fn is_needed(&self, address: &Address, connection: &UpEndConnection) -> Result { Ok(connection .query( format!(r#"(matches @{address} (in "HTML_TITLE" "HTML_DESCRIPTION") ?)"#) .parse()?, )? .is_empty()) } } #[cfg(test)] mod test { use upend_db::jobs::JobContainer; use upend_db::stores::fs::FsStore; use url::Url; use super::*; use eyre::Result; use std::sync::Arc; use tempfile::TempDir; #[test] fn test_extract() -> Result<()> { let temp_dir = TempDir::new().unwrap(); let open_result = upend_db::UpEndDatabase::open(&temp_dir, true)?; let connection = open_result.db.connection()?; let store = Arc::new(Box::new(FsStore::from_path(&temp_dir)?) as Box); let job_container = JobContainer::new(); let address = Address::Url(Url::parse("https://upend.dev").unwrap()); assert!(WebExtractor.is_needed(&address, &connection)?); WebExtractor.insert_info(&address, &connection, store, job_container)?; assert!(!WebExtractor.is_needed(&address, &connection)?); Ok(()) } }