diff --git a/src/extractors/mod.rs b/src/extractors/mod.rs index 8628242..dca9540 100644 --- a/src/extractors/mod.rs +++ b/src/extractors/mod.rs @@ -10,22 +10,33 @@ use std::sync::{Arc, RwLock}; pub mod web; pub trait Extractor { - fn get(&self, address: Address, job_container: Arc>) - -> Result>; + fn get( + &self, + address: &Address, + job_container: Arc>, + ) -> Result>; + + fn is_needed(&self, _address: &Address, _connection: &UpEndConnection) -> Result { + Ok(true) + } fn insert_info( &self, - address: Address, - connection: UpEndConnection, + address: &Address, + connection: &UpEndConnection, job_container: Arc>, ) -> Result<()> { - let entries = self.get(address, job_container)?; + if self.is_needed(address, connection)? { + let entries = self.get(address, job_container)?; - connection.transaction(|| { - for entry in entries { - connection.insert_entry(entry)?; - } + connection.transaction(|| { + for entry in entries { + connection.insert_entry(entry)?; + } + Ok(()) + }) + } else { Ok(()) - }) + } } } diff --git a/src/extractors/web.rs b/src/extractors/web.rs index ae9808c..af70a18 100644 --- a/src/extractors/web.rs +++ b/src/extractors/web.rs @@ -5,6 +5,7 @@ use crate::{ util::jobs::{Job, JobContainer, State}, }; use anyhow::anyhow; +use anyhow::Result; use std::sync::{Arc, RwLock}; use webpage::{Webpage, WebpageOptions}; @@ -13,7 +14,7 @@ pub struct WebExtractor; impl Extractor for WebExtractor { fn get( &self, - address: Address, + address: &Address, job_container: Arc>, ) -> anyhow::Result> { if let Address::Url(url) = address { @@ -32,7 +33,6 @@ impl Extractor for WebExtractor { .unwrap() .update_progress(&job_id, 50.0); - let address = Address::Url(url); let mut entries = vec![ webpage.html.title.map(|html_title| Entry { entity: address.clone(), @@ -77,4 +77,41 @@ impl Extractor for WebExtractor { Ok(vec![]) } } + + fn is_needed( + &self, + address: &Address, + connection: &crate::database::UpEndConnection, + ) -> Result { + Ok(connection + .query( + format!("(matches \"{address}\" (in \"HTML_TITLE\" \"HTML_DESCRIPTION\") ?)") + .parse()?, + )? + .is_empty()) + } +} + +#[cfg(test)] +mod test { + use super::*; + use anyhow::Result; + use tempfile::TempDir; + + #[test] + fn test_extract() -> Result<()> { + let temp_dir = TempDir::new().unwrap(); + let open_result = crate::database::UpEndDatabase::open(&temp_dir, None, true)?; + let connection = open_result.db.connection()?; + let job_container = Arc::new(RwLock::new(crate::util::jobs::JobContainer::default())); + + let address = Address::Url("https://upendproject.net".into()); + assert!(WebExtractor.is_needed(&address, &connection)?); + + WebExtractor.insert_info(&address, &connection, job_container)?; + + assert!(!WebExtractor.is_needed(&address, &connection)?); + + Ok(()) + } } diff --git a/src/routes.rs b/src/routes.rs index 494235d..efdac91 100644 --- a/src/routes.rs +++ b/src/routes.rs @@ -367,8 +367,8 @@ pub async fn put_object( let _address = address.clone(); block_background(move || { (crate::extractors::web::WebExtractor {}).insert_info( - _address, - state.upend.connection()?, + &_address, + &state.upend.connection()?, state.job_container.clone(), ) });