add an "is_needed" check for extractors; add tests for web extractor
parent
27cc6eb31c
commit
920e67a283
|
@ -10,22 +10,33 @@ use std::sync::{Arc, RwLock};
|
|||
pub mod web;
|
||||
|
||||
pub trait Extractor {
|
||||
fn get(&self, address: Address, job_container: Arc<RwLock<JobContainer>>)
|
||||
-> Result<Vec<Entry>>;
|
||||
fn get(
|
||||
&self,
|
||||
address: &Address,
|
||||
job_container: Arc<RwLock<JobContainer>>,
|
||||
) -> Result<Vec<Entry>>;
|
||||
|
||||
/// Decides whether extraction should run for the given address.
///
/// This default implementation ignores both arguments and always answers
/// `Ok(true)`; implementors can override it to skip work.
fn is_needed(
    &self,
    _address: &Address,
    _connection: &UpEndConnection,
) -> Result<bool> {
    Ok(true)
}
|
||||
|
||||
fn insert_info(
|
||||
&self,
|
||||
address: Address,
|
||||
connection: UpEndConnection,
|
||||
address: &Address,
|
||||
connection: &UpEndConnection,
|
||||
job_container: Arc<RwLock<JobContainer>>,
|
||||
) -> Result<()> {
|
||||
let entries = self.get(address, job_container)?;
|
||||
if self.is_needed(address, connection)? {
|
||||
let entries = self.get(address, job_container)?;
|
||||
|
||||
connection.transaction(|| {
|
||||
for entry in entries {
|
||||
connection.insert_entry(entry)?;
|
||||
}
|
||||
connection.transaction(|| {
|
||||
for entry in entries {
|
||||
connection.insert_entry(entry)?;
|
||||
}
|
||||
Ok(())
|
||||
})
|
||||
} else {
|
||||
Ok(())
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -5,6 +5,7 @@ use crate::{
|
|||
util::jobs::{Job, JobContainer, State},
|
||||
};
|
||||
use anyhow::anyhow;
|
||||
use anyhow::Result;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use webpage::{Webpage, WebpageOptions};
|
||||
|
||||
|
@ -13,7 +14,7 @@ pub struct WebExtractor;
|
|||
impl Extractor for WebExtractor {
|
||||
fn get(
|
||||
&self,
|
||||
address: Address,
|
||||
address: &Address,
|
||||
job_container: Arc<RwLock<JobContainer>>,
|
||||
) -> anyhow::Result<Vec<Entry>> {
|
||||
if let Address::Url(url) = address {
|
||||
|
@ -32,7 +33,6 @@ impl Extractor for WebExtractor {
|
|||
.unwrap()
|
||||
.update_progress(&job_id, 50.0);
|
||||
|
||||
let address = Address::Url(url);
|
||||
let mut entries = vec![
|
||||
webpage.html.title.map(|html_title| Entry {
|
||||
entity: address.clone(),
|
||||
|
@ -77,4 +77,41 @@ impl Extractor for WebExtractor {
|
|||
Ok(vec![])
|
||||
}
|
||||
}
|
||||
|
||||
fn is_needed(
|
||||
&self,
|
||||
address: &Address,
|
||||
connection: &crate::database::UpEndConnection,
|
||||
) -> Result<bool> {
|
||||
Ok(connection
|
||||
.query(
|
||||
format!("(matches \"{address}\" (in \"HTML_TITLE\" \"HTML_DESCRIPTION\") ?)")
|
||||
.parse()?,
|
||||
)?
|
||||
.is_empty())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod test {
    use super::*;
    use anyhow::Result;
    use tempfile::TempDir;

    /// Checks that `is_needed` flips from `true` to `false` once `insert_info`
    /// has run for the same URL against a fresh database in a temporary directory.
    ///
    /// NOTE(review): presumably this performs a real network fetch of the URL
    /// below - confirm before running in an offline environment.
    #[test]
    fn test_extract() -> Result<()> {
        // Declared in this order so the directory outlives the database handle.
        let scratch_dir = TempDir::new().unwrap();
        let opened = crate::database::UpEndDatabase::open(&scratch_dir, None, true)?;
        let db_conn = opened.db.connection()?;
        let jobs = Arc::new(RwLock::new(crate::util::jobs::JobContainer::default()));

        let extractor = WebExtractor;
        let url = Address::Url("https://upendproject.net".into());

        // Nothing has been stored yet, so extraction must be reported as needed.
        let needed_before = extractor.is_needed(&url, &db_conn)?;
        assert!(needed_before);

        extractor.insert_info(&url, &db_conn, jobs)?;

        // After insertion the same check must report that no work is left.
        let needed_after = extractor.is_needed(&url, &db_conn)?;
        assert!(!needed_after);

        Ok(())
    }
}
|
||||
|
|
|
@ -367,8 +367,8 @@ pub async fn put_object(
|
|||
let _address = address.clone();
|
||||
block_background(move || {
|
||||
(crate::extractors::web::WebExtractor {}).insert_info(
|
||||
_address,
|
||||
state.upend.connection()?,
|
||||
&_address,
|
||||
&state.upend.connection()?,
|
||||
state.job_container.clone(),
|
||||
)
|
||||
});
|
||||
|
|
Loading…
Reference in New Issue