add an "is_needed" check for extractors; add tests for web extractor

feat/vaults
Tomáš Mládek 2022-02-28 20:36:16 +01:00
parent 27cc6eb31c
commit 920e67a283
No known key found for this signature in database
GPG Key ID: 65E225C8B3E2ED8A
3 changed files with 62 additions and 14 deletions

View File

@ -10,22 +10,33 @@ use std::sync::{Arc, RwLock};
pub mod web;
// NOTE(review): this hunk appears to show the removed and the added lines of the
// change interleaved without +/- markers, so several signatures, parameters and
// statements show up twice below — confirm against the actual file before editing.
/// Produces entries for an address and can store them through a database connection.
pub trait Extractor {
// Variant of `get` taking `address` by value (presumably the pre-change form — confirm).
fn get(&self, address: Address, job_container: Arc<RwLock<JobContainer>>)
-> Result<Vec<Entry>>;
/// Returns the entries extracted for `address`.
///
/// `job_container` is a shared, lock-protected `JobContainer` handed to the implementation.
fn get(
&self,
address: &Address,
job_container: Arc<RwLock<JobContainer>>,
) -> Result<Vec<Entry>>;
/// Reports whether extraction should run for `address`.
///
/// The default implementation ignores both arguments and always returns `Ok(true)`.
fn is_needed(&self, _address: &Address, _connection: &UpEndConnection) -> Result<bool> {
Ok(true)
}
/// Extracts the entries for `address` and inserts them through `connection`.
///
/// In the borrowed-argument variant, `get` is only called when `is_needed` returns
/// `true`; otherwise the method returns `Ok(())` without inserting anything.
/// All inserts happen inside a single `connection.transaction` closure, and the
/// first failing `insert_entry` aborts the loop via `?`.
fn insert_info(
&self,
address: Address,
connection: UpEndConnection,
address: &Address,
connection: &UpEndConnection,
job_container: Arc<RwLock<JobContainer>>,
) -> Result<()> {
let entries = self.get(address, job_container)?;
// Gate the extraction on `is_needed` so `get` is skipped when it reports false.
if self.is_needed(address, connection)? {
let entries = self.get(address, job_container)?;
connection.transaction(|| {
for entry in entries {
connection.insert_entry(entry)?;
}
connection.transaction(|| {
for entry in entries {
connection.insert_entry(entry)?;
}
Ok(())
})
} else {
// Nothing to extract: report success without touching the connection further.
Ok(())
})
}
}
}

View File

@ -5,6 +5,7 @@ use crate::{
util::jobs::{Job, JobContainer, State},
};
use anyhow::anyhow;
use anyhow::Result;
use std::sync::{Arc, RwLock};
use webpage::{Webpage, WebpageOptions};
@ -13,7 +14,7 @@ pub struct WebExtractor;
impl Extractor for WebExtractor {
fn get(
&self,
address: Address,
address: &Address,
job_container: Arc<RwLock<JobContainer>>,
) -> anyhow::Result<Vec<Entry>> {
if let Address::Url(url) = address {
@ -32,7 +33,6 @@ impl Extractor for WebExtractor {
.unwrap()
.update_progress(&job_id, 50.0);
let address = Address::Url(url);
let mut entries = vec![
webpage.html.title.map(|html_title| Entry {
entity: address.clone(),
@ -77,4 +77,41 @@ impl Extractor for WebExtractor {
Ok(vec![])
}
}
/// Reports whether `address` still needs web extraction.
///
/// Builds a `matches` query for `address` restricted to the `HTML_TITLE` and
/// `HTML_DESCRIPTION` attributes, runs it on `connection`, and returns `true`
/// when the query yields no results.
///
/// # Errors
///
/// Fails if the query text cannot be parsed or if running the query fails.
fn is_needed(
    &self,
    address: &Address,
    connection: &crate::database::UpEndConnection,
) -> Result<bool> {
    let query_text =
        format!("(matches \"{address}\" (in \"HTML_TITLE\" \"HTML_DESCRIPTION\") ?)");
    // The parse stays in argument position so its target type is still taken from `query`.
    let existing = connection.query(query_text.parse()?)?;
    Ok(existing.is_empty())
}
}
#[cfg(test)]
mod test {
    use super::*;
    use anyhow::Result;
    use tempfile::TempDir;

    /// Checks that `is_needed` reports `true` on a freshly opened database and
    /// `false` after `insert_info` has run for the same address.
    ///
    /// NOTE(review): the address is a real URL, so this presumably needs network
    /// access to pass — confirm before running it in an offline CI environment.
    #[test]
    fn test_extract() -> Result<()> {
        // Propagate a temp-dir failure with `?` like every other fallible step here,
        // instead of panicking through `unwrap()`.
        let temp_dir = TempDir::new()?;
        let open_result = crate::database::UpEndDatabase::open(&temp_dir, None, true)?;
        let connection = open_result.db.connection()?;
        let job_container = Arc::new(RwLock::new(crate::util::jobs::JobContainer::default()));

        let address = Address::Url("https://upendproject.net".into());
        assert!(
            WebExtractor.is_needed(&address, &connection)?,
            "extraction should be needed before anything was inserted"
        );

        WebExtractor.insert_info(&address, &connection, job_container)?;

        assert!(
            !WebExtractor.is_needed(&address, &connection)?,
            "extraction should no longer be needed after insert_info"
        );

        Ok(())
    }
}

View File

@ -367,8 +367,8 @@ pub async fn put_object(
let _address = address.clone();
block_background(move || {
(crate::extractors::web::WebExtractor {}).insert_info(
_address,
state.upend.connection()?,
&_address,
&state.upend.connection()?,
state.job_container.clone(),
)
});