upend/src/extractors/web.rs

use super::Extractor;
use crate::{
    addressing::Address,
    database::{entry::Entry, UpEndConnection},
    util::jobs::{JobContainer, JobState},
};
use anyhow::anyhow;
use anyhow::Result;
use webpage::{Webpage, WebpageOptions};
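
/// Extractor that derives metadata (HTML title, description, and OpenGraph
/// properties) from web pages addressed by URL.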
pub struct WebExtractor;

impl Extractor for WebExtractor {
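    /// Fetches the page at `address` and emits entries for its HTML title,
    /// description, and OpenGraph metadata. Non-URL addresses yield no entries.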
    fn get(
        &self,
        address: &Address,
        _: &UpEndConnection,
        mut job_container: JobContainer,
    ) -> anyhow::Result<Vec<Entry>> {
        if let Address::Url(url) = address {
            let mut job_handle =
                job_container.add_job(None, &format!("Getting info about {url:?}"))?;
            let webpage_url = url.clone();

            // Fetch and parse the page, propagating the underlying error instead
            // of discarding it.
            let webpage = Webpage::from_url(&webpage_url, WebpageOptions::default())
                .map_err(|err| anyhow!("Failed to get {webpage_url:?}: {err}"))?;
            let _ = job_handle.update_progress(50.0);

            // Title and description, if present, straight from the parsed HTML.
            let mut entries = vec![
                webpage.html.title.map(|html_title| Entry {
                    entity: address.clone(),
                    attribute: "HTML_TITLE".to_string(),
                    value: html_title.into(),
                }),
                webpage.html.description.map(|html_desc| Entry {
                    entity: address.clone(),
                    attribute: "HTML_DESCRIPTION".to_string(),
                    value: html_desc.into(),
                }),
            ];

            // One entry per OpenGraph property, with the attribute name prefixed by `OG_`.
            for (key, value) in webpage.html.opengraph.properties {
                entries.push(Some(Entry {
                    entity: address.clone(),
                    attribute: format!("OG_{}", key.to_uppercase()),
                    value: value.into(),
                }))
            }

            // A page can declare several OpenGraph images; store each URL separately.
            for image in webpage.html.opengraph.images {
                entries.push(Some(Entry {
                    entity: address.clone(),
                    attribute: "OG_IMAGE".to_string(),
                    value: image.url.into(),
                }))
            }

            let _ = job_handle.update_state(JobState::Done);

            // `flatten` drops the `None`s left by a missing title or description.
            Ok(entries.into_iter().flatten().collect())
        } else {
            Ok(vec![])
        }
    }
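
    /// Extraction is only needed if no HTML title/description entries exist for
    /// this address yet.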
    fn is_needed(
        &self,
        address: &Address,
        connection: &crate::database::UpEndConnection,
    ) -> Result<bool> {
        Ok(connection
            .query(
                format!(r#"(matches @{address} (in "HTML_TITLE" "HTML_DESCRIPTION") ?)"#)
                    .parse()?,
            )?
            .is_empty())
    }
}

#[cfg(test)]
mod test {
    use crate::util::jobs::JobContainer;
    use super::*;
    use anyhow::Result;
    use tempfile::TempDir;
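
    // End-to-end check against a live URL; requires network access.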
    #[test]
    fn test_extract() -> Result<()> {
        let temp_dir = TempDir::new().unwrap();
        let open_result = crate::database::UpEndDatabase::open(&temp_dir, None, true)?;
        let connection = open_result.db.connection()?;
        let job_container = JobContainer::new();

        let address = Address::Url("https://upendproject.net".into());

        assert!(WebExtractor.is_needed(&address, &connection)?);

        WebExtractor.insert_info(&address, &connection, job_container)?;

        assert!(!WebExtractor.is_needed(&address, &connection)?);

        Ok(())
    }
}