upend/src/extractors/web.rs

86 lines
2.8 KiB
Rust

use super::Extractor;
use crate::{
addressing::Address,
database::entry::Entry,
util::jobs::{Job, JobContainer, State},
};
use actix_web::web;
use anyhow::anyhow;
use async_trait::async_trait;
use std::sync::{Arc, RwLock};
use webpage::{Webpage, WebpageOptions};
/// Extractor that fetches a URL and derives metadata [`Entry`] values from the
/// page: HTML `<title>` and meta description, plus all OpenGraph properties
/// and images.
pub struct WebExtractor;
#[async_trait]
impl Extractor for WebExtractor {
    /// Fetch the page behind `address` and return metadata entries for it.
    ///
    /// Only [`Address::Url`] is supported; any other variant yields an error.
    /// Progress is reported through `job_container` (50% after the fetch,
    /// 100% once entries are assembled; the job is marked `Failed` when the
    /// fetch fails).
    ///
    /// # Errors
    /// Returns an error if `address` is not a URL, or if fetching/parsing the
    /// page fails (the underlying fetch error is included in the message).
    async fn get(
        &self,
        address: Address,
        job_container: Arc<RwLock<JobContainer>>,
    ) -> anyhow::Result<Vec<Entry>> {
        if let Address::Url(url) = address {
            let job_id = job_container
                .write()
                .unwrap()
                .add_job(Job::new(None, &format!("Getting info about {url:?}")))
                .unwrap();

            // `Webpage::from_url` does blocking network I/O, so run it on the
            // blocking thread pool instead of the async executor.
            let webpage_url = url.clone();
            let webpage_get =
                web::block(move || Webpage::from_url(&webpage_url, WebpageOptions::default()))
                    .await;

            match webpage_get {
                Ok(webpage) => {
                    let _ = job_container
                        .write()
                        .unwrap()
                        .update_progress(&job_id, 50.0);

                    let address = Address::Url(url.clone());
                    let mut entries: Vec<Entry> = Vec::new();

                    // Title and description are optional; `extend` over the
                    // `Option`s adds them only when present.
                    entries.extend(webpage.html.title.map(|html_title| Entry {
                        entity: address.clone(),
                        attribute: "HTML_TITLE".to_string(),
                        value: html_title.into(),
                    }));
                    entries.extend(webpage.html.description.map(|html_desc| Entry {
                        entity: address.clone(),
                        attribute: "HTML_DESCRIPTION".to_string(),
                        value: html_desc.into(),
                    }));

                    // One entry per OpenGraph property, attribute-namespaced
                    // as e.g. `OG_TITLE`, `OG_TYPE`.
                    for (key, value) in webpage.html.opengraph.properties {
                        entries.push(Entry {
                            entity: address.clone(),
                            attribute: format!("OG_{}", key.to_uppercase()),
                            value: value.into(),
                        });
                    }
                    for image in webpage.html.opengraph.images {
                        entries.push(Entry {
                            entity: address.clone(),
                            attribute: "OG_IMAGE".to_string(),
                            value: image.url.into(),
                        });
                    }

                    let _ = job_container
                        .write()
                        .unwrap()
                        .update_progress(&job_id, 100.0);
                    Ok(entries)
                }
                Err(err) => {
                    // Mark the job failed and surface the actual fetch error
                    // instead of discarding it.
                    let _ = job_container
                        .write()
                        .unwrap()
                        .update_state(&job_id, State::Failed);
                    Err(anyhow!("Failed getting {url:?}: {err:?}"))
                }
            }
        } else {
            Err(anyhow!("Can only extract info for URLs."))
        }
    }
}