//! Web extractor: turns the HTML metadata of a web page into database entries.
use std::sync::{Arc, RwLock};

use actix_web::web;
use anyhow::anyhow;
use async_trait::async_trait;
use webpage::{Webpage, WebpageOptions};

use crate::{
    addressing::Address,
    database::entry::Entry,
    util::jobs::{Job, JobContainer, State},
};

use super::Extractor;
/// Extractor that pulls metadata out of web pages: the HTML title and
/// description plus any OpenGraph properties and images.
///
/// Stateless unit struct — all context is passed to [`Extractor::get`].
pub struct WebExtractor;
#[async_trait]
|
|
impl Extractor for WebExtractor {
|
|
async fn get(
|
|
&self,
|
|
address: Address,
|
|
job_container: Arc<RwLock<JobContainer>>,
|
|
) -> anyhow::Result<Vec<Entry>> {
|
|
if let Address::Url(url) = address {
|
|
let job_id = job_container
|
|
.write()
|
|
.unwrap()
|
|
.add_job(Job::new(None, &format!("Getting info about {url:?}")))
|
|
.unwrap();
|
|
|
|
let webpage_url = url.clone();
|
|
let webpage_get =
|
|
web::block(move || Webpage::from_url(&webpage_url, WebpageOptions::default()))
|
|
.await;
|
|
|
|
if let Ok(webpage) = webpage_get {
|
|
let _ = job_container
|
|
.write()
|
|
.unwrap()
|
|
.update_progress(&job_id, 50.0);
|
|
|
|
let address = Address::Url(url.clone());
|
|
let mut entries = vec![
|
|
webpage.html.title.map(|html_title| Entry {
|
|
entity: address.clone(),
|
|
attribute: "HTML_TITLE".to_string(),
|
|
value: html_title.into(),
|
|
}),
|
|
webpage.html.description.map(|html_desc| Entry {
|
|
entity: address.clone(),
|
|
attribute: "HTML_DESCRIPTION".to_string(),
|
|
value: html_desc.into(),
|
|
}),
|
|
];
|
|
for (key, value) in webpage.html.opengraph.properties {
|
|
entries.push(Some(Entry {
|
|
entity: address.clone(),
|
|
attribute: format!("OG_{}", key.to_uppercase()),
|
|
value: value.into(),
|
|
}))
|
|
}
|
|
for image in webpage.html.opengraph.images {
|
|
entries.push(Some(Entry {
|
|
entity: address.clone(),
|
|
attribute: "OG_IMAGE".to_string(),
|
|
value: image.url.into(),
|
|
}))
|
|
}
|
|
|
|
let _ = job_container
|
|
.write()
|
|
.unwrap()
|
|
.update_progress(&job_id, 100.0);
|
|
|
|
return Ok(entries.into_iter().flatten().collect());
|
|
}
|
|
|
|
let _ = job_container
|
|
.write()
|
|
.unwrap()
|
|
.update_state(&job_id, State::Failed);
|
|
Err(anyhow!("Failed for unknown reason."))
|
|
} else {
|
|
Err(anyhow!("Can only extract info for URLs."))
|
|
}
|
|
}
|
|
}
|