create extractor trait

feat/vaults
Tomáš Mládek 2022-02-15 13:32:46 +01:00
parent 3da153296e
commit 6820e9a174
No known key found for this signature in database
GPG Key ID: 65E225C8B3E2ED8A
5 changed files with 120 additions and 66 deletions

Cargo.lock

@@ -2884,6 +2884,7 @@ dependencies = [
  "actix-web",
  "actix_derive",
  "anyhow",
+ "async-trait",
  "bs58",
  "built",
  "chrono",


@@ -69,6 +69,7 @@ image = { version = "0.23.14", optional = true }
 webp = { version = "0.2.0", optional = true }
 webpage = { version = "1.4.0", optional = true }
+async-trait = "0.1.52"
 
 [build-dependencies]
 built = "0.5.1"


@@ -1,3 +1,48 @@
+use crate::{
+    addressing::Address,
+    database::{entry::Entry, UpEndConnection},
+    util::jobs::JobContainer,
+};
+use anyhow::Result;
+use async_trait::async_trait;
+use std::sync::{Arc, RwLock};
+
 #[cfg(feature = "extractors-web")]
 pub mod web;
+
+#[async_trait]
+pub trait Extractor {
+    async fn get(
+        &self,
+        address: Address,
+        job_container: Arc<RwLock<JobContainer>>,
+    ) -> Result<Vec<Entry>>;
+
+    async fn insert_info(
+        &self,
+        address: Address,
+        connection: UpEndConnection,
+        job_container: Arc<RwLock<JobContainer>>,
+    ) -> Result<()> {
+        let entries = self.get(address, job_container).await?;
+
+        Ok(actix_web::web::block::<_, _, anyhow::Error>(move || {
+            connection.transaction(|| {
+                for entry in entries {
+                    connection.insert_entry(entry)?;
+                }
+                Ok(())
+            })
+        })
+        .await?)
+    }
+
+    async fn insert_info_fnf(
+        &self,
+        address: Address,
+        connection: UpEndConnection,
+        job_container: Arc<RwLock<JobContainer>>,
+    ) {
+        let _ = self.insert_info(address, connection, job_container).await;
+    }
+}


@@ -1,81 +1,85 @@
+use super::Extractor;
 use crate::{
     addressing::Address,
-    database::{entry::Entry, UpEndConnection},
+    database::entry::Entry,
     util::jobs::{Job, JobContainer, State},
 };
 use actix_web::web;
+use anyhow::anyhow;
+use async_trait::async_trait;
 use std::sync::{Arc, RwLock};
 use webpage::{Webpage, WebpageOptions};
 
-pub async fn insert_info(
-    url: String,
-    connection: UpEndConnection,
-    job_container: Arc<RwLock<JobContainer>>,
-) {
-    let job_id = job_container
-        .write()
-        .unwrap()
-        .add_job(Job::new(None, &format!("Getting info about {url:?}")))
-        .unwrap();
+pub struct WebExtractor;
 
-    let webpage_url = url.clone();
-    let webpage_get =
-        web::block(move || Webpage::from_url(&webpage_url, WebpageOptions::default())).await;
+#[async_trait]
+impl Extractor for WebExtractor {
+    async fn get(
+        &self,
+        address: Address,
+        job_container: Arc<RwLock<JobContainer>>,
+    ) -> anyhow::Result<Vec<Entry>> {
+        if let Address::Url(url) = address {
+            let job_id = job_container
+                .write()
+                .unwrap()
+                .add_job(Job::new(None, &format!("Getting info about {url:?}")))
+                .unwrap();
 
-    if let Ok(webpage) = webpage_get {
-        let _ = job_container
-            .write()
-            .unwrap()
-            .update_progress(&job_id, 50.0);
+            let webpage_url = url.clone();
+            let webpage_get =
+                web::block(move || Webpage::from_url(&webpage_url, WebpageOptions::default()))
+                    .await;
 
-        let address = Address::Url(url.clone());
-        let mut entries = vec![
-            webpage.html.title.map(|html_title| Entry {
-                entity: address.clone(),
-                attribute: "HTML_TITLE".to_string(),
-                value: html_title.into(),
-            }),
-            webpage.html.description.map(|html_desc| Entry {
-                entity: address.clone(),
-                attribute: "HTML_DESCRIPTION".to_string(),
-                value: html_desc.into(),
-            }),
-        ];
-
-        for (key, value) in webpage.html.opengraph.properties {
-            entries.push(Some(Entry {
-                entity: address.clone(),
-                attribute: format!("OG_{}", key.to_uppercase()),
-                value: value.into(),
-            }))
-        }
-
-        for image in webpage.html.opengraph.images {
-            entries.push(Some(Entry {
-                entity: address.clone(),
-                attribute: "OG_IMAGE".to_string(),
-                value: image.url.into(),
-            }))
-        }
-
-        let insert_result = web::block::<_, _, anyhow::Error>(move || {
-            connection.transaction(|| {
-                for entry in entries.into_iter().flatten() {
-                    connection.insert_entry(entry)?;
-                }
-                Ok(())
-            })
-        })
-        .await;
-
-        if let Ok(()) = insert_result {
-            let _ = job_container
-                .write()
-                .unwrap()
-                .update_progress(&job_id, 100.0);
-            return;
-        }
-    }
-
-    let _ = job_container
-        .write()
-        .unwrap()
-        .update_state(&job_id, State::Failed);
-}
+            if let Ok(webpage) = webpage_get {
+                let _ = job_container
+                    .write()
+                    .unwrap()
+                    .update_progress(&job_id, 50.0);
+
+                let address = Address::Url(url.clone());
+                let mut entries = vec![
+                    webpage.html.title.map(|html_title| Entry {
+                        entity: address.clone(),
+                        attribute: "HTML_TITLE".to_string(),
+                        value: html_title.into(),
+                    }),
+                    webpage.html.description.map(|html_desc| Entry {
+                        entity: address.clone(),
+                        attribute: "HTML_DESCRIPTION".to_string(),
+                        value: html_desc.into(),
+                    }),
+                ];
+
+                for (key, value) in webpage.html.opengraph.properties {
+                    entries.push(Some(Entry {
+                        entity: address.clone(),
+                        attribute: format!("OG_{}", key.to_uppercase()),
+                        value: value.into(),
+                    }))
+                }
+
+                for image in webpage.html.opengraph.images {
+                    entries.push(Some(Entry {
+                        entity: address.clone(),
+                        attribute: "OG_IMAGE".to_string(),
+                        value: image.url.into(),
+                    }))
+                }
+
+                let _ = job_container
+                    .write()
+                    .unwrap()
+                    .update_progress(&job_id, 100.0);
+                return Ok(entries.into_iter().flatten().collect());
+            }
+            let _ = job_container
+                .write()
+                .unwrap()
+                .update_state(&job_id, State::Failed);
+            Err(anyhow!("Failed for unknown reason."))
+        } else {
+            Err(anyhow!("Can only extract info for URLs."))
+        }
+    }
+}


@@ -4,6 +4,7 @@ use crate::database::entry::{Entry, EntryValue, InvariantEntry};
 use crate::database::hierarchies::{list_roots, resolve_path, UHierPath};
 use crate::database::lang::Query;
 use crate::database::UpEndDatabase;
+use crate::extractors::Extractor;
 use crate::filesystem::add_file;
 use crate::previews::PreviewStore;
 use crate::util::hash::{b58_decode, b58_encode, Hashable};
@@ -353,11 +354,13 @@ pub async fn put_object(
         }],
         Address::Url(url) => {
             #[cfg(feature = "extractors-web")]
-            actix::spawn(crate::extractors::web::insert_info(
-                url.clone(),
-                state.upend.connection().map_err(ErrorInternalServerError)?,
-                state.job_container.clone(),
-            ));
+            actix::spawn(
+                (crate::extractors::web::WebExtractor {}).insert_info_fnf(
+                    address.clone(),
+                    state.upend.connection().map_err(ErrorInternalServerError)?,
+                    state.job_container.clone(),
+                ),
+            );
             vec![Entry {
                 entity: address.clone(),
                 attribute: LABEL_ATTR.to_string(),
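
For reference, the pattern this commit introduces — an #[async_trait] trait whose single required get method is wrapped by a default insert_info and a fire-and-forget insert_info_fnf — can be sketched in a self-contained form. The toy names below (DummyExtractor, the simplified Entry with plain String fields, the println! stand-in for database insertion) are illustrative only and not part of the UpEnd codebase:

// Minimal sketch of the trait pattern from this commit.
// Assumed Cargo deps: anyhow = "1", async-trait = "0.1",
// tokio = { version = "1", features = ["macros", "rt"] }
use anyhow::Result;
use async_trait::async_trait;

// Simplified stand-in for the crate's Entry type.
#[derive(Debug)]
struct Entry {
    attribute: String,
    value: String,
}

#[async_trait]
trait Extractor {
    // Implementors provide only `get`; the wrappers below come for free.
    async fn get(&self, address: String) -> Result<Vec<Entry>>;

    // Default method: fetch entries, then persist them (stubbed here).
    async fn insert_info(&self, address: String) -> Result<()> {
        for entry in self.get(address).await? {
            println!("inserting {entry:?}");
        }
        Ok(())
    }

    // "Fire and forget": discard the result, mirroring insert_info_fnf.
    async fn insert_info_fnf(&self, address: String) {
        let _ = self.insert_info(address).await;
    }
}

struct DummyExtractor;

#[async_trait]
impl Extractor for DummyExtractor {
    async fn get(&self, address: String) -> Result<Vec<Entry>> {
        Ok(vec![Entry {
            attribute: "LABEL".to_string(),
            value: address,
        }])
    }
}

#[tokio::main]
async fn main() {
    DummyExtractor
        .insert_info_fnf("https://example.com".to_string())
        .await;
}

As in the commit, the default methods let callers spawn the whole fetch-and-store pipeline (here insert_info_fnf) without each extractor reimplementing job bookkeeping or persistence.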