create extractor trait
parent
3da153296e
commit
6820e9a174
|
@ -2884,6 +2884,7 @@ dependencies = [
|
|||
"actix-web",
|
||||
"actix_derive",
|
||||
"anyhow",
|
||||
"async-trait",
|
||||
"bs58",
|
||||
"built",
|
||||
"chrono",
|
||||
|
|
|
@ -69,6 +69,7 @@ image = { version = "0.23.14", optional = true }
|
|||
webp = { version = "0.2.0", optional = true }
|
||||
|
||||
webpage = { version = "1.4.0", optional = true }
|
||||
async-trait = "0.1.52"
|
||||
|
||||
[build-dependencies]
|
||||
built = "0.5.1"
|
||||
|
|
|
@ -1,3 +1,48 @@
|
|||
use crate::{
|
||||
addressing::Address,
|
||||
database::{entry::Entry, UpEndConnection},
|
||||
util::jobs::JobContainer,
|
||||
};
|
||||
use anyhow::Result;
|
||||
use async_trait::async_trait;
|
||||
use std::sync::{Arc, RwLock};
|
||||
|
||||
#[cfg(feature = "extractors-web")]
|
||||
pub mod web;
|
||||
|
||||
#[async_trait]
|
||||
pub trait Extractor {
|
||||
async fn get(
|
||||
&self,
|
||||
address: Address,
|
||||
job_container: Arc<RwLock<JobContainer>>,
|
||||
) -> Result<Vec<Entry>>;
|
||||
|
||||
async fn insert_info(
|
||||
&self,
|
||||
address: Address,
|
||||
connection: UpEndConnection,
|
||||
job_container: Arc<RwLock<JobContainer>>,
|
||||
) -> Result<()> {
|
||||
let entries = self.get(address, job_container).await?;
|
||||
|
||||
Ok(actix_web::web::block::<_, _, anyhow::Error>(move || {
|
||||
connection.transaction(|| {
|
||||
for entry in entries {
|
||||
connection.insert_entry(entry)?;
|
||||
}
|
||||
Ok(())
|
||||
})
|
||||
})
|
||||
.await?)
|
||||
}
|
||||
|
||||
async fn insert_info_fnf(
|
||||
&self,
|
||||
address: Address,
|
||||
connection: UpEndConnection,
|
||||
job_container: Arc<RwLock<JobContainer>>,
|
||||
) {
|
||||
let _ = self.insert_info(address, connection, job_container).await;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,81 +1,85 @@
|
|||
use super::Extractor;
|
||||
use crate::{
|
||||
addressing::Address,
|
||||
database::{entry::Entry, UpEndConnection},
|
||||
database::entry::Entry,
|
||||
util::jobs::{Job, JobContainer, State},
|
||||
};
|
||||
use actix_web::web;
|
||||
use anyhow::anyhow;
|
||||
use async_trait::async_trait;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use webpage::{Webpage, WebpageOptions};
|
||||
|
||||
pub async fn insert_info(
|
||||
url: String,
|
||||
connection: UpEndConnection,
|
||||
job_container: Arc<RwLock<JobContainer>>,
|
||||
) {
|
||||
let job_id = job_container
|
||||
.write()
|
||||
.unwrap()
|
||||
.add_job(Job::new(None, &format!("Getting info about {url:?}")))
|
||||
.unwrap();
|
||||
pub struct WebExtractor;
|
||||
|
||||
let webpage_url = url.clone();
|
||||
let webpage_get =
|
||||
web::block(move || Webpage::from_url(&webpage_url, WebpageOptions::default())).await;
|
||||
#[async_trait]
|
||||
impl Extractor for WebExtractor {
|
||||
async fn get(
|
||||
&self,
|
||||
address: Address,
|
||||
job_container: Arc<RwLock<JobContainer>>,
|
||||
) -> anyhow::Result<Vec<Entry>> {
|
||||
if let Address::Url(url) = address {
|
||||
let job_id = job_container
|
||||
.write()
|
||||
.unwrap()
|
||||
.add_job(Job::new(None, &format!("Getting info about {url:?}")))
|
||||
.unwrap();
|
||||
|
||||
if let Ok(webpage) = webpage_get {
|
||||
let _ = job_container
|
||||
.write()
|
||||
.unwrap()
|
||||
.update_progress(&job_id, 50.0);
|
||||
let webpage_url = url.clone();
|
||||
let webpage_get =
|
||||
web::block(move || Webpage::from_url(&webpage_url, WebpageOptions::default()))
|
||||
.await;
|
||||
|
||||
let address = Address::Url(url.clone());
|
||||
let mut entries = vec![
|
||||
webpage.html.title.map(|html_title| Entry {
|
||||
entity: address.clone(),
|
||||
attribute: "HTML_TITLE".to_string(),
|
||||
value: html_title.into(),
|
||||
}),
|
||||
webpage.html.description.map(|html_desc| Entry {
|
||||
entity: address.clone(),
|
||||
attribute: "HTML_DESCRIPTION".to_string(),
|
||||
value: html_desc.into(),
|
||||
}),
|
||||
];
|
||||
for (key, value) in webpage.html.opengraph.properties {
|
||||
entries.push(Some(Entry {
|
||||
entity: address.clone(),
|
||||
attribute: format!("OG_{}", key.to_uppercase()),
|
||||
value: value.into(),
|
||||
}))
|
||||
}
|
||||
for image in webpage.html.opengraph.images {
|
||||
entries.push(Some(Entry {
|
||||
entity: address.clone(),
|
||||
attribute: "OG_IMAGE".to_string(),
|
||||
value: image.url.into(),
|
||||
}))
|
||||
}
|
||||
if let Ok(webpage) = webpage_get {
|
||||
let _ = job_container
|
||||
.write()
|
||||
.unwrap()
|
||||
.update_progress(&job_id, 50.0);
|
||||
|
||||
let insert_result = web::block::<_, _, anyhow::Error>(move || {
|
||||
connection.transaction(|| {
|
||||
for entry in entries.into_iter().flatten() {
|
||||
connection.insert_entry(entry)?;
|
||||
let address = Address::Url(url.clone());
|
||||
let mut entries = vec![
|
||||
webpage.html.title.map(|html_title| Entry {
|
||||
entity: address.clone(),
|
||||
attribute: "HTML_TITLE".to_string(),
|
||||
value: html_title.into(),
|
||||
}),
|
||||
webpage.html.description.map(|html_desc| Entry {
|
||||
entity: address.clone(),
|
||||
attribute: "HTML_DESCRIPTION".to_string(),
|
||||
value: html_desc.into(),
|
||||
}),
|
||||
];
|
||||
for (key, value) in webpage.html.opengraph.properties {
|
||||
entries.push(Some(Entry {
|
||||
entity: address.clone(),
|
||||
attribute: format!("OG_{}", key.to_uppercase()),
|
||||
value: value.into(),
|
||||
}))
|
||||
}
|
||||
for image in webpage.html.opengraph.images {
|
||||
entries.push(Some(Entry {
|
||||
entity: address.clone(),
|
||||
attribute: "OG_IMAGE".to_string(),
|
||||
value: image.url.into(),
|
||||
}))
|
||||
}
|
||||
Ok(())
|
||||
})
|
||||
})
|
||||
.await;
|
||||
|
||||
if let Ok(()) = insert_result {
|
||||
let _ = job_container
|
||||
.write()
|
||||
.unwrap()
|
||||
.update_progress(&job_id, 100.0);
|
||||
|
||||
return Ok(entries.into_iter().flatten().collect());
|
||||
}
|
||||
|
||||
let _ = job_container
|
||||
.write()
|
||||
.unwrap()
|
||||
.update_progress(&job_id, 100.0);
|
||||
return;
|
||||
.update_state(&job_id, State::Failed);
|
||||
Err(anyhow!("Failed for unknown reason."))
|
||||
} else {
|
||||
Err(anyhow!("Can only extract info for URLs."))
|
||||
}
|
||||
}
|
||||
let _ = job_container
|
||||
.write()
|
||||
.unwrap()
|
||||
.update_state(&job_id, State::Failed);
|
||||
}
|
||||
|
|
|
@ -4,6 +4,7 @@ use crate::database::entry::{Entry, EntryValue, InvariantEntry};
|
|||
use crate::database::hierarchies::{list_roots, resolve_path, UHierPath};
|
||||
use crate::database::lang::Query;
|
||||
use crate::database::UpEndDatabase;
|
||||
use crate::extractors::Extractor;
|
||||
use crate::filesystem::add_file;
|
||||
use crate::previews::PreviewStore;
|
||||
use crate::util::hash::{b58_decode, b58_encode, Hashable};
|
||||
|
@ -353,11 +354,13 @@ pub async fn put_object(
|
|||
}],
|
||||
Address::Url(url) => {
|
||||
#[cfg(feature = "extractors-web")]
|
||||
actix::spawn(crate::extractors::web::insert_info(
|
||||
url.clone(),
|
||||
state.upend.connection().map_err(ErrorInternalServerError)?,
|
||||
state.job_container.clone(),
|
||||
));
|
||||
actix::spawn(
|
||||
(crate::extractors::web::WebExtractor {}).insert_info_fnf(
|
||||
address.clone(),
|
||||
state.upend.connection().map_err(ErrorInternalServerError)?,
|
||||
state.job_container.clone(),
|
||||
),
|
||||
);
|
||||
vec![Entry {
|
||||
entity: address.clone(),
|
||||
attribute: LABEL_ATTR.to_string(),
|
||||
|
|
Loading…
Reference in New Issue