add metadata to urls on insert

feat/vaults
Tomáš Mládek 2022-02-10 11:38:45 +01:00
parent bbc871ccf6
commit 674663028d
No known key found for this signature in database
GPG Key ID: ED21612889E75EC5
7 changed files with 394 additions and 13 deletions

292
Cargo.lock generated
View File

@ -223,7 +223,7 @@ dependencies = [
"mio-uds",
"num_cpus",
"slab",
"socket2",
"socket2 0.3.19",
]
[[package]]
@ -247,7 +247,7 @@ dependencies = [
"actix-server",
"actix-service",
"log",
"socket2",
"socket2 0.3.19",
]
[[package]]
@ -330,7 +330,7 @@ dependencies = [
"serde",
"serde_json",
"serde_urlencoded",
"socket2",
"socket2 0.3.19",
"time 0.2.27",
"tinyvec",
"url",
@ -770,6 +770,36 @@ version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7"
[[package]]
name = "curl"
version = "0.4.42"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7de97b894edd5b5bcceef8b78d7da9b75b1d2f2f9a910569d0bde3dd31d84939"
dependencies = [
"curl-sys",
"libc",
"openssl-probe",
"openssl-sys",
"schannel",
"socket2 0.4.4",
"winapi 0.3.9",
]
[[package]]
name = "curl-sys"
version = "0.4.52+curl-7.81.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "14b8c2d1023ea5fded5b7b892e4b8e95f70038a421126a056761a84246a28971"
dependencies = [
"cc",
"libc",
"libz-sys",
"openssl-sys",
"pkg-config",
"vcpkg",
"winapi 0.3.9",
]
[[package]]
name = "deflate"
version = "0.8.6"
@ -969,6 +999,16 @@ version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3dcaa9ae7725d12cdb85b3ad99a434db70b468c09ded17e012d86b5c1010f7a7"
[[package]]
name = "futf"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843"
dependencies = [
"mac",
"new_debug_unreachable",
]
[[package]]
name = "futures"
version = "0.3.19"
@ -1155,6 +1195,20 @@ dependencies = [
"winapi 0.3.9",
]
[[package]]
name = "html5ever"
version = "0.25.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aafcf38a1a36118242d29b92e1b08ef84e67e4a5ed06e0a80be20e6a32bfed6b"
dependencies = [
"log",
"mac",
"markup5ever",
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "http"
version = "0.2.6"
@ -1242,7 +1296,7 @@ version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f7e2f18aece9709094573a9f24f483c4f65caa4298e2f7ae1b71cc65d853fad7"
dependencies = [
"socket2",
"socket2 0.3.19",
"widestring",
"winapi 0.3.9",
"winreg",
@ -1343,9 +1397,9 @@ dependencies = [
[[package]]
name = "libc"
version = "0.2.112"
version = "0.2.117"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1b03d17f364a3a042d5e5d46b053bbbf82c92c9430c592dd4c064dc6ee997125"
checksum = "e74d72e0f9b65b5b4ca49a346af3976df0f9c61d550727f349ecd559f251a26c"
[[package]]
name = "libsqlite3-sys"
@ -1367,6 +1421,18 @@ dependencies = [
"cc",
]
[[package]]
name = "libz-sys"
version = "1.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "de5435b8549c16d423ed0c03dbaafe57cf6c3344744f1242520d59c9d8ecec66"
dependencies = [
"cc",
"libc",
"pkg-config",
"vcpkg",
]
[[package]]
name = "linked-hash-map"
version = "0.5.4"
@ -1409,6 +1475,38 @@ dependencies = [
"linked-hash-map",
]
[[package]]
name = "mac"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
[[package]]
name = "markup5ever"
version = "0.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a24f40fb03852d1cdd84330cddcaf98e9ec08a7b7768e952fad3b4cf048ec8fd"
dependencies = [
"log",
"phf",
"phf_codegen",
"string_cache",
"string_cache_codegen",
"tendril",
]
[[package]]
name = "markup5ever_rcdom"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f015da43bcd8d4f144559a3423f4591d69b8ce0652c905374da7205df336ae2b"
dependencies = [
"html5ever",
"markup5ever",
"tendril",
"xml5ever",
]
[[package]]
name = "match_cfg"
version = "0.1.0"
@ -1579,6 +1677,12 @@ dependencies = [
"winapi 0.3.9",
]
[[package]]
name = "new_debug_unreachable"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e4a24736216ec316047a1fc4252e27dabb04218aa4a3f37c6e7ddbf1f9782b54"
[[package]]
name = "nom"
version = "4.2.3"
@ -1688,6 +1792,25 @@ dependencies = [
"winapi 0.3.9",
]
[[package]]
name = "openssl-probe"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf"
[[package]]
name = "openssl-sys"
version = "0.9.72"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7e46109c383602735fa0a2e48dd2b7c892b048e1bf69e5c3b1d804b7d9c203cb"
dependencies = [
"autocfg",
"cc",
"libc",
"pkg-config",
"vcpkg",
]
[[package]]
name = "parking_lot"
version = "0.11.2"
@ -1729,6 +1852,53 @@ dependencies = [
"indexmap",
]
[[package]]
name = "phf"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3dfb61232e34fcb633f43d12c58f83c1df82962dcdfa565a4e866ffc17dafe12"
dependencies = [
"phf_shared 0.8.0",
]
[[package]]
name = "phf_codegen"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cbffee61585b0411840d3ece935cce9cb6321f01c45477d30066498cd5e1a815"
dependencies = [
"phf_generator",
"phf_shared 0.8.0",
]
[[package]]
name = "phf_generator"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "17367f0cc86f2d25802b2c26ee58a7b23faeccf78a396094c13dced0d0182526"
dependencies = [
"phf_shared 0.8.0",
"rand 0.7.3",
]
[[package]]
name = "phf_shared"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c00cf8b9eafe68dde5e9eaa2cef8ee84a9336a47d566ec55ca16589633b65af7"
dependencies = [
"siphasher",
]
[[package]]
name = "phf_shared"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096"
dependencies = [
"siphasher",
]
[[package]]
name = "pin-project"
version = "0.4.29"
@ -1811,6 +1981,12 @@ version = "0.2.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eb9f9e6e233e5c4a35559a617bf40a4ec447db2e84c20b55a6f83167b7e57872"
[[package]]
name = "precomputed-hash"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c"
[[package]]
name = "proc-macro-hack"
version = "0.5.19"
@ -1863,6 +2039,7 @@ dependencies = [
"rand_chacha 0.2.2",
"rand_core 0.5.1",
"rand_hc 0.2.0",
"rand_pcg",
]
[[package]]
@ -1933,6 +2110,15 @@ dependencies = [
"rand_core 0.6.3",
]
[[package]]
name = "rand_pcg"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "16abd0c1b639e9eb4d7c50c0b8100b0d0f849be2349829c740fe8e6eb4816429"
dependencies = [
"rand_core 0.5.1",
]
[[package]]
name = "rayon"
version = "1.5.1"
@ -2052,6 +2238,16 @@ dependencies = [
"winapi-util",
]
[[package]]
name = "schannel"
version = "0.1.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8f05ba609c234e60bee0d547fe94a4c7e9da733d1c962cf6e59efa4cd9c8bc75"
dependencies = [
"lazy_static",
"winapi 0.3.9",
]
[[package]]
name = "scheduled-thread-pool"
version = "0.2.5"
@ -2168,6 +2364,12 @@ dependencies = [
"libc",
]
[[package]]
name = "siphasher"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a86232ab60fa71287d7f2ddae4a7073f6b7aac33631c3015abb556f08c6d0a3e"
[[package]]
name = "slab"
version = "0.4.5"
@ -2191,6 +2393,16 @@ dependencies = [
"winapi 0.3.9",
]
[[package]]
name = "socket2"
version = "0.4.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "66d72b759436ae32898a2af0a14218dbf55efde3feeb170eb623637db85ee1e0"
dependencies = [
"libc",
"winapi 0.3.9",
]
[[package]]
name = "standback"
version = "0.2.17"
@ -2249,6 +2461,32 @@ version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "213701ba3370744dcd1a12960caa4843b3d68b4d1c0a5d575e0d65b2ee9d16c0"
[[package]]
name = "string_cache"
version = "0.8.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "33994d0838dc2d152d17a62adf608a869b5e846b65b389af7f3dbc1de45c5b26"
dependencies = [
"lazy_static",
"new_debug_unreachable",
"parking_lot",
"phf_shared 0.10.0",
"precomputed-hash",
"serde",
]
[[package]]
name = "string_cache_codegen"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f24c8e5e19d22a726626f1a5e16fe15b132dcf21d10177fa5a45ce7962996b97"
dependencies = [
"phf_generator",
"phf_shared 0.8.0",
"proc-macro2",
"quote",
]
[[package]]
name = "strsim"
version = "0.8.0"
@ -2280,6 +2518,17 @@ dependencies = [
"winapi 0.3.9",
]
[[package]]
name = "tendril"
version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a9ef557cb397a4f0a5a3a628f06515f78563f2209e64d47055d9dc6052bf5e33"
dependencies = [
"futf",
"mac",
"utf-8",
]
[[package]]
name = "termcolor"
version = "1.1.2"
@ -2668,6 +2917,7 @@ dependencies = [
"walkdir",
"webbrowser",
"webp",
"webpage",
"xdg",
]
@ -2683,6 +2933,12 @@ dependencies = [
"percent-encoding",
]
[[package]]
name = "utf-8"
version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
[[package]]
name = "uuid"
version = "0.8.2"
@ -2856,6 +3112,18 @@ dependencies = [
"libwebp-sys",
]
[[package]]
name = "webpage"
version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4d03ebca8fe2492fffdb5be0f681a942665c50488348d8abc00efb470e0ea890"
dependencies = [
"curl",
"html5ever",
"markup5ever_rcdom",
"serde_json",
]
[[package]]
name = "weezl"
version = "0.1.5"
@ -2938,3 +3206,15 @@ checksum = "3a23fe958c70412687039c86f578938b4a0bb50ec788e96bce4d6ab00ddd5803"
dependencies = [
"dirs",
]
[[package]]
name = "xml5ever"
version = "0.16.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9234163818fd8e2418fcde330655e757900d4236acd8cc70fef345ef91f6d865"
dependencies = [
"log",
"mac",
"markup5ever",
"time 0.1.43",
]

View File

@ -68,11 +68,14 @@ actix-multipart = "0.3.0"
image = { version = "0.23.14", optional = true }
webp = { version = "0.2.0", optional = true }
webpage = { version = "1.4.0", optional = true }
[build-dependencies]
built = "0.5.1"
[features]
default = ["desktop", "previews", "previews-image"]
default = ["desktop", "previews", "previews-image", "extractors-web"]
desktop = ["webbrowser", "opener", "is_executable"]
previews = []
previews-image = ["image", "webp"]
extractors-web = ["webpage"]

View File

@ -233,6 +233,12 @@ impl std::str::FromStr for EntryValue {
}
}
impl From<String> for EntryValue {
fn from(str: String) -> Self {
Self::String(str)
}
}
#[cfg(test)]
mod tests {
use super::*;

3
src/extractors/mod.rs Normal file
View File

@ -0,0 +1,3 @@
// Extractor that gathers metadata about web pages (URLs).
// Only compiled when the `extractors-web` Cargo feature is enabled.
#[cfg(feature = "extractors-web")]
pub mod web;

81
src/extractors/web.rs Normal file
View File

@ -0,0 +1,81 @@
use crate::{
addressing::Address,
database::{entry::Entry, UpEndConnection},
util::jobs::{Job, JobContainer, State},
};
use actix_web::web;
use std::sync::{Arc, RwLock};
use webpage::{Webpage, WebpageOptions};
pub async fn insert_info(
url: String,
connection: UpEndConnection,
job_container: Arc<RwLock<JobContainer>>,
) {
let job_id = job_container
.write()
.unwrap()
.add_job(Job::new(None, &format!("Getting info about {url:?}")))
.unwrap();
let webpage_url = url.clone();
let webpage_get =
web::block(move || Webpage::from_url(&webpage_url, WebpageOptions::default())).await;
if let Ok(webpage) = webpage_get {
let _ = job_container
.write()
.unwrap()
.update_progress(&job_id, 50.0);
let address = Address::Url(url.clone());
let mut entries = vec![
webpage.html.title.map(|html_title| Entry {
entity: address.clone(),
attribute: "HTML_TITLE".to_string(),
value: html_title.into(),
}),
webpage.html.description.map(|html_desc| Entry {
entity: address.clone(),
attribute: "HTML_DESCRIPTION".to_string(),
value: html_desc.into(),
}),
];
for (key, value) in webpage.html.opengraph.properties {
entries.push(Some(Entry {
entity: address.clone(),
attribute: format!("OG_{}", key.to_uppercase()),
value: value.into(),
}))
}
for image in webpage.html.opengraph.images {
entries.push(Some(Entry {
entity: address.clone(),
attribute: "OG_IMAGE".to_string(),
value: image.url.into(),
}))
}
let insert_result = web::block::<_, _, anyhow::Error>(move || {
connection.transaction(|| {
for entry in entries.into_iter().flatten() {
connection.insert_entry(entry)?;
}
Ok(())
})
})
.await;
if let Ok(()) = insert_result {
let _ = job_container
.write()
.unwrap()
.update_progress(&job_id, 100.0);
return;
}
}
let _ = job_container
.write()
.unwrap()
.update_state(&job_id, State::Failed);
}

View File

@ -22,6 +22,7 @@ use crate::{
mod addressing;
mod common;
mod database;
mod extractors;
mod filesystem;
mod previews;
mod routes;

View File

@ -282,12 +282,19 @@ pub async fn put_object(
attribute: LABEL_ATTR.to_string(),
value: EntryValue::String(format!("ATTRIBUTE: {attribute}")),
}],
// todo: set off an opengraph query
Address::Url(url) => vec![Entry {
entity: address.clone(),
attribute: LABEL_ATTR.to_string(),
value: EntryValue::String(url.to_string()),
}],
Address::Url(url) => {
#[cfg(feature = "extractors-web")]
actix::spawn(crate::extractors::web::insert_info(
url.clone(),
state.upend.connection().map_err(ErrorInternalServerError)?,
state.job_container.clone(),
));
vec![Entry {
entity: address.clone(),
attribute: LABEL_ATTR.to_string(),
value: EntryValue::String(url.clone()),
}]
}
};
connection