diff --git a/Cargo.lock b/Cargo.lock index 7800a53..8cee6e3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -775,36 +775,6 @@ dependencies = [ "typenum", ] -[[package]] -name = "curl" -version = "0.4.44" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "509bd11746c7ac09ebd19f0b17782eae80aadee26237658a6b4808afb5c11a22" -dependencies = [ - "curl-sys", - "libc", - "openssl-probe", - "openssl-sys", - "schannel", - "socket2", - "winapi", -] - -[[package]] -name = "curl-sys" -version = "0.4.61+curl-8.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14d05c10f541ae6f3bc5b3d923c20001f47db7d5f0b2bc6ad16490133842db79" -dependencies = [ - "cc", - "libc", - "libz-sys", - "openssl-sys", - "pkg-config", - "vcpkg", - "winapi", -] - [[package]] name = "cxx" version = "1.0.94" @@ -3503,7 +3473,6 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d261bbae112cb48a95d3cc9e8873a4e40933bc54ae8eddc1eef70e952dd3b232" dependencies = [ - "curl", "html5ever", "markup5ever_rcdom", "serde_json", diff --git a/cli/Cargo.toml b/cli/Cargo.toml index 8bc4aa7..144ac81 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -74,7 +74,7 @@ nonempty = "0.6.0" image = { version = "0.23.14", optional = true } webp = { version = "0.2.0", optional = true } -webpage = { version = "1.5.0", optional = true } +webpage = { version = "1.5.0", optional = true, default-features = false} id3 = { version = "1.0.2", optional = true } kamadak-exif = { version = "0.5.4", optional = true } diff --git a/cli/src/common.rs b/cli/src/common.rs index ef11d3b..b79eb25 100644 --- a/cli/src/common.rs +++ b/cli/src/common.rs @@ -1,7 +1,6 @@ use anyhow::{anyhow, Result}; -use shadow_rs::{is_debug, shadow}; - -shadow!(build); +use lazy_static::lazy_static; +use shadow_rs::is_debug; pub fn get_static_dir>(dir: S) -> Result { let cwd = std::env::current_exe()?.parent().unwrap().to_path_buf(); @@ -17,3 +16,16 @@ pub fn get_static_dir>(dir: S) -> Result { Err(anyhow!("Path {result:?} doesn't exist.")) } } + +lazy_static! { + static ref APP_USER_AGENT: String = format!( + "{} / {}", + upend::common::build::PROJECT_NAME, + upend::common::build::PKG_VERSION + ); + + pub static ref REQWEST_CLIENT: reqwest::blocking::Client = reqwest::blocking::Client::builder() + .user_agent(APP_USER_AGENT.as_str()) + .build() + .unwrap(); +} diff --git a/cli/src/extractors/web.rs b/cli/src/extractors/web.rs index 91abb45..84dbabf 100644 --- a/cli/src/extractors/web.rs +++ b/cli/src/extractors/web.rs @@ -1,16 +1,16 @@ use std::sync::Arc; use super::Extractor; +use crate::common::REQWEST_CLIENT; use anyhow::anyhow; use anyhow::Result; -use upend::common::APP_USER_AGENT; use upend::{ addressing::Address, database::{entry::Entry, stores::UpStore, UpEndConnection}, util::jobs::{JobContainer, JobState}, }; -use webpage::{Webpage, WebpageOptions}; +use webpage::HTML; pub struct WebExtractor; @@ -26,25 +26,21 @@ impl Extractor for WebExtractor { let mut job_handle = job_container.add_job(None, &format!("Getting info about {url:?}"))?; - let webpage_url = url.clone(); - let options = WebpageOptions { - useragent: APP_USER_AGENT.to_string(), - ..WebpageOptions::default() - }; - let webpage_get = Webpage::from_url(webpage_url.as_ref(), options); + let response = REQWEST_CLIENT.get(url.clone()).send()?; + let html = HTML::from_string(response.text()?, Some(url.to_string())); - if let Ok(webpage) = webpage_get { + if let Ok(html) = html { let _ = job_handle.update_progress(50.0); let mut entries = vec![ - webpage.html.title.map(|html_title| Entry { + html.title.map(|html_title| Entry { entity: address.clone(), attribute: "HTML_TITLE".to_string(), value: html_title.into(), provenance: "SYSTEM EXTRACTOR".to_string(), timestamp: chrono::Utc::now().naive_utc(), }), - webpage.html.description.map(|html_desc| Entry { + html.description.map(|html_desc| Entry { entity: address.clone(), attribute: "HTML_DESCRIPTION".to_string(), value: html_desc.into(), @@ -52,7 +48,7 @@ impl Extractor for WebExtractor { timestamp: chrono::Utc::now().naive_utc(), }), ]; - for (key, value) in webpage.html.opengraph.properties { + for (key, value) in html.opengraph.properties { entries.push(Some(Entry { entity: address.clone(), attribute: format!("OG_{}", key.to_uppercase()), @@ -61,7 +57,7 @@ impl Extractor for WebExtractor { timestamp: chrono::Utc::now().naive_utc(), })) } - for image in webpage.html.opengraph.images { + for image in html.opengraph.images { entries.push(Some(Entry { entity: address.clone(), attribute: "OG_IMAGE".to_string(), diff --git a/cli/src/main.rs b/cli/src/main.rs index 80ae47d..35feb08 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -1,5 +1,6 @@ #[macro_use] extern crate upend; +use crate::common::{get_static_dir, REQWEST_CLIENT}; use actix_cors::Cors; use actix_web::web::Data; use actix_web::{middleware, App, HttpServer}; @@ -20,12 +21,11 @@ use tracing::trace; use tracing::{debug, error, info, warn}; use tracing_subscriber::filter::{EnvFilter, LevelFilter}; use upend::addressing::Address; -use upend::common::APP_USER_AGENT; use upend::database::entry::EntryValue; use upend::util::hash::hash; use upend::{ - common::{build, get_static_dir}, + common::build, config::UpEndConfig, database::{ stores::{fs::FsStore, UpStore}, @@ -36,6 +36,7 @@ use upend::{ use crate::util::exec::block_background; +mod common; mod routes; mod util; @@ -200,10 +201,7 @@ async fn main() -> Result<()> { let api_url = url.join("/api/query")?; debug!("Querying \"{}\"", api_url); - let client = reqwest::blocking::Client::builder() - .user_agent(APP_USER_AGENT.as_str()) - .build()?; - let response = client.post(api_url).body(query).send()?; + let response = REQWEST_CLIENT.post(api_url).body(query).send()?; response.error_for_status_ref()?; @@ -254,10 +252,7 @@ async fn main() -> Result<()> { }); debug!("Inserting {:?} at \"{}\"", body, api_url); - let client = reqwest::blocking::Client::builder() - .user_agent(APP_USER_AGENT.as_str()) - .build()?; - let response = client.put(api_url).json(&body).send()?; + let response = REQWEST_CLIENT.put(api_url).json(&body).send()?; match response.error_for_status_ref() { Ok(_) => { diff --git a/cli/src/routes.rs b/cli/src/routes.rs index 7a860af..c6d15ae 100644 --- a/cli/src/routes.rs +++ b/cli/src/routes.rs @@ -1,3 +1,4 @@ +use crate::common::REQWEST_CLIENT; use crate::extractors; use crate::previews::PreviewStore; use crate::util::exec::block_background; @@ -27,7 +28,7 @@ use std::time::{SystemTime, UNIX_EPOCH}; use tempfile::NamedTempFile; use tracing::{debug, info, trace}; use upend::addressing::{Address, Addressable}; -use upend::common::{build, APP_USER_AGENT}; +use upend::common::build; use upend::config::UpEndConfig; use upend::database::constants::{ADDED_ATTR, LABEL_ATTR}; use upend::database::entry::{Entry, EntryValue, InvariantEntry}; @@ -856,22 +857,12 @@ pub async fn get_info(state: web::Data) -> Result { } const MAX_EXTERNAL_SIZE: usize = 128_000_000; - #[tracing::instrument(skip(url), fields(url=%url))] async fn fetch_external(url: Url) -> Result<(bytes::Bytes, Option), actix_web::Error> { - let client = reqwest::Client::builder() - .user_agent(APP_USER_AGENT.as_str()) - .build() - .map_err(ErrorInternalServerError)?; - debug!("Fetching..."); - let response = client - .get(url) - .send() - .await - .map_err(ErrorInternalServerError)? - .error_for_status() + let response = web::block(|| REQWEST_CLIENT.get(url).send()) + .await? .map_err(ErrorInternalServerError)?; if let Some(content_length) = response.headers().get(reqwest::header::CONTENT_LENGTH) { @@ -899,7 +890,7 @@ async fn fetch_external(url: Url) -> Result<(bytes::Bytes, Option), acti .and_then(|cd| cd.get_filename().map(String::from)); debug!("Got filename: {filename:?}"); - let bytes = response.bytes().await.map_err(ErrorInternalServerError)?; + let bytes = response.bytes().map_err(ErrorInternalServerError)?; debug!("Got {} bytes.", bytes.len()); Ok((bytes, filename)) diff --git a/src/common.rs b/src/common.rs index 97dc734..1c5a80c 100644 --- a/src/common.rs +++ b/src/common.rs @@ -1,23 +1,3 @@ -use anyhow::{anyhow, Result}; -use shadow_rs::{is_debug, shadow}; +use shadow_rs::shadow; shadow!(build); - -pub fn get_static_dir>(dir: S) -> Result { - let cwd = std::env::current_exe()?.parent().unwrap().to_path_buf(); - let base_path = if is_debug() { - cwd.join("../../tmp/static") - } else { - cwd - }; - let result = base_path.join(dir.as_ref()); - if result.exists() { - Ok(result) - } else { - Err(anyhow!("Path {result:?} doesn't exist.")) - } -} -lazy_static! { - pub static ref APP_USER_AGENT: String = - format!("{} / {}", build::PROJECT_NAME, build::PKG_VERSION); -}