From 99f15dd5842531162c7460264fd73ff9b342fd6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Ml=C3=A1dek?= Date: Mon, 29 Jul 2024 19:33:58 +0200 Subject: [PATCH] feat(backend): Add API endpoint for available extractors and their statuses --- .zed/tasks.json | 4 ++ cli/src/common.rs | 30 +++++++++-- cli/src/extractors/audio.rs | 28 +++++++--- cli/src/extractors/exif.rs | 30 +++++++---- cli/src/extractors/external/monolith.rs | 25 +++++++-- cli/src/extractors/external/ytdlp.rs | 25 +++++++-- cli/src/extractors/media.rs | 34 ++++++++---- cli/src/extractors/mod.rs | 69 +++++++++++++++++++++---- cli/src/extractors/web.rs | 36 ++++++++++--- cli/src/routes.rs | 28 ++++++++++ cli/src/serve.rs | 1 + 11 files changed, 252 insertions(+), 58 deletions(-) diff --git a/.zed/tasks.json b/.zed/tasks.json index 692525a..d036c97 100644 --- a/.zed/tasks.json +++ b/.zed/tasks.json @@ -23,5 +23,9 @@ "use_new_terminal": false, "allow_concurrent_runs": false, "reveal": "always" + }, + { + "label": "cargo clippy", + "command": "cargo clippy --fix --allow-dirty --allow-staged" } ] diff --git a/cli/src/common.rs b/cli/src/common.rs index f95292a..035be99 100644 --- a/cli/src/common.rs +++ b/cli/src/common.rs @@ -1,9 +1,9 @@ +use lazy_static::lazy_static; +use serde::Serialize; +use shadow_rs::{is_debug, shadow}; use std::env::current_exe; use std::path::PathBuf; -use lazy_static::lazy_static; -use shadow_rs::{is_debug, shadow}; - shadow!(build); lazy_static! { @@ -32,3 +32,27 @@ lazy_static! { pub fn get_version() -> &'static str { option_env!("UPEND_VERSION").unwrap_or("unknown") } + +pub trait UserDescribable { + fn name(&self) -> &'static str; + fn description(&self) -> Option<&'static str> { + None + } + fn icon(&self) -> Option<&'static str> { + None + } + fn full_description(&self) -> UserDescription { + UserDescription { + name: self.name(), + description: self.description(), + icon: self.icon(), + } + } +} + +#[derive(Debug, Serialize)] +pub struct UserDescription { + pub name: &'static str, + pub description: Option<&'static str>, + pub icon: Option<&'static str>, +} diff --git a/cli/src/extractors/audio.rs b/cli/src/extractors/audio.rs index 3a838fa..58fb6e2 100644 --- a/cli/src/extractors/audio.rs +++ b/cli/src/extractors/audio.rs @@ -1,9 +1,9 @@ -use std::io::Write; -use std::sync::Arc; - -use super::{Extractor, ExtractorGetResult}; +use super::{Extractor, ExtractorGetResult, ExtractorStatus}; +use crate::common::UserDescribable; use anyhow::{anyhow, Result}; use lazy_static::lazy_static; +use std::io::Write; +use std::sync::Arc; use upend_base::{ addressing::Address, constants::{ATTR_IN, ATTR_KEY, ATTR_LABEL, ATTR_OF}, @@ -163,7 +163,11 @@ impl Extractor for ID3Extractor { } } - fn is_needed(&self, address: &Address, connection: &UpEndConnection) -> Result { + fn status_for( + &self, + address: &Address, + connection: &UpEndConnection, + ) -> Result { let is_audio = connection.retrieve_object(address)?.iter().any(|e| { if e.attribute == FILE_MIME_KEY { if let EntryValue::String(mime) = &e.value { @@ -174,13 +178,23 @@ impl Extractor for ID3Extractor { }); if !is_audio { - return Ok(false); + return Ok(ExtractorStatus::Unavailable); } let is_extracted = !connection .query(format!("(matches @{} (contains \"ID3\") ?)", address).parse()?)? .is_empty(); - Ok(!is_extracted) + Ok(ExtractorStatus::from_extracted(is_extracted)) + } +} + +impl UserDescribable for ID3Extractor { + fn name(&self) -> &'static str { + "ID3 Extractor" + } + + fn description(&self) -> Option<&'static str> { + Some("Extracts ID3 tags from audio files (MP3, FLAC)") } } diff --git a/cli/src/extractors/exif.rs b/cli/src/extractors/exif.rs index 9d8e213..ec5f558 100644 --- a/cli/src/extractors/exif.rs +++ b/cli/src/extractors/exif.rs @@ -1,8 +1,8 @@ -use std::sync::Arc; - -use super::{Extractor, ExtractorGetResult}; +use super::{Extractor, ExtractorGetResult, ExtractorStatus}; +use crate::common::UserDescribable; use anyhow::{anyhow, Result}; use lazy_static::lazy_static; +use std::sync::Arc; use upend_base::entry::Attribute; use upend_base::{ addressing::Address, @@ -146,7 +146,11 @@ impl Extractor for ExifExtractor { } } - fn is_needed(&self, address: &Address, connection: &UpEndConnection) -> Result { + fn status_for( + &self, + address: &Address, + connection: &UpEndConnection, + ) -> Result { let is_exif = connection.retrieve_object(address)?.iter().any(|e| { if e.attribute == FILE_MIME_KEY { if let EntryValue::String(mime) = &e.value { @@ -157,17 +161,23 @@ impl Extractor for ExifExtractor { }); if !is_exif { - return Ok(false); + return Ok(ExtractorStatus::Unavailable); } let is_extracted = !connection .query(format!("(matches @{} (contains \"EXIF\") ?)", address).parse()?)? .is_empty(); - if is_extracted { - return Ok(false); - } - - Ok(true) + Ok(ExtractorStatus::from_extracted(is_extracted)) + } +} + +impl UserDescribable for ExifExtractor { + fn name(&self) -> &'static str { + "EXIF Metadata Extractor" + } + + fn description(&self) -> Option<&'static str> { + Some("Extracts EXIF metadata from image files.") } } diff --git a/cli/src/extractors/external/monolith.rs b/cli/src/extractors/external/monolith.rs index 4dab57d..f79bf0c 100644 --- a/cli/src/extractors/external/monolith.rs +++ b/cli/src/extractors/external/monolith.rs @@ -1,5 +1,6 @@ +use crate::common::UserDescribable; use crate::extractors::external::{process, ExternalCommand, ExternalCommandError}; -use crate::extractors::{Extractor, ExtractorGetResult}; +use crate::extractors::{Extractor, ExtractorGetResult, ExtractorStatus}; use anyhow::Result; use regex::Regex; use std::process::Command; @@ -73,20 +74,24 @@ impl Extractor for MonolithExtractor { } } - fn is_needed(&self, address: &Address, connection: &UpEndConnection) -> Result { + fn status_for( + &self, + address: &Address, + connection: &UpEndConnection, + ) -> Result { if !matches!(address, Address::Url(_)) { - return Ok(false); + return Ok(ExtractorStatus::Unavailable); } if self.get_version().is_err() { - return Ok(false); + return Ok(ExtractorStatus::Unavailable); } let is_extracted = !connection .query(format!("(matches @{} (contains \"WM_ARCHIVED\") ?)", address).parse()?)? .is_empty(); - Ok(!is_extracted) + Ok(ExtractorStatus::from_extracted(is_extracted)) } } @@ -97,6 +102,16 @@ impl ExternalCommand for MonolithExtractor { } } +impl UserDescribable for MonolithExtractor { + fn name(&self) -> &'static str { + "Web Archiver (monolith)" + } + + fn description(&self) -> Option<&'static str> { + Some("Archives webpages using the `monolith` command-line tool, which saves a webpage as a single HTML file with all resources embedded.") + } +} + // #[cfg(test)] // mod tests { // use super::*; diff --git a/cli/src/extractors/external/ytdlp.rs b/cli/src/extractors/external/ytdlp.rs index a79ee3b..a01c616 100644 --- a/cli/src/extractors/external/ytdlp.rs +++ b/cli/src/extractors/external/ytdlp.rs @@ -1,5 +1,6 @@ +use crate::common::UserDescribable; use crate::extractors::external::{process, ExternalCommand, ExternalCommandError}; -use crate::extractors::{Extractor, ExtractorGetResult}; +use crate::extractors::{Extractor, ExtractorGetResult, ExtractorStatus}; use anyhow::Result; use regex::Regex; use std::io::{BufReader, Read}; @@ -208,20 +209,24 @@ impl Extractor for YtDlpExtractor { } } - fn is_needed(&self, address: &Address, connection: &UpEndConnection) -> Result { + fn status_for( + &self, + address: &Address, + connection: &UpEndConnection, + ) -> Result { if !matches!(address, Address::Url(_)) { - return Ok(false); + return Ok(ExtractorStatus::Unavailable); } if self.get_version().is_err() { - return Ok(false); + return Ok(ExtractorStatus::Unavailable); } let is_extracted = !connection .query(format!("(matches @{} (contains \"YTDLD\") ?)", address).parse()?)? .is_empty(); - Ok(!is_extracted) + Ok(ExtractorStatus::from_extracted(is_extracted)) } } @@ -232,6 +237,16 @@ impl ExternalCommand for YtDlpExtractor { } } +impl UserDescribable for YtDlpExtractor { + fn name(&self) -> &'static str { + "yt-dlp downloader" + } + + fn description(&self) -> Option<&'static str> { + Some("Downloads media from a URL using yt-dlp") + } +} + const KNOWN_METADATA: [(&str, &str); 8] = [ ("title", "Title"), ("fulltitle", "Full Title"), diff --git a/cli/src/extractors/media.rs b/cli/src/extractors/media.rs index ea3b939..d41cd49 100644 --- a/cli/src/extractors/media.rs +++ b/cli/src/extractors/media.rs @@ -1,8 +1,8 @@ -use std::{process::Command, sync::Arc}; - -use super::{Extractor, ExtractorGetResult}; +use super::{Extractor, ExtractorGetResult, ExtractorStatus}; +use crate::common::UserDescribable; use anyhow::{anyhow, Result}; use lazy_static::lazy_static; +use std::{process::Command, sync::Arc}; use tracing::{debug, trace}; use upend_base::{ addressing::Address, @@ -127,7 +127,15 @@ impl Extractor for MediaExtractor { } } - fn is_needed(&self, address: &Address, connection: &UpEndConnection) -> Result { + fn status_for( + &self, + address: &Address, + connection: &UpEndConnection, + ) -> Result { + if !matches!(address, Address::Hash(_)) { + return Ok(ExtractorStatus::Unavailable); + } + let is_media = connection.retrieve_object(address)?.iter().any(|e| { if e.attribute == FILE_MIME_KEY { if let EntryValue::String(mime) = &e.value { @@ -146,17 +154,23 @@ impl Extractor for MediaExtractor { }); if !is_media { - return Ok(false); + return Ok(ExtractorStatus::Improbable); } let is_extracted = !connection .query(format!("(matches @{} (contains \"{}\") ?)", address, DURATION_KEY).parse()?)? .is_empty(); - if is_extracted { - return Ok(false); - } - - Ok(true) + Ok(ExtractorStatus::from_extracted(is_extracted)) + } +} + +impl UserDescribable for MediaExtractor { + fn name(&self) -> &'static str { + "Generic Media Extractor" + } + + fn description(&self) -> Option<&'static str> { + Some("Extracts rudimentary metadata (duration) from media files.") } } diff --git a/cli/src/extractors/mod.rs b/cli/src/extractors/mod.rs index 85eb36f..d55631d 100644 --- a/cli/src/extractors/mod.rs +++ b/cli/src/extractors/mod.rs @@ -1,5 +1,7 @@ +use crate::common::{UserDescribable, UserDescription}; use anyhow::Result; use rayon::iter::{IntoParallelRefIterator, ParallelIterator}; +use serde::Serialize; use std::{ borrow::Borrow, sync::{Arc, Mutex, RwLock}, @@ -25,7 +27,7 @@ pub mod media; #[cfg(feature = "extractors-external")] pub mod external; -pub trait Extractor: Send + Sync { +pub trait Extractor: Send + Sync + UserDescribable { fn get( &self, address: &Address, @@ -35,13 +37,11 @@ pub trait Extractor: Send + Sync { context: OperationContext, ) -> Result; - fn is_applicable(&self, _address: &Address) -> bool { - true - } - - fn is_needed(&self, _address: &Address, _connection: &UpEndConnection) -> Result { - Ok(true) - } + fn status_for( + &self, + _address: &Address, + _connection: &UpEndConnection, + ) -> Result; fn insert_info( &self, @@ -51,7 +51,7 @@ pub trait Extractor: Send + Sync { job_container: JobContainer, context: OperationContext, ) -> Result { - if self.is_needed(address, connection)? { + if self.status_for(address, connection)?.is_applicable() { let ExtractorGetResult { entries, stored: inserted, @@ -97,6 +97,32 @@ impl From> for ExtractorGetResult { } } +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize)] +pub enum ExtractorStatus { + Unavailable, + Improbable, + Needed, + Applicable, +} + +impl ExtractorStatus { + pub fn is_needed(&self) -> bool { + matches!(self, ExtractorStatus::Needed) + } + + pub fn is_applicable(&self) -> bool { + matches!(self, ExtractorStatus::Applicable) || self.is_needed() + } + + pub fn from_extracted(is_extracted: bool) -> Self { + if is_extracted { + ExtractorStatus::Applicable + } else { + ExtractorStatus::Needed + } + } +} + pub struct ExtractorManager { extractors: Vec<(&'static str, Box)>, } @@ -148,7 +174,12 @@ impl ExtractorManager { trace!("Extracting metadata for {address:?}"); for (name, extractor) in &self.extractors { - if extractor.is_applicable(address) { + let status = extractor.status_for(address, connection); + if status.is_err() { + debug!("{name}: {status:?}"); + continue; + } + if status.unwrap().is_applicable() { trace!("Extracting with {name}"); let extract_result = extractor.insert_info( address, @@ -238,4 +269,22 @@ impl ExtractorManager { Ok(result) } + + pub fn statuses_for( + &self, + address: &Address, + connection: &UpEndConnection, + ) -> Vec<(&'static str, UserDescription, ExtractorStatus)> { + self.extractors + .iter() + .map(|(name, extractor)| { + let status = extractor.status_for(address, connection); + ( + *name, + extractor.full_description(), + status.unwrap_or(ExtractorStatus::Unavailable), + ) + }) + .collect() + } } diff --git a/cli/src/extractors/web.rs b/cli/src/extractors/web.rs index 314a4cd..9f971a4 100644 --- a/cli/src/extractors/web.rs +++ b/cli/src/extractors/web.rs @@ -1,10 +1,10 @@ -use std::sync::Arc; - +use super::ExtractorStatus; use super::{Extractor, ExtractorGetResult}; +use crate::common::UserDescribable; use crate::common::REQWEST_CLIENT; use anyhow::anyhow; use anyhow::Result; - +use std::sync::Arc; use upend_base::addressing::Address; use upend_base::constants::ATTR_LABEL; use upend_base::constants::ATTR_OF; @@ -146,13 +146,33 @@ impl Extractor for WebExtractor { } } - fn is_needed(&self, address: &Address, connection: &UpEndConnection) -> Result { - Ok(connection + fn status_for( + &self, + address: &Address, + connection: &UpEndConnection, + ) -> Result { + if !matches!(address, Address::Url(_)) { + return Ok(ExtractorStatus::Unavailable); + } + + let is_extracted = !connection .query( format!(r#"(matches @{address} (in "HTML_TITLE" "HTML_DESCRIPTION") ?)"#) .parse()?, )? - .is_empty()) + .is_empty(); + + Ok(ExtractorStatus::from_extracted(is_extracted)) + } +} + +impl UserDescribable for WebExtractor { + fn name(&self) -> &'static str { + "Web Metadata Extractor" + } + + fn description(&self) -> Option<&'static str> { + Some("Extracts basic metadata from web pages using OpenGraph and HTML tags.") } } @@ -188,7 +208,7 @@ mod test { let job_container = JobContainer::new(); let address = Address::Url(Url::parse("https://upend.dev").unwrap()); - assert!(WebExtractor.is_needed(&address, &connection)?); + assert!(WebExtractor.status_for(&address, &connection)?.is_needed()); WebExtractor.insert_info( &address, @@ -198,7 +218,7 @@ mod test { OperationContext::default(), )?; - assert!(!WebExtractor.is_needed(&address, &connection)?); + assert!(!WebExtractor.status_for(&address, &connection)?.is_needed()); Ok(()) } diff --git a/cli/src/routes.rs b/cli/src/routes.rs index 065cf6d..694544f 100644 --- a/cli/src/routes.rs +++ b/cli/src/routes.rs @@ -870,6 +870,34 @@ pub async fn get_address( Ok(response.json(format!("{}", address))) } +#[get("/api/obj/{address}/extractors")] +pub async fn get_extractors( + req: HttpRequest, + state: web::Data, + address: web::Path
, +) -> Result { + check_auth(&req, &state)?; + + let connection = state.upend.connection().map_err(ErrorInternalServerError)?; + let statuses = state + .extractor_manager + .statuses_for(&address.into_inner(), &connection); + + Ok(HttpResponse::Ok().json( + statuses + .into_iter() + .map(|(name, info, status)| { + json!({ + "id": name, + "name": info.name, + "description": info.description, + "status": status, + }) + }) + .collect::>(), + )) +} + #[get("/api/all/attributes")] pub async fn get_all_attributes( req: HttpRequest, diff --git a/cli/src/serve.rs b/cli/src/serve.rs index 8227453..ca3d9f0 100644 --- a/cli/src/serve.rs +++ b/cli/src/serve.rs @@ -55,6 +55,7 @@ where .service(routes::get_query) .service(routes::get_object) .service(routes::put_object) + .service(routes::get_extractors) .service(routes::put_blob) .service(routes::put_object_attribute) .service(routes::delete_object)