feat(backend): Add API endpoint for available extractors and their statuses
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful

This commit is contained in:
Tomáš Mládek 2024-07-29 19:33:58 +02:00
parent baa4c02014
commit 99f15dd584
11 changed files with 252 additions and 58 deletions

View file

@ -23,5 +23,9 @@
"use_new_terminal": false, "use_new_terminal": false,
"allow_concurrent_runs": false, "allow_concurrent_runs": false,
"reveal": "always" "reveal": "always"
},
{
"label": "cargo clippy",
"command": "cargo clippy --fix --allow-dirty --allow-staged"
} }
] ]

View file

@ -1,9 +1,9 @@
use lazy_static::lazy_static;
use serde::Serialize;
use shadow_rs::{is_debug, shadow};
use std::env::current_exe; use std::env::current_exe;
use std::path::PathBuf; use std::path::PathBuf;
use lazy_static::lazy_static;
use shadow_rs::{is_debug, shadow};
shadow!(build); shadow!(build);
lazy_static! { lazy_static! {
@ -32,3 +32,27 @@ lazy_static! {
pub fn get_version() -> &'static str { pub fn get_version() -> &'static str {
option_env!("UPEND_VERSION").unwrap_or("unknown") option_env!("UPEND_VERSION").unwrap_or("unknown")
} }
/// Something that can describe itself in user-facing terms.
///
/// Implementors must supply a [`name`](UserDescribable::name); the
/// description and icon are optional and default to `None`.
pub trait UserDescribable {
    /// Display name of the item.
    fn name(&self) -> &'static str;

    /// Optional longer description; `None` unless overridden.
    fn description(&self) -> Option<&'static str> {
        Option::None
    }

    /// Optional icon identifier; `None` unless overridden.
    fn icon(&self) -> Option<&'static str> {
        Option::None
    }

    /// Bundles `name`, `description` and `icon` into a single
    /// [`UserDescription`] value.
    fn full_description(&self) -> UserDescription {
        // Queried in the same order as the struct fields are declared.
        let name = self.name();
        let description = self.description();
        let icon = self.icon();

        UserDescription {
            name,
            description,
            icon,
        }
    }
}
/// Plain-data bundle of the user-facing information about an item, as
/// assembled by `UserDescribable::full_description`.
///
/// All fields are `'static` string slices, so the value borrows nothing from
/// the item it describes.
#[derive(Debug, Serialize)]
pub struct UserDescription {
/// Display name (the result of `UserDescribable::name`).
pub name: &'static str,
/// Optional longer description (the result of `UserDescribable::description`).
pub description: Option<&'static str>,
/// Optional icon (the result of `UserDescribable::icon`).
pub icon: Option<&'static str>,
}

View file

@ -1,9 +1,9 @@
use std::io::Write; use super::{Extractor, ExtractorGetResult, ExtractorStatus};
use std::sync::Arc; use crate::common::UserDescribable;
use super::{Extractor, ExtractorGetResult};
use anyhow::{anyhow, Result}; use anyhow::{anyhow, Result};
use lazy_static::lazy_static; use lazy_static::lazy_static;
use std::io::Write;
use std::sync::Arc;
use upend_base::{ use upend_base::{
addressing::Address, addressing::Address,
constants::{ATTR_IN, ATTR_KEY, ATTR_LABEL, ATTR_OF}, constants::{ATTR_IN, ATTR_KEY, ATTR_LABEL, ATTR_OF},
@ -163,7 +163,11 @@ impl Extractor for ID3Extractor {
} }
} }
fn is_needed(&self, address: &Address, connection: &UpEndConnection) -> Result<bool> { fn status_for(
&self,
address: &Address,
connection: &UpEndConnection,
) -> Result<super::ExtractorStatus> {
let is_audio = connection.retrieve_object(address)?.iter().any(|e| { let is_audio = connection.retrieve_object(address)?.iter().any(|e| {
if e.attribute == FILE_MIME_KEY { if e.attribute == FILE_MIME_KEY {
if let EntryValue::String(mime) = &e.value { if let EntryValue::String(mime) = &e.value {
@ -174,13 +178,23 @@ impl Extractor for ID3Extractor {
}); });
if !is_audio { if !is_audio {
return Ok(false); return Ok(ExtractorStatus::Unavailable);
} }
let is_extracted = !connection let is_extracted = !connection
.query(format!("(matches @{} (contains \"ID3\") ?)", address).parse()?)? .query(format!("(matches @{} (contains \"ID3\") ?)", address).parse()?)?
.is_empty(); .is_empty();
Ok(!is_extracted) Ok(ExtractorStatus::from_extracted(is_extracted))
}
}
impl UserDescribable for ID3Extractor {
fn name(&self) -> &'static str {
"ID3 Extractor"
}
fn description(&self) -> Option<&'static str> {
Some("Extracts ID3 tags from audio files (MP3, FLAC)")
} }
} }

View file

@ -1,8 +1,8 @@
use std::sync::Arc; use super::{Extractor, ExtractorGetResult, ExtractorStatus};
use crate::common::UserDescribable;
use super::{Extractor, ExtractorGetResult};
use anyhow::{anyhow, Result}; use anyhow::{anyhow, Result};
use lazy_static::lazy_static; use lazy_static::lazy_static;
use std::sync::Arc;
use upend_base::entry::Attribute; use upend_base::entry::Attribute;
use upend_base::{ use upend_base::{
addressing::Address, addressing::Address,
@ -146,7 +146,11 @@ impl Extractor for ExifExtractor {
} }
} }
fn is_needed(&self, address: &Address, connection: &UpEndConnection) -> Result<bool> { fn status_for(
&self,
address: &Address,
connection: &UpEndConnection,
) -> Result<ExtractorStatus> {
let is_exif = connection.retrieve_object(address)?.iter().any(|e| { let is_exif = connection.retrieve_object(address)?.iter().any(|e| {
if e.attribute == FILE_MIME_KEY { if e.attribute == FILE_MIME_KEY {
if let EntryValue::String(mime) = &e.value { if let EntryValue::String(mime) = &e.value {
@ -157,17 +161,23 @@ impl Extractor for ExifExtractor {
}); });
if !is_exif { if !is_exif {
return Ok(false); return Ok(ExtractorStatus::Unavailable);
} }
let is_extracted = !connection let is_extracted = !connection
.query(format!("(matches @{} (contains \"EXIF\") ?)", address).parse()?)? .query(format!("(matches @{} (contains \"EXIF\") ?)", address).parse()?)?
.is_empty(); .is_empty();
if is_extracted { Ok(ExtractorStatus::from_extracted(is_extracted))
return Ok(false); }
} }
Ok(true) impl UserDescribable for ExifExtractor {
fn name(&self) -> &'static str {
"EXIF Metadata Extractor"
}
fn description(&self) -> Option<&'static str> {
Some("Extracts EXIF metadata from image files.")
} }
} }

View file

@ -1,5 +1,6 @@
use crate::common::UserDescribable;
use crate::extractors::external::{process, ExternalCommand, ExternalCommandError}; use crate::extractors::external::{process, ExternalCommand, ExternalCommandError};
use crate::extractors::{Extractor, ExtractorGetResult}; use crate::extractors::{Extractor, ExtractorGetResult, ExtractorStatus};
use anyhow::Result; use anyhow::Result;
use regex::Regex; use regex::Regex;
use std::process::Command; use std::process::Command;
@ -73,20 +74,24 @@ impl Extractor for MonolithExtractor {
} }
} }
fn is_needed(&self, address: &Address, connection: &UpEndConnection) -> Result<bool> { fn status_for(
&self,
address: &Address,
connection: &UpEndConnection,
) -> Result<crate::extractors::ExtractorStatus> {
if !matches!(address, Address::Url(_)) { if !matches!(address, Address::Url(_)) {
return Ok(false); return Ok(ExtractorStatus::Unavailable);
} }
if self.get_version().is_err() { if self.get_version().is_err() {
return Ok(false); return Ok(ExtractorStatus::Unavailable);
} }
let is_extracted = !connection let is_extracted = !connection
.query(format!("(matches @{} (contains \"WM_ARCHIVED\") ?)", address).parse()?)? .query(format!("(matches @{} (contains \"WM_ARCHIVED\") ?)", address).parse()?)?
.is_empty(); .is_empty();
Ok(!is_extracted) Ok(ExtractorStatus::from_extracted(is_extracted))
} }
} }
@ -97,6 +102,16 @@ impl ExternalCommand for MonolithExtractor {
} }
} }
/// User-facing name and description of the monolith-based web archiver.
impl UserDescribable for MonolithExtractor {
    fn name(&self) -> &'static str {
        const NAME: &str = "Web Archiver (monolith)";
        NAME
    }

    fn description(&self) -> Option<&'static str> {
        const DESCRIPTION: &str = "Archives webpages using the `monolith` command-line tool, which saves a webpage as a single HTML file with all resources embedded.";
        Option::Some(DESCRIPTION)
    }
}
// #[cfg(test)] // #[cfg(test)]
// mod tests { // mod tests {
// use super::*; // use super::*;

View file

@ -1,5 +1,6 @@
use crate::common::UserDescribable;
use crate::extractors::external::{process, ExternalCommand, ExternalCommandError}; use crate::extractors::external::{process, ExternalCommand, ExternalCommandError};
use crate::extractors::{Extractor, ExtractorGetResult}; use crate::extractors::{Extractor, ExtractorGetResult, ExtractorStatus};
use anyhow::Result; use anyhow::Result;
use regex::Regex; use regex::Regex;
use std::io::{BufReader, Read}; use std::io::{BufReader, Read};
@ -208,20 +209,24 @@ impl Extractor for YtDlpExtractor {
} }
} }
fn is_needed(&self, address: &Address, connection: &UpEndConnection) -> Result<bool> { fn status_for(
&self,
address: &Address,
connection: &UpEndConnection,
) -> Result<crate::extractors::ExtractorStatus> {
if !matches!(address, Address::Url(_)) { if !matches!(address, Address::Url(_)) {
return Ok(false); return Ok(ExtractorStatus::Unavailable);
} }
if self.get_version().is_err() { if self.get_version().is_err() {
return Ok(false); return Ok(ExtractorStatus::Unavailable);
} }
let is_extracted = !connection let is_extracted = !connection
.query(format!("(matches @{} (contains \"YTDLD\") ?)", address).parse()?)? .query(format!("(matches @{} (contains \"YTDLD\") ?)", address).parse()?)?
.is_empty(); .is_empty();
Ok(!is_extracted) Ok(ExtractorStatus::from_extracted(is_extracted))
} }
} }
@ -232,6 +237,16 @@ impl ExternalCommand for YtDlpExtractor {
} }
} }
/// User-facing name and description of the yt-dlp based downloader.
impl UserDescribable for YtDlpExtractor {
    fn name(&self) -> &'static str {
        const NAME: &str = "yt-dlp downloader";
        NAME
    }

    fn description(&self) -> Option<&'static str> {
        const DESCRIPTION: &str = "Downloads media from a URL using yt-dlp";
        Option::Some(DESCRIPTION)
    }
}
const KNOWN_METADATA: [(&str, &str); 8] = [ const KNOWN_METADATA: [(&str, &str); 8] = [
("title", "Title"), ("title", "Title"),
("fulltitle", "Full Title"), ("fulltitle", "Full Title"),

View file

@ -1,8 +1,8 @@
use std::{process::Command, sync::Arc}; use super::{Extractor, ExtractorGetResult, ExtractorStatus};
use crate::common::UserDescribable;
use super::{Extractor, ExtractorGetResult};
use anyhow::{anyhow, Result}; use anyhow::{anyhow, Result};
use lazy_static::lazy_static; use lazy_static::lazy_static;
use std::{process::Command, sync::Arc};
use tracing::{debug, trace}; use tracing::{debug, trace};
use upend_base::{ use upend_base::{
addressing::Address, addressing::Address,
@ -127,7 +127,15 @@ impl Extractor for MediaExtractor {
} }
} }
fn is_needed(&self, address: &Address, connection: &UpEndConnection) -> Result<bool> { fn status_for(
&self,
address: &Address,
connection: &UpEndConnection,
) -> Result<super::ExtractorStatus> {
if !matches!(address, Address::Hash(_)) {
return Ok(ExtractorStatus::Unavailable);
}
let is_media = connection.retrieve_object(address)?.iter().any(|e| { let is_media = connection.retrieve_object(address)?.iter().any(|e| {
if e.attribute == FILE_MIME_KEY { if e.attribute == FILE_MIME_KEY {
if let EntryValue::String(mime) = &e.value { if let EntryValue::String(mime) = &e.value {
@ -146,17 +154,23 @@ impl Extractor for MediaExtractor {
}); });
if !is_media { if !is_media {
return Ok(false); return Ok(ExtractorStatus::Improbable);
} }
let is_extracted = !connection let is_extracted = !connection
.query(format!("(matches @{} (contains \"{}\") ?)", address, DURATION_KEY).parse()?)? .query(format!("(matches @{} (contains \"{}\") ?)", address, DURATION_KEY).parse()?)?
.is_empty(); .is_empty();
if is_extracted { Ok(ExtractorStatus::from_extracted(is_extracted))
return Ok(false); }
} }
Ok(true) impl UserDescribable for MediaExtractor {
fn name(&self) -> &'static str {
"Generic Media Extractor"
}
fn description(&self) -> Option<&'static str> {
Some("Extracts rudimentary metadata (duration) from media files.")
} }
} }

View file

@ -1,5 +1,7 @@
use crate::common::{UserDescribable, UserDescription};
use anyhow::Result; use anyhow::Result;
use rayon::iter::{IntoParallelRefIterator, ParallelIterator}; use rayon::iter::{IntoParallelRefIterator, ParallelIterator};
use serde::Serialize;
use std::{ use std::{
borrow::Borrow, borrow::Borrow,
sync::{Arc, Mutex, RwLock}, sync::{Arc, Mutex, RwLock},
@ -25,7 +27,7 @@ pub mod media;
#[cfg(feature = "extractors-external")] #[cfg(feature = "extractors-external")]
pub mod external; pub mod external;
pub trait Extractor: Send + Sync { pub trait Extractor: Send + Sync + UserDescribable {
fn get( fn get(
&self, &self,
address: &Address, address: &Address,
@ -35,13 +37,11 @@ pub trait Extractor: Send + Sync {
context: OperationContext, context: OperationContext,
) -> Result<ExtractorGetResult>; ) -> Result<ExtractorGetResult>;
fn is_applicable(&self, _address: &Address) -> bool { fn status_for(
true &self,
} _address: &Address,
_connection: &UpEndConnection,
fn is_needed(&self, _address: &Address, _connection: &UpEndConnection) -> Result<bool> { ) -> Result<ExtractorStatus>;
Ok(true)
}
fn insert_info( fn insert_info(
&self, &self,
@ -51,7 +51,7 @@ pub trait Extractor: Send + Sync {
job_container: JobContainer, job_container: JobContainer,
context: OperationContext, context: OperationContext,
) -> Result<ExtractorResult> { ) -> Result<ExtractorResult> {
if self.is_needed(address, connection)? { if self.status_for(address, connection)?.is_applicable() {
let ExtractorGetResult { let ExtractorGetResult {
entries, entries,
stored: inserted, stored: inserted,
@ -97,6 +97,32 @@ impl From<Vec<Entry>> for ExtractorGetResult {
} }
} }
/// Result of asking an extractor how it relates to a given address
/// (see `Extractor::status_for`).
///
/// Only `Needed` and `Applicable` count as applicable in
/// `ExtractorStatus::is_applicable`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize)]
pub enum ExtractorStatus {
/// The extractor cannot be used for this address. Also the fallback that
/// `ExtractorManager::statuses_for` reports when the status lookup fails.
Unavailable,
/// Not counted as applicable by `is_applicable`.
// NOTE(review): presumably "could run, but unlikely to yield anything" — confirm intended semantics.
Improbable,
/// What `from_extracted(false)` yields: nothing extracted yet.
Needed,
/// What `from_extracted(true)` yields: already extracted.
Applicable,
}
impl ExtractorStatus {
pub fn is_needed(&self) -> bool {
matches!(self, ExtractorStatus::Needed)
}
pub fn is_applicable(&self) -> bool {
matches!(self, ExtractorStatus::Applicable) || self.is_needed()
}
pub fn from_extracted(is_extracted: bool) -> Self {
if is_extracted {
ExtractorStatus::Applicable
} else {
ExtractorStatus::Needed
}
}
}
pub struct ExtractorManager { pub struct ExtractorManager {
extractors: Vec<(&'static str, Box<dyn Extractor>)>, extractors: Vec<(&'static str, Box<dyn Extractor>)>,
} }
@ -148,7 +174,12 @@ impl ExtractorManager {
trace!("Extracting metadata for {address:?}"); trace!("Extracting metadata for {address:?}");
for (name, extractor) in &self.extractors { for (name, extractor) in &self.extractors {
if extractor.is_applicable(address) { let status = extractor.status_for(address, connection);
if status.is_err() {
debug!("{name}: {status:?}");
continue;
}
if status.unwrap().is_applicable() {
trace!("Extracting with {name}"); trace!("Extracting with {name}");
let extract_result = extractor.insert_info( let extract_result = extractor.insert_info(
address, address,
@ -238,4 +269,22 @@ impl ExtractorManager {
Ok(result) Ok(result)
} }
pub fn statuses_for(
&self,
address: &Address,
connection: &UpEndConnection,
) -> Vec<(&'static str, UserDescription, ExtractorStatus)> {
self.extractors
.iter()
.map(|(name, extractor)| {
let status = extractor.status_for(address, connection);
(
*name,
extractor.full_description(),
status.unwrap_or(ExtractorStatus::Unavailable),
)
})
.collect()
}
} }

View file

@ -1,10 +1,10 @@
use std::sync::Arc; use super::ExtractorStatus;
use super::{Extractor, ExtractorGetResult}; use super::{Extractor, ExtractorGetResult};
use crate::common::UserDescribable;
use crate::common::REQWEST_CLIENT; use crate::common::REQWEST_CLIENT;
use anyhow::anyhow; use anyhow::anyhow;
use anyhow::Result; use anyhow::Result;
use std::sync::Arc;
use upend_base::addressing::Address; use upend_base::addressing::Address;
use upend_base::constants::ATTR_LABEL; use upend_base::constants::ATTR_LABEL;
use upend_base::constants::ATTR_OF; use upend_base::constants::ATTR_OF;
@ -146,13 +146,33 @@ impl Extractor for WebExtractor {
} }
} }
fn is_needed(&self, address: &Address, connection: &UpEndConnection) -> Result<bool> { fn status_for(
Ok(connection &self,
address: &Address,
connection: &UpEndConnection,
) -> Result<ExtractorStatus> {
if !matches!(address, Address::Url(_)) {
return Ok(ExtractorStatus::Unavailable);
}
let is_extracted = !connection
.query( .query(
format!(r#"(matches @{address} (in "HTML_TITLE" "HTML_DESCRIPTION") ?)"#) format!(r#"(matches @{address} (in "HTML_TITLE" "HTML_DESCRIPTION") ?)"#)
.parse()?, .parse()?,
)? )?
.is_empty()) .is_empty();
Ok(ExtractorStatus::from_extracted(is_extracted))
}
}
impl UserDescribable for WebExtractor {
fn name(&self) -> &'static str {
"Web Metadata Extractor"
}
fn description(&self) -> Option<&'static str> {
Some("Extracts basic metadata from web pages using OpenGraph and HTML tags.")
} }
} }
@ -188,7 +208,7 @@ mod test {
let job_container = JobContainer::new(); let job_container = JobContainer::new();
let address = Address::Url(Url::parse("https://upend.dev").unwrap()); let address = Address::Url(Url::parse("https://upend.dev").unwrap());
assert!(WebExtractor.is_needed(&address, &connection)?); assert!(WebExtractor.status_for(&address, &connection)?.is_needed());
WebExtractor.insert_info( WebExtractor.insert_info(
&address, &address,
@ -198,7 +218,7 @@ mod test {
OperationContext::default(), OperationContext::default(),
)?; )?;
assert!(!WebExtractor.is_needed(&address, &connection)?); assert!(!WebExtractor.status_for(&address, &connection)?.is_needed());
Ok(()) Ok(())
} }

View file

@ -870,6 +870,34 @@ pub async fn get_address(
Ok(response.json(format!("{}", address))) Ok(response.json(format!("{}", address)))
} }
/// `GET /api/obj/{address}/extractors`
///
/// After the authentication check, responds with a JSON array holding one
/// object per extractor known to the extractor manager. Each object has
/// the keys `id`, `name`, `description` and `status`. The description's
/// `icon` is not part of the response.
///
/// Fails with an internal server error if no database connection can be
/// obtained.
#[get("/api/obj/{address}/extractors")]
pub async fn get_extractors(
    req: HttpRequest,
    state: web::Data<State>,
    address: web::Path<Address>,
) -> Result<HttpResponse, Error> {
    check_auth(&req, &state)?;

    let connection = state.upend.connection().map_err(ErrorInternalServerError)?;
    let target = address.into_inner();

    let mut payload = Vec::new();
    for (id, description, status) in state.extractor_manager.statuses_for(&target, &connection) {
        payload.push(json!({
            "id": id,
            "name": description.name,
            "description": description.description,
            "status": status,
        }));
    }

    Ok(HttpResponse::Ok().json(payload))
}
#[get("/api/all/attributes")] #[get("/api/all/attributes")]
pub async fn get_all_attributes( pub async fn get_all_attributes(
req: HttpRequest, req: HttpRequest,

View file

@ -55,6 +55,7 @@ where
.service(routes::get_query) .service(routes::get_query)
.service(routes::get_object) .service(routes::get_object)
.service(routes::put_object) .service(routes::put_object)
.service(routes::get_extractors)
.service(routes::put_blob) .service(routes::put_blob)
.service(routes::put_object_attribute) .service(routes::put_object_attribute)
.service(routes::delete_object) .service(routes::delete_object)