feat(backend): Add API endpoint for available extractors and their statuses
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful

This commit is contained in:
Tomáš Mládek 2024-07-29 19:33:58 +02:00
parent baa4c02014
commit 99f15dd584
11 changed files with 252 additions and 58 deletions

View file

@ -23,5 +23,9 @@
"use_new_terminal": false,
"allow_concurrent_runs": false,
"reveal": "always"
},
{
"label": "cargo clippy",
"command": "cargo clippy --fix --allow-dirty --allow-staged"
}
]

View file

@ -1,9 +1,9 @@
use lazy_static::lazy_static;
use serde::Serialize;
use shadow_rs::{is_debug, shadow};
use std::env::current_exe;
use std::path::PathBuf;
use lazy_static::lazy_static;
use shadow_rs::{is_debug, shadow};
shadow!(build);
lazy_static! {
@ -32,3 +32,27 @@ lazy_static! {
/// Returns the version string captured from the `UPEND_VERSION` environment
/// variable at compile time, or `"unknown"` when it was not set during the build.
pub fn get_version() -> &'static str {
    match option_env!("UPEND_VERSION") {
        Some(version) => version,
        None => "unknown",
    }
}
/// Something that can present itself to a user: a mandatory name plus an
/// optional description and icon.
pub trait UserDescribable {
    /// Human-readable name; the only method implementors are required to provide.
    fn name(&self) -> &'static str;

    /// Longer explanatory text. Defaults to `None`.
    fn description(&self) -> Option<&'static str> {
        None
    }

    /// Icon identifier. Defaults to `None`.
    fn icon(&self) -> Option<&'static str> {
        None
    }

    /// Collects `name`, `description` and `icon` into a single
    /// [`UserDescription`] value.
    fn full_description(&self) -> UserDescription {
        let name = self.name();
        let description = self.description();
        let icon = self.icon();
        UserDescription {
            name,
            description,
            icon,
        }
    }
}
/// Serializable snapshot of the values a `UserDescribable` reports:
/// its name, and optionally a description and an icon.
#[derive(Debug, Serialize)]
pub struct UserDescription {
/// Human-readable name.
pub name: &'static str,
/// Optional longer explanation; `None` when the implementor provides none.
pub description: Option<&'static str>,
/// Optional icon identifier; `None` when the implementor provides none.
pub icon: Option<&'static str>,
}

View file

@ -1,9 +1,9 @@
use std::io::Write;
use std::sync::Arc;
use super::{Extractor, ExtractorGetResult};
use super::{Extractor, ExtractorGetResult, ExtractorStatus};
use crate::common::UserDescribable;
use anyhow::{anyhow, Result};
use lazy_static::lazy_static;
use std::io::Write;
use std::sync::Arc;
use upend_base::{
addressing::Address,
constants::{ATTR_IN, ATTR_KEY, ATTR_LABEL, ATTR_OF},
@ -163,7 +163,11 @@ impl Extractor for ID3Extractor {
}
}
fn is_needed(&self, address: &Address, connection: &UpEndConnection) -> Result<bool> {
fn status_for(
&self,
address: &Address,
connection: &UpEndConnection,
) -> Result<super::ExtractorStatus> {
let is_audio = connection.retrieve_object(address)?.iter().any(|e| {
if e.attribute == FILE_MIME_KEY {
if let EntryValue::String(mime) = &e.value {
@ -174,13 +178,23 @@ impl Extractor for ID3Extractor {
});
if !is_audio {
return Ok(false);
return Ok(ExtractorStatus::Unavailable);
}
let is_extracted = !connection
.query(format!("(matches @{} (contains \"ID3\") ?)", address).parse()?)?
.is_empty();
Ok(!is_extracted)
Ok(ExtractorStatus::from_extracted(is_extracted))
}
}
/// User-facing metadata for the ID3 extractor. `icon` is not overridden.
impl UserDescribable for ID3Extractor {
    /// Display name of this extractor.
    fn name(&self) -> &'static str {
        "ID3 Extractor"
    }

    /// One-line summary of what this extractor does.
    fn description(&self) -> Option<&'static str> {
        Some("Extracts ID3 tags from audio files (MP3, FLAC)")
    }
}

View file

@ -1,8 +1,8 @@
use std::sync::Arc;
use super::{Extractor, ExtractorGetResult};
use super::{Extractor, ExtractorGetResult, ExtractorStatus};
use crate::common::UserDescribable;
use anyhow::{anyhow, Result};
use lazy_static::lazy_static;
use std::sync::Arc;
use upend_base::entry::Attribute;
use upend_base::{
addressing::Address,
@ -146,7 +146,11 @@ impl Extractor for ExifExtractor {
}
}
fn is_needed(&self, address: &Address, connection: &UpEndConnection) -> Result<bool> {
fn status_for(
&self,
address: &Address,
connection: &UpEndConnection,
) -> Result<ExtractorStatus> {
let is_exif = connection.retrieve_object(address)?.iter().any(|e| {
if e.attribute == FILE_MIME_KEY {
if let EntryValue::String(mime) = &e.value {
@ -157,17 +161,23 @@ impl Extractor for ExifExtractor {
});
if !is_exif {
return Ok(false);
return Ok(ExtractorStatus::Unavailable);
}
let is_extracted = !connection
.query(format!("(matches @{} (contains \"EXIF\") ?)", address).parse()?)?
.is_empty();
if is_extracted {
return Ok(false);
}
Ok(true)
Ok(ExtractorStatus::from_extracted(is_extracted))
}
}
/// User-facing metadata for the EXIF extractor. `icon` is not overridden.
impl UserDescribable for ExifExtractor {
    /// Display name of this extractor.
    fn name(&self) -> &'static str {
        "EXIF Metadata Extractor"
    }

    /// One-line summary of what this extractor does.
    fn description(&self) -> Option<&'static str> {
        Some("Extracts EXIF metadata from image files.")
    }
}

View file

@ -1,5 +1,6 @@
use crate::common::UserDescribable;
use crate::extractors::external::{process, ExternalCommand, ExternalCommandError};
use crate::extractors::{Extractor, ExtractorGetResult};
use crate::extractors::{Extractor, ExtractorGetResult, ExtractorStatus};
use anyhow::Result;
use regex::Regex;
use std::process::Command;
@ -73,20 +74,24 @@ impl Extractor for MonolithExtractor {
}
}
fn is_needed(&self, address: &Address, connection: &UpEndConnection) -> Result<bool> {
fn status_for(
&self,
address: &Address,
connection: &UpEndConnection,
) -> Result<crate::extractors::ExtractorStatus> {
if !matches!(address, Address::Url(_)) {
return Ok(false);
return Ok(ExtractorStatus::Unavailable);
}
if self.get_version().is_err() {
return Ok(false);
return Ok(ExtractorStatus::Unavailable);
}
let is_extracted = !connection
.query(format!("(matches @{} (contains \"WM_ARCHIVED\") ?)", address).parse()?)?
.is_empty();
Ok(!is_extracted)
Ok(ExtractorStatus::from_extracted(is_extracted))
}
}
@ -97,6 +102,16 @@ impl ExternalCommand for MonolithExtractor {
}
}
/// User-facing metadata for the monolith-based web archiver. `icon` is not overridden.
impl UserDescribable for MonolithExtractor {
    /// Display name of this extractor.
    fn name(&self) -> &'static str {
        "Web Archiver (monolith)"
    }

    /// One-line summary of what this extractor does.
    fn description(&self) -> Option<&'static str> {
        Some("Archives webpages using the `monolith` command-line tool, which saves a webpage as a single HTML file with all resources embedded.")
    }
}
// #[cfg(test)]
// mod tests {
// use super::*;

View file

@ -1,5 +1,6 @@
use crate::common::UserDescribable;
use crate::extractors::external::{process, ExternalCommand, ExternalCommandError};
use crate::extractors::{Extractor, ExtractorGetResult};
use crate::extractors::{Extractor, ExtractorGetResult, ExtractorStatus};
use anyhow::Result;
use regex::Regex;
use std::io::{BufReader, Read};
@ -208,20 +209,24 @@ impl Extractor for YtDlpExtractor {
}
}
fn is_needed(&self, address: &Address, connection: &UpEndConnection) -> Result<bool> {
fn status_for(
&self,
address: &Address,
connection: &UpEndConnection,
) -> Result<crate::extractors::ExtractorStatus> {
if !matches!(address, Address::Url(_)) {
return Ok(false);
return Ok(ExtractorStatus::Unavailable);
}
if self.get_version().is_err() {
return Ok(false);
return Ok(ExtractorStatus::Unavailable);
}
let is_extracted = !connection
.query(format!("(matches @{} (contains \"YTDLD\") ?)", address).parse()?)?
.is_empty();
Ok(!is_extracted)
Ok(ExtractorStatus::from_extracted(is_extracted))
}
}
@ -232,6 +237,16 @@ impl ExternalCommand for YtDlpExtractor {
}
}
/// User-facing metadata for the yt-dlp downloader. `icon` is not overridden.
impl UserDescribable for YtDlpExtractor {
    /// Display name of this extractor.
    fn name(&self) -> &'static str {
        "yt-dlp downloader"
    }

    /// One-line summary of what this extractor does.
    fn description(&self) -> Option<&'static str> {
        Some("Downloads media from a URL using yt-dlp")
    }
}
const KNOWN_METADATA: [(&str, &str); 8] = [
("title", "Title"),
("fulltitle", "Full Title"),

View file

@ -1,8 +1,8 @@
use std::{process::Command, sync::Arc};
use super::{Extractor, ExtractorGetResult};
use super::{Extractor, ExtractorGetResult, ExtractorStatus};
use crate::common::UserDescribable;
use anyhow::{anyhow, Result};
use lazy_static::lazy_static;
use std::{process::Command, sync::Arc};
use tracing::{debug, trace};
use upend_base::{
addressing::Address,
@ -127,7 +127,15 @@ impl Extractor for MediaExtractor {
}
}
fn is_needed(&self, address: &Address, connection: &UpEndConnection) -> Result<bool> {
fn status_for(
&self,
address: &Address,
connection: &UpEndConnection,
) -> Result<super::ExtractorStatus> {
if !matches!(address, Address::Hash(_)) {
return Ok(ExtractorStatus::Unavailable);
}
let is_media = connection.retrieve_object(address)?.iter().any(|e| {
if e.attribute == FILE_MIME_KEY {
if let EntryValue::String(mime) = &e.value {
@ -146,17 +154,23 @@ impl Extractor for MediaExtractor {
});
if !is_media {
return Ok(false);
return Ok(ExtractorStatus::Improbable);
}
let is_extracted = !connection
.query(format!("(matches @{} (contains \"{}\") ?)", address, DURATION_KEY).parse()?)?
.is_empty();
if is_extracted {
return Ok(false);
}
Ok(true)
Ok(ExtractorStatus::from_extracted(is_extracted))
}
}
/// User-facing metadata for the generic media extractor. `icon` is not overridden.
impl UserDescribable for MediaExtractor {
    /// Display name of this extractor.
    fn name(&self) -> &'static str {
        "Generic Media Extractor"
    }

    /// One-line summary of what this extractor does.
    fn description(&self) -> Option<&'static str> {
        Some("Extracts rudimentary metadata (duration) from media files.")
    }
}

View file

@ -1,5 +1,7 @@
use crate::common::{UserDescribable, UserDescription};
use anyhow::Result;
use rayon::iter::{IntoParallelRefIterator, ParallelIterator};
use serde::Serialize;
use std::{
borrow::Borrow,
sync::{Arc, Mutex, RwLock},
@ -25,7 +27,7 @@ pub mod media;
#[cfg(feature = "extractors-external")]
pub mod external;
pub trait Extractor: Send + Sync {
pub trait Extractor: Send + Sync + UserDescribable {
fn get(
&self,
address: &Address,
@ -35,13 +37,11 @@ pub trait Extractor: Send + Sync {
context: OperationContext,
) -> Result<ExtractorGetResult>;
fn is_applicable(&self, _address: &Address) -> bool {
true
}
fn is_needed(&self, _address: &Address, _connection: &UpEndConnection) -> Result<bool> {
Ok(true)
}
fn status_for(
&self,
_address: &Address,
_connection: &UpEndConnection,
) -> Result<ExtractorStatus>;
fn insert_info(
&self,
@ -51,7 +51,7 @@ pub trait Extractor: Send + Sync {
job_container: JobContainer,
context: OperationContext,
) -> Result<ExtractorResult> {
if self.is_needed(address, connection)? {
if self.status_for(address, connection)?.is_applicable() {
let ExtractorGetResult {
entries,
stored: inserted,
@ -97,6 +97,32 @@ impl From<Vec<Entry>> for ExtractorGetResult {
}
}
/// Outcome of asking an extractor (via `status_for`) how it relates to a
/// given address. Serialized as part of the extractor listing API response.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize)]
pub enum ExtractorStatus {
/// The extractor cannot be used for this address; neither `is_needed` nor
/// `is_applicable` is true for this variant.
Unavailable,
/// Not counted as applicable by `is_applicable`.
/// NOTE(review): presumably "could run but is unlikely to yield anything" — confirm intended semantics.
Improbable,
/// The extractor applies and has not produced data yet
/// (what `from_extracted(false)` yields).
Needed,
/// The extractor applies and its data is already present
/// (what `from_extracted(true)` yields).
Applicable,
}
impl ExtractorStatus {
pub fn is_needed(&self) -> bool {
matches!(self, ExtractorStatus::Needed)
}
pub fn is_applicable(&self) -> bool {
matches!(self, ExtractorStatus::Applicable) || self.is_needed()
}
pub fn from_extracted(is_extracted: bool) -> Self {
if is_extracted {
ExtractorStatus::Applicable
} else {
ExtractorStatus::Needed
}
}
}
/// Owns the set of registered extractors.
pub struct ExtractorManager {
// Each entry pairs a static string name with the boxed extractor it refers to.
extractors: Vec<(&'static str, Box<dyn Extractor>)>,
}
@ -148,7 +174,12 @@ impl ExtractorManager {
trace!("Extracting metadata for {address:?}");
for (name, extractor) in &self.extractors {
if extractor.is_applicable(address) {
let status = extractor.status_for(address, connection);
if status.is_err() {
debug!("{name}: {status:?}");
continue;
}
if status.unwrap().is_applicable() {
trace!("Extracting with {name}");
let extract_result = extractor.insert_info(
address,
@ -238,4 +269,22 @@ impl ExtractorManager {
Ok(result)
}
pub fn statuses_for(
&self,
address: &Address,
connection: &UpEndConnection,
) -> Vec<(&'static str, UserDescription, ExtractorStatus)> {
self.extractors
.iter()
.map(|(name, extractor)| {
let status = extractor.status_for(address, connection);
(
*name,
extractor.full_description(),
status.unwrap_or(ExtractorStatus::Unavailable),
)
})
.collect()
}
}

View file

@ -1,10 +1,10 @@
use std::sync::Arc;
use super::ExtractorStatus;
use super::{Extractor, ExtractorGetResult};
use crate::common::UserDescribable;
use crate::common::REQWEST_CLIENT;
use anyhow::anyhow;
use anyhow::Result;
use std::sync::Arc;
use upend_base::addressing::Address;
use upend_base::constants::ATTR_LABEL;
use upend_base::constants::ATTR_OF;
@ -146,13 +146,33 @@ impl Extractor for WebExtractor {
}
}
fn is_needed(&self, address: &Address, connection: &UpEndConnection) -> Result<bool> {
Ok(connection
fn status_for(
&self,
address: &Address,
connection: &UpEndConnection,
) -> Result<ExtractorStatus> {
if !matches!(address, Address::Url(_)) {
return Ok(ExtractorStatus::Unavailable);
}
let is_extracted = !connection
.query(
format!(r#"(matches @{address} (in "HTML_TITLE" "HTML_DESCRIPTION") ?)"#)
.parse()?,
)?
.is_empty())
.is_empty();
Ok(ExtractorStatus::from_extracted(is_extracted))
}
}
/// User-facing metadata for the web metadata extractor. `icon` is not overridden.
impl UserDescribable for WebExtractor {
    /// Display name of this extractor.
    fn name(&self) -> &'static str {
        "Web Metadata Extractor"
    }

    /// One-line summary of what this extractor does.
    fn description(&self) -> Option<&'static str> {
        Some("Extracts basic metadata from web pages using OpenGraph and HTML tags.")
    }
}
@ -188,7 +208,7 @@ mod test {
let job_container = JobContainer::new();
let address = Address::Url(Url::parse("https://upend.dev").unwrap());
assert!(WebExtractor.is_needed(&address, &connection)?);
assert!(WebExtractor.status_for(&address, &connection)?.is_needed());
WebExtractor.insert_info(
&address,
@ -198,7 +218,7 @@ mod test {
OperationContext::default(),
)?;
assert!(!WebExtractor.is_needed(&address, &connection)?);
assert!(!WebExtractor.status_for(&address, &connection)?.is_needed());
Ok(())
}

View file

@ -870,6 +870,34 @@ pub async fn get_address(
Ok(response.json(format!("{}", address)))
}
/// Lists every registered extractor together with its status for the object
/// at `address`.
///
/// Responds with a JSON array whose elements carry `id` (the registered
/// name), `name`, `description` and `status`.
///
/// Fails if `check_auth` rejects the request, or if a database connection
/// cannot be obtained (mapped through `ErrorInternalServerError`).
#[get("/api/obj/{address}/extractors")]
pub async fn get_extractors(
    req: HttpRequest,
    state: web::Data<State>,
    address: web::Path<Address>,
) -> Result<HttpResponse, Error> {
    check_auth(&req, &state)?;

    let connection = state.upend.connection().map_err(ErrorInternalServerError)?;
    let target = address.into_inner();

    let mut payload = Vec::new();
    for (id, info, status) in state.extractor_manager.statuses_for(&target, &connection) {
        payload.push(json!({
            "id": id,
            "name": info.name,
            "description": info.description,
            "status": status,
        }));
    }

    Ok(HttpResponse::Ok().json(payload))
}
#[get("/api/all/attributes")]
pub async fn get_all_attributes(
req: HttpRequest,

View file

@ -55,6 +55,7 @@ where
.service(routes::get_query)
.service(routes::get_object)
.service(routes::put_object)
.service(routes::get_extractors)
.service(routes::put_blob)
.service(routes::put_object_attribute)
.service(routes::delete_object)