feat(backend): Add API endpoint for available extractors and their statuses
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful
This commit is contained in:
parent
baa4c02014
commit
99f15dd584
11 changed files with 252 additions and 58 deletions
|
@ -23,5 +23,9 @@
|
|||
"use_new_terminal": false,
|
||||
"allow_concurrent_runs": false,
|
||||
"reveal": "always"
|
||||
},
|
||||
{
|
||||
"label": "cargo clippy",
|
||||
"command": "cargo clippy --fix --allow-dirty --allow-staged"
|
||||
}
|
||||
]
|
||||
|
|
|
@ -1,9 +1,9 @@
|
|||
use lazy_static::lazy_static;
|
||||
use serde::Serialize;
|
||||
use shadow_rs::{is_debug, shadow};
|
||||
use std::env::current_exe;
|
||||
use std::path::PathBuf;
|
||||
|
||||
use lazy_static::lazy_static;
|
||||
use shadow_rs::{is_debug, shadow};
|
||||
|
||||
shadow!(build);
|
||||
|
||||
lazy_static! {
|
||||
|
@ -32,3 +32,27 @@ lazy_static! {
|
|||
/// Returns the application version string.
///
/// The value is taken from the `UPEND_VERSION` environment variable as it was
/// set when the crate was compiled; if it was not set at build time, the
/// literal `"unknown"` is returned instead.
pub fn get_version() -> &'static str {
    match option_env!("UPEND_VERSION") {
        Some(version) => version,
        None => "unknown",
    }
}
|
||||
|
||||
pub trait UserDescribable {
|
||||
fn name(&self) -> &'static str;
|
||||
fn description(&self) -> Option<&'static str> {
|
||||
None
|
||||
}
|
||||
fn icon(&self) -> Option<&'static str> {
|
||||
None
|
||||
}
|
||||
fn full_description(&self) -> UserDescription {
|
||||
UserDescription {
|
||||
name: self.name(),
|
||||
description: self.description(),
|
||||
icon: self.icon(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Serializable bundle of the user-facing metadata of an item, as assembled
/// by `UserDescribable::full_description`.
#[derive(Debug, Serialize)]
|
||||
pub struct UserDescription {
|
||||
/// Display name of the item.
pub name: &'static str,
|
||||
/// Optional longer description of the item.
pub description: Option<&'static str>,
|
||||
/// Optional icon of the item.
// NOTE(review): the expected format of this value (icon name, path, ...)
// is not established here — confirm against the consumers.
pub icon: Option<&'static str>,
|
||||
}
|
||||
|
|
|
@ -1,9 +1,9 @@
|
|||
use std::io::Write;
|
||||
use std::sync::Arc;
|
||||
|
||||
use super::{Extractor, ExtractorGetResult};
|
||||
use super::{Extractor, ExtractorGetResult, ExtractorStatus};
|
||||
use crate::common::UserDescribable;
|
||||
use anyhow::{anyhow, Result};
|
||||
use lazy_static::lazy_static;
|
||||
use std::io::Write;
|
||||
use std::sync::Arc;
|
||||
use upend_base::{
|
||||
addressing::Address,
|
||||
constants::{ATTR_IN, ATTR_KEY, ATTR_LABEL, ATTR_OF},
|
||||
|
@ -163,7 +163,11 @@ impl Extractor for ID3Extractor {
|
|||
}
|
||||
}
|
||||
|
||||
fn is_needed(&self, address: &Address, connection: &UpEndConnection) -> Result<bool> {
|
||||
fn status_for(
|
||||
&self,
|
||||
address: &Address,
|
||||
connection: &UpEndConnection,
|
||||
) -> Result<super::ExtractorStatus> {
|
||||
let is_audio = connection.retrieve_object(address)?.iter().any(|e| {
|
||||
if e.attribute == FILE_MIME_KEY {
|
||||
if let EntryValue::String(mime) = &e.value {
|
||||
|
@ -174,13 +178,23 @@ impl Extractor for ID3Extractor {
|
|||
});
|
||||
|
||||
if !is_audio {
|
||||
return Ok(false);
|
||||
return Ok(ExtractorStatus::Unavailable);
|
||||
}
|
||||
|
||||
let is_extracted = !connection
|
||||
.query(format!("(matches @{} (contains \"ID3\") ?)", address).parse()?)?
|
||||
.is_empty();
|
||||
|
||||
Ok(!is_extracted)
|
||||
Ok(ExtractorStatus::from_extracted(is_extracted))
|
||||
}
|
||||
}
|
||||
|
||||
/// User-facing metadata for the ID3 extractor; no icon is provided.
impl UserDescribable for ID3Extractor {
    /// Display name of this extractor.
    fn name(&self) -> &'static str {
        let label = "ID3 Extractor";
        label
    }

    /// One-line summary of what this extractor does.
    fn description(&self) -> Option<&'static str> {
        let summary = "Extracts ID3 tags from audio files (MP3, FLAC)";
        Some(summary)
    }
}
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
use std::sync::Arc;
|
||||
|
||||
use super::{Extractor, ExtractorGetResult};
|
||||
use super::{Extractor, ExtractorGetResult, ExtractorStatus};
|
||||
use crate::common::UserDescribable;
|
||||
use anyhow::{anyhow, Result};
|
||||
use lazy_static::lazy_static;
|
||||
use std::sync::Arc;
|
||||
use upend_base::entry::Attribute;
|
||||
use upend_base::{
|
||||
addressing::Address,
|
||||
|
@ -146,7 +146,11 @@ impl Extractor for ExifExtractor {
|
|||
}
|
||||
}
|
||||
|
||||
fn is_needed(&self, address: &Address, connection: &UpEndConnection) -> Result<bool> {
|
||||
fn status_for(
|
||||
&self,
|
||||
address: &Address,
|
||||
connection: &UpEndConnection,
|
||||
) -> Result<ExtractorStatus> {
|
||||
let is_exif = connection.retrieve_object(address)?.iter().any(|e| {
|
||||
if e.attribute == FILE_MIME_KEY {
|
||||
if let EntryValue::String(mime) = &e.value {
|
||||
|
@ -157,17 +161,23 @@ impl Extractor for ExifExtractor {
|
|||
});
|
||||
|
||||
if !is_exif {
|
||||
return Ok(false);
|
||||
return Ok(ExtractorStatus::Unavailable);
|
||||
}
|
||||
|
||||
let is_extracted = !connection
|
||||
.query(format!("(matches @{} (contains \"EXIF\") ?)", address).parse()?)?
|
||||
.is_empty();
|
||||
|
||||
if is_extracted {
|
||||
return Ok(false);
|
||||
}
|
||||
|
||||
Ok(true)
|
||||
Ok(ExtractorStatus::from_extracted(is_extracted))
|
||||
}
|
||||
}
|
||||
|
||||
/// User-facing metadata for the EXIF extractor; no icon is provided.
impl UserDescribable for ExifExtractor {
    /// Display name of this extractor.
    fn name(&self) -> &'static str {
        let label = "EXIF Metadata Extractor";
        label
    }

    /// One-line summary of what this extractor does.
    fn description(&self) -> Option<&'static str> {
        let summary = "Extracts EXIF metadata from image files.";
        Some(summary)
    }
}
|
||||
|
|
25
cli/src/extractors/external/monolith.rs
vendored
25
cli/src/extractors/external/monolith.rs
vendored
|
@ -1,5 +1,6 @@
|
|||
use crate::common::UserDescribable;
|
||||
use crate::extractors::external::{process, ExternalCommand, ExternalCommandError};
|
||||
use crate::extractors::{Extractor, ExtractorGetResult};
|
||||
use crate::extractors::{Extractor, ExtractorGetResult, ExtractorStatus};
|
||||
use anyhow::Result;
|
||||
use regex::Regex;
|
||||
use std::process::Command;
|
||||
|
@ -73,20 +74,24 @@ impl Extractor for MonolithExtractor {
|
|||
}
|
||||
}
|
||||
|
||||
fn is_needed(&self, address: &Address, connection: &UpEndConnection) -> Result<bool> {
|
||||
fn status_for(
|
||||
&self,
|
||||
address: &Address,
|
||||
connection: &UpEndConnection,
|
||||
) -> Result<crate::extractors::ExtractorStatus> {
|
||||
if !matches!(address, Address::Url(_)) {
|
||||
return Ok(false);
|
||||
return Ok(ExtractorStatus::Unavailable);
|
||||
}
|
||||
|
||||
if self.get_version().is_err() {
|
||||
return Ok(false);
|
||||
return Ok(ExtractorStatus::Unavailable);
|
||||
}
|
||||
|
||||
let is_extracted = !connection
|
||||
.query(format!("(matches @{} (contains \"WM_ARCHIVED\") ?)", address).parse()?)?
|
||||
.is_empty();
|
||||
|
||||
Ok(!is_extracted)
|
||||
Ok(ExtractorStatus::from_extracted(is_extracted))
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -97,6 +102,16 @@ impl ExternalCommand for MonolithExtractor {
|
|||
}
|
||||
}
|
||||
|
||||
/// User-facing metadata for the `monolith`-based web archiver; no icon is
/// provided.
impl UserDescribable for MonolithExtractor {
    /// Display name of this extractor.
    fn name(&self) -> &'static str {
        let label = "Web Archiver (monolith)";
        label
    }

    /// One-line summary of what this extractor does.
    fn description(&self) -> Option<&'static str> {
        let summary = "Archives webpages using the `monolith` command-line tool, which saves a webpage as a single HTML file with all resources embedded.";
        Some(summary)
    }
}
|
||||
|
||||
// #[cfg(test)]
|
||||
// mod tests {
|
||||
// use super::*;
|
||||
|
|
25
cli/src/extractors/external/ytdlp.rs
vendored
25
cli/src/extractors/external/ytdlp.rs
vendored
|
@ -1,5 +1,6 @@
|
|||
use crate::common::UserDescribable;
|
||||
use crate::extractors::external::{process, ExternalCommand, ExternalCommandError};
|
||||
use crate::extractors::{Extractor, ExtractorGetResult};
|
||||
use crate::extractors::{Extractor, ExtractorGetResult, ExtractorStatus};
|
||||
use anyhow::Result;
|
||||
use regex::Regex;
|
||||
use std::io::{BufReader, Read};
|
||||
|
@ -208,20 +209,24 @@ impl Extractor for YtDlpExtractor {
|
|||
}
|
||||
}
|
||||
|
||||
fn is_needed(&self, address: &Address, connection: &UpEndConnection) -> Result<bool> {
|
||||
fn status_for(
|
||||
&self,
|
||||
address: &Address,
|
||||
connection: &UpEndConnection,
|
||||
) -> Result<crate::extractors::ExtractorStatus> {
|
||||
if !matches!(address, Address::Url(_)) {
|
||||
return Ok(false);
|
||||
return Ok(ExtractorStatus::Unavailable);
|
||||
}
|
||||
|
||||
if self.get_version().is_err() {
|
||||
return Ok(false);
|
||||
return Ok(ExtractorStatus::Unavailable);
|
||||
}
|
||||
|
||||
let is_extracted = !connection
|
||||
.query(format!("(matches @{} (contains \"YTDLD\") ?)", address).parse()?)?
|
||||
.is_empty();
|
||||
|
||||
Ok(!is_extracted)
|
||||
Ok(ExtractorStatus::from_extracted(is_extracted))
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -232,6 +237,16 @@ impl ExternalCommand for YtDlpExtractor {
|
|||
}
|
||||
}
|
||||
|
||||
/// User-facing metadata for the yt-dlp based downloader; no icon is provided.
impl UserDescribable for YtDlpExtractor {
    /// Display name of this extractor.
    fn name(&self) -> &'static str {
        let label = "yt-dlp downloader";
        label
    }

    /// One-line summary of what this extractor does.
    fn description(&self) -> Option<&'static str> {
        let summary = "Downloads media from a URL using yt-dlp";
        Some(summary)
    }
}
|
||||
|
||||
const KNOWN_METADATA: [(&str, &str); 8] = [
|
||||
("title", "Title"),
|
||||
("fulltitle", "Full Title"),
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
use std::{process::Command, sync::Arc};
|
||||
|
||||
use super::{Extractor, ExtractorGetResult};
|
||||
use super::{Extractor, ExtractorGetResult, ExtractorStatus};
|
||||
use crate::common::UserDescribable;
|
||||
use anyhow::{anyhow, Result};
|
||||
use lazy_static::lazy_static;
|
||||
use std::{process::Command, sync::Arc};
|
||||
use tracing::{debug, trace};
|
||||
use upend_base::{
|
||||
addressing::Address,
|
||||
|
@ -127,7 +127,15 @@ impl Extractor for MediaExtractor {
|
|||
}
|
||||
}
|
||||
|
||||
fn is_needed(&self, address: &Address, connection: &UpEndConnection) -> Result<bool> {
|
||||
fn status_for(
|
||||
&self,
|
||||
address: &Address,
|
||||
connection: &UpEndConnection,
|
||||
) -> Result<super::ExtractorStatus> {
|
||||
if !matches!(address, Address::Hash(_)) {
|
||||
return Ok(ExtractorStatus::Unavailable);
|
||||
}
|
||||
|
||||
let is_media = connection.retrieve_object(address)?.iter().any(|e| {
|
||||
if e.attribute == FILE_MIME_KEY {
|
||||
if let EntryValue::String(mime) = &e.value {
|
||||
|
@ -146,17 +154,23 @@ impl Extractor for MediaExtractor {
|
|||
});
|
||||
|
||||
if !is_media {
|
||||
return Ok(false);
|
||||
return Ok(ExtractorStatus::Improbable);
|
||||
}
|
||||
|
||||
let is_extracted = !connection
|
||||
.query(format!("(matches @{} (contains \"{}\") ?)", address, DURATION_KEY).parse()?)?
|
||||
.is_empty();
|
||||
|
||||
if is_extracted {
|
||||
return Ok(false);
|
||||
}
|
||||
|
||||
Ok(true)
|
||||
Ok(ExtractorStatus::from_extracted(is_extracted))
|
||||
}
|
||||
}
|
||||
|
||||
/// User-facing metadata for the generic media extractor; no icon is provided.
impl UserDescribable for MediaExtractor {
    /// Display name of this extractor.
    fn name(&self) -> &'static str {
        let label = "Generic Media Extractor";
        label
    }

    /// One-line summary of what this extractor does.
    fn description(&self) -> Option<&'static str> {
        let summary = "Extracts rudimentary metadata (duration) from media files.";
        Some(summary)
    }
}
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
use crate::common::{UserDescribable, UserDescription};
|
||||
use anyhow::Result;
|
||||
use rayon::iter::{IntoParallelRefIterator, ParallelIterator};
|
||||
use serde::Serialize;
|
||||
use std::{
|
||||
borrow::Borrow,
|
||||
sync::{Arc, Mutex, RwLock},
|
||||
|
@ -25,7 +27,7 @@ pub mod media;
|
|||
#[cfg(feature = "extractors-external")]
|
||||
pub mod external;
|
||||
|
||||
pub trait Extractor: Send + Sync {
|
||||
pub trait Extractor: Send + Sync + UserDescribable {
|
||||
fn get(
|
||||
&self,
|
||||
address: &Address,
|
||||
|
@ -35,13 +37,11 @@ pub trait Extractor: Send + Sync {
|
|||
context: OperationContext,
|
||||
) -> Result<ExtractorGetResult>;
|
||||
|
||||
fn is_applicable(&self, _address: &Address) -> bool {
|
||||
true
|
||||
}
|
||||
|
||||
fn is_needed(&self, _address: &Address, _connection: &UpEndConnection) -> Result<bool> {
|
||||
Ok(true)
|
||||
}
|
||||
fn status_for(
|
||||
&self,
|
||||
_address: &Address,
|
||||
_connection: &UpEndConnection,
|
||||
) -> Result<ExtractorStatus>;
|
||||
|
||||
fn insert_info(
|
||||
&self,
|
||||
|
@ -51,7 +51,7 @@ pub trait Extractor: Send + Sync {
|
|||
job_container: JobContainer,
|
||||
context: OperationContext,
|
||||
) -> Result<ExtractorResult> {
|
||||
if self.is_needed(address, connection)? {
|
||||
if self.status_for(address, connection)?.is_applicable() {
|
||||
let ExtractorGetResult {
|
||||
entries,
|
||||
stored: inserted,
|
||||
|
@ -97,6 +97,32 @@ impl From<Vec<Entry>> for ExtractorGetResult {
|
|||
}
|
||||
}
|
||||
|
||||
/// Outcome of asking an extractor whether it can or should process a given
/// address (see `Extractor::status_for`).
///
/// Only `Needed` and `Applicable` are accepted by `is_applicable`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize)]
|
||||
pub enum ExtractorStatus {
|
||||
/// The extractor cannot be used for the address. Also used as the fallback
/// in `ExtractorManager::statuses_for` when the status check itself fails.
Unavailable,
|
||||
/// The extractor is not expected to yield anything for the address.
/// Not treated as applicable.
Improbable,
|
||||
/// The extractor applies and its data has not been extracted yet
/// (`from_extracted(false)`).
Needed,
|
||||
/// The extractor applies and its data is already present
/// (`from_extracted(true)`).
Applicable,
|
||||
}
|
||||
|
||||
impl ExtractorStatus {
|
||||
pub fn is_needed(&self) -> bool {
|
||||
matches!(self, ExtractorStatus::Needed)
|
||||
}
|
||||
|
||||
pub fn is_applicable(&self) -> bool {
|
||||
matches!(self, ExtractorStatus::Applicable) || self.is_needed()
|
||||
}
|
||||
|
||||
pub fn from_extracted(is_extracted: bool) -> Self {
|
||||
if is_extracted {
|
||||
ExtractorStatus::Applicable
|
||||
} else {
|
||||
ExtractorStatus::Needed
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct ExtractorManager {
|
||||
extractors: Vec<(&'static str, Box<dyn Extractor>)>,
|
||||
}
|
||||
|
@ -148,7 +174,12 @@ impl ExtractorManager {
|
|||
trace!("Extracting metadata for {address:?}");
|
||||
|
||||
for (name, extractor) in &self.extractors {
|
||||
if extractor.is_applicable(address) {
|
||||
let status = extractor.status_for(address, connection);
|
||||
if status.is_err() {
|
||||
debug!("{name}: {status:?}");
|
||||
continue;
|
||||
}
|
||||
if status.unwrap().is_applicable() {
|
||||
trace!("Extracting with {name}");
|
||||
let extract_result = extractor.insert_info(
|
||||
address,
|
||||
|
@ -238,4 +269,22 @@ impl ExtractorManager {
|
|||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
pub fn statuses_for(
|
||||
&self,
|
||||
address: &Address,
|
||||
connection: &UpEndConnection,
|
||||
) -> Vec<(&'static str, UserDescription, ExtractorStatus)> {
|
||||
self.extractors
|
||||
.iter()
|
||||
.map(|(name, extractor)| {
|
||||
let status = extractor.status_for(address, connection);
|
||||
(
|
||||
*name,
|
||||
extractor.full_description(),
|
||||
status.unwrap_or(ExtractorStatus::Unavailable),
|
||||
)
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
use std::sync::Arc;
|
||||
|
||||
use super::ExtractorStatus;
|
||||
use super::{Extractor, ExtractorGetResult};
|
||||
use crate::common::UserDescribable;
|
||||
use crate::common::REQWEST_CLIENT;
|
||||
use anyhow::anyhow;
|
||||
use anyhow::Result;
|
||||
|
||||
use std::sync::Arc;
|
||||
use upend_base::addressing::Address;
|
||||
use upend_base::constants::ATTR_LABEL;
|
||||
use upend_base::constants::ATTR_OF;
|
||||
|
@ -146,13 +146,33 @@ impl Extractor for WebExtractor {
|
|||
}
|
||||
}
|
||||
|
||||
fn is_needed(&self, address: &Address, connection: &UpEndConnection) -> Result<bool> {
|
||||
Ok(connection
|
||||
fn status_for(
|
||||
&self,
|
||||
address: &Address,
|
||||
connection: &UpEndConnection,
|
||||
) -> Result<ExtractorStatus> {
|
||||
if !matches!(address, Address::Url(_)) {
|
||||
return Ok(ExtractorStatus::Unavailable);
|
||||
}
|
||||
|
||||
let is_extracted = !connection
|
||||
.query(
|
||||
format!(r#"(matches @{address} (in "HTML_TITLE" "HTML_DESCRIPTION") ?)"#)
|
||||
.parse()?,
|
||||
)?
|
||||
.is_empty())
|
||||
.is_empty();
|
||||
|
||||
Ok(ExtractorStatus::from_extracted(is_extracted))
|
||||
}
|
||||
}
|
||||
|
||||
/// User-facing metadata for the web metadata extractor; no icon is provided.
impl UserDescribable for WebExtractor {
    /// Display name of this extractor.
    fn name(&self) -> &'static str {
        let label = "Web Metadata Extractor";
        label
    }

    /// One-line summary of what this extractor does.
    fn description(&self) -> Option<&'static str> {
        let summary = "Extracts basic metadata from web pages using OpenGraph and HTML tags.";
        Some(summary)
    }
}
|
||||
|
||||
|
@ -188,7 +208,7 @@ mod test {
|
|||
let job_container = JobContainer::new();
|
||||
|
||||
let address = Address::Url(Url::parse("https://upend.dev").unwrap());
|
||||
assert!(WebExtractor.is_needed(&address, &connection)?);
|
||||
assert!(WebExtractor.status_for(&address, &connection)?.is_needed());
|
||||
|
||||
WebExtractor.insert_info(
|
||||
&address,
|
||||
|
@ -198,7 +218,7 @@ mod test {
|
|||
OperationContext::default(),
|
||||
)?;
|
||||
|
||||
assert!(!WebExtractor.is_needed(&address, &connection)?);
|
||||
assert!(!WebExtractor.status_for(&address, &connection)?.is_needed());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
|
|
@ -870,6 +870,34 @@ pub async fn get_address(
|
|||
Ok(response.json(format!("{}", address)))
|
||||
}
|
||||
|
||||
#[get("/api/obj/{address}/extractors")]
|
||||
pub async fn get_extractors(
|
||||
req: HttpRequest,
|
||||
state: web::Data<State>,
|
||||
address: web::Path<Address>,
|
||||
) -> Result<HttpResponse, Error> {
|
||||
check_auth(&req, &state)?;
|
||||
|
||||
let connection = state.upend.connection().map_err(ErrorInternalServerError)?;
|
||||
let statuses = state
|
||||
.extractor_manager
|
||||
.statuses_for(&address.into_inner(), &connection);
|
||||
|
||||
Ok(HttpResponse::Ok().json(
|
||||
statuses
|
||||
.into_iter()
|
||||
.map(|(name, info, status)| {
|
||||
json!({
|
||||
"id": name,
|
||||
"name": info.name,
|
||||
"description": info.description,
|
||||
"status": status,
|
||||
})
|
||||
})
|
||||
.collect::<Vec<_>>(),
|
||||
))
|
||||
}
|
||||
|
||||
#[get("/api/all/attributes")]
|
||||
pub async fn get_all_attributes(
|
||||
req: HttpRequest,
|
||||
|
|
|
@ -55,6 +55,7 @@ where
|
|||
.service(routes::get_query)
|
||||
.service(routes::get_object)
|
||||
.service(routes::put_object)
|
||||
.service(routes::get_extractors)
|
||||
.service(routes::put_blob)
|
||||
.service(routes::put_object_attribute)
|
||||
.service(routes::delete_object)
|
||||
|
|
Loading…
Reference in a new issue