diff --git a/cli/src/extractors/external/monolith.rs b/cli/src/extractors/external/monolith.rs index f2b1002..f702e1a 100644 --- a/cli/src/extractors/external/monolith.rs +++ b/cli/src/extractors/external/monolith.rs @@ -6,6 +6,7 @@ use std::process::Command; use std::sync::Arc; use tracing::debug; use upend_base::addressing::Address; +use upend_base::constants::ATTR_LABEL; use upend_base::entry::Entry; use upend_db::jobs::JobContainer; use upend_db::stores::UpStore; @@ -46,14 +47,24 @@ impl Extractor for MonolithExtractor { )?; debug!("Stored {} as {:?}", url.as_str(), stored); job_handle.update_progress(100.0)?; - Ok(vec![Entry { - entity: address.clone(), - attribute: "WM_ARCHIVED".parse().unwrap(), - value: Address::Hash(stored).into(), - provenance: context.provenance.clone() + "EXTRACTOR monolith", - user: context.user.clone(), - timestamp: chrono::Utc::now().naive_utc(), - }]) + Ok(vec![ + Entry { + entity: address.clone(), + attribute: "WM_ARCHIVED".parse().unwrap(), + value: Address::Hash(stored).into(), + provenance: context.provenance.clone() + "EXTRACTOR monolith", + user: context.user.clone(), + timestamp: chrono::Utc::now().naive_utc(), + }, + Entry { + entity: Address::Attribute("WM_ARCHIVED".parse().unwrap()), + attribute: ATTR_LABEL.parse().unwrap(), + value: "Webpage Archived (monolith)".into(), + provenance: context.provenance.clone() + "EXTRACTOR monolith", + user: context.user.clone(), + timestamp: chrono::Utc::now().naive_utc(), + }, + ]) } else { Ok(vec![]) } diff --git a/cli/src/extractors/external/ytdlp.rs b/cli/src/extractors/external/ytdlp.rs index f9bc12d..0b0afd1 100644 --- a/cli/src/extractors/external/ytdlp.rs +++ b/cli/src/extractors/external/ytdlp.rs @@ -7,7 +7,8 @@ use std::process::{Command, Stdio}; use std::sync::Arc; use tracing::{debug, trace}; use upend_base::addressing::Address; -use upend_base::entry::{Entry, EntryValue}; +use upend_base::constants::{ATTR_LABEL, ATTR_OF, TYPE_URL_ADDRESS}; +use upend_base::entry::{Attribute, Entry, EntryValue}; use upend_db::jobs::JobContainer; use upend_db::stores::UpStore; use upend_db::{OperationContext, UpEndConnection}; @@ -129,25 +130,35 @@ impl Extractor for YtDlpExtractor { .path(); let json_text = std::fs::read_to_string(json_path)?; let json_data = serde_json::from_str::(&json_text)?; - for key in [ - "title", - "fulltitle", - "description", - "channel", - "uploader", - "channel_url", - "upload_date", - "timestamp", - ] { + for (key, label) in KNOWN_METADATA { if let Some(value) = json_data.get(key) { - result.push(Entry { - entity: address.clone(), - attribute: format!("YTDL_META_{}", key).parse().unwrap(), - value: EntryValue::guess_from(value.to_string()), - provenance: context.provenance.clone() + "EXTRACTOR yt-dlp", - user: context.user.clone(), - timestamp: chrono::Utc::now().naive_utc(), - }); + let attribute: Attribute = format!("YTDL_META_{}", key).parse().unwrap(); + result.extend([ + Entry { + entity: address.clone(), + attribute: attribute.clone(), + value: EntryValue::guess_from(value.to_string()), + provenance: context.provenance.clone() + "EXTRACTOR yt-dlp", + user: context.user.clone(), + timestamp: chrono::Utc::now().naive_utc(), + }, + Entry { + entity: Address::Attribute(attribute.clone()), + attribute: ATTR_LABEL.parse().unwrap(), + value: format!("yt-dlp | {}", label).into(), + provenance: context.provenance.clone() + "EXTRACTOR yt-dlp", + user: context.user.clone(), + timestamp: chrono::Utc::now().naive_utc(), + }, + Entry { + entity: Address::Attribute(attribute), + attribute: ATTR_OF.parse().unwrap(), + value: EntryValue::Address(TYPE_URL_ADDRESS.clone()), + provenance: context.provenance.clone() + "EXTRACTOR yt-dlp", + user: context.user.clone(), + timestamp: chrono::Utc::now().naive_utc(), + }, + ]); } } @@ -161,14 +172,24 @@ impl Extractor for YtDlpExtractor { context.clone(), )?; - result.push(Entry { - entity: address.clone(), - attribute: "YTDLD".parse().unwrap(), - value: Address::Hash(stored).into(), - provenance: context.provenance.clone() + "EXTRACTOR yt-dlp", - user: context.user.clone(), - timestamp: chrono::Utc::now().naive_utc(), - }); + result.extend([ + Entry { + entity: address.clone(), + attribute: "YTDLD".parse().unwrap(), + value: Address::Hash(stored).into(), + provenance: context.provenance.clone() + "EXTRACTOR yt-dlp", + user: context.user.clone(), + timestamp: chrono::Utc::now().naive_utc(), + }, + Entry { + entity: Address::Attribute("YTDLD".parse().unwrap()), + attribute: ATTR_LABEL.parse().unwrap(), + value: "Media Downloaded (yt-dlp)".into(), + provenance: context.provenance.clone() + "EXTRACTOR yt-dlp", + user: context.user.clone(), + timestamp: chrono::Utc::now().naive_utc(), + }, + ]); job_handle.update_progress(100.0)?; @@ -202,6 +223,17 @@ impl ExternalCommand for YtDlpExtractor { } } +const KNOWN_METADATA: [(&str, &str); 8] = [ + ("title", "Title"), + ("fulltitle", "Full Title"), + ("description", "Description"), + ("channel", "Channel"), + ("uploader", "Uploader"), + ("channel_url", "Channel URL"), + ("upload_date", "Upload Date"), + ("timestamp", "Timestamp"), +]; + // #[cfg(test)] // mod tests { // use super::*; diff --git a/cli/src/extractors/web.rs b/cli/src/extractors/web.rs index 81fabb3..72e2a4e 100644 --- a/cli/src/extractors/web.rs +++ b/cli/src/extractors/web.rs @@ -119,6 +119,24 @@ impl Extractor for WebExtractor { vec![e] } }) + .flat_map(|e| { + for (key, label) in KNOWN_ATTRIBUTES.iter() { + if e.attribute == key { + return vec![ + Entry { + entity: Address::Attribute(key.parse().unwrap()), + attribute: ATTR_LABEL.parse().unwrap(), + value: label.to_string().into(), + provenance: context.provenance.clone() + "EXTRACTOR", + user: context.user.clone(), + timestamp: chrono::Utc::now().naive_utc(), + }, + e, + ]; + } + } + vec![e] + }) .collect()); } Err(anyhow!("Failed for unknown reason.")) @@ -137,6 +155,16 @@ impl Extractor for WebExtractor { } } +const KNOWN_ATTRIBUTES: [(&str, &str); 7] = [ + ("OG_TITLE", "OG | Title"), + ("OG_DESCRIPTION", "OG | Description"), + ("OG_IMAGE", "OG | Image"), + ("OG_URL", "OG | URL"), + ("OG_SITE_NAME", "OG | Site Name"), + ("HTML_DESCRIPTION", "HTML Description"), + ("HTML_TITLE", "HTML Title"), +]; + #[cfg(test)] mod test {