feat(backend): add labels & OFs to web/yt-dlp/monolith entries

This commit is contained in:
Tomáš Mládek 2024-07-02 20:02:01 +02:00
parent 5bd63a314b
commit 29fe6d8864
3 changed files with 106 additions and 35 deletions

View file

@@ -6,6 +6,7 @@ use std::process::Command;
 use std::sync::Arc;
 use tracing::debug;
 use upend_base::addressing::Address;
+use upend_base::constants::ATTR_LABEL;
 use upend_base::entry::Entry;
 use upend_db::jobs::JobContainer;
 use upend_db::stores::UpStore;
@@ -46,14 +47,24 @@ impl Extractor for MonolithExtractor {
         )?;
         debug!("Stored {} as {:?}", url.as_str(), stored);
         job_handle.update_progress(100.0)?;
-        Ok(vec![Entry {
-            entity: address.clone(),
-            attribute: "WM_ARCHIVED".parse().unwrap(),
-            value: Address::Hash(stored).into(),
-            provenance: context.provenance.clone() + "EXTRACTOR monolith",
-            user: context.user.clone(),
-            timestamp: chrono::Utc::now().naive_utc(),
-        }])
+        Ok(vec![
+            Entry {
+                entity: address.clone(),
+                attribute: "WM_ARCHIVED".parse().unwrap(),
+                value: Address::Hash(stored).into(),
+                provenance: context.provenance.clone() + "EXTRACTOR monolith",
+                user: context.user.clone(),
+                timestamp: chrono::Utc::now().naive_utc(),
+            },
+            Entry {
+                entity: Address::Attribute("WM_ARCHIVED".parse().unwrap()),
+                attribute: ATTR_LABEL.parse().unwrap(),
+                value: "Webpage Archived (monolith)".into(),
+                provenance: context.provenance.clone() + "EXTRACTOR monolith",
+                user: context.user.clone(),
+                timestamp: chrono::Utc::now().naive_utc(),
+            },
+        ])
     } else {
         Ok(vec![])
     }

View file

@@ -7,7 +7,8 @@ use std::process::{Command, Stdio};
 use std::sync::Arc;
 use tracing::{debug, trace};
 use upend_base::addressing::Address;
-use upend_base::entry::{Entry, EntryValue};
+use upend_base::constants::{ATTR_LABEL, ATTR_OF, TYPE_URL_ADDRESS};
+use upend_base::entry::{Attribute, Entry, EntryValue};
 use upend_db::jobs::JobContainer;
 use upend_db::stores::UpStore;
 use upend_db::{OperationContext, UpEndConnection};
@@ -129,25 +130,35 @@ impl Extractor for YtDlpExtractor {
             .path();
         let json_text = std::fs::read_to_string(json_path)?;
         let json_data = serde_json::from_str::<serde_json::Value>(&json_text)?;
-        for key in [
-            "title",
-            "fulltitle",
-            "description",
-            "channel",
-            "uploader",
-            "channel_url",
-            "upload_date",
-            "timestamp",
-        ] {
+        for (key, label) in KNOWN_METADATA {
             if let Some(value) = json_data.get(key) {
-                result.push(Entry {
-                    entity: address.clone(),
-                    attribute: format!("YTDL_META_{}", key).parse().unwrap(),
-                    value: EntryValue::guess_from(value.to_string()),
-                    provenance: context.provenance.clone() + "EXTRACTOR yt-dlp",
-                    user: context.user.clone(),
-                    timestamp: chrono::Utc::now().naive_utc(),
-                });
+                let attribute: Attribute = format!("YTDL_META_{}", key).parse().unwrap();
+                result.extend([
+                    Entry {
+                        entity: address.clone(),
+                        attribute: attribute.clone(),
+                        value: EntryValue::guess_from(value.to_string()),
+                        provenance: context.provenance.clone() + "EXTRACTOR yt-dlp",
+                        user: context.user.clone(),
+                        timestamp: chrono::Utc::now().naive_utc(),
+                    },
+                    Entry {
+                        entity: Address::Attribute(attribute.clone()),
+                        attribute: ATTR_LABEL.parse().unwrap(),
+                        value: format!("yt-dlp | {}", label).into(),
+                        provenance: context.provenance.clone() + "EXTRACTOR yt-dlp",
+                        user: context.user.clone(),
+                        timestamp: chrono::Utc::now().naive_utc(),
+                    },
+                    Entry {
+                        entity: Address::Attribute(attribute),
+                        attribute: ATTR_OF.parse().unwrap(),
+                        value: EntryValue::Address(TYPE_URL_ADDRESS.clone()),
+                        provenance: context.provenance.clone() + "EXTRACTOR yt-dlp",
+                        user: context.user.clone(),
+                        timestamp: chrono::Utc::now().naive_utc(),
+                    },
+                ]);
             }
         }
@@ -161,14 +172,24 @@ impl Extractor for YtDlpExtractor {
             context.clone(),
         )?;
-        result.push(Entry {
-            entity: address.clone(),
-            attribute: "YTDLD".parse().unwrap(),
-            value: Address::Hash(stored).into(),
-            provenance: context.provenance.clone() + "EXTRACTOR yt-dlp",
-            user: context.user.clone(),
-            timestamp: chrono::Utc::now().naive_utc(),
-        });
+        result.extend([
+            Entry {
+                entity: address.clone(),
+                attribute: "YTDLD".parse().unwrap(),
+                value: Address::Hash(stored).into(),
+                provenance: context.provenance.clone() + "EXTRACTOR yt-dlp",
+                user: context.user.clone(),
+                timestamp: chrono::Utc::now().naive_utc(),
+            },
+            Entry {
+                entity: Address::Attribute("YTDLD".parse().unwrap()),
+                attribute: ATTR_LABEL.parse().unwrap(),
+                value: "Media Downloaded (yt-dlp)".into(),
+                provenance: context.provenance.clone() + "EXTRACTOR yt-dlp",
+                user: context.user.clone(),
+                timestamp: chrono::Utc::now().naive_utc(),
+            },
+        ]);
         job_handle.update_progress(100.0)?;
@ -202,6 +223,17 @@ impl ExternalCommand for YtDlpExtractor {
} }
} }
const KNOWN_METADATA: [(&str, &str); 8] = [
("title", "Title"),
("fulltitle", "Full Title"),
("description", "Description"),
("channel", "Channel"),
("uploader", "Uploader"),
("channel_url", "Channel URL"),
("upload_date", "Upload Date"),
("timestamp", "Timestamp"),
];
// #[cfg(test)] // #[cfg(test)]
// mod tests { // mod tests {
// use super::*; // use super::*;

View file

@@ -119,6 +119,24 @@ impl Extractor for WebExtractor {
                         vec![e]
                     }
                 })
+                .flat_map(|e| {
+                    for (key, label) in KNOWN_ATTRIBUTES.iter() {
+                        if e.attribute == key {
+                            return vec![
+                                Entry {
+                                    entity: Address::Attribute(key.parse().unwrap()),
+                                    attribute: ATTR_LABEL.parse().unwrap(),
+                                    value: label.to_string().into(),
+                                    provenance: context.provenance.clone() + "EXTRACTOR",
+                                    user: context.user.clone(),
+                                    timestamp: chrono::Utc::now().naive_utc(),
+                                },
+                                e,
+                            ];
+                        }
+                    }
+                    vec![e]
+                })
                 .collect());
         }
         Err(anyhow!("Failed for unknown reason."))
@@ -137,6 +155,16 @@ impl Extractor for WebExtractor {
     }
 }
 
+const KNOWN_ATTRIBUTES: [(&str, &str); 7] = [
+    ("OG_TITLE", "OG | Title"),
+    ("OG_DESCRIPTION", "OG | Description"),
+    ("OG_IMAGE", "OG | Image"),
+    ("OG_URL", "OG | URL"),
+    ("OG_SITE_NAME", "OG | Site Name"),
+    ("HTML_DESCRIPTION", "HTML Description"),
+    ("HTML_TITLE", "HTML Title"),
+];
+
 #[cfg(test)]
 mod test {