feat(backend): add labels & OFs to web/yt-dlp/monolith entries

This commit is contained in:
Tomáš Mládek 2024-07-02 20:02:01 +02:00
parent 5bd63a314b
commit 29fe6d8864
3 changed files with 106 additions and 35 deletions

View file

@@ -6,6 +6,7 @@ use std::process::Command;
use std::sync::Arc;
use tracing::debug;
use upend_base::addressing::Address;
use upend_base::constants::ATTR_LABEL;
use upend_base::entry::Entry;
use upend_db::jobs::JobContainer;
use upend_db::stores::UpStore;
@@ -46,14 +47,24 @@ impl Extractor for MonolithExtractor {
)?;
debug!("Stored {} as {:?}", url.as_str(), stored);
job_handle.update_progress(100.0)?;
Ok(vec![Entry {
Ok(vec![
Entry {
entity: address.clone(),
attribute: "WM_ARCHIVED".parse().unwrap(),
value: Address::Hash(stored).into(),
provenance: context.provenance.clone() + "EXTRACTOR monolith",
user: context.user.clone(),
timestamp: chrono::Utc::now().naive_utc(),
}])
},
Entry {
entity: Address::Attribute("WM_ARCHIVED".parse().unwrap()),
attribute: ATTR_LABEL.parse().unwrap(),
value: "Webpage Archived (monolith)".into(),
provenance: context.provenance.clone() + "EXTRACTOR monolith",
user: context.user.clone(),
timestamp: chrono::Utc::now().naive_utc(),
},
])
} else {
Ok(vec![])
}

View file

@@ -7,7 +7,8 @@ use std::process::{Command, Stdio};
use std::sync::Arc;
use tracing::{debug, trace};
use upend_base::addressing::Address;
use upend_base::entry::{Entry, EntryValue};
use upend_base::constants::{ATTR_LABEL, ATTR_OF, TYPE_URL_ADDRESS};
use upend_base::entry::{Attribute, Entry, EntryValue};
use upend_db::jobs::JobContainer;
use upend_db::stores::UpStore;
use upend_db::{OperationContext, UpEndConnection};
@@ -129,25 +130,35 @@ impl Extractor for YtDlpExtractor {
.path();
let json_text = std::fs::read_to_string(json_path)?;
let json_data = serde_json::from_str::<serde_json::Value>(&json_text)?;
for key in [
"title",
"fulltitle",
"description",
"channel",
"uploader",
"channel_url",
"upload_date",
"timestamp",
] {
for (key, label) in KNOWN_METADATA {
if let Some(value) = json_data.get(key) {
result.push(Entry {
let attribute: Attribute = format!("YTDL_META_{}", key).parse().unwrap();
result.extend([
Entry {
entity: address.clone(),
attribute: format!("YTDL_META_{}", key).parse().unwrap(),
attribute: attribute.clone(),
value: EntryValue::guess_from(value.to_string()),
provenance: context.provenance.clone() + "EXTRACTOR yt-dlp",
user: context.user.clone(),
timestamp: chrono::Utc::now().naive_utc(),
});
},
Entry {
entity: Address::Attribute(attribute.clone()),
attribute: ATTR_LABEL.parse().unwrap(),
value: format!("yt-dlp | {}", label).into(),
provenance: context.provenance.clone() + "EXTRACTOR yt-dlp",
user: context.user.clone(),
timestamp: chrono::Utc::now().naive_utc(),
},
Entry {
entity: Address::Attribute(attribute),
attribute: ATTR_OF.parse().unwrap(),
value: EntryValue::Address(TYPE_URL_ADDRESS.clone()),
provenance: context.provenance.clone() + "EXTRACTOR yt-dlp",
user: context.user.clone(),
timestamp: chrono::Utc::now().naive_utc(),
},
]);
}
}
@@ -161,14 +172,24 @@ impl Extractor for YtDlpExtractor {
context.clone(),
)?;
result.push(Entry {
result.extend([
Entry {
entity: address.clone(),
attribute: "YTDLD".parse().unwrap(),
value: Address::Hash(stored).into(),
provenance: context.provenance.clone() + "EXTRACTOR yt-dlp",
user: context.user.clone(),
timestamp: chrono::Utc::now().naive_utc(),
});
},
Entry {
entity: Address::Attribute("YTDLD".parse().unwrap()),
attribute: ATTR_LABEL.parse().unwrap(),
value: "Media Downloaded (yt-dlp)".into(),
provenance: context.provenance.clone() + "EXTRACTOR yt-dlp",
user: context.user.clone(),
timestamp: chrono::Utc::now().naive_utc(),
},
]);
job_handle.update_progress(100.0)?;
@@ -202,6 +223,17 @@ impl ExternalCommand for YtDlpExtractor {
}
}
/// yt-dlp info-JSON keys paired with their human-readable labels.
///
/// Each `(key, label)` pair drives the metadata loop above: `key` is looked up
/// in the parsed `*.info.json` output to produce a `YTDL_META_{key}` entry,
/// and `label` becomes that attribute's `ATTR_LABEL` value, prefixed with
/// "yt-dlp | ".
///
/// NOTE: the array order is the order in which the extracted entries are
/// emitted, so reordering this table changes the result ordering.
const KNOWN_METADATA: [(&str, &str); 8] = [
    ("title", "Title"),
    ("fulltitle", "Full Title"),
    ("description", "Description"),
    ("channel", "Channel"),
    ("uploader", "Uploader"),
    ("channel_url", "Channel URL"),
    ("upload_date", "Upload Date"),
    ("timestamp", "Timestamp"),
];
// #[cfg(test)]
// mod tests {
// use super::*;

View file

@@ -119,6 +119,24 @@ impl Extractor for WebExtractor {
vec![e]
}
})
.flat_map(|e| {
for (key, label) in KNOWN_ATTRIBUTES.iter() {
if e.attribute == key {
return vec![
Entry {
entity: Address::Attribute(key.parse().unwrap()),
attribute: ATTR_LABEL.parse().unwrap(),
value: label.to_string().into(),
provenance: context.provenance.clone() + "EXTRACTOR",
user: context.user.clone(),
timestamp: chrono::Utc::now().naive_utc(),
},
e,
];
}
}
vec![e]
})
.collect());
}
Err(anyhow!("Failed for unknown reason."))
@@ -137,6 +155,16 @@ impl Extractor for WebExtractor {
}
}
/// Attribute names the web extractor can emit, paired with display labels.
///
/// In the `flat_map` above, when an extracted entry's attribute matches one of
/// these `key`s, an additional entry is prepended that attaches `label` (as
/// `ATTR_LABEL`) to the attribute itself, so the UI can show e.g.
/// "OG | Title" instead of the raw `OG_TITLE` attribute name.
///
/// Keys cover OpenGraph (`OG_*`) metadata plus plain HTML title/description;
/// attributes not listed here pass through unlabelled.
const KNOWN_ATTRIBUTES: [(&str, &str); 7] = [
    ("OG_TITLE", "OG | Title"),
    ("OG_DESCRIPTION", "OG | Description"),
    ("OG_IMAGE", "OG | Image"),
    ("OG_URL", "OG | URL"),
    ("OG_SITE_NAME", "OG | Site Name"),
    ("HTML_DESCRIPTION", "HTML Description"),
    ("HTML_TITLE", "HTML Title"),
];
#[cfg(test)]
mod test {