feat(backend): add labels & OFs to web/yt-dlp/monolith entries
This commit is contained in:
parent
5bd63a314b
commit
29fe6d8864
3 changed files with 106 additions and 35 deletions
15
cli/src/extractors/external/monolith.rs
vendored
15
cli/src/extractors/external/monolith.rs
vendored
|
@ -6,6 +6,7 @@ use std::process::Command;
|
|||
use std::sync::Arc;
|
||||
use tracing::debug;
|
||||
use upend_base::addressing::Address;
|
||||
use upend_base::constants::ATTR_LABEL;
|
||||
use upend_base::entry::Entry;
|
||||
use upend_db::jobs::JobContainer;
|
||||
use upend_db::stores::UpStore;
|
||||
|
@ -46,14 +47,24 @@ impl Extractor for MonolithExtractor {
|
|||
)?;
|
||||
debug!("Stored {} as {:?}", url.as_str(), stored);
|
||||
job_handle.update_progress(100.0)?;
|
||||
Ok(vec![Entry {
|
||||
Ok(vec![
|
||||
Entry {
|
||||
entity: address.clone(),
|
||||
attribute: "WM_ARCHIVED".parse().unwrap(),
|
||||
value: Address::Hash(stored).into(),
|
||||
provenance: context.provenance.clone() + "EXTRACTOR monolith",
|
||||
user: context.user.clone(),
|
||||
timestamp: chrono::Utc::now().naive_utc(),
|
||||
}])
|
||||
},
|
||||
Entry {
|
||||
entity: Address::Attribute("WM_ARCHIVED".parse().unwrap()),
|
||||
attribute: ATTR_LABEL.parse().unwrap(),
|
||||
value: "Webpage Archived (monolith)".into(),
|
||||
provenance: context.provenance.clone() + "EXTRACTOR monolith",
|
||||
user: context.user.clone(),
|
||||
timestamp: chrono::Utc::now().naive_utc(),
|
||||
},
|
||||
])
|
||||
} else {
|
||||
Ok(vec![])
|
||||
}
|
||||
|
|
64
cli/src/extractors/external/ytdlp.rs
vendored
64
cli/src/extractors/external/ytdlp.rs
vendored
|
@ -7,7 +7,8 @@ use std::process::{Command, Stdio};
|
|||
use std::sync::Arc;
|
||||
use tracing::{debug, trace};
|
||||
use upend_base::addressing::Address;
|
||||
use upend_base::entry::{Entry, EntryValue};
|
||||
use upend_base::constants::{ATTR_LABEL, ATTR_OF, TYPE_URL_ADDRESS};
|
||||
use upend_base::entry::{Attribute, Entry, EntryValue};
|
||||
use upend_db::jobs::JobContainer;
|
||||
use upend_db::stores::UpStore;
|
||||
use upend_db::{OperationContext, UpEndConnection};
|
||||
|
@ -129,25 +130,35 @@ impl Extractor for YtDlpExtractor {
|
|||
.path();
|
||||
let json_text = std::fs::read_to_string(json_path)?;
|
||||
let json_data = serde_json::from_str::<serde_json::Value>(&json_text)?;
|
||||
for key in [
|
||||
"title",
|
||||
"fulltitle",
|
||||
"description",
|
||||
"channel",
|
||||
"uploader",
|
||||
"channel_url",
|
||||
"upload_date",
|
||||
"timestamp",
|
||||
] {
|
||||
for (key, label) in KNOWN_METADATA {
|
||||
if let Some(value) = json_data.get(key) {
|
||||
result.push(Entry {
|
||||
let attribute: Attribute = format!("YTDL_META_{}", key).parse().unwrap();
|
||||
result.extend([
|
||||
Entry {
|
||||
entity: address.clone(),
|
||||
attribute: format!("YTDL_META_{}", key).parse().unwrap(),
|
||||
attribute: attribute.clone(),
|
||||
value: EntryValue::guess_from(value.to_string()),
|
||||
provenance: context.provenance.clone() + "EXTRACTOR yt-dlp",
|
||||
user: context.user.clone(),
|
||||
timestamp: chrono::Utc::now().naive_utc(),
|
||||
});
|
||||
},
|
||||
Entry {
|
||||
entity: Address::Attribute(attribute.clone()),
|
||||
attribute: ATTR_LABEL.parse().unwrap(),
|
||||
value: format!("yt-dlp | {}", label).into(),
|
||||
provenance: context.provenance.clone() + "EXTRACTOR yt-dlp",
|
||||
user: context.user.clone(),
|
||||
timestamp: chrono::Utc::now().naive_utc(),
|
||||
},
|
||||
Entry {
|
||||
entity: Address::Attribute(attribute),
|
||||
attribute: ATTR_OF.parse().unwrap(),
|
||||
value: EntryValue::Address(TYPE_URL_ADDRESS.clone()),
|
||||
provenance: context.provenance.clone() + "EXTRACTOR yt-dlp",
|
||||
user: context.user.clone(),
|
||||
timestamp: chrono::Utc::now().naive_utc(),
|
||||
},
|
||||
]);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -161,14 +172,24 @@ impl Extractor for YtDlpExtractor {
|
|||
context.clone(),
|
||||
)?;
|
||||
|
||||
result.push(Entry {
|
||||
result.extend([
|
||||
Entry {
|
||||
entity: address.clone(),
|
||||
attribute: "YTDLD".parse().unwrap(),
|
||||
value: Address::Hash(stored).into(),
|
||||
provenance: context.provenance.clone() + "EXTRACTOR yt-dlp",
|
||||
user: context.user.clone(),
|
||||
timestamp: chrono::Utc::now().naive_utc(),
|
||||
});
|
||||
},
|
||||
Entry {
|
||||
entity: Address::Attribute("YTDLD".parse().unwrap()),
|
||||
attribute: ATTR_LABEL.parse().unwrap(),
|
||||
value: "Media Downloaded (yt-dlp)".into(),
|
||||
provenance: context.provenance.clone() + "EXTRACTOR yt-dlp",
|
||||
user: context.user.clone(),
|
||||
timestamp: chrono::Utc::now().naive_utc(),
|
||||
},
|
||||
]);
|
||||
|
||||
job_handle.update_progress(100.0)?;
|
||||
|
||||
|
@ -202,6 +223,17 @@ impl ExternalCommand for YtDlpExtractor {
|
|||
}
|
||||
}
|
||||
|
||||
/// yt-dlp JSON metadata keys the extractor records, paired with a display label.
///
/// Each tuple is `(key, label)`:
/// - `key` is looked up in the parsed yt-dlp JSON and is also the suffix of the
///   generated attribute name (`YTDL_META_{key}`).
/// - `label` is the human-readable text attached to that attribute, emitted as
///   `"yt-dlp | {label}"`.
const KNOWN_METADATA: [(&str, &str); 8] = [
|
||||
("title", "Title"),
|
||||
("fulltitle", "Full Title"),
|
||||
("description", "Description"),
|
||||
("channel", "Channel"),
|
||||
("uploader", "Uploader"),
|
||||
("channel_url", "Channel URL"),
|
||||
("upload_date", "Upload Date"),
|
||||
("timestamp", "Timestamp"),
|
||||
];
|
||||
|
||||
// #[cfg(test)]
|
||||
// mod tests {
|
||||
// use super::*;
|
||||
|
|
|
@ -119,6 +119,24 @@ impl Extractor for WebExtractor {
|
|||
vec![e]
|
||||
}
|
||||
})
|
||||
.flat_map(|e| {
|
||||
for (key, label) in KNOWN_ATTRIBUTES.iter() {
|
||||
if e.attribute == key {
|
||||
return vec![
|
||||
Entry {
|
||||
entity: Address::Attribute(key.parse().unwrap()),
|
||||
attribute: ATTR_LABEL.parse().unwrap(),
|
||||
value: label.to_string().into(),
|
||||
provenance: context.provenance.clone() + "EXTRACTOR",
|
||||
user: context.user.clone(),
|
||||
timestamp: chrono::Utc::now().naive_utc(),
|
||||
},
|
||||
e,
|
||||
];
|
||||
}
|
||||
}
|
||||
vec![e]
|
||||
})
|
||||
.collect());
|
||||
}
|
||||
Err(anyhow!("Failed for unknown reason."))
|
||||
|
@ -137,6 +155,16 @@ impl Extractor for WebExtractor {
|
|||
}
|
||||
}
|
||||
|
||||
/// Attribute names produced by the web extractor, paired with a display label.
///
/// Each tuple is `(attribute, label)`. When an extracted entry's attribute equals
/// `attribute`, the extractor additionally emits a label entry whose entity is
/// that attribute's address and whose value is `label`.
const KNOWN_ATTRIBUTES: [(&str, &str); 7] = [
|
||||
("OG_TITLE", "OG | Title"),
|
||||
("OG_DESCRIPTION", "OG | Description"),
|
||||
("OG_IMAGE", "OG | Image"),
|
||||
("OG_URL", "OG | URL"),
|
||||
("OG_SITE_NAME", "OG | Site Name"),
|
||||
("HTML_DESCRIPTION", "HTML Description"),
|
||||
("HTML_TITLE", "HTML Title"),
|
||||
];
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
|
||||
|
|
Loading…
Reference in a new issue