feat(backend): add labels & OFs to web/yt-dlp/monolith entries
This commit is contained in:
parent
5bd63a314b
commit
29fe6d8864
3 changed files with 106 additions and 35 deletions
15
cli/src/extractors/external/monolith.rs
vendored
15
cli/src/extractors/external/monolith.rs
vendored
|
@ -6,6 +6,7 @@ use std::process::Command;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use tracing::debug;
|
use tracing::debug;
|
||||||
use upend_base::addressing::Address;
|
use upend_base::addressing::Address;
|
||||||
|
use upend_base::constants::ATTR_LABEL;
|
||||||
use upend_base::entry::Entry;
|
use upend_base::entry::Entry;
|
||||||
use upend_db::jobs::JobContainer;
|
use upend_db::jobs::JobContainer;
|
||||||
use upend_db::stores::UpStore;
|
use upend_db::stores::UpStore;
|
||||||
|
@ -46,14 +47,24 @@ impl Extractor for MonolithExtractor {
|
||||||
)?;
|
)?;
|
||||||
debug!("Stored {} as {:?}", url.as_str(), stored);
|
debug!("Stored {} as {:?}", url.as_str(), stored);
|
||||||
job_handle.update_progress(100.0)?;
|
job_handle.update_progress(100.0)?;
|
||||||
Ok(vec![Entry {
|
Ok(vec![
|
||||||
|
Entry {
|
||||||
entity: address.clone(),
|
entity: address.clone(),
|
||||||
attribute: "WM_ARCHIVED".parse().unwrap(),
|
attribute: "WM_ARCHIVED".parse().unwrap(),
|
||||||
value: Address::Hash(stored).into(),
|
value: Address::Hash(stored).into(),
|
||||||
provenance: context.provenance.clone() + "EXTRACTOR monolith",
|
provenance: context.provenance.clone() + "EXTRACTOR monolith",
|
||||||
user: context.user.clone(),
|
user: context.user.clone(),
|
||||||
timestamp: chrono::Utc::now().naive_utc(),
|
timestamp: chrono::Utc::now().naive_utc(),
|
||||||
}])
|
},
|
||||||
|
Entry {
|
||||||
|
entity: Address::Attribute("WM_ARCHIVED".parse().unwrap()),
|
||||||
|
attribute: ATTR_LABEL.parse().unwrap(),
|
||||||
|
value: "Webpage Archived (monolith)".into(),
|
||||||
|
provenance: context.provenance.clone() + "EXTRACTOR monolith",
|
||||||
|
user: context.user.clone(),
|
||||||
|
timestamp: chrono::Utc::now().naive_utc(),
|
||||||
|
},
|
||||||
|
])
|
||||||
} else {
|
} else {
|
||||||
Ok(vec![])
|
Ok(vec![])
|
||||||
}
|
}
|
||||||
|
|
64
cli/src/extractors/external/ytdlp.rs
vendored
64
cli/src/extractors/external/ytdlp.rs
vendored
|
@ -7,7 +7,8 @@ use std::process::{Command, Stdio};
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use tracing::{debug, trace};
|
use tracing::{debug, trace};
|
||||||
use upend_base::addressing::Address;
|
use upend_base::addressing::Address;
|
||||||
use upend_base::entry::{Entry, EntryValue};
|
use upend_base::constants::{ATTR_LABEL, ATTR_OF, TYPE_URL_ADDRESS};
|
||||||
|
use upend_base::entry::{Attribute, Entry, EntryValue};
|
||||||
use upend_db::jobs::JobContainer;
|
use upend_db::jobs::JobContainer;
|
||||||
use upend_db::stores::UpStore;
|
use upend_db::stores::UpStore;
|
||||||
use upend_db::{OperationContext, UpEndConnection};
|
use upend_db::{OperationContext, UpEndConnection};
|
||||||
|
@ -129,25 +130,35 @@ impl Extractor for YtDlpExtractor {
|
||||||
.path();
|
.path();
|
||||||
let json_text = std::fs::read_to_string(json_path)?;
|
let json_text = std::fs::read_to_string(json_path)?;
|
||||||
let json_data = serde_json::from_str::<serde_json::Value>(&json_text)?;
|
let json_data = serde_json::from_str::<serde_json::Value>(&json_text)?;
|
||||||
for key in [
|
for (key, label) in KNOWN_METADATA {
|
||||||
"title",
|
|
||||||
"fulltitle",
|
|
||||||
"description",
|
|
||||||
"channel",
|
|
||||||
"uploader",
|
|
||||||
"channel_url",
|
|
||||||
"upload_date",
|
|
||||||
"timestamp",
|
|
||||||
] {
|
|
||||||
if let Some(value) = json_data.get(key) {
|
if let Some(value) = json_data.get(key) {
|
||||||
result.push(Entry {
|
let attribute: Attribute = format!("YTDL_META_{}", key).parse().unwrap();
|
||||||
|
result.extend([
|
||||||
|
Entry {
|
||||||
entity: address.clone(),
|
entity: address.clone(),
|
||||||
attribute: format!("YTDL_META_{}", key).parse().unwrap(),
|
attribute: attribute.clone(),
|
||||||
value: EntryValue::guess_from(value.to_string()),
|
value: EntryValue::guess_from(value.to_string()),
|
||||||
provenance: context.provenance.clone() + "EXTRACTOR yt-dlp",
|
provenance: context.provenance.clone() + "EXTRACTOR yt-dlp",
|
||||||
user: context.user.clone(),
|
user: context.user.clone(),
|
||||||
timestamp: chrono::Utc::now().naive_utc(),
|
timestamp: chrono::Utc::now().naive_utc(),
|
||||||
});
|
},
|
||||||
|
Entry {
|
||||||
|
entity: Address::Attribute(attribute.clone()),
|
||||||
|
attribute: ATTR_LABEL.parse().unwrap(),
|
||||||
|
value: format!("yt-dlp | {}", label).into(),
|
||||||
|
provenance: context.provenance.clone() + "EXTRACTOR yt-dlp",
|
||||||
|
user: context.user.clone(),
|
||||||
|
timestamp: chrono::Utc::now().naive_utc(),
|
||||||
|
},
|
||||||
|
Entry {
|
||||||
|
entity: Address::Attribute(attribute),
|
||||||
|
attribute: ATTR_OF.parse().unwrap(),
|
||||||
|
value: EntryValue::Address(TYPE_URL_ADDRESS.clone()),
|
||||||
|
provenance: context.provenance.clone() + "EXTRACTOR yt-dlp",
|
||||||
|
user: context.user.clone(),
|
||||||
|
timestamp: chrono::Utc::now().naive_utc(),
|
||||||
|
},
|
||||||
|
]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -161,14 +172,24 @@ impl Extractor for YtDlpExtractor {
|
||||||
context.clone(),
|
context.clone(),
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
result.push(Entry {
|
result.extend([
|
||||||
|
Entry {
|
||||||
entity: address.clone(),
|
entity: address.clone(),
|
||||||
attribute: "YTDLD".parse().unwrap(),
|
attribute: "YTDLD".parse().unwrap(),
|
||||||
value: Address::Hash(stored).into(),
|
value: Address::Hash(stored).into(),
|
||||||
provenance: context.provenance.clone() + "EXTRACTOR yt-dlp",
|
provenance: context.provenance.clone() + "EXTRACTOR yt-dlp",
|
||||||
user: context.user.clone(),
|
user: context.user.clone(),
|
||||||
timestamp: chrono::Utc::now().naive_utc(),
|
timestamp: chrono::Utc::now().naive_utc(),
|
||||||
});
|
},
|
||||||
|
Entry {
|
||||||
|
entity: Address::Attribute("YTDLD".parse().unwrap()),
|
||||||
|
attribute: ATTR_LABEL.parse().unwrap(),
|
||||||
|
value: "Media Downloaded (yt-dlp)".into(),
|
||||||
|
provenance: context.provenance.clone() + "EXTRACTOR yt-dlp",
|
||||||
|
user: context.user.clone(),
|
||||||
|
timestamp: chrono::Utc::now().naive_utc(),
|
||||||
|
},
|
||||||
|
]);
|
||||||
|
|
||||||
job_handle.update_progress(100.0)?;
|
job_handle.update_progress(100.0)?;
|
||||||
|
|
||||||
|
@ -202,6 +223,17 @@ impl ExternalCommand for YtDlpExtractor {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const KNOWN_METADATA: [(&str, &str); 8] = [
|
||||||
|
("title", "Title"),
|
||||||
|
("fulltitle", "Full Title"),
|
||||||
|
("description", "Description"),
|
||||||
|
("channel", "Channel"),
|
||||||
|
("uploader", "Uploader"),
|
||||||
|
("channel_url", "Channel URL"),
|
||||||
|
("upload_date", "Upload Date"),
|
||||||
|
("timestamp", "Timestamp"),
|
||||||
|
];
|
||||||
|
|
||||||
// #[cfg(test)]
|
// #[cfg(test)]
|
||||||
// mod tests {
|
// mod tests {
|
||||||
// use super::*;
|
// use super::*;
|
||||||
|
|
|
@ -119,6 +119,24 @@ impl Extractor for WebExtractor {
|
||||||
vec![e]
|
vec![e]
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
.flat_map(|e| {
|
||||||
|
for (key, label) in KNOWN_ATTRIBUTES.iter() {
|
||||||
|
if e.attribute == key {
|
||||||
|
return vec![
|
||||||
|
Entry {
|
||||||
|
entity: Address::Attribute(key.parse().unwrap()),
|
||||||
|
attribute: ATTR_LABEL.parse().unwrap(),
|
||||||
|
value: label.to_string().into(),
|
||||||
|
provenance: context.provenance.clone() + "EXTRACTOR",
|
||||||
|
user: context.user.clone(),
|
||||||
|
timestamp: chrono::Utc::now().naive_utc(),
|
||||||
|
},
|
||||||
|
e,
|
||||||
|
];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
vec![e]
|
||||||
|
})
|
||||||
.collect());
|
.collect());
|
||||||
}
|
}
|
||||||
Err(anyhow!("Failed for unknown reason."))
|
Err(anyhow!("Failed for unknown reason."))
|
||||||
|
@ -137,6 +155,16 @@ impl Extractor for WebExtractor {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const KNOWN_ATTRIBUTES: [(&str, &str); 7] = [
|
||||||
|
("OG_TITLE", "OG | Title"),
|
||||||
|
("OG_DESCRIPTION", "OG | Description"),
|
||||||
|
("OG_IMAGE", "OG | Image"),
|
||||||
|
("OG_URL", "OG | URL"),
|
||||||
|
("OG_SITE_NAME", "OG | Site Name"),
|
||||||
|
("HTML_DESCRIPTION", "HTML Description"),
|
||||||
|
("HTML_TITLE", "HTML Title"),
|
||||||
|
];
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod test {
|
mod test {
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue