feat: web extractor adds LBLs
parent
116e850b66
commit
30cb6ab69c
|
@ -4,6 +4,7 @@ use super::Extractor;
|
|||
use crate::common::REQWEST_CLIENT;
|
||||
use anyhow::anyhow;
|
||||
use anyhow::Result;
|
||||
use upend::database::constants::LABEL_ATTR;
|
||||
use upend::{
|
||||
addressing::Address,
|
||||
database::{entry::Entry, stores::UpStore, UpEndConnection},
|
||||
|
@ -33,9 +34,16 @@ impl Extractor for WebExtractor {
|
|||
let _ = job_handle.update_progress(50.0);
|
||||
|
||||
let mut entries = vec![
|
||||
html.title.map(|html_title| Entry {
|
||||
html.title.as_ref().map(|html_title| Entry {
|
||||
entity: address.clone(),
|
||||
attribute: "HTML_TITLE".to_string(),
|
||||
value: html_title.clone().into(),
|
||||
provenance: "SYSTEM EXTRACTOR".to_string(),
|
||||
timestamp: chrono::Utc::now().naive_utc(),
|
||||
}),
|
||||
html.title.map(|html_title| Entry {
|
||||
entity: address.clone(),
|
||||
attribute: LABEL_ATTR.to_string(),
|
||||
value: html_title.into(),
|
||||
provenance: "SYSTEM EXTRACTOR".to_string(),
|
||||
timestamp: chrono::Utc::now().naive_utc(),
|
||||
|
@ -49,13 +57,24 @@ impl Extractor for WebExtractor {
|
|||
}),
|
||||
];
|
||||
for (key, value) in html.opengraph.properties {
|
||||
let attribute = format!("OG_{}", key.to_uppercase());
|
||||
if attribute == "OG_TITLE" {
|
||||
entries.push(Some(Entry {
|
||||
entity: address.clone(),
|
||||
attribute: LABEL_ATTR.to_string(),
|
||||
value: value.clone().into(),
|
||||
provenance: "SYSTEM EXTRACTOR".to_string(),
|
||||
timestamp: chrono::Utc::now().naive_utc(),
|
||||
}));
|
||||
}
|
||||
|
||||
entries.push(Some(Entry {
|
||||
entity: address.clone(),
|
||||
attribute: format!("OG_{}", key.to_uppercase()),
|
||||
attribute,
|
||||
value: value.into(),
|
||||
provenance: "SYSTEM EXTRACTOR".to_string(),
|
||||
timestamp: chrono::Utc::now().naive_utc(),
|
||||
}))
|
||||
}));
|
||||
}
|
||||
for image in html.opengraph.images {
|
||||
entries.push(Some(Entry {
|
||||
|
|
Loading…
Reference in New Issue