feat: web extractor adds LBLs
This commit is contained in:
parent
116e850b66
commit
30cb6ab69c
1 changed file with 22 additions and 3 deletions
|
@@ -4,6 +4,7 @@ use super::Extractor;
|
||||||
use crate::common::REQWEST_CLIENT;
|
use crate::common::REQWEST_CLIENT;
|
||||||
use anyhow::anyhow;
|
use anyhow::anyhow;
|
||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
|
use upend::database::constants::LABEL_ATTR;
|
||||||
use upend::{
|
use upend::{
|
||||||
addressing::Address,
|
addressing::Address,
|
||||||
database::{entry::Entry, stores::UpStore, UpEndConnection},
|
database::{entry::Entry, stores::UpStore, UpEndConnection},
|
||||||
|
@@ -33,9 +34,16 @@ impl Extractor for WebExtractor {
|
||||||
let _ = job_handle.update_progress(50.0);
|
let _ = job_handle.update_progress(50.0);
|
||||||
|
|
||||||
let mut entries = vec![
|
let mut entries = vec![
|
||||||
html.title.map(|html_title| Entry {
|
html.title.as_ref().map(|html_title| Entry {
|
||||||
entity: address.clone(),
|
entity: address.clone(),
|
||||||
attribute: "HTML_TITLE".to_string(),
|
attribute: "HTML_TITLE".to_string(),
|
||||||
|
value: html_title.clone().into(),
|
||||||
|
provenance: "SYSTEM EXTRACTOR".to_string(),
|
||||||
|
timestamp: chrono::Utc::now().naive_utc(),
|
||||||
|
}),
|
||||||
|
html.title.map(|html_title| Entry {
|
||||||
|
entity: address.clone(),
|
||||||
|
attribute: LABEL_ATTR.to_string(),
|
||||||
value: html_title.into(),
|
value: html_title.into(),
|
||||||
provenance: "SYSTEM EXTRACTOR".to_string(),
|
provenance: "SYSTEM EXTRACTOR".to_string(),
|
||||||
timestamp: chrono::Utc::now().naive_utc(),
|
timestamp: chrono::Utc::now().naive_utc(),
|
||||||
|
@@ -49,13 +57,24 @@ impl Extractor for WebExtractor {
|
||||||
}),
|
}),
|
||||||
];
|
];
|
||||||
for (key, value) in html.opengraph.properties {
|
for (key, value) in html.opengraph.properties {
|
||||||
|
let attribute = format!("OG_{}", key.to_uppercase());
|
||||||
|
if attribute == "OG_TITLE" {
|
||||||
entries.push(Some(Entry {
|
entries.push(Some(Entry {
|
||||||
entity: address.clone(),
|
entity: address.clone(),
|
||||||
attribute: format!("OG_{}", key.to_uppercase()),
|
attribute: LABEL_ATTR.to_string(),
|
||||||
|
value: value.clone().into(),
|
||||||
|
provenance: "SYSTEM EXTRACTOR".to_string(),
|
||||||
|
timestamp: chrono::Utc::now().naive_utc(),
|
||||||
|
}));
|
||||||
|
}
|
||||||
|
|
||||||
|
entries.push(Some(Entry {
|
||||||
|
entity: address.clone(),
|
||||||
|
attribute,
|
||||||
value: value.into(),
|
value: value.into(),
|
||||||
provenance: "SYSTEM EXTRACTOR".to_string(),
|
provenance: "SYSTEM EXTRACTOR".to_string(),
|
||||||
timestamp: chrono::Utc::now().naive_utc(),
|
timestamp: chrono::Utc::now().naive_utc(),
|
||||||
}))
|
}));
|
||||||
}
|
}
|
||||||
for image in html.opengraph.images {
|
for image in html.opengraph.images {
|
||||||
entries.push(Some(Entry {
|
entries.push(Some(Entry {
|
||||||
|
|
Loading…
Reference in a new issue