feat: web extractor adds LBLs

feat/type-attributes
Tomáš Mládek 2023-05-27 23:05:22 +02:00
parent 116e850b66
commit 30cb6ab69c
1 changed file with 22 additions and 3 deletions

View File

@@ -4,6 +4,7 @@ use super::Extractor;
use crate::common::REQWEST_CLIENT;
use anyhow::anyhow;
use anyhow::Result;
use upend::database::constants::LABEL_ATTR;
use upend::{
addressing::Address,
database::{entry::Entry, stores::UpStore, UpEndConnection},
@@ -33,9 +34,16 @@ impl Extractor for WebExtractor {
let _ = job_handle.update_progress(50.0);
let mut entries = vec![
html.title.map(|html_title| Entry {
html.title.as_ref().map(|html_title| Entry {
entity: address.clone(),
attribute: "HTML_TITLE".to_string(),
value: html_title.clone().into(),
provenance: "SYSTEM EXTRACTOR".to_string(),
timestamp: chrono::Utc::now().naive_utc(),
}),
html.title.map(|html_title| Entry {
entity: address.clone(),
attribute: LABEL_ATTR.to_string(),
value: html_title.into(),
provenance: "SYSTEM EXTRACTOR".to_string(),
timestamp: chrono::Utc::now().naive_utc(),
@@ -49,13 +57,24 @@ impl Extractor for WebExtractor {
}),
];
for (key, value) in html.opengraph.properties {
let attribute = format!("OG_{}", key.to_uppercase());
if attribute == "OG_TITLE" {
entries.push(Some(Entry {
entity: address.clone(),
attribute: LABEL_ATTR.to_string(),
value: value.clone().into(),
provenance: "SYSTEM EXTRACTOR".to_string(),
timestamp: chrono::Utc::now().naive_utc(),
}));
}
entries.push(Some(Entry {
entity: address.clone(),
attribute: format!("OG_{}", key.to_uppercase()),
attribute,
value: value.into(),
provenance: "SYSTEM EXTRACTOR".to_string(),
timestamp: chrono::Utc::now().naive_utc(),
}))
}));
}
for image in html.opengraph.images {
entries.push(Some(Entry {