feat: web extractor adds LBLs

This commit is contained in:
Tomáš Mládek 2023-05-27 23:05:22 +02:00
parent 116e850b66
commit 30cb6ab69c

View file

@ -4,6 +4,7 @@ use super::Extractor;
use crate::common::REQWEST_CLIENT; use crate::common::REQWEST_CLIENT;
use anyhow::anyhow; use anyhow::anyhow;
use anyhow::Result; use anyhow::Result;
use upend::database::constants::LABEL_ATTR;
use upend::{ use upend::{
addressing::Address, addressing::Address,
database::{entry::Entry, stores::UpStore, UpEndConnection}, database::{entry::Entry, stores::UpStore, UpEndConnection},
@ -33,9 +34,16 @@ impl Extractor for WebExtractor {
let _ = job_handle.update_progress(50.0); let _ = job_handle.update_progress(50.0);
let mut entries = vec![ let mut entries = vec![
html.title.map(|html_title| Entry { html.title.as_ref().map(|html_title| Entry {
entity: address.clone(), entity: address.clone(),
attribute: "HTML_TITLE".to_string(), attribute: "HTML_TITLE".to_string(),
value: html_title.clone().into(),
provenance: "SYSTEM EXTRACTOR".to_string(),
timestamp: chrono::Utc::now().naive_utc(),
}),
html.title.map(|html_title| Entry {
entity: address.clone(),
attribute: LABEL_ATTR.to_string(),
value: html_title.into(), value: html_title.into(),
provenance: "SYSTEM EXTRACTOR".to_string(), provenance: "SYSTEM EXTRACTOR".to_string(),
timestamp: chrono::Utc::now().naive_utc(), timestamp: chrono::Utc::now().naive_utc(),
@ -49,13 +57,24 @@ impl Extractor for WebExtractor {
}), }),
]; ];
for (key, value) in html.opengraph.properties { for (key, value) in html.opengraph.properties {
let attribute = format!("OG_{}", key.to_uppercase());
if attribute == "OG_TITLE" {
entries.push(Some(Entry {
entity: address.clone(),
attribute: LABEL_ATTR.to_string(),
value: value.clone().into(),
provenance: "SYSTEM EXTRACTOR".to_string(),
timestamp: chrono::Utc::now().naive_utc(),
}));
}
entries.push(Some(Entry { entries.push(Some(Entry {
entity: address.clone(), entity: address.clone(),
attribute: format!("OG_{}", key.to_uppercase()), attribute,
value: value.into(), value: value.into(),
provenance: "SYSTEM EXTRACTOR".to_string(), provenance: "SYSTEM EXTRACTOR".to_string(),
timestamp: chrono::Utc::now().naive_utc(), timestamp: chrono::Utc::now().naive_utc(),
})) }));
} }
for image in html.opengraph.images { for image in html.opengraph.images {
entries.push(Some(Entry { entries.push(Some(Entry {