From 30cb6ab69ca8f0a9bd036c1ae2523b3712d9e1eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Ml=C3=A1dek?= Date: Sat, 27 May 2023 23:05:22 +0200 Subject: [PATCH] feat: web extractor adds LBLs --- cli/src/extractors/web.rs | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/cli/src/extractors/web.rs b/cli/src/extractors/web.rs index 84dbabf..2baabb1 100644 --- a/cli/src/extractors/web.rs +++ b/cli/src/extractors/web.rs @@ -4,6 +4,7 @@ use super::Extractor; use crate::common::REQWEST_CLIENT; use anyhow::anyhow; use anyhow::Result; +use upend::database::constants::LABEL_ATTR; use upend::{ addressing::Address, database::{entry::Entry, stores::UpStore, UpEndConnection}, @@ -33,9 +34,16 @@ impl Extractor for WebExtractor { let _ = job_handle.update_progress(50.0); let mut entries = vec![ - html.title.map(|html_title| Entry { + html.title.as_ref().map(|html_title| Entry { entity: address.clone(), attribute: "HTML_TITLE".to_string(), + value: html_title.clone().into(), + provenance: "SYSTEM EXTRACTOR".to_string(), + timestamp: chrono::Utc::now().naive_utc(), + }), + html.title.map(|html_title| Entry { + entity: address.clone(), + attribute: LABEL_ATTR.to_string(), value: html_title.into(), provenance: "SYSTEM EXTRACTOR".to_string(), timestamp: chrono::Utc::now().naive_utc(), @@ -49,13 +57,24 @@ impl Extractor for WebExtractor { }), ]; for (key, value) in html.opengraph.properties { + let attribute = format!("OG_{}", key.to_uppercase()); + if attribute == "OG_TITLE" { + entries.push(Some(Entry { + entity: address.clone(), + attribute: LABEL_ATTR.to_string(), + value: value.clone().into(), + provenance: "SYSTEM EXTRACTOR".to_string(), + timestamp: chrono::Utc::now().naive_utc(), + })); + } + entries.push(Some(Entry { entity: address.clone(), - attribute: format!("OG_{}", key.to_uppercase()), + attribute, value: value.into(), provenance: "SYSTEM EXTRACTOR".to_string(), timestamp: chrono::Utc::now().naive_utc(), - })) + })); } for image in html.opengraph.images { entries.push(Some(Entry {