diff --git a/Earthfile b/Earthfile
index 768d813..d0de35b 100644
--- a/Earthfile
+++ b/Earthfile
@@ -126,6 +126,7 @@ docker:
         apt-get clean && \
        rm -rf /var/lib/apt/lists/*
     DO +DOCKER_COMMON
+    COPY +external-monolith/monolith /usr/bin/monolith
     ARG tag=trunk
     SAVE IMAGE --push upend/upend:$tag
 
@@ -138,6 +139,11 @@ DOCKER_COMMON:
     EXPOSE 8093
     ENV UPEND_NO_DESKTOP=true
     ENV UPEND_ALLOW_HOST='*'
+
+external-monolith:
+    FROM rust:bookworm
+    RUN cargo install monolith
+    SAVE ARTIFACT /usr/local/cargo/bin/monolith
 
 # CI targets
 
diff --git a/cli/Cargo.toml b/cli/Cargo.toml
index 4c2cbbd..75fdbf7 100644
--- a/cli/Cargo.toml
+++ b/cli/Cargo.toml
@@ -29,10 +29,10 @@ once_cell = "1.7.2"
 lru = "0.7.0"
 
 diesel = { version = "1.4", features = [
-    "sqlite",
-    "r2d2",
-    "chrono",
-    "serde_json",
+  "sqlite",
+  "r2d2",
+  "chrono",
+  "serde_json",
 ] }
 diesel_migrations = "1.4"
 libsqlite3-sys = { version = "^0", features = ["bundled"] }
@@ -54,10 +54,10 @@ regex = "1"
 multibase = "0.9"
 
 multihash = { version = "*", default-features = false, features = [
-    "alloc",
-    "multihash-impl",
-    "sha2",
-    "identity",
+  "alloc",
+  "multihash-impl",
+  "sha2",
+  "identity",
 ] }
 
 uuid = { version = "1.4", features = ["v4"] }
@@ -96,13 +96,14 @@ shadow-rs = { version = "0.23", default-features = false }
 
 [features]
 default = [
-    "desktop",
-    "previews",
-    "previews-image",
-    "extractors-web",
-    "extractors-audio",
-    "extractors-exif",
-    "extractors-media",
+  "desktop",
+  "previews",
+  "previews-image",
+  "extractors-web",
+  "extractors-audio",
+  "extractors-exif",
+  "extractors-media",
+  "extractors-external"
 ]
 desktop = ["webbrowser", "opener", "is_executable"]
 previews = []
@@ -111,3 +112,4 @@ extractors-web = ["webpage"]
 extractors-audio = ["id3"]
 extractors-exif = ["kamadak-exif"]
 extractors-media = []
+extractors-external = []
diff --git a/cli/src/extractors/external/mod.rs b/cli/src/extractors/external/mod.rs
new file mode 100644
index 0000000..0230318
--- /dev/null
+++ b/cli/src/extractors/external/mod.rs
@@ -0,0 +1,39 @@
+use thiserror::Error;
+
+pub mod monolith;
+pub use monolith::MonolithExtractor;
+
+#[derive(Error, Debug)]
+pub enum ExternalCommandError {
+    #[error("Command not found")]
+    CommandNotFound,
+    #[error("Command failed: {0}")]
+    CommandError(String),
+}
+
+pub trait ExternalCommand {
+    fn get_version(&self) -> Result<String, ExternalCommandError>;
+}
+
+fn process(
+    output: std::io::Result<std::process::Output>,
+) -> Result<std::process::Output, ExternalCommandError> {
+    match output {
+        Ok(output) => {
+            if output.status.success() {
+                Ok(output)
+            } else {
+                Err(ExternalCommandError::CommandError(
+                    String::from_utf8(output.stderr).unwrap_or_else(|_| String::from("")),
+                ))
+            }
+        }
+        Err(err) => {
+            if err.kind() == std::io::ErrorKind::NotFound {
+                Err(ExternalCommandError::CommandNotFound)
+            } else {
+                Err(ExternalCommandError::CommandError(err.to_string()))
+            }
+        }
+    }
+}
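A note on the contract this new module establishes (an illustration, not part of the diff): `process` maps a missing binary to `CommandNotFound` and a non-zero exit status to `CommandError` carrying the tool's stderr, so wrapping a further external tool only requires a version probe. A minimal sketch in the same module context as `monolith.rs` below; the `pandoc` binary and `PandocWrapper` type are hypothetical:

```rust
use std::process::Command;

// Hypothetical sibling of `MonolithExtractor`, shown only to illustrate the
// `ExternalCommand` trait; `pandoc` / `PandocWrapper` are not in this diff.
struct PandocWrapper;

impl ExternalCommand for PandocWrapper {
    fn get_version(&self) -> Result<String, ExternalCommandError> {
        // `process` converts io::ErrorKind::NotFound into CommandNotFound and
        // a non-zero exit status into CommandError carrying the tool's stderr.
        let output = process(Command::new("pandoc").arg("--version").output())?;
        Ok(String::from_utf8(output.stdout).unwrap_or_else(|_| String::from("unknown")))
    }
}
```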
diff --git a/cli/src/extractors/external/monolith.rs b/cli/src/extractors/external/monolith.rs
new file mode 100644
index 0000000..f2b1002
--- /dev/null
+++ b/cli/src/extractors/external/monolith.rs
@@ -0,0 +1,97 @@
+use crate::extractors::external::{process, ExternalCommand, ExternalCommandError};
+use crate::extractors::Extractor;
+use anyhow::Result;
+use regex::Regex;
+use std::process::Command;
+use std::sync::Arc;
+use tracing::debug;
+use upend_base::addressing::Address;
+use upend_base::entry::Entry;
+use upend_db::jobs::JobContainer;
+use upend_db::stores::UpStore;
+use upend_db::{OperationContext, UpEndConnection};
+
+pub struct MonolithExtractor;
+
+impl Extractor for MonolithExtractor {
+    fn get(
+        &self,
+        address: &Address,
+        connection: &UpEndConnection,
+        store: Arc<Box<dyn UpStore + Send + Sync>>,
+        mut job_container: JobContainer,
+        context: OperationContext,
+    ) -> Result<Vec<Entry>> {
+        if let Address::Url(url) = address {
+            debug!("Archiving {} with `monolith`", url.as_str());
+            let mut job_handle =
+                job_container.add_job(None, &format!("Archiving <{}>", url.as_str()))?;
+            job_handle.update_progress(33.0)?;
+            let output = process(Command::new("monolith").arg(url.as_str()).output())?;
+            job_handle.update_progress(66.0)?;
+            let stdout = String::from_utf8(output.stdout)?;
+            let non_alpha_regex = Regex::new(r"[^a-zA-Z0-9]+").unwrap();
+            let name_hint = format!(
+                "web_{}.html",
+                non_alpha_regex
+                    .replace_all(url.as_str(), "-")
+                    .trim_end_matches('-')
+            );
+            let stored = store.store(
+                connection,
+                stdout.into(),
+                Some(name_hint),
+                None,
+                context.clone(),
+            )?;
+            debug!("Stored {} as {:?}", url.as_str(), stored);
+            job_handle.update_progress(100.0)?;
+            Ok(vec![Entry {
+                entity: address.clone(),
+                attribute: "WM_ARCHIVED".parse().unwrap(),
+                value: Address::Hash(stored).into(),
+                provenance: context.provenance.clone() + "EXTRACTOR monolith",
+                user: context.user.clone(),
+                timestamp: chrono::Utc::now().naive_utc(),
+            }])
+        } else {
+            Ok(vec![])
+        }
+    }
+
+    fn is_needed(&self, address: &Address, connection: &UpEndConnection) -> Result<bool> {
+        if !matches!(address, Address::Url(_)) {
+            return Ok(false);
+        }
+
+        if self.get_version().is_err() {
+            return Ok(false);
+        }
+
+        let is_extracted = !connection
+            .query(format!("(matches @{} (contains \"WM_ARCHIVED\") ?)", address).parse()?)?
+            .is_empty();
+
+        Ok(!is_extracted)
+    }
+}
+
+impl ExternalCommand for MonolithExtractor {
+    fn get_version(&self) -> Result<String, ExternalCommandError> {
+        let output = process(Command::new("monolith").arg("--version").output())?;
+        Ok(String::from_utf8(output.stdout).unwrap_or_else(|_| String::from("unknown")))
+    }
+}
+
+// #[cfg(test)]
+// mod tests {
+//     use super::*;
+//
+//     #[test]
+//     fn test_get_version() {
+//         let monolith = MonolithExtractor {};
+//         let version = monolith.get_version().unwrap();
+//         println!("version: {:?}", version);
+//         assert!(version.contains("monolith"));
+//     }
+// }
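To make the `name_hint` derivation above concrete (again an illustration, not part of the diff): the regex collapses every run of non-alphanumeric characters into a single dash, and `trim_end_matches('-')` removes the dash that a trailing slash would otherwise leave behind. The example URL is made up:

```rust
use regex::Regex;

fn main() {
    let non_alpha_regex = Regex::new(r"[^a-zA-Z0-9]+").unwrap();
    // A URL ending in `/` would otherwise produce a trailing dash.
    let url = "https://example.com/some/page/";
    let name_hint = format!(
        "web_{}.html",
        non_alpha_regex.replace_all(url, "-").trim_end_matches('-')
    );
    assert_eq!(name_hint, "web_https-example-com-some-page.html");
}
```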
diff --git a/cli/src/extractors/mod.rs b/cli/src/extractors/mod.rs
index d5d28eb..7351159 100644
--- a/cli/src/extractors/mod.rs
+++ b/cli/src/extractors/mod.rs
@@ -22,6 +22,9 @@ pub mod exif;
 #[cfg(feature = "extractors-media")]
 pub mod media;
 
+#[cfg(feature = "extractors-external")]
+pub mod external;
+
 pub trait Extractor {
     fn get(
         &self,
@@ -177,7 +180,7 @@ pub fn extract(
             address,
             connection,
             store.clone(),
-            job_container,
+            job_container.clone(),
             context.clone(),
         );
 
@@ -187,6 +190,22 @@ pub fn extract(
         }
     }
 
+    #[cfg(feature = "extractors-external")]
+    {
+        let extract_result = external::MonolithExtractor.insert_info(
+            address,
+            connection,
+            store.clone(),
+            job_container,
+            context.clone(),
+        );
+
+        match extract_result {
+            Ok(count) => entry_count += count,
+            Err(err) => debug!("external: {}", err),
+        }
+    }
+
     trace!("Extracting metadata for {address:?} - got {entry_count} entries.");
 
     entry_count
diff --git a/webui/src/lib/components/display/BlobViewer.svelte b/webui/src/lib/components/display/BlobViewer.svelte
index 5742340..1868be6 100644
--- a/webui/src/lib/components/display/BlobViewer.svelte
+++ b/webui/src/lib/components/display/BlobViewer.svelte
@@ -31,12 +31,15 @@
       types.pdf ||
       types.model ||
       types.web ||
+      types.html ||
       types.fragment)) ??
     false;
 
   $: dispatch('handled', handled);
 
   let imageLoaded: string | null = null;
+
+  let size = 0;
+  $: size = $entity?.getAs('FILE_SIZE', 'number') ?? 0;
 </script>
 
 {#if handled}
@@ -55,7 +58,7 @@
   {#if types?.image}
   {/if}
-  {#if types?.pdf}
+  {#if types?.pdf && size < 15_000}
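Returning to the dispatch added in `cli/src/extractors/mod.rs`: `insert_info` is the `Extractor` trait's provided method, and from the call sites above it evidently consults `is_needed` before running `get`, so `monolith` runs at most once per URL entity, and only when the binary is actually on `PATH`. A rough sketch of that inferred control flow, assuming the trait items shown in this diff; the function body is a guess for illustration, not the trait's actual implementation:

```rust
use std::sync::Arc;

// Inferred shape of the trait-provided `insert_info`; the real method lives
// outside this diff. Types are those from the signatures above.
fn insert_info_sketch<E: Extractor>(
    extractor: &E,
    address: &Address,
    connection: &UpEndConnection,
    store: Arc<Box<dyn UpStore + Send + Sync>>,
    job_container: JobContainer,
    context: OperationContext,
) -> anyhow::Result<usize> {
    // False for non-URL entities, a missing `monolith` binary, or an entity
    // that already carries a WM_ARCHIVED entry.
    if !extractor.is_needed(address, connection)? {
        return Ok(0);
    }
    let entries = extractor.get(address, connection, store, job_container, context)?;
    let count = entries.len();
    // ...the real method then persists `entries` through the connection...
    Ok(count)
}
```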