feat(cli): add support for archiving URLs on adding via monolith

This commit is contained in:
Tomáš Mládek 2024-06-30 13:31:26 +02:00
parent 44d76aa8d4
commit 630e8feee1
7 changed files with 189 additions and 17 deletions

View file

@ -126,6 +126,7 @@ docker:
apt-get clean && \
rm -rf /var/lib/apt/lists/*
DO +DOCKER_COMMON
COPY +external-monolith/monolith /usr/bin/monolith
ARG tag=trunk
SAVE IMAGE --push upend/upend:$tag
@ -139,6 +140,11 @@ DOCKER_COMMON:
ENV UPEND_NO_DESKTOP=true
ENV UPEND_ALLOW_HOST='*'
external-monolith:
    # Builds the `monolith` web page archiver from crates.io and exposes the
    # resulting binary as an artifact that other targets can COPY.
    FROM rust:bookworm
    # `--locked` makes cargo honour the Cargo.lock shipped with the crate, so the
    # build resolves the same dependency versions every time instead of whatever
    # is newest on the day the image is built.
    RUN cargo install --locked monolith
    SAVE ARTIFACT /usr/local/cargo/bin/monolith
# CI targets
lint:

View file

@ -103,6 +103,7 @@ default = [
"extractors-audio",
"extractors-exif",
"extractors-media",
"extractors-external"
]
desktop = ["webbrowser", "opener", "is_executable"]
previews = []
@ -111,3 +112,4 @@ extractors-web = ["webpage"]
extractors-audio = ["id3"]
extractors-exif = ["kamadak-exif"]
extractors-media = []
extractors-external = []

39
cli/src/extractors/external/mod.rs vendored Normal file
View file

@ -0,0 +1,39 @@
use thiserror::Error;
pub mod monolith;
pub use monolith::MonolithExtractor;
/// Errors that can occur when invoking an external command-line tool.
#[derive(Error, Debug)]
pub enum ExternalCommandError {
/// Spawning the command failed because the executable could not be found
/// (the underlying I/O error kind was `NotFound`).
#[error("Command not found")]
CommandNotFound,
/// The command could not be run for another reason, or it ran and exited
/// unsuccessfully. The payload is the I/O error text or the command's stderr.
#[error("Command failed: {0}")]
CommandError(String),
}
/// Common interface for extractors that are backed by an external executable.
pub trait ExternalCommand {
/// Returns the version string reported by the external command.
///
/// An `Err` means the command could not be run successfully; callers in this
/// module use that to decide whether the tool is available at all.
fn get_version(&self) -> Result<String, ExternalCommandError>;
}
/// Normalizes the result of running an external command.
///
/// Takes the value returned by [`std::process::Command::output`] and turns it
/// into a `Result` that only is `Ok` when the command was spawned *and* exited
/// successfully.
///
/// # Errors
///
/// * [`ExternalCommandError::CommandNotFound`] if spawning failed with
///   [`std::io::ErrorKind::NotFound`].
/// * [`ExternalCommandError::CommandError`] for any other spawn failure (carrying
///   the I/O error text), or for an unsuccessful exit (carrying the command's
///   stderr, or its exit status when stderr is blank).
fn process(
    output: std::io::Result<std::process::Output>,
) -> Result<std::process::Output, ExternalCommandError> {
    let output = output.map_err(|err| match err.kind() {
        std::io::ErrorKind::NotFound => ExternalCommandError::CommandNotFound,
        _ => ExternalCommandError::CommandError(err.to_string()),
    })?;

    if output.status.success() {
        return Ok(output);
    }

    // Decode lossily: a single invalid byte in stderr should not throw away the
    // whole diagnostic. Valid UTF-8 is passed through unchanged.
    let stderr = String::from_utf8_lossy(&output.stderr);
    let message = if stderr.trim().is_empty() {
        // Nothing useful on stderr; the exit status is the only clue we have.
        output.status.to_string()
    } else {
        stderr.into_owned()
    };

    Err(ExternalCommandError::CommandError(message))
}

97
cli/src/extractors/external/monolith.rs vendored Normal file
View file

@ -0,0 +1,97 @@
use crate::extractors::external::{process, ExternalCommand, ExternalCommandError};
use crate::extractors::Extractor;
use anyhow::Result;
use regex::Regex;
use std::process::Command;
use std::sync::Arc;
use tracing::debug;
use upend_base::addressing::Address;
use upend_base::entry::Entry;
use upend_db::jobs::JobContainer;
use upend_db::stores::UpStore;
use upend_db::{OperationContext, UpEndConnection};
/// Extractor that archives URL addresses by shelling out to the `monolith` command.
pub struct MonolithExtractor;
impl Extractor for MonolithExtractor {
    /// Archives the page behind a URL address and links the stored copy to it.
    ///
    /// For an `Address::Url`, runs `monolith <url>`, stores the command's standard
    /// output through `store` (with a `web_<slug>.html` name hint derived from the
    /// URL) and returns a single `WM_ARCHIVED` entry pointing from the URL to the
    /// hash of the stored data. Any other kind of address yields an empty list.
    ///
    /// Progress is reported on a job named `Archiving <url>` at 33 %, 66 % and 100 %.
    ///
    /// # Errors
    ///
    /// Propagates failures from job bookkeeping, from running `monolith`, from
    /// output that is not valid UTF-8, and from the store.
    fn get(
        &self,
        address: &Address,
        connection: &UpEndConnection,
        store: Arc<Box<dyn UpStore + Send + Sync>>,
        mut job_container: JobContainer,
        context: OperationContext,
    ) -> Result<Vec<Entry>> {
        // Only URLs can be archived; everything else is a no-op.
        let url = match address {
            Address::Url(url) => url,
            _ => return Ok(vec![]),
        };

        debug!("Archiving {} with `monolith`", url.as_str());
        let job_title = format!("Archiving <{}>", url.as_str());
        let mut job_handle = job_container.add_job(None, &job_title)?;
        job_handle.update_progress(33.0)?;

        let command_result = Command::new("monolith").arg(url.as_str()).output();
        let output = process(command_result)?;
        job_handle.update_progress(66.0)?;

        let page = String::from_utf8(output.stdout)?;

        // Collapse every run of non-alphanumeric characters into a single dash to
        // get a file-name-friendly slug of the URL.
        let separators = Regex::new(r"[^a-zA-Z0-9]+").unwrap();
        let slug = separators.replace_all(url.as_str(), "-");
        let name_hint = format!("web_{}.html", slug.trim_end_matches('-'));

        let stored = store.store(
            connection,
            page.into(),
            Some(name_hint),
            None,
            context.clone(),
        )?;
        debug!("Stored {} as {:?}", url.as_str(), stored);
        job_handle.update_progress(100.0)?;

        let archived = Entry {
            entity: address.clone(),
            attribute: "WM_ARCHIVED".parse().unwrap(),
            value: Address::Hash(stored).into(),
            provenance: context.provenance.clone() + "EXTRACTOR monolith",
            user: context.user.clone(),
            timestamp: chrono::Utc::now().naive_utc(),
        };
        Ok(vec![archived])
    }

    /// Reports whether `address` still needs to be archived.
    ///
    /// Returns `false` for non-URL addresses and when `monolith --version` cannot
    /// be run successfully; otherwise returns `true` only if the database has no
    /// `WM_ARCHIVED` entry for the address yet.
    fn is_needed(&self, address: &Address, connection: &UpEndConnection) -> Result<bool> {
        let is_url = matches!(address, Address::Url(_));
        // Short-circuits, so the external command is only probed for URLs.
        if !is_url || self.get_version().is_err() {
            return Ok(false);
        }

        let query_text = format!("(matches @{} (contains \"WM_ARCHIVED\") ?)", address);
        let existing = connection.query(query_text.parse()?)?;
        Ok(existing.is_empty())
    }
}
impl ExternalCommand for MonolithExtractor {
    /// Runs `monolith --version` and returns what it printed on standard output.
    ///
    /// If the output is not valid UTF-8 the literal string `unknown` is returned
    /// instead. Fails when the command cannot be run or exits unsuccessfully.
    fn get_version(&self) -> Result<String, ExternalCommandError> {
        let invocation = Command::new("monolith").arg("--version").output();
        let raw_version = process(invocation)?.stdout;
        let version = match String::from_utf8(raw_version) {
            Ok(text) => text,
            Err(_) => String::from("unknown"),
        };
        Ok(version)
    }
}
// #[cfg(test)]
// mod tests {
// use super::*;
//
// #[test]
// fn test_get_version() {
// let monolith = MonolithExtractor {};
// let version = monolith.get_version().unwrap();
// println!("version: {:?}", version);
// assert!(version.contains("monolith"));
// }
// }

View file

@ -22,6 +22,9 @@ pub mod exif;
#[cfg(feature = "extractors-media")]
pub mod media;
#[cfg(feature = "extractors-external")]
pub mod external;
pub trait Extractor {
fn get(
&self,
@ -177,7 +180,7 @@ pub fn extract(
address,
connection,
store.clone(),
job_container,
job_container.clone(),
context.clone(),
);
@ -187,6 +190,22 @@ pub fn extract(
}
}
#[cfg(feature = "extractors-external")]
{
let extract_result = external::MonolithExtractor.insert_info(
address,
connection,
store.clone(),
job_container,
context.clone(),
);
match extract_result {
Ok(count) => entry_count += count,
Err(err) => debug!("external: {}", err),
}
}
trace!("Extracting metadata for {address:?} - got {entry_count} entries.");
entry_count

View file

@ -31,12 +31,15 @@
types.pdf ||
types.model ||
types.web ||
types.html ||
types.fragment)) ??
false;
$: dispatch('handled', handled);
let imageLoaded: string | null = null;
let size = 0;
$: size = $entity?.getAs('FILE_SIZE', 'number') ?? 0;
</script>
{#if handled}
@ -55,7 +58,7 @@
{#if types?.image}
<ImageViewer {address} {detail} />
{/if}
{#if types?.pdf}
{#if types?.pdf && size < 15_000}
<iframe src="{api.apiUrl}/raw/{address}?inline" title="PDF document of {address}" />
{/if}
{#if types?.model}
@ -72,6 +75,9 @@
on:error={() => (handled = false)}
/>
{/if}
{#if types?.html && size < 15_000}
<iframe src="{api.apiUrl}/raw/{address}?inline=1" title="HTML document of {address}" />
{/if}
{#if types?.fragment}
<UpLink passthrough to={{ entity: String($entity?.get('ANNOTATES')) }}>
<FragmentViewer {address} {detail} />

View file

@ -19,6 +19,8 @@ export function getTypes(entity: UpObject, entityInfo: EntityInfo) {
const model = mimeType?.startsWith('model') || entity?.identify().some((l) => l.endsWith('.stl'));
const web = entityInfo?.t == 'Url';
const fragment = Boolean(entity?.get('ANNOTATES'));
// The entity counts as HTML when its MIME type says so or any of its identifiers
// ends in `.html`. `mimeType` is accessed with `?.` (as in the `model` check above)
// so an entity without a known MIME type does not throw here.
const html =
	mimeType?.startsWith('text/html') || entity?.identify().some((l) => l.endsWith('.html'));
const group = entity?.backlinks.some((e) => e.attribute == ATTR_IN);
@ -32,6 +34,7 @@ export function getTypes(entity: UpObject, entityInfo: EntityInfo) {
model,
web,
fragment,
html,
group
};
}