feat(cli): add support for archiving URLs on adding via monolith
This commit is contained in:
parent
44d76aa8d4
commit
630e8feee1
7 changed files with 189 additions and 17 deletions
|
@ -126,6 +126,7 @@ docker:
|
||||||
apt-get clean && \
|
apt-get clean && \
|
||||||
rm -rf /var/lib/apt/lists/*
|
rm -rf /var/lib/apt/lists/*
|
||||||
DO +DOCKER_COMMON
|
DO +DOCKER_COMMON
|
||||||
|
COPY +external-monolith/monolith /usr/bin/monolith
|
||||||
ARG tag=trunk
|
ARG tag=trunk
|
||||||
SAVE IMAGE --push upend/upend:$tag
|
SAVE IMAGE --push upend/upend:$tag
|
||||||
|
|
||||||
|
@ -139,6 +140,11 @@ DOCKER_COMMON:
|
||||||
ENV UPEND_NO_DESKTOP=true
|
ENV UPEND_NO_DESKTOP=true
|
||||||
ENV UPEND_ALLOW_HOST='*'
|
ENV UPEND_ALLOW_HOST='*'
|
||||||
|
|
||||||
|
# Builds the `monolith` binary in a throwaway Rust image; the `docker` target
# copies the resulting artifact to /usr/bin/monolith.
external-monolith:
    FROM rust:bookworm
    # --locked makes cargo use the dependency versions pinned in the crate's
    # lockfile instead of re-resolving them, keeping the build reproducible.
    RUN cargo install --locked monolith
    SAVE ARTIFACT /usr/local/cargo/bin/monolith
|
||||||
|
|
||||||
# CI targets
|
# CI targets
|
||||||
|
|
||||||
lint:
|
lint:
|
||||||
|
|
|
@ -103,6 +103,7 @@ default = [
|
||||||
"extractors-audio",
|
"extractors-audio",
|
||||||
"extractors-exif",
|
"extractors-exif",
|
||||||
"extractors-media",
|
"extractors-media",
|
||||||
|
"extractors-external"
|
||||||
]
|
]
|
||||||
desktop = ["webbrowser", "opener", "is_executable"]
|
desktop = ["webbrowser", "opener", "is_executable"]
|
||||||
previews = []
|
previews = []
|
||||||
|
@ -111,3 +112,4 @@ extractors-web = ["webpage"]
|
||||||
extractors-audio = ["id3"]
|
extractors-audio = ["id3"]
|
||||||
extractors-exif = ["kamadak-exif"]
|
extractors-exif = ["kamadak-exif"]
|
||||||
extractors-media = []
|
extractors-media = []
|
||||||
|
extractors-external = []
|
||||||
|
|
39
cli/src/extractors/external/mod.rs
vendored
Normal file
39
cli/src/extractors/external/mod.rs
vendored
Normal file
|
@ -0,0 +1,39 @@
|
||||||
|
use thiserror::Error;
|
||||||
|
|
||||||
|
pub mod monolith;
|
||||||
|
pub use monolith::MonolithExtractor;
|
||||||
|
|
||||||
|
#[derive(Error, Debug)]
|
||||||
|
pub enum ExternalCommandError {
|
||||||
|
#[error("Command not found")]
|
||||||
|
CommandNotFound,
|
||||||
|
#[error("Command failed: {0}")]
|
||||||
|
CommandError(String),
|
||||||
|
}
|
||||||
|
|
||||||
|
pub trait ExternalCommand {
|
||||||
|
fn get_version(&self) -> Result<String, ExternalCommandError>;
|
||||||
|
}
|
||||||
|
|
||||||
|
fn process(
|
||||||
|
output: std::io::Result<std::process::Output>,
|
||||||
|
) -> Result<std::process::Output, ExternalCommandError> {
|
||||||
|
match output {
|
||||||
|
Ok(output) => {
|
||||||
|
if output.status.success() {
|
||||||
|
Ok(output)
|
||||||
|
} else {
|
||||||
|
Err(ExternalCommandError::CommandError(
|
||||||
|
String::from_utf8(output.stderr).unwrap_or_else(|_| String::from("")),
|
||||||
|
))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(err) => {
|
||||||
|
if err.kind() == std::io::ErrorKind::NotFound {
|
||||||
|
Err(ExternalCommandError::CommandNotFound)
|
||||||
|
} else {
|
||||||
|
Err(ExternalCommandError::CommandError(err.to_string()))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
97
cli/src/extractors/external/monolith.rs
vendored
Normal file
97
cli/src/extractors/external/monolith.rs
vendored
Normal file
|
@ -0,0 +1,97 @@
|
||||||
|
use crate::extractors::external::{process, ExternalCommand, ExternalCommandError};
|
||||||
|
use crate::extractors::Extractor;
|
||||||
|
use anyhow::Result;
|
||||||
|
use regex::Regex;
|
||||||
|
use std::process::Command;
|
||||||
|
use std::sync::Arc;
|
||||||
|
use tracing::debug;
|
||||||
|
use upend_base::addressing::Address;
|
||||||
|
use upend_base::entry::Entry;
|
||||||
|
use upend_db::jobs::JobContainer;
|
||||||
|
use upend_db::stores::UpStore;
|
||||||
|
use upend_db::{OperationContext, UpEndConnection};
|
||||||
|
|
||||||
|
/// Extractor that archives URL addresses by invoking the external `monolith` command.
pub struct MonolithExtractor;
|
||||||
|
|
||||||
|
impl Extractor for MonolithExtractor {
|
||||||
|
fn get(
|
||||||
|
&self,
|
||||||
|
address: &Address,
|
||||||
|
connection: &UpEndConnection,
|
||||||
|
store: Arc<Box<dyn UpStore + Send + Sync>>,
|
||||||
|
mut job_container: JobContainer,
|
||||||
|
context: OperationContext,
|
||||||
|
) -> Result<Vec<Entry>> {
|
||||||
|
if let Address::Url(url) = address {
|
||||||
|
debug!("Archiving {} with `monolith`", url.as_str());
|
||||||
|
let mut job_handle =
|
||||||
|
job_container.add_job(None, &format!("Archiving <{}>", url.as_str()))?;
|
||||||
|
job_handle.update_progress(33.0)?;
|
||||||
|
let output = process(Command::new("monolith").arg(url.as_str()).output())?;
|
||||||
|
job_handle.update_progress(66.0)?;
|
||||||
|
let stdout = String::from_utf8(output.stdout)?;
|
||||||
|
let non_alpha_regex = Regex::new(r"[^a-zA-Z0-9]+").unwrap();
|
||||||
|
let name_hint = format!(
|
||||||
|
"web_{}.html",
|
||||||
|
non_alpha_regex
|
||||||
|
.replace_all(url.as_str(), "-")
|
||||||
|
.trim_end_matches('-')
|
||||||
|
);
|
||||||
|
let stored = store.store(
|
||||||
|
connection,
|
||||||
|
stdout.into(),
|
||||||
|
Some(name_hint),
|
||||||
|
None,
|
||||||
|
context.clone(),
|
||||||
|
)?;
|
||||||
|
debug!("Stored {} as {:?}", url.as_str(), stored);
|
||||||
|
job_handle.update_progress(100.0)?;
|
||||||
|
Ok(vec![Entry {
|
||||||
|
entity: address.clone(),
|
||||||
|
attribute: "WM_ARCHIVED".parse().unwrap(),
|
||||||
|
value: Address::Hash(stored).into(),
|
||||||
|
provenance: context.provenance.clone() + "EXTRACTOR monolith",
|
||||||
|
user: context.user.clone(),
|
||||||
|
timestamp: chrono::Utc::now().naive_utc(),
|
||||||
|
}])
|
||||||
|
} else {
|
||||||
|
Ok(vec![])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn is_needed(&self, address: &Address, connection: &UpEndConnection) -> Result<bool> {
|
||||||
|
if !matches!(address, Address::Url(_)) {
|
||||||
|
return Ok(false);
|
||||||
|
}
|
||||||
|
|
||||||
|
if self.get_version().is_err() {
|
||||||
|
return Ok(false);
|
||||||
|
}
|
||||||
|
|
||||||
|
let is_extracted = !connection
|
||||||
|
.query(format!("(matches @{} (contains \"WM_ARCHIVED\") ?)", address).parse()?)?
|
||||||
|
.is_empty();
|
||||||
|
|
||||||
|
Ok(!is_extracted)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ExternalCommand for MonolithExtractor {
|
||||||
|
fn get_version(&self) -> Result<String, ExternalCommandError> {
|
||||||
|
let output = process(Command::new("monolith").arg("--version").output())?;
|
||||||
|
Ok(String::from_utf8(output.stdout).unwrap_or_else(|_| String::from("unknown")))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// #[cfg(test)]
|
||||||
|
// mod tests {
|
||||||
|
// use super::*;
|
||||||
|
//
|
||||||
|
// #[test]
|
||||||
|
// fn test_get_version() {
|
||||||
|
// let monolith = MonolithExtractor {};
|
||||||
|
// let version = monolith.get_version().unwrap();
|
||||||
|
// println!("version: {:?}", version);
|
||||||
|
// assert!(version.contains("monolith"));
|
||||||
|
// }
|
||||||
|
// }
|
|
@ -22,6 +22,9 @@ pub mod exif;
|
||||||
#[cfg(feature = "extractors-media")]
|
#[cfg(feature = "extractors-media")]
|
||||||
pub mod media;
|
pub mod media;
|
||||||
|
|
||||||
|
#[cfg(feature = "extractors-external")]
|
||||||
|
pub mod external;
|
||||||
|
|
||||||
pub trait Extractor {
|
pub trait Extractor {
|
||||||
fn get(
|
fn get(
|
||||||
&self,
|
&self,
|
||||||
|
@ -177,7 +180,7 @@ pub fn extract(
|
||||||
address,
|
address,
|
||||||
connection,
|
connection,
|
||||||
store.clone(),
|
store.clone(),
|
||||||
job_container,
|
job_container.clone(),
|
||||||
context.clone(),
|
context.clone(),
|
||||||
);
|
);
|
||||||
|
|
||||||
|
@ -187,6 +190,22 @@ pub fn extract(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[cfg(feature = "extractors-external")]
|
||||||
|
{
|
||||||
|
let extract_result = external::MonolithExtractor.insert_info(
|
||||||
|
address,
|
||||||
|
connection,
|
||||||
|
store.clone(),
|
||||||
|
job_container,
|
||||||
|
context.clone(),
|
||||||
|
);
|
||||||
|
|
||||||
|
match extract_result {
|
||||||
|
Ok(count) => entry_count += count,
|
||||||
|
Err(err) => debug!("external: {}", err),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
trace!("Extracting metadata for {address:?} - got {entry_count} entries.");
|
trace!("Extracting metadata for {address:?} - got {entry_count} entries.");
|
||||||
|
|
||||||
entry_count
|
entry_count
|
||||||
|
|
|
@ -31,12 +31,15 @@
|
||||||
types.pdf ||
|
types.pdf ||
|
||||||
types.model ||
|
types.model ||
|
||||||
types.web ||
|
types.web ||
|
||||||
|
types.html ||
|
||||||
types.fragment)) ??
|
types.fragment)) ??
|
||||||
false;
|
false;
|
||||||
|
|
||||||
$: dispatch('handled', handled);
|
$: dispatch('handled', handled);
|
||||||
|
|
||||||
let imageLoaded: string | null = null;
|
let imageLoaded: string | null = null;
|
||||||
|
let size = 0;
|
||||||
|
$: size = $entity?.getAs('FILE_SIZE', 'number') ?? 0;
|
||||||
</script>
|
</script>
|
||||||
|
|
||||||
{#if handled}
|
{#if handled}
|
||||||
|
@ -55,7 +58,7 @@
|
||||||
{#if types?.image}
|
{#if types?.image}
|
||||||
<ImageViewer {address} {detail} />
|
<ImageViewer {address} {detail} />
|
||||||
{/if}
|
{/if}
|
||||||
{#if types?.pdf}
|
{#if types?.pdf && size < 15_000}
|
||||||
<iframe src="{api.apiUrl}/raw/{address}?inline" title="PDF document of {address}" />
|
<iframe src="{api.apiUrl}/raw/{address}?inline" title="PDF document of {address}" />
|
||||||
{/if}
|
{/if}
|
||||||
{#if types?.model}
|
{#if types?.model}
|
||||||
|
@ -72,6 +75,9 @@
|
||||||
on:error={() => (handled = false)}
|
on:error={() => (handled = false)}
|
||||||
/>
|
/>
|
||||||
{/if}
|
{/if}
|
||||||
|
{#if types?.html && size < 15_000}
|
||||||
|
<iframe src="{api.apiUrl}/raw/{address}?inline=1" title="HTML document of {address}" />
|
||||||
|
{/if}
|
||||||
{#if types?.fragment}
|
{#if types?.fragment}
|
||||||
<UpLink passthrough to={{ entity: String($entity?.get('ANNOTATES')) }}>
|
<UpLink passthrough to={{ entity: String($entity?.get('ANNOTATES')) }}>
|
||||||
<FragmentViewer {address} {detail} />
|
<FragmentViewer {address} {detail} />
|
||||||
|
|
|
@ -19,6 +19,8 @@ export function getTypes(entity: UpObject, entityInfo: EntityInfo) {
|
||||||
const model = mimeType?.startsWith('model') || entity?.identify().some((l) => l.endsWith('.stl'));
|
const model = mimeType?.startsWith('model') || entity?.identify().some((l) => l.endsWith('.stl'));
|
||||||
const web = entityInfo?.t == 'Url';
|
const web = entityInfo?.t == 'Url';
|
||||||
const fragment = Boolean(entity?.get('ANNOTATES'));
|
const fragment = Boolean(entity?.get('ANNOTATES'));
|
||||||
|
// `mimeType` is accessed with `?.` by the `model` check above, i.e. it may be
// undefined; guard it the same way here so this does not throw.
const html =
	mimeType?.startsWith('text/html') || entity?.identify().some((l) => l.endsWith('.html'));
|
||||||
|
|
||||||
const group = entity?.backlinks.some((e) => e.attribute == ATTR_IN);
|
const group = entity?.backlinks.some((e) => e.attribute == ATTR_IN);
|
||||||
|
|
||||||
|
@ -32,6 +34,7 @@ export function getTypes(entity: UpObject, entityInfo: EntityInfo) {
|
||||||
model,
|
model,
|
||||||
web,
|
web,
|
||||||
fragment,
|
fragment,
|
||||||
|
html,
|
||||||
group
|
group
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue