feat(cli): add support for archiving URLs on adding via monolith
This commit is contained in:
parent
44d76aa8d4
commit
630e8feee1
7 changed files with 189 additions and 17 deletions
|
@ -126,6 +126,7 @@ docker:
|
|||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
DO +DOCKER_COMMON
|
||||
COPY +external-monolith/monolith /usr/bin/monolith
|
||||
ARG tag=trunk
|
||||
SAVE IMAGE --push upend/upend:$tag
|
||||
|
||||
|
@ -139,6 +140,11 @@ DOCKER_COMMON:
|
|||
ENV UPEND_NO_DESKTOP=true
|
||||
ENV UPEND_ALLOW_HOST='*'
|
||||
|
||||
# Builds the `monolith` web-page archiver from crates.io and exports the
# resulting binary as an artifact for other targets to copy.
external-monolith:
    FROM rust:bookworm
    # `--locked` makes cargo use the Cargo.lock shipped with the crate, so the
    # build does not silently pick up newer (possibly breaking) dependencies.
    RUN cargo install --locked monolith
    SAVE ARTIFACT /usr/local/cargo/bin/monolith
|
||||
|
||||
# CI targets
|
||||
|
||||
lint:
|
||||
|
|
|
@ -29,10 +29,10 @@ once_cell = "1.7.2"
|
|||
lru = "0.7.0"
|
||||
|
||||
diesel = { version = "1.4", features = [
|
||||
"sqlite",
|
||||
"r2d2",
|
||||
"chrono",
|
||||
"serde_json",
|
||||
"sqlite",
|
||||
"r2d2",
|
||||
"chrono",
|
||||
"serde_json",
|
||||
] }
|
||||
diesel_migrations = "1.4"
|
||||
libsqlite3-sys = { version = "^0", features = ["bundled"] }
|
||||
|
@ -54,10 +54,10 @@ regex = "1"
|
|||
|
||||
multibase = "0.9"
|
||||
multihash = { version = "*", default-features = false, features = [
|
||||
"alloc",
|
||||
"multihash-impl",
|
||||
"sha2",
|
||||
"identity",
|
||||
"alloc",
|
||||
"multihash-impl",
|
||||
"sha2",
|
||||
"identity",
|
||||
] }
|
||||
uuid = { version = "1.4", features = ["v4"] }
|
||||
|
||||
|
@ -96,13 +96,14 @@ shadow-rs = { version = "0.23", default-features = false }
|
|||
|
||||
[features]
|
||||
default = [
|
||||
"desktop",
|
||||
"previews",
|
||||
"previews-image",
|
||||
"extractors-web",
|
||||
"extractors-audio",
|
||||
"extractors-exif",
|
||||
"extractors-media",
|
||||
"desktop",
|
||||
"previews",
|
||||
"previews-image",
|
||||
"extractors-web",
|
||||
"extractors-audio",
|
||||
"extractors-exif",
|
||||
"extractors-media",
|
||||
"extractors-external"
|
||||
]
|
||||
desktop = ["webbrowser", "opener", "is_executable"]
|
||||
previews = []
|
||||
|
@ -111,3 +112,4 @@ extractors-web = ["webpage"]
|
|||
extractors-audio = ["id3"]
|
||||
extractors-exif = ["kamadak-exif"]
|
||||
extractors-media = []
|
||||
extractors-external = []
|
||||
|
|
39
cli/src/extractors/external/mod.rs
vendored
Normal file
39
cli/src/extractors/external/mod.rs
vendored
Normal file
|
@ -0,0 +1,39 @@
|
|||
use thiserror::Error;
|
||||
|
||||
pub mod monolith;
|
||||
pub use monolith::MonolithExtractor;
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
pub enum ExternalCommandError {
|
||||
#[error("Command not found")]
|
||||
CommandNotFound,
|
||||
#[error("Command failed: {0}")]
|
||||
CommandError(String),
|
||||
}
|
||||
|
||||
pub trait ExternalCommand {
|
||||
fn get_version(&self) -> Result<String, ExternalCommandError>;
|
||||
}
|
||||
|
||||
fn process(
|
||||
output: std::io::Result<std::process::Output>,
|
||||
) -> Result<std::process::Output, ExternalCommandError> {
|
||||
match output {
|
||||
Ok(output) => {
|
||||
if output.status.success() {
|
||||
Ok(output)
|
||||
} else {
|
||||
Err(ExternalCommandError::CommandError(
|
||||
String::from_utf8(output.stderr).unwrap_or_else(|_| String::from("")),
|
||||
))
|
||||
}
|
||||
}
|
||||
Err(err) => {
|
||||
if err.kind() == std::io::ErrorKind::NotFound {
|
||||
Err(ExternalCommandError::CommandNotFound)
|
||||
} else {
|
||||
Err(ExternalCommandError::CommandError(err.to_string()))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
97
cli/src/extractors/external/monolith.rs
vendored
Normal file
97
cli/src/extractors/external/monolith.rs
vendored
Normal file
|
@ -0,0 +1,97 @@
|
|||
use crate::extractors::external::{process, ExternalCommand, ExternalCommandError};
|
||||
use crate::extractors::Extractor;
|
||||
use anyhow::Result;
|
||||
use regex::Regex;
|
||||
use std::process::Command;
|
||||
use std::sync::Arc;
|
||||
use tracing::debug;
|
||||
use upend_base::addressing::Address;
|
||||
use upend_base::entry::Entry;
|
||||
use upend_db::jobs::JobContainer;
|
||||
use upend_db::stores::UpStore;
|
||||
use upend_db::{OperationContext, UpEndConnection};
|
||||
|
||||
pub struct MonolithExtractor;
|
||||
|
||||
impl Extractor for MonolithExtractor {
|
||||
fn get(
|
||||
&self,
|
||||
address: &Address,
|
||||
connection: &UpEndConnection,
|
||||
store: Arc<Box<dyn UpStore + Send + Sync>>,
|
||||
mut job_container: JobContainer,
|
||||
context: OperationContext,
|
||||
) -> Result<Vec<Entry>> {
|
||||
if let Address::Url(url) = address {
|
||||
debug!("Archiving {} with `monolith`", url.as_str());
|
||||
let mut job_handle =
|
||||
job_container.add_job(None, &format!("Archiving <{}>", url.as_str()))?;
|
||||
job_handle.update_progress(33.0)?;
|
||||
let output = process(Command::new("monolith").arg(url.as_str()).output())?;
|
||||
job_handle.update_progress(66.0)?;
|
||||
let stdout = String::from_utf8(output.stdout)?;
|
||||
let non_alpha_regex = Regex::new(r"[^a-zA-Z0-9]+").unwrap();
|
||||
let name_hint = format!(
|
||||
"web_{}.html",
|
||||
non_alpha_regex
|
||||
.replace_all(url.as_str(), "-")
|
||||
.trim_end_matches('-')
|
||||
);
|
||||
let stored = store.store(
|
||||
connection,
|
||||
stdout.into(),
|
||||
Some(name_hint),
|
||||
None,
|
||||
context.clone(),
|
||||
)?;
|
||||
debug!("Stored {} as {:?}", url.as_str(), stored);
|
||||
job_handle.update_progress(100.0)?;
|
||||
Ok(vec![Entry {
|
||||
entity: address.clone(),
|
||||
attribute: "WM_ARCHIVED".parse().unwrap(),
|
||||
value: Address::Hash(stored).into(),
|
||||
provenance: context.provenance.clone() + "EXTRACTOR monolith",
|
||||
user: context.user.clone(),
|
||||
timestamp: chrono::Utc::now().naive_utc(),
|
||||
}])
|
||||
} else {
|
||||
Ok(vec![])
|
||||
}
|
||||
}
|
||||
|
||||
fn is_needed(&self, address: &Address, connection: &UpEndConnection) -> Result<bool> {
|
||||
if !matches!(address, Address::Url(_)) {
|
||||
return Ok(false);
|
||||
}
|
||||
|
||||
if self.get_version().is_err() {
|
||||
return Ok(false);
|
||||
}
|
||||
|
||||
let is_extracted = !connection
|
||||
.query(format!("(matches @{} (contains \"WM_ARCHIVED\") ?)", address).parse()?)?
|
||||
.is_empty();
|
||||
|
||||
Ok(!is_extracted)
|
||||
}
|
||||
}
|
||||
|
||||
impl ExternalCommand for MonolithExtractor {
|
||||
fn get_version(&self) -> Result<String, ExternalCommandError> {
|
||||
let output = process(Command::new("monolith").arg("--version").output())?;
|
||||
Ok(String::from_utf8(output.stdout).unwrap_or_else(|_| String::from("unknown")))
|
||||
}
|
||||
}
|
||||
|
||||
// #[cfg(test)]
|
||||
// mod tests {
|
||||
// use super::*;
|
||||
//
|
||||
// #[test]
|
||||
// fn test_get_version() {
|
||||
// let monolith = MonolithExtractor {};
|
||||
// let version = monolith.get_version().unwrap();
|
||||
// println!("version: {:?}", version);
|
||||
// assert!(version.contains("monolith"));
|
||||
// }
|
||||
// }
|
|
@ -22,6 +22,9 @@ pub mod exif;
|
|||
#[cfg(feature = "extractors-media")]
|
||||
pub mod media;
|
||||
|
||||
#[cfg(feature = "extractors-external")]
|
||||
pub mod external;
|
||||
|
||||
pub trait Extractor {
|
||||
fn get(
|
||||
&self,
|
||||
|
@ -177,7 +180,7 @@ pub fn extract(
|
|||
address,
|
||||
connection,
|
||||
store.clone(),
|
||||
job_container,
|
||||
job_container.clone(),
|
||||
context.clone(),
|
||||
);
|
||||
|
||||
|
@ -187,6 +190,22 @@ pub fn extract(
|
|||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "extractors-external")]
|
||||
{
|
||||
let extract_result = external::MonolithExtractor.insert_info(
|
||||
address,
|
||||
connection,
|
||||
store.clone(),
|
||||
job_container,
|
||||
context.clone(),
|
||||
);
|
||||
|
||||
match extract_result {
|
||||
Ok(count) => entry_count += count,
|
||||
Err(err) => debug!("external: {}", err),
|
||||
}
|
||||
}
|
||||
|
||||
trace!("Extracting metadata for {address:?} - got {entry_count} entries.");
|
||||
|
||||
entry_count
|
||||
|
|
|
@ -31,12 +31,15 @@
|
|||
types.pdf ||
|
||||
types.model ||
|
||||
types.web ||
|
||||
types.html ||
|
||||
types.fragment)) ??
|
||||
false;
|
||||
|
||||
$: dispatch('handled', handled);
|
||||
|
||||
let imageLoaded: string | null = null;
|
||||
let size = 0;
|
||||
$: size = $entity?.getAs('FILE_SIZE', 'number') ?? 0;
|
||||
</script>
|
||||
|
||||
{#if handled}
|
||||
|
@ -55,7 +58,7 @@
|
|||
{#if types?.image}
|
||||
<ImageViewer {address} {detail} />
|
||||
{/if}
|
||||
{#if types?.pdf}
|
||||
{#if types?.pdf && size < 15_000}
|
||||
<iframe src="{api.apiUrl}/raw/{address}?inline" title="PDF document of {address}" />
|
||||
{/if}
|
||||
{#if types?.model}
|
||||
|
@ -72,6 +75,9 @@
|
|||
on:error={() => (handled = false)}
|
||||
/>
|
||||
{/if}
|
||||
{#if types?.html && size < 15_000}
|
||||
<iframe src="{api.apiUrl}/raw/{address}?inline=1" title="HTML document of {address}" />
|
||||
{/if}
|
||||
{#if types?.fragment}
|
||||
<UpLink passthrough to={{ entity: String($entity?.get('ANNOTATES')) }}>
|
||||
<FragmentViewer {address} {detail} />
|
||||
|
|
|
@ -19,6 +19,8 @@ export function getTypes(entity: UpObject, entityInfo: EntityInfo) {
|
|||
const model = mimeType?.startsWith('model') || entity?.identify().some((l) => l.endsWith('.stl'));
|
||||
const web = entityInfo?.t == 'Url';
|
||||
const fragment = Boolean(entity?.get('ANNOTATES'));
|
||||
const html =
|
||||
mimeType.startsWith('text/html') || entity?.identify().some((l) => l.endsWith('.html'));
|
||||
|
||||
const group = entity?.backlinks.some((e) => e.attribute == ATTR_IN);
|
||||
|
||||
|
@ -32,6 +34,7 @@ export function getTypes(entity: UpObject, entityInfo: EntityInfo) {
|
|||
model,
|
||||
web,
|
||||
fragment,
|
||||
html,
|
||||
group
|
||||
};
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue