diff --git a/Earthfile b/Earthfile
index d0de35b..c9981a8 100644
--- a/Earthfile
+++ b/Earthfile
@@ -118,10 +118,12 @@ docker-minimal:
 docker:
     FROM debian:bookworm
     RUN apt-get update && \
-        apt-get -y install --no-install-recommends ffmpeg wget libssl3 ca-certificates && \
+        apt-get -y install --no-install-recommends ffmpeg python3 wget libssl3 ca-certificates && \
         wget https://github.com/bbc/audiowaveform/releases/download/1.8.1/audiowaveform_1.8.1-1-12_amd64.deb && \
         apt-get -y install ./audiowaveform_1.8.1-1-12_amd64.deb && \
         rm -v audiowaveform_1.8.1-1-12_amd64.deb && \
+        wget https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp -O /usr/local/bin/yt-dlp && \
+        chmod a+rx /usr/local/bin/yt-dlp && \
         apt-get remove -y wget && \
         apt-get clean && \
         rm -rf /var/lib/apt/lists/*
diff --git a/cli/src/extractors/external/mod.rs b/cli/src/extractors/external/mod.rs
index 0230318..a9d54d7 100644
--- a/cli/src/extractors/external/mod.rs
+++ b/cli/src/extractors/external/mod.rs
@@ -1,7 +1,10 @@
 use thiserror::Error;
 
-pub mod monolith;
+mod monolith;
+mod ytdlp;
+
 pub use monolith::MonolithExtractor;
+pub use ytdlp::YtDlpExtractor;
 
 #[derive(Error, Debug)]
 pub enum ExternalCommandError {
diff --git a/cli/src/extractors/external/ytdlp.rs b/cli/src/extractors/external/ytdlp.rs
new file mode 100644
--- /dev/null
+++ b/cli/src/extractors/external/ytdlp.rs
@@ -0,0 +1,202 @@
+use crate::extractors::external::{process, ExternalCommand, ExternalCommandError};
+use crate::extractors::Extractor;
+use anyhow::Result;
+use regex::Regex;
+use std::io::{BufReader, Read};
+use std::process::{Command, Stdio};
+use std::sync::Arc;
+use tracing::{debug, trace};
+use upend_base::addressing::Address;
+use upend_base::entry::{Entry, EntryValue};
+use upend_db::jobs::JobContainer;
+use upend_db::stores::UpStore;
+use upend_db::{OperationContext, UpEndConnection};
+
+/// Extractor that fetches the media behind a URL address via the external `yt-dlp` binary.
+pub struct YtDlpExtractor;
+
+impl Extractor for YtDlpExtractor {
+    /// Downloads `address` with `yt-dlp`, stores the produced file and links it to the URL.
+    ///
+    /// Returns an empty list for non-URL addresses. On success, returns a single `YTDLD`
+    /// entry whose value is the hash address of the stored file. If `yt-dlp` exits with a
+    /// failure status, a `YTDLD` entry with a null value is inserted and an error is returned.
+    fn get(
+        &self,
+        address: &Address,
+        connection: &UpEndConnection,
+        store: Arc<Box<dyn UpStore + Send + Sync>>,
+        mut job_container: JobContainer,
+        context: OperationContext,
+    ) -> Result<Vec<Entry>> {
+        if let Address::Url(url) = address {
+            debug!("Getting {} with `yt-dlp`", url.as_str());
+            let mut job_handle =
+                job_container.add_job(None, &format!("Downloading <{}>", url.as_str()))?;
+
+            let temp_dir = tempfile::tempdir()?;
+            debug!(
+                "Downloading {} with `yt-dlp` in {}",
+                url.as_str(),
+                temp_dir.path().display()
+            );
+            let mut cmd = Command::new("yt-dlp")
+                .args([
+                    "--progress",
+                    "--no-call-home",
+                    "--no-playlist",
+                    url.as_str(),
+                ])
+                .current_dir(temp_dir.path())
+                .stdout(Stdio::piped())
+                .stderr(Stdio::piped())
+                .spawn()?;
+
+            // Drain stderr on a separate thread: if it were only read after the process
+            // exits, a chatty `yt-dlp` could fill the pipe and block forever while this
+            // thread is still waiting on stdout.
+            let stderr = cmd.stderr.take();
+            let stderr_reader = std::thread::spawn(move || {
+                let mut captured = Vec::new();
+                if let Some(mut stderr) = stderr {
+                    // Best effort: a failed read only shortens the diagnostic text.
+                    let _ = stderr.read_to_end(&mut captured);
+                }
+                captured
+            });
+
+            let stdout = cmd
+                .stdout
+                .take()
+                .ok_or_else(|| anyhow::anyhow!("yt-dlp stdout was not captured"))?;
+            let mut reader = BufReader::new(stdout);
+            let mut buffer = Vec::new();
+            let mut chunk = vec![0; 1024];
+
+            let progress_regex = Regex::new(r"\[download] +(\d+\.\d)% of").unwrap();
+            loop {
+                let bytes_read = match reader.read(&mut chunk) {
+                    Ok(0) => break,
+                    Ok(bytes_read) => bytes_read,
+                    // An interrupted read is not the end of the output; try again.
+                    Err(err) if err.kind() == std::io::ErrorKind::Interrupted => continue,
+                    Err(err) => {
+                        debug!("Failed reading `yt-dlp` output: {}", err);
+                        break;
+                    }
+                };
+
+                buffer.extend_from_slice(&chunk[..bytes_read]);
+
+                // Split on both `\r` and `\n` so carriage-return-terminated output is seen too.
+                while let Some(pos) = buffer.iter().position(|&x| x == b'\r' || x == b'\n') {
+                    if pos > 0 {
+                        let line = String::from_utf8_lossy(&buffer[..pos]);
+                        trace!("Received: {}", line);
+                        let progress = progress_regex
+                            .captures(&line)
+                            .and_then(|caps| caps.get(1))
+                            .and_then(|progress| progress.as_str().parse().ok());
+                        if let Some(progress) = progress {
+                            job_handle.update_progress(progress)?;
+                        }
+                    }
+
+                    buffer.drain(..=pos);
+                }
+            }
+
+            let status = cmd.wait()?;
+            // Collect what the stderr thread captured; a panicked thread yields no output.
+            let stderr_output = stderr_reader.join().unwrap_or_default();
+
+            if !status.success() {
+                // Leaves a `YTDLD` entry behind, which is what `is_needed` queries for.
+                connection.insert_entry(Entry {
+                    entity: address.clone(),
+                    attribute: "YTDLD".parse().unwrap(),
+                    value: EntryValue::Null,
+                    provenance: context.provenance.clone() + "EXTRACTOR yt-dlp",
+                    user: context.user.clone(),
+                    timestamp: chrono::Utc::now().naive_utc(),
+                })?;
+
+                return Err(anyhow::anyhow!(
+                    "yt-dlp failed ({}): {}",
+                    status,
+                    String::from_utf8_lossy(&stderr_output)
+                ));
+            }
+
+            let files = std::fs::read_dir(temp_dir.path())?.collect::<Result<Vec<_>, _>>()?;
+            let destination = files
+                .first()
+                .ok_or_else(|| {
+                    anyhow::anyhow!("yt-dlp didn't produce any files in {:?}", temp_dir.path())
+                })?
+                .path();
+
+            let stored = store.store(
+                connection,
+                destination.clone().into(),
+                destination
+                    .file_name()
+                    .map(|f| f.to_string_lossy().to_string()),
+                None,
+                context.clone(),
+            )?;
+            job_handle.update_progress(100.0)?;
+
+            Ok(vec![Entry {
+                entity: address.clone(),
+                attribute: "YTDLD".parse().unwrap(),
+                value: Address::Hash(stored).into(),
+                provenance: context.provenance.clone() + "EXTRACTOR yt-dlp",
+                user: context.user.clone(),
+                timestamp: chrono::Utc::now().naive_utc(),
+            }])
+        } else {
+            Ok(vec![])
+        }
+    }
+
+    /// Reports whether `address` is a URL for which no `YTDLD` entry exists yet.
+    ///
+    /// Also returns `false` when `get_version` fails (presumably because the `yt-dlp`
+    /// binary is not installed), so the extractor is skipped rather than failing each time.
+    fn is_needed(&self, address: &Address, connection: &UpEndConnection) -> Result<bool> {
+        if !matches!(address, Address::Url(_)) {
+            return Ok(false);
+        }
+
+        if self.get_version().is_err() {
+            return Ok(false);
+        }
+
+        let is_extracted = !connection
+            .query(format!("(matches @{} (contains \"YTDLD\") ?)", address).parse()?)?
+            .is_empty();
+
+        Ok(!is_extracted)
+    }
+}
+
+impl ExternalCommand for YtDlpExtractor {
+    /// Returns the output of `yt-dlp --version`, or `"unknown"` if it is not valid UTF-8.
+    fn get_version(&self) -> Result<String, ExternalCommandError> {
+        let output = process(Command::new("yt-dlp").arg("--version").output())?;
+        Ok(String::from_utf8(output.stdout).unwrap_or_else(|_| String::from("unknown")))
+    }
+}
+
+// #[cfg(test)]
+// mod tests {
+//     use super::*;
+//
+//     #[test]
+//     fn test_get_version() {
+//         let yt_dlp = YtDlpExtractor;
+//         let version = yt_dlp.get_version().unwrap();
+//         assert!(!version.is_empty());
+//     }
+// }
diff --git a/cli/src/extractors/mod.rs b/cli/src/extractors/mod.rs
index 7351159..e383f01 100644
--- a/cli/src/extractors/mod.rs
+++ b/cli/src/extractors/mod.rs
@@ -193,6 +193,19 @@ pub fn extract(
     #[cfg(feature = "extractors-external")]
     {
         let extract_result = external::MonolithExtractor.insert_info(
+            address,
+            connection,
+            store.clone(),
+            job_container.clone(),
+            context.clone(),
+        );
+
+        match extract_result {
+            Ok(count) => entry_count += count,
+            Err(err) => debug!("external monolith: {}", err),
+        }
+
+        let extract_result = external::YtDlpExtractor.insert_info(
             address,
             connection,
             store.clone(),
@@ -202,7 +215,7 @@ pub fn extract(
 
         match extract_result {
             Ok(count) => entry_count += count,
-            Err(err) => debug!("external: {}", err),
+            Err(err) => debug!("external yt-dlp: {}", err),
         }