feat(cli): download URLs via yt-dlp

This commit is contained in:
Tomáš Mládek 2024-06-30 17:07:06 +02:00
parent a24167949f
commit f6c127bda5
4 changed files with 192 additions and 3 deletions

View file

@ -118,10 +118,12 @@ docker-minimal:
docker: docker:
FROM debian:bookworm FROM debian:bookworm
RUN apt-get update && \ RUN apt-get update && \
apt-get -y install --no-install-recommends ffmpeg wget libssl3 ca-certificates && \ apt-get -y install --no-install-recommends ffmpeg python3 wget libssl3 ca-certificates && \
wget https://github.com/bbc/audiowaveform/releases/download/1.8.1/audiowaveform_1.8.1-1-12_amd64.deb && \ wget https://github.com/bbc/audiowaveform/releases/download/1.8.1/audiowaveform_1.8.1-1-12_amd64.deb && \
apt-get -y install ./audiowaveform_1.8.1-1-12_amd64.deb && \ apt-get -y install ./audiowaveform_1.8.1-1-12_amd64.deb && \
rm -v audiowaveform_1.8.1-1-12_amd64.deb && \ rm -v audiowaveform_1.8.1-1-12_amd64.deb && \
wget https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp -O /usr/local/bin/yt-dlp && \
chmod a+rx /usr/local/bin/yt-dlp && \
apt-get remove -y wget && \ apt-get remove -y wget && \
apt-get clean && \ apt-get clean && \
rm -rf /var/lib/apt/lists/* rm -rf /var/lib/apt/lists/*

View file

@ -1,7 +1,10 @@
use thiserror::Error; use thiserror::Error;
pub mod monolith; mod monolith;
mod ytdlp;
pub use monolith::MonolithExtractor; pub use monolith::MonolithExtractor;
pub use ytdlp::YtDlpExtractor;
#[derive(Error, Debug)] #[derive(Error, Debug)]
pub enum ExternalCommandError { pub enum ExternalCommandError {

171
cli/src/extractors/external/ytdlp.rs vendored Normal file
View file

@ -0,0 +1,171 @@
use crate::extractors::external::{process, ExternalCommand, ExternalCommandError};
use crate::extractors::Extractor;
use anyhow::Result;
use regex::Regex;
use std::io::{BufReader, Read};
use std::process::{Command, Stdio};
use std::sync::Arc;
use tracing::{debug, trace};
use upend_base::addressing::Address;
use upend_base::entry::{Entry, EntryValue};
use upend_db::jobs::JobContainer;
use upend_db::stores::UpStore;
use upend_db::{OperationContext, UpEndConnection};
/// Extractor that downloads the target of a URL address by running the
/// external `yt-dlp` command and records the result under the `YTDLD`
/// attribute.
pub struct YtDlpExtractor;
impl Extractor for YtDlpExtractor {
fn get(
&self,
address: &Address,
connection: &UpEndConnection,
store: Arc<Box<dyn UpStore + Send + Sync>>,
mut job_container: JobContainer,
context: OperationContext,
) -> Result<Vec<Entry>> {
if let Address::Url(url) = address {
debug!("Getting {} with `yt-dlp`", url.as_str());
let mut job_handle =
job_container.add_job(None, &format!("Downloading <{}>", url.as_str()))?;
let temp_dir = tempfile::tempdir()?;
debug!(
"Downloading {} with `yt-dlp` in {}",
url.as_str(),
temp_dir.path().display()
);
let mut cmd = Command::new("yt-dlp")
.args([
"--progress",
"--no-call-home",
"--no-playlist",
url.as_str(),
])
.current_dir(temp_dir.path())
.stdout(Stdio::piped())
.stderr(Stdio::piped())
.spawn()?;
let stdout = cmd.stdout.take().unwrap();
let mut reader = BufReader::new(stdout);
let mut buffer = Vec::new();
let mut chunk = vec![0; 1024];
let progress_regex = Regex::new(r"\[download] +(\d+\.\d)% of").unwrap();
while let Ok(bytes_read) = reader.read(&mut chunk) {
if bytes_read == 0 {
break;
}
buffer.extend_from_slice(&chunk[..bytes_read]);
while let Some(pos) = buffer.iter().position(|&x| x == b'\r' || x == b'\n') {
if pos > 0 {
let line = String::from_utf8_lossy(&buffer[..pos]);
trace!("Received: {}", line);
if let Some(caps) = progress_regex.captures(&line) {
if let Some(progress) = caps.get(1) {
let progress = progress.as_str().parse::<f32>().ok();
if let Some(progress) = progress {
job_handle.update_progress(progress)?;
}
}
}
}
buffer.drain(..=pos);
}
}
let status = cmd.wait()?;
if !status.success() {
connection.insert_entry(Entry {
entity: address.clone(),
attribute: "YTDLD".parse().unwrap(),
value: EntryValue::Null,
provenance: context.provenance.clone() + "EXTRACTOR yt-dlp",
user: context.user.clone(),
timestamp: chrono::Utc::now().naive_utc(),
})?;
return Err(anyhow::anyhow!(
"yt-dlp failed ({}): {}",
status,
cmd.stderr
.map(|mut x| {
let mut buffer = String::new();
x.read_to_string(&mut buffer).unwrap();
buffer
})
.unwrap_or_else(|| "<no output>".to_string())
));
}
let files = std::fs::read_dir(temp_dir.path())?.collect::<Result<Vec<_>, _>>()?;
let destination = files
.first()
.ok_or_else(|| {
anyhow::anyhow!("yt-dlp didn't produce any files in {:?}", temp_dir.path())
})?
.path();
let stored = store.store(
connection,
destination.clone().into(),
destination
.file_name()
.map(|f| f.to_string_lossy().to_string()),
None,
context.clone(),
)?;
job_handle.update_progress(100.0)?;
Ok(vec![Entry {
entity: address.clone(),
attribute: "YTDLD".parse().unwrap(),
value: Address::Hash(stored).into(),
provenance: context.provenance.clone() + "EXTRACTOR yt-dlp",
user: context.user.clone(),
timestamp: chrono::Utc::now().naive_utc(),
}])
} else {
Ok(vec![])
}
}
fn is_needed(&self, address: &Address, connection: &UpEndConnection) -> Result<bool> {
if !matches!(address, Address::Url(_)) {
return Ok(false);
}
if self.get_version().is_err() {
return Ok(false);
}
let is_extracted = !connection
.query(format!("(matches @{} (contains \"YTDLD\") ?)", address).parse()?)?
.is_empty();
Ok(!is_extracted)
}
}
impl ExternalCommand for YtDlpExtractor {
    /// Runs `yt-dlp --version` and returns whatever it printed to stdout.
    ///
    /// The raw output is returned as-is (it is not trimmed). If stdout is not
    /// valid UTF-8, the literal string `"unknown"` is returned instead.
    ///
    /// # Errors
    ///
    /// Propagates whatever error `process` produces for the command's
    /// outcome.
    fn get_version(&self) -> Result<String, ExternalCommandError> {
        let invocation = Command::new("yt-dlp").arg("--version").output();
        let finished = process(invocation)?;
        match String::from_utf8(finished.stdout) {
            Ok(version) => Ok(version),
            Err(_) => Ok(String::from("unknown")),
        }
    }
}
// #[cfg(test)]
// mod tests {
// use super::*;
//
// #[test]
// fn test_get_version() {
// let yt_dlp = YtDlpExtractor;
// let version = yt_dlp.get_version().unwrap();
// assert!(!version.is_empty());
// }
// }

View file

@ -193,6 +193,19 @@ pub fn extract(
#[cfg(feature = "extractors-external")] #[cfg(feature = "extractors-external")]
{ {
let extract_result = external::MonolithExtractor.insert_info( let extract_result = external::MonolithExtractor.insert_info(
address,
connection,
store.clone(),
job_container.clone(),
context.clone(),
);
match extract_result {
Ok(count) => entry_count += count,
Err(err) => debug!("external monolith: {}", err),
}
let extract_result = external::YtDlpExtractor.insert_info(
address, address,
connection, connection,
store.clone(), store.clone(),
@ -202,7 +215,7 @@ pub fn extract(
match extract_result { match extract_result {
Ok(count) => entry_count += count, Ok(count) => entry_count += count,
Err(err) => debug!("external: {}", err), Err(err) => debug!("external yt-dlp: {}", err),
} }
} }