feat(cli): download URLs via yt-dlp
This commit is contained in:
parent
a24167949f
commit
f6c127bda5
4 changed files with 192 additions and 3 deletions
|
@ -118,10 +118,12 @@ docker-minimal:
|
||||||
docker:
|
docker:
|
||||||
FROM debian:bookworm
|
FROM debian:bookworm
|
||||||
RUN apt-get update && \
|
RUN apt-get update && \
|
||||||
apt-get -y install --no-install-recommends ffmpeg wget libssl3 ca-certificates && \
|
apt-get -y install --no-install-recommends ffmpeg python3 wget libssl3 ca-certificates && \
|
||||||
wget https://github.com/bbc/audiowaveform/releases/download/1.8.1/audiowaveform_1.8.1-1-12_amd64.deb && \
|
wget https://github.com/bbc/audiowaveform/releases/download/1.8.1/audiowaveform_1.8.1-1-12_amd64.deb && \
|
||||||
apt-get -y install ./audiowaveform_1.8.1-1-12_amd64.deb && \
|
apt-get -y install ./audiowaveform_1.8.1-1-12_amd64.deb && \
|
||||||
rm -v audiowaveform_1.8.1-1-12_amd64.deb && \
|
rm -v audiowaveform_1.8.1-1-12_amd64.deb && \
|
||||||
|
wget https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp -O /usr/local/bin/yt-dlp && \
|
||||||
|
chmod a+rx /usr/local/bin/yt-dlp && \
|
||||||
apt-get remove -y wget && \
|
apt-get remove -y wget && \
|
||||||
apt-get clean && \
|
apt-get clean && \
|
||||||
rm -rf /var/lib/apt/lists/*
|
rm -rf /var/lib/apt/lists/*
|
||||||
|
|
5
cli/src/extractors/external/mod.rs
vendored
5
cli/src/extractors/external/mod.rs
vendored
|
@ -1,7 +1,10 @@
|
||||||
use thiserror::Error;
|
use thiserror::Error;
|
||||||
|
|
||||||
pub mod monolith;
|
mod monolith;
|
||||||
|
mod ytdlp;
|
||||||
|
|
||||||
pub use monolith::MonolithExtractor;
|
pub use monolith::MonolithExtractor;
|
||||||
|
pub use ytdlp::YtDlpExtractor;
|
||||||
|
|
||||||
#[derive(Error, Debug)]
|
#[derive(Error, Debug)]
|
||||||
pub enum ExternalCommandError {
|
pub enum ExternalCommandError {
|
||||||
|
|
171
cli/src/extractors/external/ytdlp.rs
vendored
Normal file
171
cli/src/extractors/external/ytdlp.rs
vendored
Normal file
|
@ -0,0 +1,171 @@
|
||||||
|
use crate::extractors::external::{process, ExternalCommand, ExternalCommandError};
|
||||||
|
use crate::extractors::Extractor;
|
||||||
|
use anyhow::Result;
|
||||||
|
use regex::Regex;
|
||||||
|
use std::io::{BufReader, Read};
|
||||||
|
use std::process::{Command, Stdio};
|
||||||
|
use std::sync::Arc;
|
||||||
|
use tracing::{debug, trace};
|
||||||
|
use upend_base::addressing::Address;
|
||||||
|
use upend_base::entry::{Entry, EntryValue};
|
||||||
|
use upend_db::jobs::JobContainer;
|
||||||
|
use upend_db::stores::UpStore;
|
||||||
|
use upend_db::{OperationContext, UpEndConnection};
|
||||||
|
|
||||||
|
/// Extractor that hands URL addresses to the external `yt-dlp` program.
///
/// The type carries no state; the behaviour lives in its trait implementations.
pub struct YtDlpExtractor;
|
||||||
|
|
||||||
|
impl Extractor for YtDlpExtractor {
|
||||||
|
fn get(
|
||||||
|
&self,
|
||||||
|
address: &Address,
|
||||||
|
connection: &UpEndConnection,
|
||||||
|
store: Arc<Box<dyn UpStore + Send + Sync>>,
|
||||||
|
mut job_container: JobContainer,
|
||||||
|
context: OperationContext,
|
||||||
|
) -> Result<Vec<Entry>> {
|
||||||
|
if let Address::Url(url) = address {
|
||||||
|
debug!("Getting {} with `yt-dlp`", url.as_str());
|
||||||
|
let mut job_handle =
|
||||||
|
job_container.add_job(None, &format!("Downloading <{}>", url.as_str()))?;
|
||||||
|
|
||||||
|
let temp_dir = tempfile::tempdir()?;
|
||||||
|
debug!(
|
||||||
|
"Downloading {} with `yt-dlp` in {}",
|
||||||
|
url.as_str(),
|
||||||
|
temp_dir.path().display()
|
||||||
|
);
|
||||||
|
let mut cmd = Command::new("yt-dlp")
|
||||||
|
.args([
|
||||||
|
"--progress",
|
||||||
|
"--no-call-home",
|
||||||
|
"--no-playlist",
|
||||||
|
url.as_str(),
|
||||||
|
])
|
||||||
|
.current_dir(temp_dir.path())
|
||||||
|
.stdout(Stdio::piped())
|
||||||
|
.stderr(Stdio::piped())
|
||||||
|
.spawn()?;
|
||||||
|
|
||||||
|
let stdout = cmd.stdout.take().unwrap();
|
||||||
|
let mut reader = BufReader::new(stdout);
|
||||||
|
let mut buffer = Vec::new();
|
||||||
|
let mut chunk = vec![0; 1024];
|
||||||
|
|
||||||
|
let progress_regex = Regex::new(r"\[download] +(\d+\.\d)% of").unwrap();
|
||||||
|
while let Ok(bytes_read) = reader.read(&mut chunk) {
|
||||||
|
if bytes_read == 0 {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
buffer.extend_from_slice(&chunk[..bytes_read]);
|
||||||
|
|
||||||
|
while let Some(pos) = buffer.iter().position(|&x| x == b'\r' || x == b'\n') {
|
||||||
|
if pos > 0 {
|
||||||
|
let line = String::from_utf8_lossy(&buffer[..pos]);
|
||||||
|
trace!("Received: {}", line);
|
||||||
|
if let Some(caps) = progress_regex.captures(&line) {
|
||||||
|
if let Some(progress) = caps.get(1) {
|
||||||
|
let progress = progress.as_str().parse::<f32>().ok();
|
||||||
|
if let Some(progress) = progress {
|
||||||
|
job_handle.update_progress(progress)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
buffer.drain(..=pos);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let status = cmd.wait()?;
|
||||||
|
|
||||||
|
if !status.success() {
|
||||||
|
connection.insert_entry(Entry {
|
||||||
|
entity: address.clone(),
|
||||||
|
attribute: "YTDLD".parse().unwrap(),
|
||||||
|
value: EntryValue::Null,
|
||||||
|
provenance: context.provenance.clone() + "EXTRACTOR yt-dlp",
|
||||||
|
user: context.user.clone(),
|
||||||
|
timestamp: chrono::Utc::now().naive_utc(),
|
||||||
|
})?;
|
||||||
|
|
||||||
|
return Err(anyhow::anyhow!(
|
||||||
|
"yt-dlp failed ({}): {}",
|
||||||
|
status,
|
||||||
|
cmd.stderr
|
||||||
|
.map(|mut x| {
|
||||||
|
let mut buffer = String::new();
|
||||||
|
x.read_to_string(&mut buffer).unwrap();
|
||||||
|
buffer
|
||||||
|
})
|
||||||
|
.unwrap_or_else(|| "<no output>".to_string())
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
|
let files = std::fs::read_dir(temp_dir.path())?.collect::<Result<Vec<_>, _>>()?;
|
||||||
|
let destination = files
|
||||||
|
.first()
|
||||||
|
.ok_or_else(|| {
|
||||||
|
anyhow::anyhow!("yt-dlp didn't produce any files in {:?}", temp_dir.path())
|
||||||
|
})?
|
||||||
|
.path();
|
||||||
|
|
||||||
|
let stored = store.store(
|
||||||
|
connection,
|
||||||
|
destination.clone().into(),
|
||||||
|
destination
|
||||||
|
.file_name()
|
||||||
|
.map(|f| f.to_string_lossy().to_string()),
|
||||||
|
None,
|
||||||
|
context.clone(),
|
||||||
|
)?;
|
||||||
|
job_handle.update_progress(100.0)?;
|
||||||
|
|
||||||
|
Ok(vec![Entry {
|
||||||
|
entity: address.clone(),
|
||||||
|
attribute: "YTDLD".parse().unwrap(),
|
||||||
|
value: Address::Hash(stored).into(),
|
||||||
|
provenance: context.provenance.clone() + "EXTRACTOR yt-dlp",
|
||||||
|
user: context.user.clone(),
|
||||||
|
timestamp: chrono::Utc::now().naive_utc(),
|
||||||
|
}])
|
||||||
|
} else {
|
||||||
|
Ok(vec![])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn is_needed(&self, address: &Address, connection: &UpEndConnection) -> Result<bool> {
|
||||||
|
if !matches!(address, Address::Url(_)) {
|
||||||
|
return Ok(false);
|
||||||
|
}
|
||||||
|
|
||||||
|
if self.get_version().is_err() {
|
||||||
|
return Ok(false);
|
||||||
|
}
|
||||||
|
|
||||||
|
let is_extracted = !connection
|
||||||
|
.query(format!("(matches @{} (contains \"YTDLD\") ?)", address).parse()?)?
|
||||||
|
.is_empty();
|
||||||
|
|
||||||
|
Ok(!is_extracted)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ExternalCommand for YtDlpExtractor {
|
||||||
|
fn get_version(&self) -> Result<String, ExternalCommandError> {
|
||||||
|
let output = process(Command::new("yt-dlp").arg("--version").output())?;
|
||||||
|
Ok(String::from_utf8(output.stdout).unwrap_or_else(|_| String::from("unknown")))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// #[cfg(test)]
|
||||||
|
// mod tests {
|
||||||
|
// use super::*;
|
||||||
|
//
|
||||||
|
// #[test]
|
||||||
|
// fn test_get_version() {
|
||||||
|
// let yt_dlp = YtDlpExtractor;
|
||||||
|
// let version = yt_dlp.get_version().unwrap();
|
||||||
|
// assert!(!version.is_empty());
|
||||||
|
// }
|
||||||
|
// }
|
|
@ -193,6 +193,19 @@ pub fn extract(
|
||||||
#[cfg(feature = "extractors-external")]
|
#[cfg(feature = "extractors-external")]
|
||||||
{
|
{
|
||||||
let extract_result = external::MonolithExtractor.insert_info(
|
let extract_result = external::MonolithExtractor.insert_info(
|
||||||
|
address,
|
||||||
|
connection,
|
||||||
|
store.clone(),
|
||||||
|
job_container.clone(),
|
||||||
|
context.clone(),
|
||||||
|
);
|
||||||
|
|
||||||
|
match extract_result {
|
||||||
|
Ok(count) => entry_count += count,
|
||||||
|
Err(err) => debug!("external monolith: {}", err),
|
||||||
|
}
|
||||||
|
|
||||||
|
let extract_result = external::YtDlpExtractor.insert_info(
|
||||||
address,
|
address,
|
||||||
connection,
|
connection,
|
||||||
store.clone(),
|
store.clone(),
|
||||||
|
@ -202,7 +215,7 @@ pub fn extract(
|
||||||
|
|
||||||
match extract_result {
|
match extract_result {
|
||||||
Ok(count) => entry_count += count,
|
Ok(count) => entry_count += count,
|
||||||
Err(err) => debug!("external: {}", err),
|
Err(err) => debug!("external yt-dlp: {}", err),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue