// upend/src/filesystem.rs

use crate::addressing::Address;
use crate::database::{
bulk_retrieve_objects, file_set_valid, insert_entry, insert_file, query, retrieve_all_files,
DbPool, Entry, EntryQuery, EntryValue, InvariantEntry, Query, QueryComponent, QueryPart,
DATABASE_FILENAME, IS_OF_TYPE_ATTR, TYPE_ADDR, TYPE_HAS_ATTR, TYPE_ID_ATTR,
TYPE_INSTANCED_ATTR, TYPE_IS_ATTR, TYPE_REQUIRES_ATTR,
};
use crate::hash::Hashable;
use crate::jobs::{Job, JobContainer, JobId};
use crate::models;
use crate::models::File;
use anyhow::{anyhow, Error, Result};
use chrono::prelude::*;
use diesel::sqlite::Sqlite;
use diesel::Connection;
use log::{error, info, trace, warn};
use once_cell::unsync::Lazy;
use rayon::prelude::*;
use serde_json::Value;
use std::convert::TryFrom;
use std::path::{Component, Path, PathBuf};
use std::sync::{Arc, RwLock};
use std::time::{Instant, UNIX_EPOCH};
use std::{fs, iter};
use uuid::Uuid;
use walkdir::WalkDir;
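
// Type and attribute constants for virtual (in-database) directories.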
const DIR_TYPE: &str = "FS_DIR";
const DIR_KEY: &str = "DIR";
const DIR_HAS_KEY: &str = "DIR_HAS";
lazy_static! {
static ref DIR_TYPE_INVARIANT: InvariantEntry = InvariantEntry {
attribute: String::from(TYPE_IS_ATTR),
value: EntryValue::Value(Value::from(DIR_TYPE)),
};
static ref DIR_TYPE_ADDR: Address = DIR_TYPE_INVARIANT.entity().unwrap();
}
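
// Type and attribute constants for files and their content blobs.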
const BLOB_TYPE: &str = "BLOB";
const FILE_TYPE: &str = "FS_FILE";
const FILE_IDENTITY_KEY: &str = "FILE_IS";
const FILENAME_KEY: &str = "FILE_NAME";
const FILE_MIME_KEY: &str = "FILE_MIME";
const FILE_MTIME_KEY: &str = "FILE_MTIME";
const FILE_SIZE_KEY: &str = "FILE_SIZE";
lazy_static! {
static ref BLOB_TYPE_INVARIANT: InvariantEntry = InvariantEntry {
attribute: String::from(TYPE_IS_ATTR),
value: EntryValue::Value(Value::from(BLOB_TYPE)),
};
static ref BLOB_TYPE_ADDR: Address = BLOB_TYPE_INVARIANT.entity().unwrap();
static ref FILE_TYPE_INVARIANT: InvariantEntry = InvariantEntry {
attribute: String::from(TYPE_IS_ATTR),
value: EntryValue::Value(Value::from(FILE_TYPE)),
};
static ref FILE_TYPE_ADDR: Address = FILE_TYPE_INVARIANT.entity().unwrap();
}
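
/// Insert the invariant entries describing the built-in BLOB, FS_FILE and
/// FS_DIR types, along with the attributes each type may (`TYPE_HAS_ATTR`),
/// must (`TYPE_REQUIRES_ATTR`), or is identified by (`TYPE_ID_ATTR`).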
fn initialize_types(pool: &DbPool) -> Result<()> {
// BLOB_TYPE
insert_entry(&pool.get()?, Entry::try_from(&*BLOB_TYPE_INVARIANT)?)?;
upend_insert_addr!(&pool.get()?, BLOB_TYPE_ADDR, IS_OF_TYPE_ATTR, TYPE_ADDR);
upend_insert_val!(&pool.get()?, BLOB_TYPE_ADDR, TYPE_INSTANCED_ATTR, FILE_TYPE);
upend_insert_val!(&pool.get()?, BLOB_TYPE_ADDR, TYPE_HAS_ATTR, FILE_MTIME_KEY);
upend_insert_val!(&pool.get()?, BLOB_TYPE_ADDR, TYPE_HAS_ATTR, FILE_SIZE_KEY);
upend_insert_val!(&pool.get()?, BLOB_TYPE_ADDR, TYPE_HAS_ATTR, FILE_MIME_KEY);
// FILE_TYPE
insert_entry(&pool.get()?, Entry::try_from(&*FILE_TYPE_INVARIANT)?)?;
upend_insert_addr!(&pool.get()?, FILE_TYPE_ADDR, IS_OF_TYPE_ATTR, TYPE_ADDR);
upend_insert_val!(&pool.get()?, FILE_TYPE_ADDR, TYPE_ID_ATTR, FILENAME_KEY);
upend_insert_val!(
&pool.get()?,
FILE_TYPE_ADDR,
TYPE_REQUIRES_ATTR,
FILE_IDENTITY_KEY
);
upend_insert_val!(&pool.get()?, FILE_TYPE_ADDR, TYPE_HAS_ATTR, FILE_MIME_KEY);
// DIR_TYPE
insert_entry(&pool.get()?, Entry::try_from(&*DIR_TYPE_INVARIANT)?)?;
upend_insert_addr!(&pool.get()?, DIR_TYPE_ADDR, IS_OF_TYPE_ATTR, TYPE_ADDR);
upend_insert_val!(&pool.get()?, DIR_TYPE_ADDR, TYPE_ID_ATTR, DIR_KEY);
upend_insert_val!(&pool.get()?, DIR_TYPE_ADDR, TYPE_HAS_ATTR, DIR_HAS_KEY);
Ok(())
}
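
/// A single named segment of a virtual directory path.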
#[derive(Debug, Clone, PartialEq)]
pub struct UDirectory {
name: String,
}
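
/// A virtual path within the vault, as a sequence of directory segments.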
#[derive(Debug, Clone, PartialEq)]
pub struct UPath(Vec<UDirectory>);
const TOP_SEPARATOR: &str = "//";
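
// Parses the textual form "root//first/second": the root segment is set off
// by `//`, subsequent segments by `/`. For example, "NATIVE//holidays/2019"
// yields the segments ["NATIVE", "holidays", "2019"]. Empty segment names
// are rejected.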
impl std::str::FromStr for UPath {
type Err = anyhow::Error;
fn from_str(string: &str) -> Result<Self, Self::Err> {
if string.is_empty() {
Ok(UPath(vec![]))
} else {
let result = match string.find(TOP_SEPARATOR) {
Some(head_idx) => {
let (head, rest) = string.split_at(head_idx);
let mut result: Vec<UDirectory> = vec![UDirectory {
name: String::from(head),
}];
result.append(
rest[TOP_SEPARATOR.len()..]
.trim_end_matches('/')
.split('/')
.map(|part| UDirectory {
name: String::from(part),
})
.collect::<Vec<UDirectory>>()
.as_mut(),
);
result
}
None => string
.trim_end_matches('/')
.split('/')
.map(|part| UDirectory {
name: String::from(part),
})
.collect(),
};
for directory in &result {
if directory.name.is_empty() {
return Err(anyhow!("INVALID PATH: Directory name cannot be empty!"));
}
}
Ok(UPath(result))
}
}
}
impl std::fmt::Display for UDirectory {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.name)
}
}
impl std::fmt::Display for UPath {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self.0.len() {
0 => write!(f, ""),
1 => write!(f, "{}", self.0.first().unwrap().name),
_ => {
let (head, tail) = self.0.split_first().unwrap();
write!(
f,
"{}//{}",
head.name,
tail.iter()
.map(|udir| udir.name.clone())
.collect::<Vec<String>>()
.join("/")
)
}
}
}
}
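
/// Convenience helper for extracting all `Address` values from a list of entries.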
trait EntryList {
fn extract_addresses(&self) -> Vec<Address>;
}
impl EntryList for Vec<Entry> {
fn extract_addresses(&self) -> Vec<Address> {
self.iter()
.filter_map(|e| {
if let EntryValue::Address(address) = &e.value {
Some(address.clone())
} else {
None
}
})
.collect()
}
}
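
/// Return the addresses of all directories not contained in any other
/// directory, i.e. the roots of the virtual filesystem.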
pub fn list_roots<C: Connection<Backend = Sqlite>>(connection: &C) -> Result<Vec<Address>> {
let all_directories: Vec<Entry> = query(
connection,
Query::SingleQuery(QueryPart::Matches(EntryQuery {
entity: QueryComponent::Any,
attribute: QueryComponent::Exact(DIR_KEY.to_string()),
value: QueryComponent::Any,
})),
)?;
let directories_with_parents: Vec<Address> = query(
connection,
Query::SingleQuery(QueryPart::Matches(EntryQuery {
entity: QueryComponent::Any,
attribute: QueryComponent::Exact(DIR_HAS_KEY.to_string()),
value: QueryComponent::Any,
})),
)?
.extract_addresses();
Ok(all_directories
.into_iter()
.filter(|entry| !directories_with_parents.contains(&entry.entity))
.map(|e| e.entity)
.collect())
}
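
/// List the contents of the directory at `path`; an empty path lists the
/// roots. Only `DIR`, `FILE_NAME` and `FILE_IS` entries are returned.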
pub async fn list_directory<C: Connection<Backend = Sqlite>>(
connection: &C,
path: &UPath,
) -> Result<Vec<Entry>> {
let entry_addresses = match path.0.len() {
0 => list_roots(connection)?,
_ => {
let resolved_path: Vec<Address> = resolve_path(connection, path, false)?;
let last = resolved_path.last().unwrap();
query(
connection,
Query::SingleQuery(QueryPart::Matches(EntryQuery {
entity: QueryComponent::Exact(last.clone()),
attribute: QueryComponent::Exact(DIR_HAS_KEY.to_string()),
value: QueryComponent::Any,
})),
)?
.extract_addresses()
}
};
Ok(bulk_retrieve_objects(connection, entry_addresses)?
.into_iter()
.filter(|e| [DIR_KEY, FILENAME_KEY, FILE_IDENTITY_KEY].contains(&e.attribute.as_str()))
.collect::<Vec<Entry>>())
}
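
/// Find the address of the directory named `directory` under `parent` (or
/// among the roots, if `parent` is `None`); if `create` is set, create it
/// when it doesn't exist yet.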
pub fn fetch_or_create_dir<C: Connection<Backend = Sqlite>>(
connection: &C,
parent: Option<Address>,
directory: UDirectory,
create: bool,
) -> Result<Address> {
match &parent {
Some(address) => trace!("FETCHING/CREATING {}/{:#}", address, directory),
None => trace!("FETCHING/CREATING /{:#}", directory),
}
let matching_directories: Vec<Address> = query(
connection,
Query::SingleQuery(QueryPart::Matches(EntryQuery {
entity: QueryComponent::Any,
attribute: QueryComponent::Exact(String::from(DIR_KEY)),
value: QueryComponent::Exact(EntryValue::Value(Value::String(directory.name.clone()))),
})),
)?
.into_iter()
.map(|e: Entry| e.entity)
.collect();
let valid_directories: Vec<Address> = match parent.clone() {
Some(address) => {
let parent_has: Vec<Address> = query(
connection,
Query::SingleQuery(QueryPart::Matches(EntryQuery {
entity: QueryComponent::Exact(address),
attribute: QueryComponent::Exact(String::from(DIR_HAS_KEY)),
value: QueryComponent::Any,
})),
)?
.extract_addresses();
matching_directories
.into_iter()
.filter(|a| parent_has.contains(a))
.collect()
}
None => {
let roots = list_roots(connection)?;
matching_directories
.into_iter()
.filter(|a| roots.contains(a))
.collect()
}
};
match valid_directories.len() {
0 => {
if create {
let new_directory_address = Address::Uuid(Uuid::new_v4());
let type_entry = Entry {
entity: new_directory_address.clone(),
attribute: String::from(IS_OF_TYPE_ATTR),
value: EntryValue::Address(DIR_TYPE_ADDR.clone()),
};
insert_entry(connection, type_entry)?;
let directory_entry = Entry {
entity: new_directory_address.clone(),
attribute: String::from(DIR_KEY),
value: EntryValue::Value(Value::String(directory.name)),
};
insert_entry(connection, directory_entry)?;
if let Some(parent_addr) = parent {
let has_entry = Entry {
entity: parent_addr,
attribute: String::from(DIR_HAS_KEY),
value: EntryValue::Address(new_directory_address.clone()),
};
insert_entry(connection, has_entry)?;
}
Ok(new_directory_address)
} else {
Err(anyhow!("Directory does not exist."))
}
}
1 => Ok(valid_directories[0].clone()),
_ => Err(anyhow!(
"Invalid database state - more than one directory matches the query!"
)),
}
}
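
/// Resolve a `UPath` into the addresses of its directories, from the root
/// down, optionally creating any directories missing along the way.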
pub fn resolve_path<C: Connection<Backend = Sqlite>>(
connection: &C,
path: &UPath,
create: bool,
) -> Result<Vec<Address>> {
let mut result: Vec<Address> = vec![];
let mut path_stack = path.0.to_vec();
path_stack.reverse();
while !path_stack.is_empty() {
let dir_address = fetch_or_create_dir(
connection,
result.last().cloned(),
path_stack.pop().unwrap(),
create,
)?;
result.push(dir_address);
}
Ok(result)
}
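
/// Rescan the vault on a blocking worker thread, reporting progress through
/// a `REIMPORT` job in the given job container.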
pub async fn rescan_vault(
pool: DbPool,
directory: PathBuf,
job_container: Arc<RwLock<JobContainer>>,
) {
let job_id = job_container
.write()
.unwrap()
.add_job(Job::new("REIMPORT", "Reimporting vault..."))
.unwrap();
let result =
actix_web::web::block(move || _rescan_vault(pool, directory, job_container, job_id)).await;
if let Err(err) = result {
error!("Update did not succeed! {:?}", err);
}
}
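
/// The result of processing a single path during a rescan.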
type UpdatePathResult = Result<UpdatePathOutcome>;
enum UpdatePathOutcome {
Added(PathBuf),
Unchanged(PathBuf),
Removed(PathBuf),
}
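
/// Walk the vault directory, (re)import every file found, and mark database
/// records of files that are no longer present as invalid.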
fn _rescan_vault<T: AsRef<Path>>(
pool: DbPool,
directory: T,
job_container: Arc<RwLock<JobContainer>>,
job_id: JobId,
) -> Result<Vec<UpdatePathResult>> {
let start = Instant::now();
// Initialize types, etc...
initialize_types(&pool)?;
// Walk through the vault, find all paths
let path_entries: Vec<PathBuf> = WalkDir::new(&directory)
.into_iter()
.filter_map(|e| e.ok())
.filter(|e| e.path().is_file() && e.file_name() != DATABASE_FILENAME)
.map(|e| fs::canonicalize(e.into_path()).unwrap())
.collect();
// Prepare for processing
let rw_pool = Arc::new(RwLock::new(pool.clone()));
let absolute_path = fs::canonicalize(&directory)?;
let existing_files = Arc::new(RwLock::new(retrieve_all_files(&pool.get()?)?));
// Actual processing
let count = RwLock::new(0_usize);
let total = path_entries.len() as f32;
let path_results: Vec<UpdatePathResult> = path_entries
.into_par_iter()
.map(|path| {
let result = _process_directory_entry(&rw_pool, path, &absolute_path, &existing_files)?;
let mut cnt = count.write().unwrap();
*cnt += 1;
job_container
.write()
.unwrap()
.update_progress(&job_id, *cnt as f32 / total * 100.0)
.unwrap();
Ok(result)
})
.collect();
let cleanup_results: Vec<UpdatePathResult> = existing_files
.write()
.unwrap()
.iter()
.filter(|f| f.valid)
.map(|file| {
let connection = pool.get()?;
connection.transaction::<_, Error, _>(|| {
file_set_valid(&connection, file.id, false)?;
// remove_object(&connection, )?
Ok(UpdatePathOutcome::Removed(PathBuf::from(file.path.clone())))
})
})
.collect();
let mut failed: Vec<&Error> = vec![];
let mut created = 0;
let mut unchanged = 0;
let mut deleted = 0;
for result in &path_results {
match result {
Ok(result) => match result {
UpdatePathOutcome::Added(_) => created += 1,
UpdatePathOutcome::Unchanged(_) => unchanged += 1,
UpdatePathOutcome::Removed(_) => deleted += 1,
},
Err(err) => failed.push(err),
}
}
if !failed.is_empty() {
warn!(
"{} path updates failed! ({})",
failed.len(),
failed
.iter()
.map(|e| e.to_string())
.collect::<Vec<String>>()
.join(", ")
)
}
info!(
"Finished updating {} ({} created, {} deleted, {} left unchanged). Took {}s.",
directory.as_ref().display(),
created,
deleted,
unchanged,
start.elapsed().as_secs()
);
Ok(path_results
.into_iter()
.chain(cleanup_results.into_iter())
.collect())
}
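
/// Process a single file: if its size and mtime (or hash) match an existing
/// record, just mark that record valid; otherwise hash the file, insert a new
/// `File` row along with its BLOB and FS_FILE entries, and link it into the
/// `NATIVE` virtual directory tree mirroring its on-disk location.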
fn _process_directory_entry<P: AsRef<Path>>(
db_pool: &Arc<RwLock<DbPool>>,
path: PathBuf,
directory_path: &P,
existing_files: &Arc<RwLock<Vec<File>>>,
) -> UpdatePathResult {
info!("Processing: {:?}", path);
// Prepare the data
let db_pool = Arc::clone(&db_pool);
let existing_files = Arc::clone(&existing_files);
let normalized_path = path.strip_prefix(&directory_path)?;
let normalized_path_str = normalized_path.to_str().expect("path not valid unicode?!");
let digest = Lazy::new(|| path.hash());
// Get size & mtime for quick comparison
let metadata = fs::metadata(&path)?;
let size = metadata.len() as i64;
if size < 0 {
panic!("File {} too large?!", path.display());
}
let mtime = metadata
.modified()
.map(|t| {
NaiveDateTime::from_timestamp(t.duration_since(UNIX_EPOCH).unwrap().as_secs() as i64, 0)
})
.ok();
// Check if the path entry for this file already exists in database
{
// Only grab existing_files for the duration of this block
let mut existing_files = existing_files.write().unwrap();
let maybe_existing_file = existing_files
.iter()
.enumerate()
.find(|(_, file)| file.path == normalized_path_str);
if let Some((idx, existing_file)) = maybe_existing_file {
if (size == existing_file.size && mtime == existing_file.mtime)
|| ((*digest).is_ok() && &existing_file.hash == (*digest).as_ref().unwrap())
{
if !existing_file.valid {
file_set_valid(&db_pool.write().unwrap().get()?, existing_file.id, true)?;
}
existing_files.swap_remove(idx);
return Ok(UpdatePathOutcome::Unchanged(path));
}
}
}
// If not, add it!
if let Err(err) = &*digest {
return Err(anyhow!("Error hashing: {}", err));
}
let digest = (*digest).as_ref().unwrap().clone();
let new_file = models::NewFile {
path: normalized_path_str.to_string(),
hash: digest.0.clone(),
added: NaiveDateTime::from_timestamp(Utc::now().timestamp(), 0),
size,
mtime,
};
insert_file(&db_pool.write().unwrap().get()?, new_file)?;
// Insert metadata
let type_entry = Entry {
entity: Address::Hash(digest.clone()),
attribute: String::from(IS_OF_TYPE_ATTR),
value: EntryValue::Address(BLOB_TYPE_ADDR.clone()),
};
insert_entry(&db_pool.write().unwrap().get()?, type_entry)?;
let size_entry = Entry {
entity: Address::Hash(digest.clone()),
attribute: FILE_SIZE_KEY.to_string(),
value: EntryValue::Value(Value::from(size)),
};
insert_entry(&db_pool.write().unwrap().get()?, size_entry)?;
if let Some(mtime) = mtime {
let mtime_entry = Entry {
entity: Address::Hash(digest.clone()),
attribute: FILE_MTIME_KEY.to_string(),
value: EntryValue::Value(Value::from(mtime.timestamp())),
};
insert_entry(&db_pool.write().unwrap().get()?, mtime_entry)?;
}
let mime_entry = Entry {
entity: Address::Hash(digest.clone()),
attribute: FILE_MIME_KEY.to_string(),
value: EntryValue::Value(Value::String(tree_magic::from_filepath(&path))),
};
insert_entry(&db_pool.write().unwrap().get()?, mime_entry)?;
// Finally, add the appropriate entries w/r/t virtual filesystem location
let components = normalized_path.components().collect::<Vec<Component>>();
let (filename, dir_path) = components.split_last().unwrap();
let upath = UPath(
iter::once(UDirectory {
name: "NATIVE".to_string(),
})
.chain(dir_path.iter().map(|component| UDirectory {
name: component.as_os_str().to_string_lossy().to_string(),
}))
.collect(),
);
let resolved_path = resolve_path(&db_pool.write().unwrap().get()?, &upath, true)?;
let parent_dir = resolved_path.last().unwrap();
let pool = db_pool.write().unwrap();
let connection = pool.get()?;
connection.transaction::<_, Error, _>(|| {
let file_address = Address::Uuid(Uuid::new_v4());
let type_entry = Entry {
entity: file_address.clone(),
attribute: String::from(IS_OF_TYPE_ATTR),
value: EntryValue::Address(FILE_TYPE_ADDR.clone()),
};
insert_entry(&connection, type_entry)?;
let name_entry = Entry {
entity: file_address.clone(),
attribute: FILENAME_KEY.to_string(),
value: EntryValue::Value(Value::String(
filename.as_os_str().to_string_lossy().to_string(),
)),
};
insert_entry(&connection, name_entry)?;
let identity_entry = Entry {
entity: file_address.clone(),
attribute: FILE_IDENTITY_KEY.to_string(),
value: EntryValue::Address(Address::Hash(digest.clone())),
};
insert_entry(&connection, identity_entry)?;
let dir_has_entry = Entry {
entity: parent_dir.clone(),
attribute: DIR_HAS_KEY.to_string(),
value: EntryValue::Address(file_address),
};
insert_entry(&connection, dir_has_entry)?;
Ok(UpdatePathOutcome::Added(path.clone()))
})
}
#[cfg(test)]
mod tests {
use anyhow::Result;
use crate::filesystem::{UDirectory, UPath};
#[test]
fn test_path_codec() {
let path = UPath(vec![
UDirectory {
name: "top".to_string(),
},
UDirectory {
name: "foo".to_string(),
},
UDirectory {
name: "bar".to_string(),
},
UDirectory {
name: "baz".to_string(),
},
]);
let str_path = path.to_string();
assert!(!str_path.is_empty());
let decoded_path: Result<UPath> = str_path.parse();
assert!(decoded_path.is_ok());
assert_eq!(path, decoded_path.unwrap());
}
#[test]
fn test_validation() {
let invalid_path: Result<UPath> = "a//b/c//d/e/f///g".parse();
assert!(invalid_path.is_err())
}
}