rdupe/src/main.rs

298 lines
8.5 KiB
Rust

extern crate clap;
extern crate term;
extern crate sha2;
extern crate walkdir;
extern crate time;
extern crate num_cpus;
extern crate crossbeam;
use time::PreciseTime;
use sha2::{Sha256, Digest};
use std::path::Path;
use std::process;
use std::fs;
use std::io::Read;
use walkdir::WalkDir;
use std::fmt;
use std::thread;
use std::sync::{Arc, Mutex};
const BUFFER_SIZE: usize = 1024;
/* Note for myself : CLAP = _C_ommand _L_ine _A_rgument _P_arser */
use clap::{Arg,ArgMatches, App};
/// Command-line options in the form the rest of the program consumes them.
/// Instances are built by `Args::new` from the parsed clap matches.
struct Args {
/// Value of the `source` command-line argument.
input: String,
/// Value of the `dest` command-line argument.
output: String,
/// Verbosity level: number of `verbose` occurrences, capped at 3.
vlevel: u8,
/// True when the `dry-run` flag is present.
dryrun: bool,
}
impl Args {
fn new(matches: &ArgMatches) -> Args {
let i = matches.value_of("source").unwrap();
let o = matches.value_of("dest").unwrap();
let vl = match matches.occurrences_of("verbose") {
0 => 0,
1 => 1,
2 => 2,
3 | _ => 3,
};
let dr = matches.is_present("dry-run");
Args {
input: i.to_string(),
output: o.to_string(),
vlevel: vl,
dryrun: dr,
}
}
fn path_exist(&self) -> bool {
let mut result = true;
if ! Path::new(&self.input).exists() {
result = false;
println!("Error, input ( {} ) is not a valid directory", self.input);
}
if ! Path::new(&self.output).exists() {
result = false;
println!("Error, output ({} ) is not a valid directory", self.output);
}
result
}
fn check_not_same(&self) -> bool {
self.input != self.output
}
fn check_not_parent(&self) -> bool {
for (i, x) in vec![&self.input, &self.output].iter().enumerate() {
let mut a = Path::new(x);
let tmp = match i {
0 => &self.output,
1 => &self.input,
_ => "None",
};
loop {
let b = a.parent();
a = match b {
Some(b) => {
if b.to_str().unwrap() == tmp {
return false;
}
b
},
None => break,
};
}
}
true
}
}
/// One regular file found while walking the input and output trees.
struct FileToProcess {
/// Digest bytes written by the `hash` method; `main` creates entries with this empty.
hash: Vec<u8>,
/// Path of the file as rendered by the directory walker; `main` opens the file through it.
name: String,
/// Placeholder: `main` currently fills it with the literal "TODO".
realpath: String,
}
impl fmt::Display for FileToProcess {
    /// Renders the entry as `[<digest in lowercase hex>] - <name> (<realpath>)`.
    ///
    /// Errors reported by the underlying writer are returned to the caller
    /// instead of aborting the program with a panic.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "[")?;
        for byte in &self.hash {
            // Two hex digits per byte, zero-padded.
            write!(f, "{:02x}", byte)?;
        }
        write!(f, "] - {} ({})", self.name, self.realpath)
    }
}
impl FileToProcess {
    /// Hashes everything `reader` yields with digest `D` and stores the
    /// resulting bytes in `self.hash`, replacing any previous content.
    ///
    /// Adapted from
    /// https://github.com/RustCrypto/hashes/blob/master/sha2/examples/sha256sum.rs
    ///
    /// On a read error the method simply returns and leaves `self.hash`
    /// untouched. Interrupted reads are retried.
    fn hash<D: Digest + Default, R: Read>(&mut self, reader: &mut R) {
        let mut sh = D::default();
        let mut buffer = [0u8; BUFFER_SIZE];
        loop {
            let n = match reader.read(&mut buffer) {
                // Only a zero-length read signals end of stream. A read that
                // fills less than the whole buffer may still be followed by
                // more data, so it must not end the loop.
                Ok(0) => break,
                Ok(n) => n,
                Err(ref e) if e.kind() == ::std::io::ErrorKind::Interrupted => continue,
                Err(_) => return,
            };
            sh.input(&buffer[..n]);
        }
        // Overwrite rather than append, so hashing the same entry twice
        // cannot leave two concatenated digests behind.
        self.hash.clear();
        self.hash.extend(sh.result());
    }
}
/// Program entry point.
///
/// Steps, in order:
/// 1. parse the command line and, at verbosity 2 or more, echo the options;
/// 2. validate the two paths (exit code 1 on any failed check);
/// 3. list the regular files below each path, one walker thread per path;
/// 4. hash every listed file with SHA-256 on scoped worker threads;
/// 5. print every entry and the elapsed time.
///
/// Detecting duplicates and creating the symlinks is not implemented yet.
fn main() {
    let files_candidate = Arc::new(Mutex::new(Vec::new()));
    let mut t = term::stdout().unwrap();
    let matches = App::new("rdupe")
        .version("0.1.0")
        .author("Beneth <bmauduit@beneth.fr>")
        .about("Symlink identical files from source to dest based on hash")
        .arg(Arg::with_name("source")
             .help("Input1 directory (will keep the real file)")
             .short("i")
             .long("input")
             .takes_value(true)
             .required(true)
        )
        .arg(Arg::with_name("dest")
             .help("destination directory (will be symlink to real file)")
             .short("o")
             .long("dest")
             .takes_value(true)
             .required(true)
        )
        .arg(Arg::with_name("dry-run")
             .short("d")
             .long("dry-run")
             .help("Dry run (Compare hash but do not symlink)")
        )
        .arg(Arg::with_name("verbose")
             .short("v")
             .long("verbose")
             .multiple(true)
             .help("Sets the level of verbosity")
        )
        .get_matches();
    t.fg(term::color::GREEN).unwrap();
    let args = Args::new(&matches);
    if args.vlevel >= 2 {
        println!("Value for input: {}", args.input);
        println!("Value for output: {}", args.output);
        println!("Verbosity Level: {}", args.vlevel);
        if args.dryrun {
            println!("dry-run enabled");
        } else {
            println!("dry-run not enabled");
        }
    }

    // Check input & output
    // 1 - Existence
    if !args.path_exist() {
        println!("Exiting: Path 1 or Path 2 does not exist");
        process::exit(1);
    }
    // 2 - Not the same path
    if !args.check_not_same() {
        println!("Exiting: input and output are the same");
        process::exit(1);
    }
    // 3 - Coherence (neither path may contain the other one)
    if !args.check_not_parent() {
        println!("Exiting: Path1 or Path2 are parent !");
        process::exit(1);
    }

    let start = PreciseTime::now();

    // Walk through both trees, one thread per tree. Every regular file found
    // is appended to the shared candidate list.
    let mut children = vec![];
    let args_source = vec![args.input, args.output];
    for s in args_source {
        let fc = files_candidate.clone();
        children.push(thread::spawn(move || {
            for entry in WalkDir::new(&s)
                .into_iter()
                .filter_map(|e| e.ok())
            {
                // symlink_metadata does not follow symlinks, so a link to a
                // file is not reported as a file. An entry whose metadata
                // cannot be read (for instance because it vanished in the
                // meantime) is skipped, like unreadable entries above.
                let metadata = match fs::symlink_metadata(entry.path()) {
                    Ok(metadata) => metadata,
                    Err(_) => continue,
                };
                if metadata.file_type().is_file() {
                    let a = FileToProcess {
                        name: entry.path().display().to_string(),
                        hash: vec![],
                        realpath: String::from("TODO"),
                    };
                    fc.lock().unwrap().push(a);
                }
            }
        }));
    }
    for child in children {
        let _ = child.join();
    }

    // Compute file hashes in parallel: the list is cut into `num_cpus`
    // chunks, `modulus` of them holding one more element than the others.
    let num_cpus = num_cpus::get();
    let files_candidate_len = files_candidate.lock().unwrap().len();
    let chunk_size = (files_candidate_len / num_cpus) + 1;
    let modulus = files_candidate_len % num_cpus;
    let small_chunk_size = chunk_size - 1;
    println!("Calculate {} file(s)", files_candidate_len);
    println!("Use {} chunk(s) of size {}", modulus, chunk_size);
    println!("Use {} chunk(s) of size {}", num_cpus - modulus, small_chunk_size);
    let mut work = files_candidate.lock().unwrap();

    // Example from :
    // https://stackoverflow.com/questions/33818141/how-do-i-pass-disjoint-slices-from-a-vector-to-different-threads
    // Scoped threads allow the compiler to prove that no thread will outlive
    // `work` (which would be bad).
    crossbeam::scope(|scope| {
        let (split_a, split_b) = work.split_at_mut(chunk_size * modulus);
        let chunk_a = split_a.chunks_mut(chunk_size);
        // With fewer files than CPUs the small chunk size is 0 and `split_b`
        // is empty. `chunks_mut` panics on a zero size even for an empty
        // slice, so clamp it to 1; an empty slice still yields no chunk.
        let chunk_b = split_b.chunks_mut(::std::cmp::max(small_chunk_size, 1));
        let mut guards = vec![];
        for (i, slice) in chunk_a.chain(chunk_b).enumerate() {
            // Spawn a thread operating on that subslice.
            let guard = scope.spawn(move || {
                for w in slice {
                    println!("[{}] Hashing : {}", i, w);
                    if let Ok(mut file) = fs::File::open(&w.name) {
                        w.hash::<Sha256, _>(&mut file);
                    } else {
                        panic!("Error opening file (name = {})!", w.name);
                    }
                }
            });
            guards.push(guard);
        }
        for guard in guards {
            let _ = guard.join();
        }
    });

    for i in work.iter() {
        println!("{}", i);
    }

    // TODO with work !
    // check for each hash duplication
    // if so --> log to file and remove from list (store to done vector)
    t.fg(term::color::CYAN).unwrap();
    let end = PreciseTime::now();
    println!("{} seconds.", start.to(end));
    t.reset().unwrap();
}