
296 lines
8.4 KiB

extern crate clap;
extern crate term;
extern crate sha2;
extern crate walkdir;
extern crate time;
extern crate num_cpus;
extern crate crossbeam;
use time::PreciseTime;
use sha2::{Sha256, Digest};
use std::path::Path;
use std::process;
use std::fs;
use std::io::Read;
use walkdir::WalkDir;
use std::fmt;
use std::thread;
use std::sync::{Arc, Mutex};
// Size, in bytes, of the stack buffer that `FileToProcess::hash` fills on each
// call to `Read::read` while hashing a file.
const BUFFER_SIZE: usize = 1024;
/* Note for myself : CLAP = _C_ommand _L_ine _A_rgument _P_arser */
use clap::{Arg,ArgMatches, App};
/// Command-line options after parsing.
///
/// Instances are built by `Args::new` from the clap matches; the fields are
/// plain owned data so the struct does not borrow from the matches.
#[derive(Debug)]
struct Args {
    /// Value of the `source` argument, exactly as given on the command line.
    input: String,
    /// Value of the `dest` argument, exactly as given on the command line.
    output: String,
    /// Verbosity level: number of `verbose` occurrences, capped at 3.
    vlevel: u8,
    /// `true` when the `dry-run` flag was present.
    dryrun: bool,
}
// Construction and validation helpers for the parsed command-line options.
// NOTE(review): in this copy of the file the closing braces of the blocks
// below are not present, and the trailing expressions that would produce the
// return values of `path_exist` and `check_not_parent` cannot be seen —
// confirm against the original source before relying on these notes.
impl Args {
/// Builds an `Args` from the clap matches.
///
/// Reads the `source` and `dest` values, the number of `verbose`
/// occurrences and the presence of `dry-run`.
///
/// # Panics
/// Panics if `source` or `dest` has no value (`value_of` returned `None`).
fn new(matches: &ArgMatches) -> Args {
let i = matches.value_of("source").unwrap();
let o = matches.value_of("dest").unwrap();
// Cap the verbosity: 0, 1 and 2 map to themselves, anything else becomes 3.
let vl = match matches.occurrences_of("verbose") {
0 => 0,
1 => 1,
2 => 2,
3 | _ => 3,
let dr = matches.is_present("dry-run");
// Store owned copies so the result does not borrow from `matches`.
Args {
input: i.to_string(),
output: o.to_string(),
vlevel: vl,
dryrun: dr,
/// Checks that both the input and the output path exist.
///
/// Prints one error line per missing path and sets `result` to `false`.
/// NOTE(review): presumably `result` is what gets returned — the return
/// expression is not visible here. Only `Path::exists` is tested, so a
/// regular file passes even though the messages say "directory" — confirm
/// whether an `is_dir` check was intended.
fn path_exist(&self) -> bool {
let mut result = true;
if ! Path::new(&self.input).exists() {
result = false;
println!("Error, input ( {} ) is not a valid directory", self.input);
if ! Path::new(&self.output).exists() {
result = false;
println!("Error, output ({} ) is not a valid directory", self.output);
/// Returns `true` when the input and output strings differ.
///
/// The comparison is on the raw strings, not on resolved paths.
fn check_not_same(&self) -> bool {
self.input != self.output
/// Checks that neither path is located underneath the other.
///
/// For each of the two paths, walks up its chain of parents with
/// `Path::parent` and returns `false` as soon as a parent, compared as a
/// string, equals the other path.
/// NOTE(review): presumably returns `true` when no match is found — that
/// expression is not visible here. The comparison is textual, so spellings
/// such as a trailing slash or relative vs. absolute paths may defeat it —
/// verify. `to_str().unwrap()` panics on a non-UTF-8 path component.
fn check_not_parent(&self) -> bool {
for (i, x) in vec![&self.input, &self.output].iter().enumerate() {
let mut a = Path::new(x);
// `tmp` is the *other* path: output while walking input, and vice versa.
// The `_` arm cannot be reached with a two-element vector.
let tmp = match i {
0 => &self.output,
1 => &self.input,
_ => "None",
// Climb towards the root until there is no parent left.
loop {
let b = a.parent();
a = match b {
Some(b) => {
if b.to_str().unwrap() == tmp {
return false;
None => break,
/// A regular file found while walking the input/output directories.
#[derive(Debug)]
struct FileToProcess {
    /// Digest bytes of the file content; created empty and filled in by hashing.
    hash: Vec<u8>,
    /// Path used to open the file when it is hashed.
    name: String,
    /// Second path shown in parentheses by `Display`; `main` currently fills it
    /// with a placeholder.
    realpath: String,
}

/// Renders as `[<hex digest>] - <name> (<realpath>)`.
impl fmt::Display for FileToProcess {
    /// Writes the digest as lowercase two-digit hex bytes between brackets,
    /// followed by the name and the real path.
    ///
    /// # Errors
    /// Any error reported by the underlying formatter is returned to the
    /// caller instead of panicking.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "[")?;
        for byte in &self.hash {
            write!(f, "{:02x}", byte)?;
        }
        write!(f, "] - {} ({})", self.name, self.realpath)
    }
}
// Hashing support for a discovered file.
// NOTE(review): in this copy of the file the closing braces are absent and
// the statements that feed each chunk into the digest and that store the
// resulting bytes are not visible — confirm against the original source.
impl FileToProcess {
/// Adapted from https://github.com/RustCrypto/hashes/blob/master/sha2/examples/sha256sum.rs
///
/// Creates a digest with `D::default()`, reads `reader` in chunks of at most
/// `BUFFER_SIZE` bytes, then iterates over the bytes of `sh.result()`
/// (presumably appending them to `self.hash` — TODO confirm).
///
/// On a read error the method returns early without reporting it.
fn hash<D: Digest + Default, R: Read>(&mut self, reader: &mut R) {
let mut sh = D::default();
let mut buffer = [0u8; BUFFER_SIZE];
loop {
// `n` is the number of bytes placed in `buffer` by this read.
let n = match reader.read(&mut buffer) {
Ok(n) => n,
Err(_) => return,
// NOTE(review): a read shorter than the buffer is treated like end of
// input; `Read::read` may legitimately return fewer bytes before EOF,
// so only `n == 0` is a reliable end marker — verify.
if n == 0 || n < BUFFER_SIZE {
// Walk the bytes of the finished digest.
for i in sh.result() {
// Program entry point: parses the command line, validates the two paths,
// walks both directories for regular files, hashes the files in parallel and
// prints each file followed by the elapsed time.
// NOTE(review): this copy of `main` is missing lines (closing braces, the
// argument declarations of the clap builder, the statements that follow the
// failed checks, the pushes into the vectors, ...). The comments below only
// describe what is visible; confirm against the original source.
fn main() {
// Mutex-protected vector of files, shared with the walker threads via `Arc`.
let files_candidate = Arc::new(Mutex::new(Vec::new()));
// Terminal handle for stdout; panics if none is available. Not used in the
// lines visible here.
let mut t = term::stdout().unwrap();
// Command-line definition. Only the help texts are visible; the argument
// declarations they belong to and the call producing `matches` are not.
let matches = App::new("rdupe")
.author("Beneth <bmauduit@beneth.fr>")
.about("Symlink identical files from source to dest based on hash")
.help("Input1 directory (will keep the real file)")
.help("destination directory (will be symlink to real file)")
.help("Dry run (Compare hash but do not symlink)")
.help("Sets the level of verbosity")
let args = Args::new(&matches);
// Echo the parsed options when verbosity is 2 or more.
if args.vlevel >= 2 {
println!("Value for input: {}", args.input);
println!("Value for output: {}", args.output);
println!("Verbosity Level: {}", args.vlevel);
// Dry-run is always announced when enabled; its absence only at verbosity >= 2.
if args.dryrun {
println!("dry-run enabled");
} else if args.vlevel >= 2 {
println!("dry-run not enabled");
// Check input & output
// NOTE(review): each failed check prints a message; presumably the program
// then exits (`std::process` is imported) — not visible here.
// 1 - Existence
if ! args.path_exist() {
println!("Exiting: Path 1 or Path 2 does not exist");
// 2 - Not the same path
if ! args.check_not_same() {
println!("Exiting: input and output are the same");
// 3 - Coherence (Path1|2 does not contain path1|2)
if ! args.check_not_parent() {
println!("Exiting: Path1 or Path2 are parent !");
// Start of the timed section (walk + hashing + printing).
let start = PreciseTime::now();
// Walk through path 1 & 2 with 2 threads
let mut children = vec![];
// Moves `input` and `output` out of `args`; one thread is spawned per entry.
let args_source = vec![args.input, args.output];
for s in args_source {
// Each thread gets its own handle to the shared vector.
let fc = files_candidate.clone();
children.push(thread::spawn(move || {
// Directory entries that could not be read are skipped by `e.ok()`.
for entry in WalkDir::new(&s)
.filter_map(|e| e.ok())
// symlink_metadata does not follow symlink :-]
let metadata = fs::symlink_metadata(entry.path()).unwrap();
let ft = metadata.file_type();
// Only regular files are kept; since symlinks are not followed, a symlink
// is not reported as a file here.
if ft.is_file() {
// New candidate with an empty hash and a placeholder real path.
let mut a = FileToProcess {
name: format!("{}",
hash: vec![],
realpath: String::from("TODO"),
// Wait for both walker threads; the join result is ignored.
for child in children {
let _ = child.join();
// compute file hash in parallel
let num_cpus = num_cpus::get();
let files_candidate_len = files_candidate.lock().unwrap().len();
// The list is cut into `modulus` chunks of `chunk_size` elements followed by
// chunks of `chunk_size - 1` elements (see the split below).
// NOTE(review): when there are fewer files than CPUs, `chunk_size - 1` is 0
// and `chunks_mut(0)` panics — verify and guard.
let chunk_size = (files_candidate_len / num_cpus) + 1;
let modulus = files_candidate_len % num_cpus;
println!("Calculate {} file(s)", files_candidate_len);
println!("Use {} chunk(s) of size {}", modulus, chunk_size);
println!("Use {} chunk(s) of size {}", num_cpus - modulus, chunk_size - 1);
let mut guards = vec![];
let fc = files_candidate.clone();
// Lock the vector once; `work` is borrowed mutably by the hashing threads.
let mut work = fc.lock().unwrap();
// Example from :
// https://stackoverflow.com/questions/33818141/how-do-i-pass-disjoint-slices-from-a-vector-to-different-threads
// Scoped threads allow the compiler to prove that no threads will outlive
// the borrowed data (which would be bad).
crossbeam::scope(|scope| {
// First part holds the larger chunks, second part the smaller ones.
let (split_a, split_b) = work.split_at_mut(chunk_size * modulus);
let chunk_a = split_a.chunks_mut(chunk_size);
let chunk_b = split_b.chunks_mut(chunk_size - 1);
// `i` is the chunk index, used only to tag the progress output.
for (i, slice) in chunk_a.chain(chunk_b).enumerate() {
// Spawn a thread operating on that subslice.
let guard = scope.spawn(move || {
for w in slice {
println!("[{}] Hashing : {}", i, w);
// Hash the file content with SHA-256; a file that cannot be opened
// aborts the thread with a panic.
if let Ok(mut file) = fs::File::open(&w.name) {
w.hash::<Sha256, _>(&mut file);
} else {
panic!("Error opening file (name = {})!", w.name);
// NOTE(review): presumably joins the spawned threads — the loop body and
// the push into `guards` are not visible here.
for guard in guards {
// Print every file using its `Display` implementation.
for i in work.iter() {
println!("{}", i);
// Report the time elapsed since `start`.
let end = PreciseTime::now();
println!("{} seconds.", start.to(end));