Last active
August 29, 2015 14:12
-
-
Save akostadinov/4cda59f17c450f64bbd6 to your computer and use it in GitHub Desktop.
find duplicate files (WIP)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'fileutils'

# Quickly hacked script: reports which files of a directory still need to be
# backed up, i.e. have no byte-identical file with the same base name at the
# path listed for it in an older archive directory.
#
# Both list files must contain one path per line, relative to their base
# directory (a leading "./" is accepted), for example:
#   $ (cd new_dir && find . -type f) > arch.txt
#   $ (cd old_dir && find . -type f) > arch_old.txt
#   use `find . -name ".?*" -prune -o -type f -print` to avoid hidden files
#
# The paths that still have to be archived are printed to stdout, one per
# line and without the leading "./". Assuming that output was saved as
# to_arch.txt:
#   # total size of the files to copy
#   $ cat to_arch.txt | xargs -d "\n" stat -c "%s" | awk '{size+=$1} END {print size}'
#   $ sudo rsync -avx --files-from=to_arch.txt ./ /path/to/store/backup
#
# TODO: get files fast hash (md5?) and compare based on it; avoid external tools like find

org_arch = "arch_old.txt"
to_arch = "arch.txt"
to_arch_basedir = "new_dir"
org_basedir = "old_dir"

# Turns one line of a list file into a path relative to its base directory.
# Only the line terminator and a leading "./" are removed, so spaces that
# belong to the file name and any "./" further inside the path are kept.
def relative_entry(line)
  line.chomp.sub(%r{\A\./}, "")
end

# Returns true when both files have identical content. A file that cannot be
# read is reported on stderr and treated as different, so the candidate stays
# in the list of files to archive instead of aborting the whole run.
def same_content?(org_path, new_path)
  FileUtils.cmp(org_path, new_path)
rescue SystemCallError => e
  warn "Cannot compare #{org_path} with #{new_path}: #{e.message}"
  false
end

# populate Hash (base name => relative paths) with files under question to archive
arch = {}
File.foreach(to_arch) do |line|
  entry = relative_entry(line)
  next if entry.empty?

  (arch[File.basename(entry)] ||= []) << entry
end

# remove from Hash what has already been archived
File.foreach(org_arch) do |line|
  entry = relative_entry(line)
  next if entry.empty?

  candidates = arch[File.basename(entry)]
  next unless candidates

  org_path = File.join(org_basedir, entry)
  candidates.delete_if do |candidate|
    same_content?(org_path, File.join(to_arch_basedir, candidate))
  end
end

# print files to be archived
arch.each_value do |paths|
  paths.each { |path| puts path }
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'fileutils'
require 'yaml'
require 'find'

# Select the original archive dir and the dir that is to be archived below.
# The script generates
#   * a list of the files in the original archive (YAML, base name => paths),
#     written to the current directory and reused on later runs when it
#     already exists (delete it to force a rescan), and
#   * a list of the files to be archived that are not present, with the same
#     base name and identical content, in the original archive.
#
# how to perform the actual archival:
#   cd <to_arch_basedir>
#   sudo rsync -avx --files-from=_path_containing_files_under_question_to_be_archived-to_arch.lst ./ /path/to/store/backup

to_arch_basedir = "/path/containing/files/under/question/to/be/archived"
org_basedir = "/path/containing/previously/archived/files"

# Walks basedir and returns a Hash mapping each regular file's base name to
# the list of full paths (as yielded by Find) that carry that name.
def index_by_basename(basedir)
  index = {}
  Find.find(basedir) do |file|
    next unless File.file?(file)

    (index[File.basename(file)] ||= []) << file
  end
  index
end

# Returns true when path and archived have identical content. Any error
# raised while comparing is reported on the console, naming the side it
# seems to come from, and the pair is treated as different so that the file
# is still scheduled for archival.
def same_content?(path, archived)
  FileUtils.cmp(path, archived)
rescue => e
  # seems like an io error reading some file
  # lets figure out which one and report in console
  if e.message.include?(path)
    puts "Error reading from files to archive: #{path}"
  elsif e.message.include?(archived)
    puts "Error reading from original archive: #{archived}"
  else
    puts(e.message)
  end
  false
end

# Returns path relative to basedir. Only a leading "<basedir>/" is removed;
# File.join(basedir, "") yields basedir with a trailing separator whether or
# not it already had one, matching the way Find builds the paths it yields.
def relative_to(path, basedir)
  prefix = File.join(basedir, "")
  path.start_with?(prefix) ? path[prefix.length..-1] : path
end

# Stop early with a clear message instead of failing later inside Find.
missing = []
missing << "Original archive directory does not exist: #{org_basedir}" unless File.directory?(org_basedir)
missing << "Directory to archive does not exist: #{to_arch_basedir}" unless File.directory?(to_arch_basedir)
abort missing.join("\n") unless missing.empty?

org_arch_lst = File.absolute_path(org_basedir).split(/[\/\\ ]/).join("_") + ".lst"
to_arch = File.absolute_path(to_arch_basedir).split(/[\/\\ ]/).join("_") + "-to_arch.lst"

# populate Hash with files from existing archive, preferring the cached list
arch = File.file?(org_arch_lst) ? YAML.load_file(org_arch_lst) : nil
unless arch.is_a?(Hash)
  # no usable cache (missing, empty or not a mapping): scan the archive
  arch = index_by_basename(org_basedir)
  # save original archive files for later use
  File.open(org_arch_lst, 'w') { |f| YAML.dump(arch, f) }
end

to_arch_size = 0

# find all files not present in original archive
File.open(to_arch, "w") do |outfile|
  Find.find(to_arch_basedir) do |path|
    # Find also yields directories (starting with the base dir itself);
    # only regular files are candidates for archival
    next unless File.file?(path)

    candidates = arch[File.basename(path)] || []
    next if candidates.any? { |archived| same_content?(path, archived) }

    to_arch_size += File.size(path)
    outfile.write("#{relative_to(path, to_arch_basedir)}\n")
  end
end

puts "Size to archive: #{to_arch_size}"
puts "file with list of paths to archive: #{to_arch}"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment