Skip to content

Instantly share code, notes, and snippets.

@akostadinov
Last active August 29, 2015 14:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save akostadinov/4cda59f17c450f64bbd6 to your computer and use it in GitHub Desktop.
find duplicate files (WIP)
require 'fileutils'
# Quickly hacked script, not very nice and reliable at the moment.
# It finds which files from one archive need to be backed up
# (i.e. are not already present in some other archive dir).
#
# Prepare the input lists with e.g.:
#   $ find new_dir -type f > arch.txt
#   $ find old_dir -type f > arch_old.txt
#   `find -name ".?*" -prune -o -type f` to avoid hidden files
# To see the total size of files to copy:
#   cat to_arch.txt | xargs -d "\n" stat -c "%s" | awk '{size+=$1} END {print size}'
# To perform the actual backup:
#   sudo rsync -avx --files-from=to_arch.txt ./ /path/to/store/backup
#
# TODO: get files fast hash (md5?) and compare based on it; avoid external tools like find
org_arch = "arch_old.txt"    # list of files already archived
to_arch = "arch.txt"         # list of candidate files to archive
to_arch_basedir = "new_dir"  # dir the candidate list paths are relative to
org_basedir = "old_dir"      # dir the archived list paths are relative to
# Populate Hash with files under question to archive: basename => [paths].
arch = {}
File.open(to_arch, "r") do |infile|
  while (line = infile.gets)
    file = File.basename(line).strip
    arch[file] ||= []
    arch[file] << line.strip
  end
end
# Remove from the Hash whatever has already been archived: a candidate is
# considered archived when a file with the same basename AND identical
# content exists in the original archive.
File.open(org_arch, "r") do |infile|
  while (path = infile.gets)
    match = arch[File.basename(path).strip]
    next unless match
    # BUG FIX: the original chained `path.strip!.slice!("./")`; `strip!`
    # returns nil when nothing was stripped, raising NoMethodError. Strip
    # and drop a *leading* "./" in separate, non-chained steps instead.
    path = path.strip.sub(%r{\A\./}, "")
    org_path = File.join(org_basedir, path)
    match.delete_if { |file|
      # Anchored removal of a leading "./" (slice!("./") would have removed
      # the first occurrence anywhere in the path).
      file.sub!(%r{\A\./}, "")
      file_path = File.join(to_arch_basedir, file)
      FileUtils.cmp(org_path, file_path)
    }
  end
end
# Print files still to be archived.
arch.each { |name, paths|
  paths.each { |path|
    puts path
  }
}
require 'fileutils'
require 'yaml'
require 'find'
# Select the original archive dir and the dir that is to be archived.
# The script generates a cached list of files in the original archive,
# and a list of files to be archived that are not present (by basename
# and identical content) in the original archive.
#
# How to perform the actual archival:
#   cd <to_arch_basedir>
#   sudo rsync -avx --files-from=_path_containing_files_under_question_to_be_archived-to_arch.lst ./ /path/to/store/backup
#
# BUG FIX: the two placeholder values below were swapped relative to their
# usage — org_basedir is the already-archived content that gets indexed,
# to_arch_basedir is the content under question to be archived (this is
# also what the rsync --files-from filename above implies).
org_basedir = "/path/containing/previously/archived/files"
to_arch_basedir = "/path/containing/files/under/question/to/be/archived"
# BUG FIX: the original only warned and then crashed later inside
# Find.find; abort up-front when a directory is missing.
unless File.directory?(org_basedir)
  puts "Original archive directory does not exist: #{org_basedir}"
  exit 1
end
unless File.directory?(to_arch_basedir)
  puts "Directory to archive does not exist: #{to_arch_basedir}"
  exit 1
end
# Derive cache/output filenames from the absolute dir paths.
org_arch_lst = File.absolute_path(org_basedir).split(/[\/\\ ]/).join("_") + ".lst"
to_arch = File.absolute_path(to_arch_basedir).split(/[\/\\ ]/).join("_") + "-to_arch.lst"
# Populate Hash (basename => [full paths]) with files from the existing
# archive, using the YAML cache when one exists.
if File.file?(org_arch_lst)
  arch = YAML.load_file(org_arch_lst)
else
  arch = {}
  Find.find(org_basedir) { |file|
    next unless File.file?(file)
    bname = File.basename(file)
    arch[bname] ||= []
    arch[bname] << file
  }
  # Save original archive file list for later use.
  File.open(org_arch_lst, 'w') { |f| YAML.dump(arch, f) }
end
to_arch_size = 0
# Find all files not present in the original archive.
File.open(to_arch, "w") do |outfile|
  Find.find(to_arch_basedir) do |path|
    # BUG FIX: the original also processed directories here, writing them
    # into the output list and adding their sizes to the total.
    next unless File.file?(path)
    match = arch[File.basename(path)]
    path_archived = match && match.find { |f|
      begin
        FileUtils.cmp(path, f)
      rescue => e
        # Seems like an I/O error reading some file;
        # figure out which one and report in console.
        if e.message.include?(path)
          puts "Error reading from files to archive: #{path}"
        elsif e.message.include?(f)
          puts "Error reading from original archive: #{f}"
        else
          puts(e.message)
        end
        false # treat an unreadable pair as "no match" and keep looking
      end
    }
    next if path_archived
    to_arch_size += File.size(path)
    # Make the path relative to the scanned dir for rsync --files-from.
    # Anchored prefix removal (slice!(to_arch_basedir) would have removed
    # the first occurrence anywhere in the string).
    rel = path.sub(/\A#{Regexp.escape(to_arch_basedir)}#{Regexp.escape(File::SEPARATOR)}?/, "")
    outfile.write(rel)
    outfile.write("\n")
  end
end
puts "Size to archive: #{to_arch_size}"
puts "file with list of paths to archive: #{to_arch}"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment