Skip to content

Instantly share code, notes, and snippets.

@akostadinov
Last active August 29, 2015 14:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save akostadinov/4cda59f17c450f64bbd6 to your computer and use it in GitHub Desktop.
find duplicate files (WIP)
require 'fileutils'
# Quickly hacked script, not very nice and reliable at the moment.
# It finds which files from one archive need to be backed up
# (i.e. are not already present in some other archive dir).
#
# Prepare the input lists with e.g.:
#   $ find new_dir -type f > arch.txt
#   $ find old_dir -type f > arch_old.txt
#   `find -name ".?*" -prune -o -type f` to avoid hidden files
# To see the total size of files to copy:
#   cat to_arch.txt | xargs -d "\n" stat -c "%s" | awk '{size+=$1} END {print size}'
# To perform the actual backup:
#   sudo rsync -avx --files-from=to_arch.txt ./ /path/to/store/backup
#
# TODO: get files fast hash (md5?) and compare based on it; avoid external tools like find
org_arch = "arch_old.txt"    # list of files already archived
to_arch = "arch.txt"         # list of candidate files to archive
to_arch_basedir = "new_dir"  # dir the candidate list paths are relative to
org_basedir = "old_dir"      # dir the archived list paths are relative to
# Populate Hash with files under question to archive: basename => [paths].
arch = {}
File.open(to_arch, "r") do |infile|
  while (line = infile.gets)
    file = File.basename(line).strip
    arch[file] ||= []
    arch[file] << line.strip
  end
end
# Remove from the Hash whatever has already been archived: a candidate is
# considered archived when a file with the same basename AND identical
# content exists in the original archive.
File.open(org_arch, "r") do |infile|
  while (path = infile.gets)
    match = arch[File.basename(path).strip]
    next unless match
    # BUG FIX: the original chained `path.strip!.slice!("./")`; `strip!`
    # returns nil when nothing was stripped, raising NoMethodError. Strip
    # and drop a *leading* "./" in separate, non-chained steps instead.
    path = path.strip.sub(%r{\A\./}, "")
    org_path = File.join(org_basedir, path)
    match.delete_if { |file|
      # Anchored removal of a leading "./" (slice!("./") would have removed
      # the first occurrence anywhere in the path).
      file.sub!(%r{\A\./}, "")
      file_path = File.join(to_arch_basedir, file)
      FileUtils.cmp(org_path, file_path)
    }
  end
end
# Print files still to be archived.
arch.each { |name, paths|
  paths.each { |path|
    puts path
  }
}
require 'fileutils'
require 'yaml'
require 'find'
# Select the original archive dir and the dir that is to be archived.
# The script generates a cached list of files in the original archive,
# and a list of files to be archived that are not present (by basename
# and identical content) in the original archive.
#
# How to perform the actual archival:
#   cd <to_arch_basedir>
#   sudo rsync -avx --files-from=_path_containing_files_under_question_to_be_archived-to_arch.lst ./ /path/to/store/backup
#
# BUG FIX: the two placeholder values below were swapped relative to their
# usage — org_basedir is the already-archived content that gets indexed,
# to_arch_basedir is the content under question to be archived (this is
# also what the rsync --files-from filename above implies).
org_basedir = "/path/containing/previously/archived/files"
to_arch_basedir = "/path/containing/files/under/question/to/be/archived"
# BUG FIX: the original only warned and then crashed later inside
# Find.find; abort up-front when a directory is missing.
unless File.directory?(org_basedir)
  puts "Original archive directory does not exist: #{org_basedir}"
  exit 1
end
unless File.directory?(to_arch_basedir)
  puts "Directory to archive does not exist: #{to_arch_basedir}"
  exit 1
end
# Derive cache/output filenames from the absolute dir paths.
org_arch_lst = File.absolute_path(org_basedir).split(/[\/\\ ]/).join("_") + ".lst"
to_arch = File.absolute_path(to_arch_basedir).split(/[\/\\ ]/).join("_") + "-to_arch.lst"
# Populate Hash (basename => [full paths]) with files from the existing
# archive, using the YAML cache when one exists.
if File.file?(org_arch_lst)
  arch = YAML.load_file(org_arch_lst)
else
  arch = {}
  Find.find(org_basedir) { |file|
    next unless File.file?(file)
    bname = File.basename(file)
    arch[bname] ||= []
    arch[bname] << file
  }
  # Save original archive file list for later use.
  File.open(org_arch_lst, 'w') { |f| YAML.dump(arch, f) }
end
to_arch_size = 0
# Find all files not present in the original archive.
File.open(to_arch, "w") do |outfile|
  Find.find(to_arch_basedir) do |path|
    # BUG FIX: the original also processed directories here, writing them
    # into the output list and adding their sizes to the total.
    next unless File.file?(path)
    match = arch[File.basename(path)]
    path_archived = match && match.find { |f|
      begin
        FileUtils.cmp(path, f)
      rescue => e
        # Seems like an I/O error reading some file;
        # figure out which one and report in console.
        if e.message.include?(path)
          puts "Error reading from files to archive: #{path}"
        elsif e.message.include?(f)
          puts "Error reading from original archive: #{f}"
        else
          puts(e.message)
        end
        false # treat an unreadable pair as "no match" and keep looking
      end
    }
    next if path_archived
    to_arch_size += File.size(path)
    # Make the path relative to the scanned dir for rsync --files-from.
    # Anchored prefix removal (slice!(to_arch_basedir) would have removed
    # the first occurrence anywhere in the string).
    rel = path.sub(/\A#{Regexp.escape(to_arch_basedir)}#{Regexp.escape(File::SEPARATOR)}?/, "")
    outfile.write(rel)
    outfile.write("\n")
  end
end
puts "Size to archive: #{to_arch_size}"
puts "file with list of paths to archive: #{to_arch}"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment