@kdabir
Created August 27, 2012

Calculate the space wasted by duplicate files on your hard drive

Sooner or later, duplicate files eat up space on your hard drive. If you want to compute the total space wasted by such duplicates, this utility script may come in handy. fdupes is a nice utility that finds duplicate files in the given directories; this script consumes the fdupes output and calculates the space wasted.
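With the -S (--size) flag, fdupes prints the size of each duplicate set above the matching filenames, with a blank line separating sets; this is the format the parser below relies on. An illustrative sample (paths and sizes are made up):

5242880 bytes each:
/home/me/Music/song.mp3
/home/me/Music/backup/song.mp3

1048576 bytes each:
/home/me/photos/img001.jpg
/home/me/photos/copy/img001.jpg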

These steps assume a *nix-flavored OS (OS X/Linux), but this might work on Windows+Cygwin as well

  1. Install fdupes on your machine (verify by running fdupes --version in a terminal)
  2. Install Groovy (verify by running groovy --version in a terminal)
  3. cd into the directory where you want to install dupes_size:
    • either git clone git://gist.github.com/3488741.git
    • or download and extract the zip from the gist page
    • (optional) rename the cloned/extracted directory to a friendlier name
  4. cd into the directory that you just cloned/extracted/renamed
  5. run ./dupes_size.sh "dir1" "dir2" ... "dirN" (the script may take some time depending on the size of the input directories, so have some patience); a combined transcript of these steps follows
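A minimal end-to-end transcript of the steps above. The package-manager commands are assumptions (use whatever installer your platform provides), and the directory name dupes_size is just an example:

# 1 & 2: install the tools (these assume Homebrew on OS X or apt on Debian/Ubuntu)
brew install fdupes groovy
# sudo apt-get install fdupes groovy

# 3 & 4: fetch the gist and step into it
git clone git://gist.github.com/3488741.git dupes_size
cd dupes_size

# 5: point the script at one or more directories
./dupes_size.sh "$HOME/Music" "$HOME/Downloads"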

Warning: don't try to run the command from any directory other than the one in which you installed it. The shell script expects the Groovy script to be in the directory from which the command is issued.

dupes_size.sh:

#!/bin/sh
# Run fdupes recursively (-r) with sizes (-S) over all given directories,
# save its output, and hand the file to the Groovy parser.
FDUPES_OUTPUT_FILE=fdupes_out.txt
fdupes -rS "$@" > "$FDUPES_OUTPUT_FILE"    # "$@" (not $*) keeps directory names with spaces intact
if [ -s "$FDUPES_OUTPUT_FILE" ]            # parse only if fdupes found any duplicates
then
    groovy parse_fdupes_out.groovy "$FDUPES_OUTPUT_FILE"
fi
rm -f "$FDUPES_OUTPUT_FILE"
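To lift the restriction mentioned in the warning above, one option (a sketch, not part of the original gist) is to have the script resolve its own location and reference the Groovy file from there, so it can be invoked from any directory:

# resolve the directory this script lives in
SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
groovy "$SCRIPT_DIR/parse_fdupes_out.groovy" "$FDUPES_OUTPUT_FILE"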
parse_fdupes_out.groovy:

def test_mode = false // switch test mode on/off here

def size_required = 0, size_overhead = 0, total_files = 0, repeated_files = 0
def each_file_size = 0, files_count = 0, duplicates_count = 0

// fold the currently accumulated duplicate set into the running totals
def flush = {
    if (files_count > 0) { // only compute if files have been accumulated
        size_required += each_file_size
        total_files += files_count
        duplicates_count = files_count - 1
        repeated_files += duplicates_count
        size_overhead += each_file_size * duplicates_count
        files_count = each_file_size = 0
    }
}

def process = { fdupes_output ->
    fdupes_output.eachLine { line ->
        def matcher = (line =~ /(^\d+)/) // a size line starts with a number
        if (matcher.find()) {
            each_file_size = matcher.group().toInteger() // that's the size of each file in the set
        } else if (line) { // a filename belonging to the current set
            files_count++
        } else { // a blank line closes the set
            flush()
        }
    }
    flush() // count the last set even when the file lacks a trailing blank line
}

if (test_mode) {
    println "running in test mode"
    process(testText())
} else if (args) {
    process(new File(args[0]).text)
} else {
    println "please pass fdupes output file name"
    return
}

def df = new java.text.DecimalFormat()
println """
what's stored is               : ${df.format(size_overhead + size_required)} bytes in ${total_files} files
could've been stored in just   : ${df.format(size_required)} bytes in ${total_files - repeated_files} files
how much is repeated           : ${df.format(size_overhead)} bytes in ${repeated_files} files
"""

/// testing the logic
def testText() {
    """
1000 some text
filename 1
filename 2
filename 3

2000
file1
file2
"""
}

if (test_mode) {
    assert repeated_files == 3
    assert size_overhead == 4000
}
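If you already have fdupes output saved, you can run the parser on it directly, bypassing the wrapper script (the file name here is arbitrary):

fdupes -rS ~/Music > fdupes_out.txt
groovy parse_fdupes_out.groovy fdupes_out.txt

Flipping test_mode to true and running the script with no arguments exercises the built-in asserts instead.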
@kdabir (Author) commented May 20, 2013

With dupes_size I found out:

kunal@kaydee $ ./dupes_size.sh ~/Music

what's stored is               : 2,135,181,036 bytes in 402 files
could've been stored in just   : 1,035,225,080 bytes in 194 files
how much is repeated           : 1,099,955,956 bytes in 208 files

I can see I am wasting about 1 GB of space across a library of just ~6,000 songs.

All I needed to do was run fdupes -r -d . in my ~/Music to clean it up
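For reference, the -d (--delete) flag makes fdupes interactively prompt you to choose which copy in each duplicate set to preserve, deleting the rest, so review the prompts carefully:

cd ~/Music
fdupes -r -d .    # interactive: for each duplicate set, pick the file(s) to keep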
