Skip to content

Instantly share code, notes, and snippets.

@lindenb
Created March 4, 2020 13:51
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save lindenb/398624f6cb0bfb0837704b605681d420 to your computer and use it in GitHub Desktop.
Save lindenb/398624f6cb0bfb0837704b605681d420 to your computer and use it in GitHub Desktop.
Nextflow workflow that finds duplicated files and creates a bash script replacing each duplicate with a symbolic link to the kept copy.
// ---- default pipeline parameters (override on the command line, e.g. --extensions "bam,bai") ----
params.directories="." // directory(ies) scanned by `find` for candidate files
params.headsize=100000 // number of leading bytes hashed per file (MD5 of the head only, not the whole file)
params.extensions="bam bai" // space/comma/pipe separated list of file extensions to consider
params.help=false // when true, print the usage message and exit
params.extrafind="" // extra arguments appended verbatim to the `find` command line
params.lines = 1000 // number of file paths per parallel md5sum chunk
params.publishDir="." // directory receiving the final fdups.bash script
/*
 * Print the pipeline usage message (parameter list and defaults) to the Nextflow log.
 * Called from the --help guard below before any process runs.
 */
def helpMessage() {
log.info"""
=========================================
Usage:
find duplicate files, generate a bash script to generate symbolic links replacing the duplicates.
Mandatory arguments:
--directories (dir) find files under dir
--extensions 'string' space/comma/pipe separated list of extensions
Other options:
--extrafind (string) extra arguments for find. eg. " -size '+10000' "
--lines (int) split the input into 'n' parallel jobs [${params.lines}]
--headsize (int) max number of bytes for calculating the MD5 [${params.headsize}]
--publishDir (dir)
Nextflow options:
-w Work directory used by Nextflow.
workflow Author: Pierre Lindenbaum @yokofakun 20200304
=========================================
"""
}
// honor --help: print usage and stop the workflow before any process executes
if( params.help ) {
helpMessage()
exit 0
}
/*
 * List all candidate files under params.directories whose name ends with one of
 * params.extensions, writing one path per line into split.list.
 * NOTE(review): paths are relative if params.directories is relative — the symlinks
 * generated at the end would then only be valid from the launch directory; confirm.
 */
process findFiles {
tag "${params.extensions}"
cache 'lenient'
executor 'local'
cpus 5
output:
file("split.list") into split_list
script:
// Build a `find -regex` pattern in find's default (emacs) regex syntax, where
// \( \) delimit a group and \| is alternation: e.g. "bam bai" -> .*\.\(bam\|bai\)$
// Empty tokens from repeated separators are dropped; literal dots inside an
// extension (e.g. "tar.gz") are escaped.
def suffixes = ".*\\.\\("+ params.extensions.split("[ ,\\|]+").findAll{T->!T.isEmpty()}.collect{T->T.replaceAll("\\.","\\\\.")}.join("\\|") +"\\)\$"
"""
find ${params.directories} -type f -regex '${suffixes}' ${params.extrafind} > split.list
"""
}
/*
 * Sort and deduplicate the file list, then split it into chunks of params.lines
 * paths each so the MD5 step can run in parallel. Emits chunks.txt: one absolute
 * chunk-file path per line.
 *
 * FIX: the original piped the list through `cut -f 2,4 | tr "\t" "\n"`, which
 * assumes tab-separated fields — but split.list (produced by `find`) holds one
 * path per line with no tabs. For normal paths the cut was a silent no-op, and
 * any path actually containing a tab would have been corrupted. The list is now
 * sorted directly. Also fixed the "chunck" typo (both uses are local to this
 * process; downstream only reads the paths listed in chunks.txt).
 */
process split {
tag "N=${params.lines}"
executor 'local'
input:
file splitin from split_list
output:
file("chunks.txt") into chunk_list
script:
"""
LC_ALL=C sort -T . -u "${splitin}" |\
split -a 9 --additional-suffix=.list --lines=${params.lines} - chunk.
find \${PWD} -type f -name "chunk.*.list" > chunks.txt
"""
}
chunk_list.splitCsv(header: false,sep:',',strip:true).map{T->T[0]}.set{chunk_items}
/* scan a set of files, extract the first bytes and calculate the MD5.
Output is
MD5(comma)path(comma)mtime(comma)size
sorted on MD5 and mtime
*/
/*
 * For each chunk of file paths: hash the first params.headsize bytes of every
 * still-existing file with md5sum, and emit CSV lines "md5,path,mtime,size"
 * sorted by md5 then mtime ascending — so within a hash group the oldest file
 * comes first, and so the merge step can combine the per-chunk lists with
 * `sort --merge`.
 */
process md5sum {
tag "${chunk}"
cache 'lenient'
input:
val chunk from chunk_items
output:
file("md5.list") into md5_list
script:
// The `if [ -f ... ]` test silently skips files deleted between listing and hashing.
// Each record is assembled from three commands: md5 + ",", then path + ",", then
// `stat` appending "mtime,size\n".
"""
set -o pipefail
cat "${chunk}" | while read F
do
if [ -f "\${F}" ]; then
head -c '${params.headsize}' "\${F}" | md5sum | cut -d ' ' -f1 | tr "\\n" ","
echo -n "\${F},"
stat -c "%Y,%s" "\${F}"
fi
done | sort -T. -t "," -k1,1 -k3,3n > md5.list
"""
}
/** merge already sorted list on MD5/date */
/*
 * Combine the per-chunk md5 lists into one file. Each input is already sorted
 * on (md5, mtime), so `sort --merge` only interleaves — it never re-sorts.
 */
process merge {
executor "local"
tag "N=${L.size()}"
input:
val L from md5_list.collect()
output:
file("merged.list") into merged_list
script:
"""
sort -T . -t "," -k1,1 -k3,3n --merge ${L.join(" ")} > merged.list
"""
}
/** create cleanup bash script */
/*
 * Generate fdups.bash from the merged (md5,path,mtime,size) list and publish it.
 * For each group of rows sharing an md5, the first row (oldest mtime) is kept;
 * every later row gets a command sequence that (1) re-checks both files exist,
 * (2) runs `cmp` on the FULL files — guarding against collisions from hashing
 * only the head — then (3) renames the duplicate aside, symlinks it to the kept
 * copy, and removes the backup. The awk END block reports the bytes reclaimed
 * (sum of column 4 over replaced duplicates).
 *
 * Escaping note: the awk program passes through two expansions — Groovy's
 * triple-quoted string, then the unquoted shell heredoc — so `\\\$` reaches awk
 * as `$`, `\\n` as `\n`, and `\\"` as `\"`. Do not reformat this block without
 * re-checking all three layers.
 *
 * NOTE(review): `ln -s` uses the stored path verbatim; if params.directories was
 * relative, the resulting symlink target is relative to the launch directory —
 * confirm before running the generated script from elsewhere.
 */
process createScript {
publishDir "${params.publishDir}" , mode: 'copy', overwrite: true
executor "local"
input:
file merged from merged_list
output:
file("fdups.bash") into bash_script
script:
"""
cat << __EOF__ > jeter.awk
BEGIN {
printf("#!/bin/bash\\nset -e\\n");
PREV_HASH="";
PREV_FILE="";
FS=","
}
{
FNAME=\\\$2;
if(\\\$1!=PREV_HASH) {
PREV_HASH = \\\$1;
PREV_FILE = FNAME;
}
else if(FNAME==PREV_FILE) {
printf("##ERROR %s %s\\n",FNAME,PREV_FILE);
}
else
{
printf("# %s\\n",\\\$1);
printf("test -f \\"%s\\" ", PREV_FILE);
printf(" && test -f \\"%s\\" ", FNAME);
printf(" && cmp \\"%s\\" \\"%s\\" ", PREV_FILE,FNAME);
printf(" && mv -v \\"%s\\" \\"%s.back\\" ", FNAME,FNAME);
printf(" && ln -s \\"%s\\" \\"%s\\" ",PREV_FILE, FNAME);
printf(" && rm -v \\"%s.back\\"\\n", FNAME);
N+=int(\\\$4);
printf("\\n");
}
}
END {
printf("echo 'Saved %d bytes.'\\n\",N);
}
__EOF__
awk -f jeter.awk "${merged}" > fdups.bash
"""
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment