Skip to content

Instantly share code, notes, and snippets.

@lindenb
Created March 4, 2020 13:51
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save lindenb/398624f6cb0bfb0837704b605681d420 to your computer and use it in GitHub Desktop.
Save lindenb/398624f6cb0bfb0837704b605681d420 to your computer and use it in GitHub Desktop.
Nextflow workflow that finds duplicated files and creates a bash script replacing each duplicate with a symbolic link to the kept copy.
// ---- default pipeline parameters (override on the command line, e.g. --extensions "bam,bai") ----
params.directories="." // directory(ies) scanned by `find` for candidate files
params.headsize=100000 // number of leading bytes hashed per file (MD5 of the head only, not the whole file)
params.extensions="bam bai" // space/comma/pipe separated list of file extensions to consider
params.help=false // when true, print the usage message and exit
params.extrafind="" // extra arguments appended verbatim to the `find` command line
params.lines = 1000 // number of file paths per parallel md5sum chunk
params.publishDir="." // directory receiving the final fdups.bash script
/*
 * Print the pipeline usage message (parameter list and defaults) to the Nextflow log.
 * Called from the --help guard below before any process runs.
 */
def helpMessage() {
log.info"""
=========================================
Usage:
find duplicate files, generate a bash script to generate symbolic links replacing the duplicates.
Mandatory arguments:
--directories (dir) find files under dir
--extensions 'string' space/comma/pipe separated list of extensions
Other options:
--extrafind (string) extra arguments for find. eg. " -size '+10000' "
--lines (int) split the input into 'n' parallel jobs [${params.lines}]
--headsize (int) max number of bytes for calculating the MD5 [${params.headsize}]
--publishDir (dir)
Nextflow options:
-w Work directory used by Nextflow.
workflow Author: Pierre Lindenbaum @yokofakun 20200304
=========================================
"""
}
// honor --help: print usage and stop the workflow before any process executes
if( params.help ) {
helpMessage()
exit 0
}
/*
 * List all candidate files under params.directories whose name ends with one of
 * params.extensions, writing one path per line into split.list.
 * NOTE(review): paths are relative if params.directories is relative — the symlinks
 * generated at the end would then only be valid from the launch directory; confirm.
 */
process findFiles {
tag "${params.extensions}"
cache 'lenient'
executor 'local'
cpus 5
output:
file("split.list") into split_list
script:
// Build a `find -regex` pattern in find's default (emacs) regex syntax, where
// \( \) delimit a group and \| is alternation: e.g. "bam bai" -> .*\.\(bam\|bai\)$
// Empty tokens from repeated separators are dropped; literal dots inside an
// extension (e.g. "tar.gz") are escaped.
def suffixes = ".*\\.\\("+ params.extensions.split("[ ,\\|]+").findAll{T->!T.isEmpty()}.collect{T->T.replaceAll("\\.","\\\\.")}.join("\\|") +"\\)\$"
"""
find ${params.directories} -type f -regex '${suffixes}' ${params.extrafind} > split.list
"""
}
/*
 * Sort and deduplicate the file list, then split it into chunks of params.lines
 * paths each so the MD5 step can run in parallel. Emits chunks.txt: one absolute
 * chunk-file path per line.
 *
 * FIX: the original piped the list through `cut -f 2,4 | tr "\t" "\n"`, which
 * assumes tab-separated fields — but split.list (produced by `find`) holds one
 * path per line with no tabs. For normal paths the cut was a silent no-op, and
 * any path actually containing a tab would have been corrupted. The list is now
 * sorted directly. Also fixed the "chunck" typo (both uses are local to this
 * process; downstream only reads the paths listed in chunks.txt).
 */
process split {
tag "N=${params.lines}"
executor 'local'
input:
file splitin from split_list
output:
file("chunks.txt") into chunk_list
script:
"""
LC_ALL=C sort -T . -u "${splitin}" |\
split -a 9 --additional-suffix=.list --lines=${params.lines} - chunk.
find \${PWD} -type f -name "chunk.*.list" > chunks.txt
"""
}
chunk_list.splitCsv(header: false,sep:',',strip:true).map{T->T[0]}.set{chunk_items}
/* scan a set of files, extract the first bytes and calculate the MD5.
Output is
MD5(comma)path(comma)mtime(comma)size
sorted on MD5 and mtime
*/
/*
 * For each chunk of file paths: hash the first params.headsize bytes of every
 * still-existing file with md5sum, and emit CSV lines "md5,path,mtime,size"
 * sorted by md5 then mtime ascending — so within a hash group the oldest file
 * comes first, and so the merge step can combine the per-chunk lists with
 * `sort --merge`.
 */
process md5sum {
tag "${chunk}"
cache 'lenient'
input:
val chunk from chunk_items
output:
file("md5.list") into md5_list
script:
// The `if [ -f ... ]` test silently skips files deleted between listing and hashing.
// Each record is assembled from three commands: md5 + ",", then path + ",", then
// `stat` appending "mtime,size\n".
"""
set -o pipefail
cat "${chunk}" | while read F
do
if [ -f "\${F}" ]; then
head -c '${params.headsize}' "\${F}" | md5sum | cut -d ' ' -f1 | tr "\\n" ","
echo -n "\${F},"
stat -c "%Y,%s" "\${F}"
fi
done | sort -T. -t "," -k1,1 -k3,3n > md5.list
"""
}
/** merge already sorted list on MD5/date */
/*
 * Combine the per-chunk md5 lists into one file. Each input is already sorted
 * on (md5, mtime), so `sort --merge` only interleaves — it never re-sorts.
 */
process merge {
executor "local"
tag "N=${L.size()}"
input:
val L from md5_list.collect()
output:
file("merged.list") into merged_list
script:
"""
sort -T . -t "," -k1,1 -k3,3n --merge ${L.join(" ")} > merged.list
"""
}
/** create cleanup bash script */
/*
 * Generate fdups.bash from the merged (md5,path,mtime,size) list and publish it.
 * For each group of rows sharing an md5, the first row (oldest mtime) is kept;
 * every later row gets a command sequence that (1) re-checks both files exist,
 * (2) runs `cmp` on the FULL files — guarding against collisions from hashing
 * only the head — then (3) renames the duplicate aside, symlinks it to the kept
 * copy, and removes the backup. The awk END block reports the bytes reclaimed
 * (sum of column 4 over replaced duplicates).
 *
 * Escaping note: the awk program passes through two expansions — Groovy's
 * triple-quoted string, then the unquoted shell heredoc — so `\\\$` reaches awk
 * as `$`, `\\n` as `\n`, and `\\"` as `\"`. Do not reformat this block without
 * re-checking all three layers.
 *
 * NOTE(review): `ln -s` uses the stored path verbatim; if params.directories was
 * relative, the resulting symlink target is relative to the launch directory —
 * confirm before running the generated script from elsewhere.
 */
process createScript {
publishDir "${params.publishDir}" , mode: 'copy', overwrite: true
executor "local"
input:
file merged from merged_list
output:
file("fdups.bash") into bash_script
script:
"""
cat << __EOF__ > jeter.awk
BEGIN {
printf("#!/bin/bash\\nset -e\\n");
PREV_HASH="";
PREV_FILE="";
FS=","
}
{
FNAME=\\\$2;
if(\\\$1!=PREV_HASH) {
PREV_HASH = \\\$1;
PREV_FILE = FNAME;
}
else if(FNAME==PREV_FILE) {
printf("##ERROR %s %s\\n",FNAME,PREV_FILE);
}
else
{
printf("# %s\\n",\\\$1);
printf("test -f \\"%s\\" ", PREV_FILE);
printf(" && test -f \\"%s\\" ", FNAME);
printf(" && cmp \\"%s\\" \\"%s\\" ", PREV_FILE,FNAME);
printf(" && mv -v \\"%s\\" \\"%s.back\\" ", FNAME,FNAME);
printf(" && ln -s \\"%s\\" \\"%s\\" ",PREV_FILE, FNAME);
printf(" && rm -v \\"%s.back\\"\\n", FNAME);
N+=int(\\\$4);
printf("\\n");
}
}
END {
printf("echo 'Saved %d bytes.'\\n\",N);
}
__EOF__
awk -f jeter.awk "${merged}" > fdups.bash
"""
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment