Created
August 7, 2013 16:28
-
-
Save ewels/6175673 to your computer and use it in GitHub Desktop.
In bioinformatics, raw ASCII text files can get massive. This script sniffs out large uncompressed text files and prints their paths to STDOUT, so the output can be redirected to a file or piped to a compression command.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl
use warnings; | |
use strict; | |
use Cwd; | |
use File::Find; | |
####
# FIND UNCOMPRESSED FILES
# Prints the full path of every file larger than 50 MB that the `file`
# command reports as ASCII text (i.e. uncompressed).
# Takes an optional directory argument; defaults to the current directory.
# One file path per line, so the output can be piped to other tools, e.g.:
# perl find_uncompressed_files.pl | xargs gzip
# perl find_uncompressed_files.pl | grep sra
####
# Directory to scan: the first command-line argument, or the current
# working directory when no argument was supplied.
my $dir = defined $ARGV[0] ? $ARGV[0] : getcwd();

# Walk everything under $dir, handing each entry to the reporting callback.
find({ wanted => \&print_large_uncompressed }, $dir);
# File::Find "wanted" callback: prints the full path of the current entry
# when it is a regular file larger than 50 MB that file(1) describes as
# ASCII text.
#
# Reads $_ (the current entry, as set by File::Find) and $File::Find::name
# (its full path). Takes no arguments and returns nothing; its only output
# is one path per line on STDOUT.
sub print_large_uncompressed {
    my $min_bytes = 52428800;    # 50 MB (50 * 1024 * 1024)

    # Only regular files can qualify. The -f test also keeps -s from
    # returning undef (and warning) on broken symlinks. "_" reuses the stat
    # result from -f instead of hitting the filesystem a second time.
    return unless -f $_ && (-s _) > $min_bytes;

    # Run file(1) directly, without a shell, so file names containing
    # spaces, quotes or shell metacharacters cannot break or inject into
    # the command. -b leaves the file name out of the output so a name
    # containing "ASCII text" cannot cause a false match, and "--" stops a
    # name with a leading "-" from being read as an option.
    open(my $file_cmd, '-|', 'file', '-b', '--', $_) or do {
        warn "Cannot run file(1) on $File::Find::name: $!\n";
        return;
    };
    my $description = do { local $/; <$file_cmd> };    # slurp all output
    close $file_cmd;

    if (defined $description && index($description, 'ASCII text') != -1) {
        print $File::Find::name . "\n";
    }
    return;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment