josephhughes/parse_cdhit.pl

## parse_cdhit.pl
# use this to get the number of reads in each cluster
use strict;
use Getopt::Long;
use Bio::SeqIO;

my ($clstr,$result,$long,%clusters,$infile);
&GetOptions(
      'clstr:s'  =>\$clstr, #a cd-hit generated cluster file
	    'out:s'     => \$result, # a text file with the numbers of reads in each cluster
           );

print "Input cluster file $clstr, fatsa file $infile, output $result and fasta output $long\n";
open(CLUSTER,"<$clstr")||die "Can't open $clstr\n";

my $clusterid;
my $longest;
my %cluster;
while(<CLUSTER>){
  if ($_=~/^>(.+)$/){
    $clusterid=$1;
    #$longest="";
    #print "$clusterid\n";
  }elsif ($_=~/\d+.+\>(.+)\.\.\..+\*$/){
    my $id=$1;
    #print "$id\n";
    $longest=$id;
    $clusters{$clusterid}{$id}=$longest;
    #print "$clusterid $longest\n";
  }
  elsif ($_=~/\d+.+\>(.+)\.\.\..+$/){
    my $id=$1;
    #print "$id\n";
    $clusters{$clusterid}{$id}=$longest;
    #print "clusterid\t$id Shorter than $longest\n";
  }
}
open(OUT,">$result")||die "Can't open $result\n";
print OUT "ClusterID\tNbSeqs\n";
foreach my $clid (keys %clusters){
  my $nbseqs=keys %{$clusters{$clid}};
  print OUT "$clid\t$nbseqs\n";
}
	# use this to get the number of reads in each cluster
	use strict;
	use Getopt::Long;
	use Bio::SeqIO;

	my ($clstr,$result,$long,%clusters,$infile);
	&GetOptions(
	'clstr:s' =>\$clstr, #a cd-hit generated cluster file
	'out:s' => \$result, # a text file with the numbers of reads in each cluster
	);

	print "Input cluster file $clstr, fatsa file $infile, output $result and fasta output $long\n";
	open(CLUSTER,"<$clstr")\|\|die "Can't open $clstr\n";

	my $clusterid;
	my $longest;
	my %cluster;
	while(<CLUSTER>){
	if ($_=~/^>(.+)$/){
	$clusterid=$1;
	#$longest="";
	#print "$clusterid\n";
	}elsif ($_=~/\d+.+\>(.+)\.\.\..+\*$/){
	my $id=$1;
	#print "$id\n";
	$longest=$id;
	$clusters{$clusterid}{$id}=$longest;
	#print "$clusterid $longest\n";
	}
	elsif ($_=~/\d+.+\>(.+)\.\.\..+$/){
	my $id=$1;
	#print "$id\n";
	$clusters{$clusterid}{$id}=$longest;
	#print "clusterid\t$id Shorter than $longest\n";
	}
	}
	open(OUT,">$result")\|\|die "Can't open $result\n";
	print OUT "ClusterID\tNbSeqs\n";
	foreach my $clid (keys %clusters){
	my $nbseqs=keys %{$clusters{$clid}};
	print OUT "$clid\t$nbseqs\n";
	}