Skip to content

Instantly share code, notes, and snippets.

@Buttonwood
Last active February 21, 2022 06:45
Show Gist options
  • Save Buttonwood/96f9a9ef8159ca111a69 to your computer and use it in GitHub Desktop.
Save Buttonwood/96f9a9ef8159ca111a69 to your computer and use it in GitHub Desktop.
Filter genes in cog database which can't correspond to any COG ID.
#=============================================================================
# FileName: clean_cog_db.pl
# Desc: Filter genes in cog database which can't correspond to any COG ID.
# Author: tanhao
# Email: tanhao2013@gmail.com
# HomePage: http://buttonwood.github.io
# Version: 0.0.1
# LastChange: 2014-05-09 11:19:35
# History:
#=============================================================================
use strict;
use warnings;
use Getopt::Long;
use Data::Dumper;
my($k_c,$myva,$blast,$verbose,$help);
my %cog;
GetOptions(
"keycol:i" => \$k_c,
"myva:s" => \$myva,
"blast:s" => \$blast,
"verbose" => \$verbose,
"help:s" => \$help,
);
if (@ARGV != 1 || $help) {
die("Examples:\n\tperl $0 -keycol 5 -blast *.blast.tab whog >*.blast.tab.info\n
\tperl $0 -myva myva whog > clean.cog.fa\n");
}
my $whog = shift;
&read_whog($whog,\%cog) if ($whog);
#print Dumper(\%cog);
&filter($myva,\%cog) if ($myva);
&parse_tab($blast, $k_c, \%cog) if ($blast and $k_c);
sub read_whog{
my($whog_file,$cog_ref)=@_;
open IN,$whog_file or die $!;
#=pod
my $title = '';
while (<IN>) {
chomp;
if(/^\[/){
my @t = split("\] ", $_, 2);
#print $t[1],"\n";
$title = $t[1];
}elsif(/^ \w+:/){
s/^\s+//;
my @b = split /\s+/;
shift @b;
foreach my $y (@b) {
$cog_ref->{$y} = $title if ($title);
}
}else{
next;
}
}
=pod
$/="^\["; <>; $/="\n";
while (<>) {
print $_;
my $title = $_;
print $_;
my $seq_name = $1 if($title =~ /^(\S+)/);
print $seq_name, "***\n";
$/="^\[";
my $seq=<>;
chomp $seq;
my @t = split("\n",$seq);
foreach my $x (@t) {
if ($x =~ /^ \w+:/) {
s/^\s+//; chomp;
my @b = split /\s+/;
shift @b;
foreach my $y (@b) {
$cog_ref->{$y} = $seq_name;
}
}
}
$/="\n";
}
=cut
close IN;
}
sub filter {
my ($myva,$cog_ref)=@_;
open IN,$myva or die $!;
while(<IN>){
chomp;
(my $title=$_) =~ s/^>//;
$/=">";
chomp(my $seq=<IN>);
$/="\n";
if(exists $cog_ref->{$title}){
my $info = $cog_ref->{$title};
print ">$title\t$info\n$seq";
}
}
close IN;
}
sub parse_tab {
my ($tab, $col,$cog_ref)=@_;
open IN,$tab or die $!;
while (<IN>) {
chomp;
my @t = split;
print join("\t",@t[0..13]);
print "\t",$cog_ref->{$t[($col - 1)]} if (exists $cog_ref->{$t[($col - 1)]});
print "\n";
}
close IN;
}
die("perl $0 swiss.fa >swiss.simple.fa\n")unless(@ARGV==1);
my $uniprot_file = shift;
open(IN, $uniprot_file) || die ("can not open $uniprot_file\n");
$/=">"; <IN>; $/="\n";
while (<IN>) {
chomp;
#sp|Q01525|14332_ARATH 14-3-3-like protein GF14 omega OS=Arabidopsis thaliana GN=GRF2 PE=2 SV=2
my ($id,$desc) = ($2,$3) if(/^(decoy_sp|decoy_tr)\|([^\|]+)\|(.+)/);
$/=">";
my $seq = <IN>;
chomp $seq;
$/="\n";
print ">$id $desc\n$seq";
}
close(IN);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment