Skip to content

Instantly share code, notes, and snippets.

@pmenzel
Created April 27, 2023 10:28
Show Gist options
  • Save pmenzel/28c880a973340a18bd0d09f3fd029a96 to your computer and use it in GitHub Desktop.
Save pmenzel/28c880a973340a18bd0d09f3fd029a96 to your computer and use it in GitHub Desktop.
#!/usr/bin/env perl
#
# 2019, Peter Menzel, Labor Berlin
# This script converts the Kraken output format into
# a 3-column output format with columns:
# 1: read name
# 2: taxon id
# 3: score, which is defined as the fraction of k-mers having that taxon id and its ancestors over all k-mers
#
# use as input for ktImportTaxonomy
#
# the program needs to know the k-mer size that the kraken database uses, can be specified with option -k
#
use strict;
use warnings;
use Getopt::Std;
my $kmerlen = 35;
my %options=();
getopts("t:k:", \%options);
exists($options{t}) && defined($options{t}) or die "Specify path to taxonomy folder containing nodes.dmp and merged.dmp via option -t\n";
my $taxdir = $options{t};
$taxdir =~ s/nodes\.dmp//; # just in case the path to nodes.dmp is passed
$taxdir .= '/' unless $taxdir =~ m,/$,;
if(exists($options{k}) && defined($options{k})) {
$kmerlen = $options{k};
}
my $nodesdmp = $taxdir.'nodes.dmp';
if(!defined $ARGV[0]) { die "Usage: kraken-score.pl -t /path/to/taxonomy /path/to/kraken-output-file.tsv\n"; }
my %nodes;
open(NODES,$nodesdmp) or die "Cannot open $nodesdmp\n";
while(<NODES>) {
my @F = split(/\|/,$_);
if(@F > 3) {
my $id = -+- $F[0];
my $parentid = -+- $F[1];
$nodes{$id} = $parentid;
}
}
close(NODES);
while(<>) {
chomp;
next unless length; #skip empty lines
my @F = split(/\t/,$_);
if(@F != 5) { die "Error: Not exactly 5 columns in line $_\n"; };
if($F[0] eq "U") {
print $F[1],"\t0\t1\n";
next;
}
if($F[2]=~/.*\(taxid (\d+)\).*/ or $F[2]=~/(\d+)/) {
my $taxid = $1;
#get all ancestors for taxon
my $lineage;
my $node = $taxid;
while(defined($nodes{$node})) {
$lineage .= " ".$node;
my $parent = $nodes{$node};
if($node == $parent) { last; }
$node = $parent;
}
$lineage .= " ";
my $sum_kmers_all = 0;
while($F[3] =~ /(\d+)/g) { # add the number of k-mers for both reads from column 4, e.g. "250:251"
if($1 > $kmerlen) {
$sum_kmers_all += ($1 - $kmerlen + 1);
}
}
my $sum_kmers_taxid = 0;
while($F[4] =~ /(\d+):(\d+)/g) { # sum up the counts for the lineage tax ids in column 5
my $id = $1;
my $count = $2;
if($lineage =~ / $id /) {
$sum_kmers_taxid += $count;
}
}
if($sum_kmers_all > 0) {
printf("%s\t%i\t%.2f\n", $F[1], $taxid, $sum_kmers_taxid / $sum_kmers_all);
}
else {
print $F[1],"\t0\t1\n";
}
}
else {
die "Error: Cannot read taxon id from line $_\n";
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment