Skip to content

Instantly share code, notes, and snippets.

@wchristian
Created August 9, 2010 00:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save wchristian/514713 to your computer and use it in GitHub Desktop.
Save wchristian/514713 to your computer and use it in GitHub Desktop.
package Greenphyl::Load::Family;
use strict;
use warnings;
no warnings 'once';
use Carp;
use Bio::Seq;
use Bio::SearchIO;
use Bio::SeqIO;
use lib "../../lib";
use Greenphyl::Path;
use Greenphyl::SequenceFilter 'filter_text_id';
umask 0000; # set the appropriate permissions for created files/folder to anydody as deamon gives only read permission to others
my ( $mysql, $dbh );
sub new {
my ( $class, %p ) = @_;
my $self = {};
$mysql = $p{mysql};
$dbh = $p{dbh};
bless( $self, $class );
return $self;
}
#calculate the number of sequence in the fasta input file
## inefficient. could be done with a grep in 1 line.
## qx/ grep '>' $file | wc -l /;
sub nbSequence {
my ( $file ) = @_;
my $total_size = 0;
open( IN, $file );
while ( <IN> ) {
$total_size++ if /^>/;
}
close( IN );
return $total_size;
}
# $liste_family is a text file containing family name that were modified and created
# this list will be useful for the maintenance when running MEME and the phylo pipeline
# premier hit ignore car sequence contre elle-meme = meilleur hit
# second hit : -> soit match contre une famille -> si longueur bonne integration dans la famille
# -> sinon ORPHAN
# -> match avec ORPHAN -> longueur bonne = nouvelle famille et on regarde les hit suivant -> si longueur bonne integration dans la nouvelle famille
# -> autre arret de l'algo (longueur mauvaise ou plus de hit)
# -> longueur mauvaise = ORPHAN
# -> math avec rien -> ORPHAN
sub loadFamily {
my ( $self, $file, $count, $evalue, $database, $out_number, $file_out, $file_temp, $species_name, $restart, %family_rem ) = @_;
my $pass = new Greenphyl::Path( $species_name );
my $cpt = 0;
my $cpt_new_family = 0;
my $cpt_modif_family = 0;
my $cpt_progress = 0;
my $nb_sequence = nbSequence( $file );
my %liste_pourcentage;
open( LOG, ">> $Greenphyl::Path::data" )
or die "cannot create the file $Greenphyl::Path::data $!";
open( FAMILY, ">> $pass->{file_result}$Greenphyl::Path::liste_family " )
|| die( "error not create the file $pass->{file_result}$Greenphyl::Path::liste_family\n" );
open( NFAM, ">> $pass->{file_result}$Greenphyl::Path::new_family" )
|| die( "error not create the file $pass->{file_result}$Greenphyl::Path::new_family\n" );
open( AFF, ">> $Greenphyl::Path::data_path/$Greenphyl::Path::log" )
|| die( "error not create the file $Greenphyl::Path::data_path/$Greenphyl::Path::log\n" );
my $in = Bio::SeqIO->newFh( -file => $file, -format => 'fasta' );
while ( <$in> ) {
# extract a single query sequence from the fasta file with the newly added sequences
my $id = filter_text_id( $_->display_id ); #query
my $seq = $_->seq;
my $fasta = $_;
my $cpt_hit = 0; #counter for the number of hit with blast
my $hit_while = 0; #to last the loop
$cpt_progress++;
if ( $cpt_progress > $restart ) {
print "\n----------------------------------------------\n";
print AFF "Gene family modified: " . $cpt_progress . " / " . $nb_sequence . "\n";
print $cpt_progress. " / " . $nb_sequence . " => $id \n";
$cpt++;
##check if query is in the database
my $sql = "SELECT seq_id FROM sequences WHERE seq_textid = '$id'";
my @rep = $mysql->select( $sql );
unless ( $rep[0] ) {
print "ERROR : sequence is not in the db!<br />";
last;
}
my $seq = $_->seq;
#++++ should be in run::blast
##creation input fasta file as input for blast software
$file_temp = clipboard( $seq, $file_temp );
system( "chmod 777 $file_temp" );
$count++;
# run blast to compare it to blast database containing other sequences across all species
my $cmd = "$Greenphyl::Config::BLASTALL -i $file_temp -e $evalue -p blastp -F none -d $database -K $out_number -v $out_number -b $out_number -o $file_out";
system( $cmd );
my $found = 0;
if ( -e $file_out ) # if blast output exist
{
# load blast results
my $in = new Bio::SearchIO(
-format => 'blast',
-file => "$file_out"
);
system( "chmod 666 $file_out" );
#++++
# collect all sequences that are similar to this one from the blast result
# and see if the query sequence can be added to a family or combined with another sequence to form a new family
# making sure that it is not accidentally combined with itself
while ( my $result = $in->next_result and $hit_while == 0 ) {
my $level1 = 0;
my $cpt = 0;
while ( my $hit = $result->next_hit
and $hit_while == 0 ) #sortir ici
{
my $seq_textid = filter_text_id( $hit->name );
$cpt_hit++;
if ( $seq_textid ne $id ) # ignore first hit
{
$found++;
$seq_textid = ucfirst( lc( $seq_textid ) );
my $sql = "SELECT S.seq_textid, F.family_name, F.family_id, FO.class_id
FROM sequences S, seq_is_in SI, family F, found_in FO
WHERE S.seq_textid =\"$seq_textid\" and S.seq_id = SI.seq_id and SI.family_id = F.family_id and F.family_id= FO.family_id";
my @rep7 = $mysql->select( $sql );
my $count_verif; #pour verifier la taille de la sequence
# hit found in an existing family
if ( @rep7 ) {
print "Find family for $seq_textid \t";
my $family_id = $rep7[0][2]; #retrieve family id at level 1
$count_verif = &count( $family_id, $id ); #verifier si ce sont les bons parametres
if ( $count_verif == 1 ) # length ok
{
## retrieve family_id at level 1 down to 4 (if exists)
for my $i ( 0 .. $#rep7 ) {
my $family_id = $rep7[$i][2];
if ( $found == 1 ) {
#retrieve seq_id for the sequence to be inserted in seq_is_in
my $sql = "SELECT seq_id FROM sequences WHERE seq_textid = '" . $id . "'";
my @rep = $mysql->select( $sql );
my $seq_id = $rep[0];
## check if the sequence is not already classified
my $sql1 = "SELECT SI.family_id
FROM seq_is_in SI, found_in F
WHERE SI.seq_id = '"
. $seq_id . "' and F.family_id = SI.family_id and F.class_id = '" . $rep7[$i][3] . "'";
my @rep2 = $mysql->select( $sql1 );
##if classified
if ( $rep2[0] ) {
#Deja classifiee au niveau $rep7[$i][3] !!<br />\n";
last;
}
#classify second hit
else {
print FAMILY "$family_id\n";
print LOG "Insertion of the sequence $id in the $family_id family\n";
%liste_pourcentage = &dataFamily( $family_id, %liste_pourcentage );
my $sql2 = "INSERT INTO seq_is_in (seq_id, family_id) VALUES(\"$seq_id\",\"$family_id\")";
$mysql->insert( $sql2 );
$cpt_modif_family++;
}
}
}
}
else #length not ok : query = orphan
{
$hit_while = 1; #last the loop
next;
}
}
# hit found est un orphan
else {
##recup seq_id de query
my $sql = "SELECT seq_id FROM sequences WHERE seq_textid = '$id'";
my @rep = $mysql->select( $sql );
my $seq_id = $rep[0];
## verifie que query deja pas classifié dans la base avec 1er orphan de la sortie du blast
my $sql2 = "SELECT family_id FROM seq_is_in WHERE seq_id = '$seq_id'";
my @rep2 = $mysql->select( $sql2 );
my $family_query = $rep2[0];
#pour premier orphan trouve cree une famille, voir else
##deja classe:va voir si autre sorties de blast sont des orphans et si oui les classe dans meme famille
if ( $family_query ) {
##recup id des autres sortie du blast
my $sql = "SELECT seq_id FROM sequences WHERE seq_textid = '$seq_textid'";
my @rep = $mysql->select( $sql );
my $seq_id = $rep[0];
if ( $seq_id ) {
## verifie les autres sortie blast
my $sql = "SELECT family_id FROM seq_is_in WHERE seq_id = '$seq_id'";
my @rep = $mysql->select( $sql );
my $family_id2 = $rep[0];
if ( $family_id2 ) {
$hit_while = 1; #on va pas regarder les autres hits, on stop la boucle ici.
}
else #sortie est un orphan
{
$count_verif = &count( $family_query, $id ); #test la taille de la sequence
if ( $count_verif == 1 ) #le test de taille est concluant
{
%liste_pourcentage = &dataFamily( $family_query, %liste_pourcentage );
my $sql2 = "INSERT INTO seq_is_in (seq_id, family_id) VALUES(\"$seq_id\",\"$family_query\")";
$mysql->insert( $sql2 );
$dbh->do( "TRUNCATE go_family_cache;" );
print FAMILY "$family_query\n";
$cpt_new_family++;
}
}
}
}
#pas de famille cree : premier passage dans la boucle : creation d'une nouvelle famille avec l'orphan
#match avec un orphan
else {
print "new_family with $seq_textid\t";
###compare sequence length between the query and the hit
my $sql00 = "SELECT seq_length FROM sequences WHERE seq_textid = \"$id\""; #query
my $sql01 = "SELECT seq_length FROM sequences WHERE seq_textid = \"$seq_textid\""; #hit
my @rep00 = $mysql->select( $sql00 );
my @rep01 = $mysql->select( $sql01 );
# should be set at the begining of the script
my $soixante = ( $rep01[0] * 60 ) / 100;
my $centquarante = ( $rep01[0] * 140 ) / 100;
# TODO: this size comparison is pretty broken and will need to be replaced once cleaning is done ( minimum to value is wider range than max to value )
if ( ( $rep00[0] < $centquarante )
and ( $rep00[0] > $soixante ) ) #if length control is ok
{
# va chercher max family_number pour creer nouvelle famille
my $sql1 = 'SELECT MAX(family_number) FROM family';
my @rep = $mysql->select( $sql1 );
my $old_number = $rep[0] || 0;
my $family_number = $old_number + 1;
## create a new family in the db table family
my $new = "New created family by $species_name";
my $sql = "INSERT INTO family (family_name, family_number, inference) VALUES(\"$new\", \"$family_number\", '')";
$mysql->insert( $sql );
$dbh->do( "TRUNCATE go_family_cache;" );
## recup le family_id creer
my $sql3 = "SELECT family_id FROM family WHERE family_number LIKE \"$family_number\"";
my @rep1 = $mysql->select( $sql3 );
my $family_id = $rep1[0];
## new family is classify at level 1
my $nb = 1;
my $sql4 = "INSERT INTO found_in (family_id, class_id) values(\"$family_id\", \"$nb\")";
$mysql->insert( $sql4 );
$dbh->do( "TRUNCATE go_family_cache;" );
print "New family level 1: Family_id: $family_id\n";
## 2 sequences in the new family
##Query
my $sql7 = "SELECT seq_id FROM sequences WHERE seq_textid = '" . $seq_textid . "'";
my @rep3 = $mysql->select( $sql7 );
my $seq_id = $rep3[0];
my $sql2 = "INSERT INTO seq_is_in (seq_id, family_id) values(\"$seq_id\",\"$family_id\")";
$mysql->insert( $sql2 );
$dbh->do( "TRUNCATE go_family_cache;" );
##Ancien orphan
my $sql6 = "SELECT seq_id FROM sequences WHERE seq_textid = '" . $id . "'";
my @rep2 = $mysql->select( $sql6 );
my $seq_id2 = $rep2[0];
my $sql5 = "INSERT INTO seq_is_in (seq_id, family_id) values(\"$seq_id2\",\"$family_id\")";
$mysql->insert( $sql5 );
$dbh->do( "TRUNCATE go_family_cache;" );
%liste_pourcentage = &dataFamily( $family_id, %liste_pourcentage );
print FAMILY "$family_id\n";
print NFAM "$family_id\n";
}
else { #taille est incorrecte = orphan
print " too small ORPHAN \n";
$hit_while = 1;
}
}
}
} #end control if hit diff query
} # end boucle sur list HIT
}
}
}
}
$dbh->do( "call countFamilySeqBySpecies()" ) if $ENV{TESTING}; # TODO: this needs to be removed and count_family_seq updated properly!
print LOG "number of families amended : $cpt_modif_family \n";
print LOG "number of families created : $cpt_new_family \n";
close FAMILY;
close AFF;
&pourcentage( $species_name, \%liste_pourcentage, \%family_rem );
return $cpt_new_family, $cpt_modif_family;
}
#track the number of sequence added to the family
#by tracking the number at the beginning and added, we can calculate the number of sequence removed
#allow to calculate the percentage of modification
sub dataFamily {
my ( $family_id, %liste_pourcentage ) = @_;
if ( keys( %liste_pourcentage ) ) {
if ( exists $liste_pourcentage{$family_id} ) #family was modified at least once
{
$liste_pourcentage{$family_id} = $liste_pourcentage{$family_id} + 1;
}
else {
my $sql = "SELECT seq_id FROM seq_is_in WHERE family_id = $family_id";
my @rep = $mysql->select( $sql );
my $old_size = scalar( @rep );
$liste_pourcentage{$family_id} = 1;
}
}
else {
my $sql = "SELECT seq_id FROM seq_is_in WHERE family_id = $family_id";
my @rep = $mysql->select( $sql );
$liste_pourcentage{$family_id} = 1;
}
return %liste_pourcentage;
}
#track the number of changes in a family (number of new and/or removed sequences)
#calculate the percentage of modification for a family
# %liste_family contains id of each famiily with new sequences members : key = family_id value = number of sequences added
# %family contains id of each famiily where sequences were removed
sub pourcentage (\%) #BAD way to specify paramter type
{
my ( $species_name, $ref_liste_pourcentage, $ref_family_rem ) = @_;
my %liste_pourcentage = %$ref_liste_pourcentage;
my %family_rem = %$ref_family_rem;
my $pass = new Greenphyl::Path( $species_name );
open( POURCENTAGE, "> $pass->{file_result}$Greenphyl::Path::pourcentage " )
|| die( "error not create the file $pass->{file_result}$Greenphyl::Path::liste_family\n" );
for my $key ( keys %liste_pourcentage ) #regarde toutes les familles modifiees avec des ADD
{
my $sql = "SELECT seq_id FROM seq_is_in WHERE family_id = \"$key\"";
my @rep = $mysql->select( $sql );
my $new_size = scalar( @rep ); #family size
if ( exists $family_rem{$key} ) #famille a ete modifiee en + et -, on a une valeur de cle commune a %liste_family et %family_rem
{
my $old_size = $new_size + $liste_pourcentage{$key} - $family_rem{$key}; #ancienne valeur de la taille
my $pourcentage = ( ( $liste_pourcentage{$key} + $family_rem{$key} ) / $old_size ) * 100;
print POURCENTAGE "Family_id => " . $key . "\ttaille origine =>" . $old_size . "\t taille actuelle => " . $new_size . "\t Number of add= > " . $liste_pourcentage{$key} . "\tnumber of remove => " . $family_rem{$key} . "\t pourcentage of modification => " . $pourcentage . "\n";
delete( $family_rem{$key} ); #efface de la table des familles - cette entree car deja traitee
}
else #famille a ete modifiee en + seulement
{
my $old_size = $new_size - $liste_pourcentage{$key}; #ancienne taille = taille actuelle - ce qui a ete ajoute
my $pourcentage = ( $liste_pourcentage{$key} / $old_size ) * 100;
print POURCENTAGE "Family_id => " . $key . "\ttaille origine =>" . $old_size . "\t taille actuelle = > " . $new_size . "\t Number of add => " . $liste_pourcentage{$key} . "\tnumber of remove = /\t pourcentage of modification => " . $pourcentage . "\n";
}
}
for my $key ( keys %family_rem ) # que ce qui a ete modifie en - (car on a enleve ce qui etait commun avec %liste_family
{
my $sql = "SELECT seq_id FROM seq_is_in WHERE family_id = $key";
my @rep = $mysql->select( $sql );
my $new_size = scalar( @rep ); #taille de la famille
my $old_size = $new_size + $family_rem{$key};
my $pourcentage = ( $family_rem{$key} / $old_size ) * 100;
print POURCENTAGE "Family_id => " . $key . "\ttaille origine => " . $old_size . "\t taille actuelle => " . $new_size . "\t Number of add => /\tnumber of remove => " . $family_rem{$key} . "\t pourcentage of modification => " . $pourcentage . "\n";
}
close POURCENTAGE;
}
# if new_size = 0 obsolete family was probably removed from the db!
# function duplicated in many scripts, i don't know how to get rid of that function; TODO
sub clipboard {
my ( $clipboard, $file_temp ) = @_;
my @donnees = ( split( /\n/, $clipboard ) );
my ( @sequence, $name, $seq );
$name ||= '';
open( OUT, ">$file_temp" ) or die "cannot open file : $!";
for my $i ( 0 .. $#donnees ) {
if ( substr( $donnees[$i], 0, 1 ) eq ">" ) { $name = "$donnees[$i]"; }
else {
$seq .= $donnees[$i];
}
print OUT $name . $seq . "\n";
$name = "";
$seq = "";
}
close OUT;
return $file_temp;
}
=pod
=head2 count
B<Description>: perform control length on the sequences. sequences are inserted in the classification if the pass the test.
B<ArgsCount>: 2
=over 4
=item $seq_textid: (string)
sequence name of the query
=item $family_id: (int)
family identifier in which the query sequence may be classified
=back
=cut
sub count {
my ( $family_id, $seq_textid ) = @_;
my $cpt = 0;
my $somme_length = 0;
my %result;
my $sql = "SELECT seq_length FROM sequences WHERE seq_textid = \"$seq_textid\"";
my @rep = $mysql->select( $sql );
if ( @rep ) {
my $seq_length = $rep[0];
#select les sequences qui appartiennent a la meme famille que la seq que l'on veut etudier (pour le niveau 1 uniquement)
my $sql = "SELECT S.seq_textid, S.seq_length FROM sequences S, seq_is_in SI WHERE S.seq_id = SI.seq_id and SI.family_id = $family_id";
my @rep = $mysql->select( $sql );
if ( @rep ) #si trouve des sequences (si pas trouve alors que toutes les sequences de ALL sont dans la base = probleme !!!!!!)
{
for my $i ( 0 .. $#rep ) {
$cpt++;
$result{$cpt}{'length'} = $rep[$i][1]; #recupere pour chaque sequence de la famille dans une hashatable, la taille de chaque sequence
}
for my $test ( keys %result ) {
$somme_length += $result{$test}{'length'}; #fait la somme des longueurs des sequences pour la famille
}
my $moy_length = $somme_length / $cpt; #fait la moyenne
# should be more visible for configuration and should work with median
my $soixante = ( $moy_length * 60 ) / 100;
my $centquarante = ( $moy_length * 140 ) / 100;
if ( $seq_length < $soixante ) {
print "Too small sequence $seq_textid !!! $seq_length < $soixante \n";
return 0; # orphan, too small, sequence not integrated
}
if ( $seq_length > $centquarante ) {
print "Too large sequence $seq_textid !!! $seq_length > $centquarante \n";
return 0; #mauvais c'est un orphan, il est trop grand, on integre pas cette sequence.
}
else {
print "OK min $soixante max $centquarante for $seq_length\n";
return 1; #bon, on integre cette sequence.
}
}
else {
print "ERROR: the sequence is not in the Database ! \n";
exit;
}
}
else {
print "cannot find length for $seq_textid , maybe the sequence is not in the database\n";
}
}
1;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment