Skip to content

Instantly share code, notes, and snippets.

@avrilcoghlan
Created March 1, 2013 16:43
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save avrilcoghlan/5065941 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/5065941 to your computer and use it in GitHub Desktop.
Perl script that connects to the TreeFam mysql database, and finds Schistosoma mansoni genes that are multi-copy, but that have just one or two orthologs in most other animals.
#!/usr/local/bin/perl
#
# Perl script treefam_flatworm2.pl
# Written by Avril Coghlan (alc@sanger.ac.uk).
# 30-Jun-06.
#
# This perl script connects to the MYSQL database of
# TreeFam families and finds flatworm genes that are
# multi-copy, that have 1 or 2 orthologs in most
# other animals.
#
# The command-line format is:
# % perl <treefam_flatworm2.pl> <db>
# where db says whether to use TreeFamA or TreeFamB.
#
#------------------------------------------------------------------#
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:
$num_args = $#ARGV + 1;
if ($num_args != 1)
{
print "Usage of treefam_flatworm2.pl\n\n";
print "perl -w treefam_flatworm2.pl <db>\n";
print "where <db> says whether to use TreeFamA or TreeFamB.\n";
print "For example, >perl -w treefam_flatworm2.pl A\n";
exit;
}
# READ IN MY PERL MODULES:
BEGIN {
unshift (@INC, '/nfs/team54/alc/perl/modules');
}
# DECLARE MYSQL USERNAME AND HOST:
use Avril_modules;
# DECLARE MYSQL USERNAME AND HOST:
use DBI;
# CHECK WHETHER WE SHOULD USE TREEFAM A OR TREEFAM B:
$db = $ARGV[0];
#------------------------------------------------------------------#
# FIND THE SPECIES OF EACH TREEFAM GENE:
# HASH TABLES TO REMEMBER THE SPECIES OF GENES:
%SPECIES = ();
$database = 'treefam_3';
$dbh = DBI->connect("dbi:mysql:treefam_3:db.treefam.org:3308", 'anonymous', '') || return;
# SPECIFY THE TABLE:
$table_w = 'genes';
# THE FIRST COLUMN IN THIS TABLE IS AN IDENTIFIER (IDX), AND
# THE SECOND COLUMN IS THE TRANSCRIPT NAME, AND THE LAST COLUMN
# IS THE TAXONOMY ID, eg. 1 ENSANGT00000032162.1 7165
$st = "SELECT ID, TAX_ID from $table_w";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array) {
$ID = $array[0]; # eg., ENST00000356572.1
$TAXID = $array[1]; # eg 7165
if ($TAXID eq '9606') { $SPECIES{$ID} = 'HS';} # HOMO SAPIENS
elsif ($TAXID eq '7227') { $SPECIES{$ID} = 'DM';} # DROSOPHILA MELANOGASTER
elsif ($TAXID eq '6239') { $SPECIES{$ID} = 'CE';} # CAENORHABDITIS ELEGANS
elsif ($TAXID eq '9031') { $SPECIES{$ID} = 'GG';} # GALLUS GALLUS
elsif ($TAXID eq '10116'){ $SPECIES{$ID} = 'RN';} # RATTUS NORVEGICUS
elsif ($TAXID eq '10090'){ $SPECIES{$ID} = 'MM';} # MUS MUSCULUS
elsif ($TAXID eq '6183') { $SPECIES{$ID} = 'SM';} # SCHISTOSOMA MANSONI
}
}
$rc = $dbh->disconnect();
$rc = "";
#------------------------------------------------------------------#
# GET THE NAMES OF ALL THE TREEFAM FAMILIES AND THE GENES THAT ARE IN
# THEM FROM THE MYSQL DATABASE:
%NUM = ();
$database = 'treefam_3';
$dbh = DBI->connect("dbi:mysql:treefam_3:db.treefam.org:3308", 'anonymous', '') || return;
# SPECIFY THE TABLE:
if ($db eq 'A') # LOOK AT TREEFAM-A:
{
$table_w = 'famA_gene';
}
elsif ($db eq 'B') # LOOK AT TREEFAM-B:
{
$table_w = 'famB_gene';
}
else { print STDERR "ERROR: db is $db.\n"; exit;}
# THE FIRST THREE COLUMNS IN THE TABLE famB_gene/famA_gene ARE THE TRANSCRIPT NAME, FAMILY NAME AND WHETHER THE
# TRANSCRIPT IS IN THE SEED/FULL TREE:
# # eg., ENSMUST00000049178.2 TF105085 FULL
$st = "SELECT ID, AC, FLAG from $table_w";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array) {
$ID = $array[0]; # eg., ENSMUST00000049178.2
$AC = $array[1]; # eg., TF105085
$FLAG = $array[2]; # eg., FULL OR BOTH
if ($FLAG eq 'FULL' || $FLAG eq 'BOTH')
{
if ($SPECIES{$ID})
{
$species = $SPECIES{$ID};
$key = $AC."_".$species;
if (!($NUM{$key})) { $NUM{$key} = 1; }
else { $NUM{$key}++; }
}
}
}
}
$rc = $dbh->disconnect();
$rc = "";
#------------------------------------------------------------------#
# FIND FAMILIES THAT HAVE MANY SCHISTOSOMA MANSONI GENES, BUT
# JUST ONE OR TWO GENES IN OTHER SPECIES:
$database = 'treefam_3';
$dbh = DBI->connect("dbi:mysql:treefam_3:db.treefam.org:3308", 'anonymous', '') || return;
# SPECIFY THE TABLE:
if ($db eq 'A') # Use TreeFam-A
{
$table_w = 'familyA';
}
elsif ($db eq 'B') # Use TreeFam-B
{
$table_w = 'familyB';
}
else { print STDERR "ERROR: db is $db.\n"; exit;}
# THE FIRST COLUMN IN THE TABLE IS THE FAMILY ACCESSION NUMBER eg. TF105036:
$st = "SELECT AC from $table_w";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$AC = $array[0];
$HS = 0; $DM = 0; $CE = 0; $GG = 0; $RN = 0; $MM = 0; $SM = 0;
$key = $AC."_HS"; if ($NUM{$key}) { $HS = $NUM{$key};}
$key = $AC."_DM"; if ($NUM{$key}) { $DM = $NUM{$key};}
$key = $AC."_CE"; if ($NUM{$key}) { $CE = $NUM{$key};}
$key = $AC."_GG"; if ($NUM{$key}) { $GG = $NUM{$key};}
$key = $AC."_RN"; if ($NUM{$key}) { $RN = $NUM{$key};}
$key = $AC."_MM"; if ($NUM{$key}) { $MM = $NUM{$key};}
$key = $AC."_SM"; if ($NUM{$key}) { $SM = $NUM{$key};}
# IF THERE ARE >= 3 SCHISTOSOMA GENES AND LESS THAN OR EQUAL TO TWO GENES IN MOST
# OF THE OTHER SPECIES:
if ($SM >= 3 && ($HS == 1 || $HS == 2) && ($DM == 1 || $DM == 2) &&
($CE == 1 || $CE == 2) && ($GG == 1 || $GG == 2) &&
($RN == 1 || $RN == 2) && ($MM == 1 || $MM == 2))
{
print "$AC\n";
}
# IF THERE ARE >= 2 SCHISTOSOMA GENES AND ONE GENE IN MOST OTHER SPECIES:
elsif ($SM == 2 && $HS == 1 && $DM == 1 && $CE == 1 && $GG == 1 && $RN == 1 && $MM == 1)
{
print "$AC\n";
}
}
}
$rc = $dbh->disconnect();
$rc = "";
#------------------------------------------------------------------#
print STDERR "FINISHED.\n";
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment