Skip to content

Instantly share code, notes, and snippets.

@avrilcoghlan
Created March 1, 2013 16:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save avrilcoghlan/5065676 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/5065676 to your computer and use it in GitHub Desktop.
Perl script that connects to the TreeFam mysql database, and retrieves all families that have just one human, one rat, one chicken, one Caenorhabditis elegans, and one Drosophila melanogaster gene (as well as possible additional genes from other species).
#!/usr/local/bin/perl
#
# Perl script find_simple_families3.pl
# Written by Avril Coghlan (alc@sanger.ac.uk).
# 27-Jun-05.
# Edited 30-Jun-06.
#
# For the TreeFam project.
#
# This perl script connects to the MYSQL database of
# TreeFam families and gets all full families that have
# one human, one mouse, one rat, one chicken, one C. elegans and
# one D. melanogaster gene.
#
# The command-line format is:
# % perl <find_simple_families3.pl> <db>
# where db says whether to use TreeFamA or TreeFamB.
#
#------------------------------------------------------------------#
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:
$num_args = $#ARGV + 1;
if ($num_args != 1)
{
print "Usage of find_simple_families3.pl\n\n";
print "perl -w find_simple_families3.pl <db>\n";
print "where <db> says whether to use TreeFamA or TreeFamB.\n";
print "For example, >perl -w find_simple_families3.pl A\n";
exit;
}
# READ IN MY PERL MODULES:
BEGIN {
unshift (@INC, '/nfs/team54/alc/perl/modules');
}
# DECLARE MYSQL USERNAME AND HOST:
use Avril_modules;
# DECLARE MYSQL USERNAME AND HOST:
use DBI;
# CHECK WHETHER WE SHOULD USE TREEFAM A OR TREEFAM B:
$db = $ARGV[0];
#------------------------------------------------------------------#
# FIND THE SPECIES OF EACH TREEFAM GENE:
# HASH TABLES TO REMEMBER THE SPECIES OF GENES:
%SPECIES = ();
$database = 'treefam_3';
$dbh = DBI->connect("dbi:mysql:treefam_3:db.treefam.org:3308", 'anonymous', '') || return;
# SPECIFY THE TABLE:
$table_w = 'genes';
# THE FIRST COLUMN IN THIS TABLE IS AN IDENTIFIER (IDX), AND
# THE SECOND COLUMN IS THE TRANSCRIPT NAME, AND THE LAST COLUMN
# IS THE TAXONOMY ID, eg. 1 ENSANGT00000032162.1 7165
$st = "SELECT ID, TAX_ID from $table_w";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array) {
$ID = $array[0]; # eg., ENST00000356572.1
$TAXID = $array[1]; # eg 7165
if ($TAXID eq '9606') { $SPECIES{$ID} = 'HS';} # HOMO SAPIENS
elsif ($TAXID eq '7227') { $SPECIES{$ID} = 'DM';} # DROSOPHILA MELANOGASTER
elsif ($TAXID eq '6239') { $SPECIES{$ID} = 'CE';} # CAENORHABDITIS ELEGANS
elsif ($TAXID eq '9031') { $SPECIES{$ID} = 'GG';} # GALLUS GALLUS
elsif ($TAXID eq '10116'){ $SPECIES{$ID} = 'RN';} # RATTUS NORVEGICUS
elsif ($TAXID eq '10090'){ $SPECIES{$ID} = 'MM';} # MUS MUSCULUS
}
}
$rc = $dbh->disconnect();
$rc = "";
#------------------------------------------------------------------#
# GET THE NAMES OF ALL THE TREEFAM FAMILIES AND THE GENES THAT ARE IN
# THEM FROM THE MYSQL DATABASE:
%DISCARD = ();
%NUM = ();
$database = 'treefam_3';
$dbh = DBI->connect("dbi:mysql:treefam_3:db.treefam.org:3308", 'anonymous', '') || return;
# SPECIFY THE TABLE:
if ($db eq 'A') # LOOK AT TREEFAM-A:
{
$table_w = 'famA_gene';
}
elsif ($db eq 'B') # LOOK AT TREEFAM-B:
{
$table_w = 'famB_gene';
}
else { print STDERR "ERROR: db is $db.\n"; exit;}
# THE FIRST THREE COLUMNS IN THE TABLE famB_gene/famA_gene ARE THE TRANSCRIPT NAME,
# FAMILY NAME AND WHETHER THE TRANSCRIPT IS IN THE SEED/FULL TREE:
# # eg., ENSMUST00000049178.2 TF105085 FULL
$st = "SELECT ID from $table_w WHERE ID=$family";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array) {
$ID = $array[0]; # eg., ENSMUST00000049178.2
$AC = $array[1]; # eg., TF105085
$FLAG = $array[2]; # eg., FULL OR BOTH
if (($FLAG eq 'FULL' || $FLAG eq 'BOTH') && !($DISCARD{$AC}))
{
if ($SPECIES{$ID})
{
$species = $SPECIES{$ID};
$key = $AC."_".$species;
if (!($NUM{$key})) { $NUM{$key} = 1; }
else { $DISCARD{$AC} = 1; } # IGNORE THIS FAMILY FROM NOW ON, AS IT HAS MORE THAN
# ONE GENE FROM THIS SPECIES.
}
}
}
}
$rc = $dbh->disconnect();
$rc = "";
#------------------------------------------------------------------#
# FIND FAMILIES THAT HAVE JUST ONE HUMAN, FLY, WORM, CHICKEN, MOUSE AND RAT GENE:
$database = 'treefam_3';
$dbh = DBI->connect("dbi:mysql:treefam_3:db.treefam.org:3308", 'anonymous', '') || return;
# SPECIFY THE TABLE:
if ($db eq 'A') # Use TreeFam-A
{
$table_w = 'familyA';
}
elsif ($db eq 'B') # Use TreeFam-B
{
$table_w = 'familyB';
}
else { print STDERR "ERROR: db is $db.\n"; exit;}
# THE FIRST COLUMN IN THE TABLE IS THE FAMILY ACCESSION NUMBER eg. TF105036:
$st = "SELECT AC from $table_w";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array) {
$AC = $array[0];
if (!($DISCARD{$AC}))
{
$HS = 0; $DM = 0; $CE = 0; $GG = 0; $RN = 0; $MM = 0;
$key = $AC."_HS"; if ($NUM{$key}) { $HS = 1;}
$key = $AC."_DM"; if ($NUM{$key}) { $DM = 1;}
$key = $AC."_CE"; if ($NUM{$key}) { $CE = 1;}
$key = $AC."_GG"; if ($NUM{$key}) { $GG = 1;}
$key = $AC."_RN"; if ($NUM{$key}) { $RN = 1;}
$key = $AC."_MM"; if ($NUM{$key}) { $MM = 1;}
if ($HS == 1 && $DM == 1 && $CE == 1 && $GG == 1 && $RN == 1 && $MM == 1)
{
print "$AC\n";
}
}
}
}
$rc = $dbh->disconnect();
$rc = "";
#------------------------------------------------------------------#
print STDERR "FINISHED.\n";
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment