Skip to content

Instantly share code, notes, and snippets.

@avrilcoghlan
Created March 1, 2013 15:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save avrilcoghlan/5065299 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/5065299 to your computer and use it in GitHub Desktop.
Perl script that connects to the TreeFam mysql database, and prints out a list of Caenorhabditis elegans and Caenorhabditis briggsae genes in TreeFam families
#!/usr/local/bin/perl
#
# Perl script list_treefam_genes3.pl
# Written by Avril Coghlan (alc@sanger.ac.uk).
# 18-JAN-06.
# Updated 6-Dec-07.
#
# For the TreeFam project.
#
# This perl script connects to the MYSQL database of
# TreeFam families and prints out a list of the C. elegans and
# C. briggsae genes in TreeFam families.
#
# The output has the format:
# WORM_GENE NUMBER_OF_FAMILIES FAMILIES
# where WORM_GENE is the gene name, eg., R13F6.4 for a C. elegans
# gene or CBG100063 for a C. briggsae gene,
# NUMBER_OF_FAMILIES is the number of TreeFam families that
# WORM_GENE appears in,
# FAMILIES is a list of the families that WORM_GENE is in.
#
# The command-line format is:
# % perl <list_treefam_genes3.pl>
#
#------------------------------------------------------------------#
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:
$num_args = $#ARGV + 1;
if ($num_args != 0)
{
print "Usage of list_treefam_genes3.pl\n\n";
print "perl -w list_treefam_genes3.pl\n";
print "For example, >perl -w list_treefam_genes3.pl\n";
exit;
}
# DECLARE MYSQL USERNAME AND HOST:
use DBI;
#------------------------------------------------------------------#
# GET THE LONG NAMES OF THE TREEFAM GENES FROM THE MYSQL DATABASE:
%WORM = (); # HASH TABLE TO KEEP A LIST OF WORM GENES IN TREEFAM.
$database = 'treefam_4';
$dbh = DBI->connect("dbi:mysql:treefam_4:db.treefam.org:3308", 'anonymous', '') || return;
$table_w = 'genes';
# THIS TABLE HAS THE ID AND DISPLAY ID. SOMETIMES THE DISPLAY ID IS
# THE UNIPROT NAME, SOMETIMES THE ID IS:
$st = "SELECT ID, TAX_ID from $table_w";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array) {
$ID = $array[0]; # eg., AH3.1 for a C. elegans gene or
# WBGene00024691 for a C. briggsae gene
$TAX_ID = $array[1]; # eg., 6239 for a C. elegans gene or
# 6238 for a C. briggsae gene
if ($TAX_ID == 6239 || $TAX_ID == 6238) # IT IS A C. ELEGANS OR C. BRIGGSAE GENE
{
# REMEMBER THAT THIS WORM GENE IS IN TREEFAM:
$WORM{$ID} = $TAX_ID;
}
}
}
$rc = $dbh->disconnect();
$rc = "";
#------------------------------------------------------------------#
# GET THE NAMES OF ALL THE TREEFAM FAMILIES AND THE GENES THAT ARE IN
# THEM FROM THE MYSQL DATABASE:
$database = 'treefam';
$dbh = DBI->connect("dbi:mysql:treefam_4:db.treefam.org:3308", 'anonymous', '') || return;
# FIRST READ IN TREEFAM-A AND THEN TREEFAM-B:
%FAMILY = (); # HASH TABLE TO KEEP A RECORD OF THE TREEFAM FAMILIES THAT A WORM GENE IS IN.
for ($i = 1; $i <= 2; $i++)
{
# SPECIFY THE TABLE:
if ($i == 1) # LOOK AT TREEFAM-A:
{
$table_w = 'famA_gene';
}
elsif ($i == 2) # LOOK AT TREEFAM-B:
{
$table_w = 'famB_gene';
}
# THE FIRST THREE COLUMNS IN THE TABLE famB_gene/famA_gene ARE THE TRANSCRIPT NAME, FAMILY NAME AND WHETHER THE
# TRANSCRIPT IS IN THE SEED/FULL TREE:
# eg., ENSMUST00000049178.2 TF105085 FULL
$st = "SELECT ID, AC, FLAG from $table_w";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array) {
$ID = $array[0]; # eg., F40G9.2.1 for a C. elegans gene OR WBGene00027163 for a C. briggsae gene.
$AC = $array[1]; # eg., TF105085, NAME OF THE TREEFAM FAMILY.
$FLAG = $array[2]; # eg., FULL OR BOTH
if (($FLAG eq 'FULL' || $FLAG eq 'BOTH')) # WE ARE ONLY INTERESTED IN THE FULL TREES.
{
# CHECK IF IT IS A C. BRIGGSAE OR C. ELEGANS GENE:
if ($WORM{$ID})
{
# REMEMBER THE FAMILIES THAT THIS WORM GENE IS IN:
if (!($FAMILY{$ID})) { $FAMILY{$ID} = $AC; }
else { $FAMILY{$ID} = $FAMILY{$ID}.",".$AC;}
}
}
}
}
}
$rc = $dbh->disconnect();
$rc = "";
#------------------------------------------------------------------#
# PRINT OUT A LIST OF THE WORM GENES THAT APPEAR IN TREEFAM, AND THE
# FAMILIES THAT THEY APPEAR IN:
print "WORM_GENE NUMBER_OF_FAMILIES FAMILIES\n";
foreach $ID (keys %FAMILY)
{
$family = $FAMILY{$ID};
@family = split(/\,/,$family); # THIS IS A LIST OF THE FAMILIES THAT A WORM GENE APPEARS IN.
$no_families = $#family + 1; # THIS IS THE NUMBER OF FAMILIES THAT A WORM GENE APPEARS IN.
print "$ID $no_families $family\n";
}
#------------------------------------------------------------------#
print STDERR "FINISHED.\n";
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment