Skip to content

Instantly share code, notes, and snippets.

@avrilcoghlan
Created March 1, 2013 14:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save avrilcoghlan/5064878 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/5064878 to your computer and use it in GitHub Desktop.
Perl script that reads in a list of 2-C.elegans-to-1-C.briggsae orthologs, and finds cases where the two Caenorhabditis elegans genes are on different chromosomes, with one on an autosome and one X-linked.
#!/usr/local/bin/perl
#
# Perl script get_chroms_from_treefam.pl
# Written by Avril Coghlan (a.coghlan@ucc.ie)
# 9-Sep-09.
#
# THis perl script reads in a list of 2Ce-to-1Cb orthologs and
# finds cases where the 2 Ce genes are on different chromosomes,
# with one on an autosome and one X-linked.
#
# The command-line format is:
# % perl <get_chroms_from_treefam.pl> <input> <release> <orthostrap_cutoff>
# where <input> is the input file of 2Ce-to-1Cb orthologs,
# <release> is the release of TreeFam to use,
# <orthostrap_cutoff> is the orthostrap cutoff.
#
#------------------------------------------------------------------#
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:
$num_args = $#ARGV + 1;
if ($num_args != 4)
{
print "Usage of get_chroms_from_treefam.pl\n\n";
print "perl get_chroms_from_treefam.pl <input> <release> <species> <orthostrap_cutoff>\n";
print "where <input> is the input file of 2Ce-to-1Cb orthologs,\n";
print " <release> is the release of TreeFam to use,\n";
print " <species> is the species containing the paralogs (eg. CAEEL),\n";
print " <orthostrap_cutoff> is the orthostrap cutoff.\n";
print "For example, >perl get_chroms_from_treefam.pl CeCb_2to1_reliable_orths_9Sept09 7 CAEEL 70\n";
exit;
}
# READ IN MY PERL MODULES:
use Avril_modules;
use Treefam::DBConnection;
use DBI;
# FIND THE NAME OF THE INPUT FILE, AND RELEASE OF TREEFAM TO USE:
$input = $ARGV[0];
$release = $ARGV[1];
$the_species = $ARGV[2];
$orthostrap_cutoff = $ARGV[3];
#------------------------------------------------------------------#
# GET A LIST OF ALL FULLY SEQUENCED TREEFAM SPECIES:
$database = 'treefam_'.$release;
%SEQUENCED = ();
%TAXID = ();
$num = 0;
$dbh = DBI->connect("dbi:mysql:$database:db.treefam.org:3308", 'anonymous', '') || return;
$table_w = 'species';
$st = "SELECT SWCODE, TAX_ID from $table_w WHERE FLAG=\"1\"";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$num++;
$swcode = $array[0]; # eg. CAEEL
$taxid = $array[1];
$SEQUENCED{$swcode} = 1;
$TAXID{$swcode} = $taxid;
}
}
$rc = $dbh->disconnect();
$rc = "";
if (!($TAXID{$the_species})) { print STDERR "ERROR: do not know taxid for $the_species\n"; exit;}
print STDERR "Read in fully sequenced species...\n";
#------------------------------------------------------------------#
# FIND THE ID OF EACH TREEFAM GENE FROM SPECIES $the_species:
print STDERR "Now reading in all genes from $the_species...\n";
if (!($TAXID{$the_species})) { print STDERR "ERROR: do not know taxid for $the_species\n"; exit;}
$taxid1 = $TAXID{$the_species};
print STDERR "taxid1 $taxid1 for $the_species\n";
# HASH TABLES TO REMEMBER THE SPECIES OF GENES:
$dbh = DBI->connect("dbi:mysql:$database:db.treefam.org:3308", 'anonymous', '') || return;
# SPECIFY THE TABLE:
$table_w = 'genes';
# THE FIRST COLUMN IN THIS TABLE IS AN IDENTIFIER (IDX),
# ANOTHER COLUMN IS THE TRANSCRIPT NAME, AND THE LAST COLUMN
# IS THE TAXONOMY ID, eg. 1 ENSANGT00000032162.1 7165
$st = "SELECT GID, TAX_ID, ID from $table_w WHERE TAX_ID=$taxid1";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
%ID = ();
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array) {
$GID = $array[0]; # eg. T06E6.2a
$TAXID = $array[1]; # eg 7165
$ID = $array[2];
if ($TAXID{$the_species} eq $TAXID)
{
$ID{$GID} = $ID;
}
}
}
$rc = $dbh->disconnect();
$rc = "";
print STDERR "Read in all genes from $the_species\n";
#------------------------------------------------------------------#
# READ IN ALL THE CHROMOSOMAL POSITIONS OF GENES FROM $the_species:
$dbh = DBI->connect("dbi:mysql:$database:db.treefam.org:3308", 'anonymous', '') || return;
open(INPUT,"$input") || die "ERROR: cannot open $input.\n";
while(<INPUT>)
{
$line = $_;
chomp $line;
@temp = split(/\s+/,$line);
$ce1 = $temp[1];
$ce2 = $temp[6];
$orthostrap1 = $temp[3];
$orthostrap2 = $temp[8];
@temp = split(/=/,$orthostrap1);
$orthostrap1 = $temp[3];
@temp = split(/=/,$orthostrap2);
$orthostrap2 = $temp[3];
if ($orthostrap1 >= $orthostrap_cutoff && $orthostrap2 >= $orthostrap_cutoff)
{
if (!($ID{$ce1})) { print STDERR "ERROR: do not know id for $ce1- $ID{$ce1}\n"; exit;}
$ce1_id = $ID{$ce1};
if (!($ID{$ce1})) { print STDERR "ERROR: do not know id for $ce2-\n"; exit;}
$ce2_id = $ID{$ce2};
$ce1_chrom = "unknown";
$ce2_chrom = "unknown";
# FIND THE CHROMOSOMAL POSITIONS OF $ce1 AND $ce2:
$table_w = 'map';
$st = "SELECT ID, TARGET from $table_w WHERE (ID=\'$ce1_id\' OR ID=\'$ce2_id\')";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$the_id = $array[0];
$the_chrom = $array[1];
if ($the_id eq $ce1_id) { $ce1_chrom = $the_chrom;}
elsif ($the_id eq $ce2_id) { $ce2_chrom = $the_chrom;}
}
}
if (($ce1_chrom eq 'chromosomeX' && $ce2_chrom ne 'chromosomeX') ||
($ce2_chrom eq 'chromosomeX' && $ce1_chrom ne 'chromosomeX'))
{
print "$ce1 $ce1_chrom $ce2 $ce2_chrom - $line\n";
}
}
}
close(INPUT);
$rc = $dbh->disconnect();
$rc = "";
#------------------------------------------------------------------#
print STDERR "FINISHED.\n";
#------------------------------------------------------------------#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment