Created
March 1, 2013 14:07
-
-
Save avrilcoghlan/5064878 to your computer and use it in GitHub Desktop.
Perl script that reads in a list of 2-C.elegans-to-1-C.briggsae orthologs, and finds cases where the two Caenorhabditis elegans genes are on different chromosomes, with one on an autosome and one X-linked.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/perl | |
# | |
# Perl script get_chroms_from_treefam.pl | |
# Written by Avril Coghlan (a.coghlan@ucc.ie) | |
# 9-Sep-09. | |
# | |
# THis perl script reads in a list of 2Ce-to-1Cb orthologs and | |
# finds cases where the 2 Ce genes are on different chromosomes, | |
# with one on an autosome and one X-linked. | |
# | |
# The command-line format is: | |
# % perl <get_chroms_from_treefam.pl> <input> <release> <orthostrap_cutoff> | |
# where <input> is the input file of 2Ce-to-1Cb orthologs, | |
# <release> is the release of TreeFam to use, | |
# <orthostrap_cutoff> is the orthostrap cutoff. | |
# | |
#------------------------------------------------------------------# | |
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS: | |
$num_args = $#ARGV + 1; | |
if ($num_args != 4) | |
{ | |
print "Usage of get_chroms_from_treefam.pl\n\n"; | |
print "perl get_chroms_from_treefam.pl <input> <release> <species> <orthostrap_cutoff>\n"; | |
print "where <input> is the input file of 2Ce-to-1Cb orthologs,\n"; | |
print " <release> is the release of TreeFam to use,\n"; | |
print " <species> is the species containing the paralogs (eg. CAEEL),\n"; | |
print " <orthostrap_cutoff> is the orthostrap cutoff.\n"; | |
print "For example, >perl get_chroms_from_treefam.pl CeCb_2to1_reliable_orths_9Sept09 7 CAEEL 70\n"; | |
exit; | |
} | |
# READ IN MY PERL MODULES: | |
use Avril_modules; | |
use Treefam::DBConnection; | |
use DBI; | |
# FIND THE NAME OF THE INPUT FILE, AND RELEASE OF TREEFAM TO USE: | |
$input = $ARGV[0]; | |
$release = $ARGV[1]; | |
$the_species = $ARGV[2]; | |
$orthostrap_cutoff = $ARGV[3]; | |
#------------------------------------------------------------------# | |
# GET A LIST OF ALL FULLY SEQUENCED TREEFAM SPECIES: | |
$database = 'treefam_'.$release; | |
%SEQUENCED = (); | |
%TAXID = (); | |
$num = 0; | |
$dbh = DBI->connect("dbi:mysql:$database:db.treefam.org:3308", 'anonymous', '') || return; | |
$table_w = 'species'; | |
$st = "SELECT SWCODE, TAX_ID from $table_w WHERE FLAG=\"1\""; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$num++; | |
$swcode = $array[0]; # eg. CAEEL | |
$taxid = $array[1]; | |
$SEQUENCED{$swcode} = 1; | |
$TAXID{$swcode} = $taxid; | |
} | |
} | |
$rc = $dbh->disconnect(); | |
$rc = ""; | |
if (!($TAXID{$the_species})) { print STDERR "ERROR: do not know taxid for $the_species\n"; exit;} | |
print STDERR "Read in fully sequenced species...\n"; | |
#------------------------------------------------------------------# | |
# FIND THE ID OF EACH TREEFAM GENE FROM SPECIES $the_species: | |
print STDERR "Now reading in all genes from $the_species...\n"; | |
if (!($TAXID{$the_species})) { print STDERR "ERROR: do not know taxid for $the_species\n"; exit;} | |
$taxid1 = $TAXID{$the_species}; | |
print STDERR "taxid1 $taxid1 for $the_species\n"; | |
# HASH TABLES TO REMEMBER THE SPECIES OF GENES: | |
$dbh = DBI->connect("dbi:mysql:$database:db.treefam.org:3308", 'anonymous', '') || return; | |
# SPECIFY THE TABLE: | |
$table_w = 'genes'; | |
# THE FIRST COLUMN IN THIS TABLE IS AN IDENTIFIER (IDX), | |
# ANOTHER COLUMN IS THE TRANSCRIPT NAME, AND THE LAST COLUMN | |
# IS THE TAXONOMY ID, eg. 1 ENSANGT00000032162.1 7165 | |
$st = "SELECT GID, TAX_ID, ID from $table_w WHERE TAX_ID=$taxid1"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
%ID = (); | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) { | |
$GID = $array[0]; # eg. T06E6.2a | |
$TAXID = $array[1]; # eg 7165 | |
$ID = $array[2]; | |
if ($TAXID{$the_species} eq $TAXID) | |
{ | |
$ID{$GID} = $ID; | |
} | |
} | |
} | |
$rc = $dbh->disconnect(); | |
$rc = ""; | |
print STDERR "Read in all genes from $the_species\n"; | |
#------------------------------------------------------------------# | |
# READ IN ALL THE CHROMOSOMAL POSITIONS OF GENES FROM $the_species: | |
$dbh = DBI->connect("dbi:mysql:$database:db.treefam.org:3308", 'anonymous', '') || return; | |
open(INPUT,"$input") || die "ERROR: cannot open $input.\n"; | |
while(<INPUT>) | |
{ | |
$line = $_; | |
chomp $line; | |
@temp = split(/\s+/,$line); | |
$ce1 = $temp[1]; | |
$ce2 = $temp[6]; | |
$orthostrap1 = $temp[3]; | |
$orthostrap2 = $temp[8]; | |
@temp = split(/=/,$orthostrap1); | |
$orthostrap1 = $temp[3]; | |
@temp = split(/=/,$orthostrap2); | |
$orthostrap2 = $temp[3]; | |
if ($orthostrap1 >= $orthostrap_cutoff && $orthostrap2 >= $orthostrap_cutoff) | |
{ | |
if (!($ID{$ce1})) { print STDERR "ERROR: do not know id for $ce1- $ID{$ce1}\n"; exit;} | |
$ce1_id = $ID{$ce1}; | |
if (!($ID{$ce1})) { print STDERR "ERROR: do not know id for $ce2-\n"; exit;} | |
$ce2_id = $ID{$ce2}; | |
$ce1_chrom = "unknown"; | |
$ce2_chrom = "unknown"; | |
# FIND THE CHROMOSOMAL POSITIONS OF $ce1 AND $ce2: | |
$table_w = 'map'; | |
$st = "SELECT ID, TARGET from $table_w WHERE (ID=\'$ce1_id\' OR ID=\'$ce2_id\')"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$the_id = $array[0]; | |
$the_chrom = $array[1]; | |
if ($the_id eq $ce1_id) { $ce1_chrom = $the_chrom;} | |
elsif ($the_id eq $ce2_id) { $ce2_chrom = $the_chrom;} | |
} | |
} | |
if (($ce1_chrom eq 'chromosomeX' && $ce2_chrom ne 'chromosomeX') || | |
($ce2_chrom eq 'chromosomeX' && $ce1_chrom ne 'chromosomeX')) | |
{ | |
print "$ce1 $ce1_chrom $ce2 $ce2_chrom - $line\n"; | |
} | |
} | |
} | |
close(INPUT); | |
$rc = $dbh->disconnect(); | |
$rc = ""; | |
#------------------------------------------------------------------# | |
print STDERR "FINISHED.\n"; | |
#------------------------------------------------------------------# | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment