Skip to content

Instantly share code, notes, and snippets.

@leipzig
Created September 18, 2012 02:45
Show Gist options
  • Save leipzig/3740963 to your computer and use it in GitHub Desktop.
Save leipzig/3740963 to your computer and use it in GitHub Desktop.
This program converts mutations in mitochondrial sequences into HaploGrep format using the Mitomaster web service.
#!/usr/bin/perl
use strict;
use Getopt::Std;
use LWP::UserAgent;
use HTTP::Request::Common;
my %opt;
my $opt_string = 'hcms';
getopts( "$opt_string", \%opt ) or &usage();
&usage() if $opt{'h'};
#this is a lookup table for refseq->rCRS lookups
#only the values are important for this script
my %LUT=
qw(C182T C182T T217C T217C G228A G228A G247A G247A T250C T250C C295T C295T C458T C456T C464T C462T T479C T477C T491C T489C G752A A750A A829G A827G G1020A G1018A C1050T C1048T T1191C T1189C G1440A A1438A G1721A G1719A A1738G A1736G T2160C T2158C C2354T T2352T C2485T T2483T G2708A A2706A G2760A G2758A C2791T C2789T T2887C T2885C G3012A G3010A T3198C T3197C A3349G A3348G T3395C T3394C A3481G A3480G A3548G A3547G C3595T C3594T G3667A G3666A A3721G A3720G G3916A G3915A G3919A G3918A C3971T C3970T C3993T C3992T A4025G A4024G A4105G A4104G T4337C T4336C T4562C T4561C G4581A G4580A T4640C T4639C G4770A A4769A G4821A G4820A A4825G A4824G C4884T C4883T A4918G A4917G T4929C T4928C T4978C T4977C T5005C T5004C G5047A G5046A C5179A C5178A C5264T C5263T A5391G A5390G T5443C T5442C G5461A G5460A T5496C T5495C A5657G A5656G G5774A G5773A A5952G A5951G G6027A G6026A C6046T C6045T T6072C T6071C T6153C T6152C T6222C T6221C G6261A G6260A T6366C T6365C C6372T C6371T C6456T C6455T T6681C T6680C T6720C T6719C G6735A G6734A A6753G A6752G T6777C T6776C A7056G A7055G A7147G A7146G T7176C T7175C C7257T C7256T C7275T C7274T G7522A G7521A A7769G A7768G G8207A G8206A G8270A G8269A T8278C T8277C G8617T G8616T C8656T C8655T A8870G A8869G A9073G A9072G T9091C T9090C A9094G A9093G A9222G A9221G G9378A A9377A G9478A G9477A C9541T T9540T A9668G A9667G T9699C T9698C T9717C T9716C T9900C T9899C T9951C T9950C T10035C T10034C A10045G A10044G T10239C T10238C G10311A G10310A T10322C T10321C G10399A A10398A T10464C T10463C A10551G A10550G G10587A G10586A G10590A G10589A G10689A G10688A C10874T T10873T T10916C T10915C A11252G A11251G G11378A G11377A A11468G A11467G T11486C T11485C C11723T T11722T A11813G A11812G T11900C T11899C G11915A G11914A G11970A G11969A A12309G A12308G G12373A G12372A T12415C T12414C G12631A G12630A C12670T C12669T T12706C C12705C G12851A A12850A A13106G A13105G A13264G A13263G T13501C T13500C C13651T C13650T A13781G A13780G T13790C T13789C T13966C T13965C G14017A G14016A T14179C T14178C C14213T T14212T A14234G A14233G T14471C T14470C G14581A A14580A A14583G A14582G T14767C T14766C T14784C T14783C T14799C T14798C A14906G G14905G G15044A G15043A G15111A G15110A A15219G A15218G A15245G A15244G G15258A G15257A A15302G G15301G C15536T C15535T T15671C T15670C A15759G A15758G T15785C T15784C C15834T C15833T C15905T C15904T A15908G A15907G A15925G A15924G G15929A G15928A G15931A G15930A G16130A G16129A T16145C T16144C G16146A G16145A C16149T C16148T A16163G A16162G A16164G A16163G C16184A A16183A C16272T C16270T C16280T C16278T C16329T C16327T G16392A G16390A G16393A G16391A C16521T T16519T);
my @probes = values %LUT;
my @probePos = map { local $_ = $_; s/\D//g; $_ } @probes;
my @probeRefs = map { m/^(\S)/; $1} @probes;
my @probeMuts = map { m/(\S)$/; $1} @probes;
my %probeHash;
@probeHash{@probePos} = @probeMuts;
my $userAgent = LWP::UserAgent->new(timeout => 1800); #a half-hour
my %mm_server=('production'=>'http://mitomaster.mitomap.org/cgi-bin/websrvc.cgi','development'=>'http://resmitod.research.chop.edu/mitomasterdev/websrvc.cgi');
my $request = POST $mm_server{'production'},
Content_Type => 'multipart/form-data',
Content => [ file => [$ARGV[0]],
fileType => 'sequences'];
my $response = $userAgent->request($request);
print $response->error_as_HTML . "
" if $response->is_error;
#haplogrep output
#SampleId Range Haplogroup Polymorphisms (delimited with tabs)
#TestSample1 "16024-576;" ? 16051G 16182C 16183C 16189C 16519C 73G 152C 263G 315.1C 356.1C 8281d 8282d 8283d 8284d 8285d 8286d 8287d 8288d 8289d 11914A
#TestSample2 "16024-16569 ;1-576;" ? 16224C 16311C 16519C 73G 146C 152C 263G 309.1C 315.1C 573.1C 573.2C 573.3C
#gi|78775905|gb|DQ272108.1| 16 16 A T transversion unknown non-coding G3a1 0 0 3838 true false false
my %encounteredSamples;
if ($response->is_success) {
my $report=$response->decoded_content;
#print $report; #debug
my @report_lines = split /\n/,$report;
#get ranges
my %ranges;
foreach my $repLine(@report_lines){
my @cols = split /\t/,$repLine;
my ($sampleid,$pos)=@cols[0,1];
if(!defined $ranges{$sampleid}){$ranges{$sampleid}{'start'}=$pos;$ranges{$sampleid}{'end'}=$pos;}
else{$ranges{$sampleid}{'end'}=$pos}
}
print "SampleId\tRange\tHaplogroup\tPolymorphisms";
foreach my $repLine(@report_lines){
my @cols = split /\t/,$repLine;
my ($sampleid,$pos,$ref,$mut,$type)=@cols[0,1,3,4,5];
if((!$opt{'c'} && !$opt{'m'}) || $probeHash{$pos}){
if (!defined $encounteredSamples{$sampleid}){
print "\n$sampleid.\t\"".$ranges{$sampleid}{'start'}."-".$ranges{$sampleid}{'end'}.";\"\t?";
$encounteredSamples{$sampleid}=1;
}
unless($pos==3107){
if($type eq 'insertion' && !$opt{'s'}){
my @ins_nts=split //,$mut;
#begin at 1 since the insertion format is A->ACC
for(my $intPos=1;$intPos<scalar(@ins_nts);$intPos++){
print "\t".$pos.".".$intPos.$ins_nts[$intPos];
}
}
elsif($type eq 'deletion' && !$opt{'s'}){
#1348 1351 GG : deletion
my @deletion_nts = split //,$ref;
for(my $delPos=$pos,my $nt=0;$nt<scalar(@deletion_nts);$nt++,$delPos++){
print "\t".$delPos."d";
}
}
elsif($type eq 'transition' || $type eq 'transversion'){
if(!$opt{'m'} || $probeHash{$pos} eq $mut){
print "\t".$pos.$mut;
}
}
else{
die("Mutation type unknown: ".$repLine."\n")
}
}
}
}
print "\n";
} else {
die $response->status_line;
}
sub usage
{
print STDERR <<'EOF';
This program converts mutations in mitochondrial sequences into
HaploGrep format using the Mitomaster web service.
usage: ./seq2haplogrep.pl INPUT > myOutput.txt
INPUT: A mitochondrial sequence
-h : this (help) message
-c : report a position only if it is in the 177 probeset (e.g. C182T or C182A or C182G)
-m : report a mutation only if it is in the 177 probeset (e.g. only C182T)
-s : report substitutions only
OUTPUT: HaploGrep calls to stdout
EOF
exit;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment