Last active
August 29, 2015 14:00
-
-
Save standage/11264666 to your computer and use it in GitHub Desktop.
Given output of tdc-diff.py (https://gist.github.com/standage/10765999), extract sequences warranting further exploration.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import getopt, os, re, sys | |
class Seq: | |
def __init__(self, defline, seq): | |
self.defline = defline | |
self.seq = seq | |
match = re.match("^>(\S+)", defline) | |
self.id = match.group(1) | |
def parse_fasta(fp): | |
""" | |
Stolen shamelessly from http://stackoverflow.com/a/7655072/459780. | |
""" | |
name, seq = None, [] | |
for line in fp: | |
line = line.rstrip() | |
if line.startswith(">"): | |
if name: yield (name, ''.join(seq)) | |
name, seq = line, [] | |
else: | |
seq.append(line) | |
if name: yield (name, ''.join(seq)) | |
def print_usage(outstream): | |
usage = ("\nUsage: tdc-extract [options] s1.cds.fa s2.cds.fa < tdc-diff.txt\n" | |
" Options:\n" | |
" -h|--help: print this help message and exit\n" | |
" -o|--out: FILE file to which output will be printed;\n" | |
" default is terminal (stdout)\n" | |
" -x|--label1: STRING label for sample 1 in the output;\n" | |
" default is 's1'\n" | |
" -y|--label2: STRING label for sample 2 in the output;\n" | |
" default is 's2'\n") | |
print >> outstream, usage | |
def parse_options(argv): | |
params = { | |
'l1': 's1', | |
'l2': 's2', | |
'outstream': sys.stdout, | |
} | |
optstr = "ho:x:y:" | |
longopts = ["help", "out", "label1", "label2"] | |
(options, args) = getopt.getopt(argv[1:], optstr, longopts) | |
for key, value in options: | |
if key in ('-h', '--help'): | |
print_usage(sys.stdout) | |
sys.exit(0) | |
elif key in ('-o', '--out'): | |
params['outstream'] = open(value, 'w') | |
elif key in ('-x', '--label1'): | |
params['l1'] = value | |
elif key in ('-y', '--label2'): | |
params['l2'] = value | |
else: | |
print_usage(sys.stderr) | |
assert False, "unsupported option '%s'" % key | |
if len(args) != 2: | |
print_usage(sys.stderr) | |
assert False, "expected 2 arguments, %d provided" % len(args) | |
params['fa1'] = os.path.abspath(args[0]) | |
params['fa2'] = os.path.abspath(args[1]) | |
return params | |
if __name__ =='__main__': | |
params = parse_options(sys.argv) | |
s1seqs = {} | |
s2seqs = {} | |
for line in sys.stdin: | |
line = line.rstrip() | |
matches = re.search("^\S+\tcds\.(TCONS_\d+)\|\S+\tcds\.(TCONS_\d+)\|\S+", line) | |
if matches: | |
s1seqs[matches.group(1)] = 1 | |
s2seqs[matches.group(2)] = 1 | |
with open(params['fa1'], 'r') as fp: | |
for defline, seq in parse_fasta(fp): | |
record = Seq(defline, seq) | |
if record.id in s1seqs: | |
record.id = params['l1'] +"."+ record.id | |
print >> params['outstream'], ">%s\n%s" % (record.id, record.seq) | |
with open(params['fa2'], 'r') as fp: | |
for defline, seq in parse_fasta(fp): | |
record = Seq(defline, seq) | |
if record.id in s2seqs: | |
record.id = params['l2'] +"."+ record.id | |
print >> params['outstream'], ">%s\n%s" % (record.id, record.seq) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment