Skip to content

Instantly share code, notes, and snippets.

@standage
Last active August 29, 2015 14:00
Show Gist options
  • Save standage/11264666 to your computer and use it in GitHub Desktop.
Save standage/11264666 to your computer and use it in GitHub Desktop.
Given output of tdc-diff.py (https://gist.github.com/standage/10765999), extract sequences warranting further exploration.
#!/usr/bin/env python
import getopt, os, re, sys
class Seq:
def __init__(self, defline, seq):
self.defline = defline
self.seq = seq
match = re.match("^>(\S+)", defline)
self.id = match.group(1)
def parse_fasta(fp):
"""
Stolen shamelessly from http://stackoverflow.com/a/7655072/459780.
"""
name, seq = None, []
for line in fp:
line = line.rstrip()
if line.startswith(">"):
if name: yield (name, ''.join(seq))
name, seq = line, []
else:
seq.append(line)
if name: yield (name, ''.join(seq))
def print_usage(outstream):
usage = ("\nUsage: tdc-extract [options] s1.cds.fa s2.cds.fa < tdc-diff.txt\n"
" Options:\n"
" -h|--help: print this help message and exit\n"
" -o|--out: FILE file to which output will be printed;\n"
" default is terminal (stdout)\n"
" -x|--label1: STRING label for sample 1 in the output;\n"
" default is 's1'\n"
" -y|--label2: STRING label for sample 2 in the output;\n"
" default is 's2'\n")
print >> outstream, usage
def parse_options(argv):
params = {
'l1': 's1',
'l2': 's2',
'outstream': sys.stdout,
}
optstr = "ho:x:y:"
longopts = ["help", "out", "label1", "label2"]
(options, args) = getopt.getopt(argv[1:], optstr, longopts)
for key, value in options:
if key in ('-h', '--help'):
print_usage(sys.stdout)
sys.exit(0)
elif key in ('-o', '--out'):
params['outstream'] = open(value, 'w')
elif key in ('-x', '--label1'):
params['l1'] = value
elif key in ('-y', '--label2'):
params['l2'] = value
else:
print_usage(sys.stderr)
assert False, "unsupported option '%s'" % key
if len(args) != 2:
print_usage(sys.stderr)
assert False, "expected 2 arguments, %d provided" % len(args)
params['fa1'] = os.path.abspath(args[0])
params['fa2'] = os.path.abspath(args[1])
return params
if __name__ =='__main__':
params = parse_options(sys.argv)
s1seqs = {}
s2seqs = {}
for line in sys.stdin:
line = line.rstrip()
matches = re.search("^\S+\tcds\.(TCONS_\d+)\|\S+\tcds\.(TCONS_\d+)\|\S+", line)
if matches:
s1seqs[matches.group(1)] = 1
s2seqs[matches.group(2)] = 1
with open(params['fa1'], 'r') as fp:
for defline, seq in parse_fasta(fp):
record = Seq(defline, seq)
if record.id in s1seqs:
record.id = params['l1'] +"."+ record.id
print >> params['outstream'], ">%s\n%s" % (record.id, record.seq)
with open(params['fa2'], 'r') as fp:
for defline, seq in parse_fasta(fp):
record = Seq(defline, seq)
if record.id in s2seqs:
record.id = params['l2'] +"."+ record.id
print >> params['outstream'], ">%s\n%s" % (record.id, record.seq)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment