# ivan-krukov/fastaparse.py

## fastaparse.py
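# Read a FASTA file (path given as the first command-line argument) and print
# only the sequences whose header line matches the id_pattern regex
# (i.e. contains a "protein_id:<id>" tag). Each kept record is re-emitted as
# ">" + id on one line followed by the sequence joined onto a single line.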

import re
import sys

# Extracts the identifier from a header such as ">... protein_id:ABC123.1 ...".
id_pattern = re.compile(r"protein_id:(?P<id>[.\w]+)")


def _iter_records(text):
    """Yield ``(header, sequence)`` pairs for each FASTA record in *text*.

    A record starts at a line beginning with ">" and runs until the next such
    line or the end of the text.  The sequence is the record's remaining lines
    concatenated without separators.  Anything before the first header line is
    ignored.  A ">" that is not at the start of a line is treated as ordinary
    text, so headers containing ">" are not split.
    """
    header = None
    chunks = []
    for line in text.split("\n"):
        if line.startswith(">"):
            if header is not None:
                yield header, "".join(chunks)
            header, chunks = line, []
        elif header is not None:
            chunks.append(line)
    # Flush the final record; this also covers input without a trailing
    # newline, where the last sequence line must not be dropped.
    if header is not None:
        yield header, "".join(chunks)


def filter_fasta(text):
    """Yield re-formatted records whose header matches ``id_pattern``.

    Each yielded string is ``">" + id + "\\n" + sequence`` where *id* is the
    ``protein_id`` captured from the header and *sequence* is the record's
    sequence on a single line.  Records without a ``protein_id:`` tag in the
    header are skipped.
    """
    for header, data in _iter_records(text):
        match = id_pattern.search(header)
        if match:
            yield ">{seq_id}\n{data}".format(seq_id=match.group("id"), data=data)


def main(argv=None):
    """Read the FASTA file named by ``argv[1]`` and print the kept records.

    *argv* defaults to ``sys.argv``.  Exits with a usage message when no file
    path is supplied.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.exit("usage: fastaparse.py <fasta_file>")
    with open(argv[1]) as f:
        text = f.read()
    for record in filter_fasta(text):
        print(record)


if __name__ == "__main__":
    main()