yk-tanigawa/file0.txt

## file0.txt
from itertools import groupby
import gzip

# Fasta IO
def fasta_iter(fasta_name):
    '''
    Given a FASTA file, yield (header, sequence) tuples.

    Originally modified from Brent Pedersen,
    "Correct Way To Parse A Fasta File In Python"
    https://www.biostars.org/p/710/

    A file whose name ends in '.gz' or '.gzip' is read through gzip and
    decoded as UTF-8; any other file is opened as plain text with the
    default encoding of open().

    Parameters
    ----------
    fasta_name : str or path-like
        Path of the FASTA file to read.

    Yields
    ------
    (header, sequence) : tuple of str
        header is the text after the leading ">" with surrounding
        whitespace stripped. sequence is every following line up to the
        next header, each stripped and joined into one string. A header
        that is not followed by any sequence line yields '' as its
        sequence. Lines that appear before the first header are ignored.
    '''
    if str(fasta_name).endswith(('.gz', '.gzip')):
        # Text mode lets gzip decode once instead of decoding every line
        # separately for the header test and for the payload.
        handle = gzip.open(fasta_name, 'rt', encoding='utf-8')
    else:
        handle = open(fasta_name)

    with handle as f:
        header = None
        chunks = []
        for line in f:
            if line.startswith('>'):
                # A new record starts: emit the one collected so far.
                if header is not None:
                    yield (header, ''.join(chunks))
                # drop the ">"
                header = line[1:].strip()
                chunks = []
            elif header is not None:
                chunks.append(line.strip())
            # Otherwise the line precedes the first header and belongs
            # to no record, so it is skipped.

        # Emit the final record; a plain loop (rather than pairing up
        # groups with __next__) cannot leak StopIteration when the file
        # ends on a header.
        if header is not None:
            yield (header, ''.join(chunks))

## file1.txt
# Path of the FASTA file to load; the '.gz' suffix makes fasta_iter read
# it through gzip.
fasta = '/somewhere/hg19.fa.gz'

# Map every record header to its sequence. When a header occurs more than
# once, the sequence of its last occurrence is kept.
seqs = dict(fasta_iter(fasta))

print(seqs)
	from itertools import groupby
	import gzip

	# Fasta IO
	def fasta_iter(fasta_name):
	    '''
	    Given a FASTA file, yield (header, sequence) tuples.

	    Originally modified from Brent Pedersen,
	    "Correct Way To Parse A Fasta File In Python"
	    https://www.biostars.org/p/710/

	    A file whose name ends in '.gz' or '.gzip' is read through gzip and
	    decoded as UTF-8; any other file is opened as plain text with the
	    default encoding of open().

	    Parameters
	    ----------
	    fasta_name : str or path-like
	        Path of the FASTA file to read.

	    Yields
	    ------
	    (header, sequence) : tuple of str
	        header is the text after the leading ">" with surrounding
	        whitespace stripped. sequence is every following line up to the
	        next header, each stripped and joined into one string. A header
	        that is not followed by any sequence line yields '' as its
	        sequence. Lines that appear before the first header are ignored.
	    '''
	    if str(fasta_name).endswith(('.gz', '.gzip')):
	        # Text mode lets gzip decode once instead of decoding every line
	        # separately for the header test and for the payload.
	        handle = gzip.open(fasta_name, 'rt', encoding='utf-8')
	    else:
	        handle = open(fasta_name)

	    with handle as f:
	        header = None
	        chunks = []
	        for line in f:
	            if line.startswith('>'):
	                # A new record starts: emit the one collected so far.
	                if header is not None:
	                    yield (header, ''.join(chunks))
	                # drop the ">"
	                header = line[1:].strip()
	                chunks = []
	            elif header is not None:
	                chunks.append(line.strip())
	            # Otherwise the line precedes the first header and belongs
	            # to no record, so it is skipped.

	        # Emit the final record; a plain loop cannot leak StopIteration
	        # when the file ends on a header.
	        if header is not None:
	            yield (header, ''.join(chunks))
	# Path of the FASTA file to load; the '.gz' suffix makes fasta_iter read
	# it through gzip.
	fasta = '/somewhere/hg19.fa.gz'

	# Collect every record into a header -> sequence mapping. The loop body
	# must be nested under the for statement; a repeated header keeps the
	# sequence of its last occurrence.
	seqs = {}
	for (head, seq) in fasta_iter(fasta):
	    seqs[head] = seq

	print(seqs)