Skip to content

Instantly share code, notes, and snippets.

@sminot
Created August 13, 2020 16:35
Show Gist options
  • Save sminot/8b8394c32e3df83f58139fdc1ac5723c to your computer and use it in GitHub Desktop.
Save sminot/8b8394c32e3df83f58139fdc1ac5723c to your computer and use it in GitHub Desktop.
Parse FASTA
from functools import lru_cache
import os
import pandas as pd
@lru_cache(maxsize=16)
def read_fasta(fasta_fp):
    """Parse a FASTA file into a DataFrame indexed by sequence header.

    Each record's header is the text between the leading ``>`` and the
    first space on that line; the sequence is the concatenation of every
    non-empty line that follows, up to the next header. Records without
    any sequence are dropped, and if a header occurs more than once the
    last occurrence wins.

    Results are memoized per path (up to 16 entries), so repeated calls
    with the same ``fasta_fp`` return the *same* DataFrame object; copy it
    before mutating.

    Args:
        fasta_fp: Path to the FASTA file (must be hashable for caching).

    Returns:
        A ``pandas.DataFrame`` indexed by ``header`` with columns ``seq``
        (the sequence string) and ``len`` (its length), sorted by ``len``
        in descending order. An input with no records yields an empty
        frame with the same columns and index name.

    Raises:
        AssertionError: If ``fasta_fp`` does not exist.
    """
    assert os.path.exists(fasta_fp), "FASTA file not found: {}".format(fasta_fp)

    fasta = {}
    header = None
    seq = []

    with open(fasta_fp, "r") as handle:
        for line in handle:
            if line[0] == ">":
                # A new record starts: store the previous one, if it had
                # any sequence.
                if header is not None and seq:
                    fasta[header] = "".join(seq)
                header = line[1:].split(" ")[0].rstrip("\n")
                seq = []
            else:
                # Strip the newline *before* testing for emptiness so that
                # a final one-character line without a trailing newline is
                # kept instead of being mistaken for a blank line.
                chunk = line.rstrip("\n")
                if chunk:
                    seq.append(chunk)

    # Store the last record, which has no following header to trigger it.
    if header is not None and seq:
        fasta[header] = "".join(seq)

    # Format as a DataFrame. Explicit columns keep the frame well-formed
    # (and sortable) even when no records were found.
    df = pd.DataFrame(
        [
            {"header": name, "seq": sequence, "len": len(sequence)}
            for name, sequence in fasta.items()
        ],
        columns=["header", "seq", "len"],
    ).sort_values(by="len", ascending=False)

    # Set the row index by header
    df.set_index("header", inplace=True)
    return df
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment