Skip to content

Instantly share code, notes, and snippets.

@dnanto
Created May 4, 2020 14:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dnanto/90eea476db1cf58138f5b4d8a8594268 to your computer and use it in GitHub Desktop.
Save dnanto/90eea476db1cf58138f5b4d8a8594268 to your computer and use it in GitHub Desktop.
Parse the Feature Table format of Entrez Direct E-utilities.
def parse_coor(coor):
    """Convert a pair of feature-table coordinate strings to integers.

    Leading ``>`` characters and then leading ``<`` characters (the markers
    that may prefix a coordinate) are removed from each of the first two
    items of *coor* before conversion.

    Args:
        coor: Indexable holding the start and stop coordinate strings at
            positions 0 and 1; any further items are ignored.

    Returns:
        A ``(start, stop)`` tuple of ints.
    """

    def _to_int(token):
        # Strip order matters for exact equivalence: ">" first, then "<".
        without_gt = token.lstrip(">")
        return int(without_gt.lstrip("<"))

    start = _to_int(coor[0])
    stop = _to_int(coor[1])
    return start, stop
def parse_ft(file):
    """Parse an Entrez Direct feature table, yielding one row per interval.

    The input is read line by line (trailing whitespace is stripped).  A line
    starting with ``>Feature`` opens a new record; its accession is the second
    ``|``-separated field of the header, truncated at the first ``.``.
    Within a record:

    * a non-indented line with three tab-separated fields
      (``start<TAB>stop<TAB>key``) starts a new feature;
    * a non-indented line with two fields (``start<TAB>stop``) adds another
      interval to the current feature;
    * an indented line is a qualifier: a key optionally followed by a value.
      A qualifier without a value is stored with the key as its own value.

    Lines before the first header, blank lines, lines of any other shape, and
    interval/qualifier lines that appear before any feature line are ignored.

    Args:
        file: Iterable of text lines, e.g. an open file object.

    Yields:
        Tuples ``(accession, feature_key, start, stop, qualifiers)``, one per
        interval.  All intervals of one feature share the same ``qualifiers``
        dict, and a repeated qualifier key keeps only its last value.

    Raises:
        IndexError: If a ``>Feature`` header contains no ``|`` separator.
        ValueError: If a coordinate cannot be converted by ``parse_coor``.
    """
    acc, feat, coors, anno = None, None, None, None

    def pending_rows():
        # One row per interval of the feature collected so far; nothing when
        # no feature is open.
        for coor in coors or ():
            yield (acc, feat, *parse_coor(coor), anno)

    for line in map(str.rstrip, file):
        if line.startswith(">Feature"):
            # Flush before switching records so the previous record's last
            # feature is reported under its own accession, not the new one.
            yield from pending_rows()
            feat, coors, anno = None, None, None
            # Skip the 9-character ">Feature " prefix, take the second
            # "|"-separated field and drop everything from the first ".".
            acc = line[9:].split("|", maxsplit=2)[1].split(".")[0]
        elif acc and line:
            if not line[0].isspace():
                tokens = line.split("\t")
                if len(tokens) == 3:
                    # A new feature begins: emit the one that just ended.
                    yield from pending_rows()
                    feat, coors, anno = tokens[-1], [tokens[:-1]], {}
                elif len(tokens) == 2 and coors is not None:
                    coors.append(tokens)
            elif anno is not None:
                # The line is non-empty after rstrip, so it has >= 1 token.
                key, *rest = line.split(maxsplit=1)
                anno[key] = rest[0] if rest else key

    # The last feature has no successor line to trigger its emission.
    yield from pending_rows()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment