Skip to content

Instantly share code, notes, and snippets.

@matteoferla
Created May 13, 2016 19:36
Show Gist options
  • Save matteoferla/9bc42c5dedbfa882f6d9b06af78341cd to your computer and use it in GitHub Desktop.
Save matteoferla/9bc42c5dedbfa882f6d9b06af78341cd to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#Written for python 3, not tested under 2.
"""
Finds all gene symbols (`get_all_genes()` function) and prints the ones that also appear in the word list (`get_all_commonon()` function).
"""
# Module metadata (author, contact e-mail, date of writing).
__author__ = "Matteo Ferla. [Github](https://github.com/matteoferla)"
__email__ = "matteo.ferla@gmail.com"
__date__ = "10/05/16"
# Shorthand names for the newline and tab characters.
# NOTE(review): neither N nor T is referenced by the functions visible in this
# file (they use the "\n" literal directly) -- confirm before removing.
N = "\n"
T = "\t"
# Disabled alternative value for N (an HTML line break). The string literal is
# unterminated, so it cannot simply be uncommented as is.
#N = "<br/>
import csv,os
def get_all_genes(out_fp='symbol.txt', root='all.ptt'):
    """
    Collect every gene symbol listed in the ``.ptt`` tables and write them to a file.

    Requires the all.ptt folder downloaded from FTP of NCBI.

    Every immediate sub-directory of ``root`` is searched (not recursively) for
    files whose name contains ".ptt". The first two lines of each such file are
    skipped, the remainder is parsed as a tab-separated table with a header row,
    and the value of the ``Gene`` column of every row is collected.

    The working directory is never changed; all paths are built explicitly.

    :param out_fp: path of the output file. It receives the unique symbols,
        sorted, one per line (no trailing newline).
    :param root: folder that holds one sub-directory per genome.
    :return: None
    :raises KeyError: if a table has data rows but no ``Gene`` column.
    :raises OSError: if ``root``, a table or ``out_fp`` cannot be opened.
    """
    genebag = set()
    # Materialising the scandir iterator (via sorted) releases its directory
    # handle immediately and makes the traversal order deterministic.
    for entry in sorted(os.scandir(root), key=lambda e: e.name):
        if not entry.is_dir():
            continue
        for filename in sorted(os.listdir(entry.path)):
            if ".ptt" not in filename:
                continue
            # newline='' is what the csv module asks for when reading files.
            with open(os.path.join(entry.path, filename), "r", newline='') as fh:
                # Skip the two preamble lines that precede the column header;
                # the default keeps a truncated/empty file from raising.
                next(fh, None)
                next(fh, None)
                for row in csv.DictReader(fh, dialect='excel-tab'):
                    symbol = row['Gene']
                    # Short rows yield None for missing columns; None (or an
                    # empty field) is not a symbol and would break the join.
                    if symbol:
                        genebag.add(symbol)
    with open(out_fp, 'w') as out:
        out.write("\n".join(sorted(genebag)))
def get_all_commonon(gene_fp='symbol.txt', word_fp='en.txt'):
    """
    Print the gene symbols that are also entries of a word list.

    Both files are read as one entry per line and compared case-insensitively
    (every entry is lower-cased first). The matches are printed to standard
    output, one per line, in sorted order, with the first three letters in
    lower case and the remainder in upper case (e.g. ``dnaK``).

    :param gene_fp: path of the gene symbol file, one symbol per line.
    :param word_fp: path of the word list, one word per line.
    :return: None
    :raises OSError: if either file cannot be opened.
    """
    def genecase(name):
        """Return name with its first three letters lower-cased and the rest upper-cased."""
        return name[:3].lower() + name[3:].upper()

    def parse_to_set(fp):
        """Read fp and return the set of its lower-cased, non-empty lines."""
        # The context manager closes the handle instead of leaking it.
        with open(fp) as fh:
            entries = {line.replace('\n', '').lower() for line in fh}
        # A blank line in both files is not a real match; drop it.
        entries.discard('')
        return entries

    common = parse_to_set(gene_fp) & parse_to_set(word_fp)
    print("\n".join(genecase(x) for x in sorted(common)))
def _main():
    """Run the whole pipeline with default paths: build the symbol file, then print the matches."""
    # Order matters: the second step reads the file the first step writes.
    get_all_genes()
    get_all_commonon()


if __name__ == "__main__":
    _main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment