Created
May 13, 2016 19:36
-
-
Save matteoferla/9bc42c5dedbfa882f6d9b06af78341cd to your computer and use it in GitHub Desktop.
Script used in http://blog.matteoferla.com/2016/05/gene-symbol-poetry.html
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8 -*- | |
#Written for python 3, not tested under 2. | |
""" | |
Collects every gene symbol (see the `get_all_genes()` function) and finds the symbols that also occur in the word list (see `get_all_commonon()`).
""" | |
__author__ = "Matteo Ferla. [Github](https://github.com/matteoferla)" | |
__email__ = "matteo.ferla@gmail.com" | |
__date__ = "10/05/16" | |
# Shorthand names for the newline and tab separators. They are not referenced
# by the functions visible in this file, which spell out "\n" directly.
N, T = "\n", "\t"
# Disabled alternative for N, kept from the original source: "<br/>"
import csv,os | |
def get_all_genes(out_fp='symbol.txt'):
    """
    Collect every gene symbol listed in the ``all.ptt`` folder and save them.

    Requires the all.ptt folder downloaded from FTP of NCBI to be present in
    the current working directory. Every sub-folder of ``all.ptt`` is searched
    for files with ".ptt" in their name. In each such file the first two lines
    are skipped (presumably a title and a record count -- confirm against the
    data), the remainder is read as a tab-separated table whose first row is
    the header, and the value of the "Gene" column of every row is collected.

    The working directory is never changed, so a failure part-way through
    cannot leave the process stranded inside a sub-folder.

    :param out_fp: path of the output file. The unique symbols are written to
        it sorted, one per line, without a trailing newline.
    :return: None
    :raises FileNotFoundError: if ``all.ptt`` does not exist in the current
        working directory.
    :raises KeyError: if a table has data rows but no "Gene" column.
    """
    root = 'all.ptt'
    genebag = set()
    for organism in os.listdir(root):
        organism_dp = os.path.join(root, organism)
        if not os.path.isdir(organism_dp):
            continue
        for filename in os.listdir(organism_dp):
            if ".ptt" not in filename:
                continue
            # The with-block guarantees the handle is closed even when a
            # malformed table raises half-way through.
            with open(os.path.join(organism_dp, filename), "r") as fh:
                # Skip the two lines that precede the column header; the
                # default keeps a truncated file from raising StopIteration.
                next(fh, None)
                next(fh, None)
                for gene in csv.DictReader(fh, dialect='excel-tab'):
                    symbol = gene['Gene']
                    # DictReader pads rows that are too short with None,
                    # which could not be joined into the output below.
                    if symbol is not None:
                        genebag.add(symbol)
    # Sorted so that the output file is reproducible between runs.
    with open(out_fp, 'w') as out_fh:
        out_fh.write("\n".join(sorted(genebag)))
def get_all_commonon(gene_fp='symbol.txt', word_fp='en.txt'):
    """
    Print the gene symbols that are also entries of the word list.

    Both files are read as one entry per line and compared case-insensitively
    (every entry is lower-cased first). The matches are printed to stdout in a
    single ``print`` call, one per line, sorted, and re-cased gene-style: the
    first three letters lower case and the rest upper case. Blank lines in
    either file are ignored.

    :param gene_fp: path of the file with one gene symbol per line.
    :param word_fp: path of the file with one word per line.
    :return: None
    :raises FileNotFoundError: if either file does not exist.
    """
    def genecase(name):
        # e.g. "pasta" -> "pasTA"
        return name[:3].lower() + name[3:].upper()

    def parse_to_set(fp):
        # The with-block closes the handle instead of leaving it to the
        # garbage collector.
        with open(fp) as fh:
            entries = {line.replace('\n', '').lower() for line in fh}
        # A blank line in both files would otherwise be reported as a match.
        entries.discard('')
        return entries

    genebag = parse_to_set(gene_fp)
    wordbag = parse_to_set(word_fp)
    print("\n".join(genecase(x) for x in sorted(genebag & wordbag)))
if __name__ == "__main__":
    # Run the two stages in order with their default file names: first write
    # the gene symbol list, then print the symbols shared with the word list.
    for stage in (get_all_genes, get_all_commonon):
        stage()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment