Created
March 24, 2017 21:08
-
-
Save MinaGabriel/be3bbaae196e9d5a4191d56099f1ff9a to your computer and use it in GitHub Desktop.
download_parse_entrezgene
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
import contextlib
import gzip
import os
import re
import shutil
import tarfile

try:  # Python 2
    from urllib2 import urlopen, URLError, HTTPError
except ImportError:  # Python 3
    from urllib.request import urlopen
    from urllib.error import URLError, HTTPError
# Change information as appropriate
# Where files will be downloaded and unzipped
downloadDir = "./entrez_downloads"
# Where formatted files (ready for database import) will be saved
SQL = "./entrez_sql_ready"
# End of User variables

# List of all tables that the formatting step iterates over.
# Note: mim2gene removed on 27 June 2014 -- it needs to be downloaded from
# omim.org (and requires a username/password). We can add it back later, but
# as of now we don't use it, so let's skip it.
table = [
    'gene2accession',
    'gene_history',
    'gene_info',
    'gene2go',
    'gene2pubmed',
    'gene2refseq',
    'gene2sts',
    'gene2unigene',
    'generifs_basic',
    'interactions',
    'tax2name',
    'refSeqSummary',
]

# Downloaded files - gzipped on the server (fetched as "<name>.gz")
zip_files = [
    'gene_history',
    'gene2accession',
    'gene_info',
    'gene2go',
    'gene2pubmed',
    'gene2refseq',
]

# Downloaded files - served uncompressed
download_files = [
    'gene2sts',
    'gene2unigene',
]

# Downloaded files -- RIFs (gzipped, fetched from the GeneRIF directory)
geneRIF = [
    'generifs_basic',
    'interactions',
]
def reset_directory(path):
    """Ensure *path* exists as an empty directory.

    When *path* does not exist it is created (including missing parents).
    When it already exists, every entry inside it is deleted while the
    directory itself is kept. Unlike a shell ``rm -rf path/*`` this also
    removes hidden entries.
    """
    if not os.path.exists(path):
        os.makedirs(path)
        return
    for entry in os.listdir(path):
        entry_path = os.path.join(path, entry)
        # Symlinks are unlinked rather than followed, so only content that
        # lives inside *path* is ever deleted.
        if os.path.isdir(entry_path) and not os.path.islink(entry_path):
            shutil.rmtree(entry_path)
        else:
            os.remove(entry_path)


# Make directories (if needed), else empty both directories.
# Guarded so that the file system is only touched when run as a script.
if __name__ == "__main__":
    reset_directory(downloadDir)
    reset_directory(SQL)
# Remote locations of the data sets.
GENE_DATA_URL = "ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/"
GENE_RIF_URL = "ftp://ftp.ncbi.nlm.nih.gov/gene/GeneRIF/"
TAXDUMP_URL = "ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz"

# ANSI escape sequences wrapped around the status messages.
SUCCESS_STYLE = '\x1b[6;30;42m'
FAILURE_STYLE = '\x1b[0;30;41m'
STYLE_RESET = '\x1b[0m'


def fetch_url(url, dest_path):
    """Stream the resource at *url* into the local file *dest_path*.

    The payload is copied chunk by chunk, so the whole file never has to fit
    in memory, and the connection is closed even when the copy fails.
    """
    with contextlib.closing(urlopen(url)) as response:
        with open(dest_path, "wb") as local_file:
            shutil.copyfileobj(response, local_file)


def gunzip_file(gz_path, dest_path):
    """Decompress *gz_path* into *dest_path*, then delete *gz_path*.

    The compressed file is only removed after decompression succeeded.
    """
    with gzip.open(gz_path, "rb") as packed:
        with open(dest_path, "wb") as unpacked:
            shutil.copyfileobj(packed, unpacked)
    os.remove(gz_path)


def extract_tar_gz(archive_path, dest_dir):
    """Extract the gzipped tar file *archive_path* into *dest_dir*.

    Raises IOError before extracting anything when a member would be written
    outside *dest_dir*.
    """
    dest_root = os.path.realpath(dest_dir)
    with tarfile.open(archive_path, "r:gz") as archive:
        for member in archive.getmembers():
            target = os.path.realpath(os.path.join(dest_root, member.name))
            if target != dest_root and not target.startswith(dest_root + os.sep):
                raise IOError("refusing to extract " + member.name +
                              " outside " + dest_dir)
        if hasattr(tarfile, "data_filter"):
            # Newer Python versions offer a built-in extraction filter.
            archive.extractall(dest_dir, filter="data")
        else:
            archive.extractall(dest_dir)


def run_download_step(step, announce_success=True):
    """Call *step()* and report the outcome on stdout.

    Download, file and archive errors are printed instead of raised so that
    one failing file does not abort the remaining downloads. Returns True
    when *step* completed, False otherwise.
    """
    try:
        step()
    except HTTPError as e:
        print(FAILURE_STYLE + "HTTP: " + str(e.reason) + STYLE_RESET)
    except URLError as e:
        print(FAILURE_STYLE + "Error: " + str(e.reason) + STYLE_RESET)
    except (IOError, OSError, EOFError, tarfile.TarError) as e:
        print(FAILURE_STYLE + "Error: " + str(e) + STYLE_RESET)
    else:
        if announce_success:
            print(SUCCESS_STYLE + 'Success!' + STYLE_RESET)
        return True
    return False


def download_and_gunzip(base_url, name, dest_dir):
    """Fetch ``base_url + name + ".gz"`` and unpack it to ``dest_dir/name``.

    Returns True on success, False when an error was reported.
    """
    gz_path = dest_dir + "/" + name + ".gz"
    print("Downloading " + name + " to " + dest_dir + " and gunzipping...")

    def step():
        fetch_url(base_url + name + ".gz", gz_path)
        gunzip_file(gz_path, dest_dir + "/" + name)

    return run_download_step(step)


def download_plain(base_url, name, dest_dir):
    """Fetch the uncompressed file ``base_url + name`` to ``dest_dir/name``.

    Returns True on success, False when an error was reported.
    """
    dest_path = dest_dir + "/" + name
    print("Downloading " + name + " to " + dest_path)
    return run_download_step(lambda: fetch_url(base_url + name, dest_path))


def download_and_untar(url, archive_path, dest_dir):
    """Fetch the tar.gz at *url* to *archive_path* and extract it to *dest_dir*.

    The downloaded archive is kept. No success message is printed. Returns
    True on success, False when an error was reported.
    """
    def step():
        fetch_url(url, archive_path)
        extract_tar_gz(archive_path, dest_dir)

    return run_download_step(step, announce_success=False)


# Guarded so that nothing is downloaded when this file is merely imported.
if __name__ == "__main__":
    # Gzipped gene data files: download, unpack, drop the .gz file.
    for gz_name in zip_files:
        download_and_gunzip(GENE_DATA_URL, gz_name, downloadDir)

    # Gene data files that are served uncompressed.
    for plain_name in download_files:
        download_plain(GENE_DATA_URL, plain_name, downloadDir)

    # Gzipped GeneRIF files: download, unpack, drop the .gz file.
    for rif_name in geneRIF:
        download_and_gunzip(GENE_RIF_URL, rif_name, downloadDir)

    # Taxonomy dump: download and unpack the tarball next to the other files.
    print("Downloading taxdump to " + downloadDir + "...")
    download_and_untar(TAXDUMP_URL, downloadDir + "/taxdump.tar.gz", downloadDir)
#######
#
# INSERT INTO FILES
#
#######
# For each newly downloaded file: parse it (ignoring comment lines), add a
# primary id (running count of data lines) and write the tab-delimited result
# to a new file of the same name in the SQL directory.

# Zero-based gene2refseq columns from which '.' characters are removed.
GENE2REFSEQ_DOT_COLUMNS = (3, 5, 7)
# Table name -> zero-based column whose "-" placeholder is rewritten to 0.
DASH_TO_ZERO_COLUMN = {'gene_history': 1, 'interactions': 5}


def format_row(table_name, primary_id, line):
    """Return the output row for one tab-delimited data *line*.

    The row is *primary_id* followed by the whitespace-stripped fields of
    *line*, joined by single tabs and terminated by a newline.

    Table-specific adjustments:
    - gene2refseq: '.' is removed from columns 3, 5 and 7.
    - gene_history / interactions: a "-" in column 1 / 5 becomes "0".
    Columns missing from a short line are simply skipped.
    """
    fields = [field.strip() for field in line.split('\t')]
    if table_name == 'gene2refseq':
        for index in GENE2REFSEQ_DOT_COLUMNS:
            if index < len(fields):
                # str.replace returns a new string; the result must be stored.
                # NOTE(review): this removes the dot itself ("X.4" -> "X4");
                # confirm that is the intended format for these columns.
                fields[index] = fields[index].replace('.', '')
    else:
        index = DASH_TO_ZERO_COLUMN.get(table_name)
        if index is not None and index < len(fields) and fields[index] == '-':
            fields[index] = '0'
    # The id carries no tab of its own; join() supplies every separator, so
    # no empty column appears between the id and the first field.
    return '\t'.join([str(primary_id)] + fields) + '\n'


def parse_tax_name(line):
    """Return ``"<tax_id>\\t<name>\\n"`` for a scientific-name line, else None.

    *line* is split on tabs; field 6 is tested for a case-insensitive
    "scientific" prefix, field 0 is used as the id and field 2 as the name.
    Lines with fewer than seven fields yield None.
    """
    fields = line.split('\t')
    if len(fields) > 6 and fields[6].lower().startswith('scientific'):
        return fields[0].strip() + '\t' + fields[2].strip() + '\n'
    return None


def convert_tax_names(source_path, target_path):
    """Write the scientific-name rows of *source_path* to *target_path*.

    Returns the number of rows written.
    """
    written = 0
    with open(source_path, 'r') as source, open(target_path, 'w') as target:
        for line in source:
            row = parse_tax_name(line)
            if row is not None:
                target.write(row)
                written += 1
    return written


def convert_table(table_name, source_path, target_path):
    """Write every non-comment line of *source_path*, numbered, to *target_path*.

    Lines starting with '#' are skipped; the remaining lines are numbered
    from 1 and formatted with format_row(). Returns the number of rows
    written.
    """
    primary_id = 0
    with open(source_path, 'r') as source, open(target_path, 'w') as target:
        for line in source:
            if line.startswith('#'):
                continue
            primary_id += 1
            target.write(format_row(table_name, primary_id, line))
    return primary_id


# Guarded so that no files are read or written when this file is imported.
if __name__ == "__main__":
    for current_file in table:
        if current_file == 'refSeqSummary' or current_file == '':
            continue
        print("Inserting unique identifier (line number) into " + current_file)
        if current_file == 'tax2name':
            # Taxonomy is a little different from Entrez Gene: the needed
            # information is parsed out of names.dmp.
            convert_tax_names(downloadDir + '/names.dmp', SQL + '/tax2name')
        else:
            convert_table(current_file,
                          downloadDir + '/' + current_file,
                          SQL + '/' + current_file)
        print('\x1b[1;30;43m' + current_file + ' created successfully!' + '\x1b[0m')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment