Skip to content

Instantly share code, notes, and snippets.

@MinaGabriel
Created March 24, 2017 21:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save MinaGabriel/be3bbaae196e9d5a4191d56099f1ff9a to your computer and use it in GitHub Desktop.
Save MinaGabriel/be3bbaae196e9d5a4191d56099f1ff9a to your computer and use it in GitHub Desktop.
download_parse_entrezgene
#!/usr/bin/python
import os
import re
import shutil
from contextlib import closing
from urllib2 import urlopen, URLError, HTTPError
# ---- User settings: adjust these paths as appropriate ----
# Directory the files are downloaded to and unzipped in.
downloadDir = './entrez_downloads'
# Directory the formatted files (ready for database import) are saved to.
SQL = './entrez_sql_ready'
# ---- End of user settings ----
# Every table this script knows about.
# Note: mim2gene was removed on 27 June 2014 -- it has to be downloaded from
# omim.org (which requires a username/password). It can be added back later,
# but it is not used at the moment, so it is skipped.
table = [
    'gene2accession',
    'gene_history',
    'gene_info',
    'gene2go',
    'gene2pubmed',
    'gene2refseq',
    'gene2sts',
    'gene2unigene',
    'generifs_basic',
    'interactions',
    'tax2name',
    'refSeqSummary',
]

# Files that are downloaded zipped.
zip_files = [
    'gene_history',
    'gene2accession',
    'gene_info',
    'gene2go',
    'gene2pubmed',
    'gene2refseq',
]

# Files that are downloaded unzipped.
download_files = [
    'gene2sts',
    'gene2unigene',
]

# GeneRIF files.
geneRIF = [
    'generifs_basic',
    'interactions',
]
# Start each run with empty working directories: create a directory when it
# does not exist yet, otherwise delete what a previous run left inside it.
# This is done with os/shutil calls instead of shelling out to mkdir / rm so
# that paths containing spaces work and failures raise instead of being
# silently ignored.
for work_dir in (downloadDir, SQL):
    if not os.path.exists(work_dir):
        os.makedirs(work_dir)
        continue
    for entry in os.listdir(work_dir):
        # Entries starting with a dot are left alone, exactly as the shell
        # glob "<dir>/*" used before never matched them.
        if entry.startswith('.'):
            continue
        entry_path = os.path.join(work_dir, entry)
        if os.path.isdir(entry_path) and not os.path.islink(entry_path):
            shutil.rmtree(entry_path)
        else:
            # Regular files and symlinks (including symlinks to directories).
            os.remove(entry_path)
# Download every input file with urlopen.
# Gzipped files are unpacked into downloadDir with gunzip and the downloaded
# .gz is deleted afterwards; the taxonomy tarball is unpacked with tar and
# left in place. A failed download or unpack is reported on stdout and the
# script carries on with the next file.
_GENE_FTP = "ftp://ftp.ncbi.nlm.nih.gov/gene/"
_TAXDUMP_URL = "ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz"
# ANSI colour sequences used for the status messages.
_OK_STYLE = '\x1b[6;30;42m'
_FAIL_STYLE = '\x1b[0;30;41m'
_RESET_STYLE = '\x1b[0m'


def _report_success():
    """Print the highlighted "Success!" marker."""
    print(_OK_STYLE + 'Success!' + _RESET_STYLE)


def _download(url, destination):
    """Copy the resource at url into the local file destination.

    The response is streamed to disk in chunks instead of being read into
    memory in one piece, and it is closed when the copy ends.

    Returns True when the download completed. HTTPError and URLError are
    reported on stdout and turned into a False return value.
    """
    try:
        with closing(urlopen(url)) as response:
            with open(destination, "wb") as local_file:
                shutil.copyfileobj(response, local_file)
    except HTTPError as e:
        # HTTPError is tested first; the original handlers used this order.
        print(_FAIL_STYLE + "HTTP: " + str(e.reason) + _RESET_STYLE)
        return False
    except URLError as e:
        print(_FAIL_STYLE + "Error: " + str(e.reason) + _RESET_STYLE)
        return False
    return True


def _unpack(command, archive):
    """Run the shell command that unpacks archive; return True on exit status 0."""
    if os.system(command) != 0:
        print(_FAIL_STYLE + "Error: could not unpack " + archive + _RESET_STYLE)
        return False
    return True


def _download_gzipped(base_url, name):
    """Fetch base_url + name + ".gz" and gunzip it to downloadDir/name."""
    archive = downloadDir + "/" + name + ".gz"
    print("Downloading " + name + " to " + downloadDir + " and gunzipping...")
    if not _download(base_url + name + ".gz", archive):
        return
    unpacked = _unpack(
        'gunzip -cq ' + archive + ' > ' + downloadDir + '/' + name, archive)
    # The .gz is removed whether or not gunzip succeeded.
    os.remove(archive)
    if unpacked:
        _report_success()


# Gzipped Entrez Gene tables.
for gene_file in zip_files:
    _download_gzipped(_GENE_FTP + "DATA/", gene_file)

# Entrez Gene tables that are downloaded as they are.
for gene_file in download_files:
    target_path = downloadDir + "/" + gene_file
    print("Downloading " + gene_file + " to " + target_path)
    if _download(_GENE_FTP + "DATA/" + gene_file, target_path):
        _report_success()

# Gzipped GeneRIF tables.
for rif_file in geneRIF:
    _download_gzipped(_GENE_FTP + "GeneRIF/", rif_file)

# Taxonomy dump: a tarball that is extracted into downloadDir.
taxdump_archive = downloadDir + "/taxdump.tar.gz"
print("Downloading taxdump to " + downloadDir + "...")
if _download(_TAXDUMP_URL, taxdump_archive):
    if _unpack('tar -zxf ' + taxdump_archive + ' -C ' + downloadDir,
               taxdump_archive):
        _report_success()
#######
#
# INSERT INTO FILES
#
#######
# For each table, read its downloaded file from downloadDir and write a
# tab-delimited copy, named after the table, into SQL.
#   * tax2name is built from names.dmp: one "tax_id<TAB>name" row for every
#     line whose 7th tab-separated field starts with "scientific".
#   * every other table skips lines starting with '#' and prefixes each
#     remaining row with a running line number that serves as primary id.
# A table whose input file cannot be opened (for example because its
# download failed) is reported and skipped instead of aborting the run.

# Case-insensitive test applied to the 7th field of names.dmp lines.
_SCIENTIFIC = re.compile(r'scientific', flags=re.I)

# Tables in which one column uses "-" as a placeholder that is written as 0,
# mapped to the index of that column.
_DASH_MEANS_ZERO = {'gene_history': 1, 'interactions': 5}


def _tax2name_rows(lines):
    """Yield "tax_id<TAB>name<NEWLINE>" for the matching lines of names.dmp.

    Each line is split on tabs; field 0 is written as the id and field 2 as
    the name when field 6 starts with "scientific" (any letter case).
    """
    for raw in lines:
        fields = raw.split('\t')
        # Lines with fewer than 7 fields (e.g. blank ones) cannot match.
        if len(fields) > 6 and _SCIENTIFIC.match(fields[6]):
            yield fields[0].strip() + '\t' + fields[2].strip() + '\n'


def _numbered_rows(lines, zero_column=None):
    """Yield the data lines prefixed with a counter that starts at 1.

    Lines starting with '#' are skipped and do not consume a number. When
    zero_column is given, a field at that index that is exactly "-" is
    written as "0". Every field is stripped of surrounding whitespace.
    """
    row_number = 0
    for raw in lines:
        if raw.startswith('#'):
            continue
        row_number += 1
        fields = raw.split('\t')
        if (zero_column is not None and len(fields) > zero_column
                and fields[zero_column] == '-'):
            fields[zero_column] = '0'
        # NOTE(review): the id cell carries its own trailing tab and is then
        # tab-joined with the fields, so each row has an empty second column.
        # Kept as is because the import schema is not visible here -- confirm
        # whether that empty column is intended.
        cells = [str(row_number) + '\t']
        cells.extend(field.strip() for field in fields)
        yield '\t'.join(cells) + '\n'


for current_file in table:
    if current_file in ('refSeqSummary', ''):
        continue
    print("Inserting unique identifier (line number) into " + current_file)
    if current_file == 'tax2name':
        # Taxonomy is a little different from Entrez Gene: the needed
        # information is parsed out of names.dmp.
        source_path = downloadDir + '/names.dmp'
    else:
        source_path = downloadDir + '/' + current_file
    # Open the input first so that a missing download does not leave an
    # empty output file behind.
    try:
        source = open(source_path, 'r')
    except IOError as e:
        print('\x1b[0;30;41m' + "Error: cannot read " + source_path + ": "
              + str(e) + '\x1b[0m')
        continue
    with source:
        if current_file == 'tax2name':
            rows = _tax2name_rows(source)
        else:
            # NOTE(review): for gene2refseq the code used to call
            # data[i].replace('.', '') on columns 3, 5 and 7 and throw the
            # result away, so those columns were always written unchanged.
            # That no-op is dropped here; confirm whether the dots were
            # really meant to be stripped before adding such a step.
            rows = _numbered_rows(source, _DASH_MEANS_ZERO.get(current_file))
        with open(SQL + '/' + current_file, 'w') as target:
            target.writelines(rows)
    print('\x1b[1;30;43m' + current_file + ' created successfully!' + '\x1b[0m')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment