MinaGabriel/download_parse_entrezgene.py

## download_parse_entrezgene.py
#!/usr/bin/python

import gzip
import os
import re
import shutil
import subprocess
from contextlib import closing
from urllib2 import urlopen, URLError, HTTPError

#  Change information as appropriate

# Where files will be downloaded and unzipped.
# NOTE: if this directory already exists, its contents are deleted at startup.

downloadDir = "./entrez_downloads"
# Where formatted files (ready for database import) will be saved.
# NOTE: like downloadDir, an existing directory is emptied at startup.
SQL = "./entrez_sql_ready"

# End of User variables

# List of all tables.  The formatting loop further down walks this list and
# writes one output file per entry into SQL.  'refSeqSummary' is skipped by
# that loop, and 'tax2name' is built from names.dmp rather than from a
# downloaded file of its own name.

# Note: mim2gene removed on 27 June 2014 -- It needs to be downloaded from omim.org (and requires a username/password)
# We can add it back later but as of now we don't use it so let's skip it.

table = [
    'gene2accession',
    'gene_history',
    'gene_info',
    'gene2go',
    'gene2pubmed',
    'gene2refseq',
    'gene2sts',
    'gene2unigene',
    'generifs_basic',
    'interactions',
    'tax2name',
    'refSeqSummary'
]

# Files fetched as "<name>.gz" from the gene/DATA/ FTP directory and
# gunzipped into downloadDir.


zip_files = [
    'gene_history',
    'gene2accession',
    'gene_info',
    'gene2go',
    'gene2pubmed',
    'gene2refseq'

]

# Files fetched from the gene/DATA/ FTP directory under their exact name;
# no decompression step is applied to them.

download_files = [
    'gene2sts',
    'gene2unigene'
]

# Files fetched as "<name>.gz" from the gene/GeneRIF/ FTP directory and
# gunzipped into downloadDir.

geneRIF = [
    'generifs_basic',
    'interactions'
]

# Prepare the two working directories: create each one if it is missing,
# otherwise empty it so every run starts from empty directories.
# This is done with os/shutil instead of shelling out to mkdir/rm, so
# directory names containing spaces or shell metacharacters are handled
# correctly and failures raise instead of being silently ignored.

for work_dir in (downloadDir, SQL):
    if not os.path.exists(work_dir):
        os.makedirs(work_dir)
        continue
    for entry in os.listdir(work_dir):
        # Dot-files are left in place, matching what a shell glob
        # "<dir>/*" would select.
        if entry.startswith('.'):
            continue
        entry_path = os.path.join(work_dir, entry)
        if os.path.isdir(entry_path) and not os.path.islink(entry_path):
            shutil.rmtree(entry_path)
        else:
            os.remove(entry_path)

# Download section.
#
# Every file is fetched with urlopen and streamed to disk in chunks, so the
# multi-gigabyte dumps never have to fit in memory.  Gzipped files are then
# decompressed next to the archive and the archive is removed; the taxonomy
# tarball is unpacked into downloadDir and kept.
#
# A failed transfer or unpack is reported in red and the script moves on to
# the next file.


def _gunzip(archive):
    """Decompress *archive* ("<path>.gz") to "<path>" and delete the archive."""
    target = os.path.splitext(archive)[0]
    # closing() is used because older GzipFile objects are not context managers.
    with closing(gzip.open(archive, "rb")) as packed:
        with open(target, "wb") as unpacked:
            shutil.copyfileobj(packed, unpacked)
    os.remove(archive)


def _untar(archive):
    """Unpack the gzipped tarball *archive* into downloadDir.

    The external tar program is invoked with an argument list (no shell),
    and a non-zero exit status is turned into an IOError so the caller
    reports the failure instead of printing success.
    """
    status = subprocess.call(["tar", "-zxf", archive, "-C", downloadDir])
    if status != 0:
        raise IOError("tar exited with status %d for %s" % (status, archive))


def _download(url, destination, unpack=None):
    """Fetch *url* into *destination*, optionally unpack it, report the result.

    url         -- address passed to urlopen.
    destination -- local path the response body is written to.
    unpack      -- optional callable taking *destination*; run after a
                   successful transfer.

    Errors are printed, not raised, so the remaining downloads still run.
    """
    try:
        # closing() guarantees the connection is released even if the copy
        # to disk fails part-way through.
        with closing(urlopen(url)) as response:
            with open(destination, "wb") as local_file:
                shutil.copyfileobj(response, local_file)
        if unpack is not None:
            unpack(destination)
    except HTTPError as e:
        print('\x1b[0;30;41m' + "HTTP: " + str(e.reason) + '\x1b[0m')
    except URLError as e:
        print('\x1b[0;30;41m' + "Error: " + str(e.reason) + '\x1b[0m')
    except (IOError, OSError) as e:
        # Local write errors and corrupt archives end up here.
        print('\x1b[0;30;41m' + "Error: " + str(e) + '\x1b[0m')
    else:
        print('\x1b[6;30;42m' + 'Success!' + '\x1b[0m')


# Gzipped Entrez Gene tables.
for name in zip_files:
    print("Downloading " + name + " to " + downloadDir + " and gunzipping...")
    _download("ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/" + name + ".gz",
              downloadDir + "/" + name + ".gz",
              _gunzip)

# Entrez Gene tables fetched under their exact name (no decompression).
for name in download_files:
    destination = downloadDir + "/" + name
    print("Downloading " + name + " to " + destination)
    _download("ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/" + name, destination)

# Gzipped GeneRIF tables.
for name in geneRIF:
    print("Downloading " + name + " to " + downloadDir + " and gunzipping...")
    _download("ftp://ftp.ncbi.nlm.nih.gov/gene/GeneRIF/" + name + ".gz",
              downloadDir + "/" + name + ".gz",
              _gunzip)

# Taxonomy dump; the formatting step later reads names.dmp from downloadDir.
print("Downloading taxdump to " + downloadDir + "...")
_download("ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz",
          downloadDir + "/taxdump.tar.gz",
          _untar)

#######
#
# INSERT INTO FILES
#
#######

# For each table: read the downloaded file, skip comment lines, prefix every
# remaining row with a running number (restarting at 1 for each file) and
# write it tab-delimited to a file of the same name under SQL.


def _write_tax2name(names_path, out_path):
    """Write "tax_id<TAB>name" for every scientific-name row of names.dmp.

    Each row is split on tabs and fields 0, 2 and 6 are read as tax id,
    name and name class (presumably names.dmp delimits its columns with
    "<TAB>|<TAB>", which would put the data in the even-numbered fields --
    TODO confirm against the taxdump readme).  Rows too short to carry a
    name class are skipped instead of raising IndexError.
    """
    with open(out_path, 'w') as sink:
        with open(names_path, 'r') as source:
            for row in source:
                fields = row.split('\t')
                if len(fields) <= 6:
                    continue
                if re.match(r'scientific', fields[6], flags=re.I):
                    sink.write(fields[0].strip() + '\t' + fields[2].strip() + '\n')


def _numbered_row(table_name, row_id, row):
    """Return *row* re-joined with *row_id* in front, newline-terminated.

    Table-specific clean-up applied to the tab-split fields first:
      * gene2refseq  -- every '.' is removed from fields 3, 5 and 7.
      * gene_history -- a '-' in field 1 becomes '0'.
      * interactions -- a '-' in field 5 becomes '0'.
    Every field is stripped of surrounding whitespace before joining.
    """
    fields = row.split('\t')
    if table_name == 'gene2refseq':
        for column in (3, 5, 7):
            if column < len(fields):
                # str.replace returns a new string, so the result has to be
                # stored back for the edit to take effect.
                # NOTE(review): this deletes the dot itself ("AB_1.2" ->
                # "AB_12"); confirm that is the intended normalisation
                # rather than dropping the suffix after the dot.
                fields[column] = fields[column].replace('.', '')
    elif table_name == 'gene_history':
        if len(fields) > 1 and fields[1] == '-':
            fields[1] = '0'
    elif table_name == 'interactions':
        if len(fields) > 5 and fields[5] == '-':
            fields[5] = '0'

    # NOTE(review): the id cell carries its own trailing tab, so after the
    # join there is an empty column between the id and the first field.
    # Kept as-is because the import step may rely on it -- confirm.
    cells = [str(row_id) + '\t']
    cells.extend(field.strip() for field in fields)
    return '\t'.join(cells) + '\n'


def _write_numbered(table_name, in_path, out_path):
    """Copy *in_path* to *out_path*, dropping '#' lines and numbering rows."""
    with open(out_path, 'w') as sink:
        with open(in_path, 'r') as source:
            row_id = 0
            for row in source:
                if row.startswith('#'):
                    continue
                row_id += 1
                sink.write(_numbered_row(table_name, row_id, row))


for current_file in table:

    # refSeqSummary has no downloaded source file here, so it is not built.
    if current_file in ('refSeqSummary', ''):
        continue
    print("Inserting unique identifier (line number) into " + current_file)

    if current_file == 'tax2name':
        # Taxonomy is a little different from Entrez Gene: the needed
        # columns are parsed out of names.dmp and no row id is added.
        _write_tax2name(downloadDir + '/names.dmp', SQL + '/tax2name')
    else:
        _write_numbered(current_file,
                        downloadDir + '/' + current_file,
                        SQL + '/' + current_file)
    print('\x1b[1;30;43m' + current_file + ' created successfully!' + '\x1b[0m')
	#!/usr/bin/python

	import os
	from urllib2 import urlopen, URLError, HTTPError
	import re

	# Change information as appropriate

	# Where files will be downloaded and unzipped.
	# NOTE: if this directory already exists, its contents are deleted at startup.

	downloadDir = "./entrez_downloads"
	# Where formatted files (ready for database import) will be saved.
	# NOTE: like downloadDir, an existing directory is emptied at startup.
	SQL = "./entrez_sql_ready"

	# End of User variables

	# List of all tables.  The formatting loop further down walks this list and
	# writes one output file per entry into SQL.  'refSeqSummary' is skipped by
	# that loop, and 'tax2name' is built from names.dmp rather than from a
	# downloaded file of its own name.

	# Note: mim2gene removed on 27 June 2014 -- It needs to be downloaded from omim.org (and requires a username/password)
	# We can add it back later but as of now we don't use it so let's skip it.

	table = [
	'gene2accession',
	'gene_history',
	'gene_info',
	'gene2go',
	'gene2pubmed',
	'gene2refseq',
	'gene2sts',
	'gene2unigene',
	'generifs_basic',
	'interactions',
	'tax2name',
	'refSeqSummary'
	]

	# Files fetched as "<name>.gz" from the gene/DATA/ FTP directory and
	# gunzipped into downloadDir.


	zip_files = [
	'gene_history',
	'gene2accession',
	'gene_info',
	'gene2go',
	'gene2pubmed',
	'gene2refseq'

	]

	# Files fetched from the gene/DATA/ FTP directory under their exact name;
	# no decompression step is applied to them.

	download_files = [
	'gene2sts',
	'gene2unigene'
	]

	# Files fetched as "<name>.gz" from the gene/GeneRIF/ FTP directory and
	# gunzipped into downloadDir.

	geneRIF = [
	'generifs_basic',
	'interactions'
	]

	# Prepare the two working directories: create each one if it is missing,
	# otherwise empty it so every run starts from empty directories.
	# This is done with os/shutil instead of shelling out to mkdir/rm, so
	# directory names containing spaces or shell metacharacters are handled
	# correctly and failures raise instead of being silently ignored.

	for work_dir in (downloadDir, SQL):
	    if not os.path.exists(work_dir):
	        os.makedirs(work_dir)
	        continue
	    for entry in os.listdir(work_dir):
	        # Dot-files are left in place, matching what a shell glob
	        # "<dir>/*" would select.
	        if entry.startswith('.'):
	            continue
	        entry_path = os.path.join(work_dir, entry)
	        if os.path.isdir(entry_path) and not os.path.islink(entry_path):
	            shutil.rmtree(entry_path)
	        else:
	            os.remove(entry_path)

	# Download section.
	#
	# Every file is fetched with urlopen and streamed to disk in chunks, so the
	# multi-gigabyte dumps never have to fit in memory.  Gzipped files are then
	# decompressed next to the archive and the archive is removed; the taxonomy
	# tarball is unpacked into downloadDir and kept.
	#
	# A failed transfer or unpack is reported in red and the script moves on to
	# the next file.


	def _gunzip(archive):
	    """Decompress *archive* ("<path>.gz") to "<path>" and delete the archive."""
	    target = os.path.splitext(archive)[0]
	    # closing() is used because older GzipFile objects are not context managers.
	    with closing(gzip.open(archive, "rb")) as packed:
	        with open(target, "wb") as unpacked:
	            shutil.copyfileobj(packed, unpacked)
	    os.remove(archive)


	def _untar(archive):
	    """Unpack the gzipped tarball *archive* into downloadDir.

	    The external tar program is invoked with an argument list (no shell),
	    and a non-zero exit status is turned into an IOError so the caller
	    reports the failure instead of printing success.
	    """
	    status = subprocess.call(["tar", "-zxf", archive, "-C", downloadDir])
	    if status != 0:
	        raise IOError("tar exited with status %d for %s" % (status, archive))


	def _download(url, destination, unpack=None):
	    """Fetch *url* into *destination*, optionally unpack it, report the result.

	    url         -- address passed to urlopen.
	    destination -- local path the response body is written to.
	    unpack      -- optional callable taking *destination*; run after a
	                   successful transfer.

	    Errors are printed, not raised, so the remaining downloads still run.
	    """
	    try:
	        # closing() guarantees the connection is released even if the copy
	        # to disk fails part-way through.
	        with closing(urlopen(url)) as response:
	            with open(destination, "wb") as local_file:
	                shutil.copyfileobj(response, local_file)
	        if unpack is not None:
	            unpack(destination)
	    except HTTPError as e:
	        print('\x1b[0;30;41m' + "HTTP: " + str(e.reason) + '\x1b[0m')
	    except URLError as e:
	        print('\x1b[0;30;41m' + "Error: " + str(e.reason) + '\x1b[0m')
	    except (IOError, OSError) as e:
	        # Local write errors and corrupt archives end up here.
	        print('\x1b[0;30;41m' + "Error: " + str(e) + '\x1b[0m')
	    else:
	        print('\x1b[6;30;42m' + 'Success!' + '\x1b[0m')


	# Gzipped Entrez Gene tables.
	for name in zip_files:
	    print("Downloading " + name + " to " + downloadDir + " and gunzipping...")
	    _download("ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/" + name + ".gz",
	              downloadDir + "/" + name + ".gz",
	              _gunzip)

	# Entrez Gene tables fetched under their exact name (no decompression).
	for name in download_files:
	    destination = downloadDir + "/" + name
	    print("Downloading " + name + " to " + destination)
	    _download("ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/" + name, destination)

	# Gzipped GeneRIF tables.
	for name in geneRIF:
	    print("Downloading " + name + " to " + downloadDir + " and gunzipping...")
	    _download("ftp://ftp.ncbi.nlm.nih.gov/gene/GeneRIF/" + name + ".gz",
	              downloadDir + "/" + name + ".gz",
	              _gunzip)

	# Taxonomy dump; the formatting step later reads names.dmp from downloadDir.
	print("Downloading taxdump to " + downloadDir + "...")
	_download("ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz",
	          downloadDir + "/taxdump.tar.gz",
	          _untar)

	#######
	#
	# INSERT INTO FILES
	#
	#######

	# For each table: read the downloaded file, skip comment lines, prefix every
	# remaining row with a running number (restarting at 1 for each file) and
	# write it tab-delimited to a file of the same name under SQL.


	def _write_tax2name(names_path, out_path):
	    """Write "tax_id<TAB>name" for every scientific-name row of names.dmp.

	    Each row is split on tabs and fields 0, 2 and 6 are read as tax id,
	    name and name class (presumably names.dmp delimits its columns with
	    "<TAB>|<TAB>", which would put the data in the even-numbered fields --
	    TODO confirm against the taxdump readme).  Rows too short to carry a
	    name class are skipped instead of raising IndexError.
	    """
	    with open(out_path, 'w') as sink:
	        with open(names_path, 'r') as source:
	            for row in source:
	                fields = row.split('\t')
	                if len(fields) <= 6:
	                    continue
	                if re.match(r'scientific', fields[6], flags=re.I):
	                    sink.write(fields[0].strip() + '\t' + fields[2].strip() + '\n')


	def _numbered_row(table_name, row_id, row):
	    """Return *row* re-joined with *row_id* in front, newline-terminated.

	    Table-specific clean-up applied to the tab-split fields first:
	      * gene2refseq  -- every '.' is removed from fields 3, 5 and 7.
	      * gene_history -- a '-' in field 1 becomes '0'.
	      * interactions -- a '-' in field 5 becomes '0'.
	    Every field is stripped of surrounding whitespace before joining.
	    """
	    fields = row.split('\t')
	    if table_name == 'gene2refseq':
	        for column in (3, 5, 7):
	            if column < len(fields):
	                # str.replace returns a new string, so the result has to be
	                # stored back for the edit to take effect.
	                # NOTE(review): this deletes the dot itself ("AB_1.2" ->
	                # "AB_12"); confirm that is the intended normalisation
	                # rather than dropping the suffix after the dot.
	                fields[column] = fields[column].replace('.', '')
	    elif table_name == 'gene_history':
	        if len(fields) > 1 and fields[1] == '-':
	            fields[1] = '0'
	    elif table_name == 'interactions':
	        if len(fields) > 5 and fields[5] == '-':
	            fields[5] = '0'

	    # NOTE(review): the id cell carries its own trailing tab, so after the
	    # join there is an empty column between the id and the first field.
	    # Kept as-is because the import step may rely on it -- confirm.
	    cells = [str(row_id) + '\t']
	    cells.extend(field.strip() for field in fields)
	    return '\t'.join(cells) + '\n'


	def _write_numbered(table_name, in_path, out_path):
	    """Copy *in_path* to *out_path*, dropping '#' lines and numbering rows."""
	    with open(out_path, 'w') as sink:
	        with open(in_path, 'r') as source:
	            row_id = 0
	            for row in source:
	                if row.startswith('#'):
	                    continue
	                row_id += 1
	                sink.write(_numbered_row(table_name, row_id, row))


	for current_file in table:

	    # refSeqSummary has no downloaded source file here, so it is not built.
	    if current_file in ('refSeqSummary', ''):
	        continue
	    print("Inserting unique identifier (line number) into " + current_file)

	    if current_file == 'tax2name':
	        # Taxonomy is a little different from Entrez Gene: the needed
	        # columns are parsed out of names.dmp and no row id is added.
	        _write_tax2name(downloadDir + '/names.dmp', SQL + '/tax2name')
	    else:
	        _write_numbered(current_file,
	                        downloadDir + '/' + current_file,
	                        SQL + '/' + current_file)
	    print('\x1b[1;30;43m' + current_file + ' created successfully!' + '\x1b[0m')