sdhutchins/mygene_example.py

## mygene_example.py
# -*- coding: utf-8 -*-
"""
Created on Fri Jan 13 18:18:52 2017

@author: sdhutchins

"""
#------------------------------------------------------------------------------
# Modules Used
#------------------------------------------------------------------------------
import mygene
import csv
import pandas as pd
import sys

# Instantiate the MyGene.info client object; its querymany() method is what
# this script calls to run the batch gene query.
mg = mygene.MyGeneInfo()


#------------------------------------------------------------------------------
# Create a list of gene symbols/names from the .csv file
#------------------------------------------------------------------------------
# Build the list of query terms from genes.csv.
#
# Each CSV row becomes one term: the row's list repr is taken and the quote
# and bracket characters are stripped, while spaces become underscores, so a
# single-column row such as ['HTR1A'] yields 'HTR1A'. One translate() pass
# performs the same per-character substitutions as chaining str.replace()
# for each of those characters.
_ROW_CLEANUP = str.maketrans({"'": None, "[": None, "]": None, " ": "_"})

# The context manager closes the file handle as soon as reading is done;
# newline='' is the mode the csv module documents for files given to
# csv.reader.
with open('genes.csv', newline='') as g:
    # NOTE(review): the list intentionally starts with an empty string. Its
    # purpose is not evident from this section -- confirm before removing it,
    # because the positional column drop performed on the query output may
    # depend on the extra row/column this empty query produces.
    genes_list = [''] + [
        str(row).translate(_ROW_CLEANUP) for row in csv.reader(g)
    ]
print(genes_list)

#------------------------------------------------------------------------------
# Set up Input to start command if gene list is correct
#------------------------------------------------------------------------------
"""
x = str(input('Is the input properly formatted? (Type Yes or No) '))
if x == 'Yes':
    print("\n" + "MyGene will start." + "\n")
else:
     raise SystemExit
"""
#------------------------------------------------------------------------------
# Use MyGene to get gene information
#------------------------------------------------------------------------------
"""
Call querymany method.
Scopes is the kind of identifier you are querying with; it can be "entrezgene", "symbol" (such as HTR1A), "mim" for an OMIM id,
or "accession". For more, see: http://mygene.info/doc/query_service.html#available_fields
The input is "symbol" in this example. Scroll to the bottom of this script for a list of genes I used.

Setting as_dataframe to True will return a pandas dataframe object.
Set verbose to False to suppress messages like "finished".
The results will be a list of dictionaries.
The dictionary contains the entrezid for the "entrezgene" field.
If you want the ensembl ids, use fields='ensembl.gene'

List of fields: http://mygene.info/metadata/fields
Fields can be set to 'all' for all fields to return.

There are also multiple species available or you can input the Taxonomy ID.

Examples:
entrez_ids = mg.querymany(genes_list, scopes='symbol,ensembl.gene', fields='entrezgene',
                          species='human', returnall=True, as_dataframe=True)

ensembl_ids = mg.querymany(genes_list, scopes='symbol', fields='ensembl.gene',
                          species='9606', returnall=True, as_dataframe=True)
"""
# Batch-query MyGene.info for every entry in genes_list, treating each entry
# as a gene symbol and limiting matches to human. With returnall=True the
# result is a dictionary (see the notes on its 'out', 'missing' and 'dup'
# keys further down); the fields argument restricts what is returned to the
# symbol, name, Entrez gene id and summary.
basic_gene_info = mg.querymany(
    genes_list,
    scopes='symbol',
    fields='symbol,name,entrezgene,summary',
    species='human',
    returnall=True,
    as_dataframe=True,
    size=1,
)

#------------------------------------------------------------------------------
# Use pandas to turn results of the mygene queries into dataframes
#------------------------------------------------------------------------------
"""
Use dict.keys() (here, basic_gene_info.keys()) to find out what the data keys are.
The data keys will be 'out' for output, 'missing' for any missing genes, and 'dup' for any duplicates.
Write the dataframe to a csv file using pandas (the output is stored as a dataframe).
Save the data as a .csv file.
Use df.drop to delete columns of the data you don't want.

Additional dictionary command:
To return a dictionary of MyGene.info metadata, use metadata = mg.metadata
"""
# Write the 'out' table of the query result to disk, then load the file
# straight back into a fresh dataframe.
basic_gene_info['out'].to_csv('basic_gene_info.csv', sep=',', encoding='utf-8')
df = pd.read_csv('basic_gene_info.csv')

# Discard the columns sitting at positions 1, 2 and 6 of the re-read table.
# NOTE(review): this selection is positional, so it relies on the column
# order of the query output -- confirm against a real run.
gene_info = df.drop(df.columns[[1, 2, 6]], axis=1)

# Give the remaining columns human-readable headers.
gene_info = gene_info.rename(columns={
    'entrezgene': 'Entrez ID',
    'summary': 'Gene Summary',
    'query': 'Gene Symbol',
    'name': 'Gene Name',
})

# Overwrite the intermediate file with the cleaned table, without an index
# column.
gene_info.to_csv('basic_gene_info.csv', index=False)


"""
List of Genes I used (I saved them to a csv file - genes.csv in this example)
ADRA1A
ADRA1B
ADRA1D
ADRA2A
ADRA2B
CHRM1
CHRM2
CHRM3
CHRM5
CNR1
CNR2
DRD2
DRD3
GABBR2
HTR1A
HTR1D
HTR1F
HTR2A
HTR2B
HTR4
HTR5A
HTR7
OPRK1
OPRM1
"""
	# -- coding: utf-8 --
	"""
	Created on Fri Jan 13 18:18:52 2017

	@author: sdhutchins

	"""
	#------------------------------------------------------------------------------
	# Modules Used
	#------------------------------------------------------------------------------
	import mygene
	import csv
	import pandas as pd
	import sys

	# Instantiate the MyGene.info client object; its querymany() method is what
	# this script calls to run the batch gene query.
	mg = mygene.MyGeneInfo()


	#------------------------------------------------------------------------------
	# Create a list of gene symbols/names from the .csv file
	#------------------------------------------------------------------------------
	# Build the list of query terms from genes.csv.
	#
	# Each CSV row becomes one term: the row's list repr is taken and the quote
	# and bracket characters are stripped, while spaces become underscores, so a
	# single-column row such as ['HTR1A'] yields 'HTR1A'. One translate() pass
	# performs the same per-character substitutions as chaining str.replace()
	# for each of those characters.
	_ROW_CLEANUP = str.maketrans({"'": None, "[": None, "]": None, " ": "_"})

	# The context manager closes the file handle as soon as reading is done;
	# newline='' is the mode the csv module documents for files given to
	# csv.reader.
	with open('genes.csv', newline='') as g:
	    # NOTE(review): the list intentionally starts with an empty string. Its
	    # purpose is not evident from this section -- confirm before removing
	    # it, because the positional column drop performed on the query output
	    # may depend on the extra row/column this empty query produces.
	    genes_list = [''] + [
	        str(row).translate(_ROW_CLEANUP) for row in csv.reader(g)
	    ]
	print(genes_list)

	#------------------------------------------------------------------------------
	# Set up Input to start command if gene list is correct
	#------------------------------------------------------------------------------
	"""
	x = str(input('Is the input properly formatted? (Type Yes or No) '))
	if x == 'Yes':
	print("\n" + "MyGene will start." + "\n")
	else:
	raise SystemExit
	"""
	#------------------------------------------------------------------------------
	# Use MyGene to get gene information
	#------------------------------------------------------------------------------
	"""
	Call querymany method.
	Scopes is the kind of identifier you are querying with; it can be "entrezgene", "symbol" (such as HTR1A), "mim" for an OMIM id,
	or "accession". For more, see: http://mygene.info/doc/query_service.html#available_fields
	The input is "symbol" in this example. Scroll to the bottom of this script for a list of genes I used.

	Setting as_dataframe to True will return a pandas dataframe object.
	Set verbose to False to suppress messages like "finished".
	The results will be a list of dictionaries.
	The dictionary contains the entrezid for the "entrezgene" field.
	If you want the ensembl ids, use fields='ensembl.gene'

	List of fields: http://mygene.info/metadata/fields
	Fields can be set to 'all' for all fields to return.

	There are also multiple species available or you can input the Taxonomy ID.

	Examples:
	entrez_ids = mg.querymany(genes_list, scopes='symbol,ensembl.gene', fields='entrezgene',
	species='human', returnall=True, as_dataframe=True)

	ensembl_ids = mg.querymany(genes_list, scopes='symbol', fields='ensembl.gene',
	species='9606', returnall=True, as_dataframe=True)
	"""
	# Batch-query MyGene.info for every entry in genes_list, treating each entry
	# as a gene symbol and limiting matches to human. With returnall=True the
	# result is a dictionary (see the notes on its 'out', 'missing' and 'dup'
	# keys further down); the fields argument restricts what is returned to the
	# symbol, name, Entrez gene id and summary.
	basic_gene_info = mg.querymany(
	    genes_list,
	    scopes='symbol',
	    fields='symbol,name,entrezgene,summary',
	    species='human',
	    returnall=True,
	    as_dataframe=True,
	    size=1,
	)

	#------------------------------------------------------------------------------
	# Use pandas to turn results of the mygene queries into dataframes
	#------------------------------------------------------------------------------
	"""
	Use dict.keys() (here, basic_gene_info.keys()) to find out what the data keys are.
	The data keys will be 'out' for output, 'missing' for any missing genes, and 'dup' for any duplicates.
	Write the dataframe to a csv file using pandas (the output is stored as a dataframe).
	Save the data as a .csv file.
	Use df.drop to delete columns of the data you don't want.

	Additional dictionary command:
	To return a dictionary of MyGene.info metadata, use metadata = mg.metadata
	"""
	# Write the 'out' table of the query result to disk, then load the file
	# straight back into a fresh dataframe.
	basic_gene_info['out'].to_csv('basic_gene_info.csv', sep=',', encoding='utf-8')
	df = pd.read_csv('basic_gene_info.csv')

	# Discard the columns sitting at positions 1, 2 and 6 of the re-read table.
	# NOTE(review): this selection is positional, so it relies on the column
	# order of the query output -- confirm against a real run.
	gene_info = df.drop(df.columns[[1, 2, 6]], axis=1)

	# Give the remaining columns human-readable headers.
	gene_info = gene_info.rename(columns={
	    'entrezgene': 'Entrez ID',
	    'summary': 'Gene Summary',
	    'query': 'Gene Symbol',
	    'name': 'Gene Name',
	})

	# Overwrite the intermediate file with the cleaned table, without an index
	# column.
	gene_info.to_csv('basic_gene_info.csv', index=False)


	"""
	List of Genes I used (I saved them to a csv file - genes.csv in this example)
	ADRA1A
	ADRA1B
	ADRA1D
	ADRA2A
	ADRA2B
	CHRM1
	CHRM2
	CHRM3
	CHRM5
	CNR1
	CNR2
	DRD2
	DRD3
	GABBR2
	HTR1A
	HTR1D
	HTR1F
	HTR2A
	HTR2B
	HTR4
	HTR5A
	HTR7
	OPRK1
	OPRM1
	"""