acthp/example.ipynb

## example.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 187,
   "metadata": {},
   "outputs": [],
   "source": [
    "import xenaPython as xena\n",
    "\n",
    "GENES = ['FOXM1', 'TP53']\n",
    "\n",
    "def get_codes(host, dataset, fields, data):\n",
    "    \"\"\"Replace coded field values in `data` with their category labels.\n",
    "\n",
    "    Code tables come from xena.field_codes; an entry whose 'code' is None is\n",
    "    skipped, so that field's row is left as it was passed in. For every other\n",
    "    field the matching row of `data` is overwritten in place: 'NaN' becomes None\n",
    "    and any other value is used as an integer index into the tab-separated\n",
    "    code list. Returns the same `data` list.\n",
    "    \"\"\"\n",
    "    labels_by_field = {\n",
    "        entry['name']: entry['code'].split('\\t')\n",
    "        for entry in xena.field_codes(host, dataset, fields)\n",
    "        if entry['code'] is not None\n",
    "    }\n",
    "    for row, field in enumerate(fields):\n",
    "        if field not in labels_by_field:\n",
    "            continue\n",
    "        labels = labels_by_field[field]\n",
    "        data[row] = [None if v == 'NaN' else labels[int(v)] for v in data[row]]\n",
    "    return data\n",
    "\n",
    "def get_fields(host, dataset, samples, fields):\n",
    "    \"\"\"Fetch the values of `fields` for `samples` via xena.dataset_fetch.\"\"\"\n",
    "    return xena.dataset_fetch(host, dataset, samples, fields)\n",
    "\n",
    "def get_fields_and_codes(host, dataset, samples, fields):\n",
    "    \"\"\"Fetch field values with get_fields, then resolve coded values with get_codes.\"\"\"\n",
    "    fetched = get_fields(host, dataset, samples, fields)\n",
    "    return get_codes(host, dataset, fields, fetched)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 188,
   "metadata": {},
   "outputs": [],
   "source": [
    "#\n",
    "# pancanAtlas cohort\n",
    "#\n",
    "\n",
    "# cohort name and the matching hub entry from xena.PUBLIC_HUBS\n",
    "cohort = 'TCGA PanCanAtlas'\n",
    "host = xena.PUBLIC_HUBS['pancanAtlasHub']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 189,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['TCGA-P4-A5E8-11',\n",
       " 'TCGA-EE-A181-06',\n",
       " 'TCGA-AA-3511-11',\n",
       " 'TCGA-BR-8590-01',\n",
       " 'TCGA-06-6390-01',\n",
       " 'TCGA-26-5139-01',\n",
       " 'TCGA-B0-4813-11',\n",
       " 'TCGA-29-1763-01',\n",
       " 'TCGA-D1-A17S-01',\n",
       " 'TCGA-EJ-7797-11']"
      ]
     },
     "execution_count": 189,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# List the samples of the cohort, then preview the first ten.\n",
    "samples = xena.cohort_samples(host, cohort, None)\n",
    "samples[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 190,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[dict_keys(['FOXM1', 'TP53']),\n",
       " 'FOXM1',\n",
       " [5.18, 11.31, 9.65, 9.22, 'NaN', 9.48, 'NaN', 10.27, 8.63, 5.13]]"
      ]
     },
     "execution_count": 190,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Fetch expression for GENES.\n",
    "dataset = 'EB++AdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.xena'\n",
    "expression = get_fields(host, dataset, samples, GENES)  # list of lists\n",
    "# Index the per-gene lists by gene name.\n",
    "expression_by_gene = {gene: scores for gene, scores in zip(GENES, expression)}\n",
    "# Note that missing data is returned as 'NaN'. One might want to remap this to None or NaN,\n",
    "# depending on the later analysis tools.\n",
    "[expression_by_gene.keys(), GENES[0], expression_by_gene[GENES[0]][:10]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 191,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[dict_keys(['samples', 'rows']),\n",
       " 3726,\n",
       " ['TCGA-56-8624-11',\n",
       "  'TCGA-25-1329-01',\n",
       "  'TCGA-HC-8260-11',\n",
       "  'TCGA-AG-3727-01',\n",
       "  'TCGA-DX-A23R-01']]"
      ]
     },
     "execution_count": 191,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# get mutation for GENES\n",
    "dataset = 'mc3.v0.2.8.PUBLIC.xena'\n",
    "mutation_columns = xena.sparse_data(host, dataset, samples, GENES)\n",
    "# Two keys are returned: 'rows', which is all the variants (in a column orientation), and\n",
    "# 'samples', which is the list of all samples in the dataset. 'samples' is required in order to\n",
    "# distinguish samples without this assay (not in the dataset) from samples found to have\n",
    "# no mutations in these genes. A sampleID in ['samples'] that is not in ['rows']['sampleID']\n",
    "# was found to have no mutations. A sample not in ['samples'] has no assessment: we can't\n",
    "# say anything about its mutations.\n",
    "# Cohort samples that are absent from the dataset's 'samples' list, i.e. not assessed.\n",
    "samples_without_mutation_data = list(set(samples) - set(mutation_columns['samples']))\n",
    "[mutation_columns.keys(), len(samples_without_mutation_data), samples_without_mutation_data[0: 5]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 192,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'alt': 'T',\n",
       "  'altGene': None,\n",
       "  'amino-acid': '',\n",
       "  'dna-vaf': 0.33,\n",
       "  'effect': \"3'UTR\",\n",
       "  'genes': ['FOXM1'],\n",
       "  'position': {'chrom': 'chr12',\n",
       "   'chromend': 2967148,\n",
       "   'chromstart': 2967148,\n",
       "   'strand': '0'},\n",
       "  'ref': 'G',\n",
       "  'rna-vaf': None,\n",
       "  'sampleID': 'TCGA-EY-A1GI-01'},\n",
       " {'alt': 'G',\n",
       "  'altGene': None,\n",
       "  'amino-acid': '',\n",
       "  'dna-vaf': 0.46,\n",
       "  'effect': \"3'UTR\",\n",
       "  'genes': ['FOXM1'],\n",
       "  'position': {'chrom': 'chr12',\n",
       "   'chromend': 2967228,\n",
       "   'chromstart': 2967228,\n",
       "   'strand': '0'},\n",
       "  'ref': 'T',\n",
       "  'rna-vaf': None,\n",
       "  'sampleID': 'TCGA-EY-A5W2-01'}]"
      ]
     },
     "execution_count": 192,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# A row orientation takes more memory, and won't fit in a numpy array, but is easier\n",
    "# to view.\n",
    "rows = mutation_columns['rows']\n",
    "keys = rows.keys()\n",
    "# Build one dict per variant by taking the i-th entry of every column.\n",
    "mutations = [{k: rows[k][i] for k in keys} for i in range(len(rows['sampleID']))]\n",
    "# You might want to groupby ['genes'][0] at this point, to build per-gene stats. ['genes'] is a\n",
    "# list because in the general case a variant can hit multiple genes. For this dataset, gene-level\n",
    "# non-silent mutations, they do not.\n",
    "mutations[:2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 193,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['KIRP', 'SKCM', 'COAD', 'STAD', 'GBM', 'GBM', 'KIRC', 'OV', 'UCEC', 'PRAD']"
      ]
     },
     "execution_count": 193,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Fetch the disease type and survival columns.\n",
    "dataset = 'Survival_SupplementalTable_S1_20171025_xena_sp'\n",
    "fields = ['cancer type abbreviation', 'OS', 'OS.time']\n",
    "values = get_fields_and_codes(host, dataset, samples, fields)  # list of lists\n",
    "phenotypes = {field: column for field, column in zip(fields, values)}  # index by phenotype\n",
    "phenotypes['cancer type abbreviation'][:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 194,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'Additional - New Primary',\n",
       " 'Additional Metastatic',\n",
       " 'Metastatic',\n",
       " None,\n",
       " 'Primary Blood Derived Cancer - Peripheral Blood',\n",
       " 'Primary Tumor',\n",
       " 'Recurrent Tumor',\n",
       " 'Solid Tissue Normal'}"
      ]
     },
     "execution_count": 194,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# get sample type. TCGA includes a few \"normal\" tissue samples. These normals are of\n",
    "# limited value because there are few of them, and they are not entirely normal, being\n",
    "# taken from disease tissue, outside of the visible tumor. It's often best to omit them.\n",
    "dataset = 'TCGA_phenotype_denseDataOnlyDownload.tsv'\n",
    "fields = ['sample_type']\n",
    "# values holds one list per requested field, so values[0] is the sample_type column\n",
    "values = get_fields_and_codes(host, dataset, samples, fields)\n",
    "# show the distinct sample_type labels\n",
    "set(values[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 195,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[1475,\n",
       " ['TCGA-P4-A5E8-11',\n",
       "  'TCGA-AA-3511-11',\n",
       "  'TCGA-B0-4813-11',\n",
       "  'TCGA-EJ-7797-11',\n",
       "  'TCGA-CV-7406-11']]"
      ]
     },
     "execution_count": 195,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Keep the samples whose sample_type label (values[0][i] for samples[i]) is 'Solid Tissue Normal'.\n",
    "samples_to_omit = [\n",
    "    sample for i, sample in enumerate(samples)\n",
    "    if values[0][i] == 'Solid Tissue Normal'\n",
    "]\n",
    "[len(samples_to_omit), samples_to_omit[:5]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 196,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Bundle the pancanAtlas results into a single dict.\n",
    "pancan_summary = dict(\n",
    "    samples=samples,\n",
    "    expression=expression_by_gene,\n",
    "    mutations=mutations,\n",
    "    samples_without_mutation_data=samples_without_mutation_data,\n",
    "    phenotypes=phenotypes,\n",
    "    samples_to_omit=samples_to_omit,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 197,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#\n",
    "# TCGA TARGET GTEx\n",
    "#\n",
    "# cohort name and the matching hub entry from xena.PUBLIC_HUBS; both names are rebound here\n",
    "cohort = 'TCGA TARGET GTEx'\n",
    "host = xena.PUBLIC_HUBS['toilHub']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 198,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['TCGA-BR-8590-01',\n",
       " 'TCGA-P4-A5E8-11',\n",
       " 'TCGA-61-1727-01',\n",
       " 'GTEX-QCQG-0326-SM-2I3ES',\n",
       " 'TCGA-CN-5361-01',\n",
       " 'GTEX-1399Q-2326-SM-5KM2X',\n",
       " 'TCGA-D1-A17H-01',\n",
       " 'TCGA-D1-A17S-01',\n",
       " 'TCGA-EJ-7797-11',\n",
       " 'GTEX-11DXY-0006-SM-5NQ8N']"
      ]
     },
     "execution_count": 198,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# List the samples of the cohort, then preview the first ten.\n",
    "samples = xena.cohort_samples(host, cohort, None)\n",
    "samples[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 199,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[dict_keys(['FOXM1', 'TP53']),\n",
       " 'FOXM1',\n",
       " [10.5357,\n",
       "  5.5018,\n",
       "  'NaN',\n",
       "  4.4925,\n",
       "  12.0133,\n",
       "  9.4554,\n",
       "  'NaN',\n",
       "  'NaN',\n",
       "  7.0413,\n",
       "  6.3072]]"
      ]
     },
     "execution_count": 199,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset = 'TcgaTargetGtex_gene_expected_count'\n",
    "# This dataset is not in HUGO space, so we have to use a mapping dataset to connect\n",
    "# gene names to probes. You can determine the namespace of dataset probes by inspecting\n",
    "# probemap (ID/Gene Mapping) metadata for the dataset.\n",
    "# https://xenabrowser.net/datapages/?host=https%3A%2F%2Ftoil.xenahubs.net&dataset=TcgaTargetGtex_gene_expected_count\n",
    "# The dataset_gene_probe_avg query resolves probes for a gene.\n",
    "expression = xena.dataset_gene_probe_avg(host, dataset, samples, GENES)\n",
    "# Index by gene name, keeping the first entry of each result's 'scores'.\n",
    "expression_by_gene = {g['gene']: g['scores'][0] for g in expression}\n",
    "[expression_by_gene.keys(), GENES[0], expression_by_gene[GENES[0]][:10]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 200,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['TCGA', 'TCGA', None, 'GTEX', 'TCGA', 'GTEX', None, None, 'TCGA', 'GTEX']"
      ]
     },
     "execution_count": 200,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset = 'TcgaTargetGTEX_phenotype.txt'\n",
    "fields = ['_study', '_sample_type']\n",
    "# As in pancan, there are normal samples in tcga which should probably be removed. _sample_type will\n",
    "# identify normals. _study will identify tcga vs. gtex vs. target.\n",
    "values = get_fields_and_codes(host, dataset, samples, fields)  # list of lists\n",
    "phenotypes = {field: column for field, column in zip(fields, values)}  # index by phenotype\n",
    "phenotypes['_study'][:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 201,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Bundle the TCGA TARGET GTEx results into a single dict.\n",
    "toil_summary = dict(\n",
    "    samples=samples,\n",
    "    expression=expression_by_gene,\n",
    "    phenotypes=phenotypes,\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 187,
	"metadata": {},
	"outputs": [],
	"source": [
	"import xenaPython as xena\n",
	"\n",
	"GENES = ['FOXM1', 'TP53']\n",
	"\n",
	"def get_codes(host, dataset, fields, data):\n",
	"    \"\"\"Replace coded field values in `data` with their category labels.\n",
	"\n",
	"    Fields whose entry from xena.field_codes has a None 'code' are left untouched.\n",
	"    For the others the matching row of `data` is overwritten in place. Returns `data`.\n",
	"    \"\"\"\n",
	"    codes = xena.field_codes(host, dataset, fields)\n",
	"    codes_idx = dict([(x['name'], x['code'].split('\\t')) for x in codes if x['code'] is not None])\n",
	"    for i in range(len(fields)):\n",
	"        if fields[i] in codes_idx:\n",
	"            # 'NaN' becomes None; any other value is an integer index into the code list\n",
	"            data[i] = [None if v == 'NaN' else codes_idx[fields[i]][int(v)] for v in data[i]]\n",
	"    return data\n",
	"\n",
	"def get_fields(host, dataset, samples, fields):\n",
	"    \"\"\"Fetch the values of `fields` for `samples` via xena.dataset_fetch.\"\"\"\n",
	"    data = xena.dataset_fetch(host, dataset, samples, fields)\n",
	"    return data\n",
	"\n",
	"def get_fields_and_codes(host, dataset, samples, fields):\n",
	"    \"\"\"Fetch field values with get_fields, then resolve coded values with get_codes.\"\"\"\n",
	"    return get_codes(host, dataset, fields, get_fields(host, dataset, samples, fields))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 188,
	"metadata": {},
	"outputs": [],
	"source": [
	"#\n",
	"# pancanAtlas cohort\n",
	"#\n",
	"\n",
	"cohort = 'TCGA PanCanAtlas'\n",
	"host = xena.PUBLIC_HUBS['pancanAtlasHub']"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 189,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"['TCGA-P4-A5E8-11',\n",
	" 'TCGA-EE-A181-06',\n",
	" 'TCGA-AA-3511-11',\n",
	" 'TCGA-BR-8590-01',\n",
	" 'TCGA-06-6390-01',\n",
	" 'TCGA-26-5139-01',\n",
	" 'TCGA-B0-4813-11',\n",
	" 'TCGA-29-1763-01',\n",
	" 'TCGA-D1-A17S-01',\n",
	" 'TCGA-EJ-7797-11']"
	]
	},
	"execution_count": 189,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# get samples in cohort\n",
	"samples = xena.cohort_samples(host, cohort, None)\n",
	"samples[0: 10]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 190,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[dict_keys(['FOXM1', 'TP53']),\n",
	" 'FOXM1',\n",
	" [5.18, 11.31, 9.65, 9.22, 'NaN', 9.48, 'NaN', 10.27, 8.63, 5.13]]"
	]
	},
	"execution_count": 190,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# get expression for GENES\n",
	"dataset = 'EB++AdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.xena'\n",
	"expression = get_fields(host, dataset, samples, GENES) # list of lists.\n",
	"expression_by_gene = dict(zip(GENES, expression)) # index by gene.\n",
	"[expression_by_gene.keys(), GENES[0], expression_by_gene[GENES[0]][0:10]]\n",
	"# note that missing data is returned as 'NaN'. One might want to remap this to None or NaN, depending on\n",
	"# the later analysis tools."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 191,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[dict_keys(['samples', 'rows']),\n",
	" 3726,\n",
	" ['TCGA-56-8624-11',\n",
	" 'TCGA-25-1329-01',\n",
	" 'TCGA-HC-8260-11',\n",
	" 'TCGA-AG-3727-01',\n",
	" 'TCGA-DX-A23R-01']]"
	]
	},
	"execution_count": 191,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# get mutation for GENES\n",
	"dataset = 'mc3.v0.2.8.PUBLIC.xena'\n",
	"mutation_columns = xena.sparse_data(host, dataset, samples, GENES)\n",
	"# Two keys are returned: 'rows', which is all the variants (in a column orientation), and \n",
	"# 'samples', which is the list of all samples in the dataset. 'samples' is required in order to\n",
	"# distinguish samples without this assay (not in the dataset) from samples found to have\n",
	"# no mutations in these genes. A sampleID in ['samples'] that is not in ['rows']['sampleID']\n",
	"# was found to have no mutations. A sample not in ['samples'] has no assessment: we can't\n",
	"# say anything about its mutations.\n",
	"samples_without_mutation_data = list(set(samples) - set(mutation_columns['samples']))\n",
	"[mutation_columns.keys(), len(samples_without_mutation_data), samples_without_mutation_data[0: 5]]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 192,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[{'alt': 'T',\n",
	" 'altGene': None,\n",
	" 'amino-acid': '',\n",
	" 'dna-vaf': 0.33,\n",
	" 'effect': \"3'UTR\",\n",
	" 'genes': ['FOXM1'],\n",
	" 'position': {'chrom': 'chr12',\n",
	" 'chromend': 2967148,\n",
	" 'chromstart': 2967148,\n",
	" 'strand': '0'},\n",
	" 'ref': 'G',\n",
	" 'rna-vaf': None,\n",
	" 'sampleID': 'TCGA-EY-A1GI-01'},\n",
	" {'alt': 'G',\n",
	" 'altGene': None,\n",
	" 'amino-acid': '',\n",
	" 'dna-vaf': 0.46,\n",
	" 'effect': \"3'UTR\",\n",
	" 'genes': ['FOXM1'],\n",
	" 'position': {'chrom': 'chr12',\n",
	" 'chromend': 2967228,\n",
	" 'chromstart': 2967228,\n",
	" 'strand': '0'},\n",
	" 'ref': 'T',\n",
	" 'rna-vaf': None,\n",
	" 'sampleID': 'TCGA-EY-A5W2-01'}]"
	]
	},
	"execution_count": 192,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# a row orientation takes more memory, and won't fit in a numpy array, but is easier\n",
	"# to view.\n",
	"rows = mutation_columns['rows']\n",
	"keys = rows.keys()\n",
	"mutations = [dict(zip(keys, [rows[k][i] for k in keys])) for i in range(len(rows['sampleID']))]\n",
	"# You might want to groupby ['genes'][0] at this point, to build per-gene stats. ['genes'] is a\n",
	"# list because in the general case a variant can hit multiple genes. For this dataset, gene-level\n",
	"# non-silent mutations, they do not.\n",
	"mutations[0: 2] "
	]
	},
	{
	"cell_type": "code",
	"execution_count": 193,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"['KIRP', 'SKCM', 'COAD', 'STAD', 'GBM', 'GBM', 'KIRC', 'OV', 'UCEC', 'PRAD']"
	]
	},
	"execution_count": 193,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# get disease type and survival columns\n",
	"dataset = 'Survival_SupplementalTable_S1_20171025_xena_sp'\n",
	"fields = ['cancer type abbreviation', 'OS', 'OS.time']\n",
	"values = get_fields_and_codes(host, dataset, samples, fields) # list of lists\n",
	"phenotypes = dict(zip(fields, values)) # index by phenotype\n",
	"phenotypes['cancer type abbreviation'][0:10]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 194,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"{'Additional - New Primary',\n",
	" 'Additional Metastatic',\n",
	" 'Metastatic',\n",
	" None,\n",
	" 'Primary Blood Derived Cancer - Peripheral Blood',\n",
	" 'Primary Tumor',\n",
	" 'Recurrent Tumor',\n",
	" 'Solid Tissue Normal'}"
	]
	},
	"execution_count": 194,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# get sample type. TCGA includes a few \"normal\" tissue samples. These normals are of\n",
	"# limited value because there are few of them, and they are not entirely normal, being\n",
	"# taken from disease tissue, outside of the visible tumor. It's often best to omit them.\n",
	"dataset = 'TCGA_phenotype_denseDataOnlyDownload.tsv'\n",
	"fields = ['sample_type']\n",
	"values = get_fields_and_codes(host, dataset, samples, fields)\n",
	"set(values[0])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 195,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[1475,\n",
	" ['TCGA-P4-A5E8-11',\n",
	" 'TCGA-AA-3511-11',\n",
	" 'TCGA-B0-4813-11',\n",
	" 'TCGA-EJ-7797-11',\n",
	" 'TCGA-CV-7406-11']]"
	]
	},
	"execution_count": 195,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"samples_to_omit = [samples[i] for i in range(len(samples)) if values[0][i] == 'Solid Tissue Normal']\n",
	"[len(samples_to_omit), samples_to_omit[0: 5]]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 196,
	"metadata": {},
	"outputs": [],
	"source": [
	"pancan_summary = {\n",
	" 'samples': samples,\n",
	" 'expression': expression_by_gene,\n",
	" 'mutations': mutations,\n",
	" 'samples_without_mutation_data': samples_without_mutation_data,\n",
	" 'phenotypes': phenotypes,\n",
	" 'samples_to_omit': samples_to_omit\n",
	"}"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 197,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"#\n",
	"# TCGA TARGET GTEx\n",
	"#\n",
	"cohort = 'TCGA TARGET GTEx'\n",
	"host = xena.PUBLIC_HUBS['toilHub']"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 198,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"['TCGA-BR-8590-01',\n",
	" 'TCGA-P4-A5E8-11',\n",
	" 'TCGA-61-1727-01',\n",
	" 'GTEX-QCQG-0326-SM-2I3ES',\n",
	" 'TCGA-CN-5361-01',\n",
	" 'GTEX-1399Q-2326-SM-5KM2X',\n",
	" 'TCGA-D1-A17H-01',\n",
	" 'TCGA-D1-A17S-01',\n",
	" 'TCGA-EJ-7797-11',\n",
	" 'GTEX-11DXY-0006-SM-5NQ8N']"
	]
	},
	"execution_count": 198,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# get samples in cohort\n",
	"samples = xena.cohort_samples(host, cohort, None)\n",
	"samples[0: 10]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 199,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[dict_keys(['FOXM1', 'TP53']),\n",
	" 'FOXM1',\n",
	" [10.5357,\n",
	" 5.5018,\n",
	" 'NaN',\n",
	" 4.4925,\n",
	" 12.0133,\n",
	" 9.4554,\n",
	" 'NaN',\n",
	" 'NaN',\n",
	" 7.0413,\n",
	" 6.3072]]"
	]
	},
	"execution_count": 199,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"dataset = 'TcgaTargetGtex_gene_expected_count'\n",
	"# This dataset is not in HUGO space, so we have to use a mapping dataset to connect\n",
	"# gene names to probes. You can determine the namespace of dataset probes by inspecting\n",
	"# probemap (ID/Gene Mapping) metadata for the dataset.\n",
	"# https://xenabrowser.net/datapages/?host=https%3A%2F%2Ftoil.xenahubs.net&dataset=TcgaTargetGtex_gene_expected_count\n",
	"# The dataset_gene_probe_avg query resolves probes for a gene. \n",
	"expression = xena.dataset_gene_probe_avg(host, dataset, samples, GENES)\n",
	"expression_by_gene = dict([(g['gene'], g['scores'][0]) for g in expression])\n",
	"[expression_by_gene.keys(), GENES[0], expression_by_gene[GENES[0]][0:10]]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 200,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"['TCGA', 'TCGA', None, 'GTEX', 'TCGA', 'GTEX', None, None, 'TCGA', 'GTEX']"
	]
	},
	"execution_count": 200,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"dataset = 'TcgaTargetGTEX_phenotype.txt'\n",
	"fields = ['_study', '_sample_type']\n",
	"# As in pancan, there are normal samples in tcga which should probably be removed. _sample_type will\n",
	"# identify normals. _study will identify tcga vs. gtex vs. target.\n",
	"values = get_fields_and_codes(host, dataset, samples, fields) # list of lists\n",
	"phenotypes = dict(zip(fields, values)) # index by phenotype\n",
	"phenotypes['_study'][0:10]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 201,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"toil_summary = {\n",
	" 'samples': samples,\n",
	" 'expression': expression_by_gene,\n",
	" 'phenotypes': phenotypes\n",
	"}"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.4"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}