Skip to content

Instantly share code, notes, and snippets.

@grisaitis
Created August 25, 2022 15:13
Show Gist options
  • Save grisaitis/d30b1609046b598927dc1c49653747d3 to your computer and use it in GitHub Desktop.
Save grisaitis/d30b1609046b598927dc1c49653747d3 to your computer and use it in GitHub Desktop.
demo of GDC's google buckets with TCGA data, and their BigQuery tables with metadata thereof
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"id": "44490386-8e95-42c2-bbf0-5a56a03848ac",
"metadata": {},
"source": [
"### Note: requires `pandas-gbq` (and `gcsfs`, so that pandas can read the `gs://` file URIs later in the notebook)\n",
"\n",
"```\n",
"conda install -c conda-forge pandas-gbq\n",
"```\n",
"\n",
"or\n",
"\n",
"```\n",
"pip install pandas-gbq\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "495649d9-ed04-482f-956a-8861c432d7f5",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "9264e8c5-568d-481e-b388-55016a711764",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Downloading: 100%|██████████| 10/10 [00:00<00:00, 67.92rows/s]\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>sample_type_name</th>\n",
" <th>count_</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Primary solid Tumor</td>\n",
" <td>10841</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Blood Derived Normal</td>\n",
" <td>9395</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Solid Tissue Normal</td>\n",
" <td>2726</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Metastatic</td>\n",
" <td>397</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Primary Blood Derived Cancer - Peripheral Blood</td>\n",
" <td>356</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Recurrent Solid Tumor</td>\n",
" <td>60</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>Additional - New Primary</td>\n",
" <td>11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>Buccal Cell Normal</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>Bone Marrow Normal</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>Additional Metastatic</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" sample_type_name count_\n",
"0 Primary solid Tumor 10841\n",
"1 Blood Derived Normal 9395\n",
"2 Solid Tissue Normal 2726\n",
"3 Metastatic 397\n",
"4 Primary Blood Derived Cancer - Peripheral Blood 356\n",
"5 Recurrent Solid Tumor 60\n",
"6 Additional - New Primary 11\n",
"7 Buccal Cell Normal 5\n",
"8 Bone Marrow Normal 4\n",
"9 Additional Metastatic 2"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Count biospecimen rows per sample type, most frequent first.\n",
"query = \"\"\"\n",
"select \n",
" sample_type_name,\n",
" count(*) as count_\n",
"from isb-cgc-bq.TCGA.biospecimen_gdc_current\n",
"group by 1\n",
"order by 2 desc\n",
"\"\"\"\n",
"\n",
"pd.read_gbq(\n",
"    query=query,\n",
"    progress_bar_type=\"tqdm\",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "565c7431-694a-4008-a944-444b633cb197",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Downloading: 100%|██████████| 4/4 [00:00<00:00, 22.75rows/s]\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>sample_type_name</th>\n",
" <th>count_</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Primary solid Tumor</td>\n",
" <td>602</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Blood Derived Normal</td>\n",
" <td>425</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Solid Tissue Normal</td>\n",
" <td>274</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Recurrent Solid Tumor</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" sample_type_name count_\n",
"0 Primary solid Tumor 602\n",
"1 Blood Derived Normal 425\n",
"2 Solid Tissue Normal 274\n",
"3 Recurrent Solid Tumor 2"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Same per-sample-type count, restricted to the TCGA-LUAD project.\n",
"query = \"\"\"\n",
"select \n",
" sample_type_name,\n",
" count(*) as count_\n",
"from isb-cgc-bq.TCGA.biospecimen_gdc_current\n",
"where project_short_name = 'TCGA-LUAD'\n",
"group by 1\n",
"order by 2 desc\n",
"\"\"\"\n",
"\n",
"pd.read_gbq(\n",
"    query=query,\n",
"    progress_bar_type=\"tqdm\",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "3e6bda65-9afd-4c18-8653-49ae170b27ed",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Downloading: 100%|██████████| 1303/1303 [00:00<00:00, 2617.58rows/s]\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>sample_barcode</th>\n",
" <th>sample_gdc_id</th>\n",
" <th>case_barcode</th>\n",
" <th>case_gdc_id</th>\n",
" <th>sample_type</th>\n",
" <th>sample_type_name</th>\n",
" <th>program_name</th>\n",
" <th>project_short_name</th>\n",
" <th>batch_number</th>\n",
" <th>bcr</th>\n",
" <th>...</th>\n",
" <th>max_percent_tumor_cells</th>\n",
" <th>max_percent_tumor_nuclei</th>\n",
" <th>min_percent_lymphocyte_infiltration</th>\n",
" <th>min_percent_monocyte_infiltration</th>\n",
" <th>min_percent_necrosis</th>\n",
" <th>min_percent_neutrophil_infiltration</th>\n",
" <th>min_percent_normal_cells</th>\n",
" <th>min_percent_stromal_cells</th>\n",
" <th>min_percent_tumor_cells</th>\n",
" <th>min_percent_tumor_nuclei</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>TCGA-55-7914-01A</td>\n",
" <td>356e0f37-c095-455e-b3ea-252afa34806e</td>\n",
" <td>TCGA-55-7914</td>\n",
" <td>69dba721-a168-47a4-b7ff-80a448bad654</td>\n",
" <td>01</td>\n",
" <td>Primary solid Tumor</td>\n",
" <td>TCGA</td>\n",
" <td>TCGA-LUAD</td>\n",
" <td>196</td>\n",
" <td>Nationwide Children's Hospital</td>\n",
" <td>...</td>\n",
" <td>90.0</td>\n",
" <td>60.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>10.0</td>\n",
" <td>90.0</td>\n",
" <td>60.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>TCGA-93-8067-10A</td>\n",
" <td>4d489693-6c2d-486f-a33b-70a760f9ce8c</td>\n",
" <td>TCGA-93-8067</td>\n",
" <td>bbe88801-34f3-46d2-bbfd-b46c3901ed71</td>\n",
" <td>10</td>\n",
" <td>Blood Derived Normal</td>\n",
" <td>TCGA</td>\n",
" <td>TCGA-LUAD</td>\n",
" <td>222</td>\n",
" <td>Nationwide Children's Hospital</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>TCGA-55-6981-01A</td>\n",
" <td>2b87b12c-5ca7-4492-a2f6-26f9644e1d5a</td>\n",
" <td>TCGA-55-6981</td>\n",
" <td>0a45f302-5748-48f3-9dc9-66c01843a68e</td>\n",
" <td>01</td>\n",
" <td>Primary solid Tumor</td>\n",
" <td>TCGA</td>\n",
" <td>TCGA-LUAD</td>\n",
" <td>160</td>\n",
" <td>Nationwide Children's Hospital</td>\n",
" <td>...</td>\n",
" <td>100.0</td>\n",
" <td>72.0</td>\n",
" <td>16.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>100.0</td>\n",
" <td>60.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>TCGA-75-5147-01A</td>\n",
" <td>f9f9c1dd-8462-4560-82b8-aea4309854c9</td>\n",
" <td>TCGA-75-5147</td>\n",
" <td>d2824e6d-3784-45c2-9b0f-52b17356b5da</td>\n",
" <td>01</td>\n",
" <td>Primary solid Tumor</td>\n",
" <td>TCGA</td>\n",
" <td>TCGA-LUAD</td>\n",
" <td>84</td>\n",
" <td>Nationwide Children's Hospital</td>\n",
" <td>...</td>\n",
" <td>80.0</td>\n",
" <td>70.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>5.0</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>10.0</td>\n",
" <td>80.0</td>\n",
" <td>65.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>TCGA-91-8496-01A</td>\n",
" <td>8f55069f-2232-44b6-b9e8-7e12b3a268bd</td>\n",
" <td>TCGA-91-8496</td>\n",
" <td>656a5eb4-e4a5-4d21-a800-3586f4d6588b</td>\n",
" <td>01</td>\n",
" <td>Primary solid Tumor</td>\n",
" <td>TCGA</td>\n",
" <td>TCGA-LUAD</td>\n",
" <td>238</td>\n",
" <td>Nationwide Children's Hospital</td>\n",
" <td>...</td>\n",
" <td>75.0</td>\n",
" <td>75.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>15.0</td>\n",
" <td>10.0</td>\n",
" <td>30.0</td>\n",
" <td>40.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1298</th>\n",
" <td>TCGA-97-A4M5-01A</td>\n",
" <td>091e0712-7fde-442c-aef1-5c504f91b227</td>\n",
" <td>TCGA-97-A4M5</td>\n",
" <td>5fe77d4a-a8a5-4c90-8ff2-9c3bbbb309ef</td>\n",
" <td>01</td>\n",
" <td>Primary solid Tumor</td>\n",
" <td>TCGA</td>\n",
" <td>TCGA-LUAD</td>\n",
" <td>264</td>\n",
" <td>Nationwide Children's Hospital</td>\n",
" <td>...</td>\n",
" <td>35.0</td>\n",
" <td>75.0</td>\n",
" <td>3.0</td>\n",
" <td>3.0</td>\n",
" <td>20.0</td>\n",
" <td>1.0</td>\n",
" <td>18.0</td>\n",
" <td>27.0</td>\n",
" <td>35.0</td>\n",
" <td>75.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1299</th>\n",
" <td>TCGA-50-5930-11A</td>\n",
" <td>4cddb392-fc4b-4400-a44f-2e92a78b3e97</td>\n",
" <td>TCGA-50-5930</td>\n",
" <td>368e23f0-e573-4547-bf5a-14080baf737b</td>\n",
" <td>11</td>\n",
" <td>Solid Tissue Normal</td>\n",
" <td>TCGA</td>\n",
" <td>TCGA-LUAD</td>\n",
" <td>119</td>\n",
" <td>Nationwide Children's Hospital</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1300</th>\n",
" <td>TCGA-78-7145-01A</td>\n",
" <td>6b0b11c9-2adc-4b55-b7bb-cf67b11cab86</td>\n",
" <td>TCGA-78-7145</td>\n",
" <td>4ef872e1-82c9-4939-9248-41ed9d3085b2</td>\n",
" <td>01</td>\n",
" <td>Primary solid Tumor</td>\n",
" <td>TCGA</td>\n",
" <td>TCGA-LUAD</td>\n",
" <td>166</td>\n",
" <td>Nationwide Children's Hospital</td>\n",
" <td>...</td>\n",
" <td>85.0</td>\n",
" <td>80.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>5.0</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>10.0</td>\n",
" <td>75.0</td>\n",
" <td>75.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1301</th>\n",
" <td>TCGA-05-4403-01A</td>\n",
" <td>44b1d21c-0794-428c-ba01-82812d74da2f</td>\n",
" <td>TCGA-05-4403</td>\n",
" <td>ce15f31f-2bad-4485-96fa-495bfa262e66</td>\n",
" <td>01</td>\n",
" <td>Primary solid Tumor</td>\n",
" <td>TCGA</td>\n",
" <td>TCGA-LUAD</td>\n",
" <td>58</td>\n",
" <td>Nationwide Children's Hospital</td>\n",
" <td>...</td>\n",
" <td>80.0</td>\n",
" <td>75.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>5.0</td>\n",
" <td>15.0</td>\n",
" <td>80.0</td>\n",
" <td>65.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1302</th>\n",
" <td>TCGA-91-6828-10A</td>\n",
" <td>4a3b397b-ad28-4817-a3ec-ef0b80996384</td>\n",
" <td>TCGA-91-6828</td>\n",
" <td>9536e32d-2707-48d2-a36d-08c521665bb9</td>\n",
" <td>10</td>\n",
" <td>Blood Derived Normal</td>\n",
" <td>TCGA</td>\n",
" <td>TCGA-LUAD</td>\n",
" <td>144</td>\n",
" <td>Nationwide Children's Hospital</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1303 rows × 39 columns</p>\n",
"</div>"
],
"text/plain": [
" sample_barcode sample_gdc_id case_barcode \\\n",
"0 TCGA-55-7914-01A 356e0f37-c095-455e-b3ea-252afa34806e TCGA-55-7914 \n",
"1 TCGA-93-8067-10A 4d489693-6c2d-486f-a33b-70a760f9ce8c TCGA-93-8067 \n",
"2 TCGA-55-6981-01A 2b87b12c-5ca7-4492-a2f6-26f9644e1d5a TCGA-55-6981 \n",
"3 TCGA-75-5147-01A f9f9c1dd-8462-4560-82b8-aea4309854c9 TCGA-75-5147 \n",
"4 TCGA-91-8496-01A 8f55069f-2232-44b6-b9e8-7e12b3a268bd TCGA-91-8496 \n",
"... ... ... ... \n",
"1298 TCGA-97-A4M5-01A 091e0712-7fde-442c-aef1-5c504f91b227 TCGA-97-A4M5 \n",
"1299 TCGA-50-5930-11A 4cddb392-fc4b-4400-a44f-2e92a78b3e97 TCGA-50-5930 \n",
"1300 TCGA-78-7145-01A 6b0b11c9-2adc-4b55-b7bb-cf67b11cab86 TCGA-78-7145 \n",
"1301 TCGA-05-4403-01A 44b1d21c-0794-428c-ba01-82812d74da2f TCGA-05-4403 \n",
"1302 TCGA-91-6828-10A 4a3b397b-ad28-4817-a3ec-ef0b80996384 TCGA-91-6828 \n",
"\n",
" case_gdc_id sample_type sample_type_name \\\n",
"0 69dba721-a168-47a4-b7ff-80a448bad654 01 Primary solid Tumor \n",
"1 bbe88801-34f3-46d2-bbfd-b46c3901ed71 10 Blood Derived Normal \n",
"2 0a45f302-5748-48f3-9dc9-66c01843a68e 01 Primary solid Tumor \n",
"3 d2824e6d-3784-45c2-9b0f-52b17356b5da 01 Primary solid Tumor \n",
"4 656a5eb4-e4a5-4d21-a800-3586f4d6588b 01 Primary solid Tumor \n",
"... ... ... ... \n",
"1298 5fe77d4a-a8a5-4c90-8ff2-9c3bbbb309ef 01 Primary solid Tumor \n",
"1299 368e23f0-e573-4547-bf5a-14080baf737b 11 Solid Tissue Normal \n",
"1300 4ef872e1-82c9-4939-9248-41ed9d3085b2 01 Primary solid Tumor \n",
"1301 ce15f31f-2bad-4485-96fa-495bfa262e66 01 Primary solid Tumor \n",
"1302 9536e32d-2707-48d2-a36d-08c521665bb9 10 Blood Derived Normal \n",
"\n",
" program_name project_short_name batch_number \\\n",
"0 TCGA TCGA-LUAD 196 \n",
"1 TCGA TCGA-LUAD 222 \n",
"2 TCGA TCGA-LUAD 160 \n",
"3 TCGA TCGA-LUAD 84 \n",
"4 TCGA TCGA-LUAD 238 \n",
"... ... ... ... \n",
"1298 TCGA TCGA-LUAD 264 \n",
"1299 TCGA TCGA-LUAD 119 \n",
"1300 TCGA TCGA-LUAD 166 \n",
"1301 TCGA TCGA-LUAD 58 \n",
"1302 TCGA TCGA-LUAD 144 \n",
"\n",
" bcr ... max_percent_tumor_cells \\\n",
"0 Nationwide Children's Hospital ... 90.0 \n",
"1 Nationwide Children's Hospital ... NaN \n",
"2 Nationwide Children's Hospital ... 100.0 \n",
"3 Nationwide Children's Hospital ... 80.0 \n",
"4 Nationwide Children's Hospital ... 75.0 \n",
"... ... ... ... \n",
"1298 Nationwide Children's Hospital ... 35.0 \n",
"1299 Nationwide Children's Hospital ... NaN \n",
"1300 Nationwide Children's Hospital ... 85.0 \n",
"1301 Nationwide Children's Hospital ... 80.0 \n",
"1302 Nationwide Children's Hospital ... NaN \n",
"\n",
" max_percent_tumor_nuclei min_percent_lymphocyte_infiltration \\\n",
"0 60.0 NaN \n",
"1 NaN NaN \n",
"2 72.0 16.0 \n",
"3 70.0 NaN \n",
"4 75.0 NaN \n",
"... ... ... \n",
"1298 75.0 3.0 \n",
"1299 NaN NaN \n",
"1300 80.0 NaN \n",
"1301 75.0 NaN \n",
"1302 NaN NaN \n",
"\n",
" min_percent_monocyte_infiltration min_percent_necrosis \\\n",
"0 NaN 0.0 \n",
"1 NaN NaN \n",
"2 0.0 0.0 \n",
"3 NaN 5.0 \n",
"4 NaN 0.0 \n",
"... ... ... \n",
"1298 3.0 20.0 \n",
"1299 NaN NaN \n",
"1300 NaN 5.0 \n",
"1301 NaN 0.0 \n",
"1302 NaN NaN \n",
"\n",
" min_percent_neutrophil_infiltration min_percent_normal_cells \\\n",
"0 NaN 0.0 \n",
"1 NaN NaN \n",
"2 0.0 0.0 \n",
"3 NaN 0.0 \n",
"4 NaN 15.0 \n",
"... ... ... \n",
"1298 1.0 18.0 \n",
"1299 NaN NaN \n",
"1300 NaN 0.0 \n",
"1301 NaN 5.0 \n",
"1302 NaN NaN \n",
"\n",
" min_percent_stromal_cells min_percent_tumor_cells \\\n",
"0 10.0 90.0 \n",
"1 NaN NaN \n",
"2 0.0 100.0 \n",
"3 10.0 80.0 \n",
"4 10.0 30.0 \n",
"... ... ... \n",
"1298 27.0 35.0 \n",
"1299 NaN NaN \n",
"1300 10.0 75.0 \n",
"1301 15.0 80.0 \n",
"1302 NaN NaN \n",
"\n",
" min_percent_tumor_nuclei \n",
"0 60.0 \n",
"1 NaN \n",
"2 60.0 \n",
"3 65.0 \n",
"4 40.0 \n",
"... ... \n",
"1298 75.0 \n",
"1299 NaN \n",
"1300 75.0 \n",
"1301 65.0 \n",
"1302 NaN \n",
"\n",
"[1303 rows x 39 columns]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Fetch every biospecimen column for the TCGA-LUAD project.\n",
"query = \"\"\"\n",
"select *\n",
"from isb-cgc-bq.TCGA.biospecimen_gdc_current\n",
"where project_short_name = 'TCGA-LUAD'\n",
"\"\"\"\n",
"\n",
"pd.read_gbq(query=query, progress_bar_type=\"tqdm\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "8f989ad3-f520-434a-8589-24af44168efa",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Downloading: 100%|██████████| 31635/31635 [00:04<00:00, 7699.01rows/s]\n"
]
}
],
"source": [
"# Inner-join TCGA-LUAD biospecimen samples to their hg19 per-sample file\n",
"# metadata on sample_barcode, ordered by sample barcode.\n",
"query = \"\"\"\n",
"SELECT \n",
" a.sample_barcode,\n",
" a.sample_type,\n",
" a.sample_type_name,\n",
" a.is_ffpe,\n",
" b.data_type,\n",
" b.data_category,\n",
" b.experimental_strategy,\n",
" b.file_type,\n",
" b.file_size,\n",
" b.data_format,\n",
" b.platform,\n",
" b.file_name_key,\n",
" b.access,\n",
" b.acl\n",
"from `isb-cgc-bq.TCGA.biospecimen_gdc_current` a\n",
" join `isb-cgc-bq.TCGA.per_sample_file_metadata_hg19_gdc_current` b\n",
" on a.sample_barcode = b.sample_barcode\n",
"where a.project_short_name = 'TCGA-LUAD'\n",
"order by a.sample_barcode\n",
"\"\"\"\n",
"\n",
"df_genomic_stuff_hg19 = pd.read_gbq(query=query, progress_bar_type=\"tqdm\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "d9233d59-7c8c-4a8a-9443-4c9078430b4e",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>sample_barcode</th>\n",
" <th>sample_type</th>\n",
" <th>sample_type_name</th>\n",
" <th>is_ffpe</th>\n",
" <th>data_type</th>\n",
" <th>data_category</th>\n",
" <th>experimental_strategy</th>\n",
" <th>file_type</th>\n",
" <th>file_size</th>\n",
" <th>data_format</th>\n",
" <th>platform</th>\n",
" <th>file_name_key</th>\n",
" <th>access</th>\n",
" <th>acl</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>TCGA-05-4244-01A</td>\n",
" <td>01</td>\n",
" <td>Primary solid Tumor</td>\n",
" <td>NO</td>\n",
" <td>Genotypes</td>\n",
" <td>Simple nucleotide variation</td>\n",
" <td>Genotyping array</td>\n",
" <td>file</td>\n",
" <td>20850936</td>\n",
" <td>TXT</td>\n",
" <td>Affymetrix SNP Array 6.0</td>\n",
" <td>gs://gdc-tcga-phs000178-controlled/0e668a9c-92...</td>\n",
" <td>controlled</td>\n",
" <td>phs000178</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>TCGA-05-4244-01A</td>\n",
" <td>01</td>\n",
" <td>Primary solid Tumor</td>\n",
" <td>NO</td>\n",
" <td>Aligned reads</td>\n",
" <td>Raw sequencing data</td>\n",
" <td>miRNA-Seq</td>\n",
" <td>file</td>\n",
" <td>108192253</td>\n",
" <td>BAM</td>\n",
" <td>Illumina GA</td>\n",
" <td>gs://gdc-tcga-phs000178-controlled/2c4897d4-20...</td>\n",
" <td>controlled</td>\n",
" <td>phs000178</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>TCGA-05-4244-01A</td>\n",
" <td>01</td>\n",
" <td>Primary solid Tumor</td>\n",
" <td>NO</td>\n",
" <td>Aligned reads</td>\n",
" <td>Raw sequencing data</td>\n",
" <td>RNA-Seq</td>\n",
" <td>file</td>\n",
" <td>9339790525</td>\n",
" <td>BAM</td>\n",
" <td>Illumina HiSeq</td>\n",
" <td>gs://gdc-tcga-phs000178-controlled/LUAD/RNA/RN...</td>\n",
" <td>controlled</td>\n",
" <td>phs000178</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>TCGA-05-4244-01A</td>\n",
" <td>01</td>\n",
" <td>Primary solid Tumor</td>\n",
" <td>NO</td>\n",
" <td>Aligned reads</td>\n",
" <td>Raw sequencing data</td>\n",
" <td>miRNA-Seq</td>\n",
" <td>file</td>\n",
" <td>107422900</td>\n",
" <td>BAM</td>\n",
" <td>Illumina GA</td>\n",
" <td>gs://gdc-tcga-phs000178-controlled/LUAD/Total_...</td>\n",
" <td>controlled</td>\n",
" <td>phs000178</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>TCGA-05-4244-01A</td>\n",
" <td>01</td>\n",
" <td>Primary solid Tumor</td>\n",
" <td>NO</td>\n",
" <td>Aligned reads</td>\n",
" <td>Raw sequencing data</td>\n",
" <td>WXS</td>\n",
" <td>file</td>\n",
" <td>22424966198</td>\n",
" <td>BAM</td>\n",
" <td>Illumina HiSeq</td>\n",
" <td>gs://gdc-tcga-phs000178-controlled/LUAD/DNA/WX...</td>\n",
" <td>controlled</td>\n",
" <td>phs000178</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>31630</th>\n",
" <td>TCGA-S2-AA1A-10A</td>\n",
" <td>10</td>\n",
" <td>Blood Derived Normal</td>\n",
" <td>NO</td>\n",
" <td>Aligned reads</td>\n",
" <td>Raw sequencing data</td>\n",
" <td>WXS</td>\n",
" <td>file</td>\n",
" <td>9682094455</td>\n",
" <td>BAM</td>\n",
" <td>Illumina HiSeq</td>\n",
" <td>gs://gdc-tcga-phs000178-controlled/LUAD/DNA/WX...</td>\n",
" <td>controlled</td>\n",
" <td>phs000178</td>\n",
" </tr>\n",
" <tr>\n",
" <th>31631</th>\n",
" <td>TCGA-S2-AA1A-10A</td>\n",
" <td>10</td>\n",
" <td>Blood Derived Normal</td>\n",
" <td>NO</td>\n",
" <td>Raw intensities</td>\n",
" <td>Raw microarray data</td>\n",
" <td>Genotyping array</td>\n",
" <td>file</td>\n",
" <td>69094361</td>\n",
" <td>CEL</td>\n",
" <td>Affymetrix SNP Array 6.0</td>\n",
" <td>gs://gdc-tcga-phs000178-controlled/40fd2151-d8...</td>\n",
" <td>controlled</td>\n",
" <td>phs000178</td>\n",
" </tr>\n",
" <tr>\n",
" <th>31632</th>\n",
" <td>TCGA-S2-AA1A-10A</td>\n",
" <td>10</td>\n",
" <td>Blood Derived Normal</td>\n",
" <td>NO</td>\n",
" <td>Normalized copy numbers</td>\n",
" <td>Copy number variation</td>\n",
" <td>Genotyping array</td>\n",
" <td>file</td>\n",
" <td>62338924</td>\n",
" <td>TXT</td>\n",
" <td>Affymetrix SNP Array 6.0</td>\n",
" <td>gs://gdc-tcga-phs000178-controlled/44b407cf-eb...</td>\n",
" <td>controlled</td>\n",
" <td>phs000178</td>\n",
" </tr>\n",
" <tr>\n",
" <th>31633</th>\n",
" <td>TCGA-S2-AA1A-10A</td>\n",
" <td>10</td>\n",
" <td>Blood Derived Normal</td>\n",
" <td>NO</td>\n",
" <td>Simple nucleotide variation</td>\n",
" <td>Simple nucleotide variation</td>\n",
" <td>DNA-Seq</td>\n",
" <td>file</td>\n",
" <td>6885761</td>\n",
" <td>VCF</td>\n",
" <td>Illumina GA</td>\n",
" <td>gs://gdc-tcga-phs000178-controlled/127a5ca6-9a...</td>\n",
" <td>controlled</td>\n",
" <td>phs000178</td>\n",
" </tr>\n",
" <tr>\n",
" <th>31634</th>\n",
" <td>TCGA-S2-AA1A-10A</td>\n",
" <td>10</td>\n",
" <td>Blood Derived Normal</td>\n",
" <td>NO</td>\n",
" <td>Simple nucleotide variation</td>\n",
" <td>Simple nucleotide variation</td>\n",
" <td>DNA-Seq</td>\n",
" <td>file</td>\n",
" <td>13689</td>\n",
" <td>VCF</td>\n",
" <td>Illumina GA</td>\n",
" <td>gs://gdc-tcga-phs000178-controlled/f7f9051d-e2...</td>\n",
" <td>controlled</td>\n",
" <td>phs000178</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>31635 rows × 14 columns</p>\n",
"</div>"
],
"text/plain": [
" sample_barcode sample_type sample_type_name is_ffpe \\\n",
"0 TCGA-05-4244-01A 01 Primary solid Tumor NO \n",
"1 TCGA-05-4244-01A 01 Primary solid Tumor NO \n",
"2 TCGA-05-4244-01A 01 Primary solid Tumor NO \n",
"3 TCGA-05-4244-01A 01 Primary solid Tumor NO \n",
"4 TCGA-05-4244-01A 01 Primary solid Tumor NO \n",
"... ... ... ... ... \n",
"31630 TCGA-S2-AA1A-10A 10 Blood Derived Normal NO \n",
"31631 TCGA-S2-AA1A-10A 10 Blood Derived Normal NO \n",
"31632 TCGA-S2-AA1A-10A 10 Blood Derived Normal NO \n",
"31633 TCGA-S2-AA1A-10A 10 Blood Derived Normal NO \n",
"31634 TCGA-S2-AA1A-10A 10 Blood Derived Normal NO \n",
"\n",
" data_type data_category \\\n",
"0 Genotypes Simple nucleotide variation \n",
"1 Aligned reads Raw sequencing data \n",
"2 Aligned reads Raw sequencing data \n",
"3 Aligned reads Raw sequencing data \n",
"4 Aligned reads Raw sequencing data \n",
"... ... ... \n",
"31630 Aligned reads Raw sequencing data \n",
"31631 Raw intensities Raw microarray data \n",
"31632 Normalized copy numbers Copy number variation \n",
"31633 Simple nucleotide variation Simple nucleotide variation \n",
"31634 Simple nucleotide variation Simple nucleotide variation \n",
"\n",
" experimental_strategy file_type file_size data_format \\\n",
"0 Genotyping array file 20850936 TXT \n",
"1 miRNA-Seq file 108192253 BAM \n",
"2 RNA-Seq file 9339790525 BAM \n",
"3 miRNA-Seq file 107422900 BAM \n",
"4 WXS file 22424966198 BAM \n",
"... ... ... ... ... \n",
"31630 WXS file 9682094455 BAM \n",
"31631 Genotyping array file 69094361 CEL \n",
"31632 Genotyping array file 62338924 TXT \n",
"31633 DNA-Seq file 6885761 VCF \n",
"31634 DNA-Seq file 13689 VCF \n",
"\n",
" platform \\\n",
"0 Affymetrix SNP Array 6.0 \n",
"1 Illumina GA \n",
"2 Illumina HiSeq \n",
"3 Illumina GA \n",
"4 Illumina HiSeq \n",
"... ... \n",
"31630 Illumina HiSeq \n",
"31631 Affymetrix SNP Array 6.0 \n",
"31632 Affymetrix SNP Array 6.0 \n",
"31633 Illumina GA \n",
"31634 Illumina GA \n",
"\n",
" file_name_key access \\\n",
"0 gs://gdc-tcga-phs000178-controlled/0e668a9c-92... controlled \n",
"1 gs://gdc-tcga-phs000178-controlled/2c4897d4-20... controlled \n",
"2 gs://gdc-tcga-phs000178-controlled/LUAD/RNA/RN... controlled \n",
"3 gs://gdc-tcga-phs000178-controlled/LUAD/Total_... controlled \n",
"4 gs://gdc-tcga-phs000178-controlled/LUAD/DNA/WX... controlled \n",
"... ... ... \n",
"31630 gs://gdc-tcga-phs000178-controlled/LUAD/DNA/WX... controlled \n",
"31631 gs://gdc-tcga-phs000178-controlled/40fd2151-d8... controlled \n",
"31632 gs://gdc-tcga-phs000178-controlled/44b407cf-eb... controlled \n",
"31633 gs://gdc-tcga-phs000178-controlled/127a5ca6-9a... controlled \n",
"31634 gs://gdc-tcga-phs000178-controlled/f7f9051d-e2... controlled \n",
"\n",
" acl \n",
"0 phs000178 \n",
"1 phs000178 \n",
"2 phs000178 \n",
"3 phs000178 \n",
"4 phs000178 \n",
"... ... \n",
"31630 phs000178 \n",
"31631 phs000178 \n",
"31632 phs000178 \n",
"31633 phs000178 \n",
"31634 phs000178 \n",
"\n",
"[31635 rows x 14 columns]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_genomic_stuff_hg19"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "03a5e622-10fa-4997-89f8-d3b9cb16254f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"access data_format experimental_strategy\n",
"controlled BAM Bisulfite-Seq 6\n",
" RNA-Seq 639\n",
" VALIDATION 445\n",
" WGS 382\n",
" WXS 1377\n",
" miRNA-Seq 1019\n",
" CEL Genotyping array 1171\n",
" FASTQ RNA-Seq 601\n",
" TXT Genotyping array 5783\n",
" VCF Bisulfite-Seq 6\n",
" DNA-Seq 6324\n",
"open BED Bisulfite-Seq 6\n",
" TXT Gene expression array 96\n",
" Genotyping array 4564\n",
" Methylation array 1314\n",
" RNA-Seq 3942\n",
" Total RNA-Seq 144\n",
" WGS 258\n",
" miRNA-Seq 2244\n",
" idat Methylation array 1314\n",
"dtype: int64"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Tally rows per (access, data_format, experimental_strategy) combination;\n",
"# sort=False disables ordering the result by frequency.\n",
"df_genomic_stuff_hg19.value_counts(\n",
"    subset=[\"access\", \"data_format\", \"experimental_strategy\"],\n",
"    sort=False,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "3403cbb9-0674-48c9-89ae-17452b2055ee",
"metadata": {},
"outputs": [],
"source": [
"import pathlib"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "7e8c7a19-64f0-4599-9711-eefd4a0d9274",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"gs://gdc-tcga-phs000178-open/ffd741d6-d3b8-4074-a11a-2fe5598f4f80/unc.edu.e6a101b9-61f9-4ed1-a59f-d9db3fdb4555.1213505.rsem.genes.results\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>gene_id</th>\n",
" <th>raw_count</th>\n",
" <th>scaled_estimate</th>\n",
" <th>transcript_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>5519</th>\n",
" <td>ELF3|1999</td>\n",
" <td>35952.00</td>\n",
" <td>3.225764e-04</td>\n",
" <td>uc001gxg.3,uc001gxh.3,uc001gxi.3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6312</th>\n",
" <td>FCER2|2208</td>\n",
" <td>29.00</td>\n",
" <td>5.727658e-07</td>\n",
" <td>uc002mhm.2,uc002mhn.2,uc010dvo.2,uc010xjs.1,uc...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13248</th>\n",
" <td>PEX11B|8799</td>\n",
" <td>4752.00</td>\n",
" <td>8.984239e-05</td>\n",
" <td>uc001eny.1,uc010oyu.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6642</th>\n",
" <td>FOXRED2|80020</td>\n",
" <td>1141.00</td>\n",
" <td>6.996941e-06</td>\n",
" <td>uc003apm.3,uc003apn.3,uc003apo.3,uc003app.3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20089</th>\n",
" <td>ZNF323|64288</td>\n",
" <td>1012.00</td>\n",
" <td>9.271469e-06</td>\n",
" <td>uc003nla.2,uc003nlb.2,uc003nlc.2,uc003nld.2,uc...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>431</th>\n",
" <td>AGPAT2|10555</td>\n",
" <td>6829.00</td>\n",
" <td>1.354104e-04</td>\n",
" <td>uc004cii.1,uc004cij.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5295</th>\n",
" <td>DYDC2|84332</td>\n",
" <td>76.42</td>\n",
" <td>1.235370e-06</td>\n",
" <td>uc001kbz.1,uc001kca.1,uc001kcb.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9773</th>\n",
" <td>LOC100272228|100272228</td>\n",
" <td>551.00</td>\n",
" <td>6.170962e-06</td>\n",
" <td>uc004fea.2,uc004feb.2,uc004fec.1,uc004fed.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4017</th>\n",
" <td>COBLL1|22837</td>\n",
" <td>1448.00</td>\n",
" <td>8.531628e-06</td>\n",
" <td>uc002ucn.2,uc002uco.2,uc002ucp.2,uc002ucq.2,uc...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8845</th>\n",
" <td>KCNT1|57582</td>\n",
" <td>0.00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>uc004cgo.1,uc010nbf.2,uc011mdq.1,uc011mdr.1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>20531 rows × 4 columns</p>\n",
"</div>"
],
"text/plain": [
" gene_id raw_count scaled_estimate \\\n",
"5519 ELF3|1999 35952.00 3.225764e-04 \n",
"6312 FCER2|2208 29.00 5.727658e-07 \n",
"13248 PEX11B|8799 4752.00 8.984239e-05 \n",
"6642 FOXRED2|80020 1141.00 6.996941e-06 \n",
"20089 ZNF323|64288 1012.00 9.271469e-06 \n",
"... ... ... ... \n",
"431 AGPAT2|10555 6829.00 1.354104e-04 \n",
"5295 DYDC2|84332 76.42 1.235370e-06 \n",
"9773 LOC100272228|100272228 551.00 6.170962e-06 \n",
"4017 COBLL1|22837 1448.00 8.531628e-06 \n",
"8845 KCNT1|57582 0.00 0.000000e+00 \n",
"\n",
" transcript_id \n",
"5519 uc001gxg.3,uc001gxh.3,uc001gxi.3 \n",
"6312 uc002mhm.2,uc002mhn.2,uc010dvo.2,uc010xjs.1,uc... \n",
"13248 uc001eny.1,uc010oyu.1 \n",
"6642 uc003apm.3,uc003apn.3,uc003apo.3,uc003app.3 \n",
"20089 uc003nla.2,uc003nlb.2,uc003nlc.2,uc003nld.2,uc... \n",
"... ... \n",
"431 uc004cii.1,uc004cij.1 \n",
"5295 uc001kbz.1,uc001kca.1,uc001kcb.1 \n",
"9773 uc004fea.2,uc004feb.2,uc004fec.1,uc004fed.1 \n",
"4017 uc002ucn.2,uc002uco.2,uc002ucp.2,uc002ucq.2,uc... \n",
"8845 uc004cgo.1,uc010nbf.2,uc011mdq.1,uc011mdr.1 \n",
"\n",
"[20531 rows x 4 columns]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"uri = df_genomic_stuff_hg19.loc[\n",
" lambda row: row[\"file_name_key\"].str.match(\".*genes\\.results$\")\n",
"].iloc[0][\"file_name_key\"]\n",
"\n",
"print(uri)\n",
"\n",
"pd.read_csv(uri, sep=\"\\t\").sample(frac=1)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "aa7b2109-df64-4aab-951c-bd0b6c48034e",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Downloading: 100%|██████████| 36926/36926 [00:04<00:00, 8081.68rows/s]\n"
]
}
],
"source": [
"query = \"\"\"\n",
"SELECT \n",
" a.sample_barcode,\n",
" a.sample_type,\n",
" a.sample_type_name,\n",
" a.is_ffpe,\n",
" b.data_type,\n",
" b.data_category,\n",
" b.experimental_strategy,\n",
" b.file_type,\n",
" b.file_size,\n",
" b.data_format,\n",
" b.platform,\n",
" b.file_name_key,\n",
" b.access,\n",
" b.acl\n",
"from `isb-cgc-bq.TCGA.biospecimen_gdc_current` a\n",
" join `isb-cgc-bq.TCGA.per_sample_file_metadata_hg38_gdc_current` b\n",
" on a.sample_barcode = b.sample_barcode\n",
"where a.project_short_name = 'TCGA-LUAD'\n",
"order by a.sample_barcode\n",
"\"\"\"\n",
"\n",
"df_genomic_stuff_hg38 = pd.read_gbq(\n",
" query,\n",
" progress_bar_type=\"tqdm\",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "17d8e116-11a6-45f7-afc2-e654564c45a8",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>sample_barcode</th>\n",
" <th>sample_type</th>\n",
" <th>sample_type_name</th>\n",
" <th>is_ffpe</th>\n",
" <th>data_type</th>\n",
" <th>data_category</th>\n",
" <th>experimental_strategy</th>\n",
" <th>file_type</th>\n",
" <th>file_size</th>\n",
" <th>data_format</th>\n",
" <th>platform</th>\n",
" <th>file_name_key</th>\n",
" <th>access</th>\n",
" <th>acl</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>TCGA-05-4244-01A</td>\n",
" <td>01</td>\n",
" <td>Primary solid Tumor</td>\n",
" <td>NO</td>\n",
" <td>Slide Image</td>\n",
" <td>Biospecimen</td>\n",
" <td>Tissue Slide</td>\n",
" <td>slide_image</td>\n",
" <td>34081131</td>\n",
" <td>SVS</td>\n",
" <td>None</td>\n",
" <td>gs://gdc-tcga-phs000178-open/65d182ad-0f56-42f...</td>\n",
" <td>open</td>\n",
" <td>open</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>TCGA-05-4244-01A</td>\n",
" <td>01</td>\n",
" <td>Primary solid Tumor</td>\n",
" <td>NO</td>\n",
" <td>Slide Image</td>\n",
" <td>Biospecimen</td>\n",
" <td>Tissue Slide</td>\n",
" <td>slide_image</td>\n",
" <td>156857419</td>\n",
" <td>SVS</td>\n",
" <td>None</td>\n",
" <td>gs://gdc-tcga-phs000178-open/55277208-4cb2-441...</td>\n",
" <td>open</td>\n",
" <td>open</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>TCGA-05-4244-01A</td>\n",
" <td>01</td>\n",
" <td>Primary solid Tumor</td>\n",
" <td>NO</td>\n",
" <td>Aligned Reads</td>\n",
" <td>Sequencing Reads</td>\n",
" <td>WXS</td>\n",
" <td>aligned_reads</td>\n",
" <td>26658195494</td>\n",
" <td>BAM</td>\n",
" <td>Illumina</td>\n",
" <td>gs://gdc-tcga-phs000178-controlled/0c92ec5d-5c...</td>\n",
" <td>controlled</td>\n",
" <td>phs000178</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>TCGA-05-4244-01A</td>\n",
" <td>01</td>\n",
" <td>Primary solid Tumor</td>\n",
" <td>NO</td>\n",
" <td>Aligned Reads</td>\n",
" <td>Sequencing Reads</td>\n",
" <td>RNA-Seq</td>\n",
" <td>aligned_reads</td>\n",
" <td>19123725739</td>\n",
" <td>BAM</td>\n",
" <td>Illumina</td>\n",
" <td>gs://gdc-tcga-phs000178-controlled/3fef6a25-0f...</td>\n",
" <td>controlled</td>\n",
" <td>phs000178</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>TCGA-05-4244-01A</td>\n",
" <td>01</td>\n",
" <td>Primary solid Tumor</td>\n",
" <td>NO</td>\n",
" <td>Aligned Reads</td>\n",
" <td>Sequencing Reads</td>\n",
" <td>miRNA-Seq</td>\n",
" <td>aligned_reads</td>\n",
" <td>78929804</td>\n",
" <td>BAM</td>\n",
" <td>Illumina</td>\n",
" <td>gs://gdc-tcga-phs000178-controlled/97b66e25-06...</td>\n",
" <td>controlled</td>\n",
" <td>phs000178</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>36921</th>\n",
" <td>TCGA-S2-AA1A-10A</td>\n",
" <td>10</td>\n",
" <td>Blood Derived Normal</td>\n",
" <td>NO</td>\n",
" <td>Raw Simple Somatic Mutation</td>\n",
" <td>Simple Nucleotide Variation</td>\n",
" <td>WXS</td>\n",
" <td>simple_somatic_mutation</td>\n",
" <td>118243</td>\n",
" <td>VCF</td>\n",
" <td>None</td>\n",
" <td>gs://gdc-tcga-phs000178-controlled/21e7cbdb-ec...</td>\n",
" <td>controlled</td>\n",
" <td>phs000178</td>\n",
" </tr>\n",
" <tr>\n",
" <th>36922</th>\n",
" <td>TCGA-S2-AA1A-10A</td>\n",
" <td>10</td>\n",
" <td>Blood Derived Normal</td>\n",
" <td>NO</td>\n",
" <td>Raw Simple Somatic Mutation</td>\n",
" <td>Simple Nucleotide Variation</td>\n",
" <td>WXS</td>\n",
" <td>simple_somatic_mutation</td>\n",
" <td>150025</td>\n",
" <td>VCF</td>\n",
" <td>None</td>\n",
" <td>gs://gdc-tcga-phs000178-controlled/da3a7488-1d...</td>\n",
" <td>controlled</td>\n",
" <td>phs000178</td>\n",
" </tr>\n",
" <tr>\n",
" <th>36923</th>\n",
" <td>TCGA-S2-AA1A-10A</td>\n",
" <td>10</td>\n",
" <td>Blood Derived Normal</td>\n",
" <td>NO</td>\n",
" <td>Raw Simple Somatic Mutation</td>\n",
" <td>Simple Nucleotide Variation</td>\n",
" <td>WXS</td>\n",
" <td>simple_somatic_mutation</td>\n",
" <td>25753</td>\n",
" <td>VCF</td>\n",
" <td>None</td>\n",
" <td>gs://gdc-tcga-phs000178-controlled/ad68d502-93...</td>\n",
" <td>controlled</td>\n",
" <td>phs000178</td>\n",
" </tr>\n",
" <tr>\n",
" <th>36924</th>\n",
" <td>TCGA-S2-AA1A-10A</td>\n",
" <td>10</td>\n",
" <td>Blood Derived Normal</td>\n",
" <td>NO</td>\n",
" <td>Raw Simple Somatic Mutation</td>\n",
" <td>Simple Nucleotide Variation</td>\n",
" <td>WXS</td>\n",
" <td>simple_somatic_mutation</td>\n",
" <td>23785</td>\n",
" <td>VCF</td>\n",
" <td>None</td>\n",
" <td>gs://gdc-tcga-phs000178-controlled/89cb4cd6-f4...</td>\n",
" <td>controlled</td>\n",
" <td>phs000178</td>\n",
" </tr>\n",
" <tr>\n",
" <th>36925</th>\n",
" <td>TCGA-S2-AA1A-10A</td>\n",
" <td>10</td>\n",
" <td>Blood Derived Normal</td>\n",
" <td>NO</td>\n",
" <td>Allele-specific Copy Number Segment</td>\n",
" <td>Copy Number Variation</td>\n",
" <td>Genotyping Array</td>\n",
" <td>copy_number_segment</td>\n",
" <td>8958</td>\n",
" <td>TXT</td>\n",
" <td>Affymetrix SNP 6.0</td>\n",
" <td>gs://gdc-tcga-phs000178-open/234d31fd-2f43-4bf...</td>\n",
" <td>open</td>\n",
" <td>open</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>36926 rows × 14 columns</p>\n",
"</div>"
],
"text/plain": [
" sample_barcode sample_type sample_type_name is_ffpe \\\n",
"0 TCGA-05-4244-01A 01 Primary solid Tumor NO \n",
"1 TCGA-05-4244-01A 01 Primary solid Tumor NO \n",
"2 TCGA-05-4244-01A 01 Primary solid Tumor NO \n",
"3 TCGA-05-4244-01A 01 Primary solid Tumor NO \n",
"4 TCGA-05-4244-01A 01 Primary solid Tumor NO \n",
"... ... ... ... ... \n",
"36921 TCGA-S2-AA1A-10A 10 Blood Derived Normal NO \n",
"36922 TCGA-S2-AA1A-10A 10 Blood Derived Normal NO \n",
"36923 TCGA-S2-AA1A-10A 10 Blood Derived Normal NO \n",
"36924 TCGA-S2-AA1A-10A 10 Blood Derived Normal NO \n",
"36925 TCGA-S2-AA1A-10A 10 Blood Derived Normal NO \n",
"\n",
" data_type data_category \\\n",
"0 Slide Image Biospecimen \n",
"1 Slide Image Biospecimen \n",
"2 Aligned Reads Sequencing Reads \n",
"3 Aligned Reads Sequencing Reads \n",
"4 Aligned Reads Sequencing Reads \n",
"... ... ... \n",
"36921 Raw Simple Somatic Mutation Simple Nucleotide Variation \n",
"36922 Raw Simple Somatic Mutation Simple Nucleotide Variation \n",
"36923 Raw Simple Somatic Mutation Simple Nucleotide Variation \n",
"36924 Raw Simple Somatic Mutation Simple Nucleotide Variation \n",
"36925 Allele-specific Copy Number Segment Copy Number Variation \n",
"\n",
" experimental_strategy file_type file_size data_format \\\n",
"0 Tissue Slide slide_image 34081131 SVS \n",
"1 Tissue Slide slide_image 156857419 SVS \n",
"2 WXS aligned_reads 26658195494 BAM \n",
"3 RNA-Seq aligned_reads 19123725739 BAM \n",
"4 miRNA-Seq aligned_reads 78929804 BAM \n",
"... ... ... ... ... \n",
"36921 WXS simple_somatic_mutation 118243 VCF \n",
"36922 WXS simple_somatic_mutation 150025 VCF \n",
"36923 WXS simple_somatic_mutation 25753 VCF \n",
"36924 WXS simple_somatic_mutation 23785 VCF \n",
"36925 Genotyping Array copy_number_segment 8958 TXT \n",
"\n",
" platform file_name_key \\\n",
"0 None gs://gdc-tcga-phs000178-open/65d182ad-0f56-42f... \n",
"1 None gs://gdc-tcga-phs000178-open/55277208-4cb2-441... \n",
"2 Illumina gs://gdc-tcga-phs000178-controlled/0c92ec5d-5c... \n",
"3 Illumina gs://gdc-tcga-phs000178-controlled/3fef6a25-0f... \n",
"4 Illumina gs://gdc-tcga-phs000178-controlled/97b66e25-06... \n",
"... ... ... \n",
"36921 None gs://gdc-tcga-phs000178-controlled/21e7cbdb-ec... \n",
"36922 None gs://gdc-tcga-phs000178-controlled/da3a7488-1d... \n",
"36923 None gs://gdc-tcga-phs000178-controlled/ad68d502-93... \n",
"36924 None gs://gdc-tcga-phs000178-controlled/89cb4cd6-f4... \n",
"36925 Affymetrix SNP 6.0 gs://gdc-tcga-phs000178-open/234d31fd-2f43-4bf... \n",
"\n",
" access acl \n",
"0 open open \n",
"1 open open \n",
"2 controlled phs000178 \n",
"3 controlled phs000178 \n",
"4 controlled phs000178 \n",
"... ... ... \n",
"36921 controlled phs000178 \n",
"36922 controlled phs000178 \n",
"36923 controlled phs000178 \n",
"36924 controlled phs000178 \n",
"36925 open open \n",
"\n",
"[36926 rows x 14 columns]"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_genomic_stuff_hg38"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "80adfbfc-5bc8-450a-9e44-75ec7746ce11",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"access data_format experimental_strategy\n",
"controlled BAM ATAC-Seq 22\n",
" RNA-Seq 1794\n",
" WGS 196\n",
" WXS 1300\n",
" miRNA-Seq 567\n",
" BEDPE RNA-Seq 1196\n",
" MAF WXS 7536\n",
" TSV RNA-Seq 1794\n",
" VCF WXS 12600\n",
"open IDAT Methylation Array 1314\n",
" MAF WXS 1236\n",
" SVS Tissue Slide 1067\n",
" TSV RNA-Seq 598\n",
" TXT Genotyping Array 3915\n",
" Methylation Array 657\n",
" miRNA-Seq 1134\n",
"dtype: int64"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_genomic_stuff_hg38[\n",
" [\n",
" \"access\",\n",
" \"data_format\",\n",
" \"experimental_strategy\",\n",
" ]\n",
"].value_counts(sort=False)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "397d99c3-426e-49d8-8287-50e72cd0e19a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"gs://gdc-tcga-phs000178-open/ef337612-6a73-4c29-a8b0-85557cbeaff4/e0e055b6-6800-40e7-bde5-718823408f0c.rna_seq.augmented_star_gene_counts.tsv\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>gene_id</th>\n",
" <th>gene_name</th>\n",
" <th>gene_type</th>\n",
" <th>unstranded</th>\n",
" <th>stranded_first</th>\n",
" <th>stranded_second</th>\n",
" <th>tpm_unstranded</th>\n",
" <th>fpkm_unstranded</th>\n",
" <th>fpkm_uq_unstranded</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>ENSG00000000003.15</td>\n",
" <td>TSPAN6</td>\n",
" <td>protein_coding</td>\n",
" <td>5001</td>\n",
" <td>2509</td>\n",
" <td>2492</td>\n",
" <td>62.5468</td>\n",
" <td>17.8238</td>\n",
" <td>18.5999</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>ENSG00000000005.6</td>\n",
" <td>TNMD</td>\n",
" <td>protein_coding</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0000</td>\n",
" <td>0.0000</td>\n",
" <td>0.0000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>ENSG00000000419.13</td>\n",
" <td>DPM1</td>\n",
" <td>protein_coding</td>\n",
" <td>1452</td>\n",
" <td>760</td>\n",
" <td>692</td>\n",
" <td>68.2465</td>\n",
" <td>19.4480</td>\n",
" <td>20.2948</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>ENSG00000000457.14</td>\n",
" <td>SCYL3</td>\n",
" <td>protein_coding</td>\n",
" <td>1308</td>\n",
" <td>1023</td>\n",
" <td>1005</td>\n",
" <td>10.7808</td>\n",
" <td>3.0722</td>\n",
" <td>3.2059</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>ENSG00000000460.17</td>\n",
" <td>C1orf112</td>\n",
" <td>protein_coding</td>\n",
" <td>789</td>\n",
" <td>787</td>\n",
" <td>816</td>\n",
" <td>7.4976</td>\n",
" <td>2.1366</td>\n",
" <td>2.2296</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>60654</th>\n",
" <td>ENSG00000288661.1</td>\n",
" <td>AL451106.1</td>\n",
" <td>protein_coding</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0000</td>\n",
" <td>0.0000</td>\n",
" <td>0.0000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>60659</th>\n",
" <td>ENSG00000288669.1</td>\n",
" <td>AC008763.4</td>\n",
" <td>protein_coding</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0000</td>\n",
" <td>0.0000</td>\n",
" <td>0.0000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>60661</th>\n",
" <td>ENSG00000288671.1</td>\n",
" <td>AC006486.3</td>\n",
" <td>protein_coding</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0000</td>\n",
" <td>0.0000</td>\n",
" <td>0.0000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>60662</th>\n",
" <td>ENSG00000288674.1</td>\n",
" <td>AL391628.1</td>\n",
" <td>protein_coding</td>\n",
" <td>25</td>\n",
" <td>14</td>\n",
" <td>11</td>\n",
" <td>0.1474</td>\n",
" <td>0.0420</td>\n",
" <td>0.0438</td>\n",
" </tr>\n",
" <tr>\n",
" <th>60663</th>\n",
" <td>ENSG00000288675.1</td>\n",
" <td>AP006621.6</td>\n",
" <td>protein_coding</td>\n",
" <td>17</td>\n",
" <td>31</td>\n",
" <td>20</td>\n",
" <td>0.5741</td>\n",
" <td>0.1636</td>\n",
" <td>0.1707</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>19962 rows × 9 columns</p>\n",
"</div>"
],
"text/plain": [
" gene_id gene_name gene_type unstranded \\\n",
"4 ENSG00000000003.15 TSPAN6 protein_coding 5001 \n",
"5 ENSG00000000005.6 TNMD protein_coding 0 \n",
"6 ENSG00000000419.13 DPM1 protein_coding 1452 \n",
"7 ENSG00000000457.14 SCYL3 protein_coding 1308 \n",
"8 ENSG00000000460.17 C1orf112 protein_coding 789 \n",
"... ... ... ... ... \n",
"60654 ENSG00000288661.1 AL451106.1 protein_coding 0 \n",
"60659 ENSG00000288669.1 AC008763.4 protein_coding 0 \n",
"60661 ENSG00000288671.1 AC006486.3 protein_coding 0 \n",
"60662 ENSG00000288674.1 AL391628.1 protein_coding 25 \n",
"60663 ENSG00000288675.1 AP006621.6 protein_coding 17 \n",
"\n",
" stranded_first stranded_second tpm_unstranded fpkm_unstranded \\\n",
"4 2509 2492 62.5468 17.8238 \n",
"5 0 0 0.0000 0.0000 \n",
"6 760 692 68.2465 19.4480 \n",
"7 1023 1005 10.7808 3.0722 \n",
"8 787 816 7.4976 2.1366 \n",
"... ... ... ... ... \n",
"60654 0 0 0.0000 0.0000 \n",
"60659 0 0 0.0000 0.0000 \n",
"60661 0 0 0.0000 0.0000 \n",
"60662 14 11 0.1474 0.0420 \n",
"60663 31 20 0.5741 0.1636 \n",
"\n",
" fpkm_uq_unstranded \n",
"4 18.5999 \n",
"5 0.0000 \n",
"6 20.2948 \n",
"7 3.2059 \n",
"8 2.2296 \n",
"... ... \n",
"60654 0.0000 \n",
"60659 0.0000 \n",
"60661 0.0000 \n",
"60662 0.0438 \n",
"60663 0.1707 \n",
"\n",
"[19962 rows x 9 columns]"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"uri = df_genomic_stuff_hg38.query(\n",
" \"access == 'open' and experimental_strategy == 'RNA-Seq'\"\n",
").iloc[0][\"file_name_key\"]\n",
"\n",
"print(uri)\n",
"\n",
"pd.read_csv(uri, sep=\"\\t\", skiprows=1).query(\"gene_type == 'protein_coding'\")"
]
}
],
"metadata": {
"environment": {
"kernel": "conda-env-deconv-py",
"name": "common-cpu.m91",
"type": "gcloud",
"uri": "gcr.io/deeplearning-platform-release/base-cpu:m91"
},
"kernelspec": {
"display_name": "Python [conda env:deconv]",
"language": "python",
"name": "conda-env-deconv-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment