Skip to content

Instantly share code, notes, and snippets.

@inodb
Last active May 7, 2018 20:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save inodb/26e77fb017d6b7b639a0d17d58b79728 to your computer and use it in GitHub Desktop.
Save inodb/26e77fb017d6b7b639a0d17d58b79728 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Check the range of mRNA v2 continuous values in pancan\n",
"\n",
"Data file generated on cgds_gdac db pipelines with:\n",
"```\n",
"echo \"select * from genetic_alteration where genetic_profile_id in (select genetic_profile_id from genetic_profile where stable_id like '%pan_can%%v2%mrna%' and datatype='CONTINUOUS')\" | mysql -h localhost -u username -ppassword database_name > genetic_alteration_pancan.tsv\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Keep only the first column of the headerless file as a Series.\n",
"first_1k = pd.read_csv(\"1000_values.txt\", header=None).iloc[:, 0]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Keep only the first column of the headerless file as a Series.\n",
"all_values = pd.read_csv(\"all_values.txt\", header=None).iloc[:, 0]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"780000"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of values loaded into first_1k.\n",
"first_1k.shape[0]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"206143299"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of values loaded into all_values.\n",
"all_values.shape[0]"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"-0.99121057347755-7092440.0 (mean: 1004.0774317541006, min >0 : 4.823772759010581e-07)\n"
]
}
],
"source": [
"def print_stats(x):\n",
"    \"\"\"Print the min, max, mean and smallest strictly positive value of x.\"\"\"\n",
"    lowest, highest, average = x.min(), x.max(), x.mean()\n",
"    smallest_positive = x[x > 0].min()\n",
"    summary = \"{} - {} (mean: {}, min >0 : {})\".format(\n",
"        lowest, highest, average, smallest_positive\n",
"    )\n",
"    print(summary)\n",
"\n",
"\n",
"print_stats(all_values)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# NaNs are dropped before plotting; log=True puts the counts on a log scale.\n",
"plt.hist(all_values.dropna(), log=True)\n",
"plt.title('All values mrna v2 TCGA pancan values')"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of values below 0: 1190144 (0.58%) of 206143299 total\n"
]
}
],
"source": [
"# Count the negatives once from the boolean mask instead of building the\n",
"# filtered Series twice; the printed text is unchanged.\n",
"n_total = len(all_values)\n",
"n_negative = int((all_values < 0).sum())\n",
"print(\"Number of values below 0: {} ({:.2f}%) of {} total\".format(\n",
"    n_negative,\n",
"    n_negative * 100.0 / n_total,\n",
"    n_total\n",
"))"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Text(0.5,1,'Distribution values < 1 (17.17% of total)')"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<matplotlib.figure.Figure at 0x1c481b5048>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Select the values below 1 once and reuse them for both the plot and the\n",
"# percentage in the title (NaN < 1 is False, so NaNs are excluded).\n",
"below_one = all_values[all_values < 1]\n",
"plt.hist(below_one, log=True)\n",
"plt.title(\"Distribution values < 1 ({:.2f}% of total)\".format(len(below_one) * 100.0 / len(all_values)))"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of values that are 0: 25329688 (12.29%)\n"
]
}
],
"source": [
"# Count the zeros once from the boolean mask instead of building the\n",
"# filtered Series twice; the printed text is unchanged.\n",
"n_zero = int((all_values == 0).sum())\n",
"print(\"Number of values that are 0: {} ({:.2f}%)\".format(\n",
"    n_zero,\n",
"    n_zero * 100.0 / len(all_values)\n",
"))"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of profiles:\n"
]
}
],
"source": [
"print(\"Number of profiles:\")"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 33\r\n"
]
}
],
"source": [
"# Skip the header row, then count the distinct values in the first column\n",
"# (presumably the genetic_profile_id - confirm against the TSV header).\n",
"!tail -n +2 genetic_alteration_pancan.tsv | cut -f1 | sort -u | wc -l"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of profiles with negative values:\n"
]
}
],
"source": [
"print(\"Number of profiles with negative values:\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Match a '-' only where it starts a value (line start, or right after\n",
"# whitespace or a comma). A bare grep '-' also matches exponents such as\n",
"# 4.8e-07 and hyphens inside identifiers, which can over-count profiles.\n",
"# NOTE(review): assumes values are tab- or comma-separated - confirm against\n",
"# the TSV. The previously recorded output came from the old pattern, so it\n",
"# was cleared; re-run this cell to refresh the count.\n",
"!grep -E '(^|[[:space:],])-[0-9.]' genetic_alteration_pancan.tsv | cut -f1 | sort -u | wc -l"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<matplotlib.figure.Figure at 0x10be16400>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# 50 log-spaced bin edges from 0.1 to 1,000,000; NaNs are dropped first.\n",
"plt.hist(\n",
"    all_values.dropna(),\n",
"    bins=np.logspace(np.log10(0.1), np.log10(1000000), 50),\n",
")\n",
"plt.xscale(\"log\")\n",
"plt.title(\"mrna v2 continuous data in pancan studies\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:anaconda3]",
"language": "python",
"name": "conda-env-anaconda3-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment