Created
July 6, 2020 09:18
-
-
Save OnlyBelter/904c9672b069fd20af749118a859baff to your computer and use it in GitHub Desktop.
Calculate TPM in Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Convert featureCounts read counts to TPM and RPKM-style values\n", | |
"\n", | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"\n", | |
"\n", | |
"def read_counts2tpm(df, sample_name):\n", | |
" \"\"\"\n", | |
" convert read counts to TPM (transcripts per million)\n", | |
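" :param df: a DataFrame holding featureCounts output, with 'Gene' and 'Length' columns plus one read-count column per sample\n", | |
" :param sample_name: a list of sample names; each must be a read-count column of df\n", | |
" :return: a DataFrame of TPM values indexed by gene, with one column per sample\n", | |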
" \"\"\"\n", | |
" result = df\n", | |
" sample_reads = result.loc[:, sample_name].copy()\n", | |
" gene_len = result.loc[:, ['Length']]\n", | |
" rate = sample_reads.values / gene_len.values\n", | |
" tpm = rate / np.sum(rate, axis=0).reshape(1, -1) * 1e6\n", | |
" return pd.DataFrame(data=tpm, columns=sample_name, index=df['Gene'])\n", | |
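"\n", | |
"# NOTE: the scaling below is 1e6, so the values are reads per base per million reads;\n", | |
"# conventional RPKM (per kilobase) is 1000x larger, assuming Length is given in bases.\n", | |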
"def read_counts2rpkm(df, sample_name):\n", | |
" result = df\n", | |
" sample_reads = result.loc[:, sample_name].copy()\n", | |
" gene_len = result.loc[:, ['Length']]\n", | |
" total_reads = np.sum(sample_reads.values, axis=0).reshape(1, -1)\n", | |
" rate = sample_reads.values / gene_len.values\n", | |
" tpm = rate / total_reads * 1e6\n", | |
" return pd.DataFrame(data=tpm, columns=sample_name, index=df['Gene'])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>Gene</th>\n", | |
" <th>Length</th>\n", | |
" <th>S1</th>\n", | |
" <th>S2</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>A</td>\n", | |
" <td>100</td>\n", | |
" <td>80</td>\n", | |
" <td>20</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>B</td>\n", | |
" <td>50</td>\n", | |
" <td>10</td>\n", | |
" <td>20</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>C</td>\n", | |
" <td>25</td>\n", | |
" <td>6</td>\n", | |
" <td>10</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>D</td>\n", | |
" <td>5</td>\n", | |
" <td>3</td>\n", | |
" <td>50</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>E</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>400</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" Gene Length S1 S2\n", | |
"0 A 100 80 20\n", | |
"1 B 50 10 20\n", | |
"2 C 25 6 10\n", | |
"3 D 5 3 50\n", | |
"4 E 1 1 400" | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# raw data: toy read counts for five genes (A-E) in two samples (S1, S2)\n", | |
"a = pd.DataFrame(data = {\n", | |
" 'Gene': (\"A\",\"B\",\"C\",\"D\",\"E\"),\n", | |
" 'Length': (100, 50, 25, 5, 1),\n", | |
" 'S1': (80, 10, 6, 3, 1),\n", | |
" 'S2': (20, 20, 10, 50, 400)\n", | |
"})\n", | |
"a" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>S1</th>\n", | |
" <th>S2</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Gene</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>A</th>\n", | |
" <td>281690.140845</td>\n", | |
" <td>486.618005</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>B</th>\n", | |
" <td>70422.535211</td>\n", | |
" <td>973.236010</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>C</th>\n", | |
" <td>84507.042254</td>\n", | |
" <td>973.236010</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>D</th>\n", | |
" <td>211267.605634</td>\n", | |
" <td>24330.900243</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>E</th>\n", | |
" <td>352112.676056</td>\n", | |
" <td>973236.009732</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" S1 S2\n", | |
"Gene \n", | |
"A 281690.140845 486.618005\n", | |
"B 70422.535211 973.236010\n", | |
"C 84507.042254 973.236010\n", | |
"D 211267.605634 24330.900243\n", | |
"E 352112.676056 973236.009732" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"tpm = read_counts2tpm(a, ['S1', 'S2'])\n", | |
"tpm" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"S1 1000000.0\n", | |
"S2 1000000.0\n", | |
"dtype: float64" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"tpm.sum()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"S1 200000.0\n", | |
"S2 200000.0\n", | |
"dtype: float64" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"tpm.mean()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>S1</th>\n", | |
" <th>S2</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Gene</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>A</th>\n", | |
" <td>8000.0</td>\n", | |
" <td>400.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>B</th>\n", | |
" <td>2000.0</td>\n", | |
" <td>800.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>C</th>\n", | |
" <td>2400.0</td>\n", | |
" <td>800.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>D</th>\n", | |
" <td>6000.0</td>\n", | |
" <td>20000.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>E</th>\n", | |
" <td>10000.0</td>\n", | |
" <td>800000.0</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" S1 S2\n", | |
"Gene \n", | |
"A 8000.0 400.0\n", | |
"B 2000.0 800.0\n", | |
"C 2400.0 800.0\n", | |
"D 6000.0 20000.0\n", | |
"E 10000.0 800000.0" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"rpkm = read_counts2rpkm(df=a, sample_name=['S1', 'S2'])\n", | |
"rpkm" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"S1 28400.0\n", | |
"S2 822000.0\n", | |
"dtype: float64" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"rpkm.sum()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"S1 5680.0\n", | |
"S2 164400.0\n", | |
"dtype: float64" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"rpkm.mean()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>S1</th>\n", | |
" <th>S2</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Gene</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>A</th>\n", | |
" <td>281690.140845</td>\n", | |
" <td>486.618005</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>B</th>\n", | |
" <td>70422.535211</td>\n", | |
" <td>973.236010</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>C</th>\n", | |
" <td>84507.042254</td>\n", | |
" <td>973.236010</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>D</th>\n", | |
" <td>211267.605634</td>\n", | |
" <td>24330.900243</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>E</th>\n", | |
" <td>352112.676056</td>\n", | |
" <td>973236.009732</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" S1 S2\n", | |
"Gene \n", | |
"A 281690.140845 486.618005\n", | |
"B 70422.535211 973.236010\n", | |
"C 84507.042254 973.236010\n", | |
"D 211267.605634 24330.900243\n", | |
"E 352112.676056 973236.009732" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# rescaling each sample's RPKM column to sum to one million gives the same result as tpm\n", | |
"rpkm / rpkm.sum() * 1000000" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.10" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment