Skip to content

Instantly share code, notes, and snippets.

@OnlyBelter
Created July 6, 2020 09:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save OnlyBelter/904c9672b069fd20af749118a859baff to your computer and use it in GitHub Desktop.
Save OnlyBelter/904c9672b069fd20af749118a859baff to your computer and use it in GitHub Desktop.
Calculate TPM in Python
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# \n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"\n",
"def read_counts2tpm(df, sample_name):\n",
" \"\"\"\n",
" convert read counts to TPM (transcripts per million)\n",
" :param df: a dataFrame contains the result coming from featureCounts\n",
" :param sample_name: a list, all sample names, same as the result of featureCounts\n",
" :return: TPM\n",
" \"\"\"\n",
" result = df\n",
" sample_reads = result.loc[:, sample_name].copy()\n",
" gene_len = result.loc[:, ['Length']]\n",
" rate = sample_reads.values / gene_len.values\n",
" tpm = rate / np.sum(rate, axis=0).reshape(1, -1) * 1e6\n",
" return pd.DataFrame(data=tpm, columns=sample_name, index=df['Gene'])\n",
"\n",
"def read_counts2rpkm(df, sample_name):\n",
" result = df\n",
" sample_reads = result.loc[:, sample_name].copy()\n",
" gene_len = result.loc[:, ['Length']]\n",
" total_reads = np.sum(sample_reads.values, axis=0).reshape(1, -1)\n",
" rate = sample_reads.values / gene_len.values\n",
" tpm = rate / total_reads * 1e6\n",
" return pd.DataFrame(data=tpm, columns=sample_name, index=df['Gene'])"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Gene</th>\n",
" <th>Length</th>\n",
" <th>S1</th>\n",
" <th>S2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>A</td>\n",
" <td>100</td>\n",
" <td>80</td>\n",
" <td>20</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>B</td>\n",
" <td>50</td>\n",
" <td>10</td>\n",
" <td>20</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>C</td>\n",
" <td>25</td>\n",
" <td>6</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>D</td>\n",
" <td>5</td>\n",
" <td>3</td>\n",
" <td>50</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>E</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>400</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Gene Length S1 S2\n",
"0 A 100 80 20\n",
"1 B 50 10 20\n",
"2 C 25 6 10\n",
"3 D 5 3 50\n",
"4 E 1 1 400"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# raw data\n",
"a = pd.DataFrame(data = {\n",
" 'Gene': (\"A\",\"B\",\"C\",\"D\",\"E\"),\n",
" 'Length': (100, 50, 25, 5, 1),\n",
" 'S1': (80, 10, 6, 3, 1),\n",
" 'S2': (20, 20, 10, 50, 400)\n",
"})\n",
"a"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>S1</th>\n",
" <th>S2</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Gene</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>A</th>\n",
" <td>281690.140845</td>\n",
" <td>486.618005</td>\n",
" </tr>\n",
" <tr>\n",
" <th>B</th>\n",
" <td>70422.535211</td>\n",
" <td>973.236010</td>\n",
" </tr>\n",
" <tr>\n",
" <th>C</th>\n",
" <td>84507.042254</td>\n",
" <td>973.236010</td>\n",
" </tr>\n",
" <tr>\n",
" <th>D</th>\n",
" <td>211267.605634</td>\n",
" <td>24330.900243</td>\n",
" </tr>\n",
" <tr>\n",
" <th>E</th>\n",
" <td>352112.676056</td>\n",
" <td>973236.009732</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" S1 S2\n",
"Gene \n",
"A 281690.140845 486.618005\n",
"B 70422.535211 973.236010\n",
"C 84507.042254 973.236010\n",
"D 211267.605634 24330.900243\n",
"E 352112.676056 973236.009732"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tpm = read_counts2tpm(a, ['S1', 'S2'])\n",
"tpm"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"S1 1000000.0\n",
"S2 1000000.0\n",
"dtype: float64"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tpm.sum()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"S1 200000.0\n",
"S2 200000.0\n",
"dtype: float64"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tpm.mean()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>S1</th>\n",
" <th>S2</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Gene</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>A</th>\n",
" <td>8000.0</td>\n",
" <td>400.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>B</th>\n",
" <td>2000.0</td>\n",
" <td>800.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>C</th>\n",
" <td>2400.0</td>\n",
" <td>800.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>D</th>\n",
" <td>6000.0</td>\n",
" <td>20000.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>E</th>\n",
" <td>10000.0</td>\n",
" <td>800000.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" S1 S2\n",
"Gene \n",
"A 8000.0 400.0\n",
"B 2000.0 800.0\n",
"C 2400.0 800.0\n",
"D 6000.0 20000.0\n",
"E 10000.0 800000.0"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rpkm = read_counts2rpkm(df=a, sample_name=['S1', 'S2'])\n",
"rpkm"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"S1 28400.0\n",
"S2 822000.0\n",
"dtype: float64"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rpkm.sum()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"S1 5680.0\n",
"S2 164400.0\n",
"dtype: float64"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rpkm.mean()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>S1</th>\n",
" <th>S2</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Gene</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>A</th>\n",
" <td>281690.140845</td>\n",
" <td>486.618005</td>\n",
" </tr>\n",
" <tr>\n",
" <th>B</th>\n",
" <td>70422.535211</td>\n",
" <td>973.236010</td>\n",
" </tr>\n",
" <tr>\n",
" <th>C</th>\n",
" <td>84507.042254</td>\n",
" <td>973.236010</td>\n",
" </tr>\n",
" <tr>\n",
" <th>D</th>\n",
" <td>211267.605634</td>\n",
" <td>24330.900243</td>\n",
" </tr>\n",
" <tr>\n",
" <th>E</th>\n",
" <td>352112.676056</td>\n",
" <td>973236.009732</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" S1 S2\n",
"Gene \n",
"A 281690.140845 486.618005\n",
"B 70422.535211 973.236010\n",
"C 84507.042254 973.236010\n",
"D 211267.605634 24330.900243\n",
"E 352112.676056 973236.009732"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# same result as tpm\n",
"rpkm / rpkm.sum() * 1000000"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.10"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment