Skip to content

Instantly share code, notes, and snippets.

@brianray
Last active June 10, 2018 16:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save brianray/4ce15234e6ac2975b335c8d90a4b6882 to your computer and use it in GitHub Desktop.
Save brianray/4ce15234e6ac2975b335c8d90a4b6882 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"This is a re-implementation of some Python and R code from the article https://shiring.github.io/r_vs_python/2017/01/22/R_vs_Py_post which was in turn based on https://www.toptal.com/python/comprehensive-introduction-your-genome-scipy\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# !wget ftp://ftp.ensembl.org/pub/release-85/gff3/homo_sapiens/Homo_sapiens.GRCh38.85.gff3.gz"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import warnings\n",
"warnings.filterwarnings('ignore')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"%load_ext rpy2.ipython"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"%%R\n",
"library(dplyr)\n",
"library(ggplot2)\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"10 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)\n"
]
}
],
"source": [
"%%timeit -n 1 -r 1\n",
"global df\n",
"col_names = ['seqid', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes']\n",
"df = pd.read_csv('Homo_sapiens.GRCh38.85.gff3.gz', compression='gzip',\n",
" sep='\\t', comment='#', low_memory=False,\n",
" header=None, names=col_names)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
" seqid source type start end score strand phase\n",
"1 1 GRCh38 chromosome 1 248956422 . . .\n",
"2 1 . biological_region 10469 11240 1.3e+03 . .\n",
"3 1 . biological_region 10650 10657 0.999 + .\n",
"4 1 . biological_region 10655 10657 0.999 - .\n",
"5 1 . biological_region 10678 10687 0.999 + .\n",
"6 1 . biological_region 10681 10688 0.999 - .\n",
" attributes\n",
"1 ID=chromosome:1;Alias=CM000663.2,chr1,NC_000001.11\n",
"2 external_name=oe %3D 0.79;logic_name=cpg\n",
"3 logic_name=eponine\n",
"4 logic_name=eponine\n",
"5 logic_name=eponine\n",
"6 logic_name=eponine\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"1min 11s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)\n"
]
}
],
"source": [
"%%timeit -n 1 -r 1\n",
"%%R\n",
"df <- read.csv(\"Homo_sapiens.GRCh38.85.gff3.gz\", \n",
" header = FALSE, \n",
" sep = \"\\t\", \n",
" col.names = c('seqid', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes'), \n",
" comment.char = \"#\")\n",
"head(df)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['1' '10' '11' '12' '13' '14' '15' '16' '17' '18' '19' '2' '20' '21' '22'\n",
" '3' '4' '5' '6' '7' '8' '9' 'GL000008.2' 'GL000009.2' 'GL000194.1'\n",
" 'GL000195.1' 'GL000205.2' 'GL000208.1' 'GL000213.1' 'GL000214.1'\n",
" 'GL000216.2' 'GL000218.1' 'GL000219.1' 'GL000220.1' 'GL000221.1'\n",
" 'GL000224.1' 'GL000225.1' 'GL000226.1' 'KI270302.1' 'KI270303.1'\n",
" 'KI270304.1' 'KI270305.1' 'KI270310.1' 'KI270311.1' 'KI270312.1'\n",
" 'KI270315.1' 'KI270316.1' 'KI270317.1' 'KI270320.1' 'KI270322.1'\n",
" 'KI270329.1' 'KI270330.1' 'KI270333.1' 'KI270334.1' 'KI270335.1'\n",
" 'KI270336.1' 'KI270337.1' 'KI270338.1' 'KI270340.1' 'KI270362.1'\n",
" 'KI270363.1' 'KI270364.1' 'KI270366.1' 'KI270371.1' 'KI270372.1'\n",
" 'KI270373.1' 'KI270374.1' 'KI270375.1' 'KI270376.1' 'KI270378.1'\n",
" 'KI270379.1' 'KI270381.1' 'KI270382.1' 'KI270383.1' 'KI270384.1'\n",
" 'KI270385.1' 'KI270386.1' 'KI270387.1' 'KI270388.1' 'KI270389.1'\n",
" 'KI270390.1' 'KI270391.1' 'KI270392.1' 'KI270393.1' 'KI270394.1'\n",
" 'KI270395.1' 'KI270396.1' 'KI270411.1' 'KI270412.1' 'KI270414.1'\n",
" 'KI270417.1' 'KI270418.1' 'KI270419.1' 'KI270420.1' 'KI270422.1'\n",
" 'KI270423.1' 'KI270424.1' 'KI270425.1' 'KI270429.1' 'KI270435.1'\n",
" 'KI270438.1' 'KI270442.1' 'KI270448.1' 'KI270465.1' 'KI270466.1'\n",
" 'KI270467.1' 'KI270468.1' 'KI270507.1' 'KI270508.1' 'KI270509.1'\n",
" 'KI270510.1' 'KI270511.1' 'KI270512.1' 'KI270515.1' 'KI270516.1'\n",
" 'KI270517.1' 'KI270518.1' 'KI270519.1' 'KI270521.1' 'KI270522.1'\n",
" 'KI270528.1' 'KI270529.1' 'KI270530.1' 'KI270538.1' 'KI270539.1'\n",
" 'KI270544.1' 'KI270548.1' 'KI270579.1' 'KI270580.1' 'KI270581.1'\n",
" 'KI270582.1' 'KI270583.1' 'KI270584.1' 'KI270587.1' 'KI270588.1'\n",
" 'KI270589.1' 'KI270590.1' 'KI270591.1' 'KI270593.1' 'KI270706.1'\n",
" 'KI270707.1' 'KI270708.1' 'KI270709.1' 'KI270710.1' 'KI270711.1'\n",
" 'KI270712.1' 'KI270713.1' 'KI270714.1' 'KI270715.1' 'KI270716.1'\n",
" 'KI270717.1' 'KI270718.1' 'KI270719.1' 'KI270720.1' 'KI270721.1'\n",
" 'KI270722.1' 'KI270723.1' 'KI270724.1' 'KI270725.1' 'KI270726.1'\n",
" 'KI270727.1' 'KI270728.1' 'KI270729.1' 'KI270730.1' 'KI270731.1'\n",
" 'KI270732.1' 'KI270733.1' 'KI270734.1' 'KI270735.1' 'KI270736.1'\n",
" 'KI270737.1' 'KI270738.1' 'KI270739.1' 'KI270740.1' 'KI270741.1'\n",
" 'KI270742.1' 'KI270743.1' 'KI270744.1' 'KI270745.1' 'KI270746.1'\n",
" 'KI270747.1' 'KI270748.1' 'KI270749.1' 'KI270750.1' 'KI270751.1'\n",
" 'KI270752.1' 'KI270753.1' 'KI270754.1' 'KI270755.1' 'KI270756.1'\n",
" 'KI270757.1' 'MT' 'X' 'Y']\n",
"71.6 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)\n"
]
}
],
"source": [
"%%timeit -n 1 -r 1\n",
"print(df.seqid.unique())"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
" [1] 1 10 11 12 13 14 \n",
" [7] 15 16 17 18 19 2 \n",
" [13] 20 21 22 3 4 5 \n",
" [19] 6 7 8 9 GL000008.2 GL000009.2\n",
" [25] GL000194.1 GL000195.1 GL000205.2 GL000208.1 GL000213.1 GL000214.1\n",
" [31] GL000216.2 GL000218.1 GL000219.1 GL000220.1 GL000221.1 GL000224.1\n",
" [37] GL000225.1 GL000226.1 KI270302.1 KI270303.1 KI270304.1 KI270305.1\n",
" [43] KI270310.1 KI270311.1 KI270312.1 KI270315.1 KI270316.1 KI270317.1\n",
" [49] KI270320.1 KI270322.1 KI270329.1 KI270330.1 KI270333.1 KI270334.1\n",
" [55] KI270335.1 KI270336.1 KI270337.1 KI270338.1 KI270340.1 KI270362.1\n",
" [61] KI270363.1 KI270364.1 KI270366.1 KI270371.1 KI270372.1 KI270373.1\n",
" [67] KI270374.1 KI270375.1 KI270376.1 KI270378.1 KI270379.1 KI270381.1\n",
" [73] KI270382.1 KI270383.1 KI270384.1 KI270385.1 KI270386.1 KI270387.1\n",
" [79] KI270388.1 KI270389.1 KI270390.1 KI270391.1 KI270392.1 KI270393.1\n",
" [85] KI270394.1 KI270395.1 KI270396.1 KI270411.1 KI270412.1 KI270414.1\n",
" [91] KI270417.1 KI270418.1 KI270419.1 KI270420.1 KI270422.1 KI270423.1\n",
" [97] KI270424.1 KI270425.1 KI270429.1 KI270435.1 KI270438.1 KI270442.1\n",
"[103] KI270448.1 KI270465.1 KI270466.1 KI270467.1 KI270468.1 KI270507.1\n",
"[109] KI270508.1 KI270509.1 KI270510.1 KI270511.1 KI270512.1 KI270515.1\n",
"[115] KI270516.1 KI270517.1 KI270518.1 KI270519.1 KI270521.1 KI270522.1\n",
"[121] KI270528.1 KI270529.1 KI270530.1 KI270538.1 KI270539.1 KI270544.1\n",
"[127] KI270548.1 KI270579.1 KI270580.1 KI270581.1 KI270582.1 KI270583.1\n",
"[133] KI270584.1 KI270587.1 KI270588.1 KI270589.1 KI270590.1 KI270591.1\n",
"[139] KI270593.1 KI270706.1 KI270707.1 KI270708.1 KI270709.1 KI270710.1\n",
"[145] KI270711.1 KI270712.1 KI270713.1 KI270714.1 KI270715.1 KI270716.1\n",
"[151] KI270717.1 KI270718.1 KI270719.1 KI270720.1 KI270721.1 KI270722.1\n",
"[157] KI270723.1 KI270724.1 KI270725.1 KI270726.1 KI270727.1 KI270728.1\n",
"[163] KI270729.1 KI270730.1 KI270731.1 KI270732.1 KI270733.1 KI270734.1\n",
"[169] KI270735.1 KI270736.1 KI270737.1 KI270738.1 KI270739.1 KI270740.1\n",
"[175] KI270741.1 KI270742.1 KI270743.1 KI270744.1 KI270745.1 KI270746.1\n",
"[181] KI270747.1 KI270748.1 KI270749.1 KI270750.1 KI270751.1 KI270752.1\n",
"[187] KI270753.1 KI270754.1 KI270755.1 KI270756.1 KI270757.1 MT \n",
"[193] X Y \n",
"194 Levels: 1 10 11 12 13 14 15 16 17 18 19 2 20 21 22 3 4 5 6 7 8 ... Y\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"35.4 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)\n"
]
}
],
"source": [
"%%timeit -n 1 -r 1\n",
"%%R\n",
"unique(df$seqid)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"79.4 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)\n"
]
}
],
"source": [
"%%timeit -n 1 -r 1\n",
"df.seqid.unique().shape"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[1] 194\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"36.3 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)\n"
]
}
],
"source": [
"%%timeit -n 1 -r 1\n",
"%%R\n",
"length(unique(df$seqid))"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"300 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)\n"
]
}
],
"source": [
"%%timeit -n 1 -r 1\n",
"df.source.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"\n",
" havana ensembl_havana ensembl . mirbase \n",
" 1441093 745065 228212 182510 4701 \n",
" GRCh38 insdc \n",
" 194 74 \n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"274 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)\n"
]
}
],
"source": [
"%%timeit -n 1 -r 1\n",
"%%R\n",
"sort(table(df$source), decreasing = TRUE)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" seqid source type start end score strand \\\n",
"2511495 KI270389.1 GRCh38 supercontig 1 1298 . . \n",
"2594560 Y GRCh38 chromosome 2781480 56887902 . . \n",
"2511468 KI270333.1 GRCh38 supercontig 1 2699 . . \n",
"2513704 KI270739.1 GRCh38 supercontig 1 73985 . . \n",
"2511559 KI270522.1 GRCh38 supercontig 1 5674 . . \n",
"2511388 GL000224.1 GRCh38 supercontig 1 179693 . . \n",
"2511481 KI270373.1 GRCh38 supercontig 1 1451 . . \n",
"2511460 KI270312.1 GRCh38 supercontig 1 998 . . \n",
"2511573 KI270584.1 GRCh38 supercontig 1 4513 . . \n",
"2511588 KI270709.1 GRCh38 supercontig 1 66860 . . \n",
"\n",
" phase attributes \n",
"2511495 . ID=supercontig:KI270389.1;Alias=chrUn_KI270389... \n",
"2594560 . ID=chromosome:Y;Alias=CM000686.2,chrY,NC_00002... \n",
"2511468 . ID=supercontig:KI270333.1;Alias=chrUn_KI270333... \n",
"2513704 . ID=supercontig:KI270739.1;Alias=chr22_KI270739... \n",
"2511559 . ID=supercontig:KI270522.1;Alias=chrUn_KI270522... \n",
"2511388 . ID=supercontig:GL000224.1;Alias=chrUn_GL000224... \n",
"2511481 . ID=supercontig:KI270373.1;Alias=chrUn_KI270373... \n",
"2511460 . ID=supercontig:KI270312.1;Alias=chrUn_KI270312... \n",
"2511573 . ID=supercontig:KI270584.1;Alias=chrUn_KI270584... \n",
"2511588 . ID=supercontig:KI270709.1;Alias=chr1_KI270709v... \n",
"192 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)\n"
]
}
],
"source": [
"%%timeit -n 1 -r 1\n",
"global gdf\n",
"gdf = df[df.source == 'GRCh38']\n",
"gdf.shape\n",
"print(gdf.sample(10))"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
" seqid source type start end score strand phase\n",
"865054 16 GRCh38 chromosome 1 90338345 . . .\n",
"2511482 KI270373.1 GRCh38 supercontig 1 1451 . . .\n",
"2511636 KI270711.1 GRCh38 supercontig 1 42210 . . .\n",
"2511491 KI270384.1 GRCh38 supercontig 1 1658 . . .\n",
"2511458 KI270305.1 GRCh38 supercontig 1 1472 . . .\n",
"2511544 KI270465.1 GRCh38 supercontig 1 1774 . . .\n",
"235069 10 GRCh38 chromosome 1 133797422 . . .\n",
"2511515 KI270429.1 GRCh38 supercontig 1 1361 . . .\n",
"990811 17 GRCh38 chromosome 1 83257441 . . .\n",
"2511485 KI270376.1 GRCh38 supercontig 1 1136 . . .\n",
" attributes\n",
"865054 ID=chromosome:16;Alias=CM000678.2,chr16,NC_000016.10\n",
"2511482 ID=supercontig:KI270373.1;Alias=chrUn_KI270373v1,NT_187492.1\n",
"2511636 ID=supercontig:KI270711.1;Alias=chr1_KI270711v1_random,NT_187366.1\n",
"2511491 ID=supercontig:KI270384.1;Alias=chrUn_KI270384v1,NT_187484.1\n",
"2511458 ID=supercontig:KI270305.1;Alias=chrUn_KI270305v1,NT_187399.1\n",
"2511544 ID=supercontig:KI270465.1;Alias=chrUn_KI270465v1,NT_187422.1\n",
"235069 ID=chromosome:10;Alias=CM000672.2,chr10,NC_000010.11\n",
"2511515 ID=supercontig:KI270429.1;Alias=chrUn_KI270429v1,NT_187419.1\n",
"990811 ID=chromosome:17;Alias=CM000679.2,chr17,NC_000017.11\n",
"2511485 ID=supercontig:KI270376.1;Alias=chrUn_KI270376v1,NT_187489.1\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"878 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)\n"
]
}
],
"source": [
"%%timeit -n 1 -r 1\n",
"%%R\n",
"gdf <- df[df$source == \"GRCh38\", ]\n",
"dim(gdf)\n",
"sample_n(gdf, 10)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"3096629726\n",
"52.8 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)\n"
]
}
],
"source": [
"%%timeit -n 1 -r 1\n",
"global gdf\n",
"gdf = gdf.copy()\n",
"gdf['length'] = gdf.end - gdf.start + 1\n",
"\n",
"print(gdf.length.sum())"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[1] 3096629726\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"%%R\n",
"# %%timeit -n 1 -r 1 gives an error due to $\n",
"gdf$length <- gdf$end - gdf$start + 1\n",
"sum(gdf$length)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.00370219174212\n",
"2.5 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)\n"
]
}
],
"source": [
"%%timeit -n 1 -r 1\n",
"print(gdf[(gdf['type'] == 'supercontig')].length.sum() / gdf.length.sum())"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[1] 0.003702192\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"%%R\n",
"# Assembled molecules: autosomes 1-22, the sex chromosomes and the mitochondrial genome.\n",
"# The remaining GL*/KI* seqids are supercontigs, so this is the fraction of the\n",
"# assembly length that is not placed on an assembled molecule.\n",
"chrs <- c(1:22, \"X\", \"Y\", \"MT\")\n",
"sum(subset(gdf, !seqid %in% chrs)$length) / sum(gdf$length)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"exon 1180596\n",
"CDS 704604\n",
"five_prime_UTR 142387\n",
"three_prime_UTR 133938\n",
"transcript 96375\n",
"gene 42470\n",
"processed_transcript 28228\n",
"aberrant_processed_transcript 26944\n",
"NMD_transcript_variant 13761\n",
"lincRNA 13247\n",
"processed_pseudogene 10722\n",
"lincRNA_gene 7533\n",
"pseudogene 3049\n",
"RNA 2221\n",
"snRNA_gene 1909\n",
"snRNA 1909\n",
"snoRNA 956\n",
"snoRNA_gene 944\n",
"pseudogenic_transcript 737\n",
"rRNA 549\n",
"rRNA_gene 549\n",
"miRNA 302\n",
"V_gene_segment 216\n",
"J_gene_segment 158\n",
"VD_gene_segment 37\n",
"C_gene_segment 29\n",
"Name: type, dtype: int64\n",
"799 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)\n"
]
}
],
"source": [
"%%timeit -n 1 -r 1\n",
"global edf\n",
"edf = df[df.source.isin(['ensembl', 'havana', 'ensembl_havana'])]\n",
"edf.shape\n",
"\n",
"edf.sample(10)\n",
"print(edf.type.value_counts())"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
" seqid source type start end score strand\n",
"128494 1 ensembl_havana CDS 117098833 117098907 . +\n",
"2548429 X havana exon 66000422 66001071 . +\n",
"2178494 6 ensembl_havana CDS 142309544 142309644 . +\n",
"1953812 4 havana CDS 167000349 167000463 . -\n",
"1497584 2 ensembl_havana exon 178731693 178731971 . -\n",
"416225 11 ensembl CDS 66838367 66838445 . +\n",
"2439754 9 ensembl_havana exon 37854780 37854982 . +\n",
"975953 16 havana exon 79713836 79714134 . +\n",
"38652 1 ensembl_havana three_prime_UTR 24364372 24364482 . +\n",
"1013586 17 ensembl_havana five_prime_UTR 7630142 7630172 . +\n",
" phase\n",
"128494 1\n",
"2548429 .\n",
"2178494 1\n",
"1953812 2\n",
"1497584 .\n",
"416225 0\n",
"2439754 .\n",
"975953 .\n",
"38652 .\n",
"1013586 .\n",
" attributes\n",
"128494 ID=CDS:ENSP00000358478;Parent=transcript:ENST00000369466;protein_id=ENSP00000358478\n",
"2548429 Parent=transcript:ENST00000424241;Name=ENSE00001750385;constitutive=1;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSE00001750385;rank=2;version=1\n",
"2178494 ID=CDS:ENSP00000296932;Parent=transcript:ENST00000296932;protein_id=ENSP00000296932\n",
"1953812 ID=CDS:ENSP00000420920;Parent=transcript:ENST00000506886;protein_id=ENSP00000420920\n",
"1497584 Parent=transcript:ENST00000589042;Name=ENSE00003797539;constitutive=0;ensembl_end_phase=1;ensembl_phase=1;exon_id=ENSE00003797539;rank=58;version=1\n",
"416225 ID=CDS:ENSP00000354227;Parent=transcript:ENST00000360962;protein_id=ENSP00000354227\n",
"2439754 Parent=transcript:ENST00000377724;Name=ENSE00003462136;constitutive=0;ensembl_end_phase=1;ensembl_phase=2;exon_id=ENSE00003462136;rank=4;version=1\n",
"975953 Parent=transcript:ENST00000563360;Name=ENSE00002611138;constitutive=0;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSE00002611138;rank=2;version=1\n",
"38652 Parent=transcript:ENST00000350501\n",
"1013586 Parent=transcript:ENST00000380450\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"671 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)\n"
]
}
],
"source": [
"%%timeit -n 1 -r 1\n",
"%%R\n",
"edf <- subset(df, source %in% c(\"ensembl\", \"havana\", \"ensembl_havana\"))\n",
"dim(edf)\n",
"sample_n(edf, 10)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(42470, 9)\n",
"191 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)\n"
]
}
],
"source": [
"%%timeit -n 1 -r 1\n",
"global ndf\n",
"ndf = edf[edf.type == 'gene']\n",
"ndf = ndf.copy()\n",
"ndf.sample(10).attributes.values\n",
"print(ndf.shape)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
" [1] ID=gene:ENSG00000163157;Name=TMOD4;biotype=protein_coding;description=tropomodulin 4 [Source:HGNC Symbol%3BAcc:HGNC:11874];gene_id=ENSG00000163157;havana_gene=OTTHUMG00000012350;havana_version=6;logic_name=ensembl_havana_gene;version=14 \n",
" [2] ID=gene:ENSG00000072201;Name=LNX1;biotype=protein_coding;description=ligand of numb-protein X 1 [Source:HGNC Symbol%3BAcc:HGNC:6657];gene_id=ENSG00000072201;havana_gene=OTTHUMG00000102099;havana_version=4;logic_name=ensembl_havana_gene;version=13 \n",
" [3] ID=gene:ENSG00000158122;Name=AAED1;biotype=protein_coding;description=AhpC/TSA antioxidant enzyme domain containing 1 [Source:HGNC Symbol%3BAcc:HGNC:16881];gene_id=ENSG00000158122;havana_gene=OTTHUMG00000020299;havana_version=1;logic_name=ensembl_havana_gene;version=11\n",
" [4] ID=gene:ENSG00000163083;Name=INHBB;biotype=protein_coding;description=inhibin beta B subunit [Source:HGNC Symbol%3BAcc:HGNC:6067];gene_id=ENSG00000163083;havana_gene=OTTHUMG00000131437;havana_version=1;logic_name=ensembl_havana_gene;version=5 \n",
" [5] ID=gene:ENSG00000230011;Name=CTSLP4;biotype=unprocessed_pseudogene;description=cathepsin L pseudogene 4 [Source:HGNC Symbol%3BAcc:HGNC:23645];gene_id=ENSG00000230011;havana_gene=OTTHUMG00000018237;havana_version=1;logic_name=havana;version=2 \n",
" [6] ID=gene:ENSG00000274601;Name=WI2-88277B6.1;biotype=unprocessed_pseudogene;gene_id=ENSG00000274601;havana_gene=OTTHUMG00000188051;havana_version=1;logic_name=havana;version=1 \n",
" [7] ID=gene:ENSG00000261221;Name=ZNF865;biotype=protein_coding;description=zinc finger protein 865 [Source:HGNC Symbol%3BAcc:HGNC:38705];gene_id=ENSG00000261221;havana_gene=OTTHUMG00000177108;havana_version=1;logic_name=ensembl_havana_gene;version=3 \n",
" [8] ID=gene:ENSG00000233816;Name=IFNA13;biotype=protein_coding;description=interferon%2C alpha 13 [Source:HGNC Symbol%3BAcc:HGNC:5419];gene_id=ENSG00000233816;havana_gene=OTTHUMG00000019675;havana_version=2;logic_name=ensembl_havana_gene;version=3 \n",
" [9] ID=gene:ENSG00000270863;Name=DDX55P1;biotype=processed_pseudogene;description=DEAD-box helicase 55 pseudogene 1 [Source:HGNC Symbol%3BAcc:HGNC:49852];gene_id=ENSG00000270863;havana_gene=OTTHUMG00000184770;havana_version=1;logic_name=havana;version=1 \n",
"[10] ID=gene:ENSG00000217643;Name=PTGES3P2;biotype=processed_pseudogene;description=prostaglandin E synthase 3 (cytosolic) pseudogene 2 [Source:HGNC Symbol%3BAcc:HGNC:43822];gene_id=ENSG00000217643;havana_gene=OTTHUMG00000152177;havana_version=1;logic_name=havana;version=1 \n",
"1623077 Levels: external_name=Ala;logic_name=trnascan ...\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"2min 5s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)\n"
]
}
],
"source": [
"%%timeit -n 1 -r 1\n",
"%%R\n",
"ndf <- subset(edf, type == \"gene\")\n",
"sample_n(ndf, 10)$attributes"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"198 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)\n"
]
}
],
"source": [
"%%timeit -n 1 -r 1\n",
"import re\n",
"\n",
"# GFF3 attributes are ';'-separated key=value pairs. Each pattern captures the\n",
"# value up to the next ';' or the end of the string, so an attribute in last\n",
"# position (no trailing ';') is still matched.\n",
"RE_GENE_NAME = re.compile(r'Name=(?P<gene_name>[^;]+)')\n",
"RE_GENE_ID = re.compile(r'gene_id=(?P<gene_id>ENSG[^;]+)')\n",
"RE_DESC = re.compile(r'description=(?P<desc>[^;]+)')\n",
"\n",
"\n",
"def _extract(pattern, group, attributes_str):\n",
"    # Return the named group captured by `pattern`, or '' when the attribute is absent.\n",
"    res = pattern.search(attributes_str)\n",
"    return '' if res is None else res.group(group)\n",
"\n",
"\n",
"def extract_gene_name(attributes_str):\n",
"    # Value of the Name= attribute, or '' if the record has none.\n",
"    return _extract(RE_GENE_NAME, 'gene_name', attributes_str)\n",
"\n",
"\n",
"def extract_gene_id(attributes_str):\n",
"    # Value of the gene_id= attribute (an ENSG... identifier), or '' if the record has none.\n",
"    return _extract(RE_GENE_ID, 'gene_id', attributes_str)\n",
"\n",
"\n",
"def extract_description(attributes_str):\n",
"    # Value of the description= attribute, or '' if the record has none.\n",
"    return _extract(RE_DESC, 'desc', attributes_str)\n",
"\n",
"\n",
"ndf['gene_name'] = ndf.attributes.apply(extract_gene_name)\n",
"ndf['gene_id'] = ndf.attributes.apply(extract_gene_id)\n",
"ndf['desc'] = ndf.attributes.apply(extract_description)\n",
"\n",
"# The raw attribute string is no longer needed once the three fields are extracted.\n",
"# NOTE: this mutates ndf in place, so re-running the cell fails on ndf.attributes;\n",
"# re-create ndf first.\n",
"ndf.drop('attributes', axis=1, inplace=True)\n",
"ndf.head()"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
" user system elapsed \n",
" 1.307 0.017 1.322 \n",
" seqid source type start end score strand phase gene_name\n",
"17 1 havana gene 11869 14409 . + . DDX11L1\n",
"29 1 havana gene 14404 29570 . - . WASH7P\n",
"72 1 havana gene 52473 53312 . + . OR4G4P\n",
"75 1 havana gene 62948 63887 . + . OR4G11P\n",
"78 1 ensembl_havana gene 69091 70008 . + . OR4F5\n",
"109 1 havana gene 131025 134836 . + . CICP27\n",
" gene_id\n",
"17 ENSG00000223972\n",
"29 ENSG00000227232\n",
"72 ENSG00000268020\n",
"75 ENSG00000240361\n",
"78 ENSG00000186092\n",
"109 ENSG00000233750\n",
" desc\n",
"17 DEAD/H-box helicase 11 like 1 [Source:HGNC Symbol%3BAcc:HGNC:37102]\n",
"29 WAS protein family homolog 7 pseudogene [Source:HGNC Symbol%3BAcc:HGNC:38034]\n",
"72 olfactory receptor family 4 subfamily G member 4 pseudogene [Source:HGNC Symbol%3BAcc:HGNC:14822]\n",
"75 olfactory receptor family 4 subfamily G member 11 pseudogene [Source:HGNC Symbol%3BAcc:HGNC:31276]\n",
"78 olfactory receptor family 4 subfamily F member 5 [Source:HGNC Symbol%3BAcc:HGNC:14825]\n",
"109 capicua transcriptional repressor pseudogene 27 [Source:HGNC Symbol%3BAcc:HGNC:48835]\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"%%R\n",
"ptm <- proc.time()\n",
"ndf$gene_name <- gsub(\"(.*Name=)(.*?)(;biotype.*)\", \"\\\\2\", ndf$attributes)\n",
"ndf$gene_id <- gsub(\"(ID=gene:)(.*?)(;Name.*)\", \"\\\\2\", ndf$attributes)\n",
"ndf$desc <- gsub(\"(.*description=)(.*?)(;.*)\", \"\\\\2\", ndf$attributes)\n",
"\n",
"# some genes don't have a description\n",
"ndf$desc <- ifelse(grepl(\"^ID=.*\", ndf$desc), \"\", ndf$desc)\n",
"\n",
"ndf <- subset(ndf, select = -attributes)\n",
"print(proc.time() - ptm)\n",
"head(ndf)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Jump to plotting\n"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 4.247000e+04\n",
"mean 3.583348e+04\n",
"std 9.683485e+04\n",
"min 8.000000e+00\n",
"25% 8.840000e+02\n",
"50% 5.170500e+03\n",
"75% 3.055200e+04\n",
"max 2.304997e+06\n",
"Name: length, dtype: float64"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ndf['length'] = ndf.end - ndf.start + 1\n",
"ndf.length.describe()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
" Min. 1st Qu. Median Mean 3rd Qu. Max. \n",
" 8 884 5170 35834 30552 2304997 \n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"%%R\n",
"ndf$length <- ndf$end - ndf$start + 1\n",
"summary(ndf$length)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"255 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYgAAAD8CAYAAABthzNFAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAExZJREFUeJzt3X+wXGV9x/H31yAgaCNIbJmEmGAY\nasapFVfQam3rqA1ipDpWk3ZGqzSpP9LW6XTGUJ2WfzpDO/5oGdJCbJFqLRi1UlLDxB9Vmc4wQrAo\nYIxEpOUaxoBMg1XbCH77x54Ly/Xs3rP33idnz73v18yd7D67e873HPbeD895nnNOZCaSJM30hLYL\nkCRNJgNCklTLgJAk1TIgJEm1DAhJUi0DQpJUy4CQJNUyICRJtQwISVKt49ouYD5OO+20XLNmTdtl\nSFKn3HrrrQ9k5orZ3tfpgFizZg379u1ruwxJ6pSI+M8m7/MQkySpVicDIiI2RsTOI0eOtF2KJC1a\nnQyIzNydmVuXL1/edimStGh1MiAkSeUZEJKkWp0MCMcgJKm8TgaEYxCSVF4nA0KSVF6nT5SbjzXb\nP13bfs+lFxzjSiRpMtmDkCTVMiAkSbU6GRDOYpKk8joZEM5ikqTyOhkQkqTyDAhJUi0DQpJUy4CQ\nJNUyICRJtQwISVKtTgaE50FIUnmdDAjPg5Ck8joZEJKk8gwISVItA0KSVMuAkCTVMiAkSbUMCElS\nLQNCklRrogIiIk6OiFsj4lVt1yJJS13RgIiIqyLicETcMaN9Q0QciIiDEbF94KV3AbtK1iRJaqZ0\nD+JqYMNgQ0QsA3YA5wPrgc0RsT4iXgZ8Hfhu4ZokSQ0cV3LhmXljRKyZ0XwucDAz7waIiGuBC4En\nAyfTD40fRcSezPxJyfokScMVDYghVgL3DjyfAs7LzG0AEfE7wAPDwiEitgJbAVavXl22UklawtoY\npI6atnz0QebVmfmvwz6cmTszs5eZvRUrVhQpUJLUTkBMAWcMPF8FHBpnAV7uW5LKayMgbgHOioi1\nEXE8sAm4fpwFeLlvSSqv9DTXa4CbgLMjYioiLsrMh4FtwF5gP7ArM+8cc7n2ICSpsNKzmDYPad8D\n7JnHcncDu3u93pa5LkOSNNpEnUndlD0ISSqvkwHhGIQkldfJgJAkldfJgPAQkySV18mA8BCTJJXX\nyYCQJJVnQEiSanUyIByDkKTyOhkQjkFIUnmdDAhJUnkGhCSpVicDwjEISSqvjTvKzVvJi/Wt2f7p\n2vZ7Lr1goVclSROtkz0ISVJ5BoQkqZYBIUmq1cmAcJBaksrrZEB4opwkldfJgJAklWdASJJqGRCS\npFoGhCSplgEhSaplQEiSanUyIDwPQpLK62RAeB6EJJXXyYCQJJVnQEiSanXyfhBtGHafCPBeEZIW\nJ3sQkqRaBoQkqZYBIUmqZUBIkmpNTEBExLMi4oqI+EREvK3teiRpqSsaEBFxVUQcjog7ZrRviIgD\nEXEwIrYDZOb+zHwr8HqgV7IuSdLsSvcgrgY2DDZExDJgB3A+sB7YHBHrq9deDfw78PnCdUmSZlE0\nIDLzRuDBGc3nAgcz8+7MPApcC1xYvf/6zPwl4LdL1iVJml0bJ8qtBO4deD4FnBcRvwq8FjgB2DPs\nwxGxFdgKsHr16nJVStIS10ZARE1bZuYXgS/O9uHM3AnsBOj1ermglUmSHtXGLKYp4IyB56uAQ+Ms\nwMt9S1J5bfQgbgHOioi1wHeATcBvjbOAzNwN7O71elsK1De2Yddp8hpNkrqs9DTXa4CbgLMjYioi\nLsrMh4FtwF5gP7ArM+8cc7n2ICSpsKI9iMzcPKR9DyMGohssd6J6EJK0GE3MmdSSpMnSyYDwEJMk\nldcoICLi2aULGYf3pJak8pr2IK6IiJsj4u0R
8dSiFUmSJkKjgMjMF9O//MUZwL6I+KeIeHnRykbw\nEJMkldd4DCIz7wLeA7wL+BXgsoj4RkS8tlRxI2rxEJMkFdZ0DOIXIuID9M9beCmwMTOfVT3+QMH6\nJEktadqDuBz4CvCczHxHZn4FIDMP0e9VHFMeYpKk8iJz9uvdRcSTgR9l5iPV8ycAJ2bmDwvXN1Kv\n18t9+/bN6bPDLo9xLHgJDkltiohbM3PWG7M17UF8DnjSwPOTqjZJ0iLVNCBOzMz/mX5SPT6pTEmS\npEnQNCB+EBHnTD+JiOcBPypTkiRpEjS9WN87gY9HxPR9G04H3lCmpNlFxEZg47p169oqQZIWvUYB\nkZm3RMTPA2fTvyPcNzLzx0UrG12PV3OVpMLGudz384E11WeeGxFk5oeLVCVJal2jgIiIjwDPBG4D\nHqmaEzAgJGmRatqD6AHrs8lJE5KkRaFpQNwB/BxwX8FaGuv6ILX3sJbUBU2nuZ4GfD0i9kbE9dM/\nJQsbxYv1SVJ5TXsQl5QsQpI0eZpOc/1SRDwDOCszPxcRJwHLypYmSWpT08t9bwE+AVxZNa0EritV\nlCSpfU3HIN4BvAh4CB69edDTSxUlSWpf04D4v8w8Ov0kIo6jfx6EJGmRajpI/aWI+BPgSdW9qN8O\n7C5X1tLk9FdJk6RpD2I7cD9wO/B7wB5auJPcNO8oJ0nlNZ3F9BPgg9VP67xYnySV1/RaTN+mZswh\nM89c8IokSRNhnGsxTTsR+E3g1IUvR5I0KRqNQWTm9wZ+vpOZfwW8tHBtkqQWNT3EdM7A0yfQ71E8\npUhFkqSJ0PQQ0/sGHj8M3AO8fsGrUS2nv0pqQ9NZTL9WuhBJ0mRpeojpj0a9npnvX5hyNA57FpJK\nGmcW0/OB6XtAbARuBO5dyGIi4jeAC+hf52lHZn5mIZcvSWquaUCcBpyTmd8HiIhLgI9n5u/O9sGI\nuAp4FXA4M5890L4B+Gv6lw3/u8y8NDOvA66LiFOA9wIGhCS1pOmlNlYDRweeHwXWNPzs1cCGwYaI\nWAbsAM4H1gObI2L9wFveU70uSWpJ0x7ER4CbI+JT9M+ofg3w4SYfzMwbI2LNjOZzgYOZeTdARFwL\nXBgR+4FLgRsy8ysNa5MkFdB0FtOfR8QNwC9XTW/OzP+Yx3pX8vjxiyngPOD3gZcByyNiXWZeMfOD\nEbEV2AqwevXqeZQgSRqlaQ8C4CTgocz8UESsiIi1mfntOa43atoyMy8DLhv1wczcCewE6PV63pNC\nkgppOs31z+jPZDob+BDwROAf6d9lbi6mgDMGnq8CDjX9cERsBDauW7dujqtf3Jz+KmkhNB2kfg3w\nauAHAJl5iPldauMW4KyIWBsRxwObeGwK7awyc3dmbl2+fPk8SpAkjdI0II5mZlJd8jsiTm66goi4\nBrgJODsipiLiosx8GNgG7AX2A7sy884xlukNgySpsKZjELsi4krgqRGxBXgLDW8elJmbh7TvoX9n\nurF5wyBJKq/pLKb3Vveifoj+OMSfZuZni1YmSWrVrAFRndS2NzNfBkxEKDhIPTcOXksax6xjEJn5\nCPDDiJiYEWEHqSWpvKZjEP8L3B4Rn6WayQSQmX9QpKpZ2IOQpPKaBsSnq5+J4CC1JJU3MiAiYnVm\n/ldm/sOxKkiSNBlmG4O4bvpBRHyycC2SpAky2yGmwWsmnVmykHE4BrGwhs1uGsZZT9LSMFsPIoc8\nbpWzmCSpvNl6EM+JiIfo9ySeVD2mep6Z+TNFq5MktWZkQGTmsmNViCRpsjS9WN9E8WJ9klReJwPC\nMQhJKq+TASFJKs+AkCTVMiAkSbUMCElSrU4GhLOYJKm8TgaEs5gkqbxOBoQkqbym94OQHuWtS6Wl\nwR6EJKmWPQi1yt6INLkMCC2YUfeV8A++1D2dPMTkNFdJKq+TPYjM3A3s7vV6W9quRc2Me9e6hVq+\nPRdp7jrZ
g5AklWdASJJqdfIQk5au0oeqJD3GHoQkqZYBIUmqZUBIkmoZEJKkWgaEJKnWxMxiiogz\ngXcDyzPzdW3Xo8XBE+ikuSvag4iIqyLicETcMaN9Q0QciIiDEbEdIDPvzsyLStYjSWqudA/iauBy\n4MPTDRGxDNgBvByYAm6JiOsz8+uFa1GHeL6D1L6iPYjMvBF4cEbzucDBqsdwFLgWuLDpMiNia0Ts\ni4h9999//wJWK0ka1MYg9Urg3oHnU8DKiHhaRFwBPDciLh724czcmZm9zOytWLGidK2StGS1MUgd\nNW2Zmd8D3tpoAREbgY3r1q1b0MKkcTkIrsWsjR7EFHDGwPNVwKFxFpCZuzNz6/Llyxe0MEnSY9oI\niFuAsyJibUQcD2wCrm+hDknSCKWnuV4D3AScHRFTEXFRZj4MbAP2AvuBXZl555jL9Y5yklRY0TGI\nzNw8pH0PsGcey/WOcpJU2MScST0OB6k1X6UHl+dyHse463aAXKV18lpMDlJLUnmdDAhJUnmdDAgH\nqSWpvE4GhIeYJKm8TgaEJKk8ZzFJS4SznjSuTvYgPMQkSeV1MiAkSeUZEJKkWgaEJKlWJwPC8yAk\nqbxOBoSD1JJUXicDQpJUngEhSaplQEiSankmtdTAXO7v0BWeYa1hOtmDcJBaksrrZEBIksozICRJ\ntQwISVItA0KSVMuAkCTVMiAkSbUiM9uuYWwD50Fsueuuu+a0jMU8r10qadzzIxbDeRZtbkOJdUfE\nrZnZm+19nexBeB6EJJXXyYCQJJVnQEiSahkQkqRaBoQkqZYBIUmqZUBIkmoZEJKkWhNzw6CIOBn4\nG+Ao8MXM/GjLJUnSkla0BxERV0XE4Yi4Y0b7hog4EBEHI2J71fxa4BOZuQV4dcm6JEmzK32I6Wpg\nw2BDRCwDdgDnA+uBzRGxHlgF3Fu97ZHCdUmSZlE0IDLzRuDBGc3nAgcz8+7MPApcC1wITNEPieJ1\nSZJm18YYxEoe6ylAPxjOAy4DLo+IC4Ddwz4cEVuBrQCrV68uWKakkhbygpnDLlw37oXuSl/Es2sX\nCW0jIKKmLTPzB8CbZ/twZu4EdgL0er3uXYpWkjqijUM5U8AZA89XAYfGWUBEbIyInUeOHFnQwiRJ\nj2kjIG4BzoqItRFxPLAJuH6cBXi5b0kqr/Q012uAm4CzI2IqIi7KzIeBbcBeYD+wKzPvHHO59iAk\nqbCiYxCZuXlI+x5gzzyWuxvY3ev1tsx1GZKk0ZxOKkmq1cmA8BCTJJXXyYBwkFqSyutkQEiSyovM\n7p1rFhEbgY3AG4C75riY04AHFqyo7nI/9Lkf+twPfYt9PzwjM1fM9qZOBsRCiIh9mdlru462uR/6\n3A997oc+90Ofh5gkSbUMCElSraUcEDvbLmBCuB/63A997oc+9wNLeAxCkjTaUu5BSJJGWJIBMeSe\n2J0TEfdExO0RcVtE7KvaTo2Iz0bEXdW/p1TtERGXVdv8tYg4Z2A5b6ref1dEvGmg/XnV8g9Wn41R\n6ziG2/1T9zpvc7tHraOF/XBJRHyn+k7cFhGvHHjt4qrGAxHx6wPttb8P1RWXv1xt78eqqy8TESdU\nzw9Wr6+ZbR0lRcQZEfGFiNgfEXdGxB9W7UvuO7HgMnNJ/QDLgG8BZwLHA18F1rdd1xy35R7gtBlt\nfwlsrx5vB/6ievxK4Ab6N2x6AfDlqv1U4O7q31Oqx6dUr90MvLD6zA3A+aPWcQy3+yXAOcAdk7Dd\nw9bR0n64BPjjmveur77rJwBrq9+BZaN+H4BdwKbq8RXA26rHbweuqB5vAj42ah3HYD+cDpxTPX4K\n8M2qliX3nVjwfdt2Acd8g/v/kfcOPL8YuLjtuua4Lffw0wFxADi9enw6cKB6fCWweeb7gM3AlQPt\nV1ZtpwPfGGh/9H3D1nGMt33NjD+MrW33sHW0tB8uoT4gHvc9p3+5/RcO+3
2o/rA9ABxXtT/6vunP\nVo+Pq94Xw9bRwnfjX4CXL9XvxEL+LMVDTHX3xF7ZUi3zlcBnIuLW6N+rG+BnM/M+gOrfp1ftw7Z7\nVPtUTfuodbSpze2etO/UtuqwxlUDh//G3Q9PA/47+/dvGWx/3LKq149U7299P1SHu54LfBm/E/O2\nFAOi9p7Yx7yKhfGizDwHOB94R0S8ZMR7h233uO1dcyy2e5L21d8CzwR+EbgPeF/VvpD7YSK/MxHx\nZOCTwDsz86FRb61pW8zfiTlbigEx73tiT4rMPFT9exj4FHAu8N2IOB2g+vdw9fZh2z2qfVVNOyPW\n0aY2t3tivlOZ+d3MfCQzfwJ8kP53AsbfDw8AT42I42a0P25Z1evLgQdHLKu4iHgi/XD4aGb+c9Xs\nd2KelmJAzPue2JMgIk6OiKdMPwZeAdxBf1umZ1+8if7xWKr2N1azK14AHKm6xHuBV0TEKdXhiFfQ\nP9Z8H/D9iHhBNWPjjTOWVbeONrW53cPWccxN/7GqvIb+dwL6NW6qZiCtBc6iP/Ba+/uQ/QPnXwBe\nV31+5vZO74fXAf9WvX/YOoqq/jv9PbA/M98/8JLfiflqexCkjR/6Mwy+SX+WxbvbrmeO23Am/Rkj\nXwXunN4O+seCP0//KrefB06t2gPYUW3z7UBvYFlvAQ5WP28eaO/R/wPzLeByHjuxsnYdx3Dbr6F/\n+OTH9P9P7aI2t3vUOlrYDx+pavga/T9Spw+8/91VjQeoZuGM+n2ovmM3V/vn48AJVfuJ1fOD1etn\nzraOwvvhxfQP33wNuK36eeVS/E4s9I9nUkuSai3FQ0ySpAYMCElSLQNCklTLgJAk1TIgJEm1DAhJ\nUi0DQpJUy4CQJNX6f27vUR1OEDNTAAAAAElFTkSuQmCC\n",
"text/plain": [
"<matplotlib.figure.Figure at 0x10f95ae80>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"%%timeit -n 1 -r 1\n",
"ndf.length.plot(kind='hist', bins=50, logy=True)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAeAAAAHgCAYAAAB91L6VAAAEDWlDQ1BJQ0MgUHJvZmlsZQAAOI2N\nVV1oHFUUPrtzZyMkzlNsNIV0qD8NJQ2TVjShtLp/3d02bpZJNtoi6GT27s6Yyc44M7v9oU9FUHwx\n6psUxL+3gCAo9Q/bPrQvlQol2tQgKD60+INQ6Ium65k7M5lpurHeZe58853vnnvuuWfvBei5qliW\nkRQBFpquLRcy4nOHj4g9K5CEh6AXBqFXUR0rXalMAjZPC3e1W99Dwntf2dXd/p+tt0YdFSBxH2Kz\n5qgLiI8B8KdVy3YBevqRHz/qWh72Yui3MUDEL3q44WPXw3M+fo1pZuQs4tOIBVVTaoiXEI/MxfhG\nDPsxsNZfoE1q66ro5aJim3XdoLFw72H+n23BaIXzbcOnz5mfPoTvYVz7KzUl5+FRxEuqkp9G/Aji\na219thzg25abkRE/BpDc3pqvphHvRFys2weqvp+krbWKIX7nhDbzLOItiM8358pTwdirqpPFnMF2\nxLc1WvLyOwTAibpbmvHHcvttU57y5+XqNZrLe3lE/Pq8eUj2fXKfOe3pfOjzhJYtB/yll5SDFcSD\niH+hRkH25+L+sdxKEAMZahrlSX8ukqMOWy/jXW2m6M9LDBc31B9LFuv6gVKg/0Szi3KAr1kGq1GM\njU/aLbnq6/lRxc4XfJ98hTargX++DbMJBSiYMIe9Ck1YAxFkKEAG3xbYaKmDDgYyFK0UGYpfoWYX\nG+fAPPI6tJnNwb7ClP7IyF+D+bjOtCpkhz6CFrIa/I6sFtNl8auFXGMTP34sNwI/JhkgEtmDz14y\nSfaRcTIBInmKPE32kxyyE2Tv+thKbEVePDfW/byMM1Kmm0XdObS7oGD/MypMXFPXrCwOtoYjyyn7\nBV29/MZfsVzpLDdRtuIZnbpXzvlf+ev8MvYr/Gqk4H/kV/G3csdazLuyTMPsbFhzd1UabQbjFvDR\nmcWJxR3zcfHkVw9GfpbJmeev9F08WW8uDkaslwX6avlWGU6NRKz0g/SHtCy9J30o/ca9zX3Kfc19\nzn3BXQKRO8ud477hLnAfc1/G9mrzGlrfexZ5GLdn6ZZrrEohI2wVHhZywjbhUWEy8icMCGNCUdiB\nlq3r+xafL549HQ5jH+an+1y+LlYBifuxAvRN/lVVVOlwlCkdVm9NOL5BE4wkQ2SMlDZU97hX86Ei\nlU/lUmkQUztTE6mx1EEPh7OmdqBtAvv8HdWpbrJS6tJj3n0CWdM6busNzRV3S9KTYhqvNiqWmuro\niKgYhshMjmhTh9ptWhsF7970j/SbMrsPE1suR5z7DMC+P/Hs+y7ijrQAlhyAgccjbhjPygfeBTjz\nhNqy28EdkUh8C+DU9+z2v/oyeH791OncxHOs5y2AtTc7nb/f73TWPkD/qwBnjX8BoJ98VVBg/m8A\nACs+SURBVHgB7d17rBxlwT/w5/ScXuiFomBLC6JUQIWigIDGBCSvigLKX1iCErwkCJGIYsR4Qfh5\nwRCVYJqIiUVBQNQAMSiXpCgJoiBBUjEVU6hyUU+5tkAv9N6XZ17O/rZ7ds+Zs+fZ2dnZzySnnZ19\n5pnn+Twz+92Zs2d2YNcrUzARIECAAAEChQpMKXRrNkaAAAECBAhkAgLYjkCAAAECBLogIIC7gG6T\nBAgQIEBAANsHCBAgQIBAFwSGurDNtje5ZcuWsHXr1rbXH2vFgYGB7OmyfyYttrPsbYyQU6ZMCTt3\n7hyLvOvP9Yrl4OBg2LFjR9e9xmtAL3jG/TIeP2U/hnrBMrYx/jjOxzsyQpg2bVqYPn36qII9FcAx\nfDds2DCqEykWzJkzJ6umU/WnaGOsI7az7G2cOnVqmDt3bnjuuedSdbsj9cyYMSMLtm3btnWk/lSV\nzp8/P7MsewjPnj279Ptm3C+3b98eNm3alGp4
OlJPLxzn8fiZOXNmWLt2bUcMUlUa27h58+auvlGI\nx0azAHYJOtUoq4cAAQIECExAQABPAEtRAgQIECCQSkAAp5JUDwECBAgQmICAAJ4AlqIECBAgQCCV\ngABOJakeAgQIECAwAQEBPAEsRQkQIECAQCoBAZxKUj0ECBAgQGACAgJ4AliKEiBAgACBVAICOJWk\neggQIECAwAQEBPAEsBQlQIAAAQKpBARwKkn1ECBAgACBCQgI4AlgKUqAAAECBFIJCOBUkuohQIAA\nAQITEBDAE8BSlAABAgQIpBIQwKkk1UOAAAECBCYgIIAngKUoAQIECBBIJSCAU0mqhwABAgQITEBA\nAE8AS1ECBAgQIJBKQACnklQPAQIECBCYgMDQBMpWrujChQua9Gl2bdnw8JravBkCBAgQIJBSwBlw\nSk11ESBAgACBnAICOCeUYgQIECBAIKWAAE6pqS4CBAgQIJBTQADnhFKMAAECBAikFBDAKTXVRYAA\nAQIEcgoI4JxQihEgQIAAgZQCAjilproIECBAgEBOAQGcE0oxAgQIECCQUkAAp9RUFwECBAgQyCkg\ngHNCKUaAAAECBFIKCOCUmuoiQIAAAQI5BQRwTijFCBAgQIBASgEBnFJTXQQIECBAIKeAAM4JpRgB\nAgQIEEgpIIBTaqqLAAECBAjkFBDAOaEUI0CAAAECKQUEcEpNdREgQIAAgZwCAjgnlGIECBAgQCCl\ngABOqakuAgQIECCQU0AA54RSjAABAgQIpBQQwCk11UWAAAECBHIKCOCcUIoRIECAAIGUAgI4paa6\nCBAgQIBATgEBnBNKMQIECBAgkFJAAKfUVBcBAgQIEMgpIIBzQilGgAABAgRSCgjglJrqIkCAAAEC\nOQUEcE4oxQgQIECAQEoBAZxSU10ECBAgQCCngADOCaUYAQIECBBIKSCAU2qqiwABAgQI5BQQwDmh\nFCNAgAABAikFBHBKTXURIECAAIGcAgI4J5RiBAgQIEAgpYAATqmpLgIECBAgkFNAAOeEUowAAQIE\nCKQUEMApNdVFgAABAgRyCgjgnFCKESBAgACBlAICOKWmuggQIECAQE4BAZwTSjECBAgQIJBSQACn\n1FQXAQIECBDIKTCUs1wpig0MDIQ5c+YU1pYit5W3U9OmTSvUIG+76stNmTIlxJ8y+tW3c3BwMOza\ntSvs3LmzfnHp5uN+P2vWrKytpWtcXYPivhnbWuZp6tSpIY57/Cnz1AvH+dDQUOZY9uM8jnlsazzW\nyzb1VABHwA0bNiQ0nD1mXevXrx/z+W48GXf2Mrar3iLu8PGn7O2cMWNG2LFjR9i2bVt980s3P3Pm\nzLBx48asraVrXF2DZs+enfj4rKs80Wx8YxjHe9OmTYlq7Ew1vXCcx+Mn7ptlP85jGzdv3tzVN9rx\n2Gg2uQTdTMUyAgQIECDQYQEB3GFg1RMgQIAAgWYCAriZimUECBAgQKDDAgK4w8CqJ0CAAAECzQQE\ncDMVywgQIECAQIcFBHCHgVVPgAABAgSaCQjgZiqWESBAgACBDgsI4A4Dq54AAQIECDQTEMDNVCwj\nQIAAAQIdFhDAHQZWPQECBAgQaCYggJupWEaAAAECBDosIIA7DKx6AgQIECDQTEAAN1OxjAABAgQI\ndFigp74NqcMWo6pfuHDBqGX1C4aH19Q/NE+AAAECBHILOAPOTaUgAQIECBBIJyCA01mqiQABAgQI\n5BYQwLmpFCRAgAABAukEBHA6SzURIECAAIHcAgI4N5WCBAgQIEAgnYAATmepJgIECBAgkFtAAOem\nUpAAAQIECKQTEMDpLNVEgAABAgRyCwjg3FQKEiBAgACBdAICOJ2lmggQIECAQG4BAZybSkECBAgQ\nIJBOQACns1QTAQIECBDILSCAc1MpSIAAAQIE0gkI4HSWaiJAgAABArkFBHBuKgUJECBAgEA6AQGc\nzlJNBAgQ
IEAgt4AAzk2lIAECBAgQSCcggNNZqokAAQIECOQWEMC5qRQkQIAAAQLpBARwOks1ESBA\ngACB3AICODeVggQIECBAIJ2AAE5nqSYCBAgQIJBbQADnplKQAAECBAikExDA6SzVRIAAAQIEcgsI\n4NxUChIgQIAAgXQCAjidpZoIECBAgEBuAQGcm0pBAgQIECCQTkAAp7NUEwECBAgQyC0ggHNTKUiA\nAAECBNIJCOB0lmoiQIAAAQK5BQRwbioFCRAgQIBAOgEBnM5STQQIECBAILeAAM5NpSABAgQIEEgn\nIIDTWaqJAAECBAjkFhDAuakUJECAAAEC6QQEcDpLNREgQIAAgdwCAjg3lYIECBAgQCCdgABOZ6km\nAgQIECCQW0AA56ZSkAABAgQIpBMQwOks1USAAAECBHILCODcVAoSIECAAIF0AgI4naWaCBAgQIBA\nboGh3CUVHCWwcOGCUcvqFwwPr6l/aJ4AAQIECNQEnAHXKMwQIECAAIHiBARwcda2RIAAAQIEagIC\nuEZhhgABAgQIFCcggIuztiUCBAgQIFATEMA1CjMECBAgQKA4AQFcnLUtESBAgACBmoAArlGYIUCA\nAAECxQkI4OKsbYkAAQIECNQEBHCNwgwBAgQIEChOQAAXZ21LBAgQIECgJiCAaxRmCBAgQIBAcQIC\nuDhrWyJAgAABAjUBAVyjMEOAAAECBIoTEMDFWdsSAQIECBCoCQjgGoUZAgQIECBQnIAALs7alggQ\nIECAQE1AANcozBAgQIAAgeIEBHBx1rZEgAABAgRqAgK4RmGGAAECBAgUJyCAi7O2JQIECBAgUBMQ\nwDUKMwQIECBAoDgBAVyctS0RIECAAIGagACuUZghQIAAAQLFCQjg4qxtiQABAgQI1AQEcI3CDAEC\nBAgQKE5AABdnbUsECBAgQKAmMFSbM5NcYOHCBS3rHB5e0/I5TxAgQIBA9QWcAVd/jPWQAAECBEoo\n0JEAXrVqVdi5c2etu88880x46KGHwo4dO8Zctnr16hB/TAQIECBAoOoCyQP4gQceCOedd17Yvn17\nZhcfX3755eH+++8P3/3ud1suW7ZsWVi+fHm49tprw2233VZ1d/0jQIAAgT4XSPo74JUrV4Z77rkn\nLFq0qMZ60003hYsuuijMmTMnXHjhheGll14KzZatWLEiXHnllVlwn3/++eGUU07J6li7dm148MEH\ns/kFCxaEhQsX1uru5Znp06e31fzBwcHQ7rptbbCNlYaGhsLAwEDp2zl16tQQPadMSf4+tA21sVeZ\nNm3ableVxi7dnWfjuJd934zjvWvXrp5oZ9kt4/ETj52yt3Nkv6y/Klv0ERJfD5tNSQN48eLFIf7E\nAB2ZXnzxxSx84+N58+aFeDm6cVm87Dx37txslYhVf6n6qaeeys6K45Mf/vCHw8EHH5yV6/V/Zs2a\n1VYXeiEw4s4WD8x2+9gWTBsrxTbGF+P4U+Ypeu6xxx5lbmLWtl7YN+PrSxz3+H+Zp16wjI6xnWU/\nzmMb45uFbh7nI1eEG/e5QvfC2IjGd0txWXxxqX93Un9wHHrooeG6667L2r1+/foQz4jTTa0/pZxu\nG81rarcf8UpCdCjzFHf2+Iaq3T4W1bcZM2Zkb/a2bdtW1Cbb2s78+fOzN631b0zbqqjDK82ePTts\n2LChw1uZXPVxv4zjvWnTpslV1OG1e+E4j8fPzJkzS3+cxzZu3rx5t4zp8PCNqj4eG82mjl97i2e9\nTz/9dLbt4eHhsO+++2ZnwvXLDjrooOwFJhbauHFjT7zbb4ZpGQECBAgQyCvQ8TPgs846KyxdujQ7\n0zjhhBOySwHNli1ZsiRcfPHFYd26deGcc87J237lCBAgQIBATwoMvHJdvJBfgG3dujXED5LUT43L\n4uXo+HuF+NNsipdeU17iGutGGc22n3JZuzfi6IVLUyOXoJ977rmUZMnr6q
VL0NHSJejJ7wIuQU/e\ncKQGl6BHJMb/P16Cjq/djVPHz4BHNtgYvnF547L63/2OrOd/AgQIECBQRYHmp5pV7Kk+ESBAgACB\nEgkI4BINhqYQIECAQP8ICOD+GWs9JUCAAIESCQjgEg2GphAgQIBA/wgI4P4Zaz0lQIAAgRIJCOAS\nDYamECBAgED/CAjg/hlrPSVAgACBEgkI4BINhqYQIECAQP8ICOD+GWs9JUCAAIESCQjgEg2GphAg\nQIBA/wgI4P4Zaz0lQIAAgRIJCOASDYamECBAgED/CAjg/hlrPSVAgACBEgkU9m1IJepzKZoy3lch\ntvt1haXonEYQIECAwLgCzoDHJVKAAAECBAikFxDA6U3VSIAAAQIExhUQwOMSKUCAAAECBNILCOD0\npmokQIAAAQLjCgjgcYkUIECAAAEC6QUEcHpTNRIgQIAAgXEFBPC4RAoQIECAAIH0AgI4vakaCRAg\nQIDAuAICeFwiBQgQIECAQHoBAZzeVI0ECBAgQGBcAQE8LpECBAgQIEAgvYAATm+qRgIECBAgMK6A\nAB6XSAECBAgQIJBeQACnN1UjAQIECBAYV0AAj0ukAAECBAgQSC8ggNObqpEAAQIECIwrIIDHJVKA\nAAECBAikFxDA6U3VSIAAAQIExhUYGreEAl0RWLhwwRjbnR2Gh9eM8bynCBAgQKDsAs6Ayz5C2keA\nAAEClRQQwJUcVp0iQIAAgbILCOCyj5D2ESBAgEAlBQRwJYdVpwgQIECg7AICuOwjpH0ECBAgUEkB\nAVzJYdUpAgQIECi7gAAu+whpHwECBAhUUkAAV3JYdYoAAQIEyi4ggMs+QtpHgAABApUUEMCVHFad\nIkCAAIGyCwjgso+Q9hEgQIBAJQUEcCWHVacIECBAoOwCArjsI6R9BAgQIFBJAQFcyWHVKQIECBAo\nu4AALvsIaR8BAgQIVFJAAFdyWHWKAAECBMouIIDLPkLaR4AAAQKVFBgzgK+++upw991379bxL33p\nS2H58uW7LfOAAAECBAgQmJjAULPiDz/8cDj11FPDCy+8EKZOnRpmzpyZFdu1a1dYt25dOPfcc5ut\nZlmBAgsXLhhza8PDa8Z83pMECBAg0F2BpgF86KGHhnvvvTfceOONYf/99w/HHnts1srBwcGw9957\nh/i/iQABAgQIEGhfoGkAx+rmzZsXzjvvvLBy5crsMvT27dtrW3n/+98f5s+fX3tshgABAgQIEJiY\nQMsAjtXcdddd4fTTTw8nnnhi7TJ0XB7PiAVwlDARIECAAIH2BMYM4DvvvDMsXbo0nHHGGe3Vbi0C\nBAgQIECgqcCYn4I+6qijwooVK5quaCEBAgQIECDQvsCYZ8B77bVXuOqqq8KvfvWrcMQRR9S2cuml\nl4bFixfXHpshQIAAAQIEJiYwZgAfcsgh4Ze//OWoGvfbb79RyywgQIAAAQIE8guMGcBbtmwJL730\n0qja6j8RPepJCwgQIECAAIFxBcYM4NWrV4dbbrklq2THjh3h0UcfDZs3b86Wve51rxu3cgUIECBA\ngACB5gJjBvDJJ58c4k/9tGTJkrB169b6ReYJECBAgACBCQqM+SnoZnXtu+++Id6q0kSAAAECBAi0\nLzDmGfBvf/vbcO2112a1x/tAx3tDxztjff3rX29/i9YkQIAAAQIEwpgBfNhhh4VPfOITNabp06eH\no48+OsQ/TzKVW8CXNZR7fLSOAAECY16CXrRoUTjmmGPCY489Fm644YZw//33h/jJaBMBAgQIECAw\nOYExA3jDhg3hXe96V3jiiSfCSSedFB5//PEQv4ghfhLaRIAAAQIECLQvMGYA/+IXvwgXXHBB+N73\nvhfOPPPMsGzZshBvTxm/qtBEgAABAgQItC8wZgAPDAyMOtuNZ7/xk9AmAgQIECBAoH2BMT+Eddpp\np4W3ve1t4R//+Ed4+9vfHv7whz9kn4
Q+9NBD29+iNQkQIECAAIGxPwUdP+38+9//PlxzzTXZ73/j\nTTnqPxXNr3cFfEq6d8dOywkQqIbAmJeg//Of/4RTTz01fPCDHwxXXHFF9inoxjtjVYNBLwgQIECA\nQLECYwbwzTffHL785S+H4447LmvVj3/843DggQeGBx54oNhW2hoBAgQIEKiYwJgBHL+A4emnn96t\ny+vXrw+zZs3abZkHBAgQIECAwMQExv0QVrz8HM94Dz/88HDfffeFeEtKH8KaGLLSBAgQIECgUWDM\nM+ADDjgg3HXXXeHEE0/MgveSSy4Jd9xxR2MdHhMgQIAAAQITFBjzDDjWFf/m9+yzz55gtYoTIECA\nAAECYwmMeQY81oqeI0CAAAECBNoXEMDt21mTAAECBAi0LSCA26azIgECBAgQaF9AALdvZ00CBAgQ\nINC2gABum86KBAgQIECgfQEB3L6dNQkQIECAQNsC4/4ZUts1d2DF+PWIM2bM6EDNqmwUmIzz0NBQ\nmDJlSunHaurUqWFwcDD7aex/mR7H/X769Olh586dZWrWqLbEcZ/MfjOqwg4siOMdp7K3sxcs4/HT\nK8d5vIFU/OnWFI/hZlNPBXAE3L59e7N+WJZY4LWvfU3LGp955tmWz8Un4s7WC2MVXzxiqJV9nxqx\nLHsA94plvMVu2ce8Fyzjm5mRfXPMF4QuPxnfzMQx7+bxE9vQbGq+tFnJkiwr+4FTEqaONmO8MeiV\nAB45MMfrT0cxc1YeX0DiT5mnXgiN2MZeaWfZ98t4/PRCAI+Md/y/bJPfAZdtRLSHAAECBPpCQAD3\nxTDrJAECBAiUTUAAl21EtIcAAQIE+kJAAPfFMOskAQIECJRNQACXbUS0hwABAgT6QkAA98Uw6yQB\nAgQIlE1AAJdtRLSHAAECBPpCQAD3xTDrJAECBAiUTUAAl21EtIcAAQIE+kJAAPfFMOskAQIECJRN\nQACXbUS0hwABAgT6QkAA98Uw6yQBAgQIlE2g576MoWyA/diehQsXjNntZ599bsznPUmAAAECITgD\nthcQIECAAIEuCAjgLqDbJAECBAgQEMD2AQIECBAg0AUBAdwFdJskQIAAAQIC2D5AgAABAgS6ICCA\nu4BukwQIECBAwJ8h2QcKFxjvz5iGh9cU3iYbJECAQNECArho8T7Y3utet8+rvRz774X7gEIXCRAg\n0FLAJeiWNJ4gQIAAAQKdExDAnbNVMwECBAgQaCkggFvSeIIAAQIECHROQAB3zlbNBAgQIECgpYAA\nbknjCQIECBAg0DkBAdw5WzUTIECAAIGWAgK4JY0nCBAgQIBA5wQEcOds1UyAAAECBFoKCOCWNJ4g\nQIAAAQKdExDAnbNVMwECBAgQaCkggFvSeIIAAQIECHROQAB3zlbNBAgQIECgpYAAbknjCQIECBAg\n0DkBAdw5WzUTIECAAIGWAgK4JY0nCBAgQIBA5wQEcOds1UyAAAECBFoKCOCWNJ4gQIAAAQKdExDA\nnbNVMwECBAgQaCkggFvSeIIAAQIECHROQAB3zlbNBAgQIECgpYAAbknjCQIECBAg0DkBAdw5WzUT\nIECAAIGWAgK4JY0nCBAgQIBA5wQEcOds1UyAAAECBFoKCOCWNJ4gQIAAAQKdExjqXNVqJtCewMKF\nC8ZccXh4zZjPe5IAAQK9IOAMuBdGSRsJECBAoHICArhyQ6pDBAgQINALAgK4F0ZJGwkQIECgcgIC\nuHJDqkMECBAg0AsCArgXRkkbCRAgQKByAgK4ckOqQwQIECDQCwICuBdGSRsJECBAoHICArhyQ6pD\nBAgQINALAm7E0QujpI3JBNzkIxmliggQmKSAM+BJAlqdAAECBAi0I+AMuB0165RaYLyz3FI3XuMI\nEOgbAWfAfTPUOkqAAAECZRIQwGUaDW0hQIAAgb4REMB9M9Q6SoAAAQJlEhDAZRoNbSFAgACBvhEQ\nwH
0z1DpKgAABAmUS8CnoMo2GtuQS8CnnXEwKESBQcgFnwCUfIM0jQIAAgWoKOAOu5rjqVYcExjv7\nHh5e06Etq5YAgaoJOAOu2ojqDwECBAj0hIAz4J4YJo3sFQFnyL0yUtpJoPsCzoC7PwZaQIAAAQJ9\nKCCA+3DQdZkAAQIEui8ggLs/BlpAgAABAn0oIID7cNB1mQABAgS6LyCAuz8GWkCAAAECfSgggPtw\n0HWZAAECBLovIIC7PwZaQIAAAQJ9KCCA+3DQdZkAAQIEui8ggLs/BlpAgAABAn0o4E5YfTjoutw9\nAXfK6p69LRMom4Az4LKNiPYQIECAQF8ICOC+GGadJECAAIGyCQjgso2I9hAgQIBAXwgI4L4YZp0k\nQIAAgbIJCOCyjYj2ECBAgEBfCAjgvhhmnSRAgACBsgl0NYBXrVoVdu7cWTNZvXp1iD8mAgQIECBQ\ndYGuBfADDzwQzjvvvLB9+/bMeNmyZWH58uXh2muvDbfddlvV3fWPAAECBPpcoCs34li5cmW45557\nwqJFi2r8K1asCFdeeWUWyOeff3445ZRTas+ZIUCAAAECVRPoSgAvXrw4xJ8YtHFat25dmDt3bjY/\nNDQUduzYkc3Hf/7yl7+Ez372s9njT33qUyH+mAh0SmDevHmdqjpXvc22PzAwEPbee+9c63ezUGzn\nzJkzu9mEcbcd2zhjxowwe/bscct2s0Bs5x577NHNJoy77djG+NNsnx135QILxDbOmTMn7Nq1q8Ct\n7r6pLVu27L7g1UddCeDGlgwODu72u+AYwiPTYYcdFn7+859nD6dNmxaef/75kacS/N/dF9sEHVBF\nYoGhocHENU6sumb79z777JO9Sa3/vMTEai2mdAzfTZs2FbOxNrey5557hm3btoWXX365zRqKWW3W\nrFlh48aNxWysza1Mnz49e5PwwgsvtFlDMavFNzIxALt5/LR6M/X/k64Yi6ZbiQfFiy++mD0Xd7r6\nxsb5kUvV69evDxs2bGhah4UEqiBQf/Wnvj/xxaPVc/XlujkfzzDK3sboyDLNXhIde2XM434Z21u2\nqRQBHFGWLFkSLr744uyd/jnnnFM2J+0hQIAAAQJJBboawEuXLq115n/+53/C8ccfH6ZMmZL91J4w\nQ4AAAQIEKijQ1QBu9Kz/3W/jcx4T6AeB1l9XOC8MD6+ZFEHruv+v2snWP6nGWZlAHwp07e+A+9Ba\nlwkQIECAQE1AANcozBAgQIAAgeIEBHBx1rZEgAABAgRqAgK4RmGGAAECBAgUJyCAi7O2JQIECBAg\nUBMQwDUKMwQIECBAoDgBAVyctS0RIECAAIGagACuUZghQIAAAQLFCQjg4qxtiQABAgQI1ARKdSes\nWqvMECBQuMB4d8oar0EvvbR+vCKeJ0CgTsAZcB2GWQIECBAgUJSAAC5K2nYIECBAgECdgACuwzBL\ngAABAgSKEhDARUnbDgECBAgQqBMQwHUYZgkQIECAQFECArgoadshQIAAAQJ1AgK4DsMsAQIECBAo\nSkAAFyVtOwQIECBAoE5AANdhmCVAgAABAkUJCOCipG2HAAECBAjUCQjgOgyzBAgQIECgKAEBXJS0\n7RAgQIAAgToBX8ZQh2GWAIHOCYz1ZQ/Dw2smteGx6o4VT7b+STXOygRaCDgDbgFjMQECBAgQ6KSA\nAO6krroJECBAgEALAQHcAsZiAgQIECDQSQEB3ElddRMgQIAAgRYCArgFjMUECBAgQKCTAgK4k7rq\nJkCAAAECLQQEcAsYiwkQIECAQCcF/B1wJ3XVTYBALgF/x5uLSaGKCTgDrtiA6g4BAgQI9IaAAO6N\ncdJKAgQIEKiYgACu2IDqDgECBAj0hoAA7o1x0koCBAgQqJiAAK7YgOoOAQIECPSGgADujXHSSgIE\nCBComIAArtiA6g4BAgQI9IaAAO6NcdJKAgQIEKiYgACu2IDqDgEC
BAj0hoAA7o1x0koCBAgQqJiA\nAK7YgOoOAQIECPSGgHtB98Y4aSWBcQXGu5/yuBVUuMBom7m79XZ4eM1ujz0gUISAM+AilG2DAAEC\nBAg0CAjgBhAPCRAgQIBAEQICuAhl2yBAgAABAg0CArgBxEMCBAgQIFCEgAAuQtk2CBAgQIBAg4AA\nbgDxkAABAgQIFCEggItQtg0CBAgQINAgIIAbQDwkQIAAAQJFCAjgIpRtgwABAgQINAgI4AYQDwkQ\nIECAQBECArgIZdsgQIAAAQINAgK4AcRDAgQIECBQhIAALkLZNggQIECAQIOAAG4A8ZAAAQIECBQh\n4OsIi1C2DQIJBEZ/pV6CShNWseeec16pLf6knzrd9/Hqn8zXFXay7vTSaWssc9/L0DZnwGn3N7UR\nIECAAIFcAgI4F5NCBAgQIEAgrYAATuupNgIECBAgkEtAAOdiUogAAQIECKQVEMBpPdVGgAABAgRy\nCQjgXEwKESBAgACBtAICOK2n2ggQIECAQC4BAZyLSSECBAgQIJBWQACn9VQbAQIECBDIJSCAczEp\nRIAAAQIE0goI4LSeaiNAgAABArkEBHAuJoUIECBAgEBaAQGc1lNtBAgQIEAgl4AAzsWkEAECBAgQ\nSCsggNN6qo0AAQIECOQSEMC5mBQiQIAAAQJpBQRwWk+1ESBAgACBXAICOBeTQgQIECBAIK2AAE7r\nqTYCBAgQIJBLQADnYlKIAAECBAikFRDAaT3VRoAAAQIEcgkI4FxMChEgQIAAgbQCAjitp9oIECBA\ngEAuAQGci0khAgQIECCQVkAAp/VUGwECBAgQyCUggHMxKUSAAAECBNIKDOx6ZUpbZedq27BhQ8cq\nnzZtWlb31q1bO7aNFBXHdpa9jVOmTAnTp08PL7/8cooud6yOwcHBEHf/nTt3dmwbKSqeNWtW2LRp\nU9bWFPV1qo5e2DfjfhnHe9u2bZ1iSFJvL1gODQ2F+LN58+Ykfe5UJVOnTg3bt2/v+vEze/bsUV0c\nGrWkxAvii2WnQnjOnDlZz9evX19igRBiO8vexrjDx5+yt3PGjBlhx44dpX8xnjlzZti4cWPW1jLv\nnPEFplPHZ6p+xzeHMXzjG5oyT71wnMfjJ+6bZT/OYxvjm4RuvtFuFr5x/3MJusxHobYRIECAQGUF\nBHBlh1bHCBAgQKDMAgK4zKOjbQQIECBQWQEBXNmh1TECBAgQKLOAAC7z6GgbAQIECFRWQABXdmh1\njAABAgTKLCCAyzw62kaAAAEClRUQwJUdWh0jQIAAgTILCOAyj462ESBAgEBlBQRwZYdWxwgQIECg\nzAICuMyjo20ECBAgUFkBAVzZodUxAgQIECizgAAu8+hoGwECBAhUVkAAV3ZodYwAAQIEyiwggMs8\nOtpGgAABApUVEMCVHVodI0CAAIEyCwjgMo+OthEgQIBAZQUEcGWHVscIECBAoMwCA7temcrcwKLa\nduedd4YpU6aE9773vUVtsrLbefLJJ8Ott94aPvOZz1S2j0V27LLLLgvnnntu2GuvvYrcbCW39etf\n/zosXLgwvPOd76xk/4rs1N///vfw4IMPhrPOOqvIzVZqW86AXx3OFStWhL/+9a+VGtxudeaZZ54J\nt99+e7c2X7nt3njjjWHDhg2V61c3OnTvvfeGRx55pBubrtw24xvtu+66q3L9KrJDArhIbdsiQIAA\nAQKvCrgE/SrE2rVrw8DAQHjNa15j55ikwObNm8Pzzz8f9ttvv0nWZPUoEM804mXToaEhIJMUePbZ\nZ8P06dPDnnvuOcmarL5x48bsysz8+fNhtCkggNuEsxoBAgQIEJiMgEvQr+qtXr06xB/T7gLr1q0L\nDz/8cPbzxBNP1J5s9Nq+fXt46KGHQvz978jUbFl81xw/uBH/H5maLRt5rir/b9myJTz22GO17jSz\nabasmU2zZdE9+u/YsaO2jWbL
ak/2+MyqVavCzp07s17YR9sfzE2bNmX7zbZt22qVNNtv8i5rfF2I\nlTYua7af1zbeZzOD/++Vqc/6PKq7y5YtC48++mj405/+FOIOecghh4wq068LrrvuuuzDafEAjDZv\nfvObQ6PXwQcfHL72ta9ll/Cvv/768Ja3vCW7lN+4LBp+9atfDTNnzgw/+clPwvve977w0ksvjVpW\ntUut0e3SSy8N8fLn0UcfHeIfHjTaxF99NC7L6xU/QBg943Z+97vfheOOOy488MADo5ZVZR+OffvC\nF74QzjjjjDA4OBjso+2N7L///e/wzW9+MztWf/jDH4bjjz8+/O1vfxu13zTbl5ota3xdiK+jjcua\nvVa89rWvba8DFVjLL5VeGcT4AnbllVeG+M7s/PPPD6ecckoFhjZNF/75z3+Gr3zlK2HatGlZcMZa\nG73e9KY3hQULFoSPfexj4Ygjjsj+BOkDH/jAqGXz5s0Lp59+enagx7OX+++/P/z3v/8dtSy+EFRp\nivvWgQceGEbOMuLZW0qv2267LVx00UVhzpw54cILL8ze1Nx0002jllXh954rV64M99xzT1i0aFFt\nF7GP1igmNPPcc8+Fz33ucyEev/GN8J///OfsDVyefanZ/tX4uhBfRxuXNXutiG3o16nvL0HHy1dz\n587Nxj+eedVfwuvXnaK+3//617/Cz372s3DxxReHW265JTTzWrNmTRYocb34gYynn346TGZZ/far\nMP/FL34xvOMd76h1ZTI2zdZ98cUXs/CNG4hvcuLVimbLag3o4ZnFixdnZ78zZsyo9cI+WqOY0MyR\nRx6ZhW88puObmvi42X6TZ1m8zNz4Opr3tWJCja5Y4b4/A46XsEZ+lxTHtmqXPye7v15zzTVh1qxZ\n2dWBT3/60+GEE04Y5RVvYDJiGN/AxE+ZTmbZZNtc9vUnY9Ns3fr+xqs40b9+aras/vlen7ePtj+C\n8Sw4vrn+/Oc/H/bZZ5/dKmq237Ratscee9ReA2Il8XW02WvrePvvbg3ogwd9fwYcL8vFd3hxih9u\niTuS6f8E4sH205/+NHswEqzxXW6j1xve8IbaB4wef/zxcMABB4TJLKu6/2Rsmq0bz3rjVYc4DQ8P\nh3333Tc7E25cVkVX+2j7oxrDN34EKP7a4rDDDssqyrsvNZY76KCDRr0uNHttbbb/tt+D3l+z78+A\n4xAuWbIkexcYL5mcc845vT+qiXoQ38XGD0h8+9vfzs5oP/nJT2Y1N3q98Y1vzN49f+Mb3wjxoP7O\nd76TXRKN76jrl8V3xN///vfDfffdl52lxd8Xxw9qNC5L1PzSVpPaa/bs2WHp0qXZr0/iFYqpU6dm\ntwdsXFZakEk0zD7aPl58cx2P1yuuuCKr5LTTTmu638RbTTbuS82WNb4uxEoblzXb99vvQe+v6e+A\nXx3D+E46Xh6JP6bdBeKHh+KLev3UzGvr1q3Zh7Xqy01mWX09VZyfjM1k1q2ipX007ai2u381e11o\ntqxZ/Wl70Bu1CeDeGCetJECAAIGKCTjdq9iA6g4BAgQI9IaAAO6NcdJKAgQIEKiYgACu2IDqDgEC\nBAj0hoAA7o1x0koCuQTiXbE+8pGP5Cqbt1C8B/hVV12VFY+3Eh351Gze9ZUjQKC5gABu7mIpAQKv\nCsQ/G4s/JgIE0goI4LSeaiNQGoG77747HH744dl9qOM9uOP9fuMU/97zW9/6VvbFGvGeysuXL8+W\nx9sJvuc978nus3zBBRdkX+oQb04T75R06623Zn+vHQvGLy056qijwv777x8uv/zybF3/ECAwcQEB\nPHEzaxDoCYF4k/urr746PPLII2G//fYLN9xwQ9bueLes+O1f8YsN4k1QLrnkkmz5mWeeGeLNVuJz\nAwMD4cknn8xuQxq/MedDH/pQiPe0jlP8Fp0Ywn/84x+zb3CKf+dpIkBg4gICeOJm1iBQeoEYov
FW\nlHfccUe47LLLwubNm8PNN99ca3f8Kr94c5X49YjxbkgbNmzIvvP5ox/9aHYP37PPPrtWtnEmhnG8\nZWu8q9HrX//67Is3Gst4TIDA+AJuRTm+kRIEek5g7dq12bfTvPWtb83aHv8/+eSTa/2I94uOU7w9\naPx+4ngHuPiVk3E+TmPdES7e+nJkGll/5LH/CRDIL+AMOL+VkgR6RuDYY48NW7ZsCcccc0z2O984\nHy8Zt5pmzpwZ3v3ud4cbb7wx+1ab+BWUI1MM5njrQBMBAmkFnAGn9VQbgVIIxN/h/uAHP8hurh/P\nauO3WV1//fVjtu3SSy8NH//4x7M/MzrxxBOz3//GFeIHueJXUcavrItBbSJAII2Ae0GncVQLgdIK\nxN/v1l82btXQeNYbPy0dv+w+foI6fsL5N7/5TVY8Bnj8iWfDJgIE0gg4A07jqBYCpRXIE76x8WvW\nrAknnXRSiF8Tefvtt4cf/ehHtT7F3/XGHxMBAukEnAGns1QTgZ4XeOqpp8KqVavCkUceGeIXqpsI\nEOicgADunK2aCRAgQIBASwGfgm5J4wkCBAgQINA5AQHcOVs1EyBAgACBlgICuCWNJwgQIECAQOcE\nBHDnbNVMgAABAgRaCvwvr5/BusbuCS0AAAAASUVORK5CYII=\n"
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.06 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)\n"
]
}
],
"source": [
"%%timeit -n 1 -r 1\n",
"%%R\n",
"ggplot(ndf, aes(x = length)) +\n",
"  geom_histogram(bins = 50, fill = \"blue\") +\n",
"  scale_y_log10()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment