Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save johnsolk/d70425d21be9602108ec7451f4ada5be to your computer and use it in GitHub Desktop.
Save johnsolk/d70425d21be9602108ec7451f4ada5be to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# What is the mininum readlength for the longest 10% of quality-filtered reads?"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline\n",
"from pylab import *\n",
"import screed"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[0m\u001b[01;34m1-E7-H7\u001b[0m/ E7-H7_md5sum.txt robots.txt\r\n",
"assembly-stats.log fastq_9.fastq slurm-23812998.out\r\n",
"assembly-stats.sh md5sum_A_xenica.log slurm-25430571.out\r\n",
"A_xenica_combined.fastq md5sum_A_xenica.sh summary.sh\r\n",
"A_xenica_PromethION_QC.ipynb miniasm.sh wget_index.html.tmp\r\n"
]
}
],
"source": [
"ls"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"... 0\n",
"... 1000000\n",
"... 2000000\n",
"... 3000000\n",
"... 4000000\n",
"... 5000000\n",
"... 6000000\n",
"... 7000000\n",
"... 8000000\n",
"... 9000000\n",
"... 10000000\n",
"... 11000000\n",
"... 12000000\n",
"... 13000000\n",
"... 14000000\n",
"... 15000000\n"
]
}
],
"source": [
"lengths = []\n",
"for n,r in enumerate(screed.open('A_xenica_combined.fastq')):\n",
" if n % 1000000 == 0:\n",
" print('...',n)\n",
" lengths.append(len(r.sequence))"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"15704522\n"
]
}
],
"source": [
"total_n_reads = len(lengths)\n",
"print(total_n_reads)"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {},
"outputs": [],
"source": [
"lengths = sorted(lengths)\n",
"rank_position = 0.1*total_n_reads\n",
"lengths_grab = total_n_reads - int(rank_position)\n",
"top10percent = sorted(lengths[lengths_grab:],reverse=True)"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAaUAAAD8CAYAAADXJLslAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvIxREBQAAGQtJREFUeJzt3X2QXfV93/H3p1KEsVMjARuKJTySazkdmUlrvMHyuM04kIAAj0VnqCuaKYpDrUnAqRNnxhbxH6R2MoPTTIiZ2NiMpVh4XAQlbtDYYFUBUk9nysPiB54xG8BmNWDJiIcmnhpjf/vH/Qlf1vfuWrti91z0fs3c2XO+53fO9xydhQ/33J8uqSokSeqCf7LYJyBJ0kGGkiSpMwwlSVJnGEqSpM4wlCRJnWEoSZI6w1CSJHWGoSRJ6oxZQynJ9iT7ktw7rf67SR5Mcl+SP+2rX5JkMslDSc7sq29otckkW/vqa5Lc3urXJlnW6ke19cm2ffVsPSRJoy2zfaNDkl8B/gG4uqpObrVfBT4CnFNVP0jyC1W1L8k64BrgVOB1wN8Cb2qH+hbw68AUcCdwflXdn+Q64ItVtTPJp4FvVtWVSS4CfqmqfjvJJuDfVtW/H9ajqn4003Ucf/zxtXr16kP845GkI9tdd931vaoaW6h+S2cbUFVf7X+X0vwOcFlV/aCN2dfqG4Gdrf5okkl64QEwWVWPACTZCWxM8gBwGvAf2pgdwB8BV7Zj/VGrXw/8ZZLM0OP/zHQdq1evZmJiYrbLlST1SfLthew318+U3gT8m/ZY7X8l+eVWXwk83jduqtWG1Y8DnqmqF6bVX3Kstv3ZNn7YsSRJI27Wd0oz7HcssB74ZeC6JG84bGd1mCTZAmwBeP3rX7/IZyNJms1c3ylN0fscqKrqDuDHwPHAXuCkvnGrWm1Y/SlgeZKl0+r079O2H9PGDzvWT6mqq6pqvKrGx8YW7JGoJGmO5hpKfwP8KkCSNwHLgO8Bu4BNbebcGmAtcAe9iQ1r20y7ZcAmYFf1ZlncCpzXjrsZuKEt72rrtO23tPHDekiSRtysj++SXAO8Ezg+yRRwKbAd2N6miT8PbG6BcV+bTXc/8AJw8cFZcUneD+wGlgDbq+q+1uLDwM4kfwx8HdjW6tuAz7eJDAfoBRlVNbSHJGm0zTol/JVifHy8nH0nSYcmyV1VNb5Q/fxGB0lSZxhKkqTOMJQkSZ0x17+ndERZvfXL89r/scvOOUxnIkmvbL5TkiR1hqEkSeoMQ0mS1BmGkiSpMwwlSVJnGEqSpM4wlCRJnWEoSZI6w1CSJHWGoSRJ6gxDSZLUGYaSJKkzDCVJUmcYSpKkzpg1lJJsT7Ivyb0Dtv1BkkpyfFtPkiuSTCa5O8kpfWM3J3m4vTb31d+a5J62zxVJ0urHJtnTxu9JsmK2HpKk0fazvFP6HLBhejHJScAZwHf6ymcBa9trC3BlG3sscCnwNuBU4NKDIdPGvK9vv4O9tgI3V9Va4Oa2PrSHJGn0zRpKVfVV4MCATZcDHwKqr7YRuLp6bgOWJzkROBPYU1UHquppYA+woW17bVXdVlUFXA2c23esHW15x7T6oB6SpBE3p8+UkmwE9lbVN6dtWgk83rc+1Woz1acG1AFOqKon2vKTwAmz9JAkjbhD/t+hJ3k18If0Ht0tiKqqJDX7yJdKsoXeIz5e//rXH/bzkiQdXnN5p/TPgTXAN5M8BqwCvpbknwF7gZP6xq5qtZnqqwbUAb578LFc+7mv1Ycd66dU1VVVNV5V42NjY4d4mZKkhXbIoVRV91TVL1TV6qpaTe/x2SlV9SSwC7igzZBbDzzbHsHtBs5IsqJNcDgD2N22PZdkfZt1dwFwQ2u1Czg4S2/ztPqgHpKkETfr47sk1wDvBI5PMgVcWlXbhgy/ETgbmAS+D7wXoKoOJPkYcGcb99GqOjh54iJ6M/yOBm5qL4DLgOuSXAh8G3jPTD0kSaNv1lCqqvNn2b66b7mAi4eM2w5sH1CfAE4eUH8KOH1AfWgPSdJo8xsdJEmdYShJkjrDUJIkdYahJEnqDENJktQZhpIkqTMMJUlSZxhKkqTOMJQkSZ1hKEmSOsNQkiR1hqEkSeoMQ0mS1BmGkiSpMwwlSVJnGEqSpM4wlCRJnWEoSZI6Y9ZQSrI9yb4k9/bV/muSB5PcneR/JFnet+2SJJNJHkpyZl99Q6tNJtnaV1+T5PZWvzbJslY/qq1Ptu2rZ+shSRptP8s7pc8BG6bV9gAnV9UvAd8CLgFIsg7YBLy57fOpJEuSLAE+CZwFrAPOb2MBPg5cXlVvBJ4GLmz1C4GnW/3yNm5oj0O8bklSB80aSlX1VeDAtNr/rKoX2uptwKq2vBHYWVU/qKpHgUng1PaarKpHqup5YCewMUmA04Dr2/47gHP7jrWjLV8PnN7GD+shSRpxh+Mzpd8CbmrLK4HH+7ZNtdqw+nHAM30Bd7D+kmO17c+28cOO9VOSbEkykWRi//79c7o4SdLCmVcoJfkI8ALwhcNzOodXVV1VVeNVNT42NrbYpyNJmsXSue6Y5DeBdwGnV1W18l7gpL5hq1qNIfWngOVJlrZ3Q/3jDx5rKslS4Jg2fqYekqQRNqd3Skk2AB8C3l1V3+/btAvY1GbOrQHWAncAdwJr20y7ZfQmKuxqYXYrcF7bfzNwQ9+xNrfl84Bb2vhhPSRJI27Wd0pJrgHeCRyfZAq4lN5su6OAPb25B9xWVb9dVfcluQ64n95jvYur6kftOO8HdgNLgO1VdV9r8WFgZ5I/Br4ObGv1bcDnk0zSm2ixCWCmHpKk0ZafPHl7ZRsfH6+JiYk57bt665fn1fuxy86Z1/6StFiS3FVV4wvVz290kCR1hqEkSeoMQ0mS1BmGkiSpMwwlSVJnGEqSpM4wlCRJnWEoSZI6w1CSJHWGoSRJ6gxDSZLUGYaSJKkzDCVJUmcYSpKkzjCUJEmdYShJkjrDUJIkdYahJEnqjFlDKcn2JPuS3NtXOzbJniQPt58rWj1JrkgymeTuJKf07bO5jX84yea++luT3NP2uSJJ5tpDkjTafpZ3Sp8DNkyrbQVurqq1wM1tHeAsYG17bQGuhF7AAJcCbwNOBS49GDJtzPv69tswlx6SpNE3ayhV1VeBA9PKG4EdbXkHcG5f/erquQ1YnuRE4ExgT1UdqKqngT3AhrbttVV1W1UVcPW0Yx1KD0nSiJvrZ0onVNUTbflJ4IS2vBJ4vG/cVKvNVJ8aUJ9Lj5+SZEuSiSQT+/fv/xkvTZK0WOY90aG9w6nDcC6HvUdVXVVV41U1PjY29jKcmSTpcJprKH334COz9nNfq+8FTuobt6rVZqqvGlCfSw9J0oibayjtAg7OoNsM3NBXv6DNkFsPPNsewe0Gzkiyok1wOAPY3bY9l2R9m3V3wbRjHUoPSdKIWzrbgCTXAO8Ejk8yRW8W3WXAdUkuBL4NvKcNvxE4G5gEvg+8F6CqDiT5GHBnG/fRqjo4eeIiejP8jgZuai8OtYckafTNGkpVdf6QTacPGFvAxUOOsx3YPqA+AZw8oP7UofaQJI02v9FBktQZhpIkqTMMJUlSZxhKkqTOMJQkSZ1hKEmSOsNQkiR1hqEkSeoMQ0mS1BmGkiSpMwwlSVJnGEqSpM4wlCRJnWEoSZI6w1CSJHWGoSRJ6gxDSZLUGYaSJKkz5hVKSX4/yX1J7k1yTZJXJVmT5PYkk0muTbKsjT2qrU+27av7jnNJqz+U5My++oZWm0yyta8+sIckabTNOZSSrAT+MzBeVScDS4BNwMeBy6vqjcDTwIVtlwuBp1v98jaOJOvafm8GNgCfSrIkyRLgk8BZwDrg/DaWGXpIkkbYfB/fLQWOTrIUeDXwBHAacH3bvgM4ty1vbOu07acnSavvrKofVNWjwCRwantNVtUjVfU8sBPY2PYZ1kOSNMLmHEpVtRf4M+A79MLoWeAu4JmqeqENmwJWtuWVwONt3xfa+OP669P2GVY/boYeL5FkS5KJJBP79++f66VKkhbIfB7fraD3LmcN8DrgNfQev3VGVV1VVeNVNT42NrbYpyNJmsV8Ht/9GvBoVe2vqh8CXwTeASxvj/MAVgF72/Je4CSAtv0Y4Kn++rR9htWfmqGHJGmEzSeUvgOsT/Lq9jnP6cD9wK3AeW3MZuCGtryrrdO231JV1eqb2uy8NcBa4A7gTmBtm2m3jN5kiF1tn2E9JEkjbD6fKd1Ob7LB14B72rGuAj4MfDDJJL3Pf7a1XbYBx7X6B4Gt7Tj3AdfRC7SvABdX1Y/aZ0bvB3YDDwDXtbHM0EOSNMLSe+Pxyjc+Pl4TExNz2nf11i/Pq/djl50zr/0labEkuauqxheqn9/oIEnqDENJktQZhpIkqTMMJUlSZxhKkqTOMJQkSZ1hKEmSOsNQkiR1hqEkSeoMQ0mS1BmGkiSpMwwlSVJnGEqSpM4wlCRJnWEoSZI6w1CSJHWGoSRJ6ox5hVKS5UmuT/JgkgeSvD3JsUn2JHm4/VzRxibJFUkmk9yd5JS+42xu4x9Osrmv/tYk97R9rkiSVh/YQ5I02ub7TukTwFeq6l8A/xJ4ANgK3FxVa4Gb2zrAWcDa9toCXAm9gAEuBd4GnApc2hcyVwLv69tvQ6sP6yFJGmFzDqUkxwC/AmwDqKrnq+oZYCOwow3bAZzbljcCV1fPbcDyJCcCZwJ7qupAVT0N7AE2tG2vrarbqqqAq6cda1APSdIIm887pTXAfuCvknw9yWeTvAY4oaqeaGOeBE5oyyuBx/v2n2q1mepTA+rM0EOSNMLmE0pLgVOAK6vqLcA/Mu0xWnuHU/PoMauZeiTZkmQiycT+/ftfztOQJB0G8wmlKWCqqm5v69fTC6nvtkdvtJ/72va9wEl9+69qtZnqqwbUmaHHS1TVVVU1XlXjY2Njc7pISdLCmXMoVdWTwONJfrGVTgfuB3YBB2fQbQZuaMu7gAvaLLz1wLPtEdxu4IwkK9oEhzOA3W3bc0nWt1l3F0w71qAekqQRtnSe+/8u8IUky4BHgPfSC7rrklwIfBt4Txt7I3A2MAl8v42lqg4k+RhwZxv30ao60JYvAj4HHA3c1F4Alw3pIUkaYfMKpar6BjA+YNPpA8YWcPGQ42wHtg+oTwAnD6g/NaiHJGm0+Y0OkqTOMJQkSZ1hKEmSOsNQkiR1hqEkSeoMQ0mS1BmGkiSpMwwlSVJnGEqSpM4wlCRJnWEoSZI6w1CSJHWGoSRJ6gxDSZLUGYaSJKkzDCVJUmcYSpKkzjCUJEmdMe9QSrIkydeTfKmtr0lye5LJJNcmWdbqR7X1ybZ9dd8xLmn1h5Kc2Vff0GqTSbb21Qf2kCSNtsPxTukDwAN96x8HLq+qNwJPAxe2+oXA061+eRtHknXAJuDNwAbgUy3olgCfBM4C1gHnt7Ez9ZAkjbB5hVKSVcA5wGfbeoDTgOvbkB3AuW15Y1unbT+9jd8I7KyqH1TVo8AkcGp7TVbVI1X1PLAT2DhLD0nSCJvvO6W/AD4E/LitHwc8U1UvtPUpYGVbXgk8DtC2P9vGv1ifts+w+kw9JEkjbM6hlORdwL6quuswns9hlWRLkokkE/v371/s05EkzWI+75TeAbw7yWP0Hq2dBnwCWJ5kaRuzCtjblvcCJwG07ccAT/XXp+0zrP7UDD1eoqquqqrxqhofGxub+5VKkhbEnEOpqi6pqlVVtZreRIVbquo3gFuB89qwzcANbXlXW6dtv6WqqtU3tdl5a4C1wB3AncDaNtNuWeuxq+0zrIckaYS9HH9P6cPAB5NM0vv8Z1urbwOOa/UPAlsBquo+4DrgfuArwMVV9aP2mdH7gd30Zvdd18bO1EOSNMKWzj5kdlX1d8DfteVH6M2cmz7m/wH/bsj+fwL8yYD6jcCNA+oDe0iSRpvf6CBJ6gxDSZLUGYaSJKkzDCVJUmcYSpKkzjCUJEmdYShJkjrDUJIkdYahJEnqDENJktQZhpIkqTMMJUlSZxhKkqTOMJQkSZ1hKEmSOsNQkiR1hqEkSeoMQ0mS1BlzDqUkJyW5Ncn9Se5L8oFWPzbJniQPt58rWj1JrkgymeTuJKf0HWtzG/9wks199bcmuaftc0WSzNRDkjTa5vNO6QXgD6pqHbAeuDjJOmArcHNVrQVubusAZwFr22sLcCX0Aga4FHgbcCpwaV/IXAm8r2+/Da0+rIckaYTNOZSq6omq+lpb/r/AA8BKYCOwow3bAZzbljcCV1fPbcDyJCcCZwJ7qupAVT0N7AE2tG2vrarbqqqAq6cda1APSdIIOyyfKSVZDbwFuB04oaqeaJueBE5oyyuBx/t2m2q1mepTA+rM0EOSNMLmHUpJfh74a+D3quq5/m3tHU7Nt8dMZuqRZEuSiSQT+/fvfzlPQ5J0GMwrlJL8HL1A+kJVfbGVv9sevdF+7mv1vcBJfbuvarWZ6qsG1Gfq8RJVdVVVjVfV+NjY2NwuUpK0YOYz+y7ANuCBqvrzvk27gIMz6DYDN/TVL2iz8NYDz7ZHcLuBM5KsaBMczgB2t23PJVnfel0w7ViDekiSRtjSeez7DuA/Avck+Uar/SFwGXBdkguBbwPvadtuBM4GJoHvA+8FqKoDST4G3NnGfbSqDrTli4DPAUcDN7UXM/SQJI2wOYdSVf1vIEM2nz5gfAEXDznWdmD7gPoEcPKA+lODekiSRpvf6CBJ6gxDSZLUGYaSJKkzDCVJUmcYSpKkzjCUJEmdYShJkjrDUJIkdYahJEnqDENJktQZhpIkqTMMJUlSZxhKkqTOMJQkSZ1hKEmSOsNQkiR1hqEkSeoMQ0mS1Blz/t+hd0GSDcAngCXAZ6vqskU+pYFWb/3ynPd97LJzDuOZSFK3jew7pSRLgE8CZwHrgPOTrFvcs5IkzcfIhhJwKjBZVY9U1fPATmDjIp+TJGkeRjmUVgKP961PtZokaUSN9GdKs0myBdjSVv8hyUNzOMzxwPcO31kdmnx8sTq/aFGvvwO8fq//SL5+gF9cyGajHEp7gZP61le12ouq6irgqvk0STJRVePzOcYo8/q9fq//yL1+6P0ZLGS/UX58dyewNsmaJMuATcCuRT4nSdI8jOw7pap6Icn7gd30poRvr6r7Fvm0JEnzMLKhBFBVNwI3vsxt5vX47xXA6z+yef1a0D+DVNVC9pMkaahR/kxJkvQKYygNkWRDkoeSTCbZutjnc6iSnJTk1iT3J7kvyQda/dgke5I83H6uaPUkuaJd791JTuk71uY2/uEkm/vqb01yT9vniiSZqcdiSLIkydeTfKmtr0lyezvna9skGZIc1dYn2/bVfce4pNUfSnJmX33g78iwHgstyfIk1yd5MMkDSd5+JN3/JL/ffvfvTXJNkle90u9/ku1J9iW5t6+2aPd8ph5DVZWvaS96Eyf+HngDsAz4JrBusc/rEK/hROCUtvxPgW/R+zqmPwW2tvpW4ONt+WzgJiDAeuD2Vj8WeKT9XNGWV7Rtd7Sxafue1eoDeyzSn8MHgf8GfKmtXwdsasufBn6nLV8EfLotbwKubcvr2v0/CljTfi+WzPQ7MqzHIlz7DuA/teVlwPIj5f7T+4v0jwJH992T33yl33/gV4BTgHv7aot2z4f1mPEaFuMflq6/gLcDu/vWLwEuWezzmuc13QD8OvAQcGKrnQg81JY/A5zfN/6htv184DN99c+02onAg331F8cN67EI17wKuBk4DfhS+wfje8DS6feZ3izOt7flpW1cpt/7g+OG/Y7M1GOBr/0Yev9SzrT6EXH/+ck3vhzb7ueXgDOPhPsPrOalobRo93xYj5nO38d3g72ivsKoPYp4C3A7cEJVPdE2PQmc0JaHXfNM9akBdWbosdD+AvgQ8OO2fhzwTFW90Nb7z/nF62zbn23jD/XPZaYeC2kNsB/4q/QeX342yWs4Qu5/Ve0F/gz4DvAEvft5F0fO/e+3mPf8kP9daii9wiX5eeCvgd+rquf6t1XvP11e1umXC9FjkCTvAvZV1V0L3bsjltJ7jHNlVb0F+Ed6j1Ve9Aq//yvofUHzGuB1wGuADQt9Hl0zCvfcUBps1q8wGgVJfo5eIH2hqr7Yyt9NcmLbfiKwr9WHXfNM9VUD6jP1WEjvAN6d5DF63yB/Gr3/99byJAf/fl7/Ob94nW37McBTHPqfy1Mz9FhIU8BUVd3e1q+nF1JHyv3/NeDRqtpfVT8Evkjvd+JIuf/9FvOeH/K/Sw2lwUb+K4zarJhtwANV9ed9m3YBB2fTbKb3WdPB+gVttsx64Nn2dnw3cEaSFe2/Ps+g94z8CeC5JOtbrwumHWtQjwVTVZdU1aqqWk3v/t1SVb8B3AqcN+Dc+s/5vDa+Wn1Tm521BlhL78Pegb8jbZ9hPRZMVT0JPJ7k4Jdpng7czxFy/+k9tluf5NXt/A5e/xFx/6dZzHs+rMdwC/kB3Ci96M0a+Ra9GTYfWezzmcP5/2t6b6HvBr7RXmfTe+Z9M/Aw8LfAsW186P1PE/8euAcY7zvWbwGT7fXevvo4cG/b5y/5yV/GHthjEf8s3slPZt+9gd6/VCaB/w4c1eqvauuTbfsb+vb/SLvGh2izjWb6HRnWYxGu+18BE+134G/ozaQ6Yu4/8F+AB9s5fp7eDLpX9P0HrqH3GdoP6b1bvnAx7/lMPYa9/EYHSVJn+PhOktQZhpIkqTMMJUlSZxhKkqTOMJQkSZ1hKEmSOsNQkiR1hqEkSeqM/w+vSU6XlYCGHgAAAABJRU5ErkJggg==\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"hist(top10percent,bins=20);"
]
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[5146]\n"
]
}
],
"source": [
"print(top10percent[-1:])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment