Skip to content

Instantly share code, notes, and snippets.

@johnsolk
Created August 23, 2018 17:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save johnsolk/895449b2932aff2b5b89f22c142caa73 to your computer and use it in GitHub Desktop.
Save johnsolk/895449b2932aff2b5b89f22c142caa73 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline\n",
"import sourmash_lib, sourmash_lib.fig"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"import sklearn.cluster as cluster\n",
"import time\n",
"import hdbscan\n",
"import numpy as np\n",
"from sklearn.manifold import TSNE\n",
"import matplotlib.pyplot as plt\n",
"import palettable as pal\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"A_xenica_minisasm.fa\r\n",
"F_notti_subset.fa\r\n",
"F_nottii_miniasm.fa\r\n",
"GCF_000826765.1_Fundulus_heteroclitus-3.0.2_genomic.fna\r\n",
"cluster_contigs.ipynb\r\n"
]
}
],
"source": [
"ls"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"ename": "OSError",
"evalue": "Failed to interpret file <_io.BufferedReader name='F_nottii_subset.fa'> as a pickle",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mUnpicklingError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m~/anaconda3/envs/ONT/lib/python3.6/site-packages/numpy/lib/npyio.py\u001b[0m in \u001b[0;36mload\u001b[0;34m(file, mmap_mode, allow_pickle, fix_imports, encoding)\u001b[0m\n\u001b[1;32m 427\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 428\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mpickle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfid\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mpickle_kwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 429\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mUnpicklingError\u001b[0m: invalid load key, '>'.",
"\nDuring handling of the above exception, another exception occurred:\n",
"\u001b[0;31mOSError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-8-385394994a12>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0msubs2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msubs2_labels\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msourmash_lib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload_matrix_and_labels\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'F_nottii_subset.fa'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m~/anaconda3/envs/ONT/lib/python3.6/site-packages/sourmash/fig.py\u001b[0m in \u001b[0;36mload_matrix_and_labels\u001b[0;34m(basefile)\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[0mReturns\u001b[0m \u001b[0ma\u001b[0m \u001b[0msquare\u001b[0m \u001b[0mnumpy\u001b[0m \u001b[0mmatrix\u001b[0m \u001b[0;34m&\u001b[0m \u001b[0mlist\u001b[0m \u001b[0mof\u001b[0m \u001b[0mlabels\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 17\u001b[0m \"\"\"\n\u001b[0;32m---> 18\u001b[0;31m \u001b[0mD\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnumpy\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbasefile\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'rb'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 19\u001b[0m \u001b[0mlabeltext\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstrip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbasefile\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m'.labels.txt'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 20\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mD\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlabeltext\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/anaconda3/envs/ONT/lib/python3.6/site-packages/numpy/lib/npyio.py\u001b[0m in \u001b[0;36mload\u001b[0;34m(file, mmap_mode, allow_pickle, fix_imports, encoding)\u001b[0m\n\u001b[1;32m 429\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 430\u001b[0m raise IOError(\n\u001b[0;32m--> 431\u001b[0;31m \"Failed to interpret file %s as a pickle\" % repr(file))\n\u001b[0m\u001b[1;32m 432\u001b[0m \u001b[0;32mfinally\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 433\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mown_fid\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mOSError\u001b[0m: Failed to interpret file <_io.BufferedReader name='F_nottii_subset.fa'> as a pickle"
]
}
],
"source": [
"subs2, subs2_labels = sourmash_lib.fig.load_matrix_and_labels('F_nottii_subset.fa')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# cluster data using 'algorithm', then use TSNE to plot & color by clustesr\n",
"def plot_clusters(data, algorithm, args, kwds):\n",
" start_time = time.time()\n",
" labels = algorithm(*args, **kwds).fit_predict(data)\n",
" #print(labels)\n",
" end_time = time.time()\n",
" \n",
" palette = sns.color_palette('deep', np.unique(labels).max() + 1)\n",
" colors = [palette[x] if x >= 0 else (0.0, 0.0, 0.0) for x in labels]\n",
" \n",
" t = TSNE(n_components=2, perplexity=5).fit_transform(data)\n",
"\n",
" df = pd.DataFrame(t)\n",
" df.columns=['t1','t2']\n",
" df['colors'] = pd.Series(colors, index=df.index)\n",
"\n",
" plt.scatter(df.t1, df.t2, c=colors, **plot_kwds)\n",
" frame = plt.gca()\n",
" frame.axes.get_xaxis().set_visible(False)\n",
" frame.axes.get_yaxis().set_visible(False)\n",
" plt.title('Clusters found by {}'.format(str(algorithm.__name__)), fontsize=24)\n",
" return labels\n",
" \n",
"# do a quick test run on a small subset\n",
"cluster_labels = plot_clusters(subs2[:200, :200], hdbscan.HDBSCAN, (), {'min_cluster_size':5})"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment