Skip to content

Instantly share code, notes, and snippets.

@pkerpedjiev
Last active April 23, 2018 18:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pkerpedjiev/188b1676496e0264c8fe0fa01572f7f6 to your computer and use it in GitHub Desktop.
Save pkerpedjiev/188b1676496e0264c8fe0fa01572f7f6 to your computer and use it in GitHub Desktop.
streaming_loader
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"ExecuteTime": {
"end_time": "2018-01-31T19:35:31.631090Z",
"start_time": "2018-01-31T19:35:31.138300Z"
},
"collapsed": true
},
"outputs": [],
"source": [
"import cooler\n",
"from cooler.io import ContactBinner, CoolerMerger\n",
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import bresenham"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"ExecuteTime": {
"end_time": "2018-01-31T19:35:32.261309Z",
"start_time": "2018-01-31T19:35:32.256140Z"
},
"collapsed": true
},
"outputs": [],
"source": [
"#matrix_file = '../98.out.gz'\n",
"matrix_file = '../short_brh_10.gz'"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"ExecuteTime": {
"end_time": "2018-01-31T19:35:33.197169Z",
"start_time": "2018-01-31T19:35:32.885505Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CM000663.2\t248946422\t0\t100000\t+\tCM000663.2\t248946422\t0\t100000\t99.9942\t87\t87\t197999\r\n",
"CM000663.2\t248946422\t100000\t200000\t+\tCM000663.2\t248946422\t100000\t200000\t99.9942\t87\t87\t197999\r\n",
"CM000663.2\t248946422\t200000\t300000\t+\tCM000663.2\t248946422\t200000\t300000\t99.9942\t87\t87\t197999\r\n",
"CM000663.2\t248946422\t100000\t200000\t+\tCM000663.2\t248946422\t0\t100000\t98.4586\t51\t80\t27999\r\n",
"CM000663.2\t248946422\t200000\t300000\t+\tCM000663.2\t248946422\t0\t100000\t98.4586\t51\t80\t27999\r\n",
"CM000663.2\t248946422\t600000\t700000\t+\tCM000663.2\t248946422\t0\t100000\t98.0156\t46\t87\t5999\r\n",
"CM000663.2\t248946422\t200000\t300000\t+\tCM000663.2\t248946422\t0\t100000\t98.9146\t77\t85\t32999\r\n",
"CM000663.2\t248946422\t200000\t300000\t+\tCM000663.2\t248946422\t100000\t200000\t98.9146\t77\t85\t32999\r\n",
"CM000663.2\t248946422\t223900000\t224000000\t+\tCM000663.2\t248946422\t0\t100000\t98.1158\t61\t86\t43999\r\n",
"CM000663.2\t248946422\t223900000\t224000000\t+\tCM000663.2\t248946422\t100000\t200000\t98.1158\t61\t86\t43999\r\n"
]
}
],
"source": [
"!gzcat {matrix_file} | head"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"ExecuteTime": {
"end_time": "2018-01-31T19:34:41.547674Z",
"start_time": "2018-01-31T19:33:01.642302Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 17\r\n"
]
}
],
"source": [
"!gzcat {matrix_file} | wc -l"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"ExecuteTime": {
"end_time": "2018-01-31T19:35:34.909247Z",
"start_time": "2018-01-31T19:35:34.896192Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>chrom1</th>\n",
" <th>chrom1_len</th>\n",
" <th>start1</th>\n",
" <th>end1</th>\n",
" <th>strand</th>\n",
" <th>chrom2</th>\n",
" <th>chrom2_len</th>\n",
" <th>start2</th>\n",
" <th>end2</th>\n",
" <th>identity</th>\n",
" <th>foo</th>\n",
" <th>bar</th>\n",
" <th>count</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>CM000663.2</td>\n",
" <td>248946422</td>\n",
" <td>0</td>\n",
" <td>100000</td>\n",
" <td>+</td>\n",
" <td>CM000663.2</td>\n",
" <td>248946422</td>\n",
" <td>0</td>\n",
" <td>100000</td>\n",
" <td>99.9942</td>\n",
" <td>87</td>\n",
" <td>87</td>\n",
" <td>197999</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>CM000663.2</td>\n",
" <td>248946422</td>\n",
" <td>100000</td>\n",
" <td>200000</td>\n",
" <td>+</td>\n",
" <td>CM000663.2</td>\n",
" <td>248946422</td>\n",
" <td>100000</td>\n",
" <td>200000</td>\n",
" <td>99.9942</td>\n",
" <td>87</td>\n",
" <td>87</td>\n",
" <td>197999</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>CM000663.2</td>\n",
" <td>248946422</td>\n",
" <td>200000</td>\n",
" <td>300000</td>\n",
" <td>+</td>\n",
" <td>CM000663.2</td>\n",
" <td>248946422</td>\n",
" <td>200000</td>\n",
" <td>300000</td>\n",
" <td>99.9942</td>\n",
" <td>87</td>\n",
" <td>87</td>\n",
" <td>197999</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>CM000663.2</td>\n",
" <td>248946422</td>\n",
" <td>100000</td>\n",
" <td>200000</td>\n",
" <td>+</td>\n",
" <td>CM000663.2</td>\n",
" <td>248946422</td>\n",
" <td>0</td>\n",
" <td>100000</td>\n",
" <td>98.4586</td>\n",
" <td>51</td>\n",
" <td>80</td>\n",
" <td>27999</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>CM000663.2</td>\n",
" <td>248946422</td>\n",
" <td>200000</td>\n",
" <td>300000</td>\n",
" <td>+</td>\n",
" <td>CM000663.2</td>\n",
" <td>248946422</td>\n",
" <td>0</td>\n",
" <td>100000</td>\n",
" <td>98.4586</td>\n",
" <td>51</td>\n",
" <td>80</td>\n",
" <td>27999</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>CM000663.2</td>\n",
" <td>248946422</td>\n",
" <td>600000</td>\n",
" <td>700000</td>\n",
" <td>+</td>\n",
" <td>CM000663.2</td>\n",
" <td>248946422</td>\n",
" <td>0</td>\n",
" <td>100000</td>\n",
" <td>98.0156</td>\n",
" <td>46</td>\n",
" <td>87</td>\n",
" <td>5999</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>CM000663.2</td>\n",
" <td>248946422</td>\n",
" <td>200000</td>\n",
" <td>300000</td>\n",
" <td>+</td>\n",
" <td>CM000663.2</td>\n",
" <td>248946422</td>\n",
" <td>0</td>\n",
" <td>100000</td>\n",
" <td>98.9146</td>\n",
" <td>77</td>\n",
" <td>85</td>\n",
" <td>32999</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>CM000663.2</td>\n",
" <td>248946422</td>\n",
" <td>200000</td>\n",
" <td>300000</td>\n",
" <td>+</td>\n",
" <td>CM000663.2</td>\n",
" <td>248946422</td>\n",
" <td>100000</td>\n",
" <td>200000</td>\n",
" <td>98.9146</td>\n",
" <td>77</td>\n",
" <td>85</td>\n",
" <td>32999</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>CM000663.2</td>\n",
" <td>248946422</td>\n",
" <td>223900000</td>\n",
" <td>224000000</td>\n",
" <td>+</td>\n",
" <td>CM000663.2</td>\n",
" <td>248946422</td>\n",
" <td>0</td>\n",
" <td>100000</td>\n",
" <td>98.1158</td>\n",
" <td>61</td>\n",
" <td>86</td>\n",
" <td>43999</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>CM000663.2</td>\n",
" <td>248946422</td>\n",
" <td>223900000</td>\n",
" <td>224000000</td>\n",
" <td>+</td>\n",
" <td>CM000663.2</td>\n",
" <td>248946422</td>\n",
" <td>100000</td>\n",
" <td>200000</td>\n",
" <td>98.1158</td>\n",
" <td>61</td>\n",
" <td>86</td>\n",
" <td>43999</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" chrom1 chrom1_len start1 end1 strand chrom2 \\\n",
"0 CM000663.2 248946422 0 100000 + CM000663.2 \n",
"1 CM000663.2 248946422 100000 200000 + CM000663.2 \n",
"2 CM000663.2 248946422 200000 300000 + CM000663.2 \n",
"3 CM000663.2 248946422 100000 200000 + CM000663.2 \n",
"4 CM000663.2 248946422 200000 300000 + CM000663.2 \n",
"5 CM000663.2 248946422 600000 700000 + CM000663.2 \n",
"6 CM000663.2 248946422 200000 300000 + CM000663.2 \n",
"7 CM000663.2 248946422 200000 300000 + CM000663.2 \n",
"8 CM000663.2 248946422 223900000 224000000 + CM000663.2 \n",
"9 CM000663.2 248946422 223900000 224000000 + CM000663.2 \n",
"\n",
" chrom2_len start2 end2 identity foo bar count \n",
"0 248946422 0 100000 99.9942 87 87 197999 \n",
"1 248946422 100000 200000 99.9942 87 87 197999 \n",
"2 248946422 200000 300000 99.9942 87 87 197999 \n",
"3 248946422 0 100000 98.4586 51 80 27999 \n",
"4 248946422 0 100000 98.4586 51 80 27999 \n",
"5 248946422 0 100000 98.0156 46 87 5999 \n",
"6 248946422 0 100000 98.9146 77 85 32999 \n",
"7 248946422 100000 200000 98.9146 77 85 32999 \n",
"8 248946422 0 100000 98.1158 61 86 43999 \n",
"9 248946422 100000 200000 98.1158 61 86 43999 "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"fields = [('chrom1', object),\n",
" ('chrom1_len', int), \n",
" ('start1', int),\n",
" ('end1', int),\n",
" ('strand', object),\n",
" ('chrom2', object),\n",
" ('chrom2_len', int),\n",
" ('start2', int),\n",
" ('end2', int),\n",
" ('identity', float),\n",
" ('foo', int),\n",
" ('bar', int),\n",
" ('count', int)]\n",
"\n",
"reader = pd.read_csv(matrix_file, iterator=True, sep='\\t', \n",
" names=[f[0] for f in fields],\n",
" dtype=dict(fields))\n",
"reader.read(10)"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {
"ExecuteTime": {
"end_time": "2018-01-31T19:36:36.833388Z",
"start_time": "2018-01-31T19:36:36.523384Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"fields: [('chrom1', <class 'object'>), ('chrom1_len', <class 'int'>), ('start1', <class 'int'>), ('end1', <class 'int'>), ('strand', <class 'object'>), ('chrom2', <class 'object'>), ('chrom2_len', <class 'int'>), ('start2', <class 'int'>), ('end2', <class 'int'>), ('identity', <class 'float'>), ('foo', <class 'int'>), ('bar', <class 'int'>), ('count', <class 'int'>)]\n"
]
}
],
"source": [
"from cooler._binning import GenomeSegmentation\n",
"from collections import OrderedDict\n",
"import bresenham as brh\n",
"import functools as ft\n",
"import tempfile\n",
"import os\n",
"\n",
"\n",
"# add field number to the list of fields\n",
"'''\n",
"fields_and_nums = [tuple(list(f[:2)] + [i]) \n",
" for i, f in enumerate(fields)]\n",
"'''\n",
"print(\"fields:\", fields)\n",
"\n",
"\n",
"def load_contact_matrix(inpath, outpath, bins, chunksize, binsize, count_as_float=False, \n",
" read_options=None, sep='\\t', **kwargs):\n",
" \"\"\"\n",
" Load any flat contact matrix file into a cooler.\n",
" \n",
" Parameters\n",
" ----------\n",
" filepath : str\n",
" Path to the file\n",
" outpath : str\n",
" Cooler URI\n",
" bins : data frame\n",
" bin table\n",
" chunksize : int\n",
" Number of lines to process to generate each temp file\n",
" \"\"\"\n",
" if read_options is None:\n",
" read_options = {}\n",
" \n",
" print('dtypes', [np.dtype(f[1]) for f in fields])\n",
" \n",
" reader = pd.read_csv(\n",
" inpath, \n",
" iterator=True, \n",
" names=[f[0] for f in fields],\n",
" dtype=dict(fields),\n",
" sep=sep,\n",
" **read_options)\n",
" \n",
" reader.chunksize = chunksize\n",
" gs = GenomeSegmentation(chromsizes, bins)\n",
" bins =gs.bins.copy()\n",
" bins['chrom'] = bins['chrom'].astype(object)\n",
" bins['bin'] = bins.index\n",
" out_columns = ['bin1_id', 'bin2_id', 'count' ]\n",
" binsize = binsize\n",
" \n",
" temp_files = []\n",
" for df in reader:\n",
" # check for flips XXX - TODO: just flip everything\n",
" idmap = gs.idmap\n",
" chrom1_ids = df['chrom1'].apply(idmap.__getitem__)\n",
" chrom2_ids = df['chrom2'].apply(idmap.__getitem__)\n",
" assert np.all(chrom1_ids <= chrom2_ids)\n",
" \n",
" # print(\"df.head:\", df.head())\n",
" print(\"bins:\", bins.head())\n",
"\n",
" # assign bin IDs from bin table\n",
" df = (df.merge(bins, \n",
" left_on=['chrom1', 'start1', 'end1'], \n",
" right_on=['chrom', 'start', 'end'])\n",
" .merge(bins, \n",
" left_on=['chrom2', 'start2', 'end2'], \n",
" right_on=['chrom', 'start', 'end'], \n",
" suffixes=('1', '2'))\n",
" .rename(columns={'bin1': 'bin1_id', \n",
" 'bin2': 'bin2_id'}))\n",
" \n",
" # ensure output is sorted\n",
" df = (df[out_columns]\n",
" .sort_values(['bin1_id', 'bin2_id']))\n",
" \n",
" tf = tempfile.NamedTemporaryFile(suffix='.cool')\n",
" temp_files.append(tf)\n",
"\n",
" print('Creating', tf.name)\n",
" #pixels = StreamingLoader(bins, reader, chunksize=chunksize, binsize=binsize)\n",
" print(\"pixels\")\n",
" cooler.io.create(tf.name, bins, df)\n",
" print(\"flal\")\n",
"\n",
"\n",
" print('Merging...')\n",
" pixels = CoolerMerger([cooler.Cooler(tf.name) for tf in temp_files], chunksize)\n",
" cooler.io.create(outpath, bins, pixels, **kwargs)\n",
"\n",
" #print('Deleting temp files...')\n",
" #for tf in temp_files:\n",
" # os.remove(tf.name)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"binsize = 100000\n",
"#chromsizes = cooler.util.fetch_chromsizes('mm9')\n",
"#print(\"chromsizes\", chromsizes)\n",
"chromsizes = pd.read_table('../chromInfo_chr1.txt', delimiter='\\t', \n",
" names=['chromname', 'length'],\n",
" dtype=dict([(\"chromname\", object), ('length', int)]),\n",
" index_col='chromname')['length']\n"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {
"ExecuteTime": {
"start_time": "2018-01-31T19:36:37.085Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"dtypes [dtype('O'), dtype('int64'), dtype('int64'), dtype('int64'), dtype('O'), dtype('O'), dtype('int64'), dtype('int64'), dtype('int64'), dtype('float64'), dtype('int64'), dtype('int64'), dtype('int64')]\n",
"bins: chrom start end bin\n",
"0 CM000663.2 0 100000 0\n",
"1 CM000663.2 100000 200000 1\n",
"2 CM000663.2 200000 300000 2\n",
"3 CM000663.2 300000 400000 3\n",
"4 CM000663.2 400000 500000 4\n",
"Creating /var/folders/yb/3jwdnlf17gd1dcq65th786d00000gn/T/tmpgmq5f_ne.cool\n",
"pixels\n",
"creating\n",
"flal\n",
"Merging...\n",
"creating\n"
]
}
],
"source": [
"\n",
"bins = cooler.binnify(chromsizes, binsize)\n",
"load_contact_matrix(matrix_file, 'out.cool', bins, \n",
" count_as_float=False, \n",
" chunksize=50000000, binsize=binsize, sep='\\t',\n",
" aggregate=\"max\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"a = pd.DataFrame({\"a\": [1,2]})\n",
"a.apply(lambda x: pd.DataFrame({\"x\": [x['a']+1,x['a']+2]}), axis=1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"gist": {
"data": {
"description": "streaming_loader.ipynb",
"public": false
},
"id": ""
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.2"
},
"toc": {
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"toc_cell": false,
"toc_position": {},
"toc_section_display": "block",
"toc_window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment