pkerpedjiev/streaming_loader.ipynb

## streaming_loader.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-01-31T19:35:31.631090Z",
     "start_time": "2018-01-31T19:35:31.138300Z"
    },
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import cooler\n",
    "from cooler.io import ContactBinner, CoolerMerger\n",
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import bresenham"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-01-31T19:35:32.261309Z",
     "start_time": "2018-01-31T19:35:32.256140Z"
    },
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#matrix_file = '../98.out.gz'\n",
    "matrix_file = '../short_brh_10.gz'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-01-31T19:35:33.197169Z",
     "start_time": "2018-01-31T19:35:32.885505Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CM000663.2\t248946422\t0\t100000\t+\tCM000663.2\t248946422\t0\t100000\t99.9942\t87\t87\t197999\r\n",
      "CM000663.2\t248946422\t100000\t200000\t+\tCM000663.2\t248946422\t100000\t200000\t99.9942\t87\t87\t197999\r\n",
      "CM000663.2\t248946422\t200000\t300000\t+\tCM000663.2\t248946422\t200000\t300000\t99.9942\t87\t87\t197999\r\n",
      "CM000663.2\t248946422\t100000\t200000\t+\tCM000663.2\t248946422\t0\t100000\t98.4586\t51\t80\t27999\r\n",
      "CM000663.2\t248946422\t200000\t300000\t+\tCM000663.2\t248946422\t0\t100000\t98.4586\t51\t80\t27999\r\n",
      "CM000663.2\t248946422\t600000\t700000\t+\tCM000663.2\t248946422\t0\t100000\t98.0156\t46\t87\t5999\r\n",
      "CM000663.2\t248946422\t200000\t300000\t+\tCM000663.2\t248946422\t0\t100000\t98.9146\t77\t85\t32999\r\n",
      "CM000663.2\t248946422\t200000\t300000\t+\tCM000663.2\t248946422\t100000\t200000\t98.9146\t77\t85\t32999\r\n",
      "CM000663.2\t248946422\t223900000\t224000000\t+\tCM000663.2\t248946422\t0\t100000\t98.1158\t61\t86\t43999\r\n",
      "CM000663.2\t248946422\t223900000\t224000000\t+\tCM000663.2\t248946422\t100000\t200000\t98.1158\t61\t86\t43999\r\n"
     ]
    }
   ],
   "source": [
    "!gzcat {matrix_file} | head"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-01-31T19:34:41.547674Z",
     "start_time": "2018-01-31T19:33:01.642302Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "      17\r\n"
     ]
    }
   ],
   "source": [
    "!gzcat {matrix_file} | wc -l"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-01-31T19:35:34.909247Z",
     "start_time": "2018-01-31T19:35:34.896192Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>chrom1</th>\n",
       "      <th>chrom1_len</th>\n",
       "      <th>start1</th>\n",
       "      <th>end1</th>\n",
       "      <th>strand</th>\n",
       "      <th>chrom2</th>\n",
       "      <th>chrom2_len</th>\n",
       "      <th>start2</th>\n",
       "      <th>end2</th>\n",
       "      <th>identity</th>\n",
       "      <th>foo</th>\n",
       "      <th>bar</th>\n",
       "      <th>count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>CM000663.2</td>\n",
       "      <td>248946422</td>\n",
       "      <td>0</td>\n",
       "      <td>100000</td>\n",
       "      <td>+</td>\n",
       "      <td>CM000663.2</td>\n",
       "      <td>248946422</td>\n",
       "      <td>0</td>\n",
       "      <td>100000</td>\n",
       "      <td>99.9942</td>\n",
       "      <td>87</td>\n",
       "      <td>87</td>\n",
       "      <td>197999</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>CM000663.2</td>\n",
       "      <td>248946422</td>\n",
       "      <td>100000</td>\n",
       "      <td>200000</td>\n",
       "      <td>+</td>\n",
       "      <td>CM000663.2</td>\n",
       "      <td>248946422</td>\n",
       "      <td>100000</td>\n",
       "      <td>200000</td>\n",
       "      <td>99.9942</td>\n",
       "      <td>87</td>\n",
       "      <td>87</td>\n",
       "      <td>197999</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>CM000663.2</td>\n",
       "      <td>248946422</td>\n",
       "      <td>200000</td>\n",
       "      <td>300000</td>\n",
       "      <td>+</td>\n",
       "      <td>CM000663.2</td>\n",
       "      <td>248946422</td>\n",
       "      <td>200000</td>\n",
       "      <td>300000</td>\n",
       "      <td>99.9942</td>\n",
       "      <td>87</td>\n",
       "      <td>87</td>\n",
       "      <td>197999</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>CM000663.2</td>\n",
       "      <td>248946422</td>\n",
       "      <td>100000</td>\n",
       "      <td>200000</td>\n",
       "      <td>+</td>\n",
       "      <td>CM000663.2</td>\n",
       "      <td>248946422</td>\n",
       "      <td>0</td>\n",
       "      <td>100000</td>\n",
       "      <td>98.4586</td>\n",
       "      <td>51</td>\n",
       "      <td>80</td>\n",
       "      <td>27999</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>CM000663.2</td>\n",
       "      <td>248946422</td>\n",
       "      <td>200000</td>\n",
       "      <td>300000</td>\n",
       "      <td>+</td>\n",
       "      <td>CM000663.2</td>\n",
       "      <td>248946422</td>\n",
       "      <td>0</td>\n",
       "      <td>100000</td>\n",
       "      <td>98.4586</td>\n",
       "      <td>51</td>\n",
       "      <td>80</td>\n",
       "      <td>27999</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>CM000663.2</td>\n",
       "      <td>248946422</td>\n",
       "      <td>600000</td>\n",
       "      <td>700000</td>\n",
       "      <td>+</td>\n",
       "      <td>CM000663.2</td>\n",
       "      <td>248946422</td>\n",
       "      <td>0</td>\n",
       "      <td>100000</td>\n",
       "      <td>98.0156</td>\n",
       "      <td>46</td>\n",
       "      <td>87</td>\n",
       "      <td>5999</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>CM000663.2</td>\n",
       "      <td>248946422</td>\n",
       "      <td>200000</td>\n",
       "      <td>300000</td>\n",
       "      <td>+</td>\n",
       "      <td>CM000663.2</td>\n",
       "      <td>248946422</td>\n",
       "      <td>0</td>\n",
       "      <td>100000</td>\n",
       "      <td>98.9146</td>\n",
       "      <td>77</td>\n",
       "      <td>85</td>\n",
       "      <td>32999</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>CM000663.2</td>\n",
       "      <td>248946422</td>\n",
       "      <td>200000</td>\n",
       "      <td>300000</td>\n",
       "      <td>+</td>\n",
       "      <td>CM000663.2</td>\n",
       "      <td>248946422</td>\n",
       "      <td>100000</td>\n",
       "      <td>200000</td>\n",
       "      <td>98.9146</td>\n",
       "      <td>77</td>\n",
       "      <td>85</td>\n",
       "      <td>32999</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>CM000663.2</td>\n",
       "      <td>248946422</td>\n",
       "      <td>223900000</td>\n",
       "      <td>224000000</td>\n",
       "      <td>+</td>\n",
       "      <td>CM000663.2</td>\n",
       "      <td>248946422</td>\n",
       "      <td>0</td>\n",
       "      <td>100000</td>\n",
       "      <td>98.1158</td>\n",
       "      <td>61</td>\n",
       "      <td>86</td>\n",
       "      <td>43999</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>CM000663.2</td>\n",
       "      <td>248946422</td>\n",
       "      <td>223900000</td>\n",
       "      <td>224000000</td>\n",
       "      <td>+</td>\n",
       "      <td>CM000663.2</td>\n",
       "      <td>248946422</td>\n",
       "      <td>100000</td>\n",
       "      <td>200000</td>\n",
       "      <td>98.1158</td>\n",
       "      <td>61</td>\n",
       "      <td>86</td>\n",
       "      <td>43999</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       chrom1  chrom1_len     start1       end1 strand      chrom2  \\\n",
       "0  CM000663.2   248946422          0     100000      +  CM000663.2   \n",
       "1  CM000663.2   248946422     100000     200000      +  CM000663.2   \n",
       "2  CM000663.2   248946422     200000     300000      +  CM000663.2   \n",
       "3  CM000663.2   248946422     100000     200000      +  CM000663.2   \n",
       "4  CM000663.2   248946422     200000     300000      +  CM000663.2   \n",
       "5  CM000663.2   248946422     600000     700000      +  CM000663.2   \n",
       "6  CM000663.2   248946422     200000     300000      +  CM000663.2   \n",
       "7  CM000663.2   248946422     200000     300000      +  CM000663.2   \n",
       "8  CM000663.2   248946422  223900000  224000000      +  CM000663.2   \n",
       "9  CM000663.2   248946422  223900000  224000000      +  CM000663.2   \n",
       "\n",
       "   chrom2_len  start2    end2  identity  foo  bar   count  \n",
       "0   248946422       0  100000   99.9942   87   87  197999  \n",
       "1   248946422  100000  200000   99.9942   87   87  197999  \n",
       "2   248946422  200000  300000   99.9942   87   87  197999  \n",
       "3   248946422       0  100000   98.4586   51   80   27999  \n",
       "4   248946422       0  100000   98.4586   51   80   27999  \n",
       "5   248946422       0  100000   98.0156   46   87    5999  \n",
       "6   248946422       0  100000   98.9146   77   85   32999  \n",
       "7   248946422  100000  200000   98.9146   77   85   32999  \n",
       "8   248946422       0  100000   98.1158   61   86   43999  \n",
       "9   248946422  100000  200000   98.1158   61   86   43999  "
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fields = [('chrom1', object),\n",
    "          ('chrom1_len', int), \n",
    "          ('start1', int),\n",
    "          ('end1', int),\n",
    "          ('strand', object),\n",
    "          ('chrom2', object),\n",
    "          ('chrom2_len', int),\n",
    "          ('start2', int),\n",
    "          ('end2', int),\n",
    "          ('identity', float),\n",
    "          ('foo', int),\n",
    "          ('bar', int),\n",
    "          ('count', int)]\n",
    "\n",
    "reader = pd.read_csv(matrix_file, iterator=True, sep='\\t', \n",
    "            names=[f[0] for f in fields],\n",
    "            dtype=dict(fields))\n",
    "reader.read(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-01-31T19:36:36.833388Z",
     "start_time": "2018-01-31T19:36:36.523384Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "fields: [('chrom1', <class 'object'>), ('chrom1_len', <class 'int'>), ('start1', <class 'int'>), ('end1', <class 'int'>), ('strand', <class 'object'>), ('chrom2', <class 'object'>), ('chrom2_len', <class 'int'>), ('start2', <class 'int'>), ('end2', <class 'int'>), ('identity', <class 'float'>), ('foo', <class 'int'>), ('bar', <class 'int'>), ('count', <class 'int'>)]\n"
     ]
    }
   ],
   "source": [
    "from cooler._binning import GenomeSegmentation\n",
    "from collections import OrderedDict\n",
    "import bresenham as brh\n",
    "import functools as ft\n",
    "import tempfile\n",
    "import os\n",
    "\n",
    "\n",
    "# add field number to the list of fields\n",
    "'''\n",
    "fields_and_nums = [tuple(list(f[:2)] + [i]) \n",
    "                   for i, f in enumerate(fields)]\n",
    "'''\n",
    "print(\"fields:\", fields)\n",
    "\n",
    "\n",
    "def load_contact_matrix(inpath, outpath, bins, chunksize, binsize, count_as_float=False, \n",
    "                        read_options=None, sep='\\t', **kwargs):\n",
    "    \"\"\"\n",
    "    Load any flat contact matrix file into a cooler.\n",
    "    \n",
    "    Parameters\n",
    "    ----------\n",
    "    filepath : str\n",
    "        Path to the file\n",
    "    outpath : str\n",
    "        Cooler URI\n",
    "    bins : data frame\n",
    "        bin table\n",
    "    chunksize : int\n",
    "        Number of lines to process to generate each temp file\n",
    "    \"\"\"\n",
    "    if read_options is None:\n",
    "        read_options = {}\n",
    "        \n",
    "    print('dtypes', [np.dtype(f[1]) for f in fields])\n",
    "    \n",
    "    reader = pd.read_csv(\n",
    "        inpath, \n",
    "        iterator=True, \n",
    "        names=[f[0] for f in fields],\n",
    "        dtype=dict(fields),\n",
    "        sep=sep,\n",
    "        **read_options)\n",
    "    \n",
    "    reader.chunksize = chunksize\n",
    "    gs = GenomeSegmentation(chromsizes, bins)\n",
    "    bins =gs.bins.copy()\n",
    "    bins['chrom'] = bins['chrom'].astype(object)\n",
    "    bins['bin'] = bins.index\n",
    "    out_columns = ['bin1_id', 'bin2_id', 'count' ]\n",
    "    binsize = binsize\n",
    "    \n",
    "    temp_files = []\n",
    "    for df in reader:\n",
    "        # check for flips XXX - TODO: just flip everything\n",
    "        idmap = gs.idmap\n",
    "        chrom1_ids = df['chrom1'].apply(idmap.__getitem__)\n",
    "        chrom2_ids = df['chrom2'].apply(idmap.__getitem__)\n",
    "        assert np.all(chrom1_ids <= chrom2_ids)\n",
    "        \n",
    "           # print(\"df.head:\", df.head())\n",
    "        print(\"bins:\", bins.head())\n",
    "\n",
    "        # assign bin IDs from bin table\n",
    "        df = (df.merge(bins, \n",
    "                       left_on=['chrom1', 'start1', 'end1'], \n",
    "                       right_on=['chrom', 'start', 'end'])\n",
    "                .merge(bins, \n",
    "                       left_on=['chrom2', 'start2', 'end2'], \n",
    "                       right_on=['chrom', 'start', 'end'], \n",
    "                       suffixes=('1', '2'))\n",
    "                .rename(columns={'bin1': 'bin1_id', \n",
    "                                 'bin2': 'bin2_id'}))\n",
    "        \n",
    "        # ensure output is sorted\n",
    "        df = (df[out_columns]\n",
    "                .sort_values(['bin1_id', 'bin2_id']))\n",
    "        \n",
    "        tf = tempfile.NamedTemporaryFile(suffix='.cool')\n",
    "        temp_files.append(tf)\n",
    "\n",
    "        print('Creating', tf.name)\n",
    "        #pixels = StreamingLoader(bins, reader, chunksize=chunksize, binsize=binsize)\n",
    "        print(\"pixels\")\n",
    "        cooler.io.create(tf.name, bins, df)\n",
    "        print(\"flal\")\n",
    "\n",
    "\n",
    "    print('Merging...')\n",
    "    pixels = CoolerMerger([cooler.Cooler(tf.name) for tf in temp_files], chunksize)\n",
    "    cooler.io.create(outpath, bins, pixels, **kwargs)\n",
    "\n",
    "    #print('Deleting temp files...')\n",
    "    #for tf in temp_files:\n",
    "    #    os.remove(tf.name)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "binsize = 100000\n",
    "#chromsizes = cooler.util.fetch_chromsizes('mm9')\n",
    "#print(\"chromsizes\", chromsizes)\n",
    "chromsizes = pd.read_table('../chromInfo_chr1.txt', delimiter='\\t', \n",
    "                           names=['chromname', 'length'],\n",
    "                           dtype=dict([(\"chromname\", object), ('length', int)]),\n",
    "                           index_col='chromname')['length']\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {
    "ExecuteTime": {
     "start_time": "2018-01-31T19:36:37.085Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dtypes [dtype('O'), dtype('int64'), dtype('int64'), dtype('int64'), dtype('O'), dtype('O'), dtype('int64'), dtype('int64'), dtype('int64'), dtype('float64'), dtype('int64'), dtype('int64'), dtype('int64')]\n",
      "bins:         chrom   start     end  bin\n",
      "0  CM000663.2       0  100000    0\n",
      "1  CM000663.2  100000  200000    1\n",
      "2  CM000663.2  200000  300000    2\n",
      "3  CM000663.2  300000  400000    3\n",
      "4  CM000663.2  400000  500000    4\n",
      "Creating /var/folders/yb/3jwdnlf17gd1dcq65th786d00000gn/T/tmpgmq5f_ne.cool\n",
      "pixels\n",
      "creating\n",
      "flal\n",
      "Merging...\n",
      "creating\n"
     ]
    }
   ],
   "source": [
    "\n",
    "bins = cooler.binnify(chromsizes, binsize)\n",
    "load_contact_matrix(matrix_file, 'out.cool', bins, \n",
    "                    count_as_float=False, \n",
    "                    chunksize=50000000, binsize=binsize, sep='\\t',\n",
    "                    aggregate=\"max\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "a = pd.DataFrame({\"a\": [1,2]})\n",
    "a.apply(lambda x: pd.DataFrame({\"x\": [x['a']+1,x['a']+2]}), axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "gist": {
   "data": {
    "description": "streaming_loader.ipynb",
    "public": false
   },
   "id": ""
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.2"
  },
  "toc": {
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": "block",
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"%load_ext autoreload\n",
	"%autoreload 2"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 26,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2018-01-31T19:35:31.631090Z",
	"start_time": "2018-01-31T19:35:31.138300Z"
	},
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"import cooler\n",
	"from cooler.io import ContactBinner, CoolerMerger\n",
	"import pandas as pd\n",
	"import numpy as np"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"import bresenham"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2018-01-31T19:35:32.261309Z",
	"start_time": "2018-01-31T19:35:32.256140Z"
	},
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"#matrix_file = '../98.out.gz'\n",
	"matrix_file = '../short_brh_10.gz'"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2018-01-31T19:35:33.197169Z",
	"start_time": "2018-01-31T19:35:32.885505Z"
	}
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CM000663.2\t248946422\t0\t100000\t+\tCM000663.2\t248946422\t0\t100000\t99.9942\t87\t87\t197999\r\n",
	"CM000663.2\t248946422\t100000\t200000\t+\tCM000663.2\t248946422\t100000\t200000\t99.9942\t87\t87\t197999\r\n",
	"CM000663.2\t248946422\t200000\t300000\t+\tCM000663.2\t248946422\t200000\t300000\t99.9942\t87\t87\t197999\r\n",
	"CM000663.2\t248946422\t100000\t200000\t+\tCM000663.2\t248946422\t0\t100000\t98.4586\t51\t80\t27999\r\n",
	"CM000663.2\t248946422\t200000\t300000\t+\tCM000663.2\t248946422\t0\t100000\t98.4586\t51\t80\t27999\r\n",
	"CM000663.2\t248946422\t600000\t700000\t+\tCM000663.2\t248946422\t0\t100000\t98.0156\t46\t87\t5999\r\n",
	"CM000663.2\t248946422\t200000\t300000\t+\tCM000663.2\t248946422\t0\t100000\t98.9146\t77\t85\t32999\r\n",
	"CM000663.2\t248946422\t200000\t300000\t+\tCM000663.2\t248946422\t100000\t200000\t98.9146\t77\t85\t32999\r\n",
	"CM000663.2\t248946422\t223900000\t224000000\t+\tCM000663.2\t248946422\t0\t100000\t98.1158\t61\t86\t43999\r\n",
	"CM000663.2\t248946422\t223900000\t224000000\t+\tCM000663.2\t248946422\t100000\t200000\t98.1158\t61\t86\t43999\r\n"
	]
	}
	],
	"source": [
	"!gzcat {matrix_file} \| head"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2018-01-31T19:34:41.547674Z",
	"start_time": "2018-01-31T19:33:01.642302Z"
	}
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	" 17\r\n"
	]
	}
	],
	"source": [
	"!gzcat {matrix_file} \| wc -l"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2018-01-31T19:35:34.909247Z",
	"start_time": "2018-01-31T19:35:34.896192Z"
	}
	},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style>\n",
	" .dataframe thead tr:only-child th {\n",
	" text-align: right;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: left;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>chrom1</th>\n",
	" <th>chrom1_len</th>\n",
	" <th>start1</th>\n",
	" <th>end1</th>\n",
	" <th>strand</th>\n",
	" <th>chrom2</th>\n",
	" <th>chrom2_len</th>\n",
	" <th>start2</th>\n",
	" <th>end2</th>\n",
	" <th>identity</th>\n",
	" <th>foo</th>\n",
	" <th>bar</th>\n",
	" <th>count</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>CM000663.2</td>\n",
	" <td>248946422</td>\n",
	" <td>0</td>\n",
	" <td>100000</td>\n",
	" <td>+</td>\n",
	" <td>CM000663.2</td>\n",
	" <td>248946422</td>\n",
	" <td>0</td>\n",
	" <td>100000</td>\n",
	" <td>99.9942</td>\n",
	" <td>87</td>\n",
	" <td>87</td>\n",
	" <td>197999</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>CM000663.2</td>\n",
	" <td>248946422</td>\n",
	" <td>100000</td>\n",
	" <td>200000</td>\n",
	" <td>+</td>\n",
	" <td>CM000663.2</td>\n",
	" <td>248946422</td>\n",
	" <td>100000</td>\n",
	" <td>200000</td>\n",
	" <td>99.9942</td>\n",
	" <td>87</td>\n",
	" <td>87</td>\n",
	" <td>197999</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>CM000663.2</td>\n",
	" <td>248946422</td>\n",
	" <td>200000</td>\n",
	" <td>300000</td>\n",
	" <td>+</td>\n",
	" <td>CM000663.2</td>\n",
	" <td>248946422</td>\n",
	" <td>200000</td>\n",
	" <td>300000</td>\n",
	" <td>99.9942</td>\n",
	" <td>87</td>\n",
	" <td>87</td>\n",
	" <td>197999</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>CM000663.2</td>\n",
	" <td>248946422</td>\n",
	" <td>100000</td>\n",
	" <td>200000</td>\n",
	" <td>+</td>\n",
	" <td>CM000663.2</td>\n",
	" <td>248946422</td>\n",
	" <td>0</td>\n",
	" <td>100000</td>\n",
	" <td>98.4586</td>\n",
	" <td>51</td>\n",
	" <td>80</td>\n",
	" <td>27999</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>CM000663.2</td>\n",
	" <td>248946422</td>\n",
	" <td>200000</td>\n",
	" <td>300000</td>\n",
	" <td>+</td>\n",
	" <td>CM000663.2</td>\n",
	" <td>248946422</td>\n",
	" <td>0</td>\n",
	" <td>100000</td>\n",
	" <td>98.4586</td>\n",
	" <td>51</td>\n",
	" <td>80</td>\n",
	" <td>27999</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>5</th>\n",
	" <td>CM000663.2</td>\n",
	" <td>248946422</td>\n",
	" <td>600000</td>\n",
	" <td>700000</td>\n",
	" <td>+</td>\n",
	" <td>CM000663.2</td>\n",
	" <td>248946422</td>\n",
	" <td>0</td>\n",
	" <td>100000</td>\n",
	" <td>98.0156</td>\n",
	" <td>46</td>\n",
	" <td>87</td>\n",
	" <td>5999</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>6</th>\n",
	" <td>CM000663.2</td>\n",
	" <td>248946422</td>\n",
	" <td>200000</td>\n",
	" <td>300000</td>\n",
	" <td>+</td>\n",
	" <td>CM000663.2</td>\n",
	" <td>248946422</td>\n",
	" <td>0</td>\n",
	" <td>100000</td>\n",
	" <td>98.9146</td>\n",
	" <td>77</td>\n",
	" <td>85</td>\n",
	" <td>32999</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>7</th>\n",
	" <td>CM000663.2</td>\n",
	" <td>248946422</td>\n",
	" <td>200000</td>\n",
	" <td>300000</td>\n",
	" <td>+</td>\n",
	" <td>CM000663.2</td>\n",
	" <td>248946422</td>\n",
	" <td>100000</td>\n",
	" <td>200000</td>\n",
	" <td>98.9146</td>\n",
	" <td>77</td>\n",
	" <td>85</td>\n",
	" <td>32999</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>8</th>\n",
	" <td>CM000663.2</td>\n",
	" <td>248946422</td>\n",
	" <td>223900000</td>\n",
	" <td>224000000</td>\n",
	" <td>+</td>\n",
	" <td>CM000663.2</td>\n",
	" <td>248946422</td>\n",
	" <td>0</td>\n",
	" <td>100000</td>\n",
	" <td>98.1158</td>\n",
	" <td>61</td>\n",
	" <td>86</td>\n",
	" <td>43999</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>9</th>\n",
	" <td>CM000663.2</td>\n",
	" <td>248946422</td>\n",
	" <td>223900000</td>\n",
	" <td>224000000</td>\n",
	" <td>+</td>\n",
	" <td>CM000663.2</td>\n",
	" <td>248946422</td>\n",
	" <td>100000</td>\n",
	" <td>200000</td>\n",
	" <td>98.1158</td>\n",
	" <td>61</td>\n",
	" <td>86</td>\n",
	" <td>43999</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" chrom1 chrom1_len start1 end1 strand chrom2 \\\n",
	"0 CM000663.2 248946422 0 100000 + CM000663.2 \n",
	"1 CM000663.2 248946422 100000 200000 + CM000663.2 \n",
	"2 CM000663.2 248946422 200000 300000 + CM000663.2 \n",
	"3 CM000663.2 248946422 100000 200000 + CM000663.2 \n",
	"4 CM000663.2 248946422 200000 300000 + CM000663.2 \n",
	"5 CM000663.2 248946422 600000 700000 + CM000663.2 \n",
	"6 CM000663.2 248946422 200000 300000 + CM000663.2 \n",
	"7 CM000663.2 248946422 200000 300000 + CM000663.2 \n",
	"8 CM000663.2 248946422 223900000 224000000 + CM000663.2 \n",
	"9 CM000663.2 248946422 223900000 224000000 + CM000663.2 \n",
	"\n",
	" chrom2_len start2 end2 identity foo bar count \n",
	"0 248946422 0 100000 99.9942 87 87 197999 \n",
	"1 248946422 100000 200000 99.9942 87 87 197999 \n",
	"2 248946422 200000 300000 99.9942 87 87 197999 \n",
	"3 248946422 0 100000 98.4586 51 80 27999 \n",
	"4 248946422 0 100000 98.4586 51 80 27999 \n",
	"5 248946422 0 100000 98.0156 46 87 5999 \n",
	"6 248946422 0 100000 98.9146 77 85 32999 \n",
	"7 248946422 100000 200000 98.9146 77 85 32999 \n",
	"8 248946422 0 100000 98.1158 61 86 43999 \n",
	"9 248946422 100000 200000 98.1158 61 86 43999 "
	]
	},
	"execution_count": 7,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"fields = [('chrom1', object),\n",
	" ('chrom1_len', int), \n",
	" ('start1', int),\n",
	" ('end1', int),\n",
	" ('strand', object),\n",
	" ('chrom2', object),\n",
	" ('chrom2_len', int),\n",
	" ('start2', int),\n",
	" ('end2', int),\n",
	" ('identity', float),\n",
	" ('foo', int),\n",
	" ('bar', int),\n",
	" ('count', int)]\n",
	"\n",
	"reader = pd.read_csv(matrix_file, iterator=True, sep='\\t', \n",
	" names=[f[0] for f in fields],\n",
	" dtype=dict(fields))\n",
	"reader.read(10)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 54,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2018-01-31T19:36:36.833388Z",
	"start_time": "2018-01-31T19:36:36.523384Z"
	}
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"fields: [('chrom1', <class 'object'>), ('chrom1_len', <class 'int'>), ('start1', <class 'int'>), ('end1', <class 'int'>), ('strand', <class 'object'>), ('chrom2', <class 'object'>), ('chrom2_len', <class 'int'>), ('start2', <class 'int'>), ('end2', <class 'int'>), ('identity', <class 'float'>), ('foo', <class 'int'>), ('bar', <class 'int'>), ('count', <class 'int'>)]\n"
	]
	}
	],
	"source": [
	"from cooler._binning import GenomeSegmentation\n",
	"from collections import OrderedDict\n",
	"import bresenham as brh\n",
	"import functools as ft\n",
	"import tempfile\n",
	"import os\n",
	"\n",
	"\n",
	"# add field number to the list of fields\n",
	"'''\n",
	"fields_and_nums = [tuple(list(f[:2)] + [i]) \n",
	" for i, f in enumerate(fields)]\n",
	"'''\n",
	"print(\"fields:\", fields)\n",
	"\n",
	"\n",
	"def load_contact_matrix(inpath, outpath, bins, chunksize, binsize, count_as_float=False, \n",
	" read_options=None, sep='\\t', **kwargs):\n",
	" \"\"\"\n",
	" Load any flat contact matrix file into a cooler.\n",
	" \n",
	" Parameters\n",
	" ----------\n",
	" filepath : str\n",
	" Path to the file\n",
	" outpath : str\n",
	" Cooler URI\n",
	" bins : data frame\n",
	" bin table\n",
	" chunksize : int\n",
	" Number of lines to process to generate each temp file\n",
	" \"\"\"\n",
	" if read_options is None:\n",
	" read_options = {}\n",
	" \n",
	" print('dtypes', [np.dtype(f[1]) for f in fields])\n",
	" \n",
	" reader = pd.read_csv(\n",
	" inpath, \n",
	" iterator=True, \n",
	" names=[f[0] for f in fields],\n",
	" dtype=dict(fields),\n",
	" sep=sep,\n",
	" **read_options)\n",
	" \n",
	" reader.chunksize = chunksize\n",
	" gs = GenomeSegmentation(chromsizes, bins)\n",
	" bins =gs.bins.copy()\n",
	" bins['chrom'] = bins['chrom'].astype(object)\n",
	" bins['bin'] = bins.index\n",
	" out_columns = ['bin1_id', 'bin2_id', 'count' ]\n",
	" binsize = binsize\n",
	" \n",
	" temp_files = []\n",
	" for df in reader:\n",
	" # check for flips XXX - TODO: just flip everything\n",
	" idmap = gs.idmap\n",
	" chrom1_ids = df['chrom1'].apply(idmap.__getitem__)\n",
	" chrom2_ids = df['chrom2'].apply(idmap.__getitem__)\n",
	" assert np.all(chrom1_ids <= chrom2_ids)\n",
	" \n",
	" # print(\"df.head:\", df.head())\n",
	" print(\"bins:\", bins.head())\n",
	"\n",
	" # assign bin IDs from bin table\n",
	" df = (df.merge(bins, \n",
	" left_on=['chrom1', 'start1', 'end1'], \n",
	" right_on=['chrom', 'start', 'end'])\n",
	" .merge(bins, \n",
	" left_on=['chrom2', 'start2', 'end2'], \n",
	" right_on=['chrom', 'start', 'end'], \n",
	" suffixes=('1', '2'))\n",
	" .rename(columns={'bin1': 'bin1_id', \n",
	" 'bin2': 'bin2_id'}))\n",
	" \n",
	" # ensure output is sorted\n",
	" df = (df[out_columns]\n",
	" .sort_values(['bin1_id', 'bin2_id']))\n",
	" \n",
	" tf = tempfile.NamedTemporaryFile(suffix='.cool')\n",
	" temp_files.append(tf)\n",
	"\n",
	" print('Creating', tf.name)\n",
	" #pixels = StreamingLoader(bins, reader, chunksize=chunksize, binsize=binsize)\n",
	" print(\"pixels\")\n",
	" cooler.io.create(tf.name, bins, df)\n",
	" print(\"flal\")\n",
	"\n",
	"\n",
	" print('Merging...')\n",
	" pixels = CoolerMerger([cooler.Cooler(tf.name) for tf in temp_files], chunksize)\n",
	" cooler.io.create(outpath, bins, pixels, **kwargs)\n",
	"\n",
	" #print('Deleting temp files...')\n",
	" #for tf in temp_files:\n",
	" # os.remove(tf.name)\n",
	"\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 55,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"binsize = 100000\n",
	"#chromsizes = cooler.util.fetch_chromsizes('mm9')\n",
	"#print(\"chromsizes\", chromsizes)\n",
	"chromsizes = pd.read_table('../chromInfo_chr1.txt', delimiter='\\t', \n",
	" names=['chromname', 'length'],\n",
	" dtype=dict([(\"chromname\", object), ('length', int)]),\n",
	" index_col='chromname')['length']\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 56,
	"metadata": {
	"ExecuteTime": {
	"start_time": "2018-01-31T19:36:37.085Z"
	}
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"dtypes [dtype('O'), dtype('int64'), dtype('int64'), dtype('int64'), dtype('O'), dtype('O'), dtype('int64'), dtype('int64'), dtype('int64'), dtype('float64'), dtype('int64'), dtype('int64'), dtype('int64')]\n",
	"bins: chrom start end bin\n",
	"0 CM000663.2 0 100000 0\n",
	"1 CM000663.2 100000 200000 1\n",
	"2 CM000663.2 200000 300000 2\n",
	"3 CM000663.2 300000 400000 3\n",
	"4 CM000663.2 400000 500000 4\n",
	"Creating /var/folders/yb/3jwdnlf17gd1dcq65th786d00000gn/T/tmpgmq5f_ne.cool\n",
	"pixels\n",
	"creating\n",
	"flal\n",
	"Merging...\n",
	"creating\n"
	]
	}
	],
	"source": [
	"\n",
	"bins = cooler.binnify(chromsizes, binsize)\n",
	"load_contact_matrix(matrix_file, 'out.cool', bins, \n",
	" count_as_float=False, \n",
	" chunksize=50000000, binsize=binsize, sep='\\t',\n",
	" aggregate=\"max\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"a = pd.DataFrame({\"a\": [1,2]})\n",
	"a.apply(lambda x: pd.DataFrame({\"x\": [x['a']+1,x['a']+2]}), axis=1)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"gist": {
	"data": {
	"description": "streaming_loader.ipynb",
	"public": false
	},
	"id": ""
	},
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.2"
	},
	"toc": {
	"nav_menu": {},
	"number_sections": true,
	"sideBar": true,
	"skip_h1_title": false,
	"toc_cell": false,
	"toc_position": {},
	"toc_section_display": "block",
	"toc_window_display": false
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}