Skip to content

Instantly share code, notes, and snippets.

@alendit
Created November 19, 2018 15:47
Show Gist options
  • Save alendit/32ebfbc1e5a4c2ff3190ed8b77e0a990 to your computer and use it in GitHub Desktop.
Save alendit/32ebfbc1e5a4c2ff3190ed8b77e0a990 to your computer and use it in GitHub Desktop.
Build 1kk (one million) strings of 10 characters each using fletcher's Numba string-array builder.
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"from fletcher._numba_compat import NumbaString, NumbaStringArray\n",
"from fletcher._algorithms import _startswith\n",
"import fletcher as fr\n",
"import numba\n",
"import pyarrow as pa"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"from random import choice\n",
"from string import ascii_letters"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"char_arr = np.zeros((10 ** 7,), dtype=np.uint8)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"# Fill the buffer with random ASCII-letter byte codes in one vectorized draw\n",
"# instead of assigning element by element in a Python-level loop.\n",
"letter_codes = np.frombuffer(ascii_letters.encode('ascii'), dtype=np.uint8)\n",
"# Slice assignment writes into the existing array, so its dtype and identity are kept.\n",
"char_arr[:] = np.random.choice(letter_codes, size=char_arr.shape)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([100, 109, 103, 104, 81, 70, 118, 78, 71, 105], dtype=uint8)"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"char_arr[10:20]"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"@numba.njit(nogil=True)\n",
"def build_strs(arr):\n",
"    \"\"\"Pack the leading bytes of ``arr`` into fixed-width strings.\n",
"\n",
"    Every consecutive run of 10 bytes becomes one string, 10 ** 6 strings\n",
"    in total, so ``arr`` must hold at least 10 ** 7 bytes.\n",
"\n",
"    Returns a ``NumbaStringArray`` constructed from the builder's\n",
"    ``missing``, ``offsets`` and ``data`` attributes.\n",
"    \"\"\"\n",
"    n_strings = 10 ** 6\n",
"    str_len = 10\n",
"    # NOTE(review): the two arguments look like string and byte capacities;\n",
"    # confirm against fletcher's NumbaStringArrayBuilder.\n",
"    sb = fr._numba_compat.NumbaStringArrayBuilder(n_strings, n_strings * str_len)\n",
"    # Walk all n_strings * str_len bytes. The earlier bound of 10 ** 6\n",
"    # stopped after 10 ** 5 strings, a tenth of the intended output.\n",
"    for start in range(0, n_strings * str_len, str_len):\n",
"        for k in range(str_len):\n",
"            sb.put_byte(arr[start + k])\n",
"        sb.finish_string()\n",
"    sb.finish()\n",
"    return NumbaStringArray(sb.missing, sb.offsets, sb.data, 0)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"39.1 ms ± 211 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
]
}
],
"source": [
"%timeit build_strs(char_arr)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"s_arr = build_strs(char_arr)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([100, 109, 103, 104, 81, 70, 118, 78, 71, 105], dtype=int32)"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"s_arr.decode(1)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment