Skip to content

Instantly share code, notes, and snippets.

@kfsone
Created April 23, 2024 20:48
Show Gist options
  • Save kfsone/dcb0d7811570e40e73136a14c23bf128 to your computer and use it in GitHub Desktop.
Save kfsone/dcb0d7811570e40e73136a14c23bf128 to your computer and use it in GitHub Desktop.
Faster ways to count lines in a file in python
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "95efb8da-aafd-42de-a298-d599a0da47ec",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Volume in drive C has no label.\n",
" Volume Serial Number is A8FE-4354\n",
"\n",
" Directory of C:\\Users\\oliver.smith\\source\\github.com\\kfsone\\Trade-Dangerous\\tradedangerous\\tmp\n",
"\n",
"04/22/2024 12:02 AM 90,447,209 spansh.prices\n",
" 1 File(s) 90,447,209 bytes\n",
" 0 Dir(s) 1,414,821,974,016 bytes free\n"
]
}
],
"source": [
"from pathlib import Path\n",
"import mmap\n",
"\n",
"list_path = Path(\"tmp/spansh.prices\")\n",
"!dir tmp\\spansh.prices\n",
"\n",
"# Current implementation that chunks out the file in 64k blocks\n",
"def blocks1(f, size=65536):\n",
" while True:\n",
" b = f.read(size)\n",
" if not b:\n",
" break\n",
" yield b\n",
"\n",
"# imlementation that uses a fixed-size buffer to readinto\n",
"def blocks2(f, size=64*1024):\n",
" buf = bytearray(size)\n",
" reader = f.readinto\n",
" read = reader(buf)\n",
" while read:\n",
" yield buf[:read]\n",
" read = reader(buf)\n",
"\n",
"# Current Approach: open the file naively and count newline strings\n",
"def counting1(size=65536):\n",
" total = 0\n",
" with list_path.open(\"r\") as f:\n",
" total += sum(bl.count(\"\\n\") for bl in blocks1(f, size=size))\n",
" return total\n",
"\n",
"# Alternate: open the file in binary and count bytes\n",
"def counting2(size=65536):\n",
" total = 0\n",
" with list_path.open(\"rb\") as f:\n",
" total += sum(bl.count(b'\\n') for bl in blocks1(f, size=size))\n",
" return total\n",
"\n",
"# Alternate: use binary on the fixed-buffer blocks method\n",
"def counting3(size=65536):\n",
" with list_path.open(\"rb\") as f:\n",
" return sum(bl.count(b'\\n') for bl in blocks2(f, size=size))\n",
"\n",
"# Alternate: mmap the file and count directly off the data\n",
"# (and encounter python shenanigans)\n",
"def counting4():\n",
" with list_path.open(\"rb\") as f:\n",
" with mmap.mmap(f.fileno(), length=0, access=mmap.ACCESS_READ) as mm:\n",
" return mm.read().count(b'\\n')\n",
"\n",
"# Alternate: bypass *some* of python's shenanigans using mmap\n",
"# (but not all)\n",
"def counting5():\n",
" with list_path.open(\"rb\") as f:\n",
" with mmap.mmap(f.fileno(), length=0, access=mmap.ACCESS_READ) as mm:\n",
" return mm[:].count(b'\\n')\n",
"\n",
"# Alternate: do the counting directly off the buffer without yielding,\n",
"# and use a micro-faster path for full buffer reads.\n",
"def counting6(size=64*1024):\n",
" buf = bytearray(size)\n",
" counter = buf.count\n",
" total = 0\n",
" with list_path.open(\"rb\") as f:\n",
" reader = f.readinto\n",
" # For a large file, we're oging to see a lot of full\n",
" # reads, so we can do less work and just call count\n",
" # directly on the buf object. For the last one\n",
" # we need to use a slice of the buffer to count only\n",
" # from the bytes we actually read.\n",
" while (read := reader(buf)) == size:\n",
" total += counter(b'\\n')\n",
" else:\n",
" total += buf[:read].count(b'\\n')\n",
" return total\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "ae8a7f4d-02d0-428d-af76-65716d135493",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1, 261 ms ± 5.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n",
"2, 49.3 ms ± 805 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n",
"3, 50.6 ms ± 1.32 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n",
"4, 80.9 ms ± 1.67 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n",
"5, 81.3 ms ± 1.42 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n",
"6, 46.8 ms ± 714 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n",
"---\n",
"6, 128k, 44.7 ms ± 893 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n",
"6, 256k, 43.9 ms ± 764 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
]
}
],
"source": [
"print(\"1, \", end='')\n",
"%timeit counting1()\n",
"print(\"2, \", end='')\n",
"%timeit counting2()\n",
"print(\"3, \", end='')\n",
"%timeit counting3()\n",
"print(\"4, \", end='')\n",
"%timeit counting4()\n",
"print(\"5, \", end='')\n",
"%timeit counting5()\n",
"print(\"6, \", end='')\n",
"%timeit counting6()\n",
"print(\"---\")\n",
"print(\"6, 128k, \", end='')\n",
"%timeit counting6(128*1024) # 128kb at a time\n",
"print(\"6, 256k, \", end='')\n",
"%timeit counting6(256*1024) # 128kb at a time\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "ef0fff5d-80aa-45f0-b3b5-65fe53c5d28f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"4096\n",
"1 324 ms ± 16.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n",
"2, 82.9 ms ± 954 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n",
"3, 84.8 ms ± 886 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n",
"6, 78.8 ms ± 1.71 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n",
"8192\n",
"1 317 ms ± 12.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n",
"2, 77.4 ms ± 1.02 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n",
"3, 82.9 ms ± 1.06 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n",
"6, 76.1 ms ± 1.68 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n",
"16384\n",
"1 291 ms ± 5.88 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n",
"2, 61.2 ms ± 1.54 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n",
"3, 63.5 ms ± 1.69 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n",
"6, 59.8 ms ± 1.82 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n",
"65536\n",
"1 260 ms ± 6.03 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n",
"2, 49.1 ms ± 1.96 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n",
"3, 50.3 ms ± 2.1 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n",
"6, 48.3 ms ± 1.13 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n",
"262144\n",
"1 300 ms ± 6.12 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n",
"2, 45.7 ms ± 959 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n",
"3, 49.2 ms ± 786 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n",
"6, 45.3 ms ± 2.11 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n",
"524288\n",
"1 341 ms ± 28.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n",
"2, 55.1 ms ± 4.17 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n",
"3, 49.8 ms ± 1.42 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n",
"6, 45 ms ± 944 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n",
"1048576\n",
"1 349 ms ± 15.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n",
"2, 64.9 ms ± 892 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n",
"3, 66.1 ms ± 1.22 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n",
"6, 45.2 ms ± 845 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n",
"4194304\n",
"1 380 ms ± 19.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n",
"2, 66.7 ms ± 1.29 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n",
"3, 69.8 ms ± 4.67 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n",
"6, 47.5 ms ± 955 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
]
}
],
"source": [
"for size in (4096, 8192, 16384, 65536, 256*1024, 512 * 1024, 1024*1024, 4*1024*1024):\n",
" print(size)\n",
" print(\"1 \", end='')\n",
" %timeit counting1(size)\n",
" print(\"2, \", end='')\n",
" %timeit counting2(size)\n",
" print(\"3, \", end='')\n",
" %timeit counting3(size)\n",
" print(\"6, \", end='')\n",
" %timeit counting6(size)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0228a688-1945-405f-8aad-cc6c580cef0a",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment