-
-
Save kfsone/dcb0d7811570e40e73136a14c23bf128 to your computer and use it in GitHub Desktop.
Faster ways to count lines in a file in python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "95efb8da-aafd-42de-a298-d599a0da47ec", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
" Volume in drive C has no label.\n", | |
" Volume Serial Number is A8FE-4354\n", | |
"\n", | |
" Directory of C:\\Users\\oliver.smith\\source\\github.com\\kfsone\\Trade-Dangerous\\tradedangerous\\tmp\n", | |
"\n", | |
"04/22/2024 12:02 AM 90,447,209 spansh.prices\n", | |
" 1 File(s) 90,447,209 bytes\n", | |
" 0 Dir(s) 1,414,821,974,016 bytes free\n" | |
] | |
} | |
], | |
"source": [ | |
"from pathlib import Path\n", | |
"import mmap\n", | |
"\n", | |
"list_path = Path(\"tmp/spansh.prices\")\n", | |
"!dir tmp\\spansh.prices\n", | |
"\n", | |
"# Current implementation that chunks out the file in 64k blocks\n", | |
"def blocks1(f, size=65536):\n", | |
" while True:\n", | |
" b = f.read(size)\n", | |
" if not b:\n", | |
" break\n", | |
" yield b\n", | |
"\n", | |
"# imlementation that uses a fixed-size buffer to readinto\n", | |
"def blocks2(f, size=64*1024):\n", | |
" buf = bytearray(size)\n", | |
" reader = f.readinto\n", | |
" read = reader(buf)\n", | |
" while read:\n", | |
" yield buf[:read]\n", | |
" read = reader(buf)\n", | |
"\n", | |
"# Current Approach: open the file naively and count newline strings\n", | |
"def counting1(size=65536):\n", | |
" total = 0\n", | |
" with list_path.open(\"r\") as f:\n", | |
" total += sum(bl.count(\"\\n\") for bl in blocks1(f, size=size))\n", | |
" return total\n", | |
"\n", | |
"# Alternate: open the file in binary and count bytes\n", | |
"def counting2(size=65536):\n", | |
" total = 0\n", | |
" with list_path.open(\"rb\") as f:\n", | |
" total += sum(bl.count(b'\\n') for bl in blocks1(f, size=size))\n", | |
" return total\n", | |
"\n", | |
"# Alternate: use binary on the fixed-buffer blocks method\n", | |
"def counting3(size=65536):\n", | |
" with list_path.open(\"rb\") as f:\n", | |
" return sum(bl.count(b'\\n') for bl in blocks2(f, size=size))\n", | |
"\n", | |
"# Alternate: mmap the file and count directly off the data\n", | |
"# (and encounter python shenanigans)\n", | |
"def counting4():\n", | |
" with list_path.open(\"rb\") as f:\n", | |
" with mmap.mmap(f.fileno(), length=0, access=mmap.ACCESS_READ) as mm:\n", | |
" return mm.read().count(b'\\n')\n", | |
"\n", | |
"# Alternate: bypass *some* of python's shenanigans using mmap\n", | |
"# (but not all)\n", | |
"def counting5():\n", | |
" with list_path.open(\"rb\") as f:\n", | |
" with mmap.mmap(f.fileno(), length=0, access=mmap.ACCESS_READ) as mm:\n", | |
" return mm[:].count(b'\\n')\n", | |
"\n", | |
"# Alternate: do the counting directly off the buffer without yielding,\n", | |
"# and use a micro-faster path for full buffer reads.\n", | |
"def counting6(size=64*1024):\n", | |
" buf = bytearray(size)\n", | |
" counter = buf.count\n", | |
" total = 0\n", | |
" with list_path.open(\"rb\") as f:\n", | |
" reader = f.readinto\n", | |
" # For a large file, we're oging to see a lot of full\n", | |
" # reads, so we can do less work and just call count\n", | |
" # directly on the buf object. For the last one\n", | |
" # we need to use a slice of the buffer to count only\n", | |
" # from the bytes we actually read.\n", | |
" while (read := reader(buf)) == size:\n", | |
" total += counter(b'\\n')\n", | |
" else:\n", | |
" total += buf[:read].count(b'\\n')\n", | |
" return total\n", | |
"\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"id": "ae8a7f4d-02d0-428d-af76-65716d135493", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"1, 261 ms ± 5.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", | |
"2, 49.3 ms ± 805 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", | |
"3, 50.6 ms ± 1.32 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", | |
"4, 80.9 ms ± 1.67 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", | |
"5, 81.3 ms ± 1.42 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", | |
"6, 46.8 ms ± 714 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", | |
"---\n", | |
"6, 128k, 44.7 ms ± 893 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", | |
"6, 256k, 43.9 ms ± 764 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"print(\"1, \", end='')\n", | |
"%timeit counting1()\n", | |
"print(\"2, \", end='')\n", | |
"%timeit counting2()\n", | |
"print(\"3, \", end='')\n", | |
"%timeit counting3()\n", | |
"print(\"4, \", end='')\n", | |
"%timeit counting4()\n", | |
"print(\"5, \", end='')\n", | |
"%timeit counting5()\n", | |
"print(\"6, \", end='')\n", | |
"%timeit counting6()\n", | |
"print(\"---\")\n", | |
"print(\"6, 128k, \", end='')\n", | |
"%timeit counting6(128*1024) # 128kb at a time\n", | |
"print(\"6, 256k, \", end='')\n", | |
"%timeit counting6(256*1024) # 128kb at a time\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"id": "ef0fff5d-80aa-45f0-b3b5-65fe53c5d28f", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"4096\n", | |
"1 324 ms ± 16.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", | |
"2, 82.9 ms ± 954 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", | |
"3, 84.8 ms ± 886 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", | |
"6, 78.8 ms ± 1.71 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", | |
"8192\n", | |
"1 317 ms ± 12.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", | |
"2, 77.4 ms ± 1.02 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", | |
"3, 82.9 ms ± 1.06 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", | |
"6, 76.1 ms ± 1.68 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", | |
"16384\n", | |
"1 291 ms ± 5.88 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", | |
"2, 61.2 ms ± 1.54 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", | |
"3, 63.5 ms ± 1.69 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", | |
"6, 59.8 ms ± 1.82 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", | |
"65536\n", | |
"1 260 ms ± 6.03 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", | |
"2, 49.1 ms ± 1.96 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", | |
"3, 50.3 ms ± 2.1 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", | |
"6, 48.3 ms ± 1.13 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", | |
"262144\n", | |
"1 300 ms ± 6.12 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", | |
"2, 45.7 ms ± 959 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", | |
"3, 49.2 ms ± 786 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", | |
"6, 45.3 ms ± 2.11 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", | |
"524288\n", | |
"1 341 ms ± 28.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", | |
"2, 55.1 ms ± 4.17 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", | |
"3, 49.8 ms ± 1.42 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", | |
"6, 45 ms ± 944 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", | |
"1048576\n", | |
"1 349 ms ± 15.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", | |
"2, 64.9 ms ± 892 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", | |
"3, 66.1 ms ± 1.22 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", | |
"6, 45.2 ms ± 845 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", | |
"4194304\n", | |
"1 380 ms ± 19.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", | |
"2, 66.7 ms ± 1.29 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", | |
"3, 69.8 ms ± 4.67 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", | |
"6, 47.5 ms ± 955 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"for size in (4096, 8192, 16384, 65536, 256*1024, 512 * 1024, 1024*1024, 4*1024*1024):\n", | |
" print(size)\n", | |
" print(\"1 \", end='')\n", | |
" %timeit counting1(size)\n", | |
" print(\"2, \", end='')\n", | |
" %timeit counting2(size)\n", | |
" print(\"3, \", end='')\n", | |
" %timeit counting3(size)\n", | |
" print(\"6, \", end='')\n", | |
" %timeit counting6(size)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "0228a688-1945-405f-8aad-cc6c580cef0a", | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.10.6" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment