Skip to content

Instantly share code, notes, and snippets.

@mawillcockson
Created August 28, 2021 20:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mawillcockson/ea887ff974e22e7012766aa640d488ad to your computer and use it in GitHub Desktop.
Save mawillcockson/ea887ff974e22e7012766aa640d488ad to your computer and use it in GitHub Desktop.
Python String Concatenation Timing
import re
from pathlib import Path
from itertools import islice
from csv import DictWriter
from collections import defaultdict
def _parse_runs(lines):
    """Convert raw benchmark output into ``{count: {attempt: seconds}}``.

    The input is expected to alternate between a header line of the form
    ``#<attempt> - <count>`` and the summary line printed by
    ``python -m timeit`` (for example ``... best of 5: 12.3 usec per loop``).
    Blank lines are ignored.

    Args:
        lines: iterable of text lines without trailing newlines.

    Returns:
        A dict mapping each count to a dict of attempt number -> time in
        seconds, both in the order they first appear in the input.

    Raises:
        ValueError: if a header has no timing line after it, or if either
            line of a pair cannot be parsed.
    """
    header_pattern = re.compile(r"^#(?P<attempt>\d+) - (?P<count>\d+)$")
    # timeit formats values with "%.3g", which can yield exponents such as
    # "1e+03", so the exponent sign has to be accepted.
    timing_pattern = re.compile(
        r": (?P<value>\d+(\.\d+)?(e[+-]?\d+)?) (?P<unit>.?sec)"
    )
    # Multipliers that convert each unit timeit can print into seconds.
    units = {
        "nsec": 1 / 1_000_000_000,
        "usec": 1 / 1_000_000,
        "msec": 1 / 1_000,
        "sec": 1,
    }

    meaningful = [line for line in lines if line.strip()]
    if len(meaningful) % 2:
        raise ValueError(
            f"expected header/timing line pairs, got {len(meaningful)} lines"
        )

    results = defaultdict(dict)
    for header_text, timing_text in zip(meaningful[::2], meaningful[1::2]):
        header = header_pattern.match(header_text)
        if header is None:
            raise ValueError(f"unrecognised header line: {header_text!r}")
        timing = timing_pattern.search(timing_text)
        if timing is None or timing["unit"] not in units:
            raise ValueError(f"unrecognised timing line: {timing_text!r}")
        seconds = float(timing["value"]) * units[timing["unit"]]
        results[int(header["count"])][int(header["attempt"])] = seconds
    return dict(results)


def main():
    """Parse the saved benchmark output and write it out as a CSV table.

    Reads ``~/projects/time_python_string_concatenation_results.txt`` (which
    must exist) and writes
    ``~/projects/time_python_string_concatenation_parsed_results.txt`` with
    one row per count and one column per attempt number, times in seconds.

    Raises:
        FileNotFoundError: if the results file does not exist.
        ValueError: if the results file cannot be parsed.
    """
    results_path = (
        Path("~/projects/time_python_string_concatenation_results.txt")
        .expanduser()
        .resolve(strict=True)
    )
    parse_path = (
        Path("~/projects/time_python_string_concatenation_parsed_results.txt")
        .expanduser()
        .resolve()
    )
    parse_path.touch()

    results = _parse_runs(results_path.read_text().splitlines())

    # Collect every attempt number seen for any count (first-seen order), so
    # an empty file or a count with a different set of attempts does not crash.
    attempts = list(
        dict.fromkeys(
            attempt for timings in results.values() for attempt in timings
        )
    )

    # newline="" lets the csv module control line endings itself.
    with parse_path.open(mode="wt", newline="") as file:
        writer = DictWriter(file, fieldnames=["count", *attempts])
        writer.writeheader()
        for count, timings in results.items():
            writer.writerow({"count": count, **timings})


if __name__ == "__main__":
    main()
# Benchmark several ways of building a str by repeated concatenation.
#
# For every loop count and every attempt this prints a "#<attempt> - <count>"
# header followed by the summary line from `python -m timeit`.
set -eu

# Each ATTEMPT_N is a printf template; %s is replaced with the loop count.
#
# Every snippet creates its own accumulator. timeit runs the -s setup only
# once per timing run and then executes the statement many times, so an
# accumulator created in the setup would keep growing across executions and
# the measured work would no longer correspond to the printed count.

# 1: in-place concatenation
ATTEMPT_1="
s = ''
for _ in range(%s):
    s += data
"
# 2: collect in a list, join once at the end
ATTEMPT_2="
l = []
for _ in range(%s):
    l.append(data)
''.join(l)
"
# 3: rebuild the string with an f-string
ATTEMPT_3="
s = ''
for _ in range(%s):
    s = f'{s}{data}'
"
# 4: collect as dict values under distinct keys, join once at the end
ATTEMPT_4="
d = {}
for _ in range(%s):
    d[_] = data
s = ''.join(d.values())
"
# 5: join the accumulated string with the new piece each time
ATTEMPT_5="
s = ''
for _ in range(%s):
    s = ''.join((s,data))
"
# 6: append encoded bytes to a byte array, decode once at the end
ATTEMPT_6="
a = array.array('B')
for _ in range(%s):
    a.frombytes(data.encode('utf-8'))
a.tobytes().decode()
"
# 7: append to a unicode array, convert once at the end
ATTEMPT_7="
au = array.array('u')
for _ in range(%s):
    au.fromunicode(data)
au.tounicode()
"
# 8: store under the same dict key every time, then join the keys.
# NOTE: data never changes, so the dict ends up with a single key; this
# measures dict insertion rather than real concatenation.
ATTEMPT_8="
d = {}
for _ in range(%s):
    d[data] = 0
s = ''.join(d)
"

for count in $(seq 1000 10000 101000); do
    for attempt_number in $(seq 1 1 8); do
        # Indirect lookup of ATTEMPT_<n>; attempt_number only comes from seq.
        ATTEMPT="$(eval 'echo "${ATTEMPT_'"$attempt_number"'}"')"
        # The template is deliberately used as the printf format string.
        # shellcheck disable=SC2059
        FORMATTED_ATTEMPT="$(printf "$ATTEMPT" "$count")"
        echo "#$attempt_number - $count"
        python -m timeit -s "import array;data='a'" "$FORMATTED_ATTEMPT"
    done
done
# Benchmark several ways of building a bytes object by repeated concatenation.
#
# For every loop count and every attempt this prints a "#<attempt> - <count>"
# header followed by the summary line from `python -m timeit`.
set -eu

# Each ATTEMPT_N is a printf template; %s is replaced with the loop count.
#
# Every snippet creates its own accumulator. timeit runs the -s setup only
# once per timing run and then executes the statement many times, so an
# accumulator created in the setup would keep growing across executions and
# the measured work would no longer correspond to the printed count.

# 1: in-place concatenation
ATTEMPT_1="
s = b''
for _ in range(%s):
    s += data
"
# 2: collect in a list, join once at the end
ATTEMPT_2="
l = []
for _ in range(%s):
    l.append(data)
b''.join(l)
"
# 3: placeholder that keeps attempt numbers aligned; it times the bare
# expression 0 and does no concatenation.
ATTEMPT_3="0"
# 4: collect as dict values under distinct keys, join once at the end
ATTEMPT_4="
d = {}
for _ in range(%s):
    d[_] = data
s = b''.join(d.values())
"
# 5: join the accumulated bytes with the new piece each time
ATTEMPT_5="
s = b''
for _ in range(%s):
    s = b''.join((s,data))
"
# 6: append to a byte array, convert once at the end
ATTEMPT_6="
a = array.array('B')
for _ in range(%s):
    a.frombytes(data)
s = a.tobytes()
"
# 7: decode and append to a unicode array, convert once at the end
ATTEMPT_7="
au = array.array('u')
for _ in range(%s):
    au.fromunicode(data.decode())
au.tobytes()
"
# 8: store under the same dict key every time, then join the keys.
# NOTE: data never changes, so the dict ends up with a single key; this
# measures dict insertion rather than real concatenation.
ATTEMPT_8="
d = {}
for _ in range(%s):
    d[data] = 0
s = b''.join(d)
"

for count in $(seq 1000 10000 101000); do
    for attempt_number in $(seq 1 1 8); do
        # Indirect lookup of ATTEMPT_<n>; attempt_number only comes from seq.
        ATTEMPT="$(eval 'echo "${ATTEMPT_'"$attempt_number"'}"')"
        # The template is deliberately used as the printf format string.
        # shellcheck disable=SC2059
        FORMATTED_ATTEMPT="$(printf "$ATTEMPT" "$count")"
        echo "#$attempt_number - $count"
        python -m timeit -s "import array;data=b'a'" "$FORMATTED_ATTEMPT"
    done
done
# Benchmark several ways of building a bytes object from a fixed number of
# appends while varying the size of the chunk being appended.
#
# For every chunk size and every attempt this prints a "#<attempt> - <size>"
# header followed by the summary line from `python -m timeit`.
set -eu

# Number of appends performed by every snippet.
LOOPS=10000

# Each ATTEMPT_N is a printf template; %s is replaced with $LOOPS.
#
# Every snippet creates its own accumulator. timeit runs the -s setup only
# once per timing run and then executes the statement many times, so an
# accumulator created in the setup would keep growing across executions and
# the measured work would no longer correspond to the configured loop count.

# 1: in-place concatenation
ATTEMPT_1="
s = b''
for _ in range(%s):
    s += data
"
# 2: collect in a list, join once at the end
ATTEMPT_2="
l = []
for _ in range(%s):
    l.append(data)
b''.join(l)
"
# 3: placeholder that keeps attempt numbers aligned; it times the bare
# expression 0 and does no concatenation.
ATTEMPT_3="0"
# 4: collect as dict values under distinct keys, join once at the end
ATTEMPT_4="
d = {}
for _ in range(%s):
    d[_] = data
s = b''.join(d.values())
"
# 5: join the accumulated bytes with the new piece each time
ATTEMPT_5="
s = b''
for _ in range(%s):
    s = b''.join((s,data))
"
# 6: append to a byte array, convert once at the end
ATTEMPT_6="
a = array.array('B')
for _ in range(%s):
    a.frombytes(data)
s = a.tobytes()
"
# 7: decode and append to a unicode array, convert once at the end
ATTEMPT_7="
au = array.array('u')
for _ in range(%s):
    au.fromunicode(data.decode())
au.tobytes()
"
# 8: store under the same dict key every time, then join the keys.
# NOTE: data never changes, so the dict ends up with a single key; this
# measures dict insertion rather than real concatenation.
ATTEMPT_8="
d = {}
for _ in range(%s):
    d[data] = 0
s = b''.join(d)
"

# size is the length in bytes of the chunk appended on each iteration.
for size in $(seq 10 10 100) $(seq 200 100 1000); do
    for attempt_number in $(seq 1 1 8); do
        # Indirect lookup of ATTEMPT_<n>; attempt_number only comes from seq.
        ATTEMPT="$(eval 'echo "${ATTEMPT_'"$attempt_number"'}"')"
        # The template is deliberately used as the printf format string.
        # shellcheck disable=SC2059
        FORMATTED_ATTEMPT="$(printf "$ATTEMPT" "$LOOPS")"
        echo "#$attempt_number - $size"
        python -m timeit -s "import array;data=b'a'*$size" "$FORMATTED_ATTEMPT"
    done
done
@mawillcockson
Copy link
Author

graphs of the results

These kinds of microbenchmarks aren't necessarily useful, but they're fun.

Ultimately the fastest for all scenarios is something like:

from itertools import repeat

b"".join(repeat(data, count))

In my use case I had a stream of data, so the above wouldn't work.

@mawillcockson
Copy link
Author

What is interesting is that:

list = []
list.append(data)

is slower than

dictionary = {}
dictionary[data] = 0

It makes sense to me how the fastest insertion of all is:

list = [data]
list[0] = data

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment