Last active
January 16, 2021 10:39
-
-
Save dutc/d6c774084b9ed04644d498ec5c3cfe3e to your computer and use it in GitHub Desktop.
“Python Expert” Newsletter (Jan 13, 2021): ‘mutating tuples’ code
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
from matplotlib.pyplot import show | |
from pandas import read_csv, to_numeric, DataFrame | |
from collections import namedtuple | |
class Analysis(namedtuple('Analysis', 'raw df')):
    """Address measurements parsed from the tab-separated experiment output.

    Fields:
        raw: the DataFrame exactly as read from the TSV file.
        df:  a DataFrame indexed by ``size`` whose columns are
             ``('address', 'tuple')`` and/or ``('address', 'ndarray')``,
             each holding addresses converted to integers.
    """

    # Patterns matching the ``f'{expr = :#_x}'`` text printed by the test
    # scripts; the address may contain ``_`` digit separators.
    TUP_PATTERN = r'id\(t\) = (?P<addr>0x[\da-z_]+)'
    NDARR_PATTERN = r'xs.__array_interface__\["data"\]\[0\] = (?P<addr>0x[\da-z_]+)'

    @classmethod
    def from_csv(cls, filename, names):
        """Read a tab-separated file with the given column names and parse it."""
        raw = read_csv(filename, delimiter='\t', names=names)
        return cls.from_raw(raw)

    @staticmethod
    def _parse_addresses(series, pattern):
        """Extract the ``addr`` group of *pattern* from *series* as integers."""
        return (
            series.str.extract(pattern)['addr']
            .str.replace('_', '')
            .apply(lambda x: int(x, base=16))
        )

    @classmethod
    def from_raw(cls, raw):
        """Build an ``Analysis`` from a raw DataFrame of printed text columns.

        ``raw`` must have a ``size`` column; ``tuple_address`` and
        ``ndarray_data_address`` columns are parsed when present.
        """
        columns = {
            'size': to_numeric(raw['size'].str.extract(r'args.size = (?P<sz>\d+)')['sz']),
        }
        if 'tuple_address' in raw.columns:
            columns[('address', 'tuple')] = cls._parse_addresses(
                raw['tuple_address'], cls.TUP_PATTERN,
            )
        if 'ndarray_data_address' in raw.columns:
            columns[('address', 'ndarray')] = cls._parse_addresses(
                raw['ndarray_data_address'], cls.NDARR_PATTERN,
            )
        df = DataFrame(columns).set_index('size')
        # Re-assign the labels as a *tuple* of tuples (not a list) so pandas
        # rebuilds the column index from them; the script relies on
        # ``df['address', 'tuple']`` style lookups afterwards.
        df.columns = tuple(df.columns)
        return cls(raw, df)

    def lt(self, sz):
        """Return the rows of ``df`` whose size index is strictly below *sz*."""
        return self.df[self.df.index < sz]

    def le(self, sz):
        """Return the rows of ``df`` whose size index is at most *sz*."""
        return self.df[self.df.index <= sz]

    def gt(self, sz):
        """Return the rows of ``df`` whose size index is strictly above *sz*."""
        return self.df[self.df.index > sz]

    def ge(self, sz):
        """Return the rows of ``df`` whose size index is at least *sz*."""
        # Was ``>``, which silently excluded the boundary size itself.
        return self.df[self.df.index >= sz]

    def eq(self, sz):
        """Return the rows of ``df`` whose size index equals *sz*."""
        return self.df[self.df.index == sz]

    def ne(self, sz):
        """Return the rows of ``df`` whose size index differs from *sz*."""
        return self.df[self.df.index != sz]
# Script entry point: load the four TSV result files and print/plot summary
# statistics about where `tuple` objects and `ndarray` data buffers were
# allocated. The `# ...` blocks after each group of prints are recorded output.
if __name__ == '__main__':
    # Column names follow the order in which the test scripts print their
    # fields; `_` marks a printed field that the analysis does not parse.
    tup = Analysis.from_csv('tuple.tsv', names='tuple_address size'.split())
    ndarr = Analysis.from_csv('ndarray.tsv', names='_ ndarray_data_address size'.split())

    # ① tuple allocation covers a large range
    # NOTE: the f-strings below use the `=` specifier, so the expression text
    # itself is part of the printed output.
    print('single `tuple` stats'.center(80, '='))
    print(f'{tup.df["address", "tuple"].max() = :#16_x}')
    print(f'{tup.df["address", "tuple"].min() = :#16_x}')
    print(f'{tup.df["address", "tuple"].mean().astype(int) = :#16_x}')
    print(f'{tup.df["address", "tuple"].std().astype(int) = :#16_x}')
    print(f'{tup.df["address", "tuple"].unique().size = :,}')
    # ==============================single `tuple` stats==============================
    # tup.df["address", "tuple"].max() = 0x7ffb_fae3_a130
    # tup.df["address", "tuple"].min() = 0x5555_58b8_e360
    # tup.df["address", "tuple"].mean().astype(int) = 0x5fa8_e3bd_42d7
    # tup.df["address", "tuple"].std().astype(int) = 0x11b0_444c_adef
    # tup.df["address", "tuple"].unique().size = 25,000

    print('single `ndarray` stats'.center(80, '='))
    print(f'{ndarr.df["address", "ndarray"].max() = :#16_x}')
    print(f'{ndarr.df["address", "ndarray"].min() = :#16_x}')
    print(f'{ndarr.df["address", "ndarray"].mean().astype(int) = :#16_x}')
    print(f'{ndarr.df["address", "ndarray"].std().astype(int) = :#16_x}')
    print(f'{ndarr.df["address", "ndarray"].unique().size = :,}')
    # =============================single `ndarray` stats=============================
    # ndarr.df["address", "ndarray"].max() = 0x5655_566e_a3a0
    # ndarr.df["address", "ndarray"].min() = 0x5555_561a_2800
    # ndarr.df["address", "ndarray"].mean().astype(int) = 0x55d5_5ec0_3d75
    # ndarr.df["address", "ndarray"].std().astype(int) = 0x49_7571_d012
    # ndarr.df["address", "ndarray"].unique().size = 25,000

    print('Range of Addresses'.center(80, '='))
    print(f'{tup.df ["address", "tuple" ].max() - tup.df ["address", "tuple" ].min() = :#16_x}')
    print(f'{ndarr.df["address", "ndarray"].max() - ndarr.df["address", "ndarray"].min() = :#16_x}')
    # ===============================Range of Addresses===============================
    # tup.df ["address", "tuple" ].max() - tup.df ["address", "tuple" ].min() = 0x2aa6_a22a_bdd0
    # ndarr.df["address", "ndarray"].max() - ndarr.df["address", "ndarray"].min() = 0x100_0054_7ba0

    # Plot display is switched off by default (the walrus binds SHOW_PLOTS to
    # False and tests it in one step).
    # NOTE(review): `show()` appears before the plot below is created, so
    # enabling the flag as-is would presumably display nothing — confirm the
    # intended placement against the original file's indentation.
    if (SHOW_PLOTS := False):
        show()

    # ② visible discontinuity in tuple allocation
    # Mean address per size for both kinds, drawn on one chart; the y axis is
    # hidden, so only the shape of the curves is shown.
    plt = DataFrame({
        'tuple': tup.df["address", "tuple"].groupby("size").mean(),
        'ndarray': ndarr.df["address", "ndarray"].groupby("size").mean(),
    }).plot(title='Memory Location', legend=True)
    plt.axes.yaxis.set_visible(False)

    # ③ discontinuity in allocation at size 60 elements
    # `idx` is the size with the largest drop in mean address versus the
    # previous size; the rows idx-1 .. idx+1 around it are printed.
    print('Mean Address Around Discontinuty'.center(80, '='))
    print((df := tup.df["address", "tuple"].groupby("size").mean()).loc[(idx := df.diff().idxmin())-1: idx+1])
    # ========================Mean Address Around Discontinuty========================
    # size
    # 59 1.401593e+14
    # 60 9.436053e+13
    # 61 9.440086e+13
    # Name: (address, tuple), dtype: float64

    tup_ndarr = Analysis.from_csv('tuple-ndarray.tsv', names='tuple_address _ ndarray_data_address size'.split())
    ndarr_tup = Analysis.from_csv('ndarray-tuple.tsv', names='_ ndarray_data_address tuple_address size'.split())

    # ④ for sizes < 60, `tuple` is always before `ndarray` in memory, irrespective of allocation order
    # NOTE(review): as written, each assertion passes when at least one
    # `ndarray` address is below the smallest `tuple` address (`.max()` of a
    # boolean Series acts like "any"), and the failure message reads like the
    # claim above rather than its violation — confirm the intended direction
    # and whether "all" (`.min()`/`.all()`) was meant.
    assert ((df := tup_ndarr.lt(60))['address', 'tuple'].min() > df['address', 'ndarray']).max(), '`tuple` appears before `ndarray`!'
    assert ((df := ndarr_tup.lt(60))['address', 'tuple'].min() > df['address', 'ndarray']).max(), '`tuple` appears before `ndarray`!'

    # ④ for sizes ≥ 60, `tuple` may or may not be before `ndarray` in memory
    # The mean of each boolean comparison is the fraction of runs for which it
    # holds.
    print('Frequency `tuple` before/after `ndarray`'.center(80, '='))
    print(f'{((df := tup_ndarr.ge(60))["address", "tuple"] < df["address", "ndarray"]).mean() = :.2f}')
    print(f'{((df := tup_ndarr.ge(60))["address", "tuple"] >= df["address", "ndarray"]).mean() = :.2f}')
    print(f'{((df := ndarr_tup.ge(60))["address", "tuple"] < df["address", "ndarray"]).mean() = :.2f}')
    print(f'{((df := ndarr_tup.ge(60))["address", "tuple"] >= df["address", "ndarray"]).mean() = :.2f}')
    # ====================Frequency `tuple` before/after `ndarray`====================
    # ((df := tup_ndarr.ge(60))["address", "tuple"] < df["address", "ndarray"]).mean() = 0.89
    # ((df := tup_ndarr.ge(60))["address", "tuple"] >= df["address", "ndarray"]).mean() = 0.11
    # ((df := ndarr_tup.ge(60))["address", "tuple"] < df["address", "ndarray"]).mean() = 0.08
    # ((df := ndarr_tup.ge(60))["address", "tuple"] >= df["address", "ndarray"]).mean() = 0.92
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/zsh | |
# Run a generated zsh script inside a bubblewrap (`bwrap`) sandbox.
#
# Arguments are the NAMES of parameters (not their values) holding:
#   $1  text of /tmp/code/debug.py
#   $2  text of /tmp/code/script.py
#   $3  text of /tmp/code/script.zsh (the sandbox entry point)
::run() {
    # (P) expands the parameter whose name is stored in the positional argument.
    local debug_py=( "${(@PA)1}" )
    local script_py=( "${(@PA)2}" )
    local script_zsh=( "${(@PA)3}" )
    # Keep the command line local so it does not leak into the caller's scope.
    local -a cmd
    cmd=(
        bwrap
        --ro-bind / /
        --bind /tmp /tmp
        --tmpfs /tmp/code
        --tmpfs /tmp/results
        # The single-quoted words stay literal here and are only expanded by the
        # `eval` below; `(:t)` keeps the last path component of the
        # process-substitution path (presumably the descriptor number that
        # `--ro-bind-data` expects — verify against the bwrap manual).
        --ro-bind-data '<( printf "%s\n" "${(@)debug_py}" )(:t)' /tmp/code/debug.py
        --ro-bind-data '<( printf "%s\n" "${(@)script_py}" )(:t)' /tmp/code/script.py
        --ro-bind-data '<( printf "%s\n" "${(@)script_zsh}" )(:t)' /tmp/code/script.zsh
        zsh /tmp/code/script.zsh
    )
    # The array is joined into one string and re-parsed so that the quoted
    # process substitutions above are evaluated at this point.
    eval "$cmd"
}
# Strip a common leading-space indent from every line read on stdin.
#
#   $1 (optional)  number of leading spaces to remove; when omitted, the
#                  indent of the first input line is used.
# Lines with fewer leading spaces than the indent are passed through unchanged.
::dedent() {
    # All working variables are local so nothing leaks into the caller.
    local first_line indent indent_text
    # `read` consumes exactly one line, leaving the rest of stdin for `cat`
    # even when stdin is a pipe (unlike `head -n1`, which may read ahead).
    IFS= read -r first_line
    if (( $# )); then
        indent="${1}"
    else
        # Everything before the first non-space character.
        indent_text="${first_line%%[^ ]*}"
        indent="${#indent_text}"
    fi
    # `printf '%s\n'` re-emits the first line verbatim; zsh's `echo` would
    # interpret backslash escapes in it.
    sed -r "s/^[ ]{$indent}//" <( printf '%s\n' "$first_line" ; cat )
}
# Generate three scripts (a gdb driver, a Python workload, and a zsh launcher)
# and run them in the sandbox via `::run`.
#
# All arguments are forwarded to the Python workload, which accepts
# `--size N` and `--path {tuple_alloc,npy_alloc_cache}`.
::generate-backtrace() {
    local flags=(
        -nh -q
        --ex 'source /tmp/code/debug.py'
        --args python3 /tmp/code/script.py "$@"
    )
    # The heredoc is unquoted on purpose: ${(@q+)flags} is expanded (and
    # shell-quoted) while the launcher text is being generated.
    local script_zsh="$(::dedent <<-EOF
        #!/bin/zsh
        gdb ${(@q+)flags}
EOF
    )"
    # gdb-side Python: create disabled breakpoints, run until the workload
    # traps, then enable them so each hit prints a backtrace and continues.
    local debug_py="$(::dedent <<-EOF
        from gdb import execute, Breakpoint
        setup = [
            # 'set logging file /tmp/results/gdb.txt',
            # 'set logging redirect on',
            # 'set logging on',
            'set pagination off',
            'set breakpoint pending on',
            'set confirm off',
        ]
        for cmd in setup:
            execute(cmd)
        breakpoints = []
        for func in 'malloc tuple_alloc npy_alloc_cache'.split():
            b = Breakpoint(function=func)
            # ① print backtrace and continue execution
            b.commands = f'''
                backtrace
                continue
            '''
            b.silent = True
            b.enabled = False
            breakpoints.append(b)
        execute('run')
        for b in breakpoints:
            b.enabled = True
        execute('continue')
        execute('quit')
EOF
    )"
    # Declared local like its siblings; it was previously assigned as a global.
    local script_py="$(::dedent <<-EOF
        #!/usr/bin/env python3
        from argparse import ArgumentParser
        from enum import Enum, auto
        from random import random
        from numpy import array
        from os import kill, getpid
        from signal import SIGTRAP
        class CodePath(Enum):
            tuple_alloc = auto()
            npy_alloc_cache = auto()
        parser = ArgumentParser()
        parser.add_argument('--size', type=int)
        parser.add_argument('--path', type=CodePath.__getitem__)
        # ② drop into debugger after initialisation
        import sys; sys.breakpointhook = lambda: kill(getpid(), SIGTRAP)
        if __name__ == '__main__':
            args = parser.parse_args()
            breakpoint()
            if args.path is CodePath.tuple_alloc:
                xs = tuple([random() for _ in range(args.size)])
            elif args.path is CodePath.npy_alloc_cache:
                xs = array([random() for _ in range(args.size)])
            print(f'{type(xs) = }')
EOF
    )"
    # `::run` takes parameter NAMES and reads their values itself.
    ::run debug_py script_py script_zsh
}
# Entry point: forward the script's command-line arguments unchanged.
::generate-backtrace "$@"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/zsh | |
# Run a generated zsh script inside a bubblewrap (`bwrap`) sandbox.
#
# Arguments are the NAMES of parameters (not their values) holding:
#   $1  text of /tmp/code/script.py
#   $2  text of /tmp/code/script.zsh (the sandbox entry point)
::run() {
    # (P) expands the parameter whose name is stored in the positional argument.
    local py_script=( "${(@PA)1}" )
    local zsh_script=( "${(@PA)2}" )
    # Keep the command line local so it does not leak into the caller's scope.
    local -a cmd
    cmd=(
        bwrap # ① bwrap helper
        --ro-bind / /
        --bind /tmp /tmp
        --tmpfs /tmp/code
        # The single-quoted words stay literal here and are only expanded by the
        # `eval` below; `(:t)` keeps the last path component of the
        # process-substitution path (presumably the descriptor number that
        # `--ro-bind-data` expects — verify against the bwrap manual).
        --ro-bind-data '<( printf "%s\n" "${(@)py_script}" )(:t)' /tmp/code/script.py
        --ro-bind-data '<( printf "%s\n" "${(@)zsh_script}" )(:t)' /tmp/code/script.zsh
        zsh /tmp/code/script.zsh
    )
    # The array is joined into one string and re-parsed so that the quoted
    # process substitutions above are evaluated at this point.
    eval "$cmd"
}
# Strip a common leading-space indent from every line read on stdin.
#
#   $1 (optional)  number of leading spaces to remove; when omitted, the
#                  indent of the first input line is used.
# Lines with fewer leading spaces than the indent are passed through unchanged.
::dedent() {
    # All working variables are local so nothing leaks into the caller.
    local first_line indent indent_text
    # `read` consumes exactly one line, leaving the rest of stdin for `cat`
    # even when stdin is a pipe (unlike `head -n1`, which may read ahead).
    IFS= read -r first_line
    if (( $# )); then
        indent="${1}"
    else
        # Everything before the first non-space character.
        indent_text="${first_line%%[^ ]*}"
        indent="${#indent_text}"
    fi
    # `printf '%s\n'` re-emits the first line verbatim; zsh's `echo` would
    # interpret backslash escapes in it.
    sed -r "s/^[ ]{$indent}//" <( printf '%s\n' "$first_line" ; cat )
}
# Generate a Python script that builds one `tuple` of `--size` elements and
# prints its id() and the size, tab-separated, then run it in the sandbox.
# `zsh_script` is not set here: `::run` receives it by name, so a parameter of
# that name must already exist when this function is called.
::test-tuple-id() { # ② `tuple` only
    local py_script="$(::dedent <<-EOF
        #!/usr/bin/env python3
        from argparse import ArgumentParser
        parser = ArgumentParser()
        parser.add_argument('--size', type=int)
        if __name__ == '__main__':
            args = parser.parse_args()
            t = tuple([None for _ in range(args.size)])
            print(f'{id(t) = :#_x}\t{args.size = }')
EOF
    )"
    ::run py_script zsh_script
}
# Generate a Python script that builds one numpy array of `--size` elements
# and prints its id(), its data-buffer address and the size, tab-separated,
# then run it in the sandbox.
# `zsh_script` is not set here: `::run` receives it by name, so a parameter of
# that name must already exist when this function is called.
::test-ndarray-id() { # ③ `ndarray` only
    local py_script="$(::dedent <<-EOF
        #!/usr/bin/env python3
        from argparse import ArgumentParser
        from numpy import array
        parser = ArgumentParser()
        parser.add_argument('--size', type=int)
        if __name__ == '__main__':
            args = parser.parse_args()
            xs = array([None for _ in range(args.size)])
            print(f'{id(xs) = :#_x}\t{xs.__array_interface__["data"][0] = :#_x}\t{args.size = }')
EOF
    )"
    ::run py_script zsh_script
}
# Generate a Python script that allocates a `tuple` first and a numpy array
# second (both of `--size` elements) and prints the tuple id(), the array
# id(), the array data-buffer address and the size, tab-separated; then run it
# in the sandbox.
# `zsh_script` is not set here: `::run` receives it by name, so a parameter of
# that name must already exist when this function is called.
::test-tuple-ndarray-id() { # ④ `tuple` then `ndarray`
    local py_script="$(::dedent <<-EOF
        #!/usr/bin/env python3
        from argparse import ArgumentParser
        from numpy import array
        parser = ArgumentParser()
        parser.add_argument('--size', type=int)
        if __name__ == '__main__':
            args = parser.parse_args()
            t = tuple([None for _ in range(args.size)])
            xs = array([None for _ in range(args.size)])
            print(f'{id(t) = :#_x}\t{id(xs) = :#_x}\t{xs.__array_interface__["data"][0] = :#_x}\t{args.size = }')
EOF
    )"
    ::run py_script zsh_script
}
# Generate a Python script that allocates a numpy array first and a `tuple`
# second (both of `--size` elements) and prints the array id(), the array
# data-buffer address, the tuple id() and the size, tab-separated; then run it
# in the sandbox.
# `zsh_script` is not set here: `::run` receives it by name, so a parameter of
# that name must already exist when this function is called.
::test-ndarray-tuple-id() { # ⑤ `ndarray` then `tuple`
    local py_script="$(::dedent <<-EOF
        #!/usr/bin/env python3
        from argparse import ArgumentParser
        from numpy import array
        parser = ArgumentParser()
        parser.add_argument('--size', type=int)
        if __name__ == '__main__':
            args = parser.parse_args()
            xs = array([None for _ in range(args.size)])
            t = tuple([None for _ in range(args.size)])
            print(f'{id(xs) = :#_x}\t{xs.__array_interface__["data"][0] = :#_x}\t{id(t) = :#_x}\t{args.size = }')
EOF
    )"
    ::run py_script zsh_script
}
# ⑥ run 100 times for each size in [1, 250] elements
# The `$` signs are escaped so the loops are expanded by the generated script
# inside the sandbox, not while this text is being built.
zsh_script="$(::dedent <<-EOF
    #!/bin/zsh
    for sz in \$(seq 1 250); do
        for _ in \$(seq 1 100); do
            python /tmp/code/script.py --size "\$sz"
        done
    done
EOF
)"
Results files here:
- allocation patterns (id-tests.zsh, analysis.py): https://gist.github.com/dutc/f2a932993a1fcdc5a05c4f726c5b34f8
- comparative backtraces (backtraces.zsh): https://gist.github.com/dutc/07649346e1f4092bde44d959c12ba333
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
For the full explanation, sign up for the “Python Expert” newsletter!
bit.ly/expert-python