Skip to content

Instantly share code, notes, and snippets.

@dutc
Last active January 16, 2021 10:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dutc/d6c774084b9ed04644d498ec5c3cfe3e to your computer and use it in GitHub Desktop.
Save dutc/d6c774084b9ed04644d498ec5c3cfe3e to your computer and use it in GitHub Desktop.
“Python Expert” Newsletter (Jan 13, 2021): ‘mutating tuples’ code
#!/usr/bin/env python3
from matplotlib.pyplot import show
from pandas import read_csv, to_numeric, DataFrame
from collections import namedtuple
class Analysis(namedtuple('Analysis', 'raw df')):
    """Address measurements: the raw table plus a size-indexed frame.

    Fields:
        raw: the table the instance was built from. It must have a ``size``
            column and may have ``tuple_address`` and/or
            ``ndarray_data_address`` columns, all holding text.
        df: a frame indexed by the parsed integer ``size`` with one integer
            column per address kind present in ``raw``, keyed
            ``('address', 'tuple')`` and ``('address', 'ndarray')``.
    """

    # Each pattern captures, as ``addr``, a hexadecimal address that may
    # contain ``_`` digit separators (e.g. ``0x7ffb_fae3_a130``).
    TUP_PATTERN = r'id\(t\) = (?P<addr>0x[\da-z_]+)'
    NDARR_PATTERN = r'xs.__array_interface__\["data"\]\[0\] = (?P<addr>0x[\da-z_]+)'

    @staticmethod
    def _parse_addresses(column, pattern):
        """Return the ``addr`` group of *pattern* in *column* as integers."""
        return (
            column.str.extract(pattern)['addr']
            .str.replace('_', '')  # drop the digit separators before parsing
            .apply(lambda x: int(x, base=16))
        )

    @classmethod
    def from_csv(cls, filename, names):
        """Build an instance from a tab-separated file.

        Args:
            filename: path handed to ``pandas.read_csv``.
            names: column names to assign to the file's columns.
        """
        raw = read_csv(filename, delimiter='\t', names=names)
        return cls.from_raw(raw)

    @classmethod
    def from_raw(cls, raw):
        """Build an instance from an already-loaded table of text columns.

        Args:
            raw: frame with a ``size`` column and, optionally,
                ``tuple_address`` / ``ndarray_data_address`` columns.
        """
        columns = {
            'size': to_numeric(raw['size'].str.extract(r'args.size = (?P<sz>\d+)')['sz']),
        }
        if 'tuple_address' in raw.columns:
            columns[('address', 'tuple')] = cls._parse_addresses(
                raw['tuple_address'], cls.TUP_PATTERN,
            )
        if 'ndarray_data_address' in raw.columns:
            columns[('address', 'ndarray')] = cls._parse_addresses(
                raw['ndarray_data_address'], cls.NDARR_PATTERN,
            )
        df = DataFrame(columns)
        df = df.set_index('size')
        # Re-assign the remaining labels as a plain tuple so pandas rebuilds
        # the column index from scratch (presumably so the tuple keys become
        # a proper two-level index -- confirm against the pandas version used).
        df.columns = tuple(df.columns)
        return cls(raw, df)

    # Row selectors: each returns the rows of ``df`` whose ``size`` index
    # satisfies the named comparison against ``sz``.
    def lt(self, sz):
        """Rows with size strictly less than *sz*."""
        return self.df[self.df.index < sz]

    def le(self, sz):
        """Rows with size less than or equal to *sz*."""
        return self.df[self.df.index <= sz]

    def gt(self, sz):
        """Rows with size strictly greater than *sz*."""
        return self.df[self.df.index > sz]

    def ge(self, sz):
        """Rows with size greater than or equal to *sz*."""
        # Must include the boundary itself; a strict `>` here would make
        # `ge` indistinguishable from `gt`.
        return self.df[self.df.index >= sz]

    def eq(self, sz):
        """Rows with size equal to *sz*."""
        return self.df[self.df.index == sz]

    def ne(self, sz):
        """Rows with size different from *sz*."""
        return self.df[self.df.index != sz]
if __name__ == '__main__':
    # Measurements of a lone `tuple` and of a lone `ndarray`, one row per run.
    tup = Analysis.from_csv('tuple.tsv', names='tuple_address size'.split())
    ndarr = Analysis.from_csv('ndarray.tsv', names='_ ndarray_data_address size'.split())

    # ① tuple allocation covers a large range
    print('single `tuple` stats'.center(80, '='))
    print(f'{tup.df["address", "tuple"].max() = :#16_x}')
    print(f'{tup.df["address", "tuple"].min() = :#16_x}')
    print(f'{tup.df["address", "tuple"].mean().astype(int) = :#16_x}')
    print(f'{tup.df["address", "tuple"].std().astype(int) = :#16_x}')
    print(f'{tup.df["address", "tuple"].unique().size = :,}')
    # ==============================single `tuple` stats==============================
    # tup.df["address", "tuple"].max() = 0x7ffb_fae3_a130
    # tup.df["address", "tuple"].min() = 0x5555_58b8_e360
    # tup.df["address", "tuple"].mean().astype(int) = 0x5fa8_e3bd_42d7
    # tup.df["address", "tuple"].std().astype(int) = 0x11b0_444c_adef
    # tup.df["address", "tuple"].unique().size = 25,000

    print('single `ndarray` stats'.center(80, '='))
    print(f'{ndarr.df["address", "ndarray"].max() = :#16_x}')
    print(f'{ndarr.df["address", "ndarray"].min() = :#16_x}')
    print(f'{ndarr.df["address", "ndarray"].mean().astype(int) = :#16_x}')
    print(f'{ndarr.df["address", "ndarray"].std().astype(int) = :#16_x}')
    print(f'{ndarr.df["address", "ndarray"].unique().size = :,}')
    # =============================single `ndarray` stats=============================
    # ndarr.df["address", "ndarray"].max() = 0x5655_566e_a3a0
    # ndarr.df["address", "ndarray"].min() = 0x5555_561a_2800
    # ndarr.df["address", "ndarray"].mean().astype(int) = 0x55d5_5ec0_3d75
    # ndarr.df["address", "ndarray"].std().astype(int) = 0x49_7571_d012
    # ndarr.df["address", "ndarray"].unique().size = 25,000

    print('Range of Addresses'.center(80, '='))
    print(f'{tup.df ["address", "tuple" ].max() - tup.df ["address", "tuple" ].min() = :#16_x}')
    print(f'{ndarr.df["address", "ndarray"].max() - ndarr.df["address", "ndarray"].min() = :#16_x}')
    # ===============================Range of Addresses===============================
    # tup.df ["address", "tuple" ].max() - tup.df ["address", "tuple" ].min() = 0x2aa6_a22a_bdd0
    # ndarr.df["address", "ndarray"].max() - ndarr.df["address", "ndarray"].min() = 0x100_0054_7ba0

    # ② visible discontinuity in tuple allocation
    plt = DataFrame({
        'tuple': tup.df["address", "tuple"].groupby("size").mean(),
        'ndarray': ndarr.df["address", "ndarray"].groupby("size").mean(),
    }).plot(title='Memory Location', legend=True)
    plt.axes.yaxis.set_visible(False)
    # Display only on request, and only once the figure has been built:
    # calling `show()` ahead of `.plot(...)` would have nothing to draw.
    if (SHOW_PLOTS := False):
        show()

    # ③ discontinuity in allocation at size 60 elements
    print('Mean Address Around Discontinuty'.center(80, '='))
    mean_address = tup.df["address", "tuple"].groupby("size").mean()
    # The largest drop between consecutive sizes marks the discontinuity;
    # print it together with its two neighbours.
    idx = mean_address.diff().idxmin()
    print(mean_address.loc[idx - 1: idx + 1])
    # ========================Mean Address Around Discontinuty========================
    # size
    # 59 1.401593e+14
    # 60 9.436053e+13
    # 61 9.440086e+13
    # Name: (address, tuple), dtype: float64

    # Measurements where both objects are created in the same process, in
    # either order.
    tup_ndarr = Analysis.from_csv('tuple-ndarray.tsv', names='tuple_address _ ndarray_data_address size'.split())
    ndarr_tup = Analysis.from_csv('ndarray-tuple.tsv', names='_ ndarray_data_address tuple_address size'.split())

    # ④ for sizes < 60, every `ndarray` data address lies below the lowest
    #   `tuple` address, irrespective of allocation order.  `.all()` is needed
    #   for "every"; `.max()` on the boolean result would only check "any".
    assert ((df := tup_ndarr.lt(60))['address', 'tuple'].min() > df['address', 'ndarray']).all(), '`tuple` appears before `ndarray`!'
    assert ((df := ndarr_tup.lt(60))['address', 'tuple'].min() > df['address', 'ndarray']).all(), '`tuple` appears before `ndarray`!'

    # ④ for sizes ≥ 60, `tuple` may or may not be before `ndarray` in memory
    print('Frequency `tuple` before/after `ndarray`'.center(80, '='))
    print(f'{((df := tup_ndarr.ge(60))["address", "tuple"] < df["address", "ndarray"]).mean() = :.2f}')
    print(f'{((df := tup_ndarr.ge(60))["address", "tuple"] >= df["address", "ndarray"]).mean() = :.2f}')
    print(f'{((df := ndarr_tup.ge(60))["address", "tuple"] < df["address", "ndarray"]).mean() = :.2f}')
    print(f'{((df := ndarr_tup.ge(60))["address", "tuple"] >= df["address", "ndarray"]).mean() = :.2f}')
    # ====================Frequency `tuple` before/after `ndarray`====================
    # ((df := tup_ndarr.ge(60))["address", "tuple"] < df["address", "ndarray"]).mean() = 0.89
    # ((df := tup_ndarr.ge(60))["address", "tuple"] >= df["address", "ndarray"]).mean() = 0.11
    # ((df := ndarr_tup.ge(60))["address", "tuple"] < df["address", "ndarray"]).mean() = 0.08
    # ((df := ndarr_tup.ge(60))["address", "tuple"] >= df["address", "ndarray"]).mean() = 0.92
#!/bin/zsh
# Run a generated zsh script inside a bubblewrap (`bwrap`) sandbox.
#
# Every argument is the NAME of a variable, not the text itself:
#   $1  variable holding the gdb Python script -> /tmp/code/debug.py
#   $2  variable holding the Python script     -> /tmp/code/script.py
#   $3  variable holding the zsh driver script -> /tmp/code/script.zsh
::run() {
    # (P) treats the value of $N as a parameter name and expands that
    # parameter; the result is stored as an array.
    local debug_py=( "${(@PA)1}" )
    local script_py=( "${(@PA)2}" )
    local script_zsh=( "${(@PA)3}" )
    # Keep the command line local so it does not leak into the caller's scope.
    local -a cmd
    cmd=(
        bwrap
        --ro-bind / /
        --bind /tmp /tmp
        --tmpfs /tmp/code
        --tmpfs /tmp/results
        # The process substitutions are single-quoted so that they are only
        # evaluated by the `eval` below.  `(:t)` presumably reduces the
        # substituted path to its trailing component (the fd number), which
        # is what `--ro-bind-data FD DEST` takes -- confirm against bwrap(1).
        --ro-bind-data '<( printf "%s\n" "${(@)debug_py}" )(:t)' /tmp/code/debug.py
        --ro-bind-data '<( printf "%s\n" "${(@)script_py}" )(:t)' /tmp/code/script.py
        --ro-bind-data '<( printf "%s\n" "${(@)script_zsh}" )(:t)' /tmp/code/script.zsh
        zsh /tmp/code/script.zsh
    )
    eval "$cmd"
}
# Strip a common run of leading spaces from every line read on stdin.
#
#   $1  (optional) number of leading spaces to remove; when omitted, the
#       indentation of the first input line is used.
::dedent() {
    local first_line indent indent_text
    # Consume only the first line; this relies on `head` leaving the rest
    # of stdin unread so that `cat` below can pick it up.
    first_line="$(head -n1)"
    if (( $# )); then
        indent="${1}"
    else
        indent_text="$(sed -r 's/^([ ]*).*/\1/' <<< "$first_line")"
        indent="${#indent_text}"
    fi
    # Replay the first line verbatim (`printf '%s'` rather than `echo`, which
    # would interpret backslash escapes in zsh) followed by the remaining input.
    sed -r "s/^[ ]{$indent}//" <( printf '%s\n' "$first_line" ; cat )
}
# Run a small Python script under gdb inside the sandbox and print a backtrace
# every time `malloc`, `tuple_alloc` or `npy_alloc_cache` is hit.
#
#   "$@"  forwarded to the Python script (it accepts --size and --path).
::generate-backtrace() {
    local flags=(
        -nh -q
        --ex 'source /tmp/code/debug.py'
        --args python3 /tmp/code/script.py "$@"
    )
    # Driver executed inside the sandbox; the gdb flags are expanded (quoted)
    # here, when the heredoc is read.
    local script_zsh="$(::dedent <<-EOF
        #!/bin/zsh
        gdb ${(@q+)flags}
EOF
    )"
    # gdb-side script: creates the breakpoints disabled, runs until the
    # inferior stops, then enables them and continues.
    local debug_py="$(::dedent <<-EOF
        from gdb import execute, Breakpoint
        setup = [
            # 'set logging file /tmp/results/gdb.txt',
            # 'set logging redirect on',
            # 'set logging on',
            'set pagination off',
            'set breakpoint pending on',
            'set confirm off',
        ]
        for cmd in setup:
            execute(cmd)
        breakpoints = []
        for func in 'malloc tuple_alloc npy_alloc_cache'.split():
            b = Breakpoint(function=func)
            # ① print backtrace and continue execution
            b.commands = f'''
                backtrace
                continue
            '''
            b.silent = True
            b.enabled = False
            breakpoints.append(b)
        execute('run')
        for b in breakpoints:
            b.enabled = True
        execute('continue')
        execute('quit')
EOF
    )"
    # Traced program: raises SIGTRAP via `breakpoint()` once start-up is done,
    # then builds either a `tuple` or an `ndarray` of the requested size.
    local script_py="$(::dedent <<-EOF
        #!/usr/bin/env python3
        from argparse import ArgumentParser
        from enum import Enum, auto
        from random import random
        from numpy import array
        from os import kill, getpid
        from signal import SIGTRAP
        class CodePath(Enum):
            tuple_alloc = auto()
            npy_alloc_cache = auto()
        parser = ArgumentParser()
        parser.add_argument('--size', type=int)
        parser.add_argument('--path', type=CodePath.__getitem__)
        # ② drop into debugger after initialisation
        import sys; sys.breakpointhook = lambda: kill(getpid(), SIGTRAP)
        if __name__ == '__main__':
            args = parser.parse_args()
            breakpoint()
            if args.path is CodePath.tuple_alloc:
                xs = tuple([random() for _ in range(args.size)])
            elif args.path is CodePath.npy_alloc_cache:
                xs = array([random() for _ in range(args.size)])
            print(f'{type(xs) = }')
EOF
    )"
    # `::run` takes variable NAMES and dereferences them itself.
    ::run debug_py script_py script_zsh
}
# Entry point: forward this script's arguments to the traced Python script.
::generate-backtrace "$@"
#!/bin/zsh
# Run a generated zsh script inside a bubblewrap (`bwrap`) sandbox.
#
# Both arguments are the NAME of a variable, not the text itself:
#   $1  variable holding the Python script     -> /tmp/code/script.py
#   $2  variable holding the zsh driver script -> /tmp/code/script.zsh
::run() {
    # (P) treats the value of $N as a parameter name and expands that
    # parameter; the result is stored as an array.
    local py_script=( "${(@PA)1}" )
    local zsh_script=( "${(@PA)2}" )
    # Keep the command line local so it does not leak into the caller's scope.
    local -a cmd
    cmd=(
        bwrap # ① bwrap helper
        --ro-bind / /
        --bind /tmp /tmp
        --tmpfs /tmp/code
        # The process substitutions are single-quoted so that they are only
        # evaluated by the `eval` below.  `(:t)` presumably reduces the
        # substituted path to its trailing component (the fd number), which
        # is what `--ro-bind-data FD DEST` takes -- confirm against bwrap(1).
        --ro-bind-data '<( printf "%s\n" "${(@)py_script}" )(:t)' /tmp/code/script.py
        --ro-bind-data '<( printf "%s\n" "${(@)zsh_script}" )(:t)' /tmp/code/script.zsh
        zsh /tmp/code/script.zsh
    )
    eval "$cmd"
}
# Strip a common run of leading spaces from every line read on stdin.
#
#   $1  (optional) number of leading spaces to remove; when omitted, the
#       indentation of the first input line is used.
::dedent() {
    local first_line indent indent_text
    # Consume only the first line; this relies on `head` leaving the rest
    # of stdin unread so that `cat` below can pick it up.
    first_line="$(head -n1)"
    if (( $# )); then
        indent="${1}"
    else
        indent_text="$(sed -r 's/^([ ]*).*/\1/' <<< "$first_line")"
        indent="${#indent_text}"
    fi
    # Replay the first line verbatim (`printf '%s'` rather than `echo`, which
    # would interpret backslash escapes in zsh) followed by the remaining input.
    sed -r "s/^[ ]{$indent}//" <( printf '%s\n' "$first_line" ; cat )
}
# Sandbox run that creates only a `tuple` and prints, tab-separated, its
# `id()` and the requested size.
# Uses `zsh_script`, which is not defined here and must already be set in
# the calling scope.
::test-tuple-id() { # ② `tuple` only
    local py_script="$(::dedent <<-EOF
        #!/usr/bin/env python3
        from argparse import ArgumentParser
        parser = ArgumentParser()
        parser.add_argument('--size', type=int)
        if __name__ == '__main__':
            args = parser.parse_args()
            t = tuple([None for _ in range(args.size)])
            print(f'{id(t) = :#_x}\t{args.size = }')
EOF
    )"
    # `::run` takes variable NAMES and dereferences them itself.
    ::run py_script zsh_script
}
# Sandbox run that creates only an `ndarray` and prints, tab-separated, its
# `id()`, the address of its data buffer and the requested size.
# Uses `zsh_script`, which is not defined here and must already be set in
# the calling scope.
::test-ndarray-id() { # ③ `ndarray` only
    local py_script="$(::dedent <<-EOF
        #!/usr/bin/env python3
        from argparse import ArgumentParser
        from numpy import array
        parser = ArgumentParser()
        parser.add_argument('--size', type=int)
        if __name__ == '__main__':
            args = parser.parse_args()
            xs = array([None for _ in range(args.size)])
            print(f'{id(xs) = :#_x}\t{xs.__array_interface__["data"][0] = :#_x}\t{args.size = }')
EOF
    )"
    # `::run` takes variable NAMES and dereferences them itself.
    ::run py_script zsh_script
}
# Sandbox run that creates a `tuple` first and an `ndarray` second, then
# prints, tab-separated: the tuple's `id()`, the array's `id()`, the address
# of the array's data buffer and the requested size.
# Uses `zsh_script`, which is not defined here and must already be set in
# the calling scope.
::test-tuple-ndarray-id() { # ④ `tuple` then `ndarray`
    local py_script="$(::dedent <<-EOF
        #!/usr/bin/env python3
        from argparse import ArgumentParser
        from numpy import array
        parser = ArgumentParser()
        parser.add_argument('--size', type=int)
        if __name__ == '__main__':
            args = parser.parse_args()
            t = tuple([None for _ in range(args.size)])
            xs = array([None for _ in range(args.size)])
            print(f'{id(t) = :#_x}\t{id(xs) = :#_x}\t{xs.__array_interface__["data"][0] = :#_x}\t{args.size = }')
EOF
    )"
    # `::run` takes variable NAMES and dereferences them itself.
    ::run py_script zsh_script
}
# Sandbox run that creates an `ndarray` first and a `tuple` second, then
# prints, tab-separated: the array's `id()`, the address of the array's data
# buffer, the tuple's `id()` and the requested size.
# Uses `zsh_script`, which is not defined here and must already be set in
# the calling scope.
::test-ndarray-tuple-id() { # ⑤ `ndarray` then `tuple`
    local py_script="$(::dedent <<-EOF
        #!/usr/bin/env python3
        from argparse import ArgumentParser
        from numpy import array
        parser = ArgumentParser()
        parser.add_argument('--size', type=int)
        if __name__ == '__main__':
            args = parser.parse_args()
            xs = array([None for _ in range(args.size)])
            t = tuple([None for _ in range(args.size)])
            print(f'{id(xs) = :#_x}\t{xs.__array_interface__["data"][0] = :#_x}\t{id(t) = :#_x}\t{args.size = }')
EOF
    )"
    # `::run` takes variable NAMES and dereferences them itself.
    ::run py_script zsh_script
}
# ⑥ run 100 times for each size in [1, 250] elements
# Driver executed inside the sandbox: one fresh Python process per run.
# The `\$` escapes defer the `seq` calls and `$sz` until that driver runs,
# instead of expanding them while this heredoc is read.
zsh_script="$(::dedent <<-EOF
#!/bin/zsh
for sz in \$(seq 1 250); do
for _ in \$(seq 1 100); do
python /tmp/code/script.py --size "\$sz"
done
done
EOF
)"
@dutc
Copy link
Author

dutc commented Jan 13, 2021

For the full explanation, sign up for the “Python Expert” newsletter!

bit.ly/expert-python

@dutc
Copy link
Author

dutc commented Jan 13, 2021

Results files here:

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment