Event: PyCon Lithuania
Date: Thu May 26, 2022
Speaker: James Powell
Twitter: @dontusethiscode
Disclaimer: this keynote is not about learning anything new.
→ “Tell me something interesting.”
Okay, so we all know…
print(f'{hash(-1) = }')
But this is a bit strange. It breaks the pattern…
print(
f'{hash(+3) = :>2}',
# f'{hash(+2) = :>2}',
# f'{hash(+1) = :>2}',
# f'{hash( 0) = :>2}',
# f'{hash(-1) = :>2}',
# f'{hash(-2) = :>2}',
# f'{hash(-3) = :>2}',
sep='\n',
)
class T:
def __hash__(self):
return -1
obj = T()
print(f'{hash(obj) = }')
It’s a detail.
But from the details, we can always derive meaning… but sometimes we have to dig & dig & dig.
Ref:
- Objects/typeobject.c:7585
- Python/bltinmodule.c:1567
This brings to our attention a very important question…
This is a case of an in·band vs out·of·band encoding problem.
- “in·band”: use the same “channel” for data/structure or data/metadata
- “out·of·band”: use distinct “channels” for data/structure or data/metadata
You’ve seen this before:
touch a b 'c d' e
tree
# find -type f | xargs du
# find -type f -print0 | xargs -0 du
You have even seen solutions:
- quoting/escaping
echo $PATH
mkdir a:b
touch a:b/c
chmod ugo+x a:b/c
export PATH="$PWD/a:b:$PATH"
c # doesn't work
mkdir a:b
# `-v` takes colon-separated fields (source:target[:options]), so a host
# directory whose name contains ':' collides with the field separator.
# The `run` subcommand is required; a bare `docker -v` is the version flag.
docker run -v a:b:/ab -it ubuntu:latest bash
Or other solutions:
- partitioning
from numpy import array, nan
xs = array([0, 1, 2, nan])
print(
f'{xs = }',
f'{xs.dtype = }',
sep='\n',
)
from numpy import array, nan, int64
xs = array([
2 ** 53 + 0,
2 ** 53 + 1,
nan,
])
print(
f'{xs = }',
f'{xs % 1_000 = }',
f'{xs[:-1].astype(int64) = }',
f'{xs[:-1].astype(int64) % 1_000 = }',
sep='\n',
)
from numpy.ma import array
xs = array([0, 1, 2], mask=[True, False, False])
print(f'{xs = }')
from pandas import array
xs = array([None, 1, 2], dtype='Int64')
print(f'{xs = }')
from pandas import Series, MultiIndex, date_range
from numpy.random import default_rng
rng = default_rng(0)
idx = MultiIndex.from_product([
date_range('2000-01-01', periods=2),
[*'ab'],
], names=['date', 'entity'])[:-1]
s = Series(rng.integers(-10, +10, size=len(idx)), index=idx) #, dtype='Int64')
# s[-1] = 2**53 + 1
print(
s,
# s.groupby(['date', 'entity']).sum().unstack(),
# s.groupby(['date', 'entity']).sum().unstack().fillna(0).astype(int),
# s.groupby(['date', 'entity']).sum().unstack(fill_value=0),
# s.to_frame('value').pivot_table(
# index='date', columns='entity', values='value',
# aggfunc=sum, fill_value=0
# ),
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40)
)
You may even have seen this solution:
- prefixing
touch a b c
xxh32sum a b c
{
mkdir repo && cd repo
git init .
dd if=/dev/zero of=file bs=4K count=13
git add file
git commit -am 'first commit'
} >/dev/null 2>&1
du -b file
pigz -d < .git/objects/9e/5e017a1bc230b7f3f705e7411d8e095333ae4f | xxd -l 16
But once you see the problem, you can’t un·see it!
d = {}
# why `KeyError`?
print(f'{d[...] = }')
typeset -A arr
arr=( key1 value1 key2 '' )
for k in key1 key2 key3; do
printf "\${arr[$k]} = %s\n" "${arr[$k]}"
done
d = {None: None}
def find(needle, haystack):
    """Search *haystack* for the first element equal to *needle*.

    Returns a ``(found, index)`` pair: ``(True, idx)`` for the first
    matching position, or ``(False, None)`` when nothing matches.  The
    separate ``found`` flag carries the "missing" signal out-of-band, so
    no index value has to be reserved to mean "not found".
    """
    for idx, x in enumerate(haystack):
        if x == needle:
            return True, idx
    return False, None
class Iter:
    """Iterator that produces exactly one item, ``None``, per ``iter()`` call.

    ``__iter__`` re-arms the instance, so it can be iterated repeatedly.
    """

    def __iter__(self):
        # Re-arm on every iter() call and act as our own iterator.
        self.active = True
        return self

    def __next__(self):
        if not self.active:
            # End of iteration is signalled out-of-band, by an exception.
            raise StopIteration()
        self.active = False
        # Falls through without a return statement: the single item produced
        # is None, which is therefore a perfectly valid in-band value.
it = Iter()
print(f'{[*it] = }')
from inspect import signature
# The decorator reads the default of the keyword-only ``missing`` parameter
# from the signature and publishes it as ``find.missing``, then returns the
# function itself, so callers can compare results against the sentinel.
@lambda f: [m := signature(f).parameters['missing'], setattr(f, m.name, m.default), f][-1]
def find(needle, haystack, *, missing=object()):
    """Sketch of a search that returns *missing* when nothing is found.

    The default for *missing* is a unique ``object()`` created once at
    definition time, so it cannot collide with any value in *haystack*.
    The search itself is elided here.
    """
    ...
    return missing
assert find(..., ...) is find.missing
But this goes even deeper…
What is the difference?
s = 'a,b,c'
xs = ['a', 'b', 'c']
This is ambiguous!
s = 'a,b,c,d'
xs = ['a', 'b', 'c,d']
How many steps…?
def f():
raw_data = load_data()
data = clean_data(raw_data)
data = remove_outliers(data)
results = process_data(data)
report = generate_report(results)
from dis import dis
# dis(f)
def g():
yield (raw_data := load_data())
data = clean_data(raw_data)
yield (data := remove_outliers(data))
yield (results := process_data(data))
yield (report := generate_report(results))
So str
is to list
as function
is to generator
…
Isn’t the Xarray API great?
from xarray import DataArray
from numpy import linspace
from numpy.random import default_rng
rng = default_rng(0)
da = DataArray(
data=(data := rng.integers(-10, +10, size=(3, 3))),
dims=[*'xy'],
coords={
'x': linspace(0, 10, data.shape[0]),
'y': linspace(0, 10, data.shape[1]),
},
)
print(
da,
# da.sel(x=[0, 10]),
# da.interp(x=[0, 1, 2]),
# da.interp(x=[0, 1, 2], method='linear'),
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40)
)
method=…
is a modality!
from xarray import DataArray
from numpy import linspace
from numpy.random import default_rng
rng = default_rng(0)
da = DataArray(
data=(data := rng.integers(-10, +10, size=(3, 3))),
dims=['x', 'method'],
coords={
'x': linspace(0, 10, data.shape[0]),
'method': linspace(0, 10, data.shape[1]),
},
)
print(
da.interp(x=[0, 1, 2], method=[0, 1, 2]),
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40)
)
def f(data, *, mode): pass
# The decorator replaces ``coro`` with a factory: calling it creates the
# generator, advances it to its first ``yield`` (priming), and returns a
# one-argument callable that forwards each value via ``send``.
@lambda coro: lambda *a, **kw: [ci := coro(*a, **kw), next(ci), lambda v=None: ci.send(v)][-1]
def coro(mode):
    """Generator-based coroutine that receives one datum per ``send``.

    *mode* is bound once when the coroutine is created, not on every call.
    Processing of the received data is elided here.
    """
    while True:
        data = yield
Now, do bear in mind…
print(f'{hash(-1) = }')
“Reference implementations over-specify.” (Nick Coghlan)
So what does this mean for us…?
Well, you would never…
class T:
def initialise(self):
...
obj = T()
obj.initialise()
class T:
def __init__(self):
...
obj = T()
Sure, that’s one fewer line of code, but this is the same amount of code (yet you would avoid it)…
class T:
@property
def size(self):
return 0
obj = T()
print(f'{obj.size = }')
class T:
def __len__(self):
return 0
obj = T()
print(f'{len(obj) = }')
Of course, you know that you have to follow some rules (and some conventions):
class T:
def __repr__(self):
return 'T()'
def __len__(self):
return -1
# return 1.2
# return 0
obj = T()
print(f'{len(obj) = }')
For __len__: “non-negative, integer size.”
In general:
- unique or privileged
- unambiguous
from pandas import DataFrame
df = DataFrame({
'a': [1, 2, 3],
'b': [4, 5, 6],
})
print(
df,
# f'{len(df) = }',
# f'{len(df.index) = }',
# f'{len(df.columns) = }',
# f'{len(df._data) = }',
# f'{df.size = }',
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40)
)
from pandas import DataFrame
df = DataFrame({
'a': [1, 2, 3],
'b': [4, 5, 6],
})
for x in df:
print(f'{x = }')
Similarly…
class T:
def retrieve(self, key):
...
obj = T()
obj.retrieve(...)
class T:
def __getitem__(self, key):
...
obj = T()
obj[...]
class T:
def __getitem__(self, key, *, flag=True):
...
obj = T()
obj[..., flag=True]
class T:
def retrieve(self, key, *, flag=True):
...
obj = T()
obj.retrieve(..., flag=True)
class T:
def retrieve(self, key, *, prod=False, timeout=1.5):
...
obj = T()
obj.retrieve(...)
An aside: “bounded” vs “unbounded” modalities.
Bounded modality → nominal decomposition.
def f(data, *, mode):
...
f(..., mode=True)
f(..., mode=False)
def f_mode(data):
...
def f_nomode(data):
...
f_mode(...)
f_nomode(...)
def f(data, *, mode):
...
mode = True
f(..., mode=mode)
def f_mode(data):
...
def f_nomode(data):
...
mode = True
(f_mode if mode else f_nomode)(...)
class T:
def __init__(self, *, mode=False):
if mode:
...
class T:
def __init__(self, *, mode=False, mode2=False):
if mode:
...
elif mode2:
...
class Dataset:
def __init__(self, filename, *, csv=True, json=False, xml=False):
if csv:
...
elif json:
...
elif xml:
...
class Dataset:
    """Dataset with one named alternate constructor per input format.

    Each bounded "mode" (CSV / JSON / XML) gets its own classmethod rather
    than a boolean flag on ``__init__``.  The loading logic is elided.
    """

    # A classmethod always receives the class as its first argument; without
    # the ``cls`` parameter, ``Dataset.from_csv(path)`` raises TypeError.
    @classmethod
    def from_csv(cls, filename):
        """Construct a dataset from the CSV file *filename*."""
        ...

    @classmethod
    def from_json(cls, filename):
        """Construct a dataset from the JSON file *filename*."""
        ...

    @classmethod
    def from_xml(cls, filename):
        """Construct a dataset from the XML file *filename*."""
        ...
from pandas import Series
s = Series([0, 1, 2], index=[0, 10, 20])
print(
f'{s.loc[0] = }',
f'{s.iloc[0] = }',
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40)
)
class Loc:
def __get__(self, instance, owner):
...
class T:
loc = Loc()
print(f'{T.loc[...] = }')
from contextlib import contextmanager
class T:
@contextmanager
def context(self, mode):
yield
obj = T()
with obj.context(...) as obj2:
...
class T:
def hash(self):
...
obj = T()
print(f'{obj.hash() = }')
class T:
def __hash__(self):
return 0
obj = T()
print(f'{hash(obj) = }')
from sys import hash_info
print(f'{hash_info = }')
The object model provides either:
- mechanisms for implementing a common vocabulary (for us)
- mechanisms for hooking into the language runtime (not for us)
What if sys.hash_info
changes?
So, what does this mean…?
Every small detail leads to a bigger story, if you dig & dig & dig…
int is to list as float is to tuple.
xs = [1, 2, 3]
for x in xs: pass
s = 'abc'
for c in s: pass
s = 'abc'
for c in s:
assert isinstance(c, type(s))
# Build a new list by unpacking two existing lists into one list display.
xs = [1, 2, 3]
ys = [4, 5, 6]
zs = [*xs, *ys] # PEP-448 (Python 3.5); PEP-3132 (Python 3.0, Apr-2007) covers only `first, *rest = xs`
print(
f'{xs = }',
f'{ys = }',
f'{zs = }',
sep='\n',
)
from collections.abc import Iterable
def flatten(data):
    """Lazily yield the leaf values of an arbitrarily nested iterable.

    Any ``Iterable`` is descended into recursively; anything else is
    yielded as-is.  ``str`` is treated as a leaf: iterating a string yields
    strings, so descending into it would recurse without end.
    """
    if isinstance(data, Iterable) and not isinstance(data, str):
        for x in data:
            yield from flatten(x)
    else:
        yield data
print(f'{[*flatten([[[1, 2, 3], "abc"]])] = }')
What constitutes an entity is ambiguous itself…
from random import choice as py_choice
from numpy.random import choice as np_choice
print(
f'{py_choice("abc") = }',
# f'{np_choice("abc") = }',
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40)
)
Well, if str
is the only builtin Iterable
whose contents are of the same
type as the container, then int
is the only builtin container you can’t
directly iterate over.
from sys import maxsize, int_info
print(
f'{maxsize = }',
f'{int_info = }',
sep='\n',
)
Ref:
- Include/cpython/longintrepr.h:81
What makes a container?
- decomposition
- size-independence
xs = [1, 2, 3]
...
# xs.append(4)
# xs.clear()
...
for x in xs:
pass
xs = [1, 2, 3, 'four']
for x in xs:
f(x)
t = 'abc', 123
...
...
name, value = t
xs = [1, 2, 3, 4, 5]
...
...
xs[2]
t = 'abc', 123, True
...
...
print(f'{t[2] = }')
from collections import namedtuple
Entity = namedtuple('Entity', 'name value flag')
t = Entity('abc', 123, True)
...
...
print(f'{t.flag = }')
Both are containers; both are iterable.
- one is (loosely) homogeneous
- one is heterogeneous
One represents…
- data; a container of non-unique elements you iterate over in some order.
- structure; a collection of fields related to one single entity.
In fact…
- tuple: when you have one thing.
- list: when you have many things.
What is a dict
?
# this represents a structure
d = {
'name': ...,
'value': ...,
}
d['name']
# this represents some data
d = {
'a': 1,
'b': 2,
'c': 3,
}
for k, v in d.items():
pass
Is float
a container?
- sign, significand, exponent
from numpy import frexp
# mantissa, exponent
print(f'{frexp(123.456) = }')
float is a structure (like tuple).
int is a container (like list).
What more is there lurking beneath? I don’t know.
This keynote is not about learning anything new.
It’s about discovering meaning that you already knew, but sometimes you have to dig & dig & dig…
If you liked this presentation, follow us on Twitter & join our weekly newsletter!
It's the best way to find out about weekly 90-min seminars we run on topics such as this!
Newsletter: bit.ly/expert-python
Twitter: @dutc_training & @dontusethiscode