Created
September 18, 2019 10:31
-
-
Save jul/4746a4dec42a7e57f58d32994478f453 to your computer and use it in GitHub Desktop.
Using recursion in a weird way to flatten a pandoc JSON (state serialisation of an AST, I guess)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
""" | |
Perversion | |
""" | |
from collections import Counter
from collections.abc import MutableMapping
from functools import reduce
from json import *
def sum(alot):
    """Fold *alot* with ``+``, putting each new element on the left.

    NOTE: this deliberately shadows the builtin ``sum``. Because every step
    computes ``element + accumulated``, non-commutative operands (strings,
    lists, tuples) come out in reverse order. As no initial value is given
    to ``reduce``, an empty iterable raises ``TypeError``.
    """
    return reduce(lambda accumulated, element: element + accumulated, alot)


# Default for ``path`` arguments, meaning "no path was given".
MARKER = object()
# Returned by predicates to ask the tree walker to recurse into the
# effector's output instead of emitting it as-is.
RECURSOR = object()
# Not referenced by the code visible in this file.
STOPER = object()
class Path(tuple):
    """A tuple of keys leading to a value; the value is the last element.

    >>> p = Path(['a', 'b', 'c'])
    >>> p.key()
    ('a', 'b')
    >>> p.value()
    'c'
    """

    def endswith(self, *a_tuple):
        """Check whether the path ends with the given consecutive elements.

        >>> p = Path(['a', 'b', 'c'])
        >>> p.endswith('b', 'c')
        True
        >>> p.endswith('c', 'b')
        False
        """
        return self[len(self) - len(a_tuple):] == a_tuple

    def startswith(self, *a_tuple):
        """Check whether the path starts with the given consecutive elements.

        >>> p = Path(['a', 'b', 'c', 'd'])
        >>> p.startswith('a', 'b')
        True
        """
        return self[:len(a_tuple)] == a_tuple

    def _contains(self, a_tuple, _from=0, follow=0):
        """Return True if ``a_tuple[follow:]`` occurs, in order, in ``self[_from:]``.

        The elements must appear in the given order but do not have to be
        adjacent. ``tuple.index`` is given an absolute start position, so the
        search always resumes right after the previous match (the earlier
        implementation mixed slice-relative and absolute offsets and could
        re-scan elements it had already consumed).
        """
        position = _from
        for wanted in a_tuple[follow:]:
            try:
                position = self.index(wanted, position) + 1
            except ValueError:
                return False
        return True

    def contains(self, *a_tuple):
        """Check whether the given elements appear in the path, in order.

        >>> p = Path(['a', 'b', 'c', 'd'])
        >>> p.contains('b', 'c')
        True
        >>> p.contains('c', 'b')
        False
        """
        return self._contains(a_tuple)

    def value(self):
        """Return the last element of the path, i.e. the value.

        Provided for code readability. Raises ``IndexError`` on an empty path.
        """
        return self[-1]

    def key(self):
        """Return every element but the last one, i.e. the keys, as a ``Path``.

        Provided for code readability.
        """
        return Path(self[:-1])
def make_from_path(type_of_mapping, path):
    """Build nested mappings from a path (work in progress).

    ``path`` is a series of keys followed by a value. ``type_of_mapping`` is
    called once per nesting level with a single-item dict and its result is
    used as the mapping for that level.

    >>> make_from_path(dict, ("y", "z", 2))
    {'y': {'z': 2}}

    Raises ``IndexError`` when ``path`` has fewer than two elements.
    """
    keys = list(path)
    leaf = keys.pop()
    innermost_key = keys.pop()
    nested = type_of_mapping({innermost_key: leaf})
    # Wrap from the inside out: the remaining keys are consumed right to left.
    for outer_key in reversed(keys):
        nested = type_of_mapping({outer_key: nested})
    return nested
def _any(p): | |
return True | |
def is_array(p):
    """Predicate: ask for recursion into list values, except under "Image".

    Returns ``RECURSOR`` when the value of ``p`` is a list whose last key is
    not ``"Image"``, and ``False`` otherwise. ``Path.endswith`` is used
    rather than indexing ``p.key()[-1]`` so that a path with no keys (a list
    at the root of the tree) does not raise ``IndexError``.
    """
    return (
        isinstance(p.value(), list)
        and not p.key().endswith("Image")
        and RECURSOR
    )
def list_flatener(pa):
    """Yield one ``Path`` per item of the value of ``pa``.

    Every yielded path keeps the keys of ``pa`` and takes one item of the
    iterated value as its own value.
    """
    prefix = pa.key()
    yield from (Path(prefix + (item,)) for item in pa.value())
def shoud_be_splitted(p):
    """Split a string value on non-breaking spaces (U+00A0).

    When the value of ``p`` is a string containing at least one U+00A0,
    yield one ``Path`` per non-empty fragment, each with the keys of ``p``.
    Any other path is yielded unchanged. The previous code built an
    ``identity`` generator in that branch and discarded it, so such paths
    were silently dropped.
    """
    NB = chr(0xa0)
    v = p.value()
    if isinstance(v, str) and NB in v:
        prefix = p.key()
        for fragment in v.split(NB):
            if fragment:
                yield Path(prefix + (fragment,))
    else:
        yield p
def identity(p):
    """Effector that emits ``p`` unchanged, as a one-item generator."""
    yield from (p,)
def word_counter(p):
    """Emit a ``("words", value)`` row, then a copy of ``p`` itself.

    The extra row lets a consumer pick out every value by filtering on the
    ``"words"`` prefix.
    """
    word = p.value()
    yield Path(("words", word))
    yield Path(p)
def is_dict(p):
    """Predicate: ask for recursion into non-empty mappings.

    Returns ``False`` when the value of ``p`` is not a ``MutableMapping``,
    ``0`` when it is an empty one, and ``RECURSOR`` otherwise.
    """
    candidate = p.value()
    if not isinstance(candidate, MutableMapping):
        return False
    return len(candidate) and RECURSOR
def _dissecator(p):
    """Expand the mapping held by ``p`` into child paths.

    * A mapping with a ``"t"`` entry is treated as a tagged node: it yields
      a single path ``keys + (node["t"], node.get("c"))``, unless the tag is
      ``"Space"``, in which case nothing is yielded.
    * Any other mapping yields one path ``keys + (k, v)`` per item.

    The node test is ``"t" in node``. The former test (any of ``"c"`` or
    ``"t"`` present) raised ``KeyError`` on a mapping that had ``"c"`` but
    no ``"t"``.
    """
    node = p.value()
    prefix = p.key()
    if "t" in node:
        tag = node["t"]
        if tag != "Space":
            yield Path(prefix + (tag, node.get("c")))
    else:
        for child_key, child_value in node.items():
            yield Path(prefix + (child_key, child_value))
# Minimal dispatch table: recurse into mappings, emit everything else as-is.
tree_exploder = {
    is_dict: _dissecator,
    _any: identity,
}
def is_str(p):
    """Predicate: true when the last key of ``p`` is ``"Str"``."""
    parent_keys = p.key()
    return parent_keys.endswith("Str")
def islink(p):
    """Predicate: switch to a plain dispatch table below a ``"Link"``.

    Returns the falsy result of ``p.contains("Link")`` when the path does
    not contain ``"Link"``. Otherwise returns a ``(RECURSOR, dispatch)``
    tuple: the tree walker recurses and uses ``dispatch`` for the recursion.
    """
    found = p.contains("Link")
    if not found:
        return found
    link_dispatch = {
        is_dict: _dissecator,
        is_array: list_flatener,
        _any: identity,
    }
    return (RECURSOR, link_dispatch)
def stop_iter(e):
    """Effector that emits nothing, used to filter a path out.

    Returns an empty iterator, so ``yield from stop_iter(p)`` produces no
    rows. The previous implementation raised ``StopIteration``; since
    PEP 479 (enforced from Python 3.7) that exception is turned into a
    ``RuntimeError`` when it escapes inside a generator, which crashed the
    tree walk instead of skipping the path.
    """
    return iter(())
def emit_link(p):
    """Emit ``p`` itself, then a parallel ``("LKS", p)`` row.

    The second row carries the whole path as its value, so links can be
    collected by filtering on the ``"LKS"`` prefix.
    """
    tagged = ("LKS", p)
    yield p
    yield Path(tagged)
# Dispatch table for pandoc JSON. Entries are tried in insertion order and
# the first predicate returning a truthy value wins.
dispatch = pandoc_disp = {
    is_dict: _dissecator,
    # Filtering by short circuit: anything outside "blocks" emits nothing.
    (lambda p: not p.startswith("blocks")): stop_iter,
    # One way of filtering and re-emitting/transforming info: change the
    # logic of the recursion.
    islink: emit_link,
    # Another way of filtering and re-emitting info while parsing
    # (the clean one, for me).
    is_str: word_counter,
    # Another way of filtering and re-emitting info while parsing
    # (ugly: the filter is hidden in the predicate's code).
    is_array: list_flatener,
    # Default: yield the leaf.
    _any: identity,
}
def mapping_row_iter(tree, path=MARKER, predicate=pandoc_disp):
    """Walk ``tree`` and yield flat rows as ``Path`` objects.

    ``tree`` is appended to ``path`` (``MARKER`` means "start from an empty
    path") to form the current ``Path``. The entries of ``predicate``, a
    mapping of ``predicate function -> effector``, are then tried in order
    and the first one whose predicate returns a truthy result is applied:

    * ``RECURSOR``: every path produced by the effector is walked
      recursively with the current dispatch table;
    * a tuple ``(switch, dispatch)``: ``switch`` is interpreted as above and
      ``dispatch`` replaces the dispatch table from that point on;
    * any other truthy value: the effector's output is yielded as-is.

    If no predicate matches, nothing is yielded.
    """
    if path is MARKER:
        path = ()
    p = Path(path + (tree,))
    for pred, effector in predicate.items():
        switch = pred(p)
        if isinstance(switch, tuple):
            # Rebinding ``predicate`` does not disturb this loop, which
            # keeps iterating over the items of the original table.
            switch, predicate = switch
        # Identity test: RECURSOR is a sentinel, and unlike set membership
        # this does not require the predicate's result to be hashable.
        if switch is RECURSOR:
            for child in effector(p):
                yield from mapping_row_iter(
                    child.value(),
                    path=child.key(),
                    predicate=predicate,
                )
            break
        if switch:
            yield from effector(p)
            break
# Alias under which the tree walker is used below.
graph = mapping_row_iter

# Load the pandoc JSON document and flatten it into a list of Path rows.
with open("res.json") as f:
    tree = load(f)
t = list(graph(tree))

# Word counter (restricted to rows 200..299).
print(Counter(row.value() for row in t[200:300] if row.startswith("words")))
# Getting all links emitted in parallel:
#print(list(filter(lambda e:e.startswith("LKS"), t)))
# Getting all images:
#print(list(map(lambda p :p[-2:],filter(lambda e:e.key().endswith("Image"), t))))
# Getting all links as a flattened list:
#list(list(map(lambda p :p[-4:],filter(lambda e:e.key().endswith("Link"), t))))
# Transforming the tree into flat text (first 100 rows only).
print(" ".join(row.value() for row in t[:100] if row.key().endswith("Str")))
#print(t)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment