Skip to content

Instantly share code, notes, and snippets.

@jul
Created September 18, 2019 10:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jul/4746a4dec42a7e57f58d32994478f453 to your computer and use it in GitHub Desktop.
Save jul/4746a4dec42a7e57f58d32994478f453 to your computer and use it in GitHub Desktop.
Using recursion in a weird way to flatten a pandoc JSON document (a state serialisation of an AST, I guess)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Perversion
"""
from collections import Counter
from collections.abc import MutableMapping
from functools import reduce
from json import *
sum = lambda alot: reduce(lambda x, y: y+x, alot)
MARKER=object()
RECURSOR=object()
STOPER=object()
class Path(tuple):
def endswith( self, *a_tuple ):
"""check if path ends with the consecutive given has argumenbts value
>>> p = Path( [ 'a', 'b', 'c' ] )
>>> p.endswith( 'b', 'c' )
>>> True
>>> p.endswith( 'c', 'b' )
>>> False
"""
return self[len(self) - len(a_tuple) : ] == a_tuple
def startswith( self, *a_tuple ):
"""checks if a path starts with the value
>>> p = Path( [ 'a', 'b', 'c', 'd' ] )
>>> p.startswith( 'a', 'b' )
>>> True
"""
return self[: len( a_tuple ) ] == a_tuple
def _contains( self, a_tuple, _from = 0, follow = 0):
if len( a_tuple) == follow:
return True
index = False
here = self[ _from:]
try:
index = here.index(a_tuple[follow] )
return self._contains(
a_tuple,
index + 1 ,
follow + 1
)
except ValueError:
return False
return False
def contains(self, *a_tuple ):
"""checks if the serie of keys is contained in a path
>>> p = Path( [ 'a', 'b', 'c', 'd' ] )
>>> p.contains( 'b', 'c' )
>>> True
"""
return self._contains(a_tuple)
def value(self):
""" function provided for code readability:
- returns the left most value of the Path aka the value
"""
return self[-1]
def key(self):
""" function provided for code readability:
- returns all the keys in the Path
"""
return Path(self[:-1])
def make_from_path(type_of_mapping, path):
"""Work in Progress
create a mutable mapping from a `Path`_ (tuple made of a series of keys in a dict leading to a
value followed by a value).
The source is used a mapping factory and is reset in the process
>>> make_from_path(dict, ("y", "z", 2))
>>> #Out[2]: {'y': {'z': 2}}
"""
path = list(path)
value = path.pop()
last_key = path.pop()
tmap = type_of_mapping
mapping = tmap({last_key : value})
while path:
_next = path.pop()
mapping = tmap({_next : mapping })
return mapping
def _any(p):
return True
def is_array(p):
return type(p.value())==list and p.key()[-1] not in { "Image", } and RECURSOR
def list_flatener(pa):
for v in pa.value():
yield Path(pa.key() +(v,))
def shoud_be_splitted(p):
NB=chr(0xa0)
v=p.value()
if type(v) == str and NB in v:
for s in v.split(NB):
if s:
yield Path(p.key() + (s,))
else:
identity(v)
def identity(p):
yield p
def word_counter(p):
yield Path(("words", p.value()))
yield Path( p)
def is_dict(p):
return isinstance(p.value(), MutableMapping) and len(p.value()) and RECURSOR
def _dissecator(p):
v = p.value()
if set(v.keys()) & { "c", "t" }:
if not(v.get("t")=="Space" ):
yield Path(p.key()+(v["t"],v.get("c")))
else:
for k,v in p.value().items():
yield Path(p.key()+(k,v))
tree_exploder = dict((
(is_dict,_dissecator),
(_any, identity),
))
def is_str(p):
return p.key().endswith("Str")
def islink(p):
return p.contains("Link") and tuple((RECURSOR,{is_dict:_dissecator, is_array:list_flatener, _any:identity},))
def stop_iter(e):
raise StopIteration
def emit_link(p):
yield p
yield Path(("LKS", p))
dispatch = pandoc_disp = dict(((is_dict, _dissecator),))
# filtering by short circuit (stop iteration)
pandoc_disp[lambda p: not p.startswith("blocks")]=stop_iter
# one way of filtering and re-emitting/tranform info info by changing the logic of recursion
pandoc_disp[islink]=emit_link
# another way of filtering and re-emitting info while parsing (clean for me)
pandoc_disp[is_str]=word_counter
# another way of filtering and re-emitting info while parsing (ugly hidden filter in code)
pandoc_disp[is_array]=list_flatener
# default : yield leaf
pandoc_disp[_any]=identity
def mapping_row_iter(tree, path=MARKER, predicate=pandoc_disp):
"""
iterator on a tree that yield an iterator on a mapping in the form of
a list of ordered key that leads to the element and the value
"""
if path is MARKER:
path = ()
p = Path(path+(tree,))
for pred, effector in predicate.items():
switch = pred(p)
if isinstance(switch, tuple):
switch, predicate = switch
def recursor(p, predicate=pandoc_disp):
return mapping_row_iter(p.value(),path=p.key(), predicate=predicate)
if switch in {RECURSOR,}:
for value in effector(p):
yield from recursor(value, predicate)
break
if switch:
yield from effector(p)
break
graph= mapping_row_iter
with open("res.json") as f:
tree = load(f)
t=list(
graph(
tree,
))
#word counter
print(Counter(map(lambda p:p.value(), filter(lambda e:e.startswith("words"),t[200:300]))))
#getting all parallely emitted links
#print(list(filter(lambda e:e.startswith("LKS"), t)))
# getting all Images
#print(list(map(lambda p :p[-2:],filter(lambda e:e.key().endswith("Image"), t))))
# getting all link list flattened
#list(list(map(lambda p :p[-4:],filter(lambda e:e.key().endswith("Link"), t))))
#transforming a tree in a flat text
print(" ".join(map( lambda p:p.value(),filter(lambda p : p.key().endswith("Str"), t[:100]))))
#print(t)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment