Skip to content

Instantly share code, notes, and snippets.

@bobthemighty
Last active February 14, 2022 07:19
Show Gist options
  • Save bobthemighty/9f4fd8fbb2435b8f6b8cf191dabdf37a to your computer and use it in GitHub Desktop.
Save bobthemighty/9f4fd8fbb2435b8f6b8cf191dabdf37a to your computer and use it in GitHub Desktop.
Streaming spikes
def sip(stream):
    """
    Lazily yield each JSON value found in ``stream``, one at a time.

    NAYA will handle our json just fine so long as it starts with an
    open bracket, so it thinks it has a list. We therefore put a
    synthetic ``[`` token in front of the real tokens and let
    ``stream_array`` hand the values back as they are completed.

    :param stream: the stream passed straight through to NAYA's ``tokenize``.
    :return: a generator of the objects produced by ``stream_array``.
    """

    def tokens():
        # Yield a dummy `[` ...
        yield TOKEN_TYPE.OPERATOR, "["
        # ... followed by the rest of the tokens in the stream.
        yield from tokenize(stream)

    try:
        yield from stream_array(tokens())
    except RuntimeError:
        # Deliberate best-effort: the fake list is never closed with `]`.
        # NOTE(review): presumably NAYA running out of tokens surfaces here as
        # a RuntimeError (StopIteration inside a generator) - confirm, and
        # consider narrowing this so unrelated RuntimeErrors are not hidden.
        pass
import gzip
import boto3
# Module-level S3 client, created once at import time and used by parse_file.
s3 = boto3.client("s3")
def parse_file(bucket: str, key: str) -> None:
    """
    Fetch a gzip-compressed object from S3 and print every value ``sip`` yields.

    The object body is decompressed on the fly and read as UTF-8 text, so
    values are printed as they are parsed.

    :param bucket: name of the S3 bucket holding the object.
    :param key: key of the gzip-compressed object inside the bucket.
    """
    data = s3.get_object(Key=key, Bucket=bucket)
    raw = data["Body"]
    try:
        with gzip.open(raw, encoding="UTF-8", mode="rt") as body:
            for obj in sip(body):
                print(obj)
    finally:
        # Closing the gzip wrapper does not close a file object that was
        # passed in, so release the S3 response body explicitly.
        raw.close()
class ContentHandler(YajlContentHandler):
    """
    Yajl callback handler that rebuilds Python values from parser events.

    Maps become ``dict``s, arrays become ``list``s and scalars become the
    matching Python scalar. Every completed top-level value is appended to
    ``results`` in the order it was parsed.
    """

    def __init__(self):
        # Map keys waiting for their value: pushed by yajl_map_key, popped
        # by set() when the value for that key arrives.
        self._keys = []
        # Stack of containers (dict or list) currently being filled,
        # innermost last.
        self._objects = []
        # Completed top-level values.
        self.results = []

    def set(self, value):
        """
        Store ``value`` in the innermost open container.

        A dict receives it under the most recently seen key, a list has it
        appended, and when no container is open the value is a finished
        top-level value and goes to ``results``.
        """
        if not self._objects:
            self.results.append(value)
            return
        container = self._objects[-1]
        if isinstance(container, list):
            container.append(value)
        else:
            container[self._keys.pop()] = value

    def yajl_null(self, ctx):
        self.set(None)

    def yajl_boolean(self, ctx, boolVal):
        self.set(boolVal)

    def yajl_integer(self, ctx, integerVal):
        self.set(integerVal)

    def yajl_double(self, ctx, doubleVal):
        self.set(doubleVal)

    def yajl_number(self, ctx, stringNum):
        """Since this is defined both integer and double callbacks are useless"""
        # Try int first and fall back to float, so that exponent forms such
        # as `1e5` (which contain no ".") are parsed instead of crashing.
        try:
            num = int(stringNum)
        except ValueError:
            num = float(stringNum)
        self.set(num)

    def yajl_string(self, ctx, stringVal):
        self.set(stringVal.decode())

    def yajl_start_map(self, ctx):
        self._objects.append(dict())

    def yajl_map_key(self, ctx, stringVal):
        self._keys.append(stringVal.decode())

    def yajl_end_map(self, ctx):
        # Pop first so set() sees the parent container (or none at all,
        # in which case the map is a finished top-level value).
        finished = self._objects.pop()
        self.set(finished)

    def yajl_start_array(self, ctx):
        self._objects.append([])

    def yajl_end_array(self, ctx):
        # Same as yajl_end_map: attach the finished list to its parent,
        # or record it as a top-level result.
        finished = self._objects.pop()
        self.set(finished)
# Create the parser, wired to a ContentHandler that collects completed
# top-level objects in `handler.results`.
handler = ContentHandler()
parser = YajlParser(handler)
# NOTE(review): these two flags presumably let yajl accept several
# concatenated JSON documents and tolerate trailing bytes - confirm
# against the yajl-py documentation.
parser.allow_trailing_garbage = True
parser.allow_multiple_values = True
import gzip
import boto3
# Module-level S3 client, created once at import time and used by parse_file.
s3 = boto3.client("s3")
def parse_file(bucket: str, key: str) -> None:
    """
    Fetch a gzip-compressed object from S3, feed it to the yajl parser and
    print the handler's collected results.

    :param bucket: name of the S3 bucket holding the object.
    :param key: key of the gzip-compressed object inside the bucket.
    """
    data = s3.get_object(Key=key, Bucket=bucket)
    raw = data["Body"]
    try:
        # Default gzip mode is binary, so the parser receives raw bytes.
        with gzip.open(raw) as body:
            parser.parse(body)
    finally:
        # Closing the gzip wrapper does not close a file object that was
        # passed in, so release the S3 response body explicitly.
        raw.close()
    # NOTE: `handler` is module-level and this function never clears it, so
    # results from earlier calls are still present here.
    print(handler.results)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment