Python function to decode multiple JSON representations in a single file (stream)
# The MIT License
#
# Copyright (c) 2019 Neil Webber
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
import json

#
# A MultiDecoder is a subclass of json.JSONDecoder and adds:
#
#   sizelimit - Object __init__() parameter. If set, protects against an
#        "unbounded stream of unresolved JSON/garbage" attack by capping
#        the max size for buffered data. Default is no limit.
#
#   [exception] SkipAllError - tells loadmultijson the string contained only
#        ignorable characters (e.g., whitespace), so it can eat them rather
#        than report them as a garbage tuple if they are trailing data.
#
# (A short sketch demonstrating both behaviors follows the class
# definitions below.)
#
class MultiDecoder(json.JSONDecoder):
    def __init__(self, *, sizelimit=0):
        self.__sizelimit = sizelimit
        super().__init__()

    def raw_decode(self, s):
        if self.__sizelimit > 0 and len(s) > self.__sizelimit:
            raise ValueError(f"JSON string exceeds limit {self.__sizelimit}")
        stripped = s.lstrip()        # because raw_decode doesn't allow leading whitespace
        if len(stripped) == 0:
            raise SkipAllError()
        obj, n = super().raw_decode(stripped)
        n += len(s) - len(stripped)  # adjust n to count the stripped chars
        return obj, n

# raised when the entire string should be discarded (e.g., it is all whitespace)
class SkipAllError(Exception):
    pass
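
#
# --- illustrative sketch; not part of the original gist ---
# A minimal demonstration of the two MultiDecoder behaviors documented
# above: the sizelimit cap and the SkipAllError signal. The helper name
# _demo_multidecoder is invented for this example; it is only invoked
# from the __main__ block at the bottom of the file.
def _demo_multidecoder():
    capped = MultiDecoder(sizelimit=8)
    try:
        capped.raw_decode('[1, 2, 3, 4, 5]')    # 15 chars exceeds the cap of 8
    except ValueError as e:
        print("sizelimit enforced:", e)
    try:
        MultiDecoder().raw_decode('  \n\t ')    # nothing but ignorable whitespace
    except SkipAllError:
        print("whitespace-only input raised SkipAllError")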

# loadmultijson is a generator that yields individual JSON objects from a
# stream containing multiple such representations, yielding a tuple
# (obj, str) as each JSON representation is parsed.
#
# For example, if the input stream is:
#       [1, 2, 3] {"k1": 42} 17
#
# three tuples will be generated (note that the str element includes any
# leading whitespace consumed along with the representation; a runnable
# demonstration appears at the bottom of this file):
#       ([1, 2, 3], '[1, 2, 3]')
#       ({'k1': 42}, ' {"k1": 42}')
#       (17, ' 17')
#
# If there is unparseable JSON in the stream it will be returned as the
# last tuple, as a 'garbage tuple' distinguished by the obj (first element)
# being None. For example:
#
#       [1, 2, 3] ooops
#
# will yield:
#       ([1, 2, 3], '[1, 2, 3]')
#       (None, ' ooops')
#
# Be aware that any JSON syntax error in the stream will cause the entire
# rest of the stream to be consumed and an enormous garbage tuple to be
# returned, as the algorithm just tacks on more and more data so long as
# the current buffer will not parse. Consider, for example, a stream
# containing three JSON representations (the middle one being illegal):
#       [1, 2, 3] {"bogus"} [ ...<enormous array here>... ]
# This will return one valid tuple, then one enormous garbage tuple
# containing the entire rest of the stream. There is no attempt to discern
# where an invalid JSON object "looks like it ends". The MultiDecoder (the
# default decoder) can optionally enforce a sizelimit to (crudely) control
# this. An ambitious custom decoder could try to recover the parse stream,
# possibly knowing something about the underlying application.
#
# REQUIRED POSITIONAL ARGUMENTS:
#
#    readable - anything that has an io-style .read() method
#
# OPTIONAL KEYWORD ARGUMENTS:
#    decoder - Nominally a JSON decoder object, but the only required
#              method is raw_decode. By default it is a MultiDecoder.
#
#       NOTE: The decoder can raise SkipAllError to communicate that the
#             entire string it was given consists of ignorable characters.
#             This allows a decoder to cause trailing ignorable characters
#             in a stream (e.g., trailing whitespace) to be eaten rather
#             than ultimately returned as a trailing garbage tuple.
def loadmultijson(readable, *, decoder=MultiDecoder()):
    # read() chunks in this size; arbitrary and ANY VALUE WORKS though
    # obviously there is a weak time/space tradeoff lurking in this choice.
    # (Caveat: a top-level bare number that straddles a read boundary can
    # be parsed as two numbers, because a numeric prefix is itself valid
    # JSON; delimited values such as arrays and objects are unaffected.)
    incrsize = 10000

    chunk = ""
    while True:
        newdata = readable.read(incrsize)
        if len(newdata) == 0:
            break
        chunk += newdata
        while len(chunk) > 0:
            try:
                obj, n = decoder.raw_decode(chunk)
                yield obj, chunk[:n]
                chunk = chunk[n:]
            except SkipAllError:
                chunk = ""
            except json.JSONDecodeError:
                break          # no complete JSON yet; append more data

    # any leftover data is trailing garbage. Caller has to look for
    # this as a None JSON object in the tuple
    if len(chunk) > 0:
        yield None, chunk
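
#
# --- illustrative usage; not part of the original gist ---
# Exercises the documented examples above. io.StringIO stands in for any
# object with an io-style .read(); a real caller might pass an open file.
if __name__ == "__main__":
    import io

    # The three-value stream from the comments; note the leading space
    # retained in the str element of the second and third tuples.
    for obj, text in loadmultijson(io.StringIO('[1, 2, 3] {"k1": 42} 17')):
        print(repr(obj), repr(text))

    # Unparseable trailing data comes back as a garbage tuple (obj is None).
    for obj, text in loadmultijson(io.StringIO('[1, 2, 3] ooops')):
        print(repr(obj), repr(text))

    # Trailing whitespace is eaten via SkipAllError rather than reported
    # as garbage: this loop prints exactly one tuple.
    for obj, text in loadmultijson(io.StringIO('{"k": 1}   \n')):
        print(repr(obj), repr(text))

    # A sizelimit-capped decoder makes an over-long garbage run fail fast;
    # the ValueError propagates out of the generator to the caller.
    try:
        for obj, text in loadmultijson(io.StringIO('x' * 100),
                                       decoder=MultiDecoder(sizelimit=50)):
            pass
    except ValueError as e:
        print("sizelimit stopped the stream:", e)

    _demo_multidecoder()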