Created
October 28, 2022 15:10
-
-
Save Erotemic/60ec1b4ffc9961fa34a1821bb139f70f to your computer and use it in GitHub Desktop.
ijson_ext.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Add NaN parsing to normal ijson pure-python parsing backend. | |
Example: | |
>>> import ijson_ext as ijson | |
>>> text = ub.codeblock( | |
>>> ''' | |
>>> {"id": 246, "image_id": 123, "flag": true, "empty": null, "category_id": 2, "bbox": [ NaN, 5, 130, 290 ] } | |
>>> ''') | |
>>> import io | |
>>> import json | |
>>> file = io.StringIO(text) | |
>>> json.load(file) | |
>>> file = io.StringIO(text) | |
>>> list(ijson_ext.items(file, 'bbox')) | |
""" | |
from json.decoder import scanstring | |
import re | |
from ijson import common, utils | |
import codecs | |
LEXEME_RE = re.compile(r'[a-z0-9eEN\.\+-]+|\S') | |
UNARY_LEXEMES = set('[]{},') | |
EOF = -1, None | |
class UnexpectedSymbol(common.JSONError): | |
def __init__(self, symbol, pos): | |
super(UnexpectedSymbol, self).__init__( | |
'Unexpected symbol %r at %d' % (symbol, pos) | |
) | |
@utils.coroutine | |
def utf8_encoder(target): | |
decoder = codecs.getincrementaldecoder('utf-8')() | |
decode = decoder.decode | |
send = target.send | |
while True: | |
try: | |
final = False | |
bdata = yield | |
except GeneratorExit: | |
final = True | |
bdata = b'' | |
try: | |
sdata = decode(bdata, final) | |
except UnicodeDecodeError as e: | |
try: | |
target.close() | |
except BaseException: | |
pass | |
raise common.IncompleteJSONError(e) | |
if sdata: | |
send(sdata) | |
elif not bdata: | |
target.close() | |
break | |
@utils.coroutine | |
def Lexer(target): | |
""" | |
Parses lexemes out of the incoming content, and sends them to parse_value. | |
A special EOF result is sent when the data source has been exhausted to | |
give parse_value the possibility of raising custom exceptions due to missing | |
content. | |
""" | |
try: | |
data = yield | |
except GeneratorExit: | |
data = '' | |
buf = data | |
pos = 0 | |
discarded = 0 | |
send = target.send | |
while True: | |
match = LEXEME_RE.search(buf, pos) | |
if match: | |
lexeme = match.group() | |
if lexeme == '"': | |
pos = match.start() | |
start = pos + 1 | |
while True: | |
try: | |
end = buf.index('"', start) | |
escpos = end - 1 | |
while buf[escpos] == '\\': | |
escpos -= 1 | |
if (end - escpos) % 2 == 0: | |
start = end + 1 | |
else: | |
break | |
except ValueError: | |
try: | |
data = yield | |
except GeneratorExit: | |
data = '' | |
if not data: | |
raise common.IncompleteJSONError( | |
'Incomplete string lexeme') | |
buf += data | |
send((discarded + pos, buf[pos:end + 1])) | |
pos = end + 1 | |
else: | |
while lexeme not in UNARY_LEXEMES and match.end() == len(buf): | |
try: | |
data = yield | |
except GeneratorExit: | |
data = '' | |
if not data: | |
break | |
buf += data | |
match = LEXEME_RE.search(buf, pos) | |
lexeme = match.group() | |
send((discarded + match.start(), lexeme)) | |
pos = match.end() | |
else: | |
# Don't ask data from an already exhausted source | |
if data: | |
try: | |
data = yield | |
except GeneratorExit: | |
data = '' | |
if not data: | |
# Normally should raise StopIteration, but can raise | |
# IncompleteJSONError too, which is the point of sending EOF | |
try: | |
target.send(EOF) | |
except StopIteration: | |
pass | |
break | |
discarded += len(buf) | |
buf = data | |
pos = 0 | |
# Parsing states | |
_PARSE_VALUE = 0 | |
_PARSE_ARRAY_ELEMENT_END = 1 | |
_PARSE_OBJECT_KEY = 2 | |
_PARSE_OBJECT_END = 3 | |
# infinity singleton for overflow checks | |
inf = float("inf") | |
@utils.coroutine | |
def parse_value(target, multivalue, use_float): | |
""" | |
Parses results coming out of the Lexer into ijson events, which are sent to | |
`target`. A stack keeps track of the type of object being parsed at the time | |
(a value, and object or array -- the last two being values themselves). | |
A special EOF result coming from the Lexer indicates that no more content is | |
expected. This is used to check for incomplete content and raise the | |
appropriate exception, which wouldn't be possible if the Lexer simply closed | |
this co-routine (either explicitly via .close(), or implicitly by itself | |
finishing and decreasing the only reference to the co-routine) since that | |
causes a GeneratorExit exception that cannot be replaced with a custom one. | |
""" | |
state_stack = [_PARSE_VALUE] | |
pop = state_stack.pop | |
push = state_stack.append | |
send = target.send | |
prev_pos, prev_symbol = None, None | |
to_number = common.integer_or_float if use_float else common.integer_or_decimal | |
while True: | |
if prev_pos is None: | |
pos, symbol = yield | |
if (pos, symbol) == EOF: | |
if state_stack: | |
raise common.IncompleteJSONError('Incomplete JSON content') | |
break | |
else: | |
pos, symbol = prev_pos, prev_symbol | |
prev_pos, prev_symbol = None, None | |
try: | |
state = state_stack[-1] | |
except IndexError: | |
if multivalue: | |
state = _PARSE_VALUE | |
push(state) | |
else: | |
raise common.JSONError('Additional data found') | |
assert state_stack | |
if state == _PARSE_VALUE: | |
# Simple, common cases | |
if symbol == 'null': | |
send(('null', None)) | |
pop() | |
elif symbol == 'true': | |
send(('boolean', True)) | |
pop() | |
elif symbol == 'false': | |
send(('boolean', False)) | |
pop() | |
elif symbol == 'NaN': | |
send(('number', float('nan'))) | |
pop() | |
elif symbol[0] == '"': | |
send(('string', parse_string(symbol))) | |
pop() | |
# Array start | |
elif symbol == '[': | |
send(('start_array', None)) | |
pos, symbol = yield | |
if (pos, symbol) == EOF: | |
raise common.IncompleteJSONError('Incomplete JSON content') | |
if symbol == ']': | |
send(('end_array', None)) | |
pop() | |
else: | |
prev_pos, prev_symbol = pos, symbol | |
push(_PARSE_ARRAY_ELEMENT_END) | |
push(_PARSE_VALUE) | |
# Object start | |
elif symbol == '{': | |
send(('start_map', None)) | |
pos, symbol = yield | |
if (pos, symbol) == EOF: | |
raise common.IncompleteJSONError('Incomplete JSON content') | |
if symbol == '}': | |
send(('end_map', None)) | |
pop() | |
else: | |
prev_pos, prev_symbol = pos, symbol | |
push(_PARSE_OBJECT_KEY) | |
# A number | |
else: | |
# JSON numbers can't contain leading zeros | |
if ((len(symbol) > 1 and symbol[0] == '0' and symbol[1] not in ('e', 'E', '.')) or | |
(len(symbol) > 2 and symbol[0:2] == '-0' and symbol[2] not in ('e', 'E', '.'))): | |
raise common.JSONError( | |
'Invalid JSON number: %s' % | |
(symbol,)) | |
# Fractions need a leading digit and must be followed by a | |
# digit | |
if symbol[0] == '.' or symbol[-1] == '.': | |
raise common.JSONError( | |
'Invalid JSON number: %s' % | |
(symbol,)) | |
try: | |
number = to_number(symbol) | |
if number == inf: | |
raise common.JSONError( | |
"float overflow: %s" % | |
(symbol,)) | |
except BaseException: | |
if 'true'.startswith(symbol) or 'false'.startswith(symbol) or 'null'.startswith(symbol): # or 'NaN'.startswith(symbol): | |
raise common.IncompleteJSONError( | |
'Incomplete JSON content') | |
raise UnexpectedSymbol(symbol, pos) | |
else: | |
send(('number', number)) | |
pop() | |
elif state == _PARSE_OBJECT_KEY: | |
if symbol[0] != '"': | |
raise UnexpectedSymbol(symbol, pos) | |
send(('map_key', parse_string(symbol))) | |
pos, symbol = yield | |
if (pos, symbol) == EOF: | |
raise common.IncompleteJSONError('Incomplete JSON content') | |
if symbol != ':': | |
raise UnexpectedSymbol(symbol, pos) | |
state_stack[-1] = _PARSE_OBJECT_END | |
push(_PARSE_VALUE) | |
elif state == _PARSE_OBJECT_END: | |
if symbol == ',': | |
state_stack[-1] = _PARSE_OBJECT_KEY | |
elif symbol != '}': | |
raise UnexpectedSymbol(symbol, pos) | |
else: | |
send(('end_map', None)) | |
pop() | |
pop() | |
elif state == _PARSE_ARRAY_ELEMENT_END: | |
if symbol == ',': | |
state_stack[-1] = _PARSE_ARRAY_ELEMENT_END | |
push(_PARSE_VALUE) | |
elif symbol != ']': | |
raise UnexpectedSymbol(symbol, pos) | |
else: | |
send(('end_array', None)) | |
pop() | |
pop() | |
def parse_string(symbol): | |
return scanstring(symbol, 1)[0] | |
def basic_parse_basecoro(target, multiple_values=False, allow_comments=False, | |
use_float=False): | |
''' | |
Iterator yielding unprefixed events. | |
Parameters: | |
- file: a readable file-like object with JSON input | |
''' | |
if allow_comments: | |
raise ValueError("Comments are not supported by the python backend") | |
return utf8_encoder(Lexer(parse_value(target, multiple_values, use_float))) | |
common.enrich_backend(globals()) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment