Skip to content

Instantly share code, notes, and snippets.

@Erotemic
Created October 28, 2022 15:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Erotemic/60ec1b4ffc9961fa34a1821bb139f70f to your computer and use it in GitHub Desktop.
Save Erotemic/60ec1b4ffc9961fa34a1821bb139f70f to your computer and use it in GitHub Desktop.
ijson_ext.py
"""
Add NaN parsing to normal ijson pure-python parsing backend.
Example:
>>> import ijson_ext as ijson
>>> text = ub.codeblock(
>>> '''
>>> {"id": 246, "image_id": 123, "flag": true, "empty": null, "category_id": 2, "bbox": [ NaN, 5, 130, 290 ] }
>>> ''')
>>> import io
>>> import json
>>> file = io.StringIO(text)
>>> json.load(file)
>>> file = io.StringIO(text)
>>> list(ijson_ext.items(file, 'bbox'))
"""
from json.decoder import scanstring
import re
from ijson import common, utils
import codecs
LEXEME_RE = re.compile(r'[a-z0-9eEN\.\+-]+|\S')
UNARY_LEXEMES = set('[]{},')
EOF = -1, None
class UnexpectedSymbol(common.JSONError):
def __init__(self, symbol, pos):
super(UnexpectedSymbol, self).__init__(
'Unexpected symbol %r at %d' % (symbol, pos)
)
@utils.coroutine
def utf8_encoder(target):
decoder = codecs.getincrementaldecoder('utf-8')()
decode = decoder.decode
send = target.send
while True:
try:
final = False
bdata = yield
except GeneratorExit:
final = True
bdata = b''
try:
sdata = decode(bdata, final)
except UnicodeDecodeError as e:
try:
target.close()
except BaseException:
pass
raise common.IncompleteJSONError(e)
if sdata:
send(sdata)
elif not bdata:
target.close()
break
@utils.coroutine
def Lexer(target):
"""
Parses lexemes out of the incoming content, and sends them to parse_value.
A special EOF result is sent when the data source has been exhausted to
give parse_value the possibility of raising custom exceptions due to missing
content.
"""
try:
data = yield
except GeneratorExit:
data = ''
buf = data
pos = 0
discarded = 0
send = target.send
while True:
match = LEXEME_RE.search(buf, pos)
if match:
lexeme = match.group()
if lexeme == '"':
pos = match.start()
start = pos + 1
while True:
try:
end = buf.index('"', start)
escpos = end - 1
while buf[escpos] == '\\':
escpos -= 1
if (end - escpos) % 2 == 0:
start = end + 1
else:
break
except ValueError:
try:
data = yield
except GeneratorExit:
data = ''
if not data:
raise common.IncompleteJSONError(
'Incomplete string lexeme')
buf += data
send((discarded + pos, buf[pos:end + 1]))
pos = end + 1
else:
while lexeme not in UNARY_LEXEMES and match.end() == len(buf):
try:
data = yield
except GeneratorExit:
data = ''
if not data:
break
buf += data
match = LEXEME_RE.search(buf, pos)
lexeme = match.group()
send((discarded + match.start(), lexeme))
pos = match.end()
else:
# Don't ask data from an already exhausted source
if data:
try:
data = yield
except GeneratorExit:
data = ''
if not data:
# Normally should raise StopIteration, but can raise
# IncompleteJSONError too, which is the point of sending EOF
try:
target.send(EOF)
except StopIteration:
pass
break
discarded += len(buf)
buf = data
pos = 0
# Parsing states
_PARSE_VALUE = 0
_PARSE_ARRAY_ELEMENT_END = 1
_PARSE_OBJECT_KEY = 2
_PARSE_OBJECT_END = 3
# infinity singleton for overflow checks
inf = float("inf")
@utils.coroutine
def parse_value(target, multivalue, use_float):
"""
Parses results coming out of the Lexer into ijson events, which are sent to
`target`. A stack keeps track of the type of object being parsed at the time
(a value, and object or array -- the last two being values themselves).
A special EOF result coming from the Lexer indicates that no more content is
expected. This is used to check for incomplete content and raise the
appropriate exception, which wouldn't be possible if the Lexer simply closed
this co-routine (either explicitly via .close(), or implicitly by itself
finishing and decreasing the only reference to the co-routine) since that
causes a GeneratorExit exception that cannot be replaced with a custom one.
"""
state_stack = [_PARSE_VALUE]
pop = state_stack.pop
push = state_stack.append
send = target.send
prev_pos, prev_symbol = None, None
to_number = common.integer_or_float if use_float else common.integer_or_decimal
while True:
if prev_pos is None:
pos, symbol = yield
if (pos, symbol) == EOF:
if state_stack:
raise common.IncompleteJSONError('Incomplete JSON content')
break
else:
pos, symbol = prev_pos, prev_symbol
prev_pos, prev_symbol = None, None
try:
state = state_stack[-1]
except IndexError:
if multivalue:
state = _PARSE_VALUE
push(state)
else:
raise common.JSONError('Additional data found')
assert state_stack
if state == _PARSE_VALUE:
# Simple, common cases
if symbol == 'null':
send(('null', None))
pop()
elif symbol == 'true':
send(('boolean', True))
pop()
elif symbol == 'false':
send(('boolean', False))
pop()
elif symbol == 'NaN':
send(('number', float('nan')))
pop()
elif symbol[0] == '"':
send(('string', parse_string(symbol)))
pop()
# Array start
elif symbol == '[':
send(('start_array', None))
pos, symbol = yield
if (pos, symbol) == EOF:
raise common.IncompleteJSONError('Incomplete JSON content')
if symbol == ']':
send(('end_array', None))
pop()
else:
prev_pos, prev_symbol = pos, symbol
push(_PARSE_ARRAY_ELEMENT_END)
push(_PARSE_VALUE)
# Object start
elif symbol == '{':
send(('start_map', None))
pos, symbol = yield
if (pos, symbol) == EOF:
raise common.IncompleteJSONError('Incomplete JSON content')
if symbol == '}':
send(('end_map', None))
pop()
else:
prev_pos, prev_symbol = pos, symbol
push(_PARSE_OBJECT_KEY)
# A number
else:
# JSON numbers can't contain leading zeros
if ((len(symbol) > 1 and symbol[0] == '0' and symbol[1] not in ('e', 'E', '.')) or
(len(symbol) > 2 and symbol[0:2] == '-0' and symbol[2] not in ('e', 'E', '.'))):
raise common.JSONError(
'Invalid JSON number: %s' %
(symbol,))
# Fractions need a leading digit and must be followed by a
# digit
if symbol[0] == '.' or symbol[-1] == '.':
raise common.JSONError(
'Invalid JSON number: %s' %
(symbol,))
try:
number = to_number(symbol)
if number == inf:
raise common.JSONError(
"float overflow: %s" %
(symbol,))
except BaseException:
if 'true'.startswith(symbol) or 'false'.startswith(symbol) or 'null'.startswith(symbol): # or 'NaN'.startswith(symbol):
raise common.IncompleteJSONError(
'Incomplete JSON content')
raise UnexpectedSymbol(symbol, pos)
else:
send(('number', number))
pop()
elif state == _PARSE_OBJECT_KEY:
if symbol[0] != '"':
raise UnexpectedSymbol(symbol, pos)
send(('map_key', parse_string(symbol)))
pos, symbol = yield
if (pos, symbol) == EOF:
raise common.IncompleteJSONError('Incomplete JSON content')
if symbol != ':':
raise UnexpectedSymbol(symbol, pos)
state_stack[-1] = _PARSE_OBJECT_END
push(_PARSE_VALUE)
elif state == _PARSE_OBJECT_END:
if symbol == ',':
state_stack[-1] = _PARSE_OBJECT_KEY
elif symbol != '}':
raise UnexpectedSymbol(symbol, pos)
else:
send(('end_map', None))
pop()
pop()
elif state == _PARSE_ARRAY_ELEMENT_END:
if symbol == ',':
state_stack[-1] = _PARSE_ARRAY_ELEMENT_END
push(_PARSE_VALUE)
elif symbol != ']':
raise UnexpectedSymbol(symbol, pos)
else:
send(('end_array', None))
pop()
pop()
def parse_string(symbol):
return scanstring(symbol, 1)[0]
def basic_parse_basecoro(target, multiple_values=False, allow_comments=False,
use_float=False):
'''
Iterator yielding unprefixed events.
Parameters:
- file: a readable file-like object with JSON input
'''
if allow_comments:
raise ValueError("Comments are not supported by the python backend")
return utf8_encoder(Lexer(parse_value(target, multiple_values, use_float)))
common.enrich_backend(globals())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment