Skip to content

Instantly share code, notes, and snippets.

@simonw
Created September 22, 2024 19:26
Show Gist options
  • Save simonw/264d487db1a18f8585c2ca0c68e50d1e to your computer and use it in GitHub Desktop.
Save simonw/264d487db1a18f8585c2ca0c68e50d1e to your computer and use it in GitHub Desktop.

2024-09-22T19:22:31 conversation: 01j8djzp559sqykha63z9h3g03

Model: claude-3-5-sonnet-20240620

Prompt:

files-to-prompt -c README.md tests
<documents>
<document index="1">
<source>README.md</source>
<document_content>
# jiter

[![CI](https://github.com/pydantic/jiter/workflows/CI/badge.svg?event=push)](https://github.com/pydantic/jiter/actions?query=event%3Apush+branch%3Amain+workflow%3ACI)
[![pypi](https://img.shields.io/pypi/v/jiter.svg)](https://pypi.python.org/pypi/jiter)
[![versions](https://img.shields.io/pypi/pyversions/jiter.svg)](https://github.com/pydantic/jiter)
[![license](https://img.shields.io/github/license/pydantic/jiter.svg)](https://github.com/pydantic/jiter/blob/main/LICENSE)

This is a standalone version of the JSON parser used in `pydantic-core`. The recommendation is to only use this package directly if you do not use `pydantic`.

The API is extremely minimal:

```python
def from_json(
    json_data: bytes,
    /,
    *,
    allow_inf_nan: bool = True,
    cache_mode: Literal[True, False, "all", "keys", "none"] = "all",
    partial_mode: Literal[True, False, "off", "on", "trailing-strings"] = False,
    catch_duplicate_keys: bool = False,
    float_mode: Literal["float", "decimal", "lossless-float"] = False,
) -> Any:
    """
    Parse input bytes into a JSON object.

    Arguments:
        json_data: The JSON data to parse
        allow_inf_nan: Whether to allow infinity (`Infinity` an `-Infinity`) and `NaN` values to float fields.
            Defaults to True.
        cache_mode: cache Python strings to improve performance at the cost of some memory usage
            - True / 'all' - cache all strings
            - 'keys' - cache only object keys
            - False / 'none' - cache nothing
        partial_mode: How to handle incomplete strings:
            - False / 'off' - raise an exception if the input is incomplete
            - True / 'on' - allow incomplete JSON but discard the last string if it is incomplete
            - 'trailing-strings' - allow incomplete JSON, and include the last incomplete string in the output
        catch_duplicate_keys: if True, raise an exception if objects contain the same key multiple times
        float_mode: How to return floats: as a `float`, `Decimal` or `LosslessFloat`

    Returns:
        Python object built from the JSON input.
    """

def cache_clear() -> None:
    """
    Reset the string cache.
    """

def cache_usage() -> int:
    """
    get the size of the string cache.

    Returns:
        Size of the string cache in bytes.
    """
```

</document_content>
</document>
<document index="2">
<source>tests/requirements.txt</source>
<document_content>
pytest
pytest-pretty
dirty_equals

</document_content>
</document>
<document index="3">
<source>tests/test_jiter.py</source>
<document_content>
import json
from decimal import Decimal

import jiter
import pytest
from math import inf
from dirty_equals import IsFloatNan


def test_python_parse_numeric():
    parsed = jiter.from_json(
        b'  { "int": 1, "bigint": 123456789012345678901234567890, "float": 1.2}  '
    )
    assert parsed == {"int": 1, "bigint": 123456789012345678901234567890, "float": 1.2}


def test_python_parse_other_cached():
    parsed = jiter.from_json(
        b'["string", true, false, null, NaN, Infinity, -Infinity]',
        allow_inf_nan=True,
        cache_mode=True,
    )
    assert parsed == ["string", True, False, None, IsFloatNan(), inf, -inf]


def test_python_parse_other_no_cache():
    parsed = jiter.from_json(
        b'["string", true, false, null]',
        cache_mode=False,
    )
    assert parsed == ["string", True, False, None]


def test_python_disallow_nan():
    with pytest.raises(ValueError, match="expected value at line 1 column 2"):
        jiter.from_json(b"[NaN]", allow_inf_nan=False)


def test_error():
    with pytest.raises(ValueError, match="EOF while parsing a list at line 1 column 9"):
        jiter.from_json(b'["string"')


def test_recursion_limit():
    with pytest.raises(
        ValueError, match="recursion limit exceeded at line 1 column 202"
    ):
        jiter.from_json(b"[" * 10_000)


def test_recursion_limit_incr():
    json = b"[" + b", ".join(b"[1]" for _ in range(2000)) + b"]"
    v = jiter.from_json(json)
    assert len(v) == 2000

    v = jiter.from_json(json)
    assert len(v) == 2000


def test_extracted_value_error():
    with pytest.raises(ValueError, match="expected value at line 1 column 1"):
        jiter.from_json(b"xx")


def test_partial_array():
    json = b'["string", true, null, 1, "foo'

    with pytest.raises(ValueError, match='EOF while parsing a string at line 1 column 30'):
        jiter.from_json(json, partial_mode=False)

    parsed = jiter.from_json(json, partial_mode=True)
    assert parsed == ["string", True, None, 1]

    # test that stopping at every points is ok
    for i in range(1, len(json)):
        parsed = jiter.from_json(json[:i], partial_mode=True)
        assert isinstance(parsed, list)


def test_partial_array_trailing_strings():
    json = b'["string", true, null, 1, "foo'
    parsed = jiter.from_json(json, partial_mode='trailing-strings')
    assert parsed == ["string", True, None, 1, "foo"]

    # test that stopping at every points is ok
    for i in range(1, len(json)):
        parsed = jiter.from_json(json[:i], partial_mode='trailing-strings')
        assert isinstance(parsed, list)


def test_partial_array_first():
    json = b"["
    parsed = jiter.from_json(json, partial_mode=True)
    assert parsed == []

    with pytest.raises(ValueError, match="EOF while parsing a list at line 1 column 1"):
        jiter.from_json(json)

    with pytest.raises(ValueError, match="EOF while parsing a list at line 1 column 1"):
        jiter.from_json(json, partial_mode='off')


def test_partial_object():
    json = b'{"a": 1, "b": 2, "c'
    parsed = jiter.from_json(json, partial_mode=True)
    assert parsed == {"a": 1, "b": 2}

    # test that stopping at every points is ok
    for i in range(1, len(json)):
        parsed = jiter.from_json(json, partial_mode=True)
        assert isinstance(parsed, dict)


def test_partial_object_string():
    json = b'{"a": 1, "b": 2, "c": "foo'
    parsed = jiter.from_json(json, partial_mode=True)
    assert parsed == {"a": 1, "b": 2}
    parsed = jiter.from_json(json, partial_mode='on')
    assert parsed == {"a": 1, "b": 2}

    # test that stopping at every points is ok
    for i in range(1, len(json)):
        parsed = jiter.from_json(json, partial_mode=True)
        assert isinstance(parsed, dict)

    json = b'{"title": "Pride and Prejudice", "author": "Jane A'
    parsed = jiter.from_json(json, partial_mode=True)
    assert parsed == {"title": "Pride and Prejudice"}


def test_partial_object_string_trailing_strings():
    json = b'{"a": 1, "b": 2, "c": "foo'
    parsed = jiter.from_json(json, partial_mode='trailing-strings')
    assert parsed == {"a": 1, "b": 2, "c": "foo"}

    # test that stopping at every points is ok
    for i in range(1, len(json)):
        parsed = jiter.from_json(json, partial_mode=True)
        assert isinstance(parsed, dict)

    json = b'{"title": "Pride and Prejudice", "author": "Jane A'
    parsed = jiter.from_json(json, partial_mode='trailing-strings')
    assert parsed == {"title": "Pride and Prejudice", "author": "Jane A"}


def test_partial_nested():
    json = b'{"a": 1, "b": 2, "c": [1, 2, {"d": 1, '
    parsed = jiter.from_json(json, partial_mode=True)
    assert parsed == {"a": 1, "b": 2, "c": [1, 2, {"d": 1}]}

    # test that stopping at every points is ok
    for i in range(1, len(json)):
        parsed = jiter.from_json(json[:i], partial_mode=True)
        assert isinstance(parsed, dict)


def test_partial_error():
    json = b'["string", true, null, 1, "foo'

    with pytest.raises(ValueError, match='EOF while parsing a string at line 1 column 30'):
        jiter.from_json(json, partial_mode=False)

    assert jiter.from_json(json, partial_mode=True) == ["string", True, None, 1]

    msg = "Invalid partial mode, should be `'off'`, `'on'`, `'trailing-strings'` or a `bool`"
    with pytest.raises(ValueError, match=msg):
        jiter.from_json(json, partial_mode='wrong')
    with pytest.raises(TypeError, match=msg):
        jiter.from_json(json, partial_mode=123)


def test_python_cache_usage_all():
    jiter.cache_clear()
    parsed = jiter.from_json(b'{"foo": "bar", "spam": 3}', cache_mode="all")
    assert parsed == {"foo": "bar", "spam": 3}
    assert jiter.cache_usage() == 3


def test_python_cache_usage_keys():
    jiter.cache_clear()
    parsed = jiter.from_json(b'{"foo": "bar", "spam": 3}', cache_mode="keys")
    assert parsed == {"foo": "bar", "spam": 3}
    assert jiter.cache_usage() == 2


def test_python_cache_usage_none():
    jiter.cache_clear()
    parsed = jiter.from_json(
        b'{"foo": "bar", "spam": 3}',
        cache_mode="none",
    )
    assert parsed == {"foo": "bar", "spam": 3}
    assert jiter.cache_usage() == 0


def test_use_tape():
    json = '  "foo\\nbar"  '.encode()
    jiter.cache_clear()
    parsed = jiter.from_json(json, cache_mode=False)
    assert parsed == "foo\nbar"


def test_unicode():
    json = '{"💩": "£"}'.encode()
    jiter.cache_clear()
    parsed = jiter.from_json(json, cache_mode=False)
    assert parsed == {"💩": "£"}


def test_unicode_cache():
    json = '{"💩": "£"}'.encode()
    jiter.cache_clear()
    parsed = jiter.from_json(json)
    assert parsed == {"💩": "£"}


def test_json_float():
    f = jiter.LosslessFloat(b'123.45')
    assert str(f) == '123.45'
    assert repr(f) == 'LosslessFloat(123.45)'
    assert float(f) == 123.45
    assert f.as_decimal() == Decimal('123.45')
    assert bytes(f) == b'123.45'


def test_json_float_scientific():
    f = jiter.LosslessFloat(b'123e4')
    assert str(f) == '123e4'
    assert float(f) == 123e4
    assert f.as_decimal() == Decimal('123e4')


def test_json_float_invalid():
    with pytest.raises(ValueError, match='trailing characters at line 1 column 6'):
        jiter.LosslessFloat(b'123.4x')


def test_lossless_floats():
    f = jiter.from_json(b'12.3')
    assert isinstance(f, float)
    assert f == 12.3

    f = jiter.from_json(b'12.3', float_mode='float')
    assert isinstance(f, float)
    assert f == 12.3

    f = jiter.from_json(b'12.3', float_mode='lossless-float')
    assert isinstance(f, jiter.LosslessFloat)
    assert str(f) == '12.3'
    assert float(f) == 12.3
    assert f.as_decimal() == Decimal('12.3')

    f = jiter.from_json(b'123.456789123456789e45', float_mode='lossless-float')
    assert isinstance(f, jiter.LosslessFloat)
    assert 123e45 < float(f) < 124e45
    assert f.as_decimal() == Decimal('1.23456789123456789E+47')
    assert bytes(f) == b'123.456789123456789e45'
    assert str(f) == '123.456789123456789e45'
    assert repr(f) == 'LosslessFloat(123.456789123456789e45)'

    f = jiter.from_json(b'123', float_mode='lossless-float')
    assert isinstance(f, int)
    assert f == 123

    with pytest.raises(ValueError, match='expected value at line 1 column 1'):
        jiter.from_json(b'wrong', float_mode='lossless-float')

    with pytest.raises(ValueError, match='trailing characters at line 1 column 2'):
        jiter.from_json(b'1wrong', float_mode='lossless-float')



def test_decimal_floats():
    f = jiter.from_json(b'12.3')
    assert isinstance(f, float)
    assert f == 12.3

    f = jiter.from_json(b'12.3', float_mode='decimal')
    assert isinstance(f, Decimal)
    assert f == Decimal('12.3')

    f = jiter.from_json(b'123.456789123456789e45', float_mode='decimal')
    assert isinstance(f, Decimal)
    assert f == Decimal('1.23456789123456789E+47')

    f = jiter.from_json(b'123', float_mode='decimal')
    assert isinstance(f, int)
    assert f == 123

    with pytest.raises(ValueError, match='expected value at line 1 column 1'):
        jiter.from_json(b'wrong', float_mode='decimal')

    with pytest.raises(ValueError, match='trailing characters at line 1 column 2'):
        jiter.from_json(b'1wrong', float_mode='decimal')


def test_unicode_roundtrip():
    original = ['中文']
    json_data = json.dumps(original).encode()
    assert jiter.from_json(json_data) == original
    assert json.loads(json_data) == original


def test_unicode_roundtrip_ensure_ascii():
    original = {'name': '中文'}
    json_data = json.dumps(original, ensure_ascii=False).encode()
    assert jiter.from_json(json_data, cache_mode=False) == original
    assert json.loads(json_data) == original


def test_catch_duplicate_keys():
    assert jiter.from_json(b'{"foo": 1, "foo": 2}') == {"foo": 2}

    with pytest.raises(ValueError, match='Detected duplicate key "foo" at line 1 column 18'):
        jiter.from_json(b'{"foo": 1, "foo": 2}', catch_duplicate_keys=True)

    with pytest.raises(ValueError, match='Detected duplicate key "foo" at line 1 column 28'):
        jiter.from_json(b'{"foo": 1, "bar": 2, "foo": 2}', catch_duplicate_keys=True)

</document_content>
</document>
</documents>

System:

write a new README with comprehensive documentation

Response:

Here's a comprehensive README for the jiter package:

jiter

CI pypi versions license

jiter is a standalone version of the JSON parser used in pydantic-core. It provides a fast and flexible JSON parsing solution with additional features not found in Python's built-in json module.

Installation

You can install jiter using pip:

pip install jiter

Usage

The main function provided by jiter is from_json():

import jiter

json_data = b'{"name": "John", "age": 30}'
parsed_data = jiter.from_json(json_data)
print(parsed_data)  # Output: {'name': 'John', 'age': 30}

API Reference

def from_json(
    json_data: bytes,
    /,
    *,
    allow_inf_nan: bool = True,
    cache_mode: Literal[True, False, "all", "keys", "none"] = "all",
    partial_mode: Literal[True, False, "off", "on", "trailing-strings"] = False,
    catch_duplicate_keys: bool = False,
    float_mode: Literal["float", "decimal", "lossless-float"] = False,
) -> Any:
    """
    Parse input bytes into a JSON object.
    """

Parameters:

  • json_data: The JSON data to parse as bytes.
  • allow_inf_nan: Whether to allow infinity (Infinity and -Infinity) and NaN values for float fields. Defaults to True.
  • cache_mode: Controls string caching to improve performance at the cost of some memory usage.
    • True or 'all': Cache all strings
    • 'keys': Cache only object keys
    • False or 'none': Cache nothing
  • partial_mode: How to handle incomplete JSON:
    • False or 'off': Raise an exception if the input is incomplete
    • True or 'on': Allow incomplete JSON but discard the last string if it is incomplete
    • 'trailing-strings': Allow incomplete JSON, and include the last incomplete string in the output
  • catch_duplicate_keys: If True, raise an exception if objects contain the same key multiple times.
  • float_mode: How to return floats:
    • 'float': Return as Python float
    • 'decimal': Return as Decimal
    • 'lossless-float': Return as LosslessFloat

Returns:

A Python object built from the JSON input.

Additional Functions

def cache_clear() -> None:
    """
    Reset the string cache.
    """

def cache_usage() -> int:
    """
    Get the size of the string cache in bytes.
    """

Features

  1. Fast Parsing: jiter is optimized for performance.
  2. Flexible Float Handling: Options for handling floats as standard Python floats, Decimals, or LosslessFloats.
  3. String Caching: Improve performance with configurable string caching.
  4. Partial JSON Parsing: Handle incomplete JSON data with various modes.
  5. Duplicate Key Detection: Option to catch and raise errors for duplicate keys in JSON objects.
  6. Infinity and NaN Support: Configurable parsing of Infinity, -Infinity, and NaN values.

Examples

Using Different Float Modes

import jiter
from decimal import Decimal

# Standard float
result = jiter.from_json(b'12.3')
print(result, type(result))  # Output: 12.3 <class 'float'>

# Decimal
result = jiter.from_json(b'12.3', float_mode='decimal')
print(result, type(result))  # Output: 12.3 <class 'decimal.Decimal'>

# LosslessFloat
result = jiter.from_json(b'12.3', float_mode='lossless-float')
print(result, type(result))  # Output: 12.3 <class 'jiter.LosslessFloat'>

Handling Partial JSON

import jiter

partial_json = b'{"name": "John", "age": 30, "city": "New Yor'

# Raise error on incomplete JSON
try:
    jiter.from_json(partial_json, partial_mode='off')
except ValueError as e:
    print(f"Error: {e}")

# Parse incomplete JSON, discarding incomplete last field
result = jiter.from_json(partial_json, partial_mode='on')
print(result)  # Output: {'name': 'John', 'age': 30}

# Parse incomplete JSON, including incomplete last field
result = jiter.from_json(partial_json, partial_mode='trailing-strings')
print(result)  # Output: {'name': 'John', 'age': 30, 'city': 'New Yor'}

Catching Duplicate Keys

import jiter

json_with_dupes = b'{"foo": 1, "foo": 2}'

# Default behavior (last value wins)
result = jiter.from_json(json_with_dupes)
print(result)  # Output: {'foo': 2}

# Catch duplicate keys
try:
    jiter.from_json(json_with_dupes, catch_duplicate_keys=True)
except ValueError as e:
    print(f"Error: {e}")

When to Use jiter

jiter is recommended for use when you need:

  1. High-performance JSON parsing
  2. More control over float handling (e.g., using Decimal or LosslessFloat)
  3. The ability to parse partial JSON
  4. Detection of duplicate keys in JSON objects

If you're already using Pydantic, you don't need to use jiter directly as it's included in pydantic-core.

License

jiter is released under the MIT License. See the LICENSE file for details.

Contributing

Contributions to jiter are welcome! Please refer to the project's GitHub repository for information on how to contribute.

@simonw
Copy link
Author

simonw commented Sep 22, 2024

files-to-prompt -c README.md tests  | llm -m claude-3.5-sonnet --system 'write a new README with comprehensive documentation'

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment