Skip to content

Instantly share code, notes, and snippets.

@simonw
Last active September 7, 2024 05:39
Show Gist options
  • Save simonw/f5caf4ca24662f0078ec3cffcb040ce4 to your computer and use it in GitHub Desktop.
Save simonw/f5caf4ca24662f0078ec3cffcb040ce4 to your computer and use it in GitHub Desktop.

2024-09-07T03:48:10 conversation: 01j759hrkxtekeyrt1t017fmz9

Model: claude-3-5-sonnet-20240620

Prompt:

json_flatten.py
---
"""
flatten() and unflatten()

A pair of functions that can convert an arbitrary JSON object into a
flat name/value pair dictionary and back again, preserving type 
information and handling both nested lists and nested dictionaries.

For example:

    {
        "this": {
            "is": {
                "nested": [{
                    "nested_dict_one": 10
                }, {
                    "nested_dict_two": 20.5
                }]
            },
            "other_types": {
                "true": True,
                "false": False,
                "none": None,
            }
        }
    }

Flattens to:

    {
        "this.is.nested.[0].nested_dict_one$int": "10",
        "this.is.nested.[1].nested_dict_two$float": "20.5",
        "this.other_types.true$bool": "True",
        "this.other_types.false$bool": "False",
        "this.other_types.none$none": "None",
    }
"""
import re


def _object_to_rows(obj, prefix=None):
    rows = []
    dot_prefix = prefix and (prefix + ".") or ""
    if isinstance(obj, dict):
        if not obj:
            rows.append(((prefix or "") + "$empty", "{}"))
        else:
            for key, item in obj.items():
                rows.extend(_object_to_rows(item, prefix=dot_prefix + key))
    elif isinstance(obj, (list, tuple)):
        if len(obj) == 0:
            rows.append(((prefix or "") + "$emptylist", "[]"))
        for i, item in enumerate(obj):
            rows.extend(_object_to_rows(item, prefix=dot_prefix + "[{}]".format(i)))
    elif obj is None:
        rows.append(((prefix or "") + "$none", "None"))
    elif isinstance(obj, bool):
        rows.append(((prefix or "") + "$bool", str(obj)))
    elif isinstance(obj, int):
        rows.append(((prefix or "") + "$int", str(obj)))
    elif isinstance(obj, float):
        rows.append(((prefix or "") + "$float", str(obj)))
    else:
        rows.append((prefix, str(obj)))
    return rows


def flatten(obj):
    """Flatten a nested dict into a single-level dict of string pairs.

    Args:
        obj: The dictionary to flatten. Only a dict is accepted at the root.

    Returns:
        A dict mapping dotted, type-suffixed keys to string values.

    Raises:
        TypeError: If ``obj`` is not a dict.
    """
    if isinstance(obj, dict):
        flat_rows = _object_to_rows(obj)
        return dict(flat_rows)
    raise TypeError("Expected dict, got {}".format(type(obj)))


# Matches a final key segment that ends in one of the known $type suffixes.
_types_re = re.compile(r".*\$(none|bool|int|float|empty|emptylist)$")
# Matches the "[N]" notation used for list indices.
_int_key_re = re.compile(r"\[(\d+)\]")

# Maps each $type suffix to a callable that rebuilds the typed value from its
# flattened string form. Built once here rather than once per key.
_type_converters = {
    "int": int,
    "float": float,
    "empty": lambda v: {},
    "emptylist": lambda v: [],
    "bool": lambda v: v.lower() == "true",
    "none": lambda v: None,
}


def _replace_integer_keyed_dicts_with_lists(obj):
    """Return a copy of ``obj`` where dicts keyed only by "[N]" become lists.

    Items are ordered by their integer index; gaps between indices are
    closed up rather than padded, so the result is never sparse.
    """
    if isinstance(obj, list):
        return [_replace_integer_keyed_dicts_with_lists(v) for v in obj]
    if not isinstance(obj, dict):
        return obj
    # fullmatch (not match) so that a key such as "[0]x" is not mistaken
    # for a list index.
    if obj and all(_int_key_re.fullmatch(k) for k in obj):
        indexed = [
            (
                int(_int_key_re.fullmatch(k).group(1)),
                _replace_integer_keyed_dicts_with_lists(v),
            )
            for k, v in obj.items()
        ]
        # Sort on the index alone: comparing whole tuples would fall back to
        # comparing the values (possibly dicts) whenever two indices tie.
        indexed.sort(key=lambda pair: pair[0])
        return [v for _, v in indexed]
    return {k: _replace_integer_keyed_dicts_with_lists(v) for k, v in obj.items()}


def unflatten(data):
    """Rebuild a nested structure from a flat dict produced by ``flatten()``.

    Args:
        data: A dict whose keys are dotted paths, optionally ending in a
            ``$type`` suffix, and whose values are strings.

    Returns:
        The nested structure. This is normally a dict; when the only key is
        a bare type suffix such as ``$empty`` the single root value is
        returned, and when every root key is a ``[N]`` index a list is
        returned.
    """
    obj = {}
    for key, value in data.items():
        current = obj
        *path, lastkey = key.split(".")
        for bit in path:
            current[bit] = current.get(bit) or {}
            current = current[bit]
        # Now deal with $type suffixes:
        if _types_re.match(lastkey):
            # maxsplit=1 so only the trailing suffix is removed; a key that
            # itself contains "$" (e.g. "a$b$int") keeps the rest intact.
            lastkey, lasttype = lastkey.rsplit("$", 1)
            convert = _type_converters.get(lasttype)
            if convert is not None:
                value = convert(value)
        current[lastkey] = value

    # The foo.[0].one / foo.[1].two syntax is handled in a second pass that
    # turns dictionaries keyed entirely by "[N]" into lists.
    obj = _replace_integer_keyed_dicts_with_lists(obj)
    # Handle root units only, e.g. {'$empty': '{}'}. The isinstance guard is
    # needed because the second pass may have turned the root into a list.
    if isinstance(obj, dict) and list(obj) == [""]:
        return obj[""]
    return obj


---
setup.py
---
from setuptools import setup
import os

# Distribution version string; passed to setup() as the `version` argument.
VERSION = "0.3"


def get_long_description():
    """Return the text of the README.md that sits beside this setup script."""
    script_dir = os.path.dirname(os.path.abspath(__file__))
    readme_path = os.path.join(script_dir, "README.md")
    with open(readme_path, encoding="utf8") as readme:
        contents = readme.read()
    return contents


# Register the single-module json-flatten distribution with setuptools.
setup(
    # Identity
    name="json-flatten",
    version=VERSION,
    author="Simon Willison",
    license="Apache License, Version 2.0",
    # Descriptions (the long one is read from README.md)
    description="Python functions for flattening a JSON object to a single dictionary of pairs, and unflattening that dictionary back to a JSON object",
    long_description=get_long_description(),
    long_description_content_type="text/markdown",
    # Links
    url="https://github.com/simonw/json-flatten",
    project_urls={
        "Issues": "https://github.com/simonw/json-flatten/issues",
        "CI": "https://github.com/simonw/json-flatten/actions",
        "Changelog": "https://github.com/simonw/json-flatten/releases",
    },
    # Contents and test-only extras
    py_modules=["json_flatten"],
    extras_require={"test": ["pytest", "black"]},
    tests_require=["json-flatten[test]"],
)


---
test_json_flatten.py
---
from json_flatten import flatten, unflatten
import pytest


@pytest.mark.parametrize(
    "test_name,unflattened,flattened",
    [
        # test_name, unflattened, flattened
        ("simple", {"foo": "bar"}, {"foo": "bar"}),
        ("nested", {"foo": {"bar": "baz"}}, {"foo.bar": "baz"}),
        ("list_with_one_item", {"foo": ["item"]}, {"foo.[0]": "item"}),
        ("nested_lists", {"foo": [["item"]]}, {"foo.[0].[0]": "item"}),
        (
            "list",
            {"foo": {"bar": ["one", "two"]}},
            {"foo.bar.[0]": "one", "foo.bar.[1]": "two"},
        ),
        # Scalar leaves that need a $type suffix to survive the round trip
        ("int", {"foo": 5}, {"foo$int": "5"}),
        ("none", {"foo": None}, {"foo$none": "None"}),
        ("bool_true", {"foo": True}, {"foo$bool": "True"}),
        ("bool_false", {"foo": False}, {"foo$bool": "False"}),
        ("float", {"foo": 2.5}, {"foo$float": "2.5"}),
        (
            "complex",
            {
                "this": {
                    "is": {
                        "nested": [{"nested_dict_one": 10}, {"nested_dict_two": 20.5}]
                    },
                    "other_types": {"false": False, "true": True, "none": None},
                }
            },
            {
                "this.is.nested.[0].nested_dict_one$int": "10",
                "this.is.nested.[1].nested_dict_two$float": "20.5",
                "this.other_types.true$bool": "True",
                "this.other_types.false$bool": "False",
                "this.other_types.none$none": "None",
            },
        ),
        (
            "dollar_signs_that_are_not_type_indicators",
            {
                "foo": [
                    {
                        "emails": ["bar@example.com"],
                        "phones": {"_$!<home>!$_": "555-555-5555"},
                    }
                ]
            },
            {
                "foo.[0].emails.[0]": "bar@example.com",
                "foo.[0].phones._$!<home>!$_": "555-555-5555",
            },
        ),
        # Empty containers
        ("empty_object", {}, {"$empty": "{}"}),
        (
            "nested_empty_objects",
            {"nested": {"foo": {}, "bar": {}}},
            {"nested.foo$empty": "{}", "nested.bar$empty": "{}"},
        ),
        ("empty_nested_list", {"empty": []}, {"empty$emptylist": "[]"}),
        (
            "empty_nested_list_complex",
            {"foo": {"bar": []}, "nested": [[], []]},
            {
                "foo.bar$emptylist": "[]",
                "nested.[0]$emptylist": "[]",
                "nested.[1]$emptylist": "[]",
            },
        ),
        ("dict_with_numeric_key", {"bob": {"24": 4}}, {"bob.24$int": "4"}),
    ],
)
def test_flatten_unflatten(test_name, unflattened, flattened):
    """Check flatten() gives the expected pairs and unflatten() reverses it."""
    flat_result = flatten(unflattened)
    assert flat_result == flattened
    # Feed the actual flattened output back in to verify the round trip.
    assert unflatten(flat_result) == unflattened


def test_integers_with_gaps_does_not_create_sparse_array():
    """Non-contiguous list indices are packed in index order, not padded."""
    flat_input = {"list.[10]": "three", "list.[5]": "two", "list.[0]": "one"}
    expected = {"list": ["one", "two", "three"]}
    assert unflatten(flat_input) == expected


def test_list_as_base_level_object_rejected_with_error():
    """flatten() accepts only a dict at the root; a list raises TypeError."""
    root_level_list = [{"name": "john"}]
    with pytest.raises(TypeError):
        flatten(root_level_list)


---

System:

write detailed documentation in markdown describing the format used to represent JSON and nested JSON as key/value pairs, include a table as well

Response:

Here's detailed documentation in markdown describing the format used to represent JSON and nested JSON as key/value pairs:

JSON Flattening Format

This document describes the format used to represent JSON and nested JSON structures as flattened key/value pairs.

Basic Principles

  1. Keys are constructed using dot notation to represent nesting.
  2. Type information is preserved using $type suffixes.
  3. List indices are represented using [index] notation.
  4. Empty objects and lists have special representations.

Key Construction

Nested Objects

For nested objects, keys are constructed by joining the nested keys with dots.

Example:

{
  "user": {
    "name": "John",
    "age": 30
  }
}

Flattened:

user.name: John
user.age$int: 30

Lists

List items are represented using [index] notation.

Example:

{
  "fruits": ["apple", "banana", "cherry"]
}

Flattened:

fruits.[0]: apple
fruits.[1]: banana
fruits.[2]: cherry

Nested Lists

For nested lists, the index notation is repeated.

Example:

{
  "matrix": [[1, 2], [3, 4]]
}

Flattened:

matrix.[0].[0]$int: 1
matrix.[0].[1]$int: 2
matrix.[1].[0]$int: 3
matrix.[1].[1]$int: 4

Type Preservation

Types are preserved using $type suffixes:

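| Type | Suffix | Example |
|------|--------|---------|
| Integer | `$int` | `age$int: 30` |
| Float | `$float` | `price$float: 19.99` |
| Boolean | `$bool` | `active$bool: True` |
| Null | `$none` | `data$none: None` |
| Empty Object | `$empty` | `obj$empty: {}` |
| Empty List | `$emptylist` | `list$emptylist: []` |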

String values do not require a type suffix.

Special Cases

Empty Objects

An empty object at the root level is represented as:

$empty: {}

Empty Lists

An empty list is represented as:

key$emptylist: []

Root-Level Non-Object

If the root JSON element is not an object (e.g., a list), it will be rejected with a TypeError.

Examples

Complex Nested Structure

JSON:

{
  "user": {
    "name": "Alice",
    "age": 28,
    "hobbies": ["reading", "swimming"],
    "address": {
      "street": "123 Main St",
      "city": "Anytown"
    },
    "active": true,
    "salary": 50000.50,
    "spouse": null
  }
}

Flattened:

user.name: Alice
user.age$int: 28
user.hobbies.[0]: reading
user.hobbies.[1]: swimming
user.address.street: 123 Main St
user.address.city: Anytown
user.active$bool: True
user.salary$float: 50000.5
user.spouse$none: None

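This format allows for lossless conversion between nested JSON structures and flat key/value pairs, preserving type information and structural relationships, provided that no object key contains a `.`, has the form `[n]`, or ends in one of the `$type` suffixes (those forms are reserved by the flattened key syntax).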

2024-09-07T03:49:37

Prompt:

Rewrite that document to use the Python cog library to generate the examples

Response:

Certainly! Here's the rewritten documentation using Python's cog library to generate the examples:

# JSON Flattening Format

This document describes the format used to represent JSON and nested JSON structures as flattened key/value pairs.

## Basic Principles

1. Keys are constructed using dot notation to represent nesting.
2. Type information is preserved using `$type` suffixes.
3. List indices are represented using `[index]` notation.
4. Empty objects and lists have special representations.

## Key Construction

### Nested Objects

For nested objects, keys are constructed by joining the nested keys with dots.

Example:
[[[cog
import json

import cog
from json_flatten import flatten

# Source object for this example; it is rendered as JSON and then flattened.
example = {
    "user": {
        "name": "John",
        "age": 30,
    }
}

# The fence is labelled ```json, so emit real JSON with json.dumps rather
# than the Python repr (single-quoted strings) that str() would produce.
cog.out("```json\n")
cog.out(json.dumps(example, indent=2))
cog.out("\n```\n")
cog.out("Flattened:\n```\n")
for key, value in flatten(example).items():
    cog.out(f"{key}: {value}\n")
cog.out("```\n")
]]]
[[[end]]]

### Lists

List items are represented using `[index]` notation.

Example:
[[[cog
import json

# Source object for this example; it is rendered as JSON and then flattened.
example = {
    "fruits": ["apple", "banana", "cherry"],
}

# The fence is labelled ```json, so emit real JSON with json.dumps rather
# than the Python repr (single-quoted strings) that str() would produce.
cog.out("```json\n")
cog.out(json.dumps(example, indent=2))
cog.out("\n```\n")
cog.out("Flattened:\n```\n")
for key, value in flatten(example).items():
    cog.out(f"{key}: {value}\n")
cog.out("```\n")
]]]
[[[end]]]

### Nested Lists

For nested lists, the index notation is repeated.

Example:
[[[cog
import json

# Source object for this example; it is rendered as JSON and then flattened.
example = {
    "matrix": [[1, 2], [3, 4]],
}

# The fence is labelled ```json, so emit real JSON with json.dumps rather
# than relying on the Python repr that str() would produce.
cog.out("```json\n")
cog.out(json.dumps(example))
cog.out("\n```\n")
cog.out("Flattened:\n```\n")
for key, value in flatten(example).items():
    cog.out(f"{key}: {value}\n")
cog.out("```\n")
]]]
[[[end]]]

## Type Preservation

Types are preserved using `$type` suffixes:

| Type | Suffix | Example |
|------|--------|---------|
| Integer | `$int` | `age$int: 30` |
| Float | `$float` | `price$float: 19.99` |
| Boolean | `$bool` | `active$bool: True` |
| Null | `$none` | `data$none: None` |
| Empty Object | `$empty` | `obj$empty: {}` |
| Empty List | `$emptylist` | `list$emptylist: []` |

String values do not require a type suffix.

## Special Cases

### Empty Objects

An empty object at the root level is represented as:
[[[cog
# Flatten an empty root object and print each resulting pair in a plain fence.
example = {}
flat_pairs = flatten(example)

cog.out("```\n")
for flat_key, flat_value in flat_pairs.items():
    cog.out(f"{flat_key}: {flat_value}\n")
cog.out("```\n")
]]]
[[[end]]]

### Empty Lists

An empty list is represented as:
[[[cog
# Flatten an object holding one empty list and print the resulting pair(s).
example = {"list": []}
flat_pairs = flatten(example)

cog.out("```\n")
for flat_key, flat_value in flat_pairs.items():
    cog.out(f"{flat_key}: {flat_value}\n")
cog.out("```\n")
]]]
[[[end]]]

### Root-Level Non-Object

If the root JSON element is not an object (e.g., a list), it will be rejected with a TypeError.

## Examples

### Complex Nested Structure

JSON:
[[[cog
import json

# Source object covering strings, ints, floats, booleans, null, a list and
# a nested object; it is rendered as JSON and then flattened.
example = {
    "user": {
        "name": "Alice",
        "age": 28,
        "hobbies": ["reading", "swimming"],
        "address": {
            "street": "123 Main St",
            "city": "Anytown",
        },
        "active": True,
        "salary": 50000.50,
        "spouse": None,
    }
}

# The fence is labelled ```json, so emit real JSON with json.dumps; str()
# would print the Python repr (True/None and single-quoted strings), which
# is not valid JSON.
cog.out("```json\n")
cog.out(json.dumps(example, indent=2))
cog.out("\n```\n")
cog.out("\nFlattened:\n```\n")
for key, value in flatten(example).items():
    cog.out(f"{key}: {value}\n")
cog.out("```\n")
]]]
[[[end]]]

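This format allows for lossless conversion between nested JSON structures and flat key/value pairs, preserving type information and structural relationships, provided that no object key contains a `.`, has the form `[n]`, or ends in one of the `$type` suffixes (those forms are reserved by the flattened key syntax).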
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment