Skip to content

Instantly share code, notes, and snippets.

@tzaffi
Last active February 3, 2022 19:04
Show Gist options
  • Save tzaffi/c14aa3af79ea7d09170a8727029b5d95 to your computer and use it in GitHub Desktop.
Save tzaffi/c14aa3af79ea7d09170a8727029b5d95 to your computer and use it in GitHub Desktop.
YAYD - Yet Another YAML (or JSON) Differ
from collections import OrderedDict
from copy import deepcopy
import json
from typing import List, Union
# Values accepted by the `extras_only` parameter of `deep_diff`.
L = "left"
R = "right"
def deep_diff(
    x: Union[dict, list],
    y: Union[dict, list],
    exclude_keys: Union[List[str], None] = None,
    overlaps_only: bool = False,
    extras_only: Union[str, None] = None,
    arraysets: bool = False,
) -> Union[dict, list, None]:
    """
    Take the deep diff of JSON-like dictionaries (or lists).

    The result keeps only what differs between `x` (left) and `y` (right):

    * two values that differ and are not both dicts / both lists are
      reported as the pair ``[x_value, y_value]``;
    * a dict key found on one side only is reported as ``[value, None]``
      (only in `x`) or ``[None, value]`` (only in `y`);
    * dicts are diffed key by key, keeping only the keys that differ;
    * lists are diffed position by position (``arraysets=False``) or as
      sets of hashable elements (``arraysets=True``).

    The final result is passed through `sort_json`.

    Args:
        x: left-hand structure.
        y: right-hand structure.
        exclude_keys: dict keys to skip at every nesting level.
        overlaps_only: ignore dict keys present on one side only.
            Requires ``arraysets=True``.
        extras_only: ``"left"`` or ``"right"``; report only what exists
            solely on that side (differing leaf values are dropped).
            Requires ``arraysets=True`` and ``overlaps_only=False``.
        arraysets: compare lists as sets rather than positionally.

    Returns:
        The diff, or ``None`` when no difference is found.

    Raises:
        AssertionError: for the unsupported flag combinations above.
        TypeError: with ``arraysets=True`` when a list holds an unhashable
            element, or with ``overlaps_only=True`` when list elements
            cannot be ordered.
    """
    # Avoid a shared mutable default argument.
    if exclude_keys is None:
        exclude_keys = []

    senseless = "it doesn't make sense to "
    if overlaps_only:
        assert (
            arraysets
        ), f"{senseless}diff overlaps only when not considering arrays as sets"
    if extras_only:
        assert (
            arraysets
        ), f"{senseless}have extras_only={extras_only} when not considering arrays as sets"
        assert (
            not overlaps_only
        ), f"{senseless}have extras_only={extras_only} when diffing overlaps only"

    right_extras = extras_only == R
    left_extras = extras_only == L

    def dd(x, y):
        if x == y:
            return None

        # awkward, but handles subclasses of dict/list:
        if not (
            isinstance(x, (list, dict))
            and (isinstance(x, type(y)) or isinstance(y, type(x)))
        ):
            # A changed leaf is not an "extra", so extras mode drops it.
            return [x, y] if not extras_only else None

        if isinstance(x, dict):
            d = type(x)()  # handles OrderedDict's as well

            # Keys that live on exactly one side.
            for k in x.keys() ^ y.keys():
                if k in exclude_keys or overlaps_only:
                    continue
                if (k in x and right_extras) or (k in y and left_extras):
                    continue
                d[k] = [deepcopy(x[k]), None] if k in x else [None, deepcopy(y[k])]

            # Keys shared by both sides: recurse and keep real differences.
            for k in x.keys() & y.keys():
                if k in exclude_keys:
                    continue
                next_d = dd(x[k], y[k])
                if next_d is None:
                    continue
                d[k] = next_d

            return d if d else None

        # assume a list:
        if not arraysets:
            m, n = len(x), len(y)
            # Compare the common prefix position by position.
            d = [dd(x_val, y_val) for x_val, y_val in zip(x, y)]
            if overlaps_only:
                # Positions past the shorter list are not reported.
                d.extend([None] * (max(m, n) - len(d)))
            elif m > n:
                # The left list is longer: its tail exists only on the left.
                # (Previously these elements were silently dropped.)
                d.extend([x_val, None] for x_val in x[n:])
            else:
                d.extend([None, y_val] for y_val in y[m:])
        else:  # will raise error if contains a non-hashable element
            sx, sy = set(x), set(y)
            # Walk the inputs (deduplicated, in their original order)
            # instead of iterating a set, so the output order does not
            # depend on hash randomization.
            only_x = [e for e in dict.fromkeys(x) if e not in sy]
            only_y = [e for e in dict.fromkeys(y) if e not in sx]
            if extras_only:
                d = only_x if left_extras else only_y
            elif overlaps_only:
                d = [[e, None] for e in sorted(x) if e not in sy]
                d.extend([None, e] for e in sorted(y) if e not in sx)
            else:
                d = [[e, None] for e in only_x]
                d.extend([None, e] for e in only_y)

        return None if all(e is None for e in d) else d

    return sort_json(dd(x, y))
def is_diff_array(da: list) -> bool:
    """
    Tell whether `da` looks like a ``[left, right]`` diff pair.

    A diff pair has exactly two elements, is not ``[None, None]``, and is
    either one-sided (contains ``None``) or holds two values that are not
    both lists and not both dicts.
    """
    if len(da) != 2:
        return False
    if da == [None, None]:
        return False
    if None in da:
        # One side is missing: an addition or a removal.
        return True

    # Two lists or two dicts are nested diff structure, not a leaf pair.
    same_container = any(
        all(isinstance(item, kind) for item in da) for kind in (list, dict)
    )
    return not same_container
def sort_json(d: Union[dict, list], sort_lists: bool = False):
    """
    Return a copy of a JSON-like structure with dict keys in sorted order.

    Args:
        d: a dict, a list, or any leaf value (returned unchanged).
        sort_lists: when true, every list (at any depth) is also sorted
            before its elements are processed.

    Returns:
        Dicts become `OrderedDict`s keyed in sorted order, lists become
        new lists, and anything else is returned as-is.

    Raises:
        TypeError: if keys (or list elements, when `sort_lists` is true)
            cannot be ordered against each other.
    """
    if isinstance(d, list):
        items = sorted(d) if sort_lists else d
        return [sort_json(item, sort_lists) for item in items]
    if isinstance(d, dict):
        # Build from pairs rather than **kwargs so non-string keys work.
        return OrderedDict((k, sort_json(d[k], sort_lists)) for k in sorted(d))
    return d
def jdump(jd, only_objs=False):
    """
    Serialize `jd` to compact JSON (no spaces after separators).

    When `only_objs` is true, only lists, dicts and strings are
    serialized; any other value is handed back unchanged.
    """
    serialize = isinstance(jd, (list, dict, str)) or not only_objs
    if serialize:
        return json.dumps(jd, separators=(",", ":"))
    return jd
def prettify_diff(
    json_diff: Union[dict, list, int, str, None],
    src: str = "",
    tgt: str = "",
    suppress_bs: bool = True,
    value_limit: int = None,
):
    """
    Turn a raw diff into a labelled, more readable structure.

    Every list recognized by `is_diff_array` is rewritten from
    ``[left, right]`` to ``[{tgt: left}, {src: right}]``. Other lists and
    dicts are walked recursively, and the result goes through `sort_json`.

    NOTE: any two-element list that satisfies `is_diff_array` is treated
    as a diff pair, whatever produced it.

    Args:
        json_diff: the diff to prettify.
        src: label attached to the second element of each pair.
        tgt: label attached to the first element of each pair.
        suppress_bs: when true, pair values are passed through
            ``jdump(..., only_objs=True)``, and if one side is ``None``
            the string side is truncated to `value_limit`.
        value_limit: maximum string length before truncation with
            ``"..."``; ``None`` disables truncation.
    """

    def clip(text):
        # Only strings are clipped, and only when a limit is configured.
        too_long = (
            isinstance(text, str)
            and value_limit is not None
            and len(text) > value_limit
        )
        return text[:value_limit] + "..." if too_long else text

    def condense(left, right):
        left = jdump(left, only_objs=True)
        right = jdump(right, only_objs=True)
        if None in (left, right):
            # One-sided entry: the surviving value may be clipped.
            return clip(left), clip(right)
        # Both sides present: a true diff, shown in full.
        return left, right

    def render(node):
        if isinstance(node, dict):
            return {key: render(value) for key, value in node.items()}
        if not isinstance(node, list):
            return node
        if not is_diff_array(node):
            return [render(item) for item in node]

        left, right = node
        if suppress_bs:
            left, right = condense(left, right)
        return [{tgt: left}, {src: right}]

    return sort_json(render(json_diff))
import atexit
import json
from pathlib import Path
from typing import List
import yaml
from git import Repo
from .json_diff import deep_diff, prettify_diff
# The kinds of diff report that can be generated.
NEW = "new"
OVERLAP = "overlap"
DROPPED = "dropped"
FULL = "full"
DIFF_TYPES = [NEW, OVERLAP, DROPPED, FULL]

# These are the diff reports that will be run and compared/asserted against:
ASSERTIONS = [DROPPED, FULL]

# Only compare swagger "definitions":
MODELS_ONLY = True

# All locations are resolved relative to the current working directory.
REPO_DIR = Path.cwd()
INDEXER_SWGR = REPO_DIR.joinpath("api", "indexer.oas2.json")
GOAL_DIR = REPO_DIR.joinpath("third_party", "go-algorand")
ALGOD_SWGR = GOAL_DIR.joinpath("daemon", "algod", "api", "algod.oas2.json")
REPORTS_DIR = REPO_DIR.joinpath("parity", "reports")
# Guards `print_git_info_once` so the summary is emitted a single time.
already_printed = False


def print_git_info_once():
    """
    Print which Swagger files were compared and at which commits.

    Only the first call does anything; later calls return immediately.
    The commits are whatever ``rev_parse("HEAD")`` reports for the
    repositories at `REPO_DIR` and `GOAL_DIR`.
    """
    global already_printed
    if already_printed:
        return
    # Flip the flag before touching git so a repeat call never re-runs.
    already_printed = True

    indexer_commit = Repo(REPO_DIR).git.rev_parse("HEAD")
    goal_commit = Repo(GOAL_DIR).git.rev_parse("HEAD")

    summary = (
        "Finished comparing:\n"
        f"* Indexer Swagger {INDEXER_SWGR} for commit hash {indexer_commit}\n"
        f"* Algod Swagger {ALGOD_SWGR} for commit hash {goal_commit}\n"
    )
    print(summary)
def tsetup():
    """
    Load both Swagger specs and the list of keys to ignore when diffing.

    Also registers `print_git_info_once` to run at interpreter exit.

    Returns:
        A tuple ``(exclude, indexer, algod)``: the keys to exclude from
        the diff, followed by the parsed Indexer and Algod Swagger
        documents. When `MODELS_ONLY` is set, each document is reduced to
        its ``"definitions"`` section.

    Raises:
        OSError: if a Swagger file cannot be opened.
        json.JSONDecodeError: if a Swagger file is not valid JSON.
        KeyError: if `MODELS_ONLY` is set and ``"definitions"`` is missing.
    """
    atexit.register(print_git_info_once)

    exclude = [
        "basePath",
        "consumes",
        "host",
        "info",
        "paths",
        "produces",
        "security",
        "securityDefinitions",
        "schemes",
        "diff_types",
        "x-algorand-format",
        "x-go-name",
    ]

    def load_swagger(path):
        # Read as UTF-8 explicitly instead of relying on the
        # platform's locale-dependent default encoding.
        with open(path, "r", encoding="utf-8") as f:
            swagger = json.load(f)
        return swagger["definitions"] if MODELS_ONLY else swagger

    indexer = load_swagger(INDEXER_SWGR)
    algod = load_swagger(ALGOD_SWGR)

    return exclude, indexer, algod
def get_report_path(diff_type, for_write=False):
    """
    Return the path of the YAML report for `diff_type` under `REPORTS_DIR`.

    With ``for_write=True`` the file name carries an ``_OUT`` suffix, so
    freshly generated reports get a different name from the ones read back.
    """
    if for_write:
        suffix = "_OUT"
    else:
        suffix = ""
    return REPORTS_DIR / f"algod2indexer_{diff_type}{suffix}.yml"
def save_yaml(diff, diff_type):
    """
    Write `diff` as YAML to the ``_OUT`` report file for `diff_type`.

    Keys are sorted and the line width is set to 2000. The destination
    path is printed once the file is written.
    """
    out_path = get_report_path(diff_type, for_write=True)
    with open(out_path, "w") as report:
        rendered = yaml.dump(diff, indent=2, sort_keys=True, width=2000)
        report.write(rendered)
    print(f"\nsaved json diff to {out_path}")
def yamlize(diff):
    """
    Prettify `diff` and rebuild it from plain ``dict`` / ``list`` objects.

    The diff is labelled with ``ALGOD`` (src) and ``INDEXER`` (tgt), with
    `prettify_diff` called using ``value_limit=30``. Every mapping and
    sequence in the result is then copied into a built-in ``dict`` or
    ``list``.
    """

    def to_plain(node):
        if isinstance(node, list):
            return [to_plain(item) for item in node]
        if isinstance(node, dict):
            return {key: to_plain(value) for key, value in node.items()}
        return node

    pretty = prettify_diff(diff, src="ALGOD", tgt="INDEXER", value_limit=30)
    return to_plain(pretty)
def generate_diff(source, target, excludes, diff_type):
    """
    Diff `target` (left) against `source` (right) for one report type.

    `diff_type` selects the `deep_diff` flags:

    * OVERLAP - existing fields that have been modified from algod ---> indexer
    * NEW     - additions: fields that have been introduced in indexer
    * DROPPED - removals: fields that have been deleted in indexer
    * FULL    - anything that's different

    Lists are always compared as sets, and the result is passed through
    `yamlize`.
    """
    assert (
        diff_type in DIFF_TYPES
    ), f"Unrecognized diff_type [{diff_type}] not in {DIFF_TYPES}"

    overlaps_only = diff_type == OVERLAP

    if diff_type == NEW:
        extras_only = "left"
    elif diff_type == DROPPED:
        extras_only = "right"
    else:
        # Only OVERLAP and FULL reach this branch.
        assert overlaps_only or diff_type == FULL
        extras_only = None

    raw_diff = deep_diff(
        target,
        source,
        exclude_keys=excludes,
        overlaps_only=overlaps_only,
        extras_only=extras_only,
        arraysets=True,
    )
    return yamlize(raw_diff)
def save_reports(*reports) -> None:
    """
    Generate YAML reports showing differences between Algod's API and Indexer's API.

    Each positional argument is a diff_type; one report is written per
    argument. Possible diff_types are:

    "overlap" - show only modifications to features that Algod and Indexer have in common
    "new" - focus on features added to Indexer and missing from Algod
    "dropped" (recommended) - focus on features that are present in Algod but dropped in Indexer
    "full" (recommended) - show all differences
    """
    excludes, indexer_swgr, algod_swgr = tsetup()
    for diff_type in reports:
        save_yaml(
            generate_diff(algod_swgr, indexer_swgr, excludes, diff_type),
            diff_type,
        )
def test_parity(reports: List[str] = ASSERTIONS, save_new: bool = True):
    """
    Assert that the stored parity reports still match freshly generated ones.

    For each report in reports:
    1. load the pre-existing yaml report into `old_diff`
    2. re-generate the equivalent report by comparing `algod_swgr` with `indexer_swgr`
    3. compute the `diff_of_diffs` between these two reports
    4. assert that there is no diff

    Args:
        reports: the diff_types to check.
        save_new: when true, first write the regenerated reports to
            their ``_OUT`` files via `save_reports`.

    Raises:
        AssertionError: if a regenerated report differs from the stored one.
    """
    excludes, indexer_swgr, algod_swgr = tsetup()

    if save_new:
        save_reports(*reports)

    for diff_type in reports:
        ypath = get_report_path(diff_type, for_write=False)
        with open(ypath, "r", encoding="utf-8") as f:
            old_diff = yaml.safe_load(f)
        new_diff = generate_diff(algod_swgr, indexer_swgr, excludes, diff_type)

        diff_of_diffs = deep_diff(old_diff, new_diff)
        assert (
            diff_of_diffs is None
        ), f"""UNEXPECTED CHANGE IN {ypath}. Differences are:
{json.dumps(diff_of_diffs,indent=2)}
"""
from cmath import exp
from copy import deepcopy
from .json_diff import deep_diff
def test_deep_diff():
    """Exercise `deep_diff` on flat dicts, nested dicts, lists and set-mode lists."""

    def check(expected, actual):
        assert expected == actual, f"expected: {expected} v. actual: {actual}"

    # --- flat dictionaries ---
    ages = {
        "dad": 55,
        "mom": 56,
    }
    ages_changed = {
        "mom": 55,
        "dad": 55,
    }
    check({"mom": [56, 55]}, deep_diff(ages, ages_changed))
    check(None, deep_diff(ages, deepcopy(ages)))

    # --- a scalar replaced by a nested structure ---
    mom_info = {
        "age": 56,
        "profession": "MD",
        "hobbies": ["ballet", "opera", {"football": "american"}, "racecar driving"],
    }
    family = {
        "dad": 55,
        "mom": mom_info,
    }
    check({"mom": [56, mom_info]}, deep_diff(ages, family))

    # --- a key missing on the right ---
    mom_only = {
        "mom": mom_info,
    }
    check({"dad": [55, None]}, deep_diff(family, mom_only))

    # --- differences nested inside dicts and positional lists ---
    family_changed = {
        "dad": 55,
        "mom": {
            "age": 56,
            "profession": "Programmer",
            "hobbies": ["ballet", "opera", {"football": "british"}, "racecar driving"],
        },
    }
    check(
        {
            "mom": {
                "profession": ["MD", "Programmer"],
                "hobbies": [None, None, {"football": ["american", "british"]}, None],
            }
        },
        deep_diff(family, family_changed),
    )

    # --- top-level lists, compared positionally ---
    greeting = ["hello", "world", {"I": "wish"}, "you", {"all": "the best"}]
    greeting_changed = ["hello", "world", {"I": "wish"}, "you", {"all": "the very best"}]
    check(
        [None, None, None, None, {"all": ["the best", "the very best"]}],
        deep_diff(greeting, greeting_changed),
    )

    # --- lists as sets, overlaps only ---
    words = ["hello", "world", "I", "wish", "you", "good", "times"]
    words_shuffled = ["world", "hello", "you", "good", "timesies", "wish"]
    check(
        [["I", None], ["times", None], [None, "timesies"]],
        deep_diff(words, words_shuffled, overlaps_only=True, arraysets=True),
    )

    # --- the same lists, positionally versus as sets ---
    names = ["alice", "bob", "cassie", "deandrea", "elbaz"]
    names_more = ["bob", "alice", "cassie", "deandrea", "elbaz", "farber"]
    check(
        [["alice", "bob"], ["bob", "alice"], None, None, None, [None, "farber"]],
        deep_diff(names, names_more),
    )
    check([[None, "farber"]], deep_diff(names, names_more, arraysets=True))

    # --- a realistic swagger fragment ---
    swagger_a = {
        "definitions": {
            "Account": {
                "properties": {
                    "sig-type": {
                        "description": "Indicates what type of signature is used by this account, must be one of:\n* sig\n* msig\n* lsig\n* or null if unknown"
                    }
                }
            }
        }
    }
    swagger_b = {
        "definitions": {
            "Account": {
                "properties": {
                    "sig-type": {
                        "description": "Indicates what type of signature is used by this account, must be one of:\n* sig\n* msig\n* lsig",
                    }
                }
            }
        }
    }
    expected = deepcopy(swagger_b)
    expected["definitions"]["Account"]["properties"]["sig-type"]["description"] = [
        swagger_a["definitions"]["Account"]["properties"]["sig-type"]["description"],
        swagger_b["definitions"]["Account"]["properties"]["sig-type"]["description"],
    ]
    check(expected, deep_diff(swagger_a, swagger_b))

    # A changed value is not an "extra", so nothing is reported here.
    check(None, deep_diff(swagger_a, swagger_b, extras_only="left", arraysets=True))

    # --- renamed key inside a list element ---
    before = {"FANG": [{"Facebook": {"price": 330}}]}
    after = {"FANG": [{"Meta": {"price": 290}}]}
    check(
        {
            "FANG": [{"Facebook": [{"price": 330}, None], "Meta": [None, {"price": 290}]}]
        },
        deep_diff(before, after),
    )
Account:
properties:
min-balance:
- INDEXER: null
- ALGOD: '{"description":"MicroAlgo bala...'
required:
- min-balance
ApplicationParams:
required:
- creator
BuildVersion:
- INDEXER: null
- ALGOD: '{"properties":{"branch":{"type...'
DryrunRequest:
- INDEXER: null
- ALGOD: '{"description":"Request data t...'
DryrunSource:
- INDEXER: null
- ALGOD: '{"description":"DryrunSource i...'
DryrunState:
- INDEXER: null
- ALGOD: '{"description":"Stores the TEA...'
DryrunTxnResult:
- INDEXER: null
- ALGOD: '{"description":"DryrunTxnResul...'
ErrorResponse:
- INDEXER: null
- ALGOD: '{"description":"An error respo...'
ParticipationKey:
- INDEXER: null
- ALGOD: '{"description":"Represents a p...'
PendingTransactionResponse:
- INDEXER: null
- ALGOD: '{"description":"Details about ...'
Version:
- INDEXER: null
- ALGOD: '{"description":"algod version ...'
@tzaffi
Copy link
Author

tzaffi commented Feb 3, 2022

Notable:

  • algod2indexer_dropped.yml - what a diff looks like after being prettified and converted to YAML
  • json_diff.py
    • deep_diff() - the workhorse
    • prettify_diff() - trim long string values, unless they are a true diff (the values exist in both versions and are different)
  • test_indexer_v_algod.py
    • save_yaml() - save the diff as yaml
    • generate_diff() - driver that handles all four diff output variants
      1. NEW - ignore values that were dropped, focusing only on values in the right but not the left
      2. OVERLAP - only common non-null values that differ
      3. DROPPED - ignore values that are new, focusing only on values that are in the left but not the right
      4. FULL - all of the above

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment