Skip to content

Instantly share code, notes, and snippets.

@sleekweasel
Last active February 26, 2025 14:57
Vega script - unrolled recursion to analyse records with overlapping keyword sets. Run on https://vega.github.io/editor/#/edited
{
"$schema": "https://vega.github.io/schema/vega/v4.json",
"title": "Recursive Doughnut Chart by Most Common Keyword",
"autosize": "pad",
"signals": [
  // View dimensions come from the hosting container via `init` expressions;
  // the empty "on" lists attach no event handlers.
  {
    "name": "width",
    "init": "containerSize()[0]",
    "on": []
  },
  {
    "name": "height",
    "init": "containerSize()[1]",
    "on": []
  },
  // The smaller of the two dimensions; every radius in the marks is a fraction of it.
  {
    "name": "size",
    "init": "width > height ? height : width"
  }
],
"data": [
{
  // Sample input: one record per id, each with a (possibly empty) list of keywords in "keys".
  "name": "rawData",
  "values": [
    { "id": 1, "keys": ["a", "z"] },
    { "id": 2, "keys": ["a", "c"] },
    { "id": 3, "keys": ["z", "c"] },
    { "id": 4, "keys": ["a", "z", "c"] },
    { "id": 5, "keys": ["c"] },
    { "id": 6, "keys": ["d", "z"] },
    { "id": 7, "keys": [] }
  ]
},
{
  // Level-1 popularity table: one row per distinct keyword, with the number of records that mention it.
  //
  // Encoding used by the 'ix'/'cx' fields throughout this spec: a list of count/key records flattened
  // into a single string of ~-separated values, with the numbers padded so alphabetic sort works.
  //   - A number prefixed with 'i' is a fixed-width count of the keys for its subsection of the data.
  //   - A number prefixed with 'c' is the 10s-complement negative of the count of records actually
  //     allocated to the key for that subsection of the data.
  "name": "key1Counts",
  "source": "rawData",
  "transform": [
    // One row per (record, keyword) pair.
    {
      "type": "flatten",
      "fields": ["keys"],
      "as": ["key1"]
    },
    // Number of rows per keyword.
    { "type": "aggregate", "groupby": ["key1"], "ops": ["count"], "as": ["count1"] },
    // Sortable index, e.g. count 3 for key 'a' becomes 'i00003~a~'.
    {
      "type": "formula",
      "expr": "'i' + slice('000000'+datum.count1, -5) + '~' + datum.key1 + '~'",
      "as": "ix"
    }
  ]
},
{
  // One record for each id/key pair in the record's set, annotated with that key's level-1 index 'ix'.
  "name": "key1CountsFlat",
  "source": "rawData",
  "transform": [
    { "type": "flatten", "fields": ["keys"], "as": ["key1"] },
    // Copy 'ix' from the key1Counts row whose key1 matches this row's key1.
    {
      "type": "lookup",
      "from": "key1Counts",
      "key": "key1",
      "fields": ["key1"],
      "values": ["ix"]
    }
  ]
},
{
  // Pick each record's most popular key: per id, keep (as 'mck1') the flattened row with the greatest 'ix'.
  // 'ix' begins with the zero-padded key count, so the comparison is by count first and key text second.
  "name": "maxCount1Records",
  "source": "key1CountsFlat",
  "transform": [
    { "type": "aggregate", "groupby": ["id"], "fields": ["ix"], "ops": ["argmax"], "as": ["mck1"] }
  ]
},
{
  // Start again from every raw record and attach the 'ix' chosen for it in maxCount1Records.
  // Records without enough keys for this level of analysis (here: empty key sets) have no match
  // and receive the placeholder '~~', so they are added back rather than lost.
  "name": "everyIdKeysGrouped1",
  "source": "rawData",
  "transform": [
    {
      "type": "lookup",
      "from": "maxCount1Records",
      "key": "id",
      "fields": ["id"],
      "values": ["mck1.ix"],
      "as": ["ix"],
      "default": "~~"
    },
    // Identity assignment: 'ix' is written back unchanged.
    { "type": "formula", "expr": "datum.ix", "as": "ix" }
  ]
},
{
  // Level-1 sectors: group the records by 'ix', derive 'cx' from the number of records actually
  // in each group, order the groups by 'cx' and compute the pie angles.
  "name": "arcs1",
  "source": "everyIdKeysGrouped1",
  "transform": [
    { "type": "aggregate", "groupby": ["ix"], "ops": ["count"], "as": ["count"] },
    // The key is the second ~-separated piece of 'ix' (empty for the '~~' placeholder).
    { "type": "formula", "expr": "split(datum.ix, '~')[1]", "as": "key" },
    // A larger count gives a smaller number after 'c', so sorting by 'cx' lists bigger groups first.
    { "type": "formula", "expr": "'c'+(1000000-datum.count) + '~' + datum.key + '~'", "as": "cx" },
    {
      "type": "collect",
      "sort": { "field": "cx" }
    },
    // Angles proportional to 'count', spanning 0 to 6.28318 radians.
    { "type": "pie", "field": "count", "startAngle": 0, "endAngle": 6.28318, "as": ["startAngle", "endAngle"] },
    // 'label' is "key (count)", or just the count when the key is empty; 'label2' is the bare key.
    { "type": "formula", "expr": "datum.key ? datum.key + ' ('+datum.count+')' : datum.count", "as": "label" },
    { "type": "formula", "expr": "datum.key", "as": "label2" }
  ]
},
{
  // Same records as rawData, now carrying both first-level indexes: 'ix' (already present on the
  // source rows) and 'cx' (copied from the arcs1 row with the same 'ix').
  "name": "everyIdKeysGroupedCx1",
  "source": "everyIdKeysGrouped1",
  "transform": [
    {
      "type": "lookup",
      "from": "arcs1",
      "key": "ix",
      "fields": ["ix"],
      "values": ["cx"],
      "as": ["cx"]
    }
  ]
},
{
  // Level-2 popularity table: within each level-1 group (identified by 'cx'), count how many
  // records mention each remaining keyword. cxix is unique enough to identify by record.
  "name": "key2Counts",
  "source": "everyIdKeysGroupedCx1",
  "transform": [
    // 'cx' is already on every source row, so no lookup against arcs1 is needed before flattening.
    { "type": "flatten", "fields": ["keys"], "as": ["key"] },
    // Discard the keyword that already names the level-1 group (it appears as '~key~' inside 'cx').
    { "type": "filter", "expr": "indexof(datum.cx, '~' + datum.key + '~') < 0"},
    {
      "type": "aggregate",
      "groupby": ["cx", "key"],
      "ops": ["count"],
      "as": ["count"]
    },
    // Sortable two-level index: the level-1 'cx' followed by the padded level-2 count and key.
    {"type": "formula", "as": "cxix", "expr": "datum.cx+'i'+slice('000000'+datum.count, -5) + '~' + datum.key + '~'"},
    // Join key matched by key2CountsFlat's lookup.
    {"type": "formula", "as": "cxk2", "expr": "datum.cx + datum.key"}
  ]
},
{
  // One row per (record, keyword) pair, annotated with the level-2 count and 'cxix' of that keyword
  // inside the record's level-1 group, ready for selecting the most popular key per record.
  "name": "key2CountsFlat",
  "source": "everyIdKeysGroupedCx1",
  "transform": [
    {
      "type": "flatten",
      "fields": ["keys"],
      "as": ["key"]
    },
    // Same join key as computed in key2Counts.
    { "type": "formula", "expr": "datum.cx + datum.key", "as": "cxk2" },
    { "type": "lookup", "from": "key2Counts", "key": "cxk2", "fields": ["cxk2"], "values": ["count", "cxix"], "as": ["count", "cxix"] },
    // Keep only rows that found a key2Counts match (rows without one have no truthy count).
    { "type": "filter", "expr": "datum.count" }
  ]
}
,
{
  // Choose the most popular level-2 key for each record: per id, keep (as 'mck2') the row with
  // the greatest 'cxix'. All rows of one id share the same 'cx' prefix, so this compares the padded
  // count first and the key text second, the same rule level 1 applies through 'ix'. Comparing the
  // bare count instead would leave ties to be decided by the order of each record's 'keys' array.
  "name": "maxCount2Records",
  "source": "key2CountsFlat",
  "transform": [
    {
      "type": "aggregate",
      "groupby": ["id"],
      "ops": ["argmax"],
      "fields": ["cxix"],
      "as": ["mck2"]
    }
  ]
},
{
  // Bring back every record, including the 'missing' ones with no level-2 key, so they show as
  // missing sectors. Matched records take the 'cxix' selected in maxCount2Records; the rest get '~~'.
  "name": "everyIdKeysGrouped2",
  "source": "everyIdKeysGroupedCx1",
  "transform": [
    { "type": "lookup", "from": "maxCount2Records", "key": "id", "fields": ["id"], "values": ["mck2.cxix"], "as": ["cxix"], "default": "~~" },
    // Prefix the '~~' placeholder with the record's level-1 'cx' so it stays inside its level-1 group.
    {
      "type": "formula",
      "expr": "( datum.cxix === '~~') ? datum.cx + datum.cxix : datum.cxix",
      "as": "cxix"
    }
  ]
},
{
  // Level-2 sectors: group the records by 'cxix', build the 'cxcx' index (the 'cx' form for the
  // next level of analysis), order by it and compute the pie angles.
  "name": "arcs2",
  "source": "everyIdKeysGrouped2",
  "transform": [
    { "type": "aggregate", "groupby": ["cxix"], "ops": ["count"], "as": ["count"] },
    // Split 'cxix' into its ~-separated pieces; the level-2 key is the fourth piece.
    { "type": "formula", "expr": "split(datum.cxix, '~')", "as": "p" },
    { "type": "formula", "expr": "datum.p[3]", "as": "key" },
    // The first two pieces (the level-1 'cx') followed by this group's own complemented count and key.
    { "type": "formula", "expr": "datum.p[0]+'~'+datum.p[1]+'~c'+(1000000-datum.count) + '~' + datum.key + '~'", "as": "cxcx" },
    {
      "type": "collect",
      "sort": { "field": "cxcx" }
    },
    // Angles proportional to 'count', spanning 0 to 6.28318 radians.
    { "type": "pie", "field": "count", "startAngle": 0, "endAngle": 6.28318, "as": ["startAngle", "endAngle"] },
    // 'label' is "key (count)", or just the count when the key is empty; 'label2' is the bare key.
    { "type": "formula", "expr": "datum.key ? datum.key + ' ('+datum.count+')' : datum.count", "as": "label" },
    { "type": "formula", "expr": "datum.key", "as": "label2" }
  ]
}
],
"scales": [
  // Ordinal colour per keyword; the domain is the set of key1 values in key1Counts.
  {
    "name": "color",
    "type": "ordinal",
    "domain": { "data": "key1Counts", "field": "key1" },
    "range": { "scheme": "category10" }
  }
],
"marks": [
{
  // Inner ring: one arc per arcs1 row, centred in the view, coloured by the row's key (label2).
  "type": "arc",
  "from": { "data": "arcs1" },
  "encode": {
    "enter": {
      "x": { "signal": "width / 2" },
      "y": { "signal": "height / 2" },
      "startAngle": { "field": "startAngle" },
      "endAngle": { "field": "endAngle" },
      // Radial band from 2/12 to 4/12 of 'size'.
      "innerRadius": { "signal": "size * 2 / 12" },
      "outerRadius": { "signal": "size * 4 / 12" },
      "fill": { "scale": "color", "field": "label2" },
      "stroke": { "value": "#fff" }
    }
  }
},
{
  // Outer ring: one arc per arcs2 row, centred in the view, coloured by the row's key (label2).
  "type": "arc",
  "from": { "data": "arcs2" },
  "encode": {
    "enter": {
      "x": { "signal": "width / 2" },
      "y": { "signal": "height / 2" },
      "startAngle": { "field": "startAngle" },
      "endAngle": { "field": "endAngle" },
      // Radial band from 4/12 to 6/12 of 'size'.
      "innerRadius": { "signal": "size * 4 / 12" },
      "outerRadius": { "signal": "size * 6 / 12" },
      "fill": { "scale": "color", "field": "label2" },
      "stroke": { "value": "#fff" }
    }
  }
},
{
  // Labels for the arcs1 rows: bold black 12px text, centred on each arc's mid-angle
  // at a radius of 3/12 of 'size'.
  "type": "text",
  "from": { "data": "arcs1" },
  "encode": {
    "enter": {
      // Label text comes from the row's 'label' field.
      "text": { "field": "label" },
      // Horizontal position from the mid-angle and radius.
      "x": { "signal": "width / 2 + (size * 3 / 12) * sin((datum.startAngle + datum.endAngle) / 2)" },
      // Vertical position from the mid-angle and radius.
      "y": { "signal": "height / 2 - (size * 3 / 12) * cos((datum.startAngle + datum.endAngle) / 2)" },
      // Anchor the text at its centre, both horizontally and vertically.
      "align": { "value": "center" },
      "baseline": { "value": "middle" },
      // Appearance.
      "fill": { "value": "black" },
      "fontSize": { "value": 12 },
      "fontWeight": { "value": "bold" }
    }
  }
},
{
  // Labels for the arcs2 rows: bold black 12px text, centred on each arc's mid-angle
  // at a radius of 5/12 of 'size'.
  "type": "text",
  "from": { "data": "arcs2" },
  "encode": {
    "enter": {
      // Label text comes from the row's 'label' field.
      "text": { "field": "label" },
      // Horizontal position from the mid-angle and radius.
      "x": { "signal": "width / 2 + (size * 5 / 12) * sin((datum.startAngle + datum.endAngle) / 2)" },
      // Vertical position from the mid-angle and radius.
      "y": { "signal": "height / 2 - (size * 5 / 12) * cos((datum.startAngle + datum.endAngle) / 2)" },
      // Anchor the text at its centre, both horizontally and vertically.
      "align": { "value": "center" },
      "baseline": { "value": "middle" },
      // Appearance.
      "fill": { "value": "black" },
      "fontSize": { "value": 12 },
      "fontWeight": { "value": "bold" }
    }
  }
}
,
{
  // Plain listing of the raw records: each row prints its id followed by its keys,
  // placed at x = id and y = 20 * id.
  "type": "text",
  "from": { "data": "rawData" },
  "encode": {
    "enter": {
      "x": { "signal": "datum.id"},
      "y": { "signal": "datum.id * 20"},
      "text": { "signal": "datum.id + ' ' + datum.keys"}
    }
  }
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment