Last active
February 26, 2025 14:57
Vega script - unrolled recursion to analyse records with overlapping keyword sets. Run on https://vega.github.io/editor/#/edited
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{
  // Two-ring doughnut chart over records that carry overlapping keyword sets.
  // Ring 1 assigns each record to its most common keyword; ring 2 repeats the
  // analysis inside each ring-1 sector using the record's remaining keywords.
  // The "recursion" is unrolled by hand: every level-1 data set has a level-2 twin.
  "$schema": "https://vega.github.io/schema/vega/v4.json",
  "title": "Recursive Doughnut Chart by Most Common Keyword",
  "autosize": "pad",
  "signals": [
    // width/height are taken from the containing element via `init`;
    // `size` is the smaller of the two and drives every radius below.
    { "name": "width", "init": "containerSize()[0]", "on": [] },
    { "name": "height", "init": "containerSize()[1]", "on": [] },
    { "name": "size", "init": "width > height ? height : width" }
  ],
  "data": [
    {
      // Sample input: one record per id, each with a (possibly empty) keyword set.
      "name": "rawData",
      "values": [
        { "id": 1, "keys": ["a", "z"] },
        { "id": 2, "keys": ["a", "c"] },
        { "id": 3, "keys": ["z", "c"] },
        { "id": 4, "keys": ["a", "z", "c"] },
        { "id": 5, "keys": ["c"] },
        { "id": 6, "keys": ["d", "z"] },
        { "id": 7, "keys": [] }
      ]
    },
    {
      // Number of records carrying each keyword.
      //
      // The 'ix'/'cx' fields used throughout are lists of count/key records
      // flattened into one string of ~-separated values, with the numbers padded
      // so that a plain alphabetic comparison orders them correctly.
      //  - A number prefixed with 'i' is a fixed-width (5-digit) count of the
      //    key's occurrences within its subsection of the data.
      //  - A number prefixed with 'c' is the complement (1000000 - n) of the
      //    count of records actually allocated to the key for that subsection,
      //    so that an ascending sort lists the largest group first.
      "name": "key1Counts",
      "source": "rawData",
      "transform": [
        { "type": "flatten", "fields": ["keys"], "as": ["key1"] },
        {
          "type": "aggregate",
          "groupby": ["key1"],
          "ops": ["count"],
          "as": ["count1"]
        },
        // slice(..., -5) keeps the last five characters, i.e. a zero-padded count.
        {
          "type": "formula",
          "as": "ix",
          "expr": "'i' + slice('000000'+datum.count1, -5) + '~' + datum.key1 + '~'"
        }
      ]
    },
    {
      // One row per id/key pair, each annotated with that key's 'ix' ranking string.
      "name": "key1CountsFlat",
      "source": "rawData",
      "transform": [
        { "type": "flatten", "fields": ["keys"], "as": ["key1"] },
        {
          "type": "lookup",
          "from": "key1Counts",
          "key": "key1",
          "fields": ["key1"],
          "values": ["ix"]
        }
      ]
    },
    {
      // Choose each record's most popular key: the row with the greatest 'ix'.
      "name": "maxCount1Records",
      "source": "key1CountsFlat",
      "transform": [
        {
          "type": "aggregate",
          "groupby": ["id"],
          "ops": ["argmax"],
          "fields": ["ix"],
          "as": ["mck1"]
        }
      ]
    },
    {
      // Every raw record with its chosen 'ix'. Records that have no key for this
      // level of analysis (here: empty key sets) find no match in
      // maxCount1Records and receive the placeholder "~~" instead.
      "name": "everyIdKeysGrouped1",
      "source": "rawData",
      "transform": [
        {
          "type": "lookup",
          "fields": ["id"],
          "from": "maxCount1Records",
          "key": "id",
          "values": ["mck1.ix"],
          "as": ["ix"],
          "default": "~~"
        }
      ]
    },
    {
      // Inner ring: aggregate by 'ix', derive 'cx' from the actual number of
      // records in each group, order by 'cx' and compute the pie angles.
      "name": "arcs1",
      "source": "everyIdKeysGrouped1",
      "transform": [
        {
          "type": "aggregate",
          "groupby": ["ix"],
          "ops": ["count"],
          "as": ["count"]
        },
        // 'ix' is "i<count>~<key>~", so element 1 of the split is the key
        // (an empty string for the "~~" placeholder).
        { "type": "formula", "as": "key", "expr": "split(datum.ix, '~')[1]" },
        { "type": "formula", "as": "cx", "expr": "'c'+(1000000-datum.count) + '~' + datum.key + '~'" },
        { "type": "collect", "sort": { "field": "cx" } },
        {
          "type": "pie",
          "field": "count",
          "startAngle": 0,
          // Full circle (2*pi).
          "endAngle": 6.283185307179586,
          "as": ["startAngle", "endAngle"]
        },
        { "type": "formula", "as": "label", "expr": "datum.key ? datum.key + ' ('+datum.count+')' : datum.count" },
        { "type": "formula", "as": "label2", "expr": "datum.key" }
      ]
    },
    {
      // Same as rawData, but with the first-level indexes 'ix' and 'cx' added.
      "name": "everyIdKeysGroupedCx1",
      "source": "everyIdKeysGrouped1",
      "transform": [
        {
          "type": "lookup",
          "fields": ["ix"],
          "from": "arcs1",
          "key": "ix",
          "values": ["cx"],
          "as": ["cx"]
        }
      ]
    },
    {
      // Keyword popularity within each 'cx' group, ignoring the key that already
      // defines the group. 'cxix' extends the group's 'cx' with the level-2
      // count/key record; 'cxk2' (cx + key) is the join key used by key2CountsFlat.
      "name": "key2Counts",
      "source": "everyIdKeysGroupedCx1",
      "transform": [
        { "type": "flatten", "fields": ["keys"], "as": ["key"] },
        // Drop the key that is already embedded in 'cx' as "~<key>~".
        { "type": "filter", "expr": "indexof(datum.cx, '~' + datum.key + '~') < 0" },
        {
          "type": "aggregate",
          "groupby": ["cx", "key"],
          "ops": ["count"],
          "as": ["count"]
        },
        {
          "type": "formula",
          "as": "cxix",
          "expr": "datum.cx+'i'+slice('000000'+datum.count, -5) + '~' + datum.key + '~'"
        },
        { "type": "formula", "as": "cxk2", "expr": "datum.cx + datum.key" }
      ]
    },
    {
      // One row per id/key pair annotated with the level-2 count and 'cxix', used
      // to select the most popular remaining key for each record.
      "name": "key2CountsFlat",
      "source": "everyIdKeysGroupedCx1",
      "transform": [
        { "type": "flatten", "fields": ["keys"], "as": ["key"] },
        { "type": "formula", "as": "cxk2", "expr": "datum.cx + datum.key" },
        {
          "type": "lookup",
          "fields": ["cxk2"],
          "from": "key2Counts",
          "key": "cxk2",
          "values": ["count", "cxix"],
          "as": ["count", "cxix"]
        },
        // Rows whose key was excluded in key2Counts have no match and are dropped.
        { "type": "filter", "expr": "datum.count" }
      ]
    },
    {
      // Choose the most popular remaining key for each record.
      // NOTE(review): level 1 ranks on the padded string 'ix' (count, then key),
      // whereas this ranks on the numeric 'count' only, so equal counts are not
      // separated by key here - confirm that this difference is intended.
      "name": "maxCount2Records",
      "source": "key2CountsFlat",
      "transform": [
        {
          "type": "aggregate",
          "groupby": ["id"],
          "ops": ["argmax"],
          "fields": ["count"],
          "as": ["mck2"]
        }
      ]
    },
    {
      // Add all the 'missing' records back in, so they show as missing sectors:
      // a record without a level-2 key gets its own 'cx' followed by "~~".
      "name": "everyIdKeysGrouped2",
      "source": "everyIdKeysGroupedCx1",
      "transform": [
        {
          "type": "lookup",
          "fields": ["id"],
          "from": "maxCount2Records",
          "key": "id",
          "values": ["mck2.cxix"],
          "as": ["cxix"],
          "default": "~~"
        },
        {
          "type": "formula",
          "as": "cxix",
          "expr": "( datum.cxix === '~~') ? datum.cx + datum.cxix : datum.cxix"
        }
      ]
    },
    {
      // Outer ring: group by 'cxix' and generate the 'cxcx' index that a further
      // level of analysis would build on. 'cxcx' starts with the parent 'cx', so
      // sorting on it keeps each outer sector next to its inner-ring parent.
      "name": "arcs2",
      "source": "everyIdKeysGrouped2",
      "transform": [
        {
          "type": "aggregate",
          "groupby": ["cxix"],
          "ops": ["count"],
          "as": ["count"]
        },
        // 'cxix' is "c<n>~<key1>~i<n>~<key2>~": p[0], p[1] are the parent 'cx'
        // parts and p[3] is the level-2 key (empty for the "~~" placeholder).
        { "type": "formula", "as": "p", "expr": "split(datum.cxix, '~')" },
        { "type": "formula", "as": "key", "expr": "datum.p[3]" },
        {
          "type": "formula",
          "as": "cxcx",
          "expr": "datum.p[0]+'~'+datum.p[1]+'~c'+(1000000-datum.count) + '~' + datum.key + '~'"
        },
        { "type": "collect", "sort": { "field": "cxcx" } },
        {
          "type": "pie",
          "field": "count",
          "startAngle": 0,
          // Full circle (2*pi).
          "endAngle": 6.283185307179586,
          "as": ["startAngle", "endAngle"]
        },
        { "type": "formula", "as": "label", "expr": "datum.key ? datum.key + ' ('+datum.count+')' : datum.count" },
        { "type": "formula", "as": "label2", "expr": "datum.key" }
      ]
    }
  ],
  "scales": [
    {
      // One colour per keyword found in the raw data; shared by both rings.
      "name": "color",
      "type": "ordinal",
      "domain": { "data": "key1Counts", "field": "key1" },
      "range": { "scheme": "category10" }
    }
  ],
  "marks": [
    {
      // Inner ring (level 1), drawn between 2/12 and 4/12 of `size`.
      "type": "arc",
      "from": { "data": "arcs1" },
      "encode": {
        "enter": {
          "fill": { "scale": "color", "field": "label2" },
          "x": { "signal": "width / 2" },
          "y": { "signal": "height / 2" },
          "startAngle": { "field": "startAngle" },
          "endAngle": { "field": "endAngle" },
          "innerRadius": { "signal": "size * 2 / 12" },
          "outerRadius": { "signal": "size * 4 / 12" },
          "stroke": { "value": "#fff" }
        }
      }
    },
    {
      // Outer ring (level 2), drawn between 4/12 and 6/12 of `size`.
      "type": "arc",
      "from": { "data": "arcs2" },
      "encode": {
        "enter": {
          "fill": { "scale": "color", "field": "label2" },
          "x": { "signal": "width / 2" },
          "y": { "signal": "height / 2" },
          "startAngle": { "field": "startAngle" },
          "endAngle": { "field": "endAngle" },
          "innerRadius": { "signal": "size * 4 / 12" },
          "outerRadius": { "signal": "size * 6 / 12" },
          "stroke": { "value": "#fff" }
        }
      }
    },
    {
      // Inner-ring labels, placed at the sector's mid-angle on the ring's
      // centre radius (3/12 of `size`).
      "type": "text",
      "from": { "data": "arcs1" },
      "encode": {
        "enter": {
          "text": { "field": "label" },
          "fill": { "value": "black" },
          "fontSize": { "value": 12 },
          "fontWeight": { "value": "bold" },
          "x": {
            "signal": "width / 2 + (size * 3 / 12) * sin((datum.startAngle + datum.endAngle) / 2)"
          },
          "y": {
            "signal": "height / 2 - (size * 3 / 12) * cos((datum.startAngle + datum.endAngle) / 2)"
          },
          "align": { "value": "center" },
          "baseline": { "value": "middle" }
        }
      }
    },
    {
      // Outer-ring labels, placed at the sector's mid-angle on the ring's
      // centre radius (5/12 of `size`).
      "type": "text",
      "from": { "data": "arcs2" },
      "encode": {
        "enter": {
          "text": { "field": "label" },
          "fill": { "value": "black" },
          "fontSize": { "value": 12 },
          "fontWeight": { "value": "bold" },
          "x": {
            "signal": "width / 2 + (size * 5 / 12) * sin((datum.startAngle + datum.endAngle) / 2)"
          },
          "y": {
            "signal": "height / 2 - (size * 5 / 12) * cos((datum.startAngle + datum.endAngle) / 2)"
          },
          "align": { "value": "center" },
          "baseline": { "value": "middle" }
        }
      }
    },
    {
      // Plain listing of the raw records ("<id> <keys>"), one row every 20px.
      "type": "text",
      "from": { "data": "rawData" },
      "encode": {
        "enter": {
          "text": { "signal": "datum.id + ' ' + datum.keys" },
          "x": { "signal": "datum.id" },
          "y": { "signal": "datum.id * 20" }
        }
      }
    }
  ]
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment