sleekweasel/overlapping-sets.json

## overlapping-sets.json
{
    "$schema": "https://vega.github.io/schema/vega/v4.json",
    "title": "Recursive Doughnut Chart by Most Common Keyword",
    "autosize": "pad",
    "signals": [
        // Chart dimensions are read once from the hosting container; the empty
        // "on" arrays declare no event handlers for these two signals.
        {
            "name": "width",
            "init": "containerSize()[0]",
            "on": []
        },
        {
            "name": "height",
            "init": "containerSize()[1]",
            "on": []
        },
        // The shorter of the two sides; the mark radii are expressed as fractions of it.
        {
            "name": "size",
            "init": "width > height ? height : width"
        }
    ],
    "data": [
        { // Inline sample input: one record per id, each with a (possibly empty) list of keys.
            "name": "rawData",
            "values": [
                { "id": 1, "keys": ["a", "z"] },
                { "id": 2, "keys": ["a", "c"] },
                { "id": 3, "keys": ["z", "c"] },
                { "id": 4, "keys": ["a", "z", "c"] },
                { "id": 5, "keys": ["c"] },
                { "id": 6, "keys": ["d", "z"] },
                { "id": 7, "keys": [] }  // a record with no keys at all
            ]
        },
        { // Per-key popularity over the whole data set.
          // The sort keys used throughout this spec ('ix', 'cx', ...) are strings made of
          // '~'-terminated count/key parts, with the counts at a fixed width so that comparing
          // the strings alphabetically orders them by count:
          //   'i' + zero-padded count  - how many records of the current subset contain the key
          //   'c' + (1000000 - count)  - complement of the number of records actually assigned
          //                              to the key, so an ascending sort puts bigger groups first
            "name": "key1Counts",
            "source": "rawData",
            "transform": [
                // One row per (record, key) pair.
                {
                    "type": "flatten",
                    "fields": ["keys"],
                    "as": ["key1"]
                },
                // count1 = number of flattened rows (i.e. records) that carry key1.
                { "type": "aggregate", "groupby": ["key1"], "ops": ["count"], "as": ["count1"] },
                // ix = 'i' + the last five characters of the zero-prefixed count1 + '~' + key1 + '~'.
                {
                    "type": "formula",
                    "as": "ix",
                    "expr": "'i' + slice('000000'+datum.count1, -5) + '~' + datum.key1 + '~'"
                }
            ]
        },
        { // One row per (record, key) pair, tagged with that key's popularity index 'ix'.
            "name": "key1CountsFlat",
            "source": "rawData",
            "transform": [
                { "type": "flatten", "fields": ["keys"], "as": ["key1"] },
                // Copy 'ix' from the key1Counts row whose key1 matches this row's key1.
                // No "as" is given, so the value is stored under the name listed in "values".
                {
                    "type": "lookup",
                    "from": "key1Counts",
                    "key": "key1",
                    "fields": ["key1"],
                    "values": ["ix"]
                }
            ]
        },
        { // For every record id, keep the flattened row whose 'ix' string is greatest,
          // i.e. the row of the record's most popular key.
            "name": "maxCount1Records",
            "source": "key1CountsFlat",
            "transform": [
                // 'mck1' holds the whole winning row; its fields are read as mck1.<field>.
                { "type": "aggregate", "groupby": ["id"], "ops": ["argmax"], "fields": ["ix"], "as": ["mck1"] }
            ]
        },
        { // Every raw record, labelled with the 'ix' of its most popular key.
          // Records that have no row in maxCount1Records (here: the one with an empty key
          // list) receive the placeholder '~~' instead.
            "name": "everyIdKeysGrouped1",
            "source": "rawData",
            "transform": [
                {
                    "type": "lookup",
                    "from": "maxCount1Records",
                    "key": "id",
                    "fields": ["id"],
                    "values": ["mck1.ix"],
                    "as": ["ix"],
                    "default": "~~"
                },
                // Writes ix back onto itself; the value is unchanged.
                { "type": "formula", "expr": "datum.ix", "as": "ix" }
            ]
        },
        { // First (inner) ring: one sector per first-level group.
          // Counts the records in each 'ix' group, derives the 'cx' sort key from that count,
          // orders the groups by 'cx' and then lays them out as pie angles.
            "name": "arcs1",
            "source": "everyIdKeysGrouped1",
            "transform": [
                {
                    "type": "aggregate",
                    "groupby": ["ix"],
                    "ops": ["count"],
                    "as": ["count"]
                },
                // ix looks like 'i<count>~<key>~', so element 1 of the split is the key
                // (an empty string for the '~~' placeholder group).
                {"type": "formula", "as": "key", "expr": "split(datum.ix, '~')[1]"},
                // cx = 'c' + (1000000 - count) + '~' + key + '~'
                {"type": "formula", "as": "cx", "expr": "'c'+(1000000-datum.count) + '~' + datum.key + '~'"},
                {"type": "collect", "sort": {"field": "cx"}},
                {
                    "type": "pie",
                    "field": "count",
                    "startAngle": 0,
                    // A full turn taken from the expression constant PI rather than a
                    // truncated decimal literal.
                    "endAngle": {"signal": "2 * PI"},
                    "as": ["startAngle", "endAngle"]
                },
                // Sector caption: 'key (count)', or just the count when the key is empty.
                {"type": "formula", "as": "label", "expr": "datum.key ? datum.key + ' ('+datum.count+')' : datum.count"},
                // The value the arc marks pass to the colour scale.
                {"type": "formula", "as": "label2", "expr": "datum.key"}
            ]
        },
        { // The records of everyIdKeysGrouped1 with the first-level sort key 'cx' of their
          // group attached, so that each record carries both 'ix' and 'cx'.
            "name": "everyIdKeysGroupedCx1",
            "source": "everyIdKeysGrouped1",
            "transform": [
                {
                    "type": "lookup",
                    "from": "arcs1",
                    "key": "ix",
                    "fields": ["ix"],
                    "values": ["cx"],
                    "as": ["cx"]
                }
            ]
        },
        { // Second-level popularity: inside each first-level group ('cx'), count how many
          // records contain each of the remaining keys.
            "name": "key2Counts",
            "source": "everyIdKeysGroupedCx1",
            "transform": [
                { "type": "lookup", "fields": ["ix"], "from": "arcs1", "key": "ix", "values": ["count", "key", "cx"]},
                { "type": "flatten", "fields": ["keys"], "as": ["key"] },
                // Drop the key that already defines the group: a row is kept only when
                // '~<key>~' does not occur in cx.
                { "type": "filter", "expr": "indexof(datum.cx, '~' + datum.key + '~') < 0"},
                {
                    "type": "aggregate",
                    "groupby": ["cx", "key"],
                    "ops": ["count"],
                    "as": ["count"]
                },
                // cxix = cx + 'i' + the last five characters of the zero-prefixed count + '~' + key + '~'
                {"type": "formula", "as": "cxix", "expr": "datum.cx+'i'+slice('000000'+datum.count, -5) + '~' + datum.key + '~'"},
                // cxk2 = cx + key: the join key key2CountsFlat uses to find these rows.
                {"type": "formula", "as": "cxk2", "expr": "datum.cx + datum.key"}
            ]
        },
        { // One row per (record, key) pair, joined to the second-level counts of the record's group.
            "name": "key2CountsFlat",
            "source": "everyIdKeysGroupedCx1",
            "transform": [
                { "type": "flatten", "fields": ["keys"], "as": ["key"] },
                // The same cx + key join key that key2Counts computes.
                { "type": "formula", "expr": "datum.cx + datum.key", "as": "cxk2" },
                {
                    "type": "lookup",
                    "from": "key2Counts",
                    "key": "cxk2",
                    "fields": ["cxk2"],
                    "values": ["count", "cxix"],
                    "as": ["count", "cxix"]
                },
                // Keep only the rows for which the lookup produced a truthy count.
                { "type": "filter", "expr": "datum.count" }
            ]
        },
        { // For every record id, keep the key2CountsFlat row with the greatest second-level count.
          // Unlike maxCount1Records, which compares the 'ix' string, this compares the numeric
          // 'count' field only.
            "name": "maxCount2Records",
            "source": "key2CountsFlat",
            "transform": [
                // 'mck2' holds the whole winning row; its fields are read as mck2.<field>.
                { "type": "aggregate", "groupby": ["id"], "ops": ["argmax"], "fields": ["count"], "as": ["mck2"] }
            ]
        },
        { // Every record again, labelled with the 'cxix' of its best second-level key.
          // Records with no row in maxCount2Records get their own cx followed by '~~',
          // so they still show up in the outer ring as a sector without a key.
            "name": "everyIdKeysGrouped2",
            "source": "everyIdKeysGroupedCx1",
            "transform": [
                {
                    "type": "lookup",
                    "from": "maxCount2Records",
                    "key": "id",
                    "fields": ["id"],
                    "values": ["mck2.cxix"],
                    "as": ["cxix"],
                    "default": "~~"
                },
                // Prefix the bare '~~' placeholder with the record's first-level cx.
                { "type": "formula", "expr": "( datum.cxix === '~~') ? datum.cx + datum.cxix : datum.cxix", "as": "cxix" }
            ]
        },
        { // Second (outer) ring: one sector per second-level group.
          // Counts the records in each 'cxix' group and builds 'cxcx', a sort key that starts
          // with the first-level parts so sectors of the same first-level group sort together.
            "name": "arcs2",
            "source": "everyIdKeysGrouped2",
            "transform": [
                {
                    "type": "aggregate",
                    "groupby": ["cxix"],
                    "ops": ["count"],
                    "as": ["count"]
                },
                // p = the '~'-separated parts of cxix: [first-level 'c' part, first-level key,
                // second-level 'i' part, second-level key, ...].
                {"type": "formula", "as": "p", "expr": "split(datum.cxix, '~')"},
                // The second-level key; an empty string for the placeholder groups.
                {"type": "formula", "as": "key", "expr": "datum.p[3]"},
                // cxcx = p[0] + '~' + p[1] + '~c' + (1000000 - count) + '~' + key + '~'
                {"type": "formula", "as": "cxcx", "expr": "datum.p[0]+'~'+datum.p[1]+'~c'+(1000000-datum.count) + '~' + datum.key + '~'"},
                {"type": "collect", "sort": {"field": "cxcx"}},
                {
                    "type": "pie",
                    "field": "count",
                    "startAngle": 0,
                    // A full turn taken from the expression constant PI rather than a
                    // truncated decimal literal.
                    "endAngle": {"signal": "2 * PI"},
                    "as": ["startAngle", "endAngle"]
                },
                // Sector caption: 'key (count)', or just the count when the key is empty.
                {"type": "formula", "as": "label", "expr": "datum.key ? datum.key + ' ('+datum.count+')' : datum.count"},
                // The value the arc marks pass to the colour scale.
                {"type": "formula", "as": "label2", "expr": "datum.key"}
            ]
        }

    ],
    "scales": [
        { // Ordinal colour scale whose domain is the set of key1 values in key1Counts.
            "name": "color",
            "type": "ordinal",
            "domain": { "data": "key1Counts", "field": "key1" },
            "range": { "scheme": "category10" }
        }
    ],
    "marks": [
        { // Inner ring: first-level groups, drawn between 2/12 and 4/12 of 'size'.
            "type": "arc",
            "from": { "data": "arcs1" },
            "encode": {
                "enter": {
                    "x": { "signal": "width / 2" },
                    "y": { "signal": "height / 2" },
                    "startAngle": { "field": "startAngle" },
                    "endAngle": { "field": "endAngle" },
                    "innerRadius": { "signal": "size * 2 / 12" },
                    "outerRadius": { "signal": "size * 4 / 12" },
                    "fill": { "scale": "color", "field": "label2" },
                    "stroke": { "value": "#fff" }
                }
            }
        },
        { // Outer ring: second-level groups, drawn between 4/12 and 6/12 of 'size'.
            "type": "arc",
            "from": { "data": "arcs2" },
            "encode": {
                "enter": {
                    "x": { "signal": "width / 2" },
                    "y": { "signal": "height / 2" },
                    "startAngle": { "field": "startAngle" },
                    "endAngle": { "field": "endAngle" },
                    "innerRadius": { "signal": "size * 4 / 12" },
                    "outerRadius": { "signal": "size * 6 / 12" },
                    "fill": { "scale": "color", "field": "label2" },
                    "stroke": { "value": "#fff" }
                }
            }
        },
        { // Captions of the inner ring, centred on each sector's mid-angle at radius 3/12 of 'size'.
            "type": "text",
            "from": { "data": "arcs1" },
            "encode": {
                "enter": {
                    "text": { "field": "label" },
                    "fill": { "value": "black" },
                    "fontSize": { "value": 12 },
                    "fontWeight": { "value": "bold" },
                    "align": { "value": "center" },
                    "baseline": { "value": "middle" },
                    // x uses +sin and y uses -cos of the sector's mid-angle.
                    "x": {
                        "signal": "width / 2 + (size * 3 / 12) * sin((datum.startAngle + datum.endAngle) / 2)"
                    },
                    "y": {
                        "signal": "height / 2 - (size * 3 / 12) * cos((datum.startAngle + datum.endAngle) / 2)"
                    }
                }
            }
        },
        { // Captions of the outer ring, centred on each sector's mid-angle at radius 5/12 of 'size'.
            "type": "text",
            "from": { "data": "arcs2" },
            "encode": {
                "enter": {
                    "text": { "field": "label" },
                    "fill": { "value": "black" },
                    "fontSize": { "value": 12 },
                    "fontWeight": { "value": "bold" },
                    "align": { "value": "center" },
                    "baseline": { "value": "middle" },
                    // x uses +sin and y uses -cos of the sector's mid-angle.
                    "x": {
                        "signal": "width / 2 + (size * 5 / 12) * sin((datum.startAngle + datum.endAngle) / 2)"
                    },
                    "y": {
                        "signal": "height / 2 - (size * 5 / 12) * cos((datum.startAngle + datum.endAngle) / 2)"
                    }
                }
            }
        },
        { // One line of text per raw record ('<id> <keys>'), placed at x = id, y = id * 20.
            "type": "text",
            "from": { "data": "rawData" },
            "encode": {
                "enter": {
                    "text": { "signal": "datum.id + ' ' + datum.keys" },
                    "x": { "signal": "datum.id" },
                    "y": { "signal": "datum.id * 20" }
                }
            }
        }
    ]
}
	{
	"$schema": "https://vega.github.io/schema/vega/v4.json",
	"title": "Recursive Doughnut Chart by Most Common Keyword",
	"autosize": "pad",
	"signals": [
		// Chart dimensions are read once from the hosting container; the empty
		// "on" arrays declare no event handlers for these two signals.
		{
			"name": "width",
			"init": "containerSize()[0]",
			"on": []
		},
		{
			"name": "height",
			"init": "containerSize()[1]",
			"on": []
		},
		// The shorter of the two sides; the mark radii are expressed as fractions of it.
		{
			"name": "size",
			"init": "width > height ? height : width"
		}
	],
	"data": [
	{ // Inline sample input: one record per id, each with a (possibly empty) list of keys.
		"name": "rawData",
		"values": [
			{ "id": 1, "keys": ["a", "z"] },
			{ "id": 2, "keys": ["a", "c"] },
			{ "id": 3, "keys": ["z", "c"] },
			{ "id": 4, "keys": ["a", "z", "c"] },
			{ "id": 5, "keys": ["c"] },
			{ "id": 6, "keys": ["d", "z"] },
			{ "id": 7, "keys": [] }  // a record with no keys at all
		]
	},
	{ // Per-key popularity over the whole data set.
		// The sort keys used throughout this spec ('ix', 'cx', ...) are strings made of
		// '~'-terminated count/key parts, with the counts at a fixed width so that comparing
		// the strings alphabetically orders them by count:
		//   'i' + zero-padded count  - how many records of the current subset contain the key
		//   'c' + (1000000 - count)  - complement of the number of records actually assigned
		//                              to the key, so an ascending sort puts bigger groups first
		"name": "key1Counts",
		"source": "rawData",
		"transform": [
			// One row per (record, key) pair.
			{
				"type": "flatten",
				"fields": ["keys"],
				"as": ["key1"]
			},
			// count1 = number of flattened rows (i.e. records) that carry key1.
			{ "type": "aggregate", "groupby": ["key1"], "ops": ["count"], "as": ["count1"] },
			// ix = 'i' + the last five characters of the zero-prefixed count1 + '~' + key1 + '~'.
			{
				"type": "formula",
				"as": "ix",
				"expr": "'i' + slice('000000'+datum.count1, -5) + '~' + datum.key1 + '~'"
			}
		]
	},
	{ // One row per (record, key) pair, tagged with that key's popularity index 'ix'.
		"name": "key1CountsFlat",
		"source": "rawData",
		"transform": [
			{ "type": "flatten", "fields": ["keys"], "as": ["key1"] },
			// Copy 'ix' from the key1Counts row whose key1 matches this row's key1.
			// No "as" is given, so the value is stored under the name listed in "values".
			{
				"type": "lookup",
				"from": "key1Counts",
				"key": "key1",
				"fields": ["key1"],
				"values": ["ix"]
			}
		]
	},
	{ // For every record id, keep the flattened row whose 'ix' string is greatest,
		// i.e. the row of the record's most popular key.
		"name": "maxCount1Records",
		"source": "key1CountsFlat",
		"transform": [
			// 'mck1' holds the whole winning row; its fields are read as mck1.<field>.
			{ "type": "aggregate", "groupby": ["id"], "ops": ["argmax"], "fields": ["ix"], "as": ["mck1"] }
		]
	},
	{ // Every raw record, labelled with the 'ix' of its most popular key.
		// Records that have no row in maxCount1Records (here: the one with an empty key
		// list) receive the placeholder '~~' instead.
		"name": "everyIdKeysGrouped1",
		"source": "rawData",
		"transform": [
			{
				"type": "lookup",
				"from": "maxCount1Records",
				"key": "id",
				"fields": ["id"],
				"values": ["mck1.ix"],
				"as": ["ix"],
				"default": "~~"
			},
			// Writes ix back onto itself; the value is unchanged.
			{ "type": "formula", "expr": "datum.ix", "as": "ix" }
		]
	},
	{ // First (inner) ring: one sector per first-level group.
		// Counts the records in each 'ix' group, derives the 'cx' sort key from that count,
		// orders the groups by 'cx' and then lays them out as pie angles.
		"name": "arcs1",
		"source": "everyIdKeysGrouped1",
		"transform": [
			{
				"type": "aggregate",
				"groupby": ["ix"],
				"ops": ["count"],
				"as": ["count"]
			},
			// ix looks like 'i<count>~<key>~', so element 1 of the split is the key
			// (an empty string for the '~~' placeholder group).
			{"type": "formula", "as": "key", "expr": "split(datum.ix, '~')[1]"},
			// cx = 'c' + (1000000 - count) + '~' + key + '~'
			{"type": "formula", "as": "cx", "expr": "'c'+(1000000-datum.count) + '~' + datum.key + '~'"},
			{"type": "collect", "sort": {"field": "cx"}},
			{
				"type": "pie",
				"field": "count",
				"startAngle": 0,
				// A full turn taken from the expression constant PI rather than a
				// truncated decimal literal.
				"endAngle": {"signal": "2 * PI"},
				"as": ["startAngle", "endAngle"]
			},
			// Sector caption: 'key (count)', or just the count when the key is empty.
			{"type": "formula", "as": "label", "expr": "datum.key ? datum.key + ' ('+datum.count+')' : datum.count"},
			// The value the arc marks pass to the colour scale.
			{"type": "formula", "as": "label2", "expr": "datum.key"}
		]
	},
	{ // The records of everyIdKeysGrouped1 with the first-level sort key 'cx' of their
		// group attached, so that each record carries both 'ix' and 'cx'.
		"name": "everyIdKeysGroupedCx1",
		"source": "everyIdKeysGrouped1",
		"transform": [
			{
				"type": "lookup",
				"from": "arcs1",
				"key": "ix",
				"fields": ["ix"],
				"values": ["cx"],
				"as": ["cx"]
			}
		]
	},
	{ // Second-level popularity: inside each first-level group ('cx'), count how many
		// records contain each of the remaining keys.
		"name": "key2Counts",
		"source": "everyIdKeysGroupedCx1",
		"transform": [
			{ "type": "lookup", "fields": ["ix"], "from": "arcs1", "key": "ix", "values": ["count", "key", "cx"]},
			{ "type": "flatten", "fields": ["keys"], "as": ["key"] },
			// Drop the key that already defines the group: a row is kept only when
			// '~<key>~' does not occur in cx.
			{ "type": "filter", "expr": "indexof(datum.cx, '~' + datum.key + '~') < 0"},
			{
				"type": "aggregate",
				"groupby": ["cx", "key"],
				"ops": ["count"],
				"as": ["count"]
			},
			// cxix = cx + 'i' + the last five characters of the zero-prefixed count + '~' + key + '~'
			{"type": "formula", "as": "cxix", "expr": "datum.cx+'i'+slice('000000'+datum.count, -5) + '~' + datum.key + '~'"},
			// cxk2 = cx + key: the join key key2CountsFlat uses to find these rows.
			{"type": "formula", "as": "cxk2", "expr": "datum.cx + datum.key"}
		]
	},
	{ // One row per (record, key) pair, joined to the second-level counts of the record's group.
		"name": "key2CountsFlat",
		"source": "everyIdKeysGroupedCx1",
		"transform": [
			{ "type": "flatten", "fields": ["keys"], "as": ["key"] },
			// The same cx + key join key that key2Counts computes.
			{ "type": "formula", "expr": "datum.cx + datum.key", "as": "cxk2" },
			{
				"type": "lookup",
				"from": "key2Counts",
				"key": "cxk2",
				"fields": ["cxk2"],
				"values": ["count", "cxix"],
				"as": ["count", "cxix"]
			},
			// Keep only the rows for which the lookup produced a truthy count.
			{ "type": "filter", "expr": "datum.count" }
		]
	},
	{ // For every record id, keep the key2CountsFlat row with the greatest second-level count.
		// Unlike maxCount1Records, which compares the 'ix' string, this compares the numeric
		// 'count' field only.
		"name": "maxCount2Records",
		"source": "key2CountsFlat",
		"transform": [
			// 'mck2' holds the whole winning row; its fields are read as mck2.<field>.
			{ "type": "aggregate", "groupby": ["id"], "ops": ["argmax"], "fields": ["count"], "as": ["mck2"] }
		]
	},
	{ // Every record again, labelled with the 'cxix' of its best second-level key.
		// Records with no row in maxCount2Records get their own cx followed by '~~',
		// so they still show up in the outer ring as a sector without a key.
		"name": "everyIdKeysGrouped2",
		"source": "everyIdKeysGroupedCx1",
		"transform": [
			{
				"type": "lookup",
				"from": "maxCount2Records",
				"key": "id",
				"fields": ["id"],
				"values": ["mck2.cxix"],
				"as": ["cxix"],
				"default": "~~"
			},
			// Prefix the bare '~~' placeholder with the record's first-level cx.
			{ "type": "formula", "expr": "( datum.cxix === '~~') ? datum.cx + datum.cxix : datum.cxix", "as": "cxix" }
		]
	},
	{ // Second (outer) ring: one sector per second-level group.
		// Counts the records in each 'cxix' group and builds 'cxcx', a sort key that starts
		// with the first-level parts so sectors of the same first-level group sort together.
		"name": "arcs2",
		"source": "everyIdKeysGrouped2",
		"transform": [
			{
				"type": "aggregate",
				"groupby": ["cxix"],
				"ops": ["count"],
				"as": ["count"]
			},
			// p = the '~'-separated parts of cxix: [first-level 'c' part, first-level key,
			// second-level 'i' part, second-level key, ...].
			{"type": "formula", "as": "p", "expr": "split(datum.cxix, '~')"},
			// The second-level key; an empty string for the placeholder groups.
			{"type": "formula", "as": "key", "expr": "datum.p[3]"},
			// cxcx = p[0] + '~' + p[1] + '~c' + (1000000 - count) + '~' + key + '~'
			{"type": "formula", "as": "cxcx", "expr": "datum.p[0]+'~'+datum.p[1]+'~c'+(1000000-datum.count) + '~' + datum.key + '~'"},
			{"type": "collect", "sort": {"field": "cxcx"}},
			{
				"type": "pie",
				"field": "count",
				"startAngle": 0,
				// A full turn taken from the expression constant PI rather than a
				// truncated decimal literal.
				"endAngle": {"signal": "2 * PI"},
				"as": ["startAngle", "endAngle"]
			},
			// Sector caption: 'key (count)', or just the count when the key is empty.
			{"type": "formula", "as": "label", "expr": "datum.key ? datum.key + ' ('+datum.count+')' : datum.count"},
			// The value the arc marks pass to the colour scale.
			{"type": "formula", "as": "label2", "expr": "datum.key"}
		]
	}

	],
	"scales": [
		{ // Ordinal colour scale whose domain is the set of key1 values in key1Counts.
			"name": "color",
			"type": "ordinal",
			"domain": { "data": "key1Counts", "field": "key1" },
			"range": { "scheme": "category10" }
		}
	],
	"marks": [
		{ // Inner ring: first-level groups, drawn between 2/12 and 4/12 of 'size'.
			"type": "arc",
			"from": { "data": "arcs1" },
			"encode": {
				"enter": {
					"x": { "signal": "width / 2" },
					"y": { "signal": "height / 2" },
					"startAngle": { "field": "startAngle" },
					"endAngle": { "field": "endAngle" },
					"innerRadius": { "signal": "size * 2 / 12" },
					"outerRadius": { "signal": "size * 4 / 12" },
					"fill": { "scale": "color", "field": "label2" },
					"stroke": { "value": "#fff" }
				}
			}
		},
		{ // Outer ring: second-level groups, drawn between 4/12 and 6/12 of 'size'.
			"type": "arc",
			"from": { "data": "arcs2" },
			"encode": {
				"enter": {
					"x": { "signal": "width / 2" },
					"y": { "signal": "height / 2" },
					"startAngle": { "field": "startAngle" },
					"endAngle": { "field": "endAngle" },
					"innerRadius": { "signal": "size * 4 / 12" },
					"outerRadius": { "signal": "size * 6 / 12" },
					"fill": { "scale": "color", "field": "label2" },
					"stroke": { "value": "#fff" }
				}
			}
		},
		{ // Captions of the inner ring, centred on each sector's mid-angle at radius 3/12 of 'size'.
			"type": "text",
			"from": { "data": "arcs1" },
			"encode": {
				"enter": {
					"text": { "field": "label" },
					"fill": { "value": "black" },
					"fontSize": { "value": 12 },
					"fontWeight": { "value": "bold" },
					"align": { "value": "center" },
					"baseline": { "value": "middle" },
					// x uses +sin and y uses -cos of the sector's mid-angle.
					"x": {
						"signal": "width / 2 + (size * 3 / 12) * sin((datum.startAngle + datum.endAngle) / 2)"
					},
					"y": {
						"signal": "height / 2 - (size * 3 / 12) * cos((datum.startAngle + datum.endAngle) / 2)"
					}
				}
			}
		},
		{ // Captions of the outer ring, centred on each sector's mid-angle at radius 5/12 of 'size'.
			"type": "text",
			"from": { "data": "arcs2" },
			"encode": {
				"enter": {
					"text": { "field": "label" },
					"fill": { "value": "black" },
					"fontSize": { "value": 12 },
					"fontWeight": { "value": "bold" },
					"align": { "value": "center" },
					"baseline": { "value": "middle" },
					// x uses +sin and y uses -cos of the sector's mid-angle.
					"x": {
						"signal": "width / 2 + (size * 5 / 12) * sin((datum.startAngle + datum.endAngle) / 2)"
					},
					"y": {
						"signal": "height / 2 - (size * 5 / 12) * cos((datum.startAngle + datum.endAngle) / 2)"
					}
				}
			}
		},
		{ // One line of text per raw record ('<id> <keys>'), placed at x = id, y = id * 20.
			"type": "text",
			"from": { "data": "rawData" },
			"encode": {
				"enter": {
					"text": { "signal": "datum.id + ' ' + datum.keys" },
					"x": { "signal": "datum.id" },
					"y": { "signal": "datum.id * 20" }
				}
			}
		}
	]
	}