buhrmann/venn.hs

## venn.hs
# Assuming a (long) dataset of keywords with associated author information,
# with at least one row per (keyword x author), this creates one cluster of
# keywords for each combination of authors. E.g. if there are 3 different
# authors (A, B, C), there will be up to 7 clusters: A, B, C, A ∧ B, A ∧ C,
# B ∧ C, A ∧ B ∧ C. These are essentially all the subgroups in a Venn diagram
# of A, B and C.

# If the dataset is already aggregated (one row per keyword, with all authors
# having used the keyword in a corresponding list, i.e. a multivalued
# category), this step isn't necessary.

# Collapse the dataset to one row per keyword: rows are presorted by "author",
# and the unique "author" values of each keyword are collected into the new
# column "authors". (The presort presumably keeps the author order consistent
# across keywords -- confirm.)
aggregate(ds, {
    "by": "keyword",
    "presort": {"columns": ["author"]},
    "aggregations": {
        "author": {"authors": {"func": "unique"}}
    }
}) => (ds)

# Join the values of ds.authors with " ∧ " into a single category label,
# written to ds.title.
concatenate(ds.authors, {"separator": " ∧ ", "out_type": "category"}) -> (ds.title)

# Compute layout coordinates (ds.x, ds.y) from the "title" and "authors"
# columns. Supervised UMAP is used to better separate the groups: "title" is
# passed as the target.
layout_dataset(ds[["title", "authors"]], {
    "metric": "dice",
    "target": "title",
    "n_neighbors": 500,
    "min_dist": 1.99,
    "spread": 2,
    "random_state": 2
}) -> (ds.x, ds.y)
	# Assuming a (long) dataset of keywords with associated author information,
	# with at least one row per (keyword x author), this creates one cluster of
	# keywords for each combination of authors. E.g. if there are 3 different
	# authors (A, B, C), there will be up to 7 clusters: A, B, C, A ∧ B, A ∧ C,
	# B ∧ C, A ∧ B ∧ C. These are essentially all the subgroups in a Venn diagram
	# of A, B and C.

	# If the dataset is already aggregated (one row per keyword, with all authors
	# having used the keyword in a corresponding list, i.e. a multivalued
	# category), this step isn't necessary.

	# Collapse the dataset to one row per keyword: rows are presorted by "author",
	# and the unique "author" values of each keyword are collected into the new
	# column "authors".
	# NOTE(review): this repeats the aggregate step earlier in the file; after
	# that first run `ds` presumably no longer has an "author" column -- confirm
	# whether this second copy is intentional.
	aggregate(ds, {
	    "by": "keyword",
	    "presort": {"columns": ["author"]},
	    "aggregations": {
	        "author": {"authors": {"func": "unique"}}
	    }
	}) => (ds)

	# Join the values of ds.authors with " ∧ " into a single category label,
	# written to ds.title.
	concatenate(ds.authors, {"separator": " ∧ ", "out_type": "category"}) -> (ds.title)

	# Compute layout coordinates (ds.x, ds.y) from the "title" and "authors"
	# columns. Supervised UMAP is used to better separate the groups: "title" is
	# passed as the target.
	layout_dataset(ds[["title", "authors"]], {
	    "metric": "dice",
	    "target": "title",
	    "n_neighbors": 500,
	    "min_dist": 1.99,
	    "spread": 2,
	    "random_state": 2
	}) -> (ds.x, ds.y)