dustalov/Makefile

## lcc-cw-tg-changes.R
library(ggplot2)

# Colours keyed by the short algorithm labels that visualize() assigns.
palette <- c('CW'='#f768a1', 'TG'='#2c7fb8')

# Plot the mean number of changed labels per step, one line per algorithm,
# with error bars of one standard deviation around the mean.
#
# Args:
#   df:      data frame with `algorithm`, `step` and `changed` columns.
#   title:   legend title.
#   xbreaks: forwarded unchanged as `breaks` of the x scale.
#   ybreaks: forwarded unchanged as `breaks` of the y scale.
#
# Returns:
#   A ggplot object (the callers print() it).
visualize <- function(df, title='Algorithm', xbreaks=NULL, ybreaks=NULL) {
  # Shift the steps by one; the x scale below starts at 1.
  df$step <- df$step + 1

  df.agg <- aggregate(changed ~ algorithm + step, data=df, FUN=mean)

  # Same formula and data as above, so the rows line up positionally.
  df.agg.sd <- aggregate(changed ~ algorithm + step, data=df, FUN=sd)$changed
  # sd() is NA for a group with a single observation; draw no spread then.
  df.agg.sd[is.na(df.agg.sd)] <- 0

  # Keep the lower whisker non-negative.
  df.agg$ymin <- pmax(df.agg$changed - df.agg.sd, 0)
  df.agg$ymax <- df.agg$changed + df.agg.sd

  # Collapse parameterised names such as 'cw-10' into the palette keys.
  df.agg$algorithm <- sub('^cw-.*', 'CW', df.agg$algorithm)
  df.agg$algorithm <- sub('^tg-.*', 'TG', df.agg$algorithm)

  ggplot(data=df.agg, aes(x=step, y=changed, color=algorithm)) +
    scale_x_continuous('Step', limits=c(1, max(df.agg$step)), labels=scales::comma, breaks=xbreaks) +
    scale_y_continuous('# of labels changed', trans='log1p', labels=scales::comma, breaks=ybreaks) +
    scale_color_manual(title, values=palette) +
    geom_line() +
    geom_point() +
    # TRUE instead of T: T is an ordinary variable and can be reassigned.
    geom_errorbar(aes(ymin=ymin, ymax=ymax, colour=algorithm), width=.2, na.rm=TRUE, position=position_dodge(width=.3)) +
    theme(
      legend.position=c(.975, .975),
      legend.justification=c('right', 'top'),
      legend.margin=margin(6, 6, 6, 6),
      plot.margin=unit(c(0, 3, 0, 3), 'mm'),
      panel.background=element_blank(),
      panel.grid.major.x=element_blank(),
      panel.grid.minor.x=element_blank(),
      panel.grid.major.y=element_blank(),
      panel.grid.minor.y=element_blank(),
      text=element_text(size=10, family='Helvetica'),
      strip.text=element_text(size=10, family='Helvetica'),
      strip.background=element_blank()
    )
}

# One figure per corpus size: read the evaluation table, keep the two
# selected algorithm configurations, and print the resulting plot.

# 10K sentences
lcc.10k <- read.csv(file='eng_news_2016_10K-eval.tsv', sep='\t')
lcc.10k <- lcc.10k[is.element(lcc.10k$algorithm, c('cw-10', 'tg-5')),]
print(visualize(
  lcc.10k,
  title='eng_news_2016_10K',
  xbreaks=1:13,
  ybreaks=c(0, 1000, 5000)
))

# 100K sentences
lcc.100k <- read.csv(file='eng_news_2016_100K-eval.tsv', sep='\t')
lcc.100k <- lcc.100k[is.element(lcc.100k$algorithm, c('cw-50', 'tg-20')),]
print(visualize(
  lcc.100k,
  title='eng_news_2016_100K',
  xbreaks=c(1, seq(5, 40, 5), 47),
  ybreaks=c(0, 1000, 5000, 10000, 25000)
))

# 300K sentences
lcc.300k <- read.csv(file='eng_news_2016_300K-eval.tsv', sep='\t')
lcc.300k <- lcc.300k[is.element(lcc.300k$algorithm, c('cw-50', 'tg-10')),]
print(visualize(
  lcc.300k,
  title='eng_news_2016_300K',
  xbreaks=c(1, seq(5, 20, 5), 23),
  ybreaks=c(0, 1000, 5000, 10000, 50000)
))

# 1M sentences
lcc.1m <- read.csv(file='eng_news_2016_1M-eval.tsv', sep='\t')
lcc.1m <- lcc.1m[is.element(lcc.1m$algorithm, c('cw-50', 'tg-10')),]
print(visualize(
  lcc.1m,
  title='eng_news_2016_1M',
  xbreaks=c(1, seq(5, 25, 5), 28),
  ybreaks=c(0, 1000, 5000, 10000, 50000, 100000)
))

## lcc-cw-tg-counts.R
library(ggplot2)

# Fill colours keyed by the short algorithm labels assigned in visualize().
palette <- c('CW'='#f768a1', 'TG'='#2c7fb8')

# Bar chart of how many rows per algorithm and step have `changed == 0`.
#
# Args:
#   df:      data frame with `algorithm`, `step` and `changed` columns.
#   title:   name given to the fill scale (the legend itself is hidden).
#   xbreaks: forwarded unchanged as `breaks` of the x scale.
#   ybreaks: forwarded unchanged as `breaks` of the y scale.
#
# Returns:
#   A ggplot object (the callers print() it).
visualize <- function(df, title='Algorithm', xbreaks=NULL, ybreaks=NULL) {
  df$step <- df$step + 1

  # Only rows in which no label changed are counted.
  unchanged <- df[df$changed == 0,]
  counts <- aggregate(changed ~ algorithm + step, data=unchanged, FUN=length)

  # Collapse parameterised names such as 'cw-50' into the palette keys.
  counts$algorithm <- sub('^tg-.*', 'TG', sub('^cw-.*', 'CW', counts$algorithm))

  blank <- element_blank()
  helvetica <- element_text(size=10, family='Helvetica')

  ggplot(data=counts, aes(x=step, y=changed, fill=algorithm)) +
    geom_col(position=position_dodge2(preserve='total')) +
    scale_x_continuous('Step', breaks=xbreaks) +
    scale_y_continuous('Count', breaks=ybreaks) +
    scale_fill_manual(title, values=palette) +
    theme(
      legend.position='none',
      plot.margin=unit(c(0, 3, 0, 3), 'mm'),
      panel.background=blank,
      panel.grid.major.x=blank,
      panel.grid.minor.x=blank,
      panel.grid.major.y=blank,
      panel.grid.minor.y=blank,
      text=helvetica,
      strip.text=helvetica,
      strip.background=blank
    )
}

# One bar chart per corpus size: read the evaluation table, keep the
# 'cw-50' and 'tg-10' configurations, and print the resulting plot.

# 10K sentences
lcc.10k <- read.csv(file='eng_news_2016_10K-eval.tsv', sep='\t')
lcc.10k <- lcc.10k[is.element(lcc.10k$algorithm, c('cw-50', 'tg-10')),]
print(visualize(
  lcc.10k,
  title='eng_news_2016_10K',
  xbreaks=5:23,
  ybreaks=c(seq(0, 20, 5), 23)
))

# 100K sentences
lcc.100k <- read.csv(file='eng_news_2016_100K-eval.tsv', sep='\t')
lcc.100k <- lcc.100k[is.element(lcc.100k$algorithm, c('cw-50', 'tg-10')),]
print(visualize(
  lcc.100k,
  title='eng_news_2016_100K',
  xbreaks=11:47,
  ybreaks=seq(0, 10, 5)
))

# 300K sentences
lcc.300k <- read.csv(file='eng_news_2016_300K-eval.tsv', sep='\t')
lcc.300k <- lcc.300k[is.element(lcc.300k$algorithm, c('cw-50', 'tg-10')),]
print(visualize(
  lcc.300k,
  title='eng_news_2016_300K',
  xbreaks=10:23,
  ybreaks=c(seq(0, 10, 5), 12)
))

# 1M sentences
lcc.1m <- read.csv(file='eng_news_2016_1M-eval.tsv', sep='\t')
lcc.1m <- lcc.1m[is.element(lcc.1m$algorithm, c('cw-50', 'tg-10')),]
print(visualize(
  lcc.1m,
  title='eng_news_2016_1M',
  xbreaks=11:28,
  ybreaks=c(seq(0, 10, 5), 13)
))

## lcc_cw_tg_changes.groovy
#!/usr/bin/env groovy
import groovy.transform.CompileStatic
@Grab('org.jgrapht:jgrapht-core:1.3.1')
import org.jgrapht.Graph
import org.jgrapht.graph.SimpleWeightedGraph
import org.jgrapht.util.SupplierUtil
import org.nlpub.watset.graph.ChineseWhispers
import org.nlpub.watset.graph.Clustering
import org.nlpub.watset.graph.NodeWeighting
import org.nlpub.watset.graph.TelephoneGame

import java.nio.file.Paths
import java.nio.file.Files

/*
 * Copyright 2019 Dmitry Ustalov
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

// Pin the default locale to ROOT (presumably so the printf output further
// down does not depend on the user's locale).
Locale.setDefault(Locale.ROOT)

// Exactly one argument is expected: the co-occurrence file to read.
if (args.size() != 1) {
    // The usage text now names this script the way the Makefile invokes it.
    System.err.println('Usage: groovy -classpath watset.jar lcc_cw_tg_changes.groovy co_s.txt')
    System.exit(1)
}

path = Paths.get(args[0])

if (!Files.isRegularFile(path)) {
    System.err.println('Cannot open the input file')
    System.exit(2)
}

builder = SimpleWeightedGraph.createBuilder(SupplierUtil.createDefaultWeightedEdgeSupplier())

// Each line is tab-separated: columns 0 and 1 are parsed as integer vertex
// ids and column 3 as the edge weight; column 2 is not used.
path.withReader { reader ->
    reader.each { line ->
        tokens = line.split('\t', 4)
        (first, second) = [tokens[0] as int, tokens[1] as int]
        // Order the endpoints so that the smaller id always comes first.
        if (first > second) (first, second) = [second, first]
        builder.addVertices(first, second)
        builder.addEdge(first, second, tokens[3] as float)
    }
}

// Left undeclared on purpose: the experiment loop below reads `graph`.
graph = builder.build()

/**
 * ChineseWhispers subclass whose fit loop reports each iteration to a callback.
 *
 * NOTE(review): graph, labels, steps, iterations, random and step() are not
 * declared here; presumably they are inherited from ChineseWhispers - confirm.
 */
@CompileStatic
class CallbackChineseWhispers<V, E> extends ChineseWhispers<V, E> {
    /** Forwards all arguments to the ChineseWhispers constructor. */
    CallbackChineseWhispers(Graph<V, E> graph, NodeWeighting<V, E> weighting, int iterations, Random random) {
        super(graph, weighting, iterations, random)
    }

    /**
     * Runs the clustering loop and calls callback(steps, changed) after every iteration.
     *
     * @param callback closure invoked with the zero-based iteration counter and
     *                 the value returned by step() for that iteration
     * @return this object
     */
    Clustering<V> fit(Closure callback) {
        final List<V> nodes = new ArrayList<>(graph.vertexSet())

        labels = new HashMap<>(nodes.size())

        int i = 0

        // Every vertex starts with its own distinct integer label.
        for (final V node : graph.vertexSet()) {
            labels.put(node, i++)
        }

        for (steps = 0; steps < iterations; steps++) {
            // Visit the nodes in a fresh random order on every iteration.
            Collections.shuffle(nodes, random)

            // presumably the number of nodes whose label changed - confirm against step()
            int changed = step(nodes)
            callback(steps, changed)

            // Stop early once an iteration reports no change.
            if (changed == 0) break
        }

        return this
    }
}

/**
 * TelephoneGame subclass whose fit loop reports each iteration to a callback.
 *
 * NOTE(review): graph, labels, active, history, steps, notified, changed,
 * random and step() are not declared here; presumably they are inherited
 * from TelephoneGame - confirm.
 */
@CompileStatic
class CallbackTelephoneGame<V, E> extends TelephoneGame<V, E> {
    /** Forwards all arguments to the TelephoneGame constructor. */
    CallbackTelephoneGame(Graph<V, E> graph, NodeWeighting<V, E> weighting, int theta, Random random) {
        super(graph, weighting, theta, random)
    }

    /**
     * Runs the clustering loop and calls callback(steps, notified, changed)
     * after every iteration.
     *
     * @param callback closure invoked with the one-based iteration counter,
     *                 the notified counter and the value returned by step()
     * @return this object
     */
    Clustering<V> fit(Closure callback) {
        def nodes = new ArrayList<>(graph.vertexSet())

        labels = new HashMap<>(nodes.size())
        active = new HashMap<>(nodes.size())
        history = new HashMap<>(nodes.size())

        int i = 0

        // Every vertex starts with its own label, marked active, with an empty history.
        for (final V node : graph.vertexSet()) {
            labels.put(node, i++)
            active.put(node, true)
            history.put(node, new HashMap<>())
        }

        steps = 0

        // Keep iterating while at least one node is still flagged active.
        while (active.values().any()) {
            steps++

            // Reset before each pass; presumably updated inside step() - confirm.
            notified = 0

            Collections.shuffle(nodes, random)

            changed = step(nodes)

            callback(steps, notified, changed)
        }

        return this
    }
}

// Parameter values tried for both algorithms: the iteration limit for
// Chinese Whispers and theta for the Telephone Game.
final GRID = [5, 10, 20, 50, 100, 500]

// Header of the tab-separated table written to standard output.
println(['algorithm', 'seed', 'step', 'changed'].join('\t'))

// Fifty seeded repetitions (seeds 0..49); every repetition runs the whole
// grid for Chinese Whispers first and for the Telephone Game afterwards.
(0..<50).each { seed ->
    // Chinese Whispers
    for (iterations in GRID) {
        def history = []

        def cw = new CallbackChineseWhispers<>(graph, NodeWeighting.top(), iterations, new Random(seed))
        cw.fit { steps, changed -> history.add(changed) }

        // One output row per recorded iteration; the list index is the step.
        history.eachWithIndex { changed, step ->
            printf('%s-%d\t%d\t%d\t%d%n', 'cw', iterations, seed, step, changed)
        }
    }

    // Telephone Game
    for (theta in GRID) {
        def history = []

        def tg = new CallbackTelephoneGame<>(graph, NodeWeighting.top(), theta, new Random(seed))
        tg.fit { steps, _, changed -> history.add(changed) }

        history.eachWithIndex { changed, step ->
            printf('%s-%d\t%d\t%d\t%d%n', 'tg', theta, seed, step, changed)
        }
    }
}

## Makefile
# Jar exported as CLASSPATH for the Groovy script.
WATSET ?= ../watset-java/target/watset.jar
# Directory holding one sub-directory per corpus.
LCC ?= ../lcc

export LANG:=en_US.UTF-8
export LC_COLLATE:=C
export CLASSPATH := $(WATSET)

# Evaluation tables that can be built, one per corpus.
EVALS := eng_news_2016_10K-eval.tsv eng_news_2016_30K-eval.tsv \
         eng_news_2016_100K-eval.tsv eng_news_2016_300K-eval.tsv \
         eng_news_2016_1M-eval.tsv

# These targets never produce a file of the same name.
.PHONY: nodes edges clean

# Remove a half-written table when the Groovy run fails, so that the next
# `make` does not treat the truncated file as up to date.
.DELETE_ON_ERROR:

# Number of distinct node ids in the first two columns of each corpus file.
nodes:
	cut -f1,2 $(LCC)/eng_news_2016_10K/eng_news_2016_10K-co_s.txt | sed -re 's/\t/\n/g' | sort -u | wc -l
	cut -f1,2 $(LCC)/eng_news_2016_30K/eng_news_2016_30K-co_s.txt | sed -re 's/\t/\n/g' | sort -u | wc -l
	cut -f1,2 $(LCC)/eng_news_2016_100K/eng_news_2016_100K-co_s.txt | sed -re 's/\t/\n/g' | sort -u | wc -l
	cut -f1,2 $(LCC)/eng_news_2016_300K/eng_news_2016_300K-co_s.txt | sed -re 's/\t/\n/g' | sort -u | wc -l
	cut -f1,2 $(LCC)/eng_news_2016_1M/eng_news_2016_1M-co_s.txt | sed -re 's/\t/\n/g' | sort -u | wc -l

# Number of lines (one edge per line) in each corpus file.
edges:
	wc -l $(LCC)/eng_news_2016_10K/eng_news_2016_10K-co_s.txt
	wc -l $(LCC)/eng_news_2016_30K/eng_news_2016_30K-co_s.txt
	wc -l $(LCC)/eng_news_2016_100K/eng_news_2016_100K-co_s.txt
	wc -l $(LCC)/eng_news_2016_300K/eng_news_2016_300K-co_s.txt
	wc -l $(LCC)/eng_news_2016_1M/eng_news_2016_1M-co_s.txt

# Static pattern rule: the stem $* is the corpus name, e.g. eng_news_2016_10K.
$(EVALS): %-eval.tsv:
	nice groovy lcc_cw_tg_changes.groovy $(LCC)/$*/$*-co_s.txt > $@

clean:
	rm -fv *-eval.tsv
	library(ggplot2)

	# Colours keyed by the short algorithm labels that visualize() assigns.
	palette <- c('CW'='#f768a1', 'TG'='#2c7fb8')

	# Plot the mean number of changed labels per step, one line per algorithm,
	# with error bars of one standard deviation around the mean.
	#
	# Args:
	#   df:      data frame with `algorithm`, `step` and `changed` columns.
	#   title:   legend title.
	#   xbreaks: forwarded unchanged as `breaks` of the x scale.
	#   ybreaks: forwarded unchanged as `breaks` of the y scale.
	#
	# Returns:
	#   A ggplot object (the callers print() it).
	visualize <- function(df, title='Algorithm', xbreaks=NULL, ybreaks=NULL) {
	  # Shift the steps by one; the x scale below starts at 1.
	  df$step <- df$step + 1

	  df.agg <- aggregate(changed ~ algorithm + step, data=df, FUN=mean)

	  # Same formula and data as above, so the rows line up positionally.
	  df.agg.sd <- aggregate(changed ~ algorithm + step, data=df, FUN=sd)$changed
	  # sd() is NA for a group with a single observation; draw no spread then.
	  df.agg.sd[is.na(df.agg.sd)] <- 0

	  # Keep the lower whisker non-negative.
	  df.agg$ymin <- pmax(df.agg$changed - df.agg.sd, 0)
	  df.agg$ymax <- df.agg$changed + df.agg.sd

	  # Collapse parameterised names such as 'cw-10' into the palette keys.
	  df.agg$algorithm <- sub('^cw-.*', 'CW', df.agg$algorithm)
	  df.agg$algorithm <- sub('^tg-.*', 'TG', df.agg$algorithm)

	  ggplot(data=df.agg, aes(x=step, y=changed, color=algorithm)) +
	    scale_x_continuous('Step', limits=c(1, max(df.agg$step)), labels=scales::comma, breaks=xbreaks) +
	    scale_y_continuous('# of labels changed', trans='log1p', labels=scales::comma, breaks=ybreaks) +
	    scale_color_manual(title, values=palette) +
	    geom_line() +
	    geom_point() +
	    # TRUE instead of T: T is an ordinary variable and can be reassigned.
	    geom_errorbar(aes(ymin=ymin, ymax=ymax, colour=algorithm), width=.2, na.rm=TRUE, position=position_dodge(width=.3)) +
	    theme(
	      legend.position=c(.975, .975),
	      legend.justification=c('right', 'top'),
	      legend.margin=margin(6, 6, 6, 6),
	      plot.margin=unit(c(0, 3, 0, 3), 'mm'),
	      panel.background=element_blank(),
	      panel.grid.major.x=element_blank(),
	      panel.grid.minor.x=element_blank(),
	      panel.grid.major.y=element_blank(),
	      panel.grid.minor.y=element_blank(),
	      text=element_text(size=10, family='Helvetica'),
	      strip.text=element_text(size=10, family='Helvetica'),
	      strip.background=element_blank()
	    )
	}

	# One figure per corpus size: read the evaluation table, keep the two
	# selected algorithm configurations, and print the resulting plot.

	# 10K sentences
	lcc.10k <- read.csv(file='eng_news_2016_10K-eval.tsv', sep='\t')
	lcc.10k <- lcc.10k[is.element(lcc.10k$algorithm, c('cw-10', 'tg-5')),]
	print(visualize(
	  lcc.10k,
	  title='eng_news_2016_10K',
	  xbreaks=1:13,
	  ybreaks=c(0, 1000, 5000)
	))

	# 100K sentences
	lcc.100k <- read.csv(file='eng_news_2016_100K-eval.tsv', sep='\t')
	lcc.100k <- lcc.100k[is.element(lcc.100k$algorithm, c('cw-50', 'tg-20')),]
	print(visualize(
	  lcc.100k,
	  title='eng_news_2016_100K',
	  xbreaks=c(1, seq(5, 40, 5), 47),
	  ybreaks=c(0, 1000, 5000, 10000, 25000)
	))

	# 300K sentences
	lcc.300k <- read.csv(file='eng_news_2016_300K-eval.tsv', sep='\t')
	lcc.300k <- lcc.300k[is.element(lcc.300k$algorithm, c('cw-50', 'tg-10')),]
	print(visualize(
	  lcc.300k,
	  title='eng_news_2016_300K',
	  xbreaks=c(1, seq(5, 20, 5), 23),
	  ybreaks=c(0, 1000, 5000, 10000, 50000)
	))

	# 1M sentences
	lcc.1m <- read.csv(file='eng_news_2016_1M-eval.tsv', sep='\t')
	lcc.1m <- lcc.1m[is.element(lcc.1m$algorithm, c('cw-50', 'tg-10')),]
	print(visualize(
	  lcc.1m,
	  title='eng_news_2016_1M',
	  xbreaks=c(1, seq(5, 25, 5), 28),
	  ybreaks=c(0, 1000, 5000, 10000, 50000, 100000)
	))
	#!/usr/bin/env groovy
	import groovy.transform.CompileStatic
	@Grab('org.jgrapht:jgrapht-core:1.3.1')
	import org.jgrapht.Graph
	import org.jgrapht.graph.SimpleWeightedGraph
	import org.jgrapht.util.SupplierUtil
	import org.nlpub.watset.graph.ChineseWhispers
	import org.nlpub.watset.graph.Clustering
	import org.nlpub.watset.graph.NodeWeighting
	import org.nlpub.watset.graph.TelephoneGame

	import java.nio.file.Paths
	import java.nio.file.Files

	/*
	* Copyright 2019 Dmitry Ustalov
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*
	*/

	// Pin the default locale to ROOT (presumably so the printf output further
	// down does not depend on the user's locale).
	Locale.setDefault(Locale.ROOT)

	// Exactly one argument is expected: the co-occurrence file to read.
	if (args.size() != 1) {
	    // The usage text now names this script the way the Makefile invokes it.
	    System.err.println('Usage: groovy -classpath watset.jar lcc_cw_tg_changes.groovy co_s.txt')
	    System.exit(1)
	}

	path = Paths.get(args[0])

	if (!Files.isRegularFile(path)) {
	    System.err.println('Cannot open the input file')
	    System.exit(2)
	}

	builder = SimpleWeightedGraph.createBuilder(SupplierUtil.createDefaultWeightedEdgeSupplier())

	// Each line is tab-separated: columns 0 and 1 are parsed as integer vertex
	// ids and column 3 as the edge weight; column 2 is not used.
	path.withReader { reader ->
	    reader.each { line ->
	        tokens = line.split('\t', 4)
	        (first, second) = [tokens[0] as int, tokens[1] as int]
	        // Order the endpoints so that the smaller id always comes first.
	        if (first > second) (first, second) = [second, first]
	        builder.addVertices(first, second)
	        builder.addEdge(first, second, tokens[3] as float)
	    }
	}

	// Left undeclared on purpose: the experiment loop below reads `graph`.
	graph = builder.build()

	// ChineseWhispers subclass whose fit loop reports each iteration to a callback.
	// NOTE(review): graph, labels, steps, iterations, random and step() are not
	// declared here; presumably they are inherited from ChineseWhispers - confirm.
	@CompileStatic
	class CallbackChineseWhispers<V, E> extends ChineseWhispers<V, E> {
	// Forwards all arguments to the ChineseWhispers constructor.
	CallbackChineseWhispers(Graph<V, E> graph, NodeWeighting<V, E> weighting, int iterations, Random random) {
	super(graph, weighting, iterations, random)
	}

	// Runs the clustering loop, calls callback(steps, changed) after every
	// iteration (steps is zero-based) and returns this object.
	Clustering<V> fit(Closure callback) {
	final List<V> nodes = new ArrayList<>(graph.vertexSet())

	labels = new HashMap<>(nodes.size())

	int i = 0

	// Every vertex starts with its own distinct integer label.
	for (final V node : graph.vertexSet()) {
	labels.put(node, i++)
	}

	for (steps = 0; steps < iterations; steps++) {
	// Visit the nodes in a fresh random order on every iteration.
	Collections.shuffle(nodes, random)

	// presumably the number of nodes whose label changed - confirm against step()
	int changed = step(nodes)
	callback(steps, changed)

	// Stop early once an iteration reports no change.
	if (changed == 0) break
	}

	return this
	}
	}

	// TelephoneGame subclass whose fit loop reports each iteration to a callback.
	// NOTE(review): graph, labels, active, history, steps, notified, changed,
	// random and step() are not declared here; presumably they are inherited
	// from TelephoneGame - confirm.
	@CompileStatic
	class CallbackTelephoneGame<V, E> extends TelephoneGame<V, E> {
	// Forwards all arguments to the TelephoneGame constructor.
	CallbackTelephoneGame(Graph<V, E> graph, NodeWeighting<V, E> weighting, int theta, Random random) {
	super(graph, weighting, theta, random)
	}

	// Runs the clustering loop, calls callback(steps, notified, changed) after
	// every iteration (steps is one-based) and returns this object.
	Clustering<V> fit(Closure callback) {
	def nodes = new ArrayList<>(graph.vertexSet())

	labels = new HashMap<>(nodes.size())
	active = new HashMap<>(nodes.size())
	history = new HashMap<>(nodes.size())

	int i = 0

	// Every vertex starts with its own label, marked active, with an empty history.
	for (final V node : graph.vertexSet()) {
	labels.put(node, i++)
	active.put(node, true)
	history.put(node, new HashMap<>())
	}

	steps = 0

	// Keep iterating while at least one node is still flagged active.
	while (active.values().any()) {
	steps++

	// Reset before each pass; presumably updated inside step() - confirm.
	notified = 0

	Collections.shuffle(nodes, random)

	changed = step(nodes)

	callback(steps, notified, changed)
	}

	return this
	}
	}

	// Parameter values tried for both algorithms: the iteration limit for
	// Chinese Whispers and theta for the Telephone Game.
	final GRID = [5, 10, 20, 50, 100, 500]

	// Header of the tab-separated table written to standard output.
	println(['algorithm', 'seed', 'step', 'changed'].join('\t'))

	// Fifty seeded repetitions (seeds 0..49); every repetition runs the whole
	// grid for Chinese Whispers first and for the Telephone Game afterwards.
	(0..<50).each { seed ->
	    // Chinese Whispers
	    for (iterations in GRID) {
	        def history = []

	        def cw = new CallbackChineseWhispers<>(graph, NodeWeighting.top(), iterations, new Random(seed))
	        cw.fit { steps, changed -> history.add(changed) }

	        // One output row per recorded iteration; the list index is the step.
	        history.eachWithIndex { changed, step ->
	            printf('%s-%d\t%d\t%d\t%d%n', 'cw', iterations, seed, step, changed)
	        }
	    }

	    // Telephone Game
	    for (theta in GRID) {
	        def history = []

	        def tg = new CallbackTelephoneGame<>(graph, NodeWeighting.top(), theta, new Random(seed))
	        tg.fit { steps, _, changed -> history.add(changed) }

	        history.eachWithIndex { changed, step ->
	            printf('%s-%d\t%d\t%d\t%d%n', 'tg', theta, seed, step, changed)
	        }
	    }
	}
	# Jar exported as CLASSPATH for the Groovy script.
	WATSET ?= ../watset-java/target/watset.jar
	# Directory holding one sub-directory per corpus.
	LCC ?= ../lcc

	export LANG:=en_US.UTF-8
	export LC_COLLATE:=C
	export CLASSPATH := $(WATSET)

	# Evaluation tables that can be built, one per corpus.
	EVALS := eng_news_2016_10K-eval.tsv eng_news_2016_30K-eval.tsv \
	         eng_news_2016_100K-eval.tsv eng_news_2016_300K-eval.tsv \
	         eng_news_2016_1M-eval.tsv

	# These targets never produce a file of the same name.
	.PHONY: nodes edges clean

	# Remove a half-written table when the Groovy run fails, so that the next
	# `make` does not treat the truncated file as up to date.
	.DELETE_ON_ERROR:

	# Number of distinct node ids in the first two columns of each corpus file.
	# The pipes are plain `|`; an escaped `\|` would reach `cut` as an argument.
	nodes:
		cut -f1,2 $(LCC)/eng_news_2016_10K/eng_news_2016_10K-co_s.txt | sed -re 's/\t/\n/g' | sort -u | wc -l
		cut -f1,2 $(LCC)/eng_news_2016_30K/eng_news_2016_30K-co_s.txt | sed -re 's/\t/\n/g' | sort -u | wc -l
		cut -f1,2 $(LCC)/eng_news_2016_100K/eng_news_2016_100K-co_s.txt | sed -re 's/\t/\n/g' | sort -u | wc -l
		cut -f1,2 $(LCC)/eng_news_2016_300K/eng_news_2016_300K-co_s.txt | sed -re 's/\t/\n/g' | sort -u | wc -l
		cut -f1,2 $(LCC)/eng_news_2016_1M/eng_news_2016_1M-co_s.txt | sed -re 's/\t/\n/g' | sort -u | wc -l

	# Number of lines (one edge per line) in each corpus file.
	edges:
		wc -l $(LCC)/eng_news_2016_10K/eng_news_2016_10K-co_s.txt
		wc -l $(LCC)/eng_news_2016_30K/eng_news_2016_30K-co_s.txt
		wc -l $(LCC)/eng_news_2016_100K/eng_news_2016_100K-co_s.txt
		wc -l $(LCC)/eng_news_2016_300K/eng_news_2016_300K-co_s.txt
		wc -l $(LCC)/eng_news_2016_1M/eng_news_2016_1M-co_s.txt

	# Static pattern rule: the stem $* is the corpus name, e.g. eng_news_2016_10K.
	$(EVALS): %-eval.tsv:
		nice groovy lcc_cw_tg_changes.groovy $(LCC)/$*/$*-co_s.txt > $@

	clean:
		rm -fv *-eval.tsv