Skip to content

Instantly share code, notes, and snippets.

@dustalov
Last active August 31, 2019 21:31
Show Gist options
  • Save dustalov/773b6e7e5a462895d1bacb9a91386068 to your computer and use it in GitHub Desktop.
Save dustalov/773b6e7e5a462895d1bacb9a91386068 to your computer and use it in GitHub Desktop.
Chinese Whispers and Telephone Game Performance Evaluation
library(ggplot2)
# Plot colour of each algorithm family, keyed by the label used in the legend.
palette <- c(CW='#f768a1', TG='#2c7fb8')
# Plot the mean number of changed labels per step for each algorithm family,
# with error bars of one standard deviation around the mean.
#
# Arguments:
#   df      - data frame with the columns `algorithm`, `step` and `changed`.
#   title   - title of the colour legend.
#   xbreaks - breaks of the x axis, handed to scale_x_continuous(); note that
#             the NULL default makes ggplot2 draw no breaks at all.
#   ybreaks - breaks of the y axis, handed to scale_y_continuous(); NULL has
#             the same meaning as for `xbreaks`.
#
# Returns the ggplot object; the caller is responsible for printing it.
# Colours are taken from the file-level `palette` vector.
visualize <- function(df, title='Algorithm', xbreaks=NULL, ybreaks=NULL) {
  stopifnot(
    is.data.frame(df),
    all(c('algorithm', 'step', 'changed') %in% names(df)))

  # Shift the step index by one so that the axis starts at 1.
  df$step <- df$step + 1

  # Mean and standard deviation of `changed` per (algorithm, step) pair. Both
  # calls share the same formula and data, so their rows line up.
  df.agg <- aggregate(changed ~ algorithm + step, data=df, FUN=mean)
  df.agg.sd <- aggregate(changed ~ algorithm + step, data=df, FUN=sd)$changed

  # sd() yields NA for a group with a single observation: show no spread.
  df.agg.sd[is.na(df.agg.sd)] <- 0

  # Error bars span mean +/- sd; the lower end is clamped at zero.
  df.agg$ymin <- pmax(df.agg$changed - df.agg.sd, 0)
  df.agg$ymax <- df.agg$changed + df.agg.sd

  # Collapse the parameterised names ('cw-10', 'tg-5', ...) into the family
  # labels that key `palette`.
  df.agg$algorithm <- sub('^cw-.*', 'CW', df.agg$algorithm)
  df.agg$algorithm <- sub('^tg-.*', 'TG', df.agg$algorithm)

  # Numeric `legend.position` is deprecated since ggplot2 3.5.0 in favour of
  # `legend.position.inside`; pick the spelling the installed version expects.
  legend.theme <- if (utils::packageVersion('ggplot2') >= '3.5.0') {
    theme(legend.position='inside', legend.position.inside=c(.975, .975))
  } else {
    theme(legend.position=c(.975, .975))
  }

  ggplot(data=df.agg, aes(x=step, y=changed, color=algorithm)) +
    scale_x_continuous('Step', limits=c(1, max(df.agg$step)), labels=scales::comma, breaks=xbreaks) +
    scale_y_continuous('# of labels changed', trans='log1p', labels=scales::comma, breaks=ybreaks) +
    scale_color_manual(title, values=palette) +
    geom_line() +
    geom_point() +
    # NOTE(review): only the error bars are dodged, so they sit slightly to
    # the side of their points; kept as in the original figure.
    geom_errorbar(aes(ymin=ymin, ymax=ymax, colour=algorithm), width=.2, na.rm=TRUE, position=position_dodge(width=.3)) +
    theme(
      legend.justification=c('right', 'top'),
      legend.margin=margin(6, 6, 6, 6),
      plot.margin=unit(c(0, 3, 0, 3), 'mm'),
      panel.background=element_blank(),
      panel.grid.major.x=element_blank(),
      panel.grid.minor.x=element_blank(),
      panel.grid.major.y=element_blank(),
      panel.grid.minor.y=element_blank(),
      text=element_text(size=10, family='Helvetica'),
      strip.text=element_text(size=10, family='Helvetica'),
      strip.background=element_blank()
    ) +
    legend.theme
}
# For every dataset: load the evaluation log, keep one CW and one TG
# configuration, and print the plot of changed labels per step.

# eng_news_2016_10K: 'cw-10' against 'tg-5'.
lcc.10k <- subset(
  read.delim('eng_news_2016_10K-eval.tsv'),
  algorithm %in% c('cw-10', 'tg-5'))
print(visualize(
  lcc.10k,
  title='eng_news_2016_10K',
  xbreaks=1:13,
  ybreaks=c(0, 1000, 5000)))

# eng_news_2016_100K: 'cw-50' against 'tg-20'.
lcc.100k <- subset(
  read.delim('eng_news_2016_100K-eval.tsv'),
  algorithm %in% c('cw-50', 'tg-20'))
print(visualize(
  lcc.100k,
  title='eng_news_2016_100K',
  xbreaks=c(1, seq(5, 40, 5), 47),
  ybreaks=c(0, 1000, 5000, 10000, 25000)))

# eng_news_2016_300K: 'cw-50' against 'tg-10'.
lcc.300k <- subset(
  read.delim('eng_news_2016_300K-eval.tsv'),
  algorithm %in% c('cw-50', 'tg-10'))
print(visualize(
  lcc.300k,
  title='eng_news_2016_300K',
  xbreaks=c(1, seq(5, 20, 5), 23),
  ybreaks=c(0, 1000, 5000, 10000, 50000)))

# eng_news_2016_1M: 'cw-50' against 'tg-10'.
lcc.1m <- subset(
  read.delim('eng_news_2016_1M-eval.tsv'),
  algorithm %in% c('cw-50', 'tg-10'))
print(visualize(
  lcc.1m,
  title='eng_news_2016_1M',
  xbreaks=c(1, seq(5, 25, 5), 28),
  ybreaks=c(0, 1000, 5000, 10000, 50000, 100000)))
library(ggplot2)
# Fill colour of each algorithm family, keyed by the family label.
palette <- c(CW='#f768a1', TG='#2c7fb8')
# Draw a dodged bar chart of how many rows report zero changed labels at each
# step, separately for every algorithm family.
#
# Arguments:
#   df      - data frame with the columns `algorithm`, `step` and `changed`.
#   title   - name given to the fill scale (the legend itself is hidden).
#   xbreaks - breaks of the x axis; NULL makes ggplot2 draw no breaks.
#   ybreaks - breaks of the y axis; NULL makes ggplot2 draw no breaks.
#
# Returns the ggplot object. Fill colours come from the file-level `palette`.
visualize <- function(df, title='Algorithm', xbreaks=NULL, ybreaks=NULL) {
  # Shift the step index by one so that the axis starts at 1.
  df$step <- df$step + 1

  # Keep the rows where nothing changed and count them per (algorithm, step).
  settled <- df[df$changed == 0,]
  counts <- aggregate(changed ~ algorithm + step, data=settled, FUN=length)

  # Collapse 'cw-*' / 'tg-*' into the family labels that key `palette`.
  counts$algorithm <- sub('^tg-.*', 'TG', sub('^cw-.*', 'CW', counts$algorithm))

  # Bare look: no legend, no panel background, no grid lines.
  bare.theme <- theme(
    legend.position='none',
    plot.margin=unit(c(0, 3, 0, 3), 'mm'),
    panel.background=element_blank(),
    panel.grid.major.x=element_blank(),
    panel.grid.minor.x=element_blank(),
    panel.grid.major.y=element_blank(),
    panel.grid.minor.y=element_blank(),
    text=element_text(size=10, family='Helvetica'),
    strip.text=element_text(size=10, family='Helvetica'),
    strip.background=element_blank()
  )

  ggplot(data=counts, aes(x=step, y=changed, fill=algorithm)) +
    geom_col(position=position_dodge2(preserve='total')) +
    scale_x_continuous('Step', breaks=xbreaks) +
    scale_y_continuous('Count', breaks=ybreaks) +
    scale_fill_manual(title, values=palette) +
    bare.theme
}
# For every dataset: load the evaluation log, keep the 'cw-50' and 'tg-10'
# configurations, and print the bar chart of zero-change steps.

lcc.10k <- subset(
  read.delim('eng_news_2016_10K-eval.tsv'),
  algorithm %in% c('cw-50', 'tg-10'))
print(visualize(
  lcc.10k,
  title='eng_news_2016_10K',
  xbreaks=seq(5, 23),
  ybreaks=c(seq(0, 20, 5), 23)))

lcc.100k <- subset(
  read.delim('eng_news_2016_100K-eval.tsv'),
  algorithm %in% c('cw-50', 'tg-10'))
print(visualize(
  lcc.100k,
  title='eng_news_2016_100K',
  xbreaks=seq(11, 47),
  ybreaks=c(seq(0, 10, 5))))

lcc.300k <- subset(
  read.delim('eng_news_2016_300K-eval.tsv'),
  algorithm %in% c('cw-50', 'tg-10'))
print(visualize(
  lcc.300k,
  title='eng_news_2016_300K',
  xbreaks=seq(10, 23),
  ybreaks=c(seq(0, 10, 5), 12)))

lcc.1m <- subset(
  read.delim('eng_news_2016_1M-eval.tsv'),
  algorithm %in% c('cw-50', 'tg-10'))
print(visualize(
  lcc.1m,
  title='eng_news_2016_1M',
  xbreaks=seq(11, 28),
  ybreaks=c(seq(0, 10, 5), 13)))
#!/usr/bin/env groovy
import groovy.transform.CompileStatic
@Grab('org.jgrapht:jgrapht-core:1.3.1')
import org.jgrapht.Graph
import org.jgrapht.graph.SimpleWeightedGraph
import org.jgrapht.util.SupplierUtil
import org.nlpub.watset.graph.ChineseWhispers
import org.nlpub.watset.graph.Clustering
import org.nlpub.watset.graph.NodeWeighting
import org.nlpub.watset.graph.TelephoneGame
import java.nio.file.Paths
import java.nio.file.Files
/*
* Copyright 2019 Dmitry Ustalov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
// Pin the default locale so that formatted output does not depend on the
// environment the script runs in.
Locale.setDefault(Locale.ROOT)

// Exactly one argument is expected: the path to the co-occurrence file.
if (args.size() != 1) {
    // The script name matches the one the accompanying Makefile invokes.
    System.err.println('Usage: groovy -classpath watset.jar lcc_cw_tg_changes.groovy co_s.txt')
    System.exit(1)
}

path = Paths.get(args[0])

if (!Files.isRegularFile(path)) {
    System.err.println('Cannot open the input file')
    System.exit(2)
}

// Build an undirected weighted graph from the tab-separated input: fields 0
// and 1 are the integer node identifiers and field 3 is used as edge weight.
builder = SimpleWeightedGraph.createBuilder(SupplierUtil.createDefaultWeightedEdgeSupplier())

path.withReader { reader ->
    reader.eachLine { line ->
        // Local variables: nothing here needs to leak into the script binding.
        def tokens = line.split('\t', 4)
        def (first, second) = [tokens[0] as int, tokens[1] as int]

        // Always add the edge with the smaller identifier first.
        if (first > second) (first, second) = [second, first]

        builder.addVertices(first, second)
        builder.addEdge(first, second, tokens[3] as float)
    }
}

// Stays a script-binding variable: the experiment loop further down reads it.
graph = builder.build()
/**
 * Chinese Whispers whose fitting loop reports every step to a callback.
 */
@CompileStatic
class CallbackChineseWhispers<V, E> extends ChineseWhispers<V, E> {
    CallbackChineseWhispers(Graph<V, E> graph, NodeWeighting<V, E> weighting, int iterations, Random random) {
        super(graph, weighting, iterations, random)
    }

    /**
     * Runs the clustering for at most {@code iterations} steps, stopping early
     * after the first step that changes nothing.
     *
     * @param callback invoked after every step with the step index (starting
     *                 at 0) and the value returned by {@code step}
     * @return this instance
     */
    Clustering<V> fit(Closure callback) {
        final List<V> order = new ArrayList<>(graph.vertexSet())

        // Give every vertex its own initial label: 0, 1, 2, ...
        labels = new HashMap<>(order.size())
        int nextLabel = 0
        for (final V vertex : graph.vertexSet()) {
            labels.put(vertex, nextLabel)
            nextLabel++
        }

        steps = 0
        while (steps < iterations) {
            // Visit the vertices in a fresh random order on every step.
            Collections.shuffle(order, random)
            final int changed = step(order)
            callback.call(steps, changed)
            if (changed == 0) break
            steps++
        }

        return this
    }
}
/**
 * Telephone Game whose fitting loop reports every step to a callback.
 *
 * NOTE(review): labels, active, history, steps, notified and changed are
 * assigned here without a local declaration, so they are presumably members
 * inherited from TelephoneGame - confirm against that class.
 */
@CompileStatic
class CallbackTelephoneGame<V, E> extends TelephoneGame<V, E> {
CallbackTelephoneGame(Graph<V, E> graph, NodeWeighting<V, E> weighting, int theta, Random random) {
super(graph, weighting, theta, random)
}
/**
 * Runs steps for as long as at least one vertex is flagged active.
 *
 * @param callback invoked after every step with the step counter (starting
 *                 at 1), the current value of notified and the value
 *                 returned by step
 * @return this instance
 */
Clustering<V> fit(Closure callback) {
def nodes = new ArrayList<>(graph.vertexSet())
labels = new HashMap<>(nodes.size())
active = new HashMap<>(nodes.size())
history = new HashMap<>(nodes.size())
int i = 0
// Every vertex starts with its own label, flagged active, with an empty history.
for (final V node : graph.vertexSet()) {
labels.put(node, i++)
active.put(node, true)
history.put(node, new HashMap<>())
}
steps = 0
// Unlike the Chinese Whispers wrapper there is no iteration cap in this loop.
while (active.values().any()) {
steps++
// Reset before the step; presumably step() updates it - TODO confirm.
notified = 0
Collections.shuffle(nodes, random)
changed = step(nodes)
callback(steps, notified, changed)
}
return this
}
}
// Parameter values tried for both algorithms: the iteration cap of Chinese
// Whispers and theta of the Telephone Game.
final GRID = [5, 10, 20, 50, 100, 500]

// Writes one TSV row per recorded step of a single run.
def emit = { name, parameter, seed, log ->
    log.eachWithIndex { changed, step ->
        printf('%s-%d\t%d\t%d\t%d%n', name, parameter, seed, step, changed)
    }
}

// Header of the TSV written to standard output.
println(['algorithm', 'seed', 'step', 'changed'].join('\t'))

// Repeat every configuration with the seeds 0..49.
for (seed in 0..<50) {
    // Chinese Whispers
    for (iterations in GRID) {
        def log = []
        def cw = new CallbackChineseWhispers<>(graph, NodeWeighting.top(), iterations, new Random(seed))
        cw.fit { steps, changed -> log << changed }
        emit('cw', iterations, seed, log)
    }

    // Telephone Game
    for (theta in GRID) {
        def log = []
        def tg = new CallbackTelephoneGame<>(graph, NodeWeighting.top(), theta, new Random(seed))
        tg.fit { steps, notified, changed -> log << changed }
        emit('tg', theta, seed, log)
    }
}
# Location of the Watset JAR; `?=` lets the environment or command line override it.
WATSET ?= ../watset-java/target/watset.jar
# Directory holding the eng_news_2016_* dataset directories; overridable as well.
LCC ?= ../lcc
# Exported to every recipe. NOTE(review): LC_COLLATE=C presumably keeps the
# `sort -u` in the `nodes` target byte-ordered - confirm.
export LANG:=en_US.UTF-8
export LC_COLLATE:=C
# Exported so that the `groovy` invocations below can find the Watset classes.
export CLASSPATH := $(WATSET)
# Print the number of distinct node identifiers (columns 1 and 2) found in each
# co-occurrence file. Declared phony so that a file named `nodes` cannot mask
# the target; `tr` replaces the GNU-only `sed -re 's/\t/\n/g'`.
.PHONY: nodes
nodes:
	cut -f1,2 $(LCC)/eng_news_2016_10K/eng_news_2016_10K-co_s.txt | tr '\t' '\n' | sort -u | wc -l
	cut -f1,2 $(LCC)/eng_news_2016_30K/eng_news_2016_30K-co_s.txt | tr '\t' '\n' | sort -u | wc -l
	cut -f1,2 $(LCC)/eng_news_2016_100K/eng_news_2016_100K-co_s.txt | tr '\t' '\n' | sort -u | wc -l
	cut -f1,2 $(LCC)/eng_news_2016_300K/eng_news_2016_300K-co_s.txt | tr '\t' '\n' | sort -u | wc -l
	cut -f1,2 $(LCC)/eng_news_2016_1M/eng_news_2016_1M-co_s.txt | tr '\t' '\n' | sort -u | wc -l
# Print the number of lines (one edge per line) of each co-occurrence file.
# Declared phony so that a file named `edges` cannot mask the target.
.PHONY: edges
edges:
	wc -l $(LCC)/eng_news_2016_10K/eng_news_2016_10K-co_s.txt
	wc -l $(LCC)/eng_news_2016_30K/eng_news_2016_30K-co_s.txt
	wc -l $(LCC)/eng_news_2016_100K/eng_news_2016_100K-co_s.txt
	wc -l $(LCC)/eng_news_2016_300K/eng_news_2016_300K-co_s.txt
	wc -l $(LCC)/eng_news_2016_1M/eng_news_2016_1M-co_s.txt
# Evaluation logs, one per dataset. All of them are produced by the same
# recipe, so a single rule with several targets replaces five copies of it.
EVAL_TSVS := eng_news_2016_10K-eval.tsv \
	eng_news_2016_30K-eval.tsv \
	eng_news_2016_100K-eval.tsv \
	eng_news_2016_300K-eval.tsv \
	eng_news_2016_1M-eval.tsv

# The recipe redirects into the target, so a failed run would otherwise leave a
# truncated file that make considers up to date.
.DELETE_ON_ERROR:

# `NAME-eval.tsv` is computed from `$(LCC)/NAME/NAME-co_s.txt`.
$(EVAL_TSVS):
	nice groovy lcc_cw_tg_changes.groovy $(LCC)/$(patsubst %-eval.tsv,%,$@)/$(patsubst %-eval.tsv,%-co_s.txt,$@) > $@
# Remove every generated evaluation log. Declared phony so that a file named
# `clean` cannot mask the target.
.PHONY: clean
clean:
	rm -fv *-eval.tsv
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment