Skip to content

Instantly share code, notes, and snippets.

@dustalov
Last active June 23, 2019 21:44
Show Gist options
  • Save dustalov/2c1340b972fd28f8cad30e45adf24178 to your computer and use it in GitHub Desktop.
Save dustalov/2c1340b972fd28f8cad30e45adf24178 to your computer and use it in GitHub Desktop.
Watset (Java) Performance Measurement
#!/usr/bin/env groovy
import org.apache.commons.math3.stat.descriptive.moment.Mean
import org.apache.commons.math3.stat.descriptive.moment.StandardDeviation
import org.jgrapht.graph.SimpleWeightedGraph
import org.jgrapht.util.SupplierUtil
import org.nlpub.watset.graph.ChineseWhispers
import org.nlpub.watset.graph.NodeWeighting
import org.nlpub.watset.graph.MaxMax
import org.nlpub.watset.eval.Measurer
import org.nlpub.watset.graph.Watset
import java.nio.file.Paths
import java.util.concurrent.ForkJoinPool
import java.util.logging.Level
import java.util.logging.LogManager
import java.util.logging.Logger
/*
* Copyright 2018 Dmitry Ustalov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
// Pin the default locale so locale-sensitive formatting is the same on every host.
Locale.setDefault(Locale.ROOT)

// Command line: -s (silent) and -p (parallel); a failed parse ends the script with status 1.
def cli = new CliBuilder(usage: 'collocation.groovy [-s] [-p]')
cli.s('silent')
cli.p('parallel')
def options = cli.parse(args) ?: System.exit(1)

logger = Logger.getLogger('Collocation')

if (options.s) {
    // Silent mode: raise the level of every handler on the root logger to WARNING.
    for (handler in LogManager.getLogManager().getLogger('').getHandlers()) {
        handler.setLevel(Level.WARNING)
    }
}

if (!options.p) {
    // This is a very important bit that effectively disables stream parallelism.
    System.setProperty('java.util.concurrent.ForkJoinPool.common.parallelism', '1')
    assert ForkJoinPool.commonPool().getParallelism() == 1
} else {
    logger.info(String.format('Parallelism level is %d.', ForkJoinPool.commonPool().getParallelism()))
}

// The collocation file is the first positional argument; without it there is nothing to do.
def positional = options.arguments()
if (!positional) {
    logger.warning('No collocation file provided.')
    System.exit(2)
}
// Builder for the SimpleWeightedGraph that receives one edge per input line.
builder = SimpleWeightedGraph.createBuilder(SupplierUtil.createDefaultWeightedEdgeSupplier())

// Input lines are tab-separated (split into at most four fields):
// fields 0 and 1 are integer vertex ids, field 3 is the edge weight.
Paths.get(options.arguments()[0]).withReader { reader ->
    reader.eachLine { line ->
        def fields = line.split('\t', 4)
        def source = fields[0] as int
        def target = fields[1] as int
        // Order the endpoints so that the smaller id always comes first.
        if (source > target) {
            (source, target) = [target, source]
        }
        builder.addVertices(source, target)
        builder.addEdge(source, target, fields[3] as float)
    }
}

graph = builder.build()

// Largest vertex degree in the graph, or 0 when the graph has no vertices.
def vertexDegrees = graph.vertexSet().stream().mapToInt({ vertex -> graph.degreeOf(vertex) })
degree = vertexDegrees.max().orElse(0)
// Every Chinese Whispers provider used here is configured identically: top node
// weighting, the ChineseWhispers.ITERATIONS constant, and a new Random seeded with 1337.
def chineseWhispers = {
    ChineseWhispers.provider(NodeWeighting.top(), ChineseWhispers.ITERATIONS, new Random(1337))
}

// Algorithms keyed by the name printed in the report; insertion order is preserved.
algorithms = new LinkedHashMap()
algorithms['cw'] = chineseWhispers()
algorithms['maxmax'] = MaxMax.provider()
// Watset is given two separately constructed Chinese Whispers providers.
algorithms['watset-top-top'] = Watset.provider(chineseWhispers(), chineseWhispers())
// Header row. It needs one %s per column name: java.util.Formatter ignores surplus
// arguments, so a format with only six placeholders would silently drop 'stddev'
// and leave the header one column short of the data rows.
System.out.printf('%s\t%s\t%s\t%s\t%s\t%s\t%s\n',
        'algorithm', 'nodes', 'edges', 'degree', 'clusters', 'mean', 'stddev')

// Measure every algorithm on the loaded graph and print one tab-separated row each.
algorithms.each { algorithmEntry ->
    def algorithmName = algorithmEntry.key
    def algorithm = algorithmEntry.value

    def measurer = new Measurer(algorithm, graph)
    measurer.run()

    double[] durations = measurer.getDurations()
    // Convert the values returned by getClusters() to doubles so Mean can evaluate them.
    double[] clusters = Arrays.stream(measurer.getClusters()).asDoubleStream().toArray()

    // Columns: algorithm, nodes, edges, degree, mean of clusters, mean and
    // standard deviation of durations.
    System.out.printf('%s\t%d\t%d\t%d\t%.2f\t%.2f\t%.2f\n',
            algorithmName,
            graph.vertexSet().size(),
            graph.edgeSet().size(),
            degree,
            new Mean().evaluate(clusters),
            new Mean().evaluate(durations),
            new StandardDeviation().evaluate(durations)
    )
}
#!/bin/bash -e
# Runs collocation.groovy on each Leipzig corpus twice -- once without -p and once
# with -p -- and copies each run's output into a file in the current directory.

# -e is repeated here so it still applies when the script is started as `bash <file>`;
# pipefail makes a failing groovy run abort the script instead of being hidden
# behind tee's successful exit status.
set -eo pipefail

export JAVA_OPTS='-Xms64G -Xmx64G'
export CLASSPATH="$HOME/watset-java/target/watset.jar"

LEIPZIG="$HOME/leipzig"

for corpus in {eng_news_2016,deu_news_2015,rus_news_2010}_{10K,30K,100K,300K,1M}; do
    input="$LEIPZIG/$corpus/$corpus-co_s.txt"
    groovy collocation.groovy "$input" | tee "collocation-$corpus.txt"
    groovy collocation.groovy -p "$input" | tee "collocation-parallel-$corpus.txt"
done
#!/usr/bin/env groovy
import org.apache.commons.math3.stat.descriptive.moment.Mean
import org.apache.commons.math3.stat.descriptive.moment.StandardDeviation
import org.jgrapht.Graph
import org.jgrapht.generate.CompleteGraphGenerator
import org.jgrapht.generate.GnmRandomGraphGenerator
import org.jgrapht.generate.ScaleFreeGraphGenerator
import org.jgrapht.generate.StarGraphGenerator
import org.jgrapht.graph.SimpleWeightedGraph
import org.jgrapht.util.SupplierUtil
import org.nlpub.watset.graph.ChineseWhispers
import org.nlpub.watset.graph.NodeWeighting
import org.nlpub.watset.graph.MaxMax
import org.nlpub.watset.eval.Measurer
import org.nlpub.watset.graph.Watset
import java.util.concurrent.ForkJoinPool
import java.util.logging.Level
import java.util.logging.LogManager
import java.util.logging.Logger
/*
* Copyright 2018 Dmitry Ustalov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
// Pin the default locale so locale-sensitive formatting is the same on every host.
Locale.setDefault(Locale.ROOT)

// Command line: -p (parallel) and -s (silent); a failed parse ends the script with status 1.
def cli = new CliBuilder(usage: 'performance.groovy [-p] [-s]')
cli.p('parallel')
cli.s('silent')
def options = cli.parse(args) ?: System.exit(1)

if (!options.p) {
    // This is a very important bit that effectively disables stream parallelism.
    System.setProperty('java.util.concurrent.ForkJoinPool.common.parallelism', '1')
    assert ForkJoinPool.commonPool().getParallelism() == 1
}

logger = Logger.getLogger('Performance')

if (options.s) {
    // Silent mode: raise the level of every handler on the root logger to WARNING.
    for (handler in LogManager.getLogManager().getLogger('').getHandlers()) {
        handler.setLevel(Level.WARNING)
    }
}
// Factories that turn a size into the JGraphT generator for one graph family.

// Deterministic families: only the size is needed.
starGraph = { int size -> new StarGraphGenerator(size) }
completeGraph = { int size -> new CompleteGraphGenerator(size) }

// Randomized families: each call creates its own Random seeded with 1337.
scaleFreeGraph = { int size -> new ScaleFreeGraphGenerator(size, new Random(1337)) }
// The G(n, m) generator is asked for three times as many edges as vertices; the two
// trailing flags are JGraphT's `loops` and `multipleEdges` switches, both off.
erdosRenyiGraph = { int size -> new GnmRandomGraphGenerator(size, size * 3, new Random(1337), false, false) }
/**
 * Creates an empty SimpleWeightedGraph and fills it using a generator.
 *
 * @param n         size argument handed to the generator factory
 * @param generator closure that maps {@code n} to an object offering
 *                  {@code generateGraph(graph, resultMap)}
 * @return the graph after the generator has populated it
 */
static def generate(n, generator) {
    def target = new SimpleWeightedGraph(SupplierUtil.createIntegerSupplier(), SupplierUtil.createDefaultEdgeSupplier())
    def graphGenerator = generator(n)
    // The result map is required by the call but its contents are not used here.
    graphGenerator.generateGraph(target, new HashMap())
    return target
}
// Benchmark graphs mapped to their family name, kept in generation order.
graphs = new LinkedHashMap<Graph, String>()
// Maximum vertex degree of each graph. The values are the ints produced by
// max().orElse(0) below, so the value type is Integer rather than String.
degrees = new HashMap<Graph, Integer>()

// The largest power of ten comes from the first positional argument and defaults to 3.
def maxExponent = options.arguments() ? options.arguments()[0] as int : 3

1.upto(maxExponent) { exponent ->
    def size = 10**exponent
    logger.info(String.format('Generating graphs for 10^%d.', exponent))

    graphs.put(generate(size, starGraph), 'star')
    logger.info('Star graph done.')

    graphs.put(generate(size, erdosRenyiGraph), 'erdos-renyi')
    logger.info('Erdős-Rényi graph done.')

    // Scale-free graphs are only generated below 10^6 vertices.
    if (exponent < 6) {
        graphs.put(generate(size, scaleFreeGraph), 'scale-free')
        logger.info('Scale-free graph done.')
    }

    // Complete graphs are only generated below 10^3 vertices.
    if (exponent < 3) {
        graphs.put(generate(size, completeGraph), 'complete')
        logger.info('Complete graph done.')
    }
}

// Compute each graph's maximum vertex degree once (0 for a graph without vertices).
graphs.keySet().each { graph ->
    def maxDegree = graph.vertexSet().stream().
            mapToInt({ graph.degreeOf(it) }).
            max().orElse(0)
    degrees.put(graph, maxDegree)
}
// Every Chinese Whispers provider used here is configured identically: top node
// weighting, the ChineseWhispers.ITERATIONS constant, and a new Random seeded with 1337.
def chineseWhispers = {
    ChineseWhispers.provider(NodeWeighting.top(), ChineseWhispers.ITERATIONS, new Random(1337))
}

// Algorithms keyed by the name printed in the report; insertion order is preserved.
algorithms = new LinkedHashMap()
algorithms['cw'] = chineseWhispers()
algorithms['maxmax'] = MaxMax.provider()
// Watset is given two separately constructed Chinese Whispers providers.
algorithms['watset-top-top'] = Watset.provider(chineseWhispers(), chineseWhispers())
// Header row. It needs one %s per column name: java.util.Formatter ignores surplus
// arguments, so a format with only seven placeholders would silently drop 'stddev'
// and leave the header one column short of the data rows.
System.out.printf('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n',
        'algorithm', 'graph', 'nodes', 'edges', 'degree', 'clusters', 'mean', 'stddev')

// Measure every algorithm on every generated graph and print one tab-separated row
// per (algorithm, graph) pair.
algorithms.each { algorithmEntry ->
    def algorithmName = algorithmEntry.key
    def algorithm = algorithmEntry.value

    graphs.each { graphEntry ->
        // The graphs map is keyed by graph, with the family name as its value.
        def graphName = graphEntry.value
        def graph = graphEntry.key

        def measurer = new Measurer(algorithm, graph)
        measurer.run()

        double[] durations = measurer.getDurations()
        // Convert the values returned by getClusters() to doubles so Mean can evaluate them.
        double[] clusters = Arrays.stream(measurer.getClusters()).asDoubleStream().toArray()

        // Columns: algorithm, graph, nodes, edges, degree, mean of clusters, mean and
        // standard deviation of durations.
        System.out.printf('%s\t%s\t%d\t%d\t%d\t%.2f\t%.2f\t%.2f\n',
                algorithmName,
                graphName,
                graph.vertexSet().size(),
                graph.edgeSet().size(),
                degrees.get(graph),
                new Mean().evaluate(clusters),
                new Mean().evaluate(durations),
                new StandardDeviation().evaluate(durations)
        )
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment