Skip to content

Instantly share code, notes, and snippets.

@c-rack
Forked from hay/WMStats.java
Last active August 29, 2015 14:08
Show Gist options
  • Save c-rack/06a2b34721ad0812e908 to your computer and use it in GitHub Desktop.
Save c-rack/06a2b34721ad0812e908 to your computer and use it in GitHub Desktop.

Here's the assignment:

Download this raw statistics dump from Wikipedia (360mb unzipped):

http://dumps.wikimedia.org/other/pagecounts-raw/2014/2014-10/pagecounts-20141029-230000.gz

Write a simple script in your favourite programming language that:

  • Gets all views from the English Wikipedia (these are prefixed by "en ")
  • Limit those articles to the ones with at least 500 views
  • Sort by number of views, highest ones first and print the first ten articles.
  • Also measure the time this takes and print it out as well.

Right now we've got versions in Javascript (Node.js), PHP, Go, Python, Ruby, Bash (awk/sed/grep), Groovy and Java in both Java 8 functional style and 'old school' style.

The Bash, Groovy and Java versions were written by @breun, the Ruby version was written by @tieleman, the others by yours truly.

Some measurements on my machine (2011 Macbook Pro, no SSD):

  • Go: 2.36s (2.31, 2.42, 2.36, 2.33, 2.36)
  • Java (oldschool): 4.77s / 2.66s if not taking the first measurement into account (13.15, 2.59, 2.48, 3.08, 2.58, 2.58)
  • Node.js: 7.10s (7.56, 7.18, 7.01, 6.89, 6.88)
  • Groovy: 8.14s (9.16, 7.67, 8.60, 7.87, 7.40)
  • PHP: 8.42s (8.54, 8.31, 8.47, 8.41, 8.39)
  • Ruby: 8.85s (9.3, 9.38, 8.6, 8.37, 8.61)
  • Python: 9.26s (8.35, 8.54, 10.43, 9.35, 9.62)
  • Bash: 12.34s (12.62, 12.22, 12.78, 12.80, 11.29)

Your output should look like this:

Query took 7.56 seconds
Main_Page (394296)
Malware (51666)
Loan (45440)
Special:HideBanners (40771)
Y%C5%AB_Kobayashi (34596)
Special:Search (18672)
Glutamate_flavoring (17508)
Online_shopping (16310)
Chang_and_Eng_Bunker (14956)
Dance_Moms (8928)
// Run with javac WMStats.java && java WMStats
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;
import java.util.stream.Collectors;
public class WMStats {
private static final String filename = "pagecounts-20141029-230000";
private static final String prefix = "en ";
private static final int threshold = 500;
private static final Charset charset = StandardCharsets.ISO_8859_1;
public static void main(String[] args) throws IOException {
new WMStats().go();
}
public void go() throws IOException {
long start = System.currentTimeMillis();
List<NameAndCount> top10 = Files
.lines(Paths.get(filename), charset)
.filter(line -> line.startsWith(prefix))
.map(line -> line.split(" "))
.map(elems -> new NameAndCount(elems[1], Integer.parseInt(elems[2])))
.filter(pair -> pair.count >= threshold)
.sorted((a, b) -> Integer.compare(b.count, a.count))
.limit(10)
.collect(Collectors.toList());
System.out.println("Query took " + (System.currentTimeMillis() - start) / 1000f + " seconds");
for (NameAndCount pair : top10) {
System.out.println(pair.name + " (" + pair.count + ")");
}
}
class NameAndCount {
String name;
int count;
NameAndCount(String name, int count) {
this.name = name;
this.count = count;
}
}
}
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
public class WMStats {
private static final String FILE_NAME = "pagecounts-20141029-230000";
private static final String PREFIX = "en ";
private static final String DELIMITER = " ";
private static final int THRESHOLD = 500;
public static void main(String[] args) throws IOException {
new WMStats().go();
}
public void go() throws IOException {
List<NameAndCount> list = new ArrayList<>();
long start = System.currentTimeMillis();
try (BufferedReader reader = new BufferedReader(new FileReader(FILE_NAME))) {
for (String line; (line = reader.readLine()) != null; ) {
if (!line.startsWith(PREFIX)) {
continue;
}
String[] parts = line.split(DELIMITER);
int count = Integer.parseInt(parts[2]);
if (count >= THRESHOLD) {
list.add(new NameAndCount(parts[1], count));
}
}
}
Collections.sort(list);
System.out.println("Query took " + (System.currentTimeMillis() - start) / 1000f + " seconds");
for (NameAndCount pair : list.subList(0, 9)) {
System.out.println(pair.name + " (" + pair.count + ")");
}
}
class NameAndCount implements Comparable<NameAndCount> {
String name;
int count;
NameAndCount(String name, int count) {
this.name = name;
this.count = count;
}
@Override
public int compareTo(NameAndCount other) {
return Integer.compare(other.count, count);
}
}
}
time grep '^en ' pagecounts-20141029-230000 |\
awk '{ if ($3 >= 500) print $3 " " $2 }' |\
LC_ALL=C sort -nr |\
head -n 10 |\
awk '{ print $2 " (" $1 ")" }'
#include <stdio.h> // fopen(), feof(), fgets(), fclose(), printf()
#include <string.h> // strcpy()
#include <stdlib.h> // atoi()
#include <time.h> // clock()
#define BUFSIZE 1024
struct article {
char title[BUFSIZE];
int views;
} articles[10];
char line[BUFSIZE];
int column, slot, i, j, v, views;
clock_t start;
void findLowestSlot() {
slot = -1;
v = 1 << 30;
for( i=0; i<10; i++ ) {
if (articles[i].views < v) {
slot = i;
v = articles[i].views;
}
}
}
void printSorted() {
for( j=0; j<10; j++ ) {
v = 0;
slot = 0;
for( i=0; i<10; i++ ) {
if( articles[i].views > v ) {
v = articles[i].views;
slot = i;
}
}
printf("%s (%d)\n", articles[slot].title, articles[slot].views);
articles[slot].views = 0;
}
printf("Query took %ld ms\n", (clock() - start) / (CLOCKS_PER_SEC / 1000));
}
int main(void) {
start = clock();
FILE* file = fopen("pagecounts-20141029-230000", "r");
while( !feof(file) ) {
fgets(&line[0], BUFSIZE-1, file);
if( line[0] != 'e' || line[1] != 'n' || line[2] != ' ' ) {
continue;
}
column = 3;
while( line[column++] != ' ' );
views = atoi(&line[column]);
if( views < 500 ) {
continue;
}
findLowestSlot();
if( slot != -1 && articles[slot].views < views ) {
articles[slot].views = views;
line[column-1] = '\0';
strcpy(articles[slot].title, &line[3]);
}
}
fclose(file);
printSorted();
return 0;
}
package main
import (
"fmt"
"os"
"bufio"
"strings"
"strconv"
"sort"
"time"
)
type Article struct {
article string
views int
}
type Articles []Article
func timestamp() int64 {
return time.Now().UnixNano() / int64(time.Millisecond)
}
func (slice Articles) Len() int {
return len(slice)
}
func (slice Articles) Less(i, j int) bool {
return slice[i].views > slice[j].views
}
func (slice Articles) Swap(i, j int) {
slice[i], slice[j] = slice[j], slice[i]
}
func main() {
filename := "pagecounts-20141029-230000"
minviews := 500
prefix := "en "
file, err := os.Open(filename)
if err != nil {
fmt.Println(err)
}
scanner := bufio.NewScanner(file)
articles := make(Articles, 0)
start := timestamp()
for scanner.Scan() {
line := scanner.Text()
if (!strings.HasPrefix(line, prefix)) {
continue
}
parts := strings.Split(line, " ")
articlename := parts[1]
views := parts[2]
viewsInt, _ := strconv.Atoi(views)
if (viewsInt > minviews) {
article := Article{
article: articlename,
views : viewsInt,
}
articles = append(articles, article)
}
}
sort.Sort(articles)
elapsed := timestamp() - start
// So how do we get a float here in as a Println?
fmt.Printf("Query took %v seconds\n", (elapsed / 10))
for i := 0; i < 10; i++ {
fmt.Printf("%v (%v)\n", articles[i].article, articles[i].views)
}
}
#!/usr/bin/env groovy
filename = 'pagecounts-20141029-230000'
prefix = 'en'
threshold = 500
overThreshold = []
start = System.currentTimeMillis()
new File(filename).splitEachLine(" ") { elems ->
def count = elems[2].toInteger()
if (elems[0] == prefix && count >= threshold) {
overThreshold << [count, elems[1]]
}
}
overThreshold.sort { a, b -> b[0] <=> a[0] }
println "Query took ${(System.currentTimeMillis() - start) / 1000} seconds"
overThreshold[0..9].each { line ->
println line[1] + ' (' + line[0] + ')'
}
var fs = require('fs');
var readline = require('readline');
var filename = 'pagecounts-20141029-230000';
var minviews = 500;
var prefix = 'en ';
var rd = readline.createInterface({
input : fs.createReadStream(filename),
output : process.stdout,
terminal : false
});
var start = +new Date();
var count = [];
rd.on('line', function(line) {
if (line.indexOf(prefix) !== 0) {
return;
}
var parts = line.split(" ");
var lang = parts[0];
var article = parts[1];
var views = parts[2];
if (views > minviews) {
count.push([article, parseInt(views)]);
}
});
rd.on('close', function() {
count.sort(function(a, b) {
return a[1] > b[1] ? -1 : 1;
});
var elapsed = (+new Date() - start) / 1000;
console.log("Query took " + elapsed.toFixed(2) + " seconds");
count.slice(0, 10).forEach(function(item) {
console.log(item[0] + " (" + item[1] + ")");
});
});
<?php
$filename = "pagecounts-20141029-230000";
$minviews = 500;
$prefix = "en ";
$f = fopen("pagecounts-20141029-230000", "r");
$start = microtime(true);
$count = [];
while (($line = fgets($f)) !== false) {
$len = strlen($prefix);
if (substr($line, 0, $len) !== $prefix) {
continue;
}
$pieces = explode(" ", $line);
$lang = $pieces[0];
$article = $pieces[1];
$views = $pieces[2];
if ($views > $minviews) {
$count[] = [$article, $views];
}
}
usort($count, function($a, $b) {
return $a[1] > $b[1] ? -1 : 1;
});
$elapsed = round(microtime(true) - $start, 3);
echo "Query took $elapsed seconds\n";
$count = array_slice($count, 0, 10);
foreach ($count as $item) {
printf("%s (%s) \n", $item[0], $item[1]);
}
import time
filename = "pagecounts-20141029-230000"
minviews = 500
prefix = "en "
f = open(filename, "r").readlines()
start = time.time()
count = []
for line in f:
if prefix and not line.startswith(prefix):
continue
(lang, article, views, size) = line.split(" ")
if int(views) > minviews:
count.append((article, views))
count.sort(key = lambda x: int(x[1]), reverse = True)
elapsed = time.time() - start
print "Query took %s seconds" % round(elapsed, 2)
for item in count[0:10]:
print "%s (%s)" % item
require 'benchmark'
filename = 'pagecounts-20141029-230000'
min_views = 500
prefix = 'en '
count = []
time = Benchmark.measure do
File.open(filename).each_line do |line|
next unless line.start_with? prefix
parts = line.force_encoding("iso-8859-1").split(' ', 3)
article, views = parts[1], parts[2].to_i
count << [article, views] if views > min_views
end
count.sort! { |a,b| b[1] <=> a[1] }
end
puts "Query took #{time.total} seconds"
count[0..9].each { |item| puts "#{item[0]} (#{item[1]})" }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment