Skip to content

Instantly share code, notes, and snippets.

@hay
Last active August 30, 2016 11:03
Show Gist options
  • Save hay/74908efe0d79557a97ff to your computer and use it in GitHub Desktop.
Save hay/74908efe0d79557a97ff to your computer and use it in GitHub Desktop.
Wikipedia statistics parser comparison

Here's the assignment:

Download this raw statistics dump from Wikipedia (360mb unzipped):

http://dumps.wikimedia.org/other/pagecounts-raw/2014/2014-10/pagecounts-20141029-230000.gz

Write a simple script in your favourite programming language that:

  • Gets all views from the English Wikipedia (these are prefixed by "en ")
  • Limit those articles to the ones with at least 500 views
  • Sort by number of views, highest ones first and print the first ten articles.
  • Also measure the time this takes and print it out as well.

Right now we've got versions in Javascript (Node.js), PHP, Go, Python, Ruby, Bash (awk/sed/grep), Groovy and Java in both Java 8 functional style and 'old school' style.

The Bash, Groovy and Java versions were written by @breun, the Ruby version was written by @tieleman, the others by yours truly.

Some measurements on my machine (2011 Macbook Pro, no SSD):

  • C: 1.63s (1.58, 1.73, 1.59, 1.62, 1.63)
  • Go: 2.36s (2.31, 2.42, 2.36, 2.33, 2.36)
  • Java (oldschool): 4.77s / 2.66s if not taking the first measurement into account (13.15, 2.59, 2.48, 3.08, 2.58, 2.58)
  • Groovy: 4.33s (4.16s, 4.27s, 4.55, 4.42, 4.27)
  • Node.js: 7.10s (7.56, 7.18, 7.01, 6.89, 6.88)
  • PHP: 7.44s (7.25, 7.35, 7.28, 7.37, 7.97)
  • Python: 7.45s (6.59, 7.28, 6.81, 8.99, 7.59)
  • Ruby: 8.85s (9.3, 9.38, 8.6, 8.37, 8.61)
  • Bash: 12.34s (12.62, 12.22, 12.78, 12.80, 11.29)
  • Lua: 22.81s (24.08, 22.65, 22.11, 21.53, 23.70)

Your output should look like this:

Query took 7.56 seconds
Main_Page (394296)
Malware (51666)
Loan (45440)
Special:HideBanners (40771)
Y%C5%AB_Kobayashi (34596)
Special:Search (18672)
Glutamate_flavoring (17508)
Online_shopping (16310)
Chang_and_Eng_Bunker (14956)
Dance_Moms (8928)
// Run with javac WMStats.java && java WMStats
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;
import java.util.stream.Collectors;
public class WMStats {
private static final String filename = "pagecounts-20141029-230000";
private static final String prefix = "en ";
private static final int threshold = 500;
private static final Charset charset = StandardCharsets.ISO_8859_1;
public static void main(String[] args) throws IOException {
new WMStats().go();
}
public void go() throws IOException {
long start = System.currentTimeMillis();
List<NameAndCount> top10 = Files
.lines(Paths.get(filename), charset)
.filter(line -> line.startsWith(prefix))
.map(line -> line.split(" "))
.map(elems -> new NameAndCount(elems[1], Integer.parseInt(elems[2])))
.filter(pair -> pair.count >= threshold)
.sorted((a, b) -> Integer.compare(b.count, a.count))
.limit(10)
.collect(Collectors.toList());
System.out.println("Query took " + (System.currentTimeMillis() - start) / 1000f + " seconds");
for (NameAndCount pair : top10) {
System.out.println(pair.name + " (" + pair.count + ")");
}
}
class NameAndCount {
String name;
int count;
NameAndCount(String name, int count) {
this.name = name;
this.count = count;
}
}
}
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
public class WMStats {
private static final String FILE_NAME = "pagecounts-20141029-230000";
private static final String PREFIX = "en ";
private static final String DELIMITER = " ";
private static final int THRESHOLD = 500;
public static void main(String[] args) throws IOException {
new WMStats().go();
}
public void go() throws IOException {
List<NameAndCount> list = new ArrayList<>();
long start = System.currentTimeMillis();
try (BufferedReader reader = new BufferedReader(new FileReader(FILE_NAME))) {
for (String line; (line = reader.readLine()) != null; ) {
if (!line.startsWith(PREFIX)) {
continue;
}
String[] parts = line.split(DELIMITER);
int count = Integer.parseInt(parts[2]);
if (count >= THRESHOLD) {
list.add(new NameAndCount(parts[1], count));
}
}
}
Collections.sort(list);
System.out.println("Query took " + (System.currentTimeMillis() - start) / 1000f + " seconds");
for (NameAndCount pair : list.subList(0, 9)) {
System.out.println(pair.name + " (" + pair.count + ")");
}
}
class NameAndCount implements Comparable<NameAndCount> {
String name;
int count;
NameAndCount(String name, int count) {
this.name = name;
this.count = count;
}
@Override
public int compareTo(NameAndCount other) {
return Integer.compare(other.count, count);
}
}
}
time grep '^en ' pagecounts-20141029-230000 |\
awk '{ if ($3 >= 500) print $3 " " $2 }' |\
LC_ALL=C sort -nr |\
head -n 10 |\
awk '{ print $2 " (" $1 ")" }'
// Compile with gcc -Wall -O3 ws.c -o ws && ./ws
#include <stdio.h> // fopen(), feof(), fgets(), fclose(), printf()
#include <string.h> // strcpy()
#include <stdlib.h> // atoi()
#include <time.h> // clock()
#define BUFSIZE 1024
struct article {
char title[BUFSIZE];
int views;
} articles[10];
char line[BUFSIZE];
int column, slot, i, j, v, views;
clock_t start;
void findLowestSlot() {
slot = -1;
v = 1 << 30;
for( i=0; i<10; i++ ) {
if (articles[i].views < v) {
slot = i;
v = articles[i].views;
}
}
}
void printSorted() {
for( j=0; j<10; j++ ) {
v = 0;
slot = 0;
for( i=0; i<10; i++ ) {
if( articles[i].views > v ) {
v = articles[i].views;
slot = i;
}
}
printf("%s (%d)\n", articles[slot].title, articles[slot].views);
articles[slot].views = 0;
}
printf("Query took %ld ms\n", (clock() - start) / (CLOCKS_PER_SEC / 1000));
}
int main(void) {
start = clock();
FILE* file = fopen("pagecounts-20141029-230000", "r");
while( !feof(file) ) {
fgets(&line[0], BUFSIZE-1, file);
if( line[0] != 'e' || line[1] != 'n' || line[2] != ' ' ) {
continue;
}
column = 3;
while( line[column++] != ' ' );
views = atoi(&line[column]);
if( views < 500 ) {
continue;
}
findLowestSlot();
if( slot != -1 && articles[slot].views < views ) {
articles[slot].views = views;
line[column-1] = '\0';
strcpy(articles[slot].title, &line[3]);
}
}
fclose(file);
printSorted();
return 0;
}
using System;
using System.Diagnostics;
using System.IO;
using System.Linq;
class Program
{
static void Main(string[] args)
{
string filename = "pagecounts-20141029-230000";
int minViews = 500;
Stopwatch timer = new Stopwatch();
timer.Start();
var top10 = File.ReadLines(filename)
.Where(line => line.Substring(0, 3) == "en ")
.Select(line => line.Split(' '))
.Select(values => Tuple.Create(values[1], int.Parse(values[2])))
.Where(tuple => tuple.Item2 >= minViews)
.OrderByDescending(tuple => tuple.Item2)
.Take(10)
.ToArray();
timer.Stop();
Console.WriteLine("Query took " + timer.Elapsed.TotalSeconds + " seconds");
foreach (var tuple in top10)
{
Console.WriteLine("{0} ({1})", tuple.Item1, tuple.Item2);
}
}
}
package main
import (
"fmt"
"os"
"bufio"
"strings"
"strconv"
"sort"
"time"
)
type Article struct {
article string
views int
}
type Articles []Article
func timestamp() int64 {
return time.Now().UnixNano()
}
func (slice Articles) Len() int {
return len(slice)
}
func (slice Articles) Less(i, j int) bool {
return slice[i].views > slice[j].views
}
func (slice Articles) Swap(i, j int) {
slice[i], slice[j] = slice[j], slice[i]
}
func main() {
filename := "pagecounts-20141029-230000"
minviews := 500
prefix := "en "
file, err := os.Open(filename)
if err != nil {
fmt.Println(err)
}
scanner := bufio.NewScanner(file)
articles := make(Articles, 0)
start := timestamp()
for scanner.Scan() {
line := scanner.Text()
if (!strings.HasPrefix(line, prefix)) {
continue
}
parts := strings.Split(line, " ")
articlename := parts[1]
views := parts[2]
viewsInt, _ := strconv.Atoi(views)
if (viewsInt > minviews) {
article := Article{
article: articlename,
views : viewsInt,
}
articles = append(articles, article)
}
}
sort.Sort(articles)
elapsed := timestamp() - start
fmt.Printf("Query took %f seconds\n", time.Duration(elapsed).Seconds())
for i := 0; i < 10; i++ {
fmt.Printf("%v (%v)\n", articles[i].article, articles[i].views)
}
}
#!/usr/bin/env groovy
filename = 'pagecounts-20141029-230000'
prefix = 'en '
delimiter = ' '
threshold = 500
overThreshold = []
start = System.currentTimeMillis()
new File(filename).eachLine { line ->
if (line.startsWith(prefix)) {
def elems = line.split(delimiter)
def count = elems[2].toInteger()
if (count >= threshold) {
overThreshold << [count, elems[1]]
}
}
}
overThreshold.sort { a, b -> b[0] <=> a[0] }
println "Query took ${(System.currentTimeMillis() - start) / 1000} seconds"
overThreshold[0..9].each { line ->
println line[1] + ' (' + line[0] + ')'
}
{- Compile with ghc -Wall -O2 wmstats.hs and time with time ./wmstats -}
{-# LANGUAGE OverloadedStrings #-}
import Data.List (sortBy)
import qualified Data.ByteString.Char8 as B8
main :: IO ()
main = B8.readFile "pagecounts-20141029-230000" >>=
mapM_ display . take 10 . sortBy (flip compare) . filter popular
. map parse . filter en . map B8.words . B8.lines
where
en :: [B8.ByteString] -> Bool
en ("en":_) = True
en _ = False
parse :: [B8.ByteString] -> (Int, B8.ByteString)
parse (_:t:vs:_) = case B8.readInt vs of
Just (v, "") -> (v, t)
_ -> (0, "")
parse _ = (0, "")
popular :: (Int, B8.ByteString) -> Bool
popular (v, _) = v > (500 :: Int)
display :: (Int, B8.ByteString) -> IO ()
display (v, t) = B8.putStrLn $ B8.concat [t, " (", B8.pack $ show v, ")"]
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
public class WMStats {
private static final String FILE_NAME = "pagecounts-20141029-230000";
private static final String PREFIX = "en ";
private static final String DELIMITER = " ";
private static final int THRESHOLD = 500;
public static void main(String[] args) throws IOException {
new WMStats().go();
}
public void go() throws IOException {
List<NameAndCount> list = new ArrayList<>();
long start = System.currentTimeMillis();
try (BufferedReader reader = new BufferedReader(new FileReader(FILE_NAME))) {
for (String line; (line = reader.readLine()) != null; ) {
if (!line.startsWith(PREFIX)) {
continue;
}
String[] parts = line.split(DELIMITER);
int count = Integer.parseInt(parts[2]);
if (count >= THRESHOLD) {
list.add(new NameAndCount(parts[1], count));
}
}
}
Collections.sort(list);
System.out.println("Query took " + (System.currentTimeMillis() - start) / 1000f + " seconds");
for (NameAndCount pair : list.subList(0, 9)) {
System.out.println(pair.name + " (" + pair.count + ")");
}
}
class NameAndCount implements Comparable<NameAndCount> {
String name;
int count;
NameAndCount(String name, int count) {
this.name = name;
this.count = count;
}
@Override
public int compareTo(NameAndCount other) {
return Integer.compare(other.count, count);
}
}
}
var fs = require('fs');
var readline = require('readline');
var filename = 'pagecounts-20141029-230000';
var minviews = 500;
var prefix = 'en ';
var rd = readline.createInterface({
input : fs.createReadStream(filename),
output : process.stdout,
terminal : false
});
var start = +new Date();
var count = [];
rd.on('line', function(line) {
if (line.indexOf(prefix) !== 0) {
return;
}
var parts = line.split(" ");
var lang = parts[0];
var article = parts[1];
var views = parts[2];
if (views > minviews) {
count.push([article, parseInt(views)]);
}
});
rd.on('close', function() {
count.sort(function(a, b) {
return a[1] > b[1] ? -1 : 1;
});
var elapsed = (+new Date() - start) / 1000;
console.log("Query took " + elapsed.toFixed(2) + " seconds");
count.slice(0, 10).forEach(function(item) {
console.log(item[0] + " (" + item[1] + ")");
});
});
filename = "pagecounts-20141029-230000"
minviews = 500
prefix = "en "
-- Here's something interesting: Lua doesn't have a split
-- function :/
function split(str)
pieces = {}
for piece in string.gmatch(str, "%S+") do
table.insert(pieces, piece)
end
return pieces
end
f = io.open(filename)
start = os.clock()
count = {}
while true do
line = f:read()
if line == nil then break end
-- Yes, indexes in Lua start at 1 instead of 0 :/
if line:find(prefix) == 1 then
pieces = split(line)
article = pieces[2]
views = tonumber(pieces[3])
if views > minviews then
table.insert(count, {
article = article, views = views
})
end
end
end
table.sort(count, function (a, b)
return a.views > b.views
end)
elapsed = os.clock() - start
print(string.format("Query took %f seconds \n", elapsed))
for i = 1, 10 do
piece = count[i]
print(string.format("%s (%s)", piece.article, piece.views))
end
<?php
$filename = "pagecounts-20141029-230000";
$minviews = 500;
$prefix = "en ";
$f = fopen($filename, "r");
$start = microtime(true);
$count = [];
$len = strlen($prefix);
while (($line = fgets($f)) !== false) {
if (substr($line, 0, $len) == $prefix) {
list($lang, $article, $views) = explode(" ", $line);
if ($views > $minviews) {
$count[] = [$article, $views];
}
}
}
usort($count, function ($a, $b) {
return $a[1] > $b[1] ? -1 : 1;
});
$elapsed = round(microtime(true) - $start, 3);
echo "Query took $elapsed seconds\n";
$count = array_slice($count, 0, 10);
foreach ($count as $item) {
printf("%s (%s) \n", $item[0], $item[1]);
}
import sys, time
IGNORE = ["index.html", "index.php", "nl", "hoofdpagina"]
def _skip(article):
if article.lower() in IGNORE:
return True
# Namespaces
if ":" in article:
return True
return False
def stats(filename, minimalviews = 0, prefix = False, verbose = False):
f = open(filename, "r").readlines()
start = time.time()
count = []
for line in f:
if prefix and not line.startswith(prefix):
continue
(lang, article, views, size) = line.split(" ")
if _skip(article):
continue
if int(views) > minimalviews:
count.append((article, views))
count.sort(key = lambda x: int(x[1]), reverse = True)
elapsed = time.time() - start
if verbose:
print "Query took %s seconds" % round(elapsed, 2)
return count
require 'benchmark'
filename = 'pagecounts-20141029-230000'
min_views = 500
prefix = 'en '
count = []
time = Benchmark.measure do
File.open(filename).each_line do |line|
next unless line.start_with? prefix
parts = line.force_encoding("iso-8859-1").split(' ', 3)
article, views = parts[1], parts[2].to_i
count << [article, views] if views > min_views
end
count.sort! { |a,b| b[1] <=> a[1] }
end
puts "Query took #{time.total} seconds"
count[0..9].each { |item| puts "#{item[0]} (#{item[1]})" }
#![feature(slicing_syntax)]
extern crate lines; // https://github.com/xitep/lines-rs.git
extern crate time; // part of the standard library
use lines::linemapper;
use std::os;
use std::io::{File, BufferedReader};
use std::str;
use time as t;
use std::time::Duration;
struct Page {
title: String,
count: uint
}
fn main() {
let args = os::args();
if args.len() != 2 {
os::set_exit_status(1);
return;
}
let start = t::precise_time_ns();
let top10 = process_file(File::open(&Path::new(&args[1])).unwrap());
let ellapsed = Duration::nanoseconds((t::precise_time_ns()-start) as i64);
println!("Fast: Query took {:3d} millis", ellapsed.num_milliseconds());
for t in top10.iter() {
println!("{} ({})", t.title, t.count);
}
let start = t::precise_time_ns();
let top10 = process_file_more_idiomatically(File::open(&Path::new(&args[1])).unwrap());
let ellapsed = Duration::nanoseconds((t::precise_time_ns()-start) as i64);
println!("Idiomatic: Query took {:3d} millis", ellapsed.num_milliseconds());
for t in top10.iter() {
println!("{} ({})", t.title, t.count);
}
}
fn process_file(f: File) -> Vec<Page> {
let r = BufferedReader::with_capacity(4*1024, f);
let mut pages = Vec::<Page>::with_capacity(500);
linemapper::map_lines(r, |line| {
if line[0] != b'e' || line[1] != b'n' || line[2] != b' ' {
return true; // ~ skip this line but continue parsing
}
let mut s = unsafe {str::raw::from_utf8(line[3..])}.split(' ');
let title = s.next().unwrap();
let count: uint = from_str(s.next().unwrap()).unwrap();
if count < 500 {
return true; // ~ skip this line but continue parsing
}
pages.push(Page{title: String::from_str(title), count: count});
true // ~ continue parsing
}).unwrap();
pages.sort_by(|a, b| b.count.cmp(&a.count));
pages.truncate(10);
pages
}
fn process_file_more_idiomatically(f: File) -> Vec<Page> {
let mut r = BufferedReader::with_capacity(4*1024, f);
let mut pages = Vec::<Page>::with_capacity(500);
for line in r.lines() {
let line = line.unwrap();
if !line.starts_with("en ") {
continue;
}
let mut s = line.split(' ');
s.next(); // ~ consume the already matched "en "
let title = s.next().unwrap();
let count: uint = from_str(s.next().unwrap()).unwrap();
if count >= 500 {
pages.push(Page{title: String::from_str(title), count: count});
}
}
pages.sort_by(|a, b| b.count.cmp(&a.count));
pages.truncate(10);
pages
}
@hay
Copy link
Author

hay commented Nov 4, 2014

Thanks for all your versions! I've updated my benchmarks accordingly. C is still the fastest though, but we've made some great improvements in the Python and Groovy versions.

I've also added a lua version, that is unfortunately the slowest by a landslide: 22.81s!

Conclusion of this little experiment: don't use lua for parsing text ;)

@xitep
Copy link

xitep commented Nov 8, 2014

Hello, I just realized I forgot to share my optimized go version. Here you go:

package main

import (
    "bufio"
    "bytes"
    "fmt"
    "os"
    "sort"
    "time"
)

type Article struct {
    article string
    views   int
}

type Articles []Article

func (slice Articles) Len() int {
    return len(slice)
}

func (slice Articles) Less(i, j int) bool {
    return slice[i].views > slice[j].views
}

func (slice Articles) Swap(i, j int) {
    slice[i], slice[j] = slice[j], slice[i]
}

func main() {
    filename := "pagecounts-20141029-230000"
    minviews := 500
    prefix := []byte("en ")

    file, err := os.Open(filename)

    if err != nil {
        fmt.Println(err)
    }

    scanner := bufio.NewScanner(file)
    articles := make(Articles, 0, 500)
    start := time.Now()

    spaceSep := []byte{' '}
    for scanner.Scan() {
        line := scanner.Bytes()

        if !bytes.HasPrefix(line, prefix) {
            continue
        }

        parts := bytes.Split(line, spaceSep)
        articlename := parts[1]
        views := parts[2]
        viewsInt := bytesToUInt(views)

        if viewsInt < minviews {
            continue
        }

        article := Article{
            article: string(articlename),
            views:   viewsInt,
        }

        articles = append(articles, article)
    }

    sort.Sort(articles)

    elapsed := time.Since(start)
    // So how do we get a float here in as a Println?
    fmt.Printf("Query took %v\n", elapsed)

    for i := 0; i < 10; i++ {
        fmt.Printf("%v (%v)\n", articles[i].article, articles[i].views)
    }
}

func bytesToUInt(xs []byte) int {
    n := 0
    for _, x := range xs {
        n = n*10 + int(x-'0')
    }
    return n
}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment