Skip to content

Instantly share code, notes, and snippets.

@hay
Last active August 30, 2016 11:03
Show Gist options
  • Save hay/74908efe0d79557a97ff to your computer and use it in GitHub Desktop.
Save hay/74908efe0d79557a97ff to your computer and use it in GitHub Desktop.
Wikipedia statistics parser comparison

Here's the assignment:

Download this raw statistics dump from Wikipedia (360mb unzipped):

http://dumps.wikimedia.org/other/pagecounts-raw/2014/2014-10/pagecounts-20141029-230000.gz

Write a simple script in your favourite programming language that:

  • Gets all views from the English Wikipedia (these are prefixed by "en ")
  • Limit those articles to the ones with at least 500 views
  • Sort by number of views, highest ones first and print the first ten articles.
  • Also measure the time this takes and print it out as well.

Right now we've got versions in Javascript (Node.js), PHP, Go, Python, Ruby, Bash (awk/sed/grep), Groovy and Java in both Java 8 functional style and 'old school' style.

The Bash, Groovy and Java versions were written by @breun, the Ruby version was written by @tieleman, the others by yours truly.

Some measurements on my machine (2011 Macbook Pro, no SSD):

  • C: 1.63s (1.58, 1.73, 1.59, 1.62, 1.63)
  • Go: 2.36s (2.31, 2.42, 2.36, 2.33, 2.36)
  • Java (oldschool): 4.77s / 2.66s if not taking the first measurement into account (13.15, 2.59, 2.48, 3.08, 2.58, 2.58)
  • Groovy: 4.33s (4.16s, 4.27s, 4.55, 4.42, 4.27)
  • Node.js: 7.10s (7.56, 7.18, 7.01, 6.89, 6.88)
  • PHP: 7.44s (7.25, 7.35, 7.28, 7.37, 7.97)
  • Python: 7.45s (6.59, 7.28, 6.81, 8.99, 7.59)
  • Ruby: 8.85s (9.3, 9.38, 8.6, 8.37, 8.61)
  • Bash: 12.34s (12.62, 12.22, 12.78, 12.80, 11.29)
  • Lua: 22.81s (24.08, 22.65, 22.11, 21.53, 23.70)

Your output should look like this:

Query took 7.56 seconds
Main_Page (394296)
Malware (51666)
Loan (45440)
Special:HideBanners (40771)
Y%C5%AB_Kobayashi (34596)
Special:Search (18672)
Glutamate_flavoring (17508)
Online_shopping (16310)
Chang_and_Eng_Bunker (14956)
Dance_Moms (8928)
// Run with javac WMStats.java && java WMStats
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;
import java.util.stream.Collectors;
public class WMStats {
private static final String filename = "pagecounts-20141029-230000";
private static final String prefix = "en ";
private static final int threshold = 500;
private static final Charset charset = StandardCharsets.ISO_8859_1;
public static void main(String[] args) throws IOException {
new WMStats().go();
}
public void go() throws IOException {
long start = System.currentTimeMillis();
List<NameAndCount> top10 = Files
.lines(Paths.get(filename), charset)
.filter(line -> line.startsWith(prefix))
.map(line -> line.split(" "))
.map(elems -> new NameAndCount(elems[1], Integer.parseInt(elems[2])))
.filter(pair -> pair.count >= threshold)
.sorted((a, b) -> Integer.compare(b.count, a.count))
.limit(10)
.collect(Collectors.toList());
System.out.println("Query took " + (System.currentTimeMillis() - start) / 1000f + " seconds");
for (NameAndCount pair : top10) {
System.out.println(pair.name + " (" + pair.count + ")");
}
}
class NameAndCount {
String name;
int count;
NameAndCount(String name, int count) {
this.name = name;
this.count = count;
}
}
}
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
public class WMStats {
private static final String FILE_NAME = "pagecounts-20141029-230000";
private static final String PREFIX = "en ";
private static final String DELIMITER = " ";
private static final int THRESHOLD = 500;
public static void main(String[] args) throws IOException {
new WMStats().go();
}
public void go() throws IOException {
List<NameAndCount> list = new ArrayList<>();
long start = System.currentTimeMillis();
try (BufferedReader reader = new BufferedReader(new FileReader(FILE_NAME))) {
for (String line; (line = reader.readLine()) != null; ) {
if (!line.startsWith(PREFIX)) {
continue;
}
String[] parts = line.split(DELIMITER);
int count = Integer.parseInt(parts[2]);
if (count >= THRESHOLD) {
list.add(new NameAndCount(parts[1], count));
}
}
}
Collections.sort(list);
System.out.println("Query took " + (System.currentTimeMillis() - start) / 1000f + " seconds");
for (NameAndCount pair : list.subList(0, 9)) {
System.out.println(pair.name + " (" + pair.count + ")");
}
}
class NameAndCount implements Comparable<NameAndCount> {
String name;
int count;
NameAndCount(String name, int count) {
this.name = name;
this.count = count;
}
@Override
public int compareTo(NameAndCount other) {
return Integer.compare(other.count, count);
}
}
}
time grep '^en ' pagecounts-20141029-230000 |\
awk '{ if ($3 >= 500) print $3 " " $2 }' |\
LC_ALL=C sort -nr |\
head -n 10 |\
awk '{ print $2 " (" $1 ")" }'
// Compile with gcc -Wall -O3 ws.c -o ws && ./ws
#include <stdio.h> // fopen(), feof(), fgets(), fclose(), printf()
#include <string.h> // strcpy()
#include <stdlib.h> // atoi()
#include <time.h> // clock()
#define BUFSIZE 1024
struct article {
char title[BUFSIZE];
int views;
} articles[10];
char line[BUFSIZE];
int column, slot, i, j, v, views;
clock_t start;
void findLowestSlot() {
slot = -1;
v = 1 << 30;
for( i=0; i<10; i++ ) {
if (articles[i].views < v) {
slot = i;
v = articles[i].views;
}
}
}
void printSorted() {
for( j=0; j<10; j++ ) {
v = 0;
slot = 0;
for( i=0; i<10; i++ ) {
if( articles[i].views > v ) {
v = articles[i].views;
slot = i;
}
}
printf("%s (%d)\n", articles[slot].title, articles[slot].views);
articles[slot].views = 0;
}
printf("Query took %ld ms\n", (clock() - start) / (CLOCKS_PER_SEC / 1000));
}
int main(void) {
start = clock();
FILE* file = fopen("pagecounts-20141029-230000", "r");
while( !feof(file) ) {
fgets(&line[0], BUFSIZE-1, file);
if( line[0] != 'e' || line[1] != 'n' || line[2] != ' ' ) {
continue;
}
column = 3;
while( line[column++] != ' ' );
views = atoi(&line[column]);
if( views < 500 ) {
continue;
}
findLowestSlot();
if( slot != -1 && articles[slot].views < views ) {
articles[slot].views = views;
line[column-1] = '\0';
strcpy(articles[slot].title, &line[3]);
}
}
fclose(file);
printSorted();
return 0;
}
using System;
using System.Diagnostics;
using System.IO;
using System.Linq;
class Program
{
static void Main(string[] args)
{
string filename = "pagecounts-20141029-230000";
int minViews = 500;
Stopwatch timer = new Stopwatch();
timer.Start();
var top10 = File.ReadLines(filename)
.Where(line => line.Substring(0, 3) == "en ")
.Select(line => line.Split(' '))
.Select(values => Tuple.Create(values[1], int.Parse(values[2])))
.Where(tuple => tuple.Item2 >= minViews)
.OrderByDescending(tuple => tuple.Item2)
.Take(10)
.ToArray();
timer.Stop();
Console.WriteLine("Query took " + timer.Elapsed.TotalSeconds + " seconds");
foreach (var tuple in top10)
{
Console.WriteLine("{0} ({1})", tuple.Item1, tuple.Item2);
}
}
}
package main
import (
"fmt"
"os"
"bufio"
"strings"
"strconv"
"sort"
"time"
)
type Article struct {
article string
views int
}
type Articles []Article
func timestamp() int64 {
return time.Now().UnixNano()
}
func (slice Articles) Len() int {
return len(slice)
}
func (slice Articles) Less(i, j int) bool {
return slice[i].views > slice[j].views
}
func (slice Articles) Swap(i, j int) {
slice[i], slice[j] = slice[j], slice[i]
}
func main() {
filename := "pagecounts-20141029-230000"
minviews := 500
prefix := "en "
file, err := os.Open(filename)
if err != nil {
fmt.Println(err)
}
scanner := bufio.NewScanner(file)
articles := make(Articles, 0)
start := timestamp()
for scanner.Scan() {
line := scanner.Text()
if (!strings.HasPrefix(line, prefix)) {
continue
}
parts := strings.Split(line, " ")
articlename := parts[1]
views := parts[2]
viewsInt, _ := strconv.Atoi(views)
if (viewsInt > minviews) {
article := Article{
article: articlename,
views : viewsInt,
}
articles = append(articles, article)
}
}
sort.Sort(articles)
elapsed := timestamp() - start
fmt.Printf("Query took %f seconds\n", time.Duration(elapsed).Seconds())
for i := 0; i < 10; i++ {
fmt.Printf("%v (%v)\n", articles[i].article, articles[i].views)
}
}
#!/usr/bin/env groovy
filename = 'pagecounts-20141029-230000'
prefix = 'en '
delimiter = ' '
threshold = 500
overThreshold = []
start = System.currentTimeMillis()
new File(filename).eachLine { line ->
if (line.startsWith(prefix)) {
def elems = line.split(delimiter)
def count = elems[2].toInteger()
if (count >= threshold) {
overThreshold << [count, elems[1]]
}
}
}
overThreshold.sort { a, b -> b[0] <=> a[0] }
println "Query took ${(System.currentTimeMillis() - start) / 1000} seconds"
overThreshold[0..9].each { line ->
println line[1] + ' (' + line[0] + ')'
}
{- Compile with ghc -Wall -O2 wmstats.hs and time with time ./wmstats -}
{-# LANGUAGE OverloadedStrings #-}
import Data.List (sortBy)
import qualified Data.ByteString.Char8 as B8
main :: IO ()
main = B8.readFile "pagecounts-20141029-230000" >>=
mapM_ display . take 10 . sortBy (flip compare) . filter popular
. map parse . filter en . map B8.words . B8.lines
where
en :: [B8.ByteString] -> Bool
en ("en":_) = True
en _ = False
parse :: [B8.ByteString] -> (Int, B8.ByteString)
parse (_:t:vs:_) = case B8.readInt vs of
Just (v, "") -> (v, t)
_ -> (0, "")
parse _ = (0, "")
popular :: (Int, B8.ByteString) -> Bool
popular (v, _) = v > (500 :: Int)
display :: (Int, B8.ByteString) -> IO ()
display (v, t) = B8.putStrLn $ B8.concat [t, " (", B8.pack $ show v, ")"]
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
public class WMStats {
private static final String FILE_NAME = "pagecounts-20141029-230000";
private static final String PREFIX = "en ";
private static final String DELIMITER = " ";
private static final int THRESHOLD = 500;
public static void main(String[] args) throws IOException {
new WMStats().go();
}
public void go() throws IOException {
List<NameAndCount> list = new ArrayList<>();
long start = System.currentTimeMillis();
try (BufferedReader reader = new BufferedReader(new FileReader(FILE_NAME))) {
for (String line; (line = reader.readLine()) != null; ) {
if (!line.startsWith(PREFIX)) {
continue;
}
String[] parts = line.split(DELIMITER);
int count = Integer.parseInt(parts[2]);
if (count >= THRESHOLD) {
list.add(new NameAndCount(parts[1], count));
}
}
}
Collections.sort(list);
System.out.println("Query took " + (System.currentTimeMillis() - start) / 1000f + " seconds");
for (NameAndCount pair : list.subList(0, 9)) {
System.out.println(pair.name + " (" + pair.count + ")");
}
}
class NameAndCount implements Comparable<NameAndCount> {
String name;
int count;
NameAndCount(String name, int count) {
this.name = name;
this.count = count;
}
@Override
public int compareTo(NameAndCount other) {
return Integer.compare(other.count, count);
}
}
}
var fs = require('fs');
var readline = require('readline');
var filename = 'pagecounts-20141029-230000';
var minviews = 500;
var prefix = 'en ';
var rd = readline.createInterface({
input : fs.createReadStream(filename),
output : process.stdout,
terminal : false
});
var start = +new Date();
var count = [];
rd.on('line', function(line) {
if (line.indexOf(prefix) !== 0) {
return;
}
var parts = line.split(" ");
var lang = parts[0];
var article = parts[1];
var views = parts[2];
if (views > minviews) {
count.push([article, parseInt(views)]);
}
});
rd.on('close', function() {
count.sort(function(a, b) {
return a[1] > b[1] ? -1 : 1;
});
var elapsed = (+new Date() - start) / 1000;
console.log("Query took " + elapsed.toFixed(2) + " seconds");
count.slice(0, 10).forEach(function(item) {
console.log(item[0] + " (" + item[1] + ")");
});
});
filename = "pagecounts-20141029-230000"
minviews = 500
prefix = "en "
-- Here's something interesting: Lua doesn't have a split
-- function :/
function split(str)
pieces = {}
for piece in string.gmatch(str, "%S+") do
table.insert(pieces, piece)
end
return pieces
end
f = io.open(filename)
start = os.clock()
count = {}
while true do
line = f:read()
if line == nil then break end
-- Yes, indexes in Lua start at 1 instead of 0 :/
if line:find(prefix) == 1 then
pieces = split(line)
article = pieces[2]
views = tonumber(pieces[3])
if views > minviews then
table.insert(count, {
article = article, views = views
})
end
end
end
table.sort(count, function (a, b)
return a.views > b.views
end)
elapsed = os.clock() - start
print(string.format("Query took %f seconds \n", elapsed))
for i = 1, 10 do
piece = count[i]
print(string.format("%s (%s)", piece.article, piece.views))
end
<?php
$filename = "pagecounts-20141029-230000";
$minviews = 500;
$prefix = "en ";
$f = fopen($filename, "r");
$start = microtime(true);
$count = [];
$len = strlen($prefix);
while (($line = fgets($f)) !== false) {
if (substr($line, 0, $len) == $prefix) {
list($lang, $article, $views) = explode(" ", $line);
if ($views > $minviews) {
$count[] = [$article, $views];
}
}
}
usort($count, function ($a, $b) {
return $a[1] > $b[1] ? -1 : 1;
});
$elapsed = round(microtime(true) - $start, 3);
echo "Query took $elapsed seconds\n";
$count = array_slice($count, 0, 10);
foreach ($count as $item) {
printf("%s (%s) \n", $item[0], $item[1]);
}
import sys, time
IGNORE = ["index.html", "index.php", "nl", "hoofdpagina"]
def _skip(article):
if article.lower() in IGNORE:
return True
# Namespaces
if ":" in article:
return True
return False
def stats(filename, minimalviews = 0, prefix = False, verbose = False):
f = open(filename, "r").readlines()
start = time.time()
count = []
for line in f:
if prefix and not line.startswith(prefix):
continue
(lang, article, views, size) = line.split(" ")
if _skip(article):
continue
if int(views) > minimalviews:
count.append((article, views))
count.sort(key = lambda x: int(x[1]), reverse = True)
elapsed = time.time() - start
if verbose:
print "Query took %s seconds" % round(elapsed, 2)
return count
require 'benchmark'
filename = 'pagecounts-20141029-230000'
min_views = 500
prefix = 'en '
count = []
time = Benchmark.measure do
File.open(filename).each_line do |line|
next unless line.start_with? prefix
parts = line.force_encoding("iso-8859-1").split(' ', 3)
article, views = parts[1], parts[2].to_i
count << [article, views] if views > min_views
end
count.sort! { |a,b| b[1] <=> a[1] }
end
puts "Query took #{time.total} seconds"
count[0..9].each { |item| puts "#{item[0]} (#{item[1]})" }
#![feature(slicing_syntax)]
extern crate lines; // https://github.com/xitep/lines-rs.git
extern crate time; // part of the standard library
use lines::linemapper;
use std::os;
use std::io::{File, BufferedReader};
use std::str;
use time as t;
use std::time::Duration;
struct Page {
title: String,
count: uint
}
fn main() {
let args = os::args();
if args.len() != 2 {
os::set_exit_status(1);
return;
}
let start = t::precise_time_ns();
let top10 = process_file(File::open(&Path::new(&args[1])).unwrap());
let ellapsed = Duration::nanoseconds((t::precise_time_ns()-start) as i64);
println!("Fast: Query took {:3d} millis", ellapsed.num_milliseconds());
for t in top10.iter() {
println!("{} ({})", t.title, t.count);
}
let start = t::precise_time_ns();
let top10 = process_file_more_idiomatically(File::open(&Path::new(&args[1])).unwrap());
let ellapsed = Duration::nanoseconds((t::precise_time_ns()-start) as i64);
println!("Idiomatic: Query took {:3d} millis", ellapsed.num_milliseconds());
for t in top10.iter() {
println!("{} ({})", t.title, t.count);
}
}
fn process_file(f: File) -> Vec<Page> {
let r = BufferedReader::with_capacity(4*1024, f);
let mut pages = Vec::<Page>::with_capacity(500);
linemapper::map_lines(r, |line| {
if line[0] != b'e' || line[1] != b'n' || line[2] != b' ' {
return true; // ~ skip this line but continue parsing
}
let mut s = unsafe {str::raw::from_utf8(line[3..])}.split(' ');
let title = s.next().unwrap();
let count: uint = from_str(s.next().unwrap()).unwrap();
if count < 500 {
return true; // ~ skip this line but continue parsing
}
pages.push(Page{title: String::from_str(title), count: count});
true // ~ continue parsing
}).unwrap();
pages.sort_by(|a, b| b.count.cmp(&a.count));
pages.truncate(10);
pages
}
fn process_file_more_idiomatically(f: File) -> Vec<Page> {
let mut r = BufferedReader::with_capacity(4*1024, f);
let mut pages = Vec::<Page>::with_capacity(500);
for line in r.lines() {
let line = line.unwrap();
if !line.starts_with("en ") {
continue;
}
let mut s = line.split(' ');
s.next(); // ~ consume the already matched "en "
let title = s.next().unwrap();
let count: uint = from_str(s.next().unwrap()).unwrap();
if count >= 500 {
pages.push(Page{title: String::from_str(title), count: count});
}
}
pages.sort_by(|a, b| b.count.cmp(&a.count));
pages.truncate(10);
pages
}
@agounaris
Copy link

With a few tiny modifications the php version is down to an average of 4.7 sec

agounaris@S0051 ~/Development $ php my_one.php
Query took 4.699 seconds
Main_Page (394296)
...
agounaris@S0051 ~/Development $ php my_one.php
Query took 4.826 seconds
Main_Page (394296)
....
agounaris@S0051 ~/Development $ php my_one.php
Query took 4.592 seconds
Main_Page (394296)
<?php
$filename = "pagecounts-20141029-230000";
$minviews = 500;
$prefix = "en ";

$f = fopen($filename, "r");

$start = microtime(true);

$count = [];
$len = strlen($prefix);

while (($line = fgets($f)) !== false) {
    if (substr($line, 0, $len) == $prefix) {
        list($lang, $article, $views) = explode(" ", $line);

        if ($views > $minviews) {
            $count[] = [$article, $views];
        }
    }
}

usort($count, function ($a, $b) {
    return $a[1] > $b[1] ? -1 : 1;
});

$elapsed = round(microtime(true) - $start, 3);
echo "Query took $elapsed seconds\n";

$count = array_slice($count, 0, 10);

foreach ($count as $item) {
    printf("%s (%s) \n", $item[0], $item[1]);
}

@pdonald
Copy link

pdonald commented Nov 4, 2014

Python

if prefix and not line.startswith(prefix):

Remove prefix and: 4.36s -> 4.14s

Change to if line[:3] != prefix:: 4.36s -> 3.5s

You should also try pypy.

C#/.NET

2.34s on my computer

using System;
using System.Diagnostics;
using System.IO;
using System.Linq;

class Program
{
    static void Main(string[] args)
    {
        string filename = "pagecounts-20141029-230000";
        int minViews = 500;

        Stopwatch timer = new Stopwatch();
        timer.Start();

        var top10 = File.ReadLines(filename)
                   .Where(line => line.Substring(0, 3) == "en ")
                   .Select(line => line.Split(' '))
                   .Select(values => Tuple.Create(values[1], int.Parse(values[2])))
                   .Where(tuple => tuple.Item2 >= minViews)
                   .OrderByDescending(tuple => tuple.Item2)
                   .Take(10)
                   .ToArray();

        timer.Stop();

        Console.WriteLine("Query took " + timer.Elapsed.TotalSeconds + " seconds");

        foreach (var tuple in top10)
        {
            Console.WriteLine("{0} ({1})", tuple.Item1, tuple.Item2);
        }
    }
}

@breun
Copy link

breun commented Nov 4, 2014

Optimized Groovy version: first check for prefix, then split, just like the other implementations.

#!/usr/bin/env groovy

filename = 'pagecounts-20141029-230000'
prefix = 'en '
delimiter = ' '
threshold = 500

overThreshold = []

start = System.currentTimeMillis()

new File(filename).eachLine { line ->
    if (line.startsWith(prefix)) {
        def elems = line.split(delimiter)
        def count = elems[2].toInteger()
        if (count >= threshold) {
            overThreshold << [count, elems[1]]
        }
    }
}

overThreshold.sort { a, b -> b[0] <=> a[0] }

println "Query took ${(System.currentTimeMillis() - start) / 1000} seconds"

overThreshold[0..9].each { line ->
    println line[1] + ' (' + line[0] + ')'
}

Now runs in about 3 seconds as well:

$ ./wmstats.groovy 
Query took 3.095 seconds
Main_Page (394296)
Malware (51666)
Loan (45440)
Special:HideBanners (40771)
Y%C5%AB_Kobayashi (34596)
Special:Search (18672)
Glutamate_flavoring (17508)
Online_shopping (16310)
Chang_and_Eng_Bunker (14956)
Dance_Moms (8928)

@c-rack
Copy link

c-rack commented Nov 4, 2014

Here is a quick and dirty C implementation:

#include <stdio.h>  // fopen(), feof(), fgets(), fclose(), printf()
#include <string.h> // strcpy()
#include <stdlib.h> // atoi()
#include <time.h>   // clock()

#define BUFSIZE 1024

struct article {
    char title[BUFSIZE];
    int views;
} articles[10];
char line[BUFSIZE];
int column, slot, i, j, v, views;
clock_t start;

void findLowestSlot() {
    slot = -1;
    v = 1 << 30;
    for( i=0; i<10; i++ ) {
        if (articles[i].views < v) {
            slot = i;
            v = articles[i].views;
        }
    }
}

void printSorted() {
    for( j=0; j<10; j++ ) {
        v = 0;
        slot = 0;
        for( i=0; i<10; i++ ) {
            if( articles[i].views > v ) {
                v = articles[i].views;
                slot = i;
            }
        }
        printf("%s (%d)\n", articles[slot].title, articles[slot].views);
        articles[slot].views = 0;
    }
    printf("Query took %ld ms\n", (clock() - start) / (CLOCKS_PER_SEC / 1000));
}

int main(void) {
    start = clock();
    FILE* file = fopen("pagecounts-20141029-230000", "r");
    while( !feof(file) ) {
        fgets(&line[0], BUFSIZE-1, file);
        if( line[0] != 'e' || line[1] != 'n' || line[2] != ' ' ) {
            continue;
        }
        column = 3;
        while( line[column++] != ' ' );
        views = atoi(&line[column]);
        if( views < 500 ) {
            continue;
        }
        findLowestSlot();
        if( slot != -1 && articles[slot].views < views ) {
            articles[slot].views = views;
            line[column-1] = '\0';
            strcpy(articles[slot].title, &line[3]);
        }
    }
    fclose(file);
    printSorted();
    return 0;
}

which is about twice as fast as the golang version on my system:

$ gcc -Wall -O3 ws.c -o ws && ./ws
Main_Page (394296)
Malware (51666)
Loan (45440)
Special:HideBanners (40771)
Y%C5%AB_Kobayashi (34596)
Special:Search (18672)
Glutamate_flavoring (17508)
Online_shopping (16310)
Chang_and_Eng_Bunker (14956)
Dance_Moms (8928)
Query took 1278 ms

@gillesdemey
Copy link

Correct Golang time print:

func timestamp() int64 {
    return time.Now().UnixNano()
}
fmt.Printf("Query took %f seconds\n", time.Duration(elapsed).Seconds())

Prints Query took 2.626277 seconds on my 2012 MacBook Air

@hadrianw
Copy link

hadrianw commented Nov 4, 2014

My dirty version in C, but line length is limited by memory.

On my system:
my version takes 530-550 ms.
c-rack's version takes 560-600 ms.
go version takes 3150-3280 ms

Compile with:

$ gcc -std=c99 -Wall -Wextra -pedantic main.c -o main -O3
#define _POSIX_C_SOURCE 201411L
#include <stdio.h>  
#include <stdlib.h> 
#include <time.h>   

#define LENGTH(x) (sizeof x / sizeof x[0])

size_t
nospace(char *str) {
    size_t i = 0;
    for(; ; str++) {
        switch(*str) {
        case ' ':
        case '\0':
            return i;
        }
        i++;
    }
}

typedef struct {
    size_t count;
    char *titlestr;
    char *countstr;
} Page;

typedef struct {
    size_t na;
    size_t nu;
    Page *v;
} Pages;

void
append(Pages *ps, Page p) {
    if(ps->nu == ps->na) {
        ps->na = ps->na * 3 / 2;
        ps->v = realloc(ps->v, sizeof ps->v[0] * ps->na);
    }
    ps->v[ps->nu++] = p;
}

int
cmp(const void *p1, const void *p2) {
    return *(size_t*)p2 - *(size_t*)p1;
}

int
main(int argc, char *argv[]) {
    clock_t start = clock();
    FILE *f = fopen("pagecounts-20141029-230000", "rb");
    size_t nline = 64;
    char *line = malloc(nline);

    Pages pages = {.na = 4096};
    pages.v = malloc(sizeof pages.v[0] * pages.na);

    while( getline(&line, &nline, f) > 0 ){
        char *s = line;
        if(s[0] == 'e' && s[1] == 'n' && s[2] == ' ') {
            Page a;
            s += 3;

            a.titlestr = s;
            s += nospace(s);
            s[0] = '\0';
            s++;

            a.countstr = s;
            s += nospace(s);
            s[0] = '\0';

            a.count = atoll(a.countstr);

            if(a.count > 500) {
                append(&pages, a);
                nline = 64;
                line = malloc(nline);
            }
        }
    }
    qsort(pages.v, pages.nu, sizeof pages.v[0], cmp);
    for(int i = 0; i < 10; i++) {
        fputs(pages.v[i].titlestr, stdout);
        fputs(" (", stdout);
        fputs(pages.v[i].countstr, stdout);
        puts(")");
    }
    printf("Query took %ld ms\n", (clock() - start) * 1000 / CLOCKS_PER_SEC);
    return 0;
}

@TravisCardwell
Copy link

Here is a quick version in Haskell:

{-# LANGUAGE OverloadedStrings #-}
import           Data.List             (sortBy)
import qualified Data.ByteString.Char8 as B8

main :: IO ()
main = B8.readFile "pagecounts-20141029-230000" >>=
    mapM_ display . take 10 . sortBy (flip compare) . filter popular 
                  . map parse . filter en . map B8.words . B8.lines

  where

    en :: [B8.ByteString] -> Bool
    en ("en":_) = True
    en _        = False

    parse :: [B8.ByteString] -> (Int, B8.ByteString)
    parse (_:t:vs:_) = case B8.readInt vs of
                         Just (v, "") -> (v, t)
                         _            -> (0, "")
    parse _          = (0, "")

    popular :: (Int, B8.ByteString) -> Bool
    popular (v, _) = v > (500 :: Int)

    display :: (Int, B8.ByteString) -> IO ()
    display (v, t) = B8.putStrLn $ B8.concat [t, " (", B8.pack $ show v, ")"]

To compile: ghc -Wall -O2 wmstats.hs

To time: time ./wmstats

On my machine, the Go version takes ~2.040s and this Haskell version takes ~1.232s.

@xitep
Copy link

xitep commented Nov 4, 2014

///
/// on my machine: (Intel(R) Core(TM) i5-3210M CPU @ 2.50GHz) / 8GB / Standard Debian Wheezy
/// average (each program has been run at least 10 times):
///
///   rust:
///     -- compile: rustc --opt-level=3 src/main.rs -L ../lines-rs/target/release/
///      -- note: this is a _non-idiomatic_ implementation, it avoids
///         unnecessary allocation and treats input as ascii. part of
///         it's speed, comes from the lines-rs library which locates
///         the end of line using SIMD instructions (just as go's
///         version on my machine):
///      -- 410 - 420 millis
///       -- this is the version shown below in the function: process_file
///
///    -- this is a measurement is the same as about with the specified assembly
///       code disabled in the lines-rs library. pure rust code:
///     -- this set-up is very similar to hadrianw's c version
///     -- 545 - 555 millis
///
///    -- this is a measurement where input is validated to be utf-8 with
///        the specilized assembly code in lines-rs disabled:
///     -- 651 millis
///
///    -- this is a measurement where input is validated to be utf-8 with
///        the specilized assembly code in lines-rs enabled:
///     -- 539 millis
///
///    -- here a measure utilize only the standard library (e.g. doing a lot
///       of unnecessary allocation in our case and enforcing input to be
///       utf8):
///     -- 1405 millis
///     -- this set-up is very similar to your go version (my non-optimized version - see below)
///     -- I'd expect improvements in future here since Rust's IO API currently
///        enforces allocation for every line but could be avoided in future.
///     -- this is the version shown below in the function process_file_more_idiomatically
///
///   java "new school" (oracle jvm 1.8.0_25-b17) (excluding measuments until
///   jvm compiled to navite code):
///     -- 1653 millis
///
///   java "old school" (oracle jvm 1.8.0_25-b17) (excluding measurements until
///   jvm compiled to native code):
///     -- 1521 millis
///
/// go version: (go version go1.3.3 linux/amd64)
///     -- compile: go build -gcflags=-B -ccflags=-B pagecount.go
///     -- 1.944513922s
///
/// go version (optimized to avoid needless alloation) (go version go1.3.3 linux/amd64)
///     -- compile: go build -gcflags=-B -ccflags=-B pagecount.go
///     -- 1.459461794s
///
/// TravisCardwell's haskell version (The Glorious Glasgow Haskell Compilation System, version 7.4.1)
///     -- compile as adviced by TravisCardwell
///     -- 1.18s
///
/// hadrianw's c version: (gcc (Debian 4.7.2-5) 4.7.2)
///     -- compile as adviced by hadrianw
///     -- 510ms - 520ms

Here's the rust code:

#![feature(slicing_syntax)]

extern crate lines; // https://github.com/xitep/lines-rs.git
extern crate time; // part of the standard library

use lines::linemapper; 
use std::os;
use std::io::{File, BufferedReader};
use std::str;
use time as t;
use std::time::Duration;

struct Page {
    title: String,
    count: uint
}

fn main() {
    let args = os::args();
    if args.len() != 2 {
        os::set_exit_status(1);
        return;
    }

    let start = t::precise_time_ns();
    let top10 = process_file(File::open(&Path::new(&args[1])).unwrap());
    let ellapsed = Duration::nanoseconds((t::precise_time_ns()-start) as i64);

    println!("Fast: Query took {:3d} millis", ellapsed.num_milliseconds());
    for t in top10.iter() {
        println!("{} ({})", t.title, t.count);
    }

    let start = t::precise_time_ns();
    let top10 = process_file_more_idiomatically(File::open(&Path::new(&args[1])).unwrap());
    let ellapsed = Duration::nanoseconds((t::precise_time_ns()-start) as i64);

    println!("Idiomatic: Query took {:3d} millis", ellapsed.num_milliseconds());
    for t in top10.iter() {
        println!("{} ({})", t.title, t.count);
    }
}

fn process_file(f: File) -> Vec<Page> {
    let r = BufferedReader::with_capacity(4*1024, f);
    let mut pages = Vec::<Page>::with_capacity(500);
    linemapper::map_lines(r, |line| {
        if line[0] != b'e' || line[1] != b'n' || line[2] != b' ' {
            return true; // ~ skip this line but continue parsing
        }
        let mut s = unsafe {str::raw::from_utf8(line[3..])}.split(' ');
        let title = s.next().unwrap();
        let count: uint = from_str(s.next().unwrap()).unwrap();
        if count < 500 {
            return true; // ~ skip this line but continue parsing
        }
        pages.push(Page{title: String::from_str(title), count: count});
        true // ~ continue parsing
    }).unwrap();
    pages.sort_by(|a, b| b.count.cmp(&a.count));
    pages.truncate(10);
    pages
}

fn process_file_more_idiomatically(f: File) -> Vec<Page> {
    let mut r = BufferedReader::with_capacity(4*1024, f);
    let mut pages = Vec::<Page>::with_capacity(500);
    for line in r.lines() {
        let line = line.unwrap();
        if !line.starts_with("en ") {
            continue;
        }
        let mut s = line.split(' ');
        s.next(); // ~ consume the already matched "en "
        let title = s.next().unwrap();
        let count: uint = from_str(s.next().unwrap()).unwrap();
        if count >= 500 {
            pages.push(Page{title: String::from_str(title), count: count});
        }
    }
    pages.sort_by(|a, b| b.count.cmp(&a.count));
    pages.truncate(10);
    pages
}

@hay
Copy link
Author

hay commented Nov 4, 2014

Thanks for all your versions! I've updated my benchmarks accordingly. C is still the fastest though, but we've made some great improvements in the Python and Groovy versions.

I've also added a lua version, that is unfortunately the slowest by a landslide: 22.81s!

Conclusion of this little experiment: don't use lua for parsing text ;)

@xitep
Copy link

xitep commented Nov 8, 2014

Hello, I just realized I forgot to share my optimized go version. Here you go:

package main

import (
    "bufio"
    "bytes"
    "fmt"
    "os"
    "sort"
    "time"
)

type Article struct {
    article string
    views   int
}

type Articles []Article

func (slice Articles) Len() int {
    return len(slice)
}

func (slice Articles) Less(i, j int) bool {
    return slice[i].views > slice[j].views
}

func (slice Articles) Swap(i, j int) {
    slice[i], slice[j] = slice[j], slice[i]
}

func main() {
    filename := "pagecounts-20141029-230000"
    minviews := 500
    prefix := []byte("en ")

    file, err := os.Open(filename)

    if err != nil {
        fmt.Println(err)
    }

    scanner := bufio.NewScanner(file)
    articles := make(Articles, 0, 500)
    start := time.Now()

    spaceSep := []byte{' '}
    for scanner.Scan() {
        line := scanner.Bytes()

        if !bytes.HasPrefix(line, prefix) {
            continue
        }

        parts := bytes.Split(line, spaceSep)
        articlename := parts[1]
        views := parts[2]
        viewsInt := bytesToUInt(views)

        if viewsInt < minviews {
            continue
        }

        article := Article{
            article: string(articlename),
            views:   viewsInt,
        }

        articles = append(articles, article)
    }

    sort.Sort(articles)

    elapsed := time.Since(start)
    // So how do we get a float here in as a Println?
    fmt.Printf("Query took %v\n", elapsed)

    for i := 0; i < 10; i++ {
        fmt.Printf("%v (%v)\n", articles[i].article, articles[i].views)
    }
}

func bytesToUInt(xs []byte) int {
    n := 0
    for _, x := range xs {
        n = n*10 + int(x-'0')
    }
    return n
}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment