hay/WMStats.java

## wikistats-readme.md

      
    Raw
  

              wikistats-readme.md
            
          
    Here's the assignment:
Download this raw statistics dump from Wikipedia (360mb unzipped):
http://dumps.wikimedia.org/other/pagecounts-raw/2014/2014-10/pagecounts-20141029-230000.gz
Write a simple script in your favourite programming language that:

Gets all views from the English Wikipedia (these are prefixed by "en ")
Limit those articles to the ones with at least 500 views
Sort by number of views, highest ones first and print the first ten articles.
Also measure the time this takes and print it out as well.

Right now we've got versions in Javascript (Node.js), PHP, Go, Python, Ruby, Bash (awk/sed/grep), Groovy and Java in both Java 8 functional style and 'old school' style.
The Bash, Groovy and Java versions were written by @breun, the Ruby version was written by @tieleman, the others by yours truly.
Some measurements on my machine (2011 Macbook Pro, no SSD):

C: 1.63s (1.58, 1.73, 1.59, 1.62, 1.63)
Go: 2.36s (2.31, 2.42, 2.36, 2.33, 2.36)
Java (oldschool): 4.77s / 2.66s if not taking the first measurement into account  (13.15, 2.59, 2.48, 3.08, 2.58, 2.58)
Groovy: 4.33s (4.16s, 4.27s, 4.55, 4.42, 4.27)
Node.js: 7.10s (7.56, 7.18, 7.01, 6.89, 6.88)
PHP: 7.44s (7.25, 7.35, 7.28, 7.37, 7.97)
Python: 7.45s (6.59, 7.28, 6.81, 8.99, 7.59)
Ruby: 8.85s (9.3, 9.38, 8.6, 8.37, 8.61)
Bash: 12.34s (12.62, 12.22, 12.78, 12.80, 11.29)
Lua: 22.81s (24.08, 22.65, 22.11, 21.53, 23.70)

Your output should look like this:
Query took 7.56 seconds
Main_Page (394296)
Malware (51666)
Loan (45440)
Special:HideBanners (40771)
Y%C5%AB_Kobayashi (34596)
Special:Search (18672)
Glutamate_flavoring (17508)
Online_shopping (16310)
Chang_and_Eng_Bunker (14956)
Dance_Moms (8928)


## wmstats-newschool.java
// Run with javac WMStats.java && java WMStats

import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;
import java.util.stream.Collectors;

public class WMStats {

    private static final String filename = "pagecounts-20141029-230000";
    private static final String prefix = "en ";
    private static final int threshold = 500;
    private static final Charset charset = StandardCharsets.ISO_8859_1;

    public static void main(String[] args) throws IOException {
        new WMStats().go();
    }

    public void go() throws IOException {
        long start = System.currentTimeMillis();

        List<NameAndCount> top10 = Files
            .lines(Paths.get(filename), charset)
            .filter(line -> line.startsWith(prefix))
            .map(line -> line.split(" "))
            .map(elems -> new NameAndCount(elems[1], Integer.parseInt(elems[2])))
            .filter(pair -> pair.count >= threshold)
            .sorted((a, b) -> Integer.compare(b.count, a.count))
            .limit(10)
            .collect(Collectors.toList());

        System.out.println("Query took " + (System.currentTimeMillis() - start) / 1000f + " seconds");

        for (NameAndCount pair : top10) {
            System.out.println(pair.name + " (" + pair.count + ")");
        }
    }

    class NameAndCount {
        String name;
        int count;

        NameAndCount(String name, int count) {
            this.name = name;
            this.count = count;
        }
    }
}

## wmstats-oldschool.java
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

public class WMStats {

    private static final String FILE_NAME = "pagecounts-20141029-230000";
    private static final String PREFIX = "en ";
    private static final String DELIMITER = " ";
    private static final int THRESHOLD = 500;

    public static void main(String[] args) throws IOException {
        new WMStats().go();
    }

    public void go() throws IOException {
        List<NameAndCount> list = new ArrayList<>();

        long start = System.currentTimeMillis();

        try (BufferedReader reader = new BufferedReader(new FileReader(FILE_NAME))) {
            for (String line; (line = reader.readLine()) != null; ) {
                if (!line.startsWith(PREFIX)) {
                    continue;
                }
                String[] parts = line.split(DELIMITER);
                int count = Integer.parseInt(parts[2]);
                if (count >= THRESHOLD) {
                    list.add(new NameAndCount(parts[1], count));
                }
            }
        }
        Collections.sort(list);

        System.out.println("Query took " + (System.currentTimeMillis() - start) / 1000f + " seconds");

        for (NameAndCount pair : list.subList(0, 9)) {
            System.out.println(pair.name + " (" + pair.count + ")");
        }
    }

    class NameAndCount implements Comparable<NameAndCount> {
        String name;
        int count;

        NameAndCount(String name, int count) {
            this.name = name;
            this.count = count;
        }

        @Override
        public int compareTo(NameAndCount other) {
            return Integer.compare(other.count, count);
        }
    }
}

## wmstats.bash
time grep '^en ' pagecounts-20141029-230000 |\
awk '{ if ($3 >= 500) print $3 " " $2 }' |\
LC_ALL=C sort -nr |\
head -n 10 |\
awk '{ print $2 " (" $1 ")" }'

## wmstats.c
// Compile with gcc -Wall -O3 ws.c -o ws && ./ws

#include <stdio.h>  // fopen(), feof(), fgets(), fclose(), printf()
#include <string.h> // strcpy()
#include <stdlib.h> // atoi()
#include <time.h>   // clock()

#define BUFSIZE 1024

struct article {
    char title[BUFSIZE];
    int views;
} articles[10];
char line[BUFSIZE];
int column, slot, i, j, v, views;
clock_t start;

void findLowestSlot() {
    slot = -1;
    v = 1 << 30;
    for( i=0; i<10; i++ ) {
        if (articles[i].views < v) {
            slot = i;
            v = articles[i].views;
        }
    }
}

void printSorted() {
    for( j=0; j<10; j++ ) {
        v = 0;
        slot = 0;
        for( i=0; i<10; i++ ) {
            if( articles[i].views > v ) {
                v = articles[i].views;
                slot = i;
            }
        }
        printf("%s (%d)\n", articles[slot].title, articles[slot].views);
        articles[slot].views = 0;
    }
    printf("Query took %ld ms\n", (clock() - start) / (CLOCKS_PER_SEC / 1000));
}

int main(void) {
    start = clock();
    FILE* file = fopen("pagecounts-20141029-230000", "r");
    while( !feof(file) ) {
        fgets(&line[0], BUFSIZE-1, file);
        if( line[0] != 'e' || line[1] != 'n' || line[2] != ' ' ) {
            continue;
        }
        column = 3;
        while( line[column++] != ' ' );
        views = atoi(&line[column]);
        if( views < 500 ) {
            continue;
        }
        findLowestSlot();
        if( slot != -1 && articles[slot].views < views ) {
            articles[slot].views = views;
            line[column-1] = '\0';
            strcpy(articles[slot].title, &line[3]);
        }
    }
    fclose(file);
    printSorted();
    return 0;
}

## wmstats.cs
using System;
using System.Diagnostics;
using System.IO;
using System.Linq;

class Program
{
    static void Main(string[] args)
    {
        string filename = "pagecounts-20141029-230000";
        int minViews = 500;

        Stopwatch timer = new Stopwatch();
        timer.Start();

        var top10 = File.ReadLines(filename)
                   .Where(line => line.Substring(0, 3) == "en ")
                   .Select(line => line.Split(' '))
                   .Select(values => Tuple.Create(values[1], int.Parse(values[2])))
                   .Where(tuple => tuple.Item2 >= minViews)
                   .OrderByDescending(tuple => tuple.Item2)
                   .Take(10)
                   .ToArray();

        timer.Stop();

        Console.WriteLine("Query took " + timer.Elapsed.TotalSeconds + " seconds");

        foreach (var tuple in top10)
        {
            Console.WriteLine("{0} ({1})", tuple.Item1, tuple.Item2);
        }
    }
}

## wmstats.go
package main

import (
    "fmt"
    "os"
    "bufio"
    "strings"
    "strconv"
    "sort"
    "time"
)

type Article struct {
    article string
    views int
}

type Articles []Article

func timestamp() int64 {
    return time.Now().UnixNano()
}

func (slice Articles) Len() int {
    return len(slice)
}

func (slice Articles) Less(i, j int) bool {
    return slice[i].views > slice[j].views
}

func (slice Articles) Swap(i, j int) {
    slice[i], slice[j] = slice[j], slice[i]
}

func main() {
    filename := "pagecounts-20141029-230000"
    minviews := 500
    prefix := "en "

    file, err := os.Open(filename)

    if err != nil {
        fmt.Println(err)
    }

    scanner := bufio.NewScanner(file)
    articles := make(Articles, 0)
    start := timestamp()

    for scanner.Scan() {
        line := scanner.Text()

        if (!strings.HasPrefix(line, prefix)) {
            continue
        }

        parts := strings.Split(line, " ")
        articlename := parts[1]
        views := parts[2]
        viewsInt, _ := strconv.Atoi(views)

        if (viewsInt > minviews) {
            article := Article{
                article: articlename,
                views : viewsInt,
            }

            articles = append(articles, article)
        }
    }

    sort.Sort(articles)

    elapsed := timestamp() - start
    fmt.Printf("Query took %f seconds\n", time.Duration(elapsed).Seconds())

    for i := 0; i < 10; i++ {
        fmt.Printf("%v (%v)\n", articles[i].article, articles[i].views)
    }
}

## wmstats.groovy
#!/usr/bin/env groovy

filename = 'pagecounts-20141029-230000'
prefix = 'en '
delimiter = ' '
threshold = 500

overThreshold = []

start = System.currentTimeMillis()

new File(filename).eachLine { line ->
    if (line.startsWith(prefix)) {
        def elems = line.split(delimiter)
        def count = elems[2].toInteger()
        if (count >= threshold) {
            overThreshold << [count, elems[1]]
        }
    }
}

overThreshold.sort { a, b -> b[0] <=> a[0] }

println "Query took ${(System.currentTimeMillis() - start) / 1000} seconds"

overThreshold[0..9].each { line ->
    println line[1] + ' (' + line[0] + ')'
}

## wmstats.hs
{- Compile with ghc -Wall -O2 wmstats.hs and time with time ./wmstats -}
{-# LANGUAGE OverloadedStrings #-}
import           Data.List             (sortBy)
import qualified Data.ByteString.Char8 as B8

main :: IO ()
main = B8.readFile "pagecounts-20141029-230000" >>=
    mapM_ display . take 10 . sortBy (flip compare) . filter popular
                  . map parse . filter en . map B8.words . B8.lines

  where

    en :: [B8.ByteString] -> Bool
    en ("en":_) = True
    en _        = False

    parse :: [B8.ByteString] -> (Int, B8.ByteString)
    parse (_:t:vs:_) = case B8.readInt vs of
                         Just (v, "") -> (v, t)
                         _            -> (0, "")
    parse _          = (0, "")

    popular :: (Int, B8.ByteString) -> Bool
    popular (v, _) = v > (500 :: Int)

    display :: (Int, B8.ByteString) -> IO ()
    display (v, t) = B8.putStrLn $ B8.concat [t, " (", B8.pack $ show v, ")"]

## WMStats.java
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

public class WMStats {

    private static final String FILE_NAME = "pagecounts-20141029-230000";
    private static final String PREFIX = "en ";
    private static final String DELIMITER = " ";
    private static final int THRESHOLD = 500;

    public static void main(String[] args) throws IOException {
        new WMStats().go();
    }

    public void go() throws IOException {
        List<NameAndCount> list = new ArrayList<>();

        long start = System.currentTimeMillis();

        try (BufferedReader reader = new BufferedReader(new FileReader(FILE_NAME))) {
            for (String line; (line = reader.readLine()) != null; ) {
                if (!line.startsWith(PREFIX)) {
                    continue;
                }
                String[] parts = line.split(DELIMITER);
                int count = Integer.parseInt(parts[2]);
                if (count >= THRESHOLD) {
                    list.add(new NameAndCount(parts[1], count));
                }
            }
        }
        Collections.sort(list);

        System.out.println("Query took " + (System.currentTimeMillis() - start) / 1000f + " seconds");

        for (NameAndCount pair : list.subList(0, 9)) {
            System.out.println(pair.name + " (" + pair.count + ")");
        }
    }

    class NameAndCount implements Comparable<NameAndCount> {
        String name;
        int count;

        NameAndCount(String name, int count) {
            this.name = name;
            this.count = count;
        }

        @Override
        public int compareTo(NameAndCount other) {
            return Integer.compare(other.count, count);
        }
    }
}

## wmstats.js
var fs = require('fs');
var readline = require('readline');

var filename = 'pagecounts-20141029-230000';
var minviews = 500;
var prefix = 'en ';

var rd = readline.createInterface({
    input : fs.createReadStream(filename),
    output : process.stdout,
    terminal : false
});

var start = +new Date();

var count = [];

rd.on('line', function(line) {
    if (line.indexOf(prefix) !== 0) {
        return;
    }

    var parts = line.split(" ");
    var lang = parts[0];
    var article = parts[1];
    var views = parts[2];

    if (views > minviews) {
        count.push([article, parseInt(views)]);
    }
});

rd.on('close', function() {
    count.sort(function(a, b) {
        return a[1] > b[1] ? -1 : 1;
    });

    var elapsed = (+new Date() - start) / 1000;
    console.log("Query took " + elapsed.toFixed(2) + " seconds");

    count.slice(0, 10).forEach(function(item) {
        console.log(item[0] + " (" + item[1] + ")");
    });
});

## wmstats.lua
filename = "pagecounts-20141029-230000"
minviews = 500
prefix = "en "

-- Here's something interesting: Lua doesn't have a split
-- function :/
function split(str)
    pieces = {}

    for piece in string.gmatch(str, "%S+") do
        table.insert(pieces, piece)
    end

    return pieces
end

f = io.open(filename)

start = os.clock()

count = {}

while true do
    line = f:read()
    if line == nil then break end

    -- Yes, indexes in Lua start at 1 instead of 0 :/
    if line:find(prefix) == 1 then
        pieces = split(line)
        article = pieces[2]
        views = tonumber(pieces[3])

        if views > minviews then
            table.insert(count, {
                article = article, views = views
            })
        end
    end
end

table.sort(count, function (a, b)
    return a.views > b.views
end)

elapsed = os.clock() - start

print(string.format("Query took %f seconds \n", elapsed))

for i = 1, 10 do
    piece = count[i]
    print(string.format("%s (%s)", piece.article, piece.views))
end

## wmstats.php
<?php
$filename = "pagecounts-20141029-230000";
$minviews = 500;
$prefix = "en ";

$f = fopen($filename, "r");

$start = microtime(true);

$count = [];
$len = strlen($prefix);

while (($line = fgets($f)) !== false) {
    if (substr($line, 0, $len) == $prefix) {
        list($lang, $article, $views) = explode(" ", $line);

        if ($views > $minviews) {
            $count[] = [$article, $views];
        }
    }
}

usort($count, function ($a, $b) {
    return $a[1] > $b[1] ? -1 : 1;
});

$elapsed = round(microtime(true) - $start, 3);
echo "Query took $elapsed seconds\n";

$count = array_slice($count, 0, 10);

foreach ($count as $item) {
    printf("%s (%s) \n", $item[0], $item[1]);
}

## wmstats.py
import sys, time
IGNORE = ["index.html", "index.php", "nl", "hoofdpagina"]

def _skip(article):
    if article.lower() in IGNORE:
        return True

    # Namespaces
    if ":" in article:
        return True

    return False

def stats(filename, minimalviews = 0, prefix = False, verbose = False):
    f = open(filename, "r").readlines()

    start = time.time()

    count = []

    for line in f:
        if prefix and not line.startswith(prefix):
            continue

        (lang, article, views, size) = line.split(" ")

        if _skip(article):
            continue

        if int(views) > minimalviews:
            count.append((article, views))

    count.sort(key = lambda x: int(x[1]), reverse = True)

    elapsed = time.time() - start

    if verbose:
        print "Query took %s seconds" % round(elapsed, 2)

    return count

## wmstats.rb
require 'benchmark'

filename = 'pagecounts-20141029-230000'
min_views = 500
prefix = 'en '
count = []

time = Benchmark.measure do
  File.open(filename).each_line do |line|
    next unless line.start_with? prefix

    parts = line.force_encoding("iso-8859-1").split(' ', 3)
    article, views = parts[1], parts[2].to_i

    count << [article, views] if views > min_views
  end

  count.sort! { |a,b| b[1] <=> a[1] }
end

puts "Query took #{time.total} seconds"

count[0..9].each { |item| puts "#{item[0]} (#{item[1]})" }

## wmstats.rs
#![feature(slicing_syntax)]

extern crate lines; // https://github.com/xitep/lines-rs.git
extern crate time; // part of the standard library

use lines::linemapper;
use std::os;
use std::io::{File, BufferedReader};
use std::str;
use time as t;
use std::time::Duration;

struct Page {
    title: String,
    count: uint
}

fn main() {
    let args = os::args();
    if args.len() != 2 {
        os::set_exit_status(1);
        return;
    }

    let start = t::precise_time_ns();
    let top10 = process_file(File::open(&Path::new(&args[1])).unwrap());
    let ellapsed = Duration::nanoseconds((t::precise_time_ns()-start) as i64);

    println!("Fast: Query took {:3d} millis", ellapsed.num_milliseconds());
    for t in top10.iter() {
        println!("{} ({})", t.title, t.count);
    }

    let start = t::precise_time_ns();
    let top10 = process_file_more_idiomatically(File::open(&Path::new(&args[1])).unwrap());
    let ellapsed = Duration::nanoseconds((t::precise_time_ns()-start) as i64);

    println!("Idiomatic: Query took {:3d} millis", ellapsed.num_milliseconds());
    for t in top10.iter() {
        println!("{} ({})", t.title, t.count);
    }
}

fn process_file(f: File) -> Vec<Page> {
    let r = BufferedReader::with_capacity(4*1024, f);
    let mut pages = Vec::<Page>::with_capacity(500);
    linemapper::map_lines(r, |line| {
        if line[0] != b'e' || line[1] != b'n' || line[2] != b' ' {
            return true; // ~ skip this line but continue parsing
        }
        let mut s = unsafe {str::raw::from_utf8(line[3..])}.split(' ');
        let title = s.next().unwrap();
        let count: uint = from_str(s.next().unwrap()).unwrap();
        if count < 500 {
            return true; // ~ skip this line but continue parsing
        }
        pages.push(Page{title: String::from_str(title), count: count});
        true // ~ continue parsing
    }).unwrap();
    pages.sort_by(|a, b| b.count.cmp(&a.count));
    pages.truncate(10);
    pages
}

fn process_file_more_idiomatically(f: File) -> Vec<Page> {
    let mut r = BufferedReader::with_capacity(4*1024, f);
    let mut pages = Vec::<Page>::with_capacity(500);
    for line in r.lines() {
        let line = line.unwrap();
        if !line.starts_with("en ") {
            continue;
        }
        let mut s = line.split(' ');
        s.next(); // ~ consume the already matched "en "
        let title = s.next().unwrap();
        let count: uint = from_str(s.next().unwrap()).unwrap();
        if count >= 500 {
            pages.push(Page{title: String::from_str(title), count: count});
        }
    }
    pages.sort_by(|a, b| b.count.cmp(&a.count));
    pages.truncate(10);
    pages
}
	// Run with javac WMStats.java && java WMStats

	import java.io.IOException;
	import java.nio.charset.Charset;
	import java.nio.charset.StandardCharsets;
	import java.nio.file.Files;
	import java.nio.file.Paths;
	import java.util.List;
	import java.util.stream.Collectors;

	public class WMStats {

	private static final String filename = "pagecounts-20141029-230000";
	private static final String prefix = "en ";
	private static final int threshold = 500;
	private static final Charset charset = StandardCharsets.ISO_8859_1;

	public static void main(String[] args) throws IOException {
	new WMStats().go();
	}

	public void go() throws IOException {
	long start = System.currentTimeMillis();

	List<NameAndCount> top10 = Files
	.lines(Paths.get(filename), charset)
	.filter(line -> line.startsWith(prefix))
	.map(line -> line.split(" "))
	.map(elems -> new NameAndCount(elems[1], Integer.parseInt(elems[2])))
	.filter(pair -> pair.count >= threshold)
	.sorted((a, b) -> Integer.compare(b.count, a.count))
	.limit(10)
	.collect(Collectors.toList());

	System.out.println("Query took " + (System.currentTimeMillis() - start) / 1000f + " seconds");

	for (NameAndCount pair : top10) {
	System.out.println(pair.name + " (" + pair.count + ")");
	}
	}

	class NameAndCount {
	String name;
	int count;

	NameAndCount(String name, int count) {
	this.name = name;
	this.count = count;
	}
	}
	}
	import java.io.BufferedReader;
	import java.io.FileReader;
	import java.io.IOException;
	import java.util.ArrayList;
	import java.util.Collections;
	import java.util.List;

	public class WMStats {

	private static final String FILE_NAME = "pagecounts-20141029-230000";
	private static final String PREFIX = "en ";
	private static final String DELIMITER = " ";
	private static final int THRESHOLD = 500;

	public static void main(String[] args) throws IOException {
	new WMStats().go();
	}

	public void go() throws IOException {
	List<NameAndCount> list = new ArrayList<>();

	long start = System.currentTimeMillis();

	try (BufferedReader reader = new BufferedReader(new FileReader(FILE_NAME))) {
	for (String line; (line = reader.readLine()) != null; ) {
	if (!line.startsWith(PREFIX)) {
	continue;
	}
	String[] parts = line.split(DELIMITER);
	int count = Integer.parseInt(parts[2]);
	if (count >= THRESHOLD) {
	list.add(new NameAndCount(parts[1], count));
	}
	}
	}
	Collections.sort(list);

	System.out.println("Query took " + (System.currentTimeMillis() - start) / 1000f + " seconds");

	for (NameAndCount pair : list.subList(0, 9)) {
	System.out.println(pair.name + " (" + pair.count + ")");
	}
	}

	class NameAndCount implements Comparable<NameAndCount> {
	String name;
	int count;

	NameAndCount(String name, int count) {
	this.name = name;
	this.count = count;
	}

	@Override
	public int compareTo(NameAndCount other) {
	return Integer.compare(other.count, count);
	}
	}
	}
	time grep '^en ' pagecounts-20141029-230000 \|\
	awk '{ if ($3 >= 500) print $3 " " $2 }' \|\
	LC_ALL=C sort -nr \|\
	head -n 10 \|\
	awk '{ print $2 " (" $1 ")" }'
	// Compile with gcc -Wall -O3 ws.c -o ws && ./ws

	#include <stdio.h> // fopen(), feof(), fgets(), fclose(), printf()
	#include <string.h> // strcpy()
	#include <stdlib.h> // atoi()
	#include <time.h> // clock()

	#define BUFSIZE 1024

	struct article {
	char title[BUFSIZE];
	int views;
	} articles[10];
	char line[BUFSIZE];
	int column, slot, i, j, v, views;
	clock_t start;

	void findLowestSlot() {
	slot = -1;
	v = 1 << 30;
	for( i=0; i<10; i++ ) {
	if (articles[i].views < v) {
	slot = i;
	v = articles[i].views;
	}
	}
	}

	void printSorted() {
	for( j=0; j<10; j++ ) {
	v = 0;
	slot = 0;
	for( i=0; i<10; i++ ) {
	if( articles[i].views > v ) {
	v = articles[i].views;
	slot = i;
	}
	}
	printf("%s (%d)\n", articles[slot].title, articles[slot].views);
	articles[slot].views = 0;
	}
	printf("Query took %ld ms\n", (clock() - start) / (CLOCKS_PER_SEC / 1000));
	}

	int main(void) {
	start = clock();
	FILE* file = fopen("pagecounts-20141029-230000", "r");
	while( !feof(file) ) {
	fgets(&line[0], BUFSIZE-1, file);
	if( line[0] != 'e' \|\| line[1] != 'n' \|\| line[2] != ' ' ) {
	continue;
	}
	column = 3;
	while( line[column++] != ' ' );
	views = atoi(&line[column]);
	if( views < 500 ) {
	continue;
	}
	findLowestSlot();
	if( slot != -1 && articles[slot].views < views ) {
	articles[slot].views = views;
	line[column-1] = '\0';
	strcpy(articles[slot].title, &line[3]);
	}
	}
	fclose(file);
	printSorted();
	return 0;
	}
	using System;
	using System.Diagnostics;
	using System.IO;
	using System.Linq;

	class Program
	{
	static void Main(string[] args)
	{
	string filename = "pagecounts-20141029-230000";
	int minViews = 500;

	Stopwatch timer = new Stopwatch();
	timer.Start();

	var top10 = File.ReadLines(filename)
	.Where(line => line.Substring(0, 3) == "en ")
	.Select(line => line.Split(' '))
	.Select(values => Tuple.Create(values[1], int.Parse(values[2])))
	.Where(tuple => tuple.Item2 >= minViews)
	.OrderByDescending(tuple => tuple.Item2)
	.Take(10)
	.ToArray();

	timer.Stop();

	Console.WriteLine("Query took " + timer.Elapsed.TotalSeconds + " seconds");

	foreach (var tuple in top10)
	{
	Console.WriteLine("{0} ({1})", tuple.Item1, tuple.Item2);
	}
	}
	}
	package main

	import (
	"fmt"
	"os"
	"bufio"
	"strings"
	"strconv"
	"sort"
	"time"
	)

	type Article struct {
	article string
	views int
	}

	type Articles []Article

	func timestamp() int64 {
	return time.Now().UnixNano()
	}

	func (slice Articles) Len() int {
	return len(slice)
	}

	func (slice Articles) Less(i, j int) bool {
	return slice[i].views > slice[j].views
	}

	func (slice Articles) Swap(i, j int) {
	slice[i], slice[j] = slice[j], slice[i]
	}

	func main() {
	filename := "pagecounts-20141029-230000"
	minviews := 500
	prefix := "en "

	file, err := os.Open(filename)

	if err != nil {
	fmt.Println(err)
	}

	scanner := bufio.NewScanner(file)
	articles := make(Articles, 0)
	start := timestamp()

	for scanner.Scan() {
	line := scanner.Text()

	if (!strings.HasPrefix(line, prefix)) {
	continue
	}

	parts := strings.Split(line, " ")
	articlename := parts[1]
	views := parts[2]
	viewsInt, _ := strconv.Atoi(views)

	if (viewsInt > minviews) {
	article := Article{
	article: articlename,
	views : viewsInt,
	}

	articles = append(articles, article)
	}
	}

	sort.Sort(articles)

	elapsed := timestamp() - start
	fmt.Printf("Query took %f seconds\n", time.Duration(elapsed).Seconds())

	for i := 0; i < 10; i++ {
	fmt.Printf("%v (%v)\n", articles[i].article, articles[i].views)
	}
	}
	#!/usr/bin/env groovy

	filename = 'pagecounts-20141029-230000'
	prefix = 'en '
	delimiter = ' '
	threshold = 500

	overThreshold = []

	start = System.currentTimeMillis()

	new File(filename).eachLine { line ->
	if (line.startsWith(prefix)) {
	def elems = line.split(delimiter)
	def count = elems[2].toInteger()
	if (count >= threshold) {
	overThreshold << [count, elems[1]]
	}
	}
	}

	overThreshold.sort { a, b -> b[0] <=> a[0] }

	println "Query took ${(System.currentTimeMillis() - start) / 1000} seconds"

	overThreshold[0..9].each { line ->
	println line[1] + ' (' + line[0] + ')'
	}
	{- Compile with ghc -Wall -O2 wmstats.hs and time with time ./wmstats -}
	{-# LANGUAGE OverloadedStrings #-}
	import Data.List (sortBy)
	import qualified Data.ByteString.Char8 as B8

	main :: IO ()
	main = B8.readFile "pagecounts-20141029-230000" >>=
	mapM_ display . take 10 . sortBy (flip compare) . filter popular
	. map parse . filter en . map B8.words . B8.lines

	where

	en :: [B8.ByteString] -> Bool
	en ("en":_) = True
	en _ = False

	parse :: [B8.ByteString] -> (Int, B8.ByteString)
	parse (_:t:vs:_) = case B8.readInt vs of
	Just (v, "") -> (v, t)
	_ -> (0, "")
	parse _ = (0, "")

	popular :: (Int, B8.ByteString) -> Bool
	popular (v, _) = v > (500 :: Int)

	display :: (Int, B8.ByteString) -> IO ()
	display (v, t) = B8.putStrLn $ B8.concat [t, " (", B8.pack $ show v, ")"]
	var fs = require('fs');
	var readline = require('readline');

	var filename = 'pagecounts-20141029-230000';
	var minviews = 500;
	var prefix = 'en ';

	var rd = readline.createInterface({
	input : fs.createReadStream(filename),
	output : process.stdout,
	terminal : false
	});

	var start = +new Date();

	var count = [];

	rd.on('line', function(line) {
	if (line.indexOf(prefix) !== 0) {
	return;
	}

	var parts = line.split(" ");
	var lang = parts[0];
	var article = parts[1];
	var views = parts[2];

	if (views > minviews) {
	count.push([article, parseInt(views)]);
	}
	});

	rd.on('close', function() {
	count.sort(function(a, b) {
	return a[1] > b[1] ? -1 : 1;
	});

	var elapsed = (+new Date() - start) / 1000;
	console.log("Query took " + elapsed.toFixed(2) + " seconds");

	count.slice(0, 10).forEach(function(item) {
	console.log(item[0] + " (" + item[1] + ")");
	});
	});
	filename = "pagecounts-20141029-230000"
	minviews = 500
	prefix = "en "

	-- Here's something interesting: Lua doesn't have a split
	-- function :/
	function split(str)
	pieces = {}

	for piece in string.gmatch(str, "%S+") do
	table.insert(pieces, piece)
	end

	return pieces
	end

	f = io.open(filename)

	start = os.clock()

	count = {}

	while true do
	line = f:read()
	if line == nil then break end

	-- Yes, indexes in Lua start at 1 instead of 0 :/
	if line:find(prefix) == 1 then
	pieces = split(line)
	article = pieces[2]
	views = tonumber(pieces[3])

	if views > minviews then
	table.insert(count, {
	article = article, views = views
	})
	end
	end
	end

	table.sort(count, function (a, b)
	return a.views > b.views
	end)

	elapsed = os.clock() - start

	print(string.format("Query took %f seconds \n", elapsed))

	for i = 1, 10 do
	piece = count[i]
	print(string.format("%s (%s)", piece.article, piece.views))
	end