c-rack/wikistats-readme.md

## wikistats-readme.md

      
    Raw
  

              wikistats-readme.md
            
          
    Here's the assignment:
Download this raw statistics dump from Wikipedia (360mb unzipped):
http://dumps.wikimedia.org/other/pagecounts-raw/2014/2014-10/pagecounts-20141029-230000.gz
Write a simple script in your favourite programming language that:

Gets all views from the English Wikipedia (these are prefixed by "en ")
Limit those articles to the ones with at least 500 views
Sort by number of views, highest ones first and print the first ten articles.
Also measure the time this takes and print it out as well.

Right now we've got versions in Javascript (Node.js), PHP, Go, Python, Ruby, Bash (awk/sed/grep), Groovy and Java in both Java 8 functional style and 'old school' style.
The Bash, Groovy and Java versions were written by @breun, the Ruby version was written by @tieleman, the others by yours truly.
Some measurements on my machine (2011 Macbook Pro, no SSD):

Go: 2.36s (2.31, 2.42, 2.36, 2.33, 2.36)
Java (oldschool): 4.77s / 2.66s if not taking the first measurement into account  (13.15, 2.59, 2.48, 3.08, 2.58, 2.58)
Node.js: 7.10s (7.56, 7.18, 7.01, 6.89, 6.88)
Groovy: 8.14s (9.16, 7.67, 8.60, 7.87, 7.40)
PHP: 8.42s (8.54, 8.31, 8.47, 8.41, 8.39)
Ruby: 8.85s (9.3, 9.38, 8.6, 8.37, 8.61)
Python: 9.26s (8.35, 8.54, 10.43, 9.35, 9.62)
Bash: 12.34s (12.62, 12.22, 12.78, 12.80, 11.29)

Your output should look like this:
Query took 7.56 seconds
Main_Page (394296)
Malware (51666)
Loan (45440)
Special:HideBanners (40771)
Y%C5%AB_Kobayashi (34596)
Special:Search (18672)
Glutamate_flavoring (17508)
Online_shopping (16310)
Chang_and_Eng_Bunker (14956)
Dance_Moms (8928)


## wmstats-newschool.java
// Run with javac WMStats.java && java WMStats

import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;
import java.util.stream.Collectors;

public class WMStats {

    private static final String filename = "pagecounts-20141029-230000";
    private static final String prefix = "en ";
    private static final int threshold = 500;
    private static final Charset charset = StandardCharsets.ISO_8859_1;

    public static void main(String[] args) throws IOException {
        new WMStats().go();
    }

    public void go() throws IOException {
        long start = System.currentTimeMillis();

        List<NameAndCount> top10 = Files
            .lines(Paths.get(filename), charset)
            .filter(line -> line.startsWith(prefix))
            .map(line -> line.split(" "))
            .map(elems -> new NameAndCount(elems[1], Integer.parseInt(elems[2])))
            .filter(pair -> pair.count >= threshold)
            .sorted((a, b) -> Integer.compare(b.count, a.count))
            .limit(10)
            .collect(Collectors.toList());

        System.out.println("Query took " + (System.currentTimeMillis() - start) / 1000f + " seconds");

        for (NameAndCount pair : top10) {
            System.out.println(pair.name + " (" + pair.count + ")");
        }
    }

    class NameAndCount {
        String name;
        int count;

        NameAndCount(String name, int count) {
            this.name = name;
            this.count = count;
        }
    }
}

## wmstats-oldschool.java
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

public class WMStats {

    private static final String FILE_NAME = "pagecounts-20141029-230000";
    private static final String PREFIX = "en ";
    private static final String DELIMITER = " ";
    private static final int THRESHOLD = 500;

    public static void main(String[] args) throws IOException {
        new WMStats().go();
    }

    public void go() throws IOException {
        List<NameAndCount> list = new ArrayList<>();

        long start = System.currentTimeMillis();

        try (BufferedReader reader = new BufferedReader(new FileReader(FILE_NAME))) {
            for (String line; (line = reader.readLine()) != null; ) {
                if (!line.startsWith(PREFIX)) {
                    continue;
                }
                String[] parts = line.split(DELIMITER);
                int count = Integer.parseInt(parts[2]);
                if (count >= THRESHOLD) {
                    list.add(new NameAndCount(parts[1], count));
                }
            }
        }
        Collections.sort(list);

        System.out.println("Query took " + (System.currentTimeMillis() - start) / 1000f + " seconds");

        for (NameAndCount pair : list.subList(0, 9)) {
            System.out.println(pair.name + " (" + pair.count + ")");
        }
    }

    class NameAndCount implements Comparable<NameAndCount> {
        String name;
        int count;

        NameAndCount(String name, int count) {
            this.name = name;
            this.count = count;
        }

        @Override
        public int compareTo(NameAndCount other) {
            return Integer.compare(other.count, count);
        }
    }
}

## wmstats.bash
time grep '^en ' pagecounts-20141029-230000 |\
awk '{ if ($3 >= 500) print $3 " " $2 }' |\
LC_ALL=C sort -nr |\
head -n 10 |\
awk '{ print $2 " (" $1 ")" }'

## wmstats.c
#include <stdio.h>  // fopen(), feof(), fgets(), fclose(), printf()
#include <string.h> // strcpy()
#include <stdlib.h> // atoi()
#include <time.h>   // clock()

#define BUFSIZE 1024

struct article {
  char title[BUFSIZE];
  int views;
} articles[10];
char line[BUFSIZE];
int column, slot, i, j, v, views;
clock_t start;

void findLowestSlot() {
  slot = -1;
  v = 1 << 30;
  for( i=0; i<10; i++ ) {
    if (articles[i].views < v) {
      slot = i;
      v = articles[i].views;
    }
  }
}

void printSorted() {
  for( j=0; j<10; j++ ) {
    v = 0;
    slot = 0;
    for( i=0; i<10; i++ ) {
      if( articles[i].views > v ) {
        v = articles[i].views;
        slot = i;
      }
    }
    printf("%s (%d)\n", articles[slot].title, articles[slot].views);
    articles[slot].views = 0;
  }
  printf("Query took %ld ms\n", (clock() - start) / (CLOCKS_PER_SEC / 1000));
}

int main(void) {
  start = clock();
  FILE* file = fopen("pagecounts-20141029-230000", "r");
  while( !feof(file) ) {
    fgets(&line[0], BUFSIZE-1, file);
    if( line[0] != 'e' || line[1] != 'n' || line[2] != ' ' ) {
      continue;
    }
    column = 3;
    while( line[column++] != ' ' );
    views = atoi(&line[column]);
    if( views < 500 ) {
      continue;
    }
    findLowestSlot();
    if( slot != -1 && articles[slot].views < views ) {
      articles[slot].views = views;
      line[column-1] = '\0';
      strcpy(articles[slot].title, &line[3]);
    }
  }
  fclose(file);
  printSorted();
  return 0;
}


## wmstats.go
package main

import (
    "fmt"
    "os"
    "bufio"
    "strings"
    "strconv"
    "sort"
    "time"
)

type Article struct {
    article string
    views int
}

type Articles []Article

func timestamp() int64 {
    return time.Now().UnixNano() / int64(time.Millisecond)
}

func (slice Articles) Len() int {
    return len(slice)
}

func (slice Articles) Less(i, j int) bool {
    return slice[i].views > slice[j].views
}

func (slice Articles) Swap(i, j int) {
    slice[i], slice[j] = slice[j], slice[i]
}

func main() {
    filename := "pagecounts-20141029-230000"
    minviews := 500
    prefix := "en "

    file, err := os.Open(filename)

    if err != nil {
        fmt.Println(err)
    }

    scanner := bufio.NewScanner(file)
    articles := make(Articles, 0)
    start := timestamp()

    for scanner.Scan() {
        line := scanner.Text()

        if (!strings.HasPrefix(line, prefix)) {
            continue
        }

        parts := strings.Split(line, " ")
        articlename := parts[1]
        views := parts[2]
        viewsInt, _ := strconv.Atoi(views)

        if (viewsInt > minviews) {
            article := Article{
                article: articlename,
                views : viewsInt,
            }

            articles = append(articles, article)
        }
    }

    sort.Sort(articles)

    elapsed := timestamp() - start
    // So how do we get a float here in as a Println?
    fmt.Printf("Query took %v seconds\n", (elapsed / 10))

    for i := 0; i < 10; i++ {
        fmt.Printf("%v (%v)\n", articles[i].article, articles[i].views)
    }
}

## wmstats.groovy
#!/usr/bin/env groovy

filename = 'pagecounts-20141029-230000'
prefix = 'en'
threshold = 500

overThreshold = []

start = System.currentTimeMillis()

new File(filename).splitEachLine(" ") { elems ->
   def count = elems[2].toInteger()
   if (elems[0] == prefix && count >= threshold) {
        overThreshold << [count, elems[1]]
   }
}

overThreshold.sort { a, b -> b[0] <=> a[0] }

println "Query took ${(System.currentTimeMillis() - start) / 1000} seconds"

overThreshold[0..9].each { line ->
    println line[1] + ' (' + line[0] + ')'
}

## wmstats.js
var fs = require('fs');
var readline = require('readline');

var filename = 'pagecounts-20141029-230000';
var minviews = 500;
var prefix = 'en ';

var rd = readline.createInterface({
    input : fs.createReadStream(filename),
    output : process.stdout,
    terminal : false
});

var start = +new Date();

var count = [];

rd.on('line', function(line) {
    if (line.indexOf(prefix) !== 0) {
        return;
    }

    var parts = line.split(" ");
    var lang = parts[0];
    var article = parts[1];
    var views = parts[2];

    if (views > minviews) {
        count.push([article, parseInt(views)]);
    }
});

rd.on('close', function() {
    count.sort(function(a, b) {
        return a[1] > b[1] ? -1 : 1;
    });

    var elapsed = (+new Date() - start) / 1000;
    console.log("Query took " + elapsed.toFixed(2) + " seconds");

    count.slice(0, 10).forEach(function(item) {
        console.log(item[0] + " (" + item[1] + ")");
    });
});

## wmstats.php
<?php

$filename = "pagecounts-20141029-230000";
$minviews = 500;
$prefix = "en ";

$f = fopen("pagecounts-20141029-230000", "r");

$start = microtime(true);

$count = [];

while (($line = fgets($f)) !== false) {
    $len = strlen($prefix);

    if (substr($line, 0, $len) !== $prefix) {
        continue;
    }

    $pieces = explode(" ", $line);
    $lang = $pieces[0];
    $article = $pieces[1];
    $views = $pieces[2];

    if ($views > $minviews) {
        $count[] = [$article, $views];
    }
}

usort($count, function($a, $b) {
    return $a[1] > $b[1] ? -1 : 1;
});

$elapsed = round(microtime(true) - $start, 3);
echo "Query took $elapsed seconds\n";

$count = array_slice($count, 0, 10);

foreach ($count as $item) {
    printf("%s (%s) \n", $item[0], $item[1]);
}

## wmstats.py
import time

filename = "pagecounts-20141029-230000"
minviews = 500
prefix = "en "

f = open(filename, "r").readlines()

start = time.time()

count = []

for line in f:
    if prefix and not line.startswith(prefix):
        continue

    (lang, article, views, size) = line.split(" ")

    if int(views) > minviews:
        count.append((article, views))

count.sort(key = lambda x: int(x[1]), reverse = True)

elapsed = time.time() - start

print "Query took %s seconds" % round(elapsed, 2)

for item in count[0:10]:
    print "%s (%s)" % item

## wmstats.rb
require 'benchmark'

filename = 'pagecounts-20141029-230000'
min_views = 500
prefix = 'en '
count = []

time = Benchmark.measure do
  File.open(filename).each_line do |line|
    next unless line.start_with? prefix

    parts = line.force_encoding("iso-8859-1").split(' ', 3)
    article, views = parts[1], parts[2].to_i

    count << [article, views] if views > min_views
  end

  count.sort! { |a,b| b[1] <=> a[1] }
end

puts "Query took #{time.total} seconds"

count[0..9].each { |item| puts "#{item[0]} (#{item[1]})" }
	// Run with javac WMStats.java && java WMStats

	import java.io.IOException;
	import java.nio.charset.Charset;
	import java.nio.charset.StandardCharsets;
	import java.nio.file.Files;
	import java.nio.file.Paths;
	import java.util.List;
	import java.util.stream.Collectors;

	public class WMStats {

	private static final String filename = "pagecounts-20141029-230000";
	private static final String prefix = "en ";
	private static final int threshold = 500;
	private static final Charset charset = StandardCharsets.ISO_8859_1;

	public static void main(String[] args) throws IOException {
	new WMStats().go();
	}

	public void go() throws IOException {
	long start = System.currentTimeMillis();

	List<NameAndCount> top10 = Files
	.lines(Paths.get(filename), charset)
	.filter(line -> line.startsWith(prefix))
	.map(line -> line.split(" "))
	.map(elems -> new NameAndCount(elems[1], Integer.parseInt(elems[2])))
	.filter(pair -> pair.count >= threshold)
	.sorted((a, b) -> Integer.compare(b.count, a.count))
	.limit(10)
	.collect(Collectors.toList());

	System.out.println("Query took " + (System.currentTimeMillis() - start) / 1000f + " seconds");

	for (NameAndCount pair : top10) {
	System.out.println(pair.name + " (" + pair.count + ")");
	}
	}

	class NameAndCount {
	String name;
	int count;

	NameAndCount(String name, int count) {
	this.name = name;
	this.count = count;
	}
	}
	}
	import java.io.BufferedReader;
	import java.io.FileReader;
	import java.io.IOException;
	import java.util.ArrayList;
	import java.util.Collections;
	import java.util.List;

	public class WMStats {

	private static final String FILE_NAME = "pagecounts-20141029-230000";
	private static final String PREFIX = "en ";
	private static final String DELIMITER = " ";
	private static final int THRESHOLD = 500;

	public static void main(String[] args) throws IOException {
	new WMStats().go();
	}

	public void go() throws IOException {
	List<NameAndCount> list = new ArrayList<>();

	long start = System.currentTimeMillis();

	try (BufferedReader reader = new BufferedReader(new FileReader(FILE_NAME))) {
	for (String line; (line = reader.readLine()) != null; ) {
	if (!line.startsWith(PREFIX)) {
	continue;
	}
	String[] parts = line.split(DELIMITER);
	int count = Integer.parseInt(parts[2]);
	if (count >= THRESHOLD) {
	list.add(new NameAndCount(parts[1], count));
	}
	}
	}
	Collections.sort(list);

	System.out.println("Query took " + (System.currentTimeMillis() - start) / 1000f + " seconds");

	for (NameAndCount pair : list.subList(0, 9)) {
	System.out.println(pair.name + " (" + pair.count + ")");
	}
	}

	class NameAndCount implements Comparable<NameAndCount> {
	String name;
	int count;

	NameAndCount(String name, int count) {
	this.name = name;
	this.count = count;
	}

	@Override
	public int compareTo(NameAndCount other) {
	return Integer.compare(other.count, count);
	}
	}
	}
	time grep '^en ' pagecounts-20141029-230000 \|\
	awk '{ if ($3 >= 500) print $3 " " $2 }' \|\
	LC_ALL=C sort -nr \|\
	head -n 10 \|\
	awk '{ print $2 " (" $1 ")" }'
	#include <stdio.h> // fopen(), feof(), fgets(), fclose(), printf()
	#include <string.h> // strcpy()
	#include <stdlib.h> // atoi()
	#include <time.h> // clock()

	#define BUFSIZE 1024

	struct article {
	char title[BUFSIZE];
	int views;
	} articles[10];
	char line[BUFSIZE];
	int column, slot, i, j, v, views;
	clock_t start;

	void findLowestSlot() {
	slot = -1;
	v = 1 << 30;
	for( i=0; i<10; i++ ) {
	if (articles[i].views < v) {
	slot = i;
	v = articles[i].views;
	}
	}
	}

	void printSorted() {
	for( j=0; j<10; j++ ) {
	v = 0;
	slot = 0;
	for( i=0; i<10; i++ ) {
	if( articles[i].views > v ) {
	v = articles[i].views;
	slot = i;
	}
	}
	printf("%s (%d)\n", articles[slot].title, articles[slot].views);
	articles[slot].views = 0;
	}
	printf("Query took %ld ms\n", (clock() - start) / (CLOCKS_PER_SEC / 1000));
	}

	int main(void) {
	start = clock();
	FILE* file = fopen("pagecounts-20141029-230000", "r");
	while( !feof(file) ) {
	fgets(&line[0], BUFSIZE-1, file);
	if( line[0] != 'e' \|\| line[1] != 'n' \|\| line[2] != ' ' ) {
	continue;
	}
	column = 3;
	while( line[column++] != ' ' );
	views = atoi(&line[column]);
	if( views < 500 ) {
	continue;
	}
	findLowestSlot();
	if( slot != -1 && articles[slot].views < views ) {
	articles[slot].views = views;
	line[column-1] = '\0';
	strcpy(articles[slot].title, &line[3]);
	}
	}
	fclose(file);
	printSorted();
	return 0;
	}
	package main

	import (
	"fmt"
	"os"
	"bufio"
	"strings"
	"strconv"
	"sort"
	"time"
	)

	type Article struct {
	article string
	views int
	}

	type Articles []Article

	func timestamp() int64 {
	return time.Now().UnixNano() / int64(time.Millisecond)
	}

	func (slice Articles) Len() int {
	return len(slice)
	}

	func (slice Articles) Less(i, j int) bool {
	return slice[i].views > slice[j].views
	}

	func (slice Articles) Swap(i, j int) {
	slice[i], slice[j] = slice[j], slice[i]
	}

	func main() {
	filename := "pagecounts-20141029-230000"
	minviews := 500
	prefix := "en "

	file, err := os.Open(filename)

	if err != nil {
	fmt.Println(err)
	}

	scanner := bufio.NewScanner(file)
	articles := make(Articles, 0)
	start := timestamp()

	for scanner.Scan() {
	line := scanner.Text()

	if (!strings.HasPrefix(line, prefix)) {
	continue
	}

	parts := strings.Split(line, " ")
	articlename := parts[1]
	views := parts[2]
	viewsInt, _ := strconv.Atoi(views)

	if (viewsInt > minviews) {
	article := Article{
	article: articlename,
	views : viewsInt,
	}

	articles = append(articles, article)
	}
	}

	sort.Sort(articles)

	elapsed := timestamp() - start
	// So how do we get a float here in as a Println?
	fmt.Printf("Query took %v seconds\n", (elapsed / 10))

	for i := 0; i < 10; i++ {
	fmt.Printf("%v (%v)\n", articles[i].article, articles[i].views)
	}
	}
	#!/usr/bin/env groovy

	filename = 'pagecounts-20141029-230000'
	prefix = 'en'
	threshold = 500

	overThreshold = []

	start = System.currentTimeMillis()

	new File(filename).splitEachLine(" ") { elems ->
	def count = elems[2].toInteger()
	if (elems[0] == prefix && count >= threshold) {
	overThreshold << [count, elems[1]]
	}
	}

	overThreshold.sort { a, b -> b[0] <=> a[0] }

	println "Query took ${(System.currentTimeMillis() - start) / 1000} seconds"

	overThreshold[0..9].each { line ->
	println line[1] + ' (' + line[0] + ')'
	}
	var fs = require('fs');
	var readline = require('readline');

	var filename = 'pagecounts-20141029-230000';
	var minviews = 500;
	var prefix = 'en ';

	var rd = readline.createInterface({
	input : fs.createReadStream(filename),
	output : process.stdout,
	terminal : false
	});

	var start = +new Date();

	var count = [];

	rd.on('line', function(line) {
	if (line.indexOf(prefix) !== 0) {
	return;
	}

	var parts = line.split(" ");
	var lang = parts[0];
	var article = parts[1];
	var views = parts[2];

	if (views > minviews) {
	count.push([article, parseInt(views)]);
	}
	});

	rd.on('close', function() {
	count.sort(function(a, b) {
	return a[1] > b[1] ? -1 : 1;
	});

	var elapsed = (+new Date() - start) / 1000;
	console.log("Query took " + elapsed.toFixed(2) + " seconds");

	count.slice(0, 10).forEach(function(item) {
	console.log(item[0] + " (" + item[1] + ")");
	});
	});
	<?php

	$filename = "pagecounts-20141029-230000";
	$minviews = 500;
	$prefix = "en ";

	$f = fopen("pagecounts-20141029-230000", "r");

	$start = microtime(true);

	$count = [];

	while (($line = fgets($f)) !== false) {
	$len = strlen($prefix);

	if (substr($line, 0, $len) !== $prefix) {
	continue;
	}

	$pieces = explode(" ", $line);
	$lang = $pieces[0];
	$article = $pieces[1];
	$views = $pieces[2];

	if ($views > $minviews) {
	$count[] = [$article, $views];
	}
	}

	usort($count, function($a, $b) {
	return $a[1] > $b[1] ? -1 : 1;
	});

	$elapsed = round(microtime(true) - $start, 3);
	echo "Query took $elapsed seconds\n";

	$count = array_slice($count, 0, 10);

	foreach ($count as $item) {
	printf("%s (%s) \n", $item[0], $item[1]);
	}
	import time

	filename = "pagecounts-20141029-230000"
	minviews = 500
	prefix = "en "

	f = open(filename, "r").readlines()

	start = time.time()

	count = []

	for line in f:
	if prefix and not line.startswith(prefix):
	continue

	(lang, article, views, size) = line.split(" ")

	if int(views) > minviews:
	count.append((article, views))

	count.sort(key = lambda x: int(x[1]), reverse = True)

	elapsed = time.time() - start

	print "Query took %s seconds" % round(elapsed, 2)

	for item in count[0:10]:
	print "%s (%s)" % item
	require 'benchmark'

	filename = 'pagecounts-20141029-230000'
	min_views = 500
	prefix = 'en '
	count = []

	time = Benchmark.measure do
	File.open(filename).each_line do \|line\|
	next unless line.start_with? prefix

	parts = line.force_encoding("iso-8859-1").split(' ', 3)
	article, views = parts[1], parts[2].to_i

	count << [article, views] if views > min_views
	end

	count.sort! { \|a,b\| b[1] <=> a[1] }
	end

	puts "Query took #{time.total} seconds"

	count[0..9].each { \|item\| puts "#{item[0]} (#{item[1]})" }