alaiacano/MapTest.scala

## generate_data.py
import random
import sys

try:
    xrange  # Python 2: lazy integer range
except NameError:
    xrange = range  # Python 3: the built-in range is already lazy

# Defaults reproduce the original hard-coded behavior of this script.
DEFAULT_PATH = "data_2M_rows.tsv"
DEFAULT_ROWS = 2000000


def generate_data(path=DEFAULT_PATH, n_rows=DEFAULT_ROWS, seed=0):
    """Write ``n_rows`` tab-separated benchmark rows to ``path``.

    Each row has four fields: two random integers between 0 and 10
    (inclusive) followed by the literal letters ``c`` and ``d``.

    Args:
        path: Output file; it is created or truncated.
        n_rows: Number of rows to write (0 produces an empty file).
        seed: Seed for the global ``random`` generator, so repeated runs on
            the same interpreter produce the same file.
    """
    random.seed(seed)
    with open(path, "w") as fout:
        for _ in xrange(n_rows):
            (a, b) = (random.randint(0, 10), random.randint(0, 10))
            fout.write("%d\t%d\tc\td\n" % (a, b))


if __name__ == "__main__":
    # Usage: generate_data.py [OUTPUT_PATH [N_ROWS]]
    # With no arguments this writes 2,000,000 rows to data_2M_rows.tsv.
    out_path = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_PATH
    row_count = int(sys.argv[2]) if len(sys.argv) > 2 else DEFAULT_ROWS
    generate_data(out_path, row_count)

"""
First 20 lines:

9	8	c	d
4	2	c	d
5	4	c	d
8	3	c	d
5	6	c	d
9	5	c	d
3	8	c	d
6	2	c	d
10	10	c	d
8	9	c	d
3	8	c	d
9	7	c	d
5	1	c	d
4	6	c	d
10	10	c	d
5	9	c	d
2	8	c	d
6	0	c	d
7	4	c	d
9	7	c	d
"""

## map_test.R
# Scalar selector: returns `c` when a > b, otherwise `d`.
fn <- function(a,b,c,d) {
    if (a > b) {c} else {d}
}
# `if` only tests a single value, so wrap fn to apply it element by element
# across whole columns. This is the row-wise "map" being benchmarked;
# ifelse(d$a > d$b, d$c, d$d) would be the natively vectorized alternative.
fn.v <- Vectorize(fn)

# NOTE(review): this reads a 20M-row file, while generate_data.py writes
# data_2M_rows.tsv -- confirm which input is intended.
d <- read.table(
    "~/Desktop/maptest/data_20M_rows.tsv",
    header=FALSE,
    col.names=c("a", "b", "c", "d"),
    stringsAsFactors=FALSE)

# Append the selected value as a fifth column `e`.
d$e <- fn.v(d$a, d$b, d$c, d$d)

# write.table separates fields with a single space by default; pass sep="\t"
# so the .tsv output really is tab-separated.
write.table(d, "data_r.tsv", sep="\t", row.names=FALSE, col.names=FALSE, quote=FALSE)

## map_test_csv.py
import csv
import sys

# The csv module expects binary file objects on Python 2, but text files
# opened with newline='' on Python 3.
_PY3 = sys.version_info[0] >= 3


def _open_for_csv(path, writing):
    """Open ``path`` the way the csv module expects on this Python version."""
    if _PY3:
        return open(path, 'w' if writing else 'r', newline='')
    return open(path, 'wb' if writing else 'r')


def map_rows_csv(in_path, out_path):
    """Copy a four-column TSV to ``out_path``, appending a fifth column.

    For each input row ``(a, b, c, d)`` the appended value is ``c`` when
    ``int(a) > int(b)`` and ``d`` otherwise. Rows are written with the csv
    module's default line terminator (``\\r\\n``).

    Args:
        in_path: Tab-separated input file with exactly four fields per row.
        out_path: Output file; it is created or truncated.

    Raises:
        ValueError: If a row does not have exactly four fields, or its first
            two fields are not integers.
    """
    # Both handles are closed on exit, even if a malformed row raises.
    with _open_for_csv(in_path, False) as src:
        with _open_for_csv(out_path, True) as dst:
            fin = csv.reader(src, delimiter='\t')
            fout = csv.writer(dst, delimiter='\t')
            for row in fin:
                (a, b, c, d) = row
                # Compare numerically: as strings, "10" would sort below "9".
                e = c if int(a) > int(b) else d
                fout.writerow([a, b, c, d, e])


if __name__ == "__main__":
    # Usage: map_test_csv.py INPUT_TSV OUTPUT_TSV
    map_rows_csv(sys.argv[1], sys.argv[2])

## map_test_pandas.py
import pandas as pd
import sys


def map_rows_pandas(in_path, out_path):
    """Copy a four-column TSV to ``out_path``, appending a fifth column.

    The input is loaded with columns named ``a``, ``b``, ``c``, ``d``. The new
    column ``e`` holds ``c`` where ``a > b`` and ``d`` otherwise, computed one
    row at a time with ``DataFrame.apply``.

    Args:
        in_path: Tab-separated input file without a header row.
        out_path: Output file, written tab-separated without header or index.
    """
    data = pd.read_table(in_path, names=["a", "b", "c", "d"])

    # Each row is a Series labelled a..d, so index it by label. Integer keys
    # would rely on pandas' deprecated positional fallback.
    data['e'] = data.apply(
        lambda row: row["c"] if row["a"] > row["b"] else row["d"], axis=1)

    data.to_csv(out_path, sep="\t", header=False, index=False)


if __name__ == "__main__":
    # Usage: map_test_pandas.py INPUT_TSV OUTPUT_TSV
    map_rows_pandas(sys.argv[1], sys.argv[2])

## MapTest.scala
import com.twitter.scalding._
import TDsl._

/**
 * Scalding job that copies a four-column TSV and appends a fifth column.
 *
 * Rows are read as (Int, Int, String, String) from the path given by the
 * `input` argument. The appended value is the third field when the first
 * field is greater than the second, and the fourth field otherwise. The
 * five-column result is written to the path given by the `output` argument.
 */
class MapTest(args: Args) extends Job(args) {
  val data = TypedTsv[(Int, Int, String, String)](args("input"))
    .map { row =>
      val (first, second, whenGreater, otherwise) = row
      // Pick the column to duplicate based on the numeric comparison.
      val chosen =
        if (first > second) whenGreater
        else otherwise
      (first, second, whenGreater, otherwise, chosen)
    }
    .write(TypedTsv[(Int, Int, String, String, String)](args("output")))
}

## results.md

      
    Raw
  

              results.md
            
          
    The test is to open a file, do a simple operation on each row and write it to a new file.
I ran a very scientific test on my MacBook Air by running each of these once or twice for each file.
2,000,000 rows:

awk

This is probably the theoretical minimum
time cat data_2M_rows.tsv | awk -F'\t' '{if ($1 > $2) {print $1,$2,$3,$4,$3} else {print $1,$2,$3,$4,$4}}' > data_awk.tsv

real	0m4.643s
user	0m4.543s
sys	0m0.083s

scalding

real	0m8.179s
user	0m9.361s
sys	0m0.425s

Python - csv

real	0m9.621s
user	0m9.475s
sys	0m0.088s

R

real	0m36.028s
user	0m35.140s
sys	0m0.637s

Python - pandas

real	1m6.409s
user	1m5.250s
sys	0m0.764s

10,000,000 rows

awk

real	0m24.844s
user	0m24.380s
sys	0m0.367s

scalding

real	0m30.500s
user	0m30.930s
sys	0m0.768s

Python - csv

real	0m47.627s
user	0m46.933s
sys	0m0.414s

Python - pandas & R

forget about it
	import random
	import sys

	try:
	    xrange  # Python 2: lazy integer range
	except NameError:
	    xrange = range  # Python 3: the built-in range is already lazy

	# Defaults reproduce the original hard-coded behavior of this script.
	DEFAULT_PATH = "data_2M_rows.tsv"
	DEFAULT_ROWS = 2000000


	def generate_data(path=DEFAULT_PATH, n_rows=DEFAULT_ROWS, seed=0):
	    """Write ``n_rows`` tab-separated benchmark rows to ``path``.

	    Each row has four fields: two random integers between 0 and 10
	    (inclusive) followed by the literal letters ``c`` and ``d``.

	    Args:
	        path: Output file; it is created or truncated.
	        n_rows: Number of rows to write (0 produces an empty file).
	        seed: Seed for the global ``random`` generator, so repeated runs on
	            the same interpreter produce the same file.
	    """
	    random.seed(seed)
	    with open(path, "w") as fout:
	        for _ in xrange(n_rows):
	            (a, b) = (random.randint(0, 10), random.randint(0, 10))
	            fout.write("%d\t%d\tc\td\n" % (a, b))


	if __name__ == "__main__":
	    # Usage: generate_data.py [OUTPUT_PATH [N_ROWS]]
	    # With no arguments this writes 2,000,000 rows to data_2M_rows.tsv.
	    out_path = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_PATH
	    row_count = int(sys.argv[2]) if len(sys.argv) > 2 else DEFAULT_ROWS
	    generate_data(out_path, row_count)

	"""
	First 20 lines:

	9 8 c d
	4 2 c d
	5 4 c d
	8 3 c d
	5 6 c d
	9 5 c d
	3 8 c d
	6 2 c d
	10 10 c d
	8 9 c d
	3 8 c d
	9 7 c d
	5 1 c d
	4 6 c d
	10 10 c d
	5 9 c d
	2 8 c d
	6 0 c d
	7 4 c d
	9 7 c d
	"""
	# Scalar selector: returns `c` when a > b, otherwise `d`.
	fn <- function(a,b,c,d) {
	    if (a > b) {c} else {d}
	}
	# `if` only tests a single value, so wrap fn to apply it element by element
	# across whole columns.
	fn.v <- Vectorize(fn)

	# NOTE(review): this reads a 20M-row file -- confirm which input is intended.
	d <- read.table(
	    "~/Desktop/maptest/data_20M_rows.tsv",
	    header=FALSE,
	    col.names=c("a", "b", "c", "d"),
	    stringsAsFactors=FALSE)

	# Append the selected value as a fifth column `e`.
	d$e <- fn.v(d$a, d$b, d$c, d$d)

	# write.table separates fields with a single space by default; pass sep="\t"
	# so the .tsv output really is tab-separated.
	write.table(d, "data_r.tsv", sep="\t", row.names=FALSE, col.names=FALSE, quote=FALSE)
	import csv
	import sys

	# The csv module expects binary file objects on Python 2, but text files
	# opened with newline='' on Python 3.
	_PY3 = sys.version_info[0] >= 3


	def _open_for_csv(path, writing):
	    """Open ``path`` the way the csv module expects on this Python version."""
	    if _PY3:
	        return open(path, 'w' if writing else 'r', newline='')
	    return open(path, 'wb' if writing else 'r')


	def map_rows_csv(in_path, out_path):
	    """Copy a four-column TSV to ``out_path``, appending a fifth column.

	    For each input row ``(a, b, c, d)`` the appended value is ``c`` when
	    ``int(a) > int(b)`` and ``d`` otherwise.

	    Args:
	        in_path: Tab-separated input file with exactly four fields per row.
	        out_path: Output file; it is created or truncated.

	    Raises:
	        ValueError: If a row does not have exactly four fields, or its first
	            two fields are not integers.
	    """
	    # Both handles are closed on exit, even if a malformed row raises.
	    with _open_for_csv(in_path, False) as src:
	        with _open_for_csv(out_path, True) as dst:
	            fin = csv.reader(src, delimiter='\t')
	            fout = csv.writer(dst, delimiter='\t')
	            for row in fin:
	                (a, b, c, d) = row
	                # Compare numerically: as strings, "10" would sort below "9".
	                e = c if int(a) > int(b) else d
	                fout.writerow([a, b, c, d, e])


	if __name__ == "__main__":
	    # Usage: map_test_csv.py INPUT_TSV OUTPUT_TSV
	    map_rows_csv(sys.argv[1], sys.argv[2])
	import pandas as pd
	import sys


	def map_rows_pandas(in_path, out_path):
	    """Copy a four-column TSV to ``out_path``, appending a fifth column.

	    The input is loaded with columns named ``a``, ``b``, ``c``, ``d``. The new
	    column ``e`` holds ``c`` where ``a > b`` and ``d`` otherwise, computed one
	    row at a time with ``DataFrame.apply``.

	    Args:
	        in_path: Tab-separated input file without a header row.
	        out_path: Output file, written tab-separated without header or index.
	    """
	    data = pd.read_table(in_path, names=["a", "b", "c", "d"])

	    # Each row is a Series labelled a..d, so index it by label. Integer keys
	    # would rely on pandas' deprecated positional fallback.
	    data['e'] = data.apply(
	        lambda row: row["c"] if row["a"] > row["b"] else row["d"], axis=1)

	    data.to_csv(out_path, sep="\t", header=False, index=False)


	if __name__ == "__main__":
	    # Usage: map_test_pandas.py INPUT_TSV OUTPUT_TSV
	    map_rows_pandas(sys.argv[1], sys.argv[2])
	import com.twitter.scalding._
	import TDsl._

	/**
	 * Scalding job that copies a four-column TSV and appends a fifth column.
	 *
	 * Rows are read as (Int, Int, String, String) from the path given by the
	 * `input` argument. The appended value is the third field when the first
	 * field is greater than the second, and the fourth field otherwise. The
	 * five-column result is written to the path given by the `output` argument.
	 */
	class MapTest(args: Args) extends Job(args) {
	  val data = TypedTsv[(Int, Int, String, String)](args("input"))
	    .map { row =>
	      val (first, second, whenGreater, otherwise) = row
	      // Pick the column to duplicate based on the numeric comparison.
	      val chosen =
	        if (first > second) whenGreater
	        else otherwise
	      (first, second, whenGreater, otherwise, chosen)
	    }
	    .write(TypedTsv[(Int, Int, String, String, String)](args("output")))
	}