Skip to content

Instantly share code, notes, and snippets.

@aliles
Created July 21, 2011 12:55
Show Gist options
  • Save aliles/1097126 to your computer and use it in GitHub Desktop.
Save aliles/1097126 to your computer and use it in GitHub Desktop.
Rough Avro benchmarking using Geonames dataset
import sys
import time
from avro_geonames import *
if __name__ == '__main__':
    # Benchmark: time how long it takes to parse up to 100000 GeoNames rows
    # (path given as the first command-line argument) and append them to the
    # Avro writer returned by open_writer().
    from itertools import islice

    writer = open_writer()
    try:
        # islice stops quietly at end of input, so a dump with fewer than
        # 100000 rows is timed as-is instead of dying with StopIteration.
        features = islice(iter_geonames(sys.argv[1]), 100000)
        # NOTE: time.clock() is kept because the script targets Python 2.
        start = time.clock()
        for feature in features:
            # Row parsing happens lazily inside the generator, so it is part
            # of the measured time together with the Avro append.
            writer.append(feature._asdict())
        stop = time.clock()
    finally:
        # Closed outside the timed region so the measurement only covers
        # parsing and appending.
        writer.close()
    # Single-argument print call: parses on Python 2 and 3 and emits the same
    # text as the original print statement.
    print("%s CPU seconds" % (stop - start))
from collections import namedtuple
from avro import schema, datafile, io
# Avro schema (JSON text) for one GeoNames row: a record named "Feature" with
# 19 fields. "geonameid", "population" and "elevation" are ints, "latitude"
# and "longitude" are floats, "name" is the only field typed as string, and
# every other column is typed as bytes ("alternatenames" and "cc2" being
# arrays of bytes).
SCHEMA_STR = """{
"type": "record",
"name": "Feature",
"fields": [
{ "name": "geonameid", "type": "int" },
{ "name": "name", "type": "string" },
{ "name": "asciiname", "type": "bytes" },
{ "name": "alternatenames", "type":
{ "type": "array", "items": "bytes" }
},
{ "name": "latitude", "type": "float" },
{ "name": "longitude", "type": "float" },
{ "name": "feature_class", "type": "bytes" },
{ "name": "feature_code", "type": "bytes" },
{ "name": "country_code", "type": "bytes" },
{ "name": "cc2", "type":
{ "type": "array", "items": "bytes" }
},
{ "name": "admin1_code", "type": "bytes" },
{ "name": "admin2_code", "type": "bytes" },
{ "name": "admin3_code", "type": "bytes" },
{ "name": "admin4_code", "type": "bytes" },
{ "name": "population", "type": "int" },
{ "name": "elevation", "type": "int" },
{ "name": "gtopo30", "type": "bytes" },
{ "name": "timezone", "type": "bytes" },
{ "name": "modification_date", "type": "bytes" }
]
}"""
# Schema object parsed once at import time; open_writer() passes it to both
# the datum writer and the data-file writer.
SCHEMA = schema.parse(SCHEMA_STR)
# Column names of a GeoNames row, in file order. The names and their order
# mirror the "fields" entries of SCHEMA_STR.
FIELDS = [
    'geonameid', 'name', 'asciiname', 'alternatenames',
    'latitude', 'longitude',
    'feature_class', 'feature_code',
    'country_code', 'cc2',
    'admin1_code', 'admin2_code', 'admin3_code', 'admin4_code',
    'population', 'elevation', 'gtopo30',
    'timezone', 'modification_date',
]

# Lightweight record type for one parsed row; namedtuple accepts the list of
# field names directly.
Feature = namedtuple('Feature', FIELDS)
def iter_geonames(filename):
    """Yield one ``Feature`` per line of a tab-separated GeoNames dump.

    Each line is split on tabs into 19 columns and converted as follows:
    ``geonameid``, ``population`` and ``elevation`` become ints (an empty
    population/elevation column counts as 0), ``latitude`` and ``longitude``
    become floats, ``name`` is decoded from UTF-8, ``alternatenames`` and
    ``cc2`` are split on commas into lists, and every other column is
    passed through unchanged.

    Args:
        filename: path of the GeoNames text file to read.

    Yields:
        Feature: the parsed row.

    Raises:
        IndexError: if a line has fewer than 19 tab-separated columns.
        ValueError: if a numeric column cannot be converted.
    """
    def split_list(value):
        # ''.split(',') would give [''], i.e. a list holding one empty item;
        # an empty column must map to an empty list instead.
        return value.split(',') if value else []

    def cast(seq):
        return (
            int(seq[0]),
            # NOTE: .decode() relies on lines being byte strings, as they are
            # when a file is read in Python 2.
            seq[1].decode('utf8'),
            seq[2],
            split_list(seq[3]),
            float(seq[4]),
            float(seq[5]),
            seq[6],
            seq[7],
            seq[8],
            split_list(seq[9]),
            seq[10],
            seq[11],
            seq[12],
            seq[13],
            int(seq[14] or 0),
            int(seq[15] or 0),
            seq[16],
            seq[17],
            seq[18],
        )

    with open(filename) as source:
        for line in source:
            # Drop the line terminator first; otherwise it stays glued to the
            # last column (modification_date).
            yield Feature(*cast(line.rstrip('\r\n').split('\t')))
def open_writer(path=None):
    """Return an Avro ``DataFileWriter`` that serialises ``SCHEMA`` records.

    Args:
        path: file to write the Avro container to. Defaults to the
            platform's null device (``os.devnull``, i.e. ``/dev/null`` on
            POSIX), so the output is discarded.

    Returns:
        A ``datafile.DataFileWriter`` using ``SCHEMA`` and the ``'null'``
        codec. The caller is responsible for calling ``close()`` on it.
    """
    import os

    if path is None:
        # os.devnull keeps the original POSIX behaviour while also working on
        # platforms that have no /dev/null.
        path = os.devnull
    rec_writer = io.DatumWriter(SCHEMA)
    return datafile.DataFileWriter(
        open(path, 'wb'),
        rec_writer,
        writers_schema=SCHEMA,
        codec='null'
    )
import sys
import time
from avro_geonames import *
if __name__ == '__main__':
    # Benchmark: time 15 consecutive batches of 50 GeoNames rows (path given
    # as the first command-line argument) and print the per-batch CPU times
    # as one comma-separated line.
    from itertools import islice

    BATCHES = 15
    BATCH_SIZE = 50

    values = []
    iterator = iter_geonames(sys.argv[1])
    writer = open_writer()
    try:
        for _ in range(BATCHES):
            written = 0
            # NOTE: time.clock() is kept because the script targets Python 2.
            start = time.clock()
            # islice pulls rows lazily, so parsing stays inside the timed
            # region, and it ends quietly when the input runs out instead of
            # raising StopIteration.
            for feature in islice(iterator, BATCH_SIZE):
                writer.append(feature._asdict())
                written += 1
            stop = time.clock()
            if written < BATCH_SIZE:
                # Input exhausted: an incomplete batch is not comparable with
                # the full ones, so it is dropped and the run ends here.
                break
            values.append(stop - start)
    finally:
        # Always release the output file, even if a row fails to parse.
        writer.close()
    # Single-argument print call: parses on Python 2 and 3 and emits the same
    # text as the original print statement.
    print(",".join(str(v) for v in values))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment