Created
September 13, 2014 02:34
-
-
Save jayswan/e0a97681ef8f66f367f4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import Counter | |
from csv import DictReader | |
import gzip | |
from pprint import pprint | |
from sys import argv | |
# Column names, in file order, supplied to DictReader as ``fieldnames``.
# The names follow the Bro/Zeek dns.log schema.
# NOTE: 'answers' and 'TTLs' are two distinct columns; fusing them into one
# name shifts every later column onto the wrong key.
FIELDNAMES = [
    'ts', 'uid',
    'id.orig_h', 'id.orig_p', 'id.resp_h', 'id.resp_p',
    'proto', 'trans_id', 'query',
    'qclass', 'qclass_name', 'qtype', 'qtype_name',
    'rcode', 'rcode_name',
    'AA', 'TC', 'RD', 'RA', 'Z',
    'answers', 'TTLs', 'rejected',
]
def ingest(files, delim='\t', qchar='"'):
    """Load delimited log files into one list of row dictionaries.

    Args:
        files: iterable of file paths; a single path string is also accepted.
            Paths ending in ``.gz`` are opened with ``gzip.open``.
        delim: field delimiter handed to ``DictReader``.
        qchar: quote character handed to ``DictReader``.

    Returns:
        A list with one dict per row across all files, in the order read.
        Columns are named by ``FIELDNAMES``.
    """
    # A bare string would otherwise be iterated character by character,
    # treating each character as a file name.
    if isinstance(files, str):
        files = [files]

    records = []
    for fn in files:
        # Parenthesised single-argument print is valid on Python 2 and 3.
        print('loading %s' % fn)
        # NOTE(review): on Python 3 gzip.open() defaults to binary mode, which
        # the csv module rejects; pass mode 'rt' there if this is ported.
        opener = gzip.open if fn.endswith('.gz') else open
        # The context manager closes each file once its rows are consumed.
        with opener(fn) as handle:
            reader = DictReader(handle, delimiter=delim, quotechar=qchar,
                                fieldnames=FIELDNAMES)
            records.extend(reader)
    return records
def count(records, field):
    """Tally how often each value of ``field`` occurs in ``records``.

    Returns a 'uniq -c' style list of ``(value, occurrences)`` pairs,
    most frequent first.
    """
    tally = Counter()
    for row in records:
        tally[row[field]] += 1
    return tally.most_common()
def uniq(records, field):
    """Collect the distinct values of ``field`` found in ``records`` as a set."""
    distinct = set()
    for row in records:
        distinct.add(row[field])
    return distinct
def linecount(records):
    """Return the number of entries in ``records``."""
    total = len(records)
    return total
def keys(records):
    """Return the sorted field names of the first record.

    Only ``records[0]`` is inspected. An empty ``records`` yields an empty
    list rather than raising ``IndexError``.
    """
    if not records:
        return []
    # Iterating a dict yields its keys, so sorted() needs no .keys() call.
    return sorted(records[0])
def profile(records):
    """Print a short summary of ``records``: record count, then field names.

    The parenthesised single-argument print form is used because it behaves
    the same on Python 2 and Python 3, whereas the bare print statement is a
    syntax error on Python 3.
    """
    print('Number of records:')
    print(linecount(records))
    print('fields:')
    pprint(keys(records))
def main():
    """Profile the log files named on the command line.

    Raises:
        SystemExit: with a usage message when no file arguments are given.
    """
    # argv[1:] keeps the arguments as a list of paths. Passing argv[1] alone
    # hands ingest() a string, which it would iterate one character at a
    # time, treating each character as a file name.
    files = argv[1:]
    if not files:
        raise SystemExit('usage: %s FILE [FILE ...]' % argv[0])
    profile(ingest(files))


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment