Skip to content

Instantly share code, notes, and snippets.

@dperconti
Created March 17, 2014 16:36
Show Gist options
  • Save dperconti/9602981 to your computer and use it in GitHub Desktop.
Save dperconti/9602981 to your computer and use it in GitHub Desktop.
A tag counting script with comments
import sys
import json
import codecs
from collections import defaultdict
# Replace stdout with a UTF-8 stream writer so that unicode text written to
# it (for example by the report loop further down) is encoded as UTF-8.
# codecs.getwriter() returns the writer class for the named encoding; calling
# that class wraps the existing stream.
utf8_writer = codecs.getwriter("utf8")
sys.stdout = utf8_writer(sys.stdout)
# collections.Counter was introduced in 2.7; defaultdict(int) gives the same
# "missing keys start at zero" behaviour and stays 2.6 compatible.
# Read more here:
# http://docs.python.org/2/library/collections.html#counter-objects
tag_counts = defaultdict(int)


def count_tags(lines, counts=None):
    """Tally how often each tag appears across device-info records.

    Each item of *lines* is expected to hold one JSON object, optionally
    surrounded by whitespace and/or commas (as when a JSON array is written
    one element per line).  The object's "tags" value is a string holding a
    Python expression that evaluates to a sequence of tags, or to None.

    Skipped without error: lines that are not valid JSON (such as the lone
    "[" and "]" that open and close the array), records that are not JSON
    objects, and records whose "tags" value is missing, null or empty.

    Args:
        lines: Iterable of text lines.  An open file works and is consumed
            one line at a time, so the data is never held in memory at once.
        counts: Optional mapping to update in place.  It must yield 0 for
            missing keys (e.g. ``defaultdict(int)``).  A fresh
            ``defaultdict(int)`` is created when omitted.

    Returns:
        The mapping of tag -> occurrence count (``counts`` itself when it
        was supplied).
    """
    if counts is None:
        counts = defaultdict(int)
    for line in lines:
        # Keep the try block as small as possible: only the parse can fail
        # in a way we want to ignore.  json.loads reports malformed input
        # with ValueError, so nothing broader needs to be caught (and
        # KeyboardInterrupt / ctrl + C still stops the program).
        try:
            # Stripping the whitespace and commas leaves just the object.
            device_info = json.loads(line.strip().strip(','))
        except ValueError:
            continue
        # A line can be valid JSON without being an object (e.g. a bare
        # number); such a record cannot carry a "tags" field.
        if not isinstance(device_info, dict):
            continue
        raw_tags = device_info.get("tags")
        # Missing key, JSON null or empty string: nothing to evaluate.
        if not raw_tags:
            continue
        # The tags arrive as the text of a Python expression, not as JSON.
        # SECURITY: eval() executes arbitrary Python, so this is only
        # acceptable for input files you trust.  If "tags" only ever holds
        # plain literals, ast.literal_eval is a safer drop-in -- TODO confirm
        # the upstream format before switching.
        tags = eval(raw_tags)
        # tags could be None (or an empty sequence).
        if not tags:
            continue
        for tag in tags:
            counts[tag] += 1
    return counts


# Using sys.argv[1] allows you to use this on other files in the future
# without modifying the script.  The input is only read when the file is run
# as a script.
if __name__ == "__main__":
    with open(sys.argv[1]) as f:
        count_tags(f, tag_counts)
# Results go to stdout rather than to a hard-coded file.  Redirecting with
#     python my_script.py input.txt > out.txt
# gives more or less the same result as writing the file here, while leaving
# the choice of destination to the caller.
for tag in tag_counts:
    # One "<tag> -- <count>" line per distinct tag.  The line is formatted
    # first and handed to print as a single parenthesised value.
    print("%s -- %s" % (tag, tag_counts[tag]))
# The rest of the code is an example that is not meant to execute, so the
# script stops here.  The explicit sys.exit(0) exists only to skip it.
sys.exit(0)
# If you really want to write to a file, you could use something like this...
# (Example only: it is never reached while the sys.exit(0) above is in place.)
# Open the 2nd file provided on the command line through codecs.open so that
# every unicode string written is encoded as UTF-8.  A plain open(..., 'wb')
# would leave Python 2 to convert unicode to bytes implicitly, which fails
# for tags containing non-ASCII characters.
with codecs.open(sys.argv[2], 'wb', 'utf8') as f:
    for tag, count in tag_counts.iteritems():
        # Terminate each record with a newline so the file has one
        # "<tag> -- <count>" entry per line, like the stdout report, instead
        # of running every entry together on a single line.
        f.write(u"{0} -- {1}\n".format(tag, count))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment