Skip to content

Instantly share code, notes, and snippets.

@atmb4u
Created October 3, 2015 05:22
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save atmb4u/7128b996fc8b1a47a506 to your computer and use it in GitHub Desktop.
Save atmb4u/7128b996fc8b1a47a506 to your computer and use it in GitHub Desktop.
Demo for the thinking-in-functions workshop, using the IMDb database
import csv
import itertools
from collections import Counter, defaultdict
import json
import re
"""
Get the Blockbuster Database dataset from http://www.crowdflower.com/data-for-everyone
"""
# Load every CSV row into memory as a list of dicts keyed by the header
# names.  The context manager closes the file even if parsing raises.
with open("top_ten_movies_per_year_DFE.csv") as f:
    r = csv.DictReader(f)
    data = list(r)
# Total worldwide gross per studio.  A movie whose 'studio' field lists
# several studios separated by "/" adds its full gross to each of them.
# Totals start from integer 0 so a studio with no parsable gross stays 0.
gross_by_studio = defaultdict(int)
for row in data:
    # csv.DictReader yields None for columns missing from a short row, so
    # normalise to "" before the substring test and the split below.
    raw_gross = row.get('worldwide_gross') or ''
    gross = 0
    if '$' in raw_gross:
        # Keep only digits and the decimal point ("$1,234.5" -> "1234.5").
        digits = "".join(re.split("[^.0-9]", raw_gross))
        try:
            gross = float(digits)
        except ValueError:
            # A "$" with no usable number counts as zero, the same value
            # used for rows that carry no dollar amount at all.
            gross = 0
    for studio in (row.get('studio') or '').split("/"):
        gross_by_studio[studio.strip()] += gross

# Expose a plain dict so later lookups of unknown studios raise KeyError
# instead of silently creating entries.
gross_by_studio = dict(gross_by_studio)
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(json.dumps(gross_by_studio, indent=4))
# Tally how often each genre appears across the Genre_1..Genre_3 columns
# of every row; empty or missing genre cells are ignored.
genre_tally = Counter()
for movie in data:
    for slot in [1, 2, 3]:
        genre = movie.get('Genre_%d' % slot)
        if genre:
            genre_tally[genre] += 1
genre_count = dict(genre_tally)
def to_gross(q):
    """Convert a formatted numeric string such as "$1,234.5" to a float.

    Every character other than a digit or "." is discarded before the
    conversion, so currency symbols and thousands separators are ignored.

    Args:
        q: The raw text to convert; may be None or empty.

    Returns:
        The parsed value as a float, or 0.0 when ``q`` is None/empty or
        holds nothing that parses as a number (for example "N/A").
    """
    if not q:
        return 0.0
    # Strip everything except digits and the decimal point.
    digits = re.sub("[^.0-9]", "", q)
    try:
        return float(digits)
    except ValueError:
        # No digits left, or a malformed number such as "1.2.3": fall back
        # to the zero used for missing values so callers that use this as
        # a sort key keep working.
        return 0.0
# Report the genre tallies.  Single-argument print(...) behaves the same
# on Python 2 and Python 3.
print(json.dumps(genre_count, indent=4))

# Order movies by their 'adjusted' value, highest first, breaking ties by
# 'imdb_rating' (also highest first).  Both columns are parsed with
# to_gross so formatted strings compare numerically.
ranked_movies = sorted(
    data,
    key=lambda row: (
        -to_gross(row.get('adjusted')),
        -to_gross(row.get('imdb_rating')),
    ),
)

# Keep (title, rating, adjusted gross, year) for the ten top-ranked movies.
gross_rating = [
    (
        row.get('title'),
        row.get('imdb_rating'),
        row.get('adjusted'),
        row.get('year'),
    )
    for row in ranked_movies[:10]
]
print(json.dumps(gross_rating, indent=4))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment