Skip to content

Instantly share code, notes, and snippets.

@octoparse
Last active May 9, 2019 01:24
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save octoparse/3abc6771a87e49e34c9fa18f2ed7d91e to your computer and use it in GitHub Desktop.
Save octoparse/3abc6771a87e49e34c9fa18f2ed7d91e to your computer and use it in GitHub Desktop.
Data Science: What is the near future of Superheroines?
import collections
import re
# One ASCII capital letter. Capitals are used to spot where a new name starts
# inside a run-together token such as 'NimoyChris'.
_CAPITAL_RE = re.compile(r'[A-Z]')


def get_first_name(aString):
    """Extract first names from a run-together cast string.

    The input is presumably a cast column in which consecutive actors are
    concatenated without a separator, e.g.
    ``'Leonard Nimoy*Chris PineZachary Quinto'``.  After removing ``'*'`` and
    splitting on whitespace, each token is classified by how many capital
    letters it contains:

    * exactly one capital  -> the whole token is taken as a first name;
    * exactly two capitals -> the token is ``<surname><FirstName>`` and the
      part starting at the second capital is taken as the first name;
    * anything else        -> the token is ignored.

    The final token is never classified: it is the surname of the last actor.

    Args:
        aString: the raw cast string.

    Returns:
        A list of first-name strings.  A falsy argument (``''`` or ``None``)
        is returned unchanged.
    """
    if not aString:
        return aString

    tokens = aString.replace('*', '').split()
    first_names = []
    # Skip the final token up front instead of slicing the result afterwards:
    # slicing the result would discard a genuine first name whenever the final
    # token contributes nothing (zero or three-plus capitals).
    for token in tokens[:-1]:
        capitals = _CAPITAL_RE.findall(token)
        if len(capitals) == 1:
            first_names.append(token)
        elif len(capitals) == 2:
            # Search from index 1 so that a surname and first name sharing the
            # same initial ('SaldanaSam') still split at the second capital.
            first_names.append(token[token.find(capitals[1], 1):])
    return first_names
def read_marvel(file_name):
    """Read a list of movie titles from a text file, one title per line.

    Blank lines are dropped, so a trailing newline at the end of the file
    does not produce an empty title in the result.

    Args:
        file_name: path of the text file to read.

    Returns:
        A list of the non-blank lines, in file order, without line endings.
    """
    with open(file_name) as f:
        return [title for title in f.read().splitlines() if title.strip()]
def read_csv(file_name):
    """Load movie rows from a comma-separated file (e.g. 'boxoffice.csv').

    Every line is split on ','.  Lines that yield fewer than three fields are
    ignored.  For the remaining lines the second field is passed through
    get_first_name() and a tuple is recorded.

    Args:
        file_name: path of the file to read.

    Returns:
        A list of (movie name, get_first_name(actor field), year) tuples in
        file order; all values other than the converted actor field are the
        raw strings from the file.
    """
    with open(file_name) as handle:
        contents = handle.read()

    records = []
    for line in contents.split('\n'):
        fields = line.split(',')
        if len(fields) < 3:
            # Not enough columns (this also covers empty lines).
            continue
        # 0: movie name, 1: actor names, 2: year
        title, cast, year = fields[0], fields[1], fields[2]
        records.append((title, get_first_name(cast), year))
    return records
def lookup_gender(filename):
    """Build a first-name -> gender mapping from a two-column CSV file.

    Each non-blank line must have the form ``firstname,gender``.  Blank lines
    are skipped, and the last line is read whether or not the file ends with
    a newline.

    Args:
        filename: path of the CSV file to read.

    Returns:
        A dict mapping each first name to its gender string.  If a name
        appears more than once, the last occurrence wins.

    Raises:
        ValueError: if a non-blank line does not contain exactly one comma.
    """
    gender_dict = {}
    with open(filename) as f:
        rows = f.read().splitlines()
    for row in rows:
        if not row.strip():
            # Tolerate empty lines instead of relying on the file ending with
            # exactly one newline.
            continue
        firstname, gender = row.split(',')
        gender_dict[firstname] = gender
    return gender_dict
## SCRIPT begins
# Load the first-name -> gender lookup table.
gender_dict = lookup_gender('name.csv')
# Each movie_list entry is a 3-tuple: (movie name, actor list, year).
movie_list = read_csv('boxoffice.csv')
# Order the movies by their year column (compared as read from the file),
# in descending order.
sorted_movie_list = sorted(movie_list, key=lambda movie: movie[2], reverse=True)
# Keyed by movie name; populated while the movies are analysed below.
all_movie_dict = collections.OrderedDict()
##all_actors = set()
##for m in sorted_movie_list:
## for n in m[1]:
## all_actors.add(n)
##
##with open('raw_name.csv','w') as f:
## for name in all_actors:
## f.write(name + '\n')
## Analyze all movies' actor gender by year
# key: year, value: list holding one gender entry per recognised actor
all_year_dict = collections.OrderedDict()
for m in sorted_movie_list:
    # Index every movie by name as [actors, year] for later lookups.
    all_movie_dict[m[0]] = [m[1], m[2]]
    year = m[2]
    actors = m[1]
    # Names that are missing from gender_dict are skipped one by one. Catching
    # KeyError around the whole lookup would leave `genders` bound to the
    # previous movie's list, which would then be counted a second time.
    genders = [gender_dict[name] for name in actors if name in gender_dict]
    all_year_dict.setdefault(year, []).extend(genders)
# Single-argument print(...) produces the same output on Python 2 and 3.
print(' key: year, value: list of gender of male/female')
for i in all_year_dict:
    # Tally how often each gender value occurs for this year.
    my_temp_dict = dict(collections.Counter(all_year_dict[i]))
    print('%s %s' % (i, my_temp_dict))
## Analyze Marvel's movie actor gender by year
# construct a dict, key: year, value: list of gender of male/female
year_dict = collections.OrderedDict()
for marvel_movie in read_marvel('marvel_movies.txt'):
    if not marvel_movie.strip():
        # A blank entry (e.g. from a trailing newline in the title file) is
        # not a movie; looking it up would raise KeyError.
        continue
    # all_movie_dict maps movie name -> [actors, year]; a title that is not
    # in the box-office data still raises KeyError so the mismatch is visible.
    year = str(all_movie_dict[marvel_movie][1])
    actors = all_movie_dict[marvel_movie][0]
    # Skip names without a known gender, matching the all-movies analysis.
    gender = [gender_dict[name] for name in actors if name in gender_dict]
    year_dict.setdefault(year, []).extend(gender)
# construct a dict, key: male/female, value: count
# Single-argument print(...) produces the same output on Python 2 and 3.
print('key: male/female, value: count')
for i in year_dict:
    my_dict = dict(collections.Counter(year_dict[i]))
    print('%s %s' % (i, my_dict))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment