Created
May 7, 2020 10:24
-
-
Save MrN00b0t/281bc740d0f017529c24cb33e8dcf2ac to your computer and use it in GitHub Desktop.
Codecademy: This is Jeopardy!
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd

# Show full cell contents when printing. None means "no truncation"; the
# older sentinel -1 was deprecated in pandas 1.0 in favour of None.
pd.set_option('display.max_colwidth', None)

# Load csv into DataFrame
jeopardy = pd.read_csv('jeopardy.csv')

# Investigate DataFrame structure
print(jeopardy.head())

# head() does not show the full DataFrame; list every column name instead
for col in jeopardy.columns:
    print(col)

# Column names are not compatible as variable names, so normalise them all
# in one pass: trim surrounding whitespace, lower-case, spaces -> underscores.
jeopardy.rename(
    columns=lambda name: name.strip().lower().replace(' ', '_'),
    inplace=True,
)

# Get a feel for the data held in each (renamed) column
for col in jeopardy.columns:
    print(jeopardy[col])

# New column names are show_number(int), air_date(yyyy-mm-dd), round(str),
# category(str), value(str), question(str) and answer(str)
# Search a dataset for a list of words; both haystack and needle are converted to lower case before searching
def find_question(frame, wordlist, column='question'):
    """Return the rows of *frame* whose text contains every word in *wordlist*.

    Matching is a case-insensitive substring test: a row is kept only when
    each word occurs somewhere in that row's text.

    Args:
        frame: DataFrame to search.
        wordlist: Iterable of words that must all be present.
        column: Name of the text column to search (defaults to 'question').

    Returns:
        A DataFrame containing only the matching rows of *frame*.
    """
    # Lower-case the needles once instead of once per row.
    needles = [word.lower() for word in wordlist]

    def matches(text):
        # Missing values (NaN) and other non-string cells have no .lower();
        # treat them as non-matching rather than raising AttributeError.
        if not isinstance(text, str):
            return False
        haystack = text.lower()
        return all(needle in haystack for needle in needles)

    # astype(bool) keeps the mask boolean even when the frame is empty.
    mask = frame[column].apply(matches).astype(bool)
    return frame.loc[mask]
# Test Function
print(find_question(jeopardy, ['Chief', 'Justice']))


def _value_to_float(value):
    """Convert a raw 'value' cell such as '$1,200' to a float.

    Final Jeopardy rows carry no dollar amount and map to 0.0. Depending on
    the pandas version, read_csv may deliver that marker either as the
    literal string 'None' or as NaN, so both are handled.
    """
    if isinstance(value, str):
        if value == 'None':
            return 0.0
        return float(value.replace('$', '').replace(',', ''))
    if pd.isna(value):
        return 0.0
    # Already numeric: nothing to strip.
    return float(value)


# Create column float_value containing the cleaned numeric values
jeopardy['float_value'] = jeopardy.value.apply(_value_to_float)

# Test the conversion
print(jeopardy.float_value)

# Determine mean difficulty (dollar value) of questions containing a word
difficulty = find_question(jeopardy, ['King'])
print(difficulty.float_value.mean())

# Determine all unique answers to questions containing the search word, with counts
print(difficulty.answer.value_counts())
# Calculate the number of questions containing the word "computer" per decade

# Add a column holding the year: the first four characters of air_date
jeopardy['question_year'] = jeopardy.air_date.apply(lambda air_date: air_date[:4])

# Use find_question, then count matching questions per question_year
computer = find_question(jeopardy, ['Computer'])
computer_by_year = computer.groupby('question_year').show_number.count().reset_index()

# question_year holds four-character strings, so these bounds are compared
# lexicographically as strings.
years = computer_by_year.question_year

# Select rows where question_year is in the 90s
computer_90s = computer_by_year[(years < '2000') & (years > '1989')]

# Select rows where question_year is in the 2000s
computer_2000s = computer_by_year[(years < '2010') & (years > '1999')]

# Total the per-year counts for each decade and print to terminal
total_90s = computer_90s.show_number.sum()
total_2000s = computer_2000s.show_number.sum()
print(
    f'The number of questions featuring the word "computer" in the 1990s was {total_90s}'
    f' whereas the number of questions containing the word "computer" in the 2000s was {total_2000s}'
)
# Count how many times each category occurs in each round
category_round = jeopardy.groupby(['category', 'round']).show_number.count().reset_index()

# Reshape into a pivot table (one row per category, one column per round)
# to increase readability
category_round_pivot = category_round.pivot(
    index='category',
    columns='round',
    values='show_number',
).reset_index()

# Rename columns.
# NOTE(review): this positional rename assumes the pivot yields exactly three
# round columns, in the order double / final / single. A fourth round value
# in the data would make this assignment fail. Confirm against the csv.
category_round_pivot.columns = ['category', 'double', 'final', 'single']

# Display resulting pivot table
print(category_round_pivot)

# To find data on a specific category
literature = category_round_pivot[category_round_pivot.category == 'LITERATURE']
print(literature)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment