# MrN00b0t/script.py

## script.py
import pandas as pd
# Show full cell contents when printing. None means "no limit"; the old -1
# sentinel was deprecated in pandas 1.0 and is rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)

# Load the CSV into a DataFrame
jeopardy = pd.read_csv('jeopardy.csv')

# Investigate the DataFrame structure
print(jeopardy.head())
# head() does not show the full DataFrame, so list the column names as well

for col in jeopardy.columns:
  print(col)
#Column names are not compatible as variable names, so reformat and rename them.

# Normalise every header in a single pass: trim surrounding whitespace,
# lower-case, and swap spaces for underscores so that each name is usable
# with attribute access (e.g. jeopardy.air_date).
jeopardy.rename(columns=lambda name: name.strip().lower().replace(' ', '_'), inplace=True)

# Get a feel for the data held in each column. Printing inside a loop shows
# every column rather than only whichever one a loop variable last pointed at.
for col in jeopardy.columns:
  print(jeopardy[col])

# New column names are show_number(int), air_date(yyyy-mm-dd), round(str), category(str), value(str), question(str) and answer(str)

# Searching a dataset for a list of words; convert haystack and needle to lower case before performing search
def find_question(frame, wordlist):
  """Return the rows of ``frame`` whose question contains every search word.

  Matching is a case-insensitive substring test, so 'King' also matches
  'viking'. An empty ``wordlist`` matches every row.

  Args:
    frame: DataFrame with a 'question' column.
    wordlist: iterable of strings that must all appear in the question.

  Returns:
    The matching rows of ``frame``, selected with ``.loc``.
  """
  # Lower-case the search words once instead of once per row. Building a list
  # also keeps a generator argument from being exhausted by the first row.
  needles = [word.lower() for word in wordlist]

  def search(text):
    # A missing question is not a string (NaN/None) and has no .lower();
    # it can only match when there is nothing to search for.
    if not isinstance(text, str):
      return not needles
    haystack = text.lower()
    return all(needle in haystack for needle in needles)

  return frame.loc[frame['question'].apply(search)]

# Test the search function
print(find_question(jeopardy, ['Chief', 'Justice']))


def _value_to_float(value):
  """Convert a prize string such as '$1,200' to 1200.0; no prize becomes 0.0."""
  # Final Jeopardy rows carry 'None'. pandas 2.x read_csv treats that literal
  # as a missing value by default, so the cell may arrive as NaN instead of
  # the string; handle both spellings.
  if pd.isna(value) or value == 'None':
    return 0.0
  return float(value.replace('$', '').replace(',', ''))


# Create column float_value containing the prize as a float, replacing
# Final Jeopardy values ('None' / missing) with 0.0
jeopardy['float_value'] = jeopardy.value.apply(_value_to_float)
# Test the conversion
print(jeopardy.float_value)

# Determine the mean difficulty (prize value) of questions containing a word
difficulty = find_question(jeopardy, ['King'])
print(difficulty.float_value.mean())
# Count how often each unique answer occurs among those questions
print(difficulty.answer.value_counts())

# Count the questions containing the word "computer" in the 1990s and 2000s.
# Keep the first four characters of each air date as the question's year.
jeopardy['question_year'] = jeopardy['air_date'].map(lambda date: date[:4])
# Filter to the matching questions, then count them per year
computer = find_question(jeopardy, ['Computer'])
computer_by_year = computer.groupby('question_year')['show_number'].count().reset_index()
# Years are compared as strings; keep those strictly inside each decade's bounds
years = computer_by_year['question_year']
in_90s = (years < '2000') & (years > '1989')
in_2000s = (years < '2010') & (years > '1999')
computer_90s = computer_by_year[in_90s]
computer_2000s = computer_by_year[in_2000s]
# Total the per-year counts for each decade and print the comparison
total_90s = computer_90s['show_number'].sum()
total_2000s = computer_2000s['show_number'].sum()
print(
  "The number of questions featuring the word \"computer\" in the 1990s was "
  + str(total_90s)
  + " whereas the number of questions containing the word \"computer\" in the 2000s was "
  + str(total_2000s)
)

# Count how often each category occurs in each round
category_round = jeopardy.groupby(['category', 'round'])['show_number'].count().reset_index()
# Reshape to one row per category and one column per round for readability
category_round_pivot = category_round.pivot(index='category', columns='round', values='show_number')
category_round_pivot = category_round_pivot.reset_index()
# Shorten the column headers.
# NOTE(review): this rename is positional, so it only succeeds when the data
# holds exactly three distinct rounds, and presumably relies on them sorting
# as double, final, single -- confirm against the CSV.
category_round_pivot.columns = ['category', 'double', 'final', 'single']
# Display the resulting pivot table
print(category_round_pivot)
# Pull out the row for one specific category
is_literature = category_round_pivot['category'] == 'LITERATURE'
literature = category_round_pivot[is_literature]
print(literature)
	import pandas as pd
	# Show full cell contents when printing. None means "no limit"; the old -1
	# sentinel was deprecated in pandas 1.0 and is rejected by pandas 2.x.
	pd.set_option('display.max_colwidth', None)

	# Load the CSV into a DataFrame
	jeopardy = pd.read_csv('jeopardy.csv')

	# Investigate the DataFrame structure
	print(jeopardy.head())
	# head() does not show the full DataFrame, so list the column names as well

	for col in jeopardy.columns:
		print(col)
	#Column names are not compatible as variable names, so reformat and rename them.

	# Normalise every header in a single pass: trim surrounding whitespace,
	# lower-case, and swap spaces for underscores so that each name is usable
	# with attribute access (e.g. jeopardy.air_date).
	jeopardy.rename(columns=lambda name: name.strip().lower().replace(' ', '_'), inplace=True)

	# Get a feel for the data held in each column. Printing inside a loop shows
	# every column rather than only whichever one a loop variable last pointed at.
	for col in jeopardy.columns:
		print(jeopardy[col])

	# New column names are show_number(int), air_date(yyyy-mm-dd), round(str), category(str), value(str), question(str) and answer(str)

	# Searching a dataset for a list of words; convert haystack and needle to lower case before performing search
	def find_question(frame, wordlist):
		"""Return the rows of ``frame`` whose question contains every search word.

		Matching is a case-insensitive substring test, so 'King' also matches
		'viking'. An empty ``wordlist`` matches every row.

		Args:
			frame: DataFrame with a 'question' column.
			wordlist: iterable of strings that must all appear in the question.

		Returns:
			The matching rows of ``frame``, selected with ``.loc``.
		"""
		# Lower-case the search words once instead of once per row. Building a list
		# also keeps a generator argument from being exhausted by the first row.
		needles = [word.lower() for word in wordlist]

		def search(text):
			# A missing question is not a string (NaN/None) and has no .lower();
			# it can only match when there is nothing to search for.
			if not isinstance(text, str):
				return not needles
			haystack = text.lower()
			return all(needle in haystack for needle in needles)

		return frame.loc[frame['question'].apply(search)]

	# Test the search function
	print(find_question(jeopardy, ['Chief', 'Justice']))


	def _value_to_float(value):
		"""Convert a prize string such as '$1,200' to 1200.0; no prize becomes 0.0."""
		# Final Jeopardy rows carry 'None'. pandas 2.x read_csv treats that literal
		# as a missing value by default, so the cell may arrive as NaN instead of
		# the string; handle both spellings.
		if pd.isna(value) or value == 'None':
			return 0.0
		return float(value.replace('$', '').replace(',', ''))


	# Create column float_value containing the prize as a float, replacing
	# Final Jeopardy values ('None' / missing) with 0.0
	jeopardy['float_value'] = jeopardy.value.apply(_value_to_float)
	# Test the conversion
	print(jeopardy.float_value)

	# Determine the mean difficulty (prize value) of questions containing a word
	difficulty = find_question(jeopardy, ['King'])
	print(difficulty.float_value.mean())
	# Count how often each unique answer occurs among those questions
	print(difficulty.answer.value_counts())

	# Count the questions containing the word "computer" in the 1990s and 2000s.
	# Keep the first four characters of each air date as the question's year.
	jeopardy['question_year'] = jeopardy['air_date'].map(lambda date: date[:4])
	# Filter to the matching questions, then count them per year
	computer = find_question(jeopardy, ['Computer'])
	computer_by_year = computer.groupby('question_year')['show_number'].count().reset_index()
	# Years are compared as strings; keep those strictly inside each decade's bounds
	years = computer_by_year['question_year']
	in_90s = (years < '2000') & (years > '1989')
	in_2000s = (years < '2010') & (years > '1999')
	computer_90s = computer_by_year[in_90s]
	computer_2000s = computer_by_year[in_2000s]
	# Total the per-year counts for each decade and print the comparison
	total_90s = computer_90s['show_number'].sum()
	total_2000s = computer_2000s['show_number'].sum()
	print(
		"The number of questions featuring the word \"computer\" in the 1990s was "
		+ str(total_90s)
		+ " whereas the number of questions containing the word \"computer\" in the 2000s was "
		+ str(total_2000s)
	)

	# Count how often each category occurs in each round
	category_round = jeopardy.groupby(['category', 'round'])['show_number'].count().reset_index()
	# Reshape to one row per category and one column per round for readability
	category_round_pivot = category_round.pivot(index='category', columns='round', values='show_number')
	category_round_pivot = category_round_pivot.reset_index()
	# Shorten the column headers.
	# NOTE(review): this rename is positional, so it only succeeds when the data
	# holds exactly three distinct rounds, and presumably relies on them sorting
	# as double, final, single -- confirm against the CSV.
	category_round_pivot.columns = ['category', 'double', 'final', 'single']
	# Display the resulting pivot table
	print(category_round_pivot)
	# Pull out the row for one specific category
	is_literature = category_round_pivot['category'] == 'LITERATURE'
	literature = category_round_pivot[is_literature]
	print(literature)