This is my solution to the Codecademy Pro project “This is Jeopardy!” which includes answers to the integral “further credit” questions. The aim is to examine data from the popular US game show “Jeopardy!” to identify trends – perhaps allowing a future contestant to revise topics more likely to benefit them at certain stages of the game!
import pandas as pd
pd.set_option('display.max_colwidth', -1)
#Load csv into Data Frame
jeopardy = pd.read_csv('jeopardy.csv')
#Investigate DataFrame Structure
print(jeopardy.head())
#Does not show full DataFrame; try different method
for col in jeopardy.columns:
print(col)
#Column names are not compatible as variable names, so reformat and rename them.
for col in jeopardy.columns:
colnew = col.strip().lower().replace(' ', '_')
jeopardy.rename(columns={col: colnew}, inplace= True)
# Get a feel for data held in each column
print(jeopardy[colnew])
# New column names are show_number(int), air_date(yyyy-mm-dd), round(str), category(str), value(str), question(str) and answer(str)
# Searching a dataset for a list of words; convert haystack and needle to lower case before performing search
def find_question(frame, wordlist):
search = lambda x: all(word.lower() in x.lower() for word in wordlist)
return frame.loc[frame['question'].apply(search)]
# Test Function
print(find_question(jeopardy, ['Chief', 'Justice']))
#Create column float_value containing formatted and floated values, replacing Final Jeopardy values ('None') with 0.0
jeopardy['float_value'] = jeopardy.value.apply(lambda value: float(value.replace('$', '').replace(',', '')) if value != 'None' else 0.0)
#Test lambda function
print(jeopardy.float_value)
#Determine mean difficulty of question containing a word
difficulty = find_question(jeopardy, ['King'])
print(difficulty.float_value.mean())
#Determine all unique answers to question containing search word and count
print(difficulty.answer.value_counts())
#Calculating number of questions containing word "computer" for each decade
#Add column to dataframe formatted to year
jeopardy['question_year'] = jeopardy.air_date.apply(lambda x: x[:4])
#Use find_question function and GROUP BY question_year
computer = find_question(jeopardy, ['Computer'])
computer_by_year = computer.groupby('question_year').show_number.count().reset_index()
#Select rows where question_year is in the 90s
computer_90s = computer_by_year[(computer_by_year.question_year < '2000') & (computer_by_year.question_year > '1989')]
#Select rows where question_year is in the 2000s
computer_2000s = computer_by_year[(computer_by_year.question_year < '2010') & (computer_by_year.question_year > '1999')]
#Calculate the total number of questions containing the search term by decade and print to terminal
print("The number of questions featuring the word \"computer\" in the 1990s was " + str(computer_90s.show_number.sum()) + " whereas the number of questions containing the word \"computer\" in the 2000s was " + str(computer_2000s.show_number.sum()))
#Display number of instances of category occuring in particular round
category_round = jeopardy.groupby(['category', 'round']).show_number.count().reset_index()
#Plot in a pivot table to increase readability
category_round_pivot = category_round.pivot(columns= 'round', index= 'category', values= 'show_number').reset_index()
#Rename columns
category_round_pivot.columns = ['category', 'double', 'final', 'single']
#Display resulting pivot table
print(category_round_pivot)
#To find data on specific category
literature = category_round_pivot[(category_round_pivot.category == 'LITERATURE')]
print(literature)