This is the code I produced for the “extra credit” section of the Codecademy Pro Beautiful Soup review. This exercise really helped me get to grips with the basics of data cleaning: not just getting the data into the right columns, but also ensuring it is the right type for future analysis.
# Scrape every turtle page linked from the Codecademy "Shellter" index, collect
# each turtle's stats into a pandas DataFrame, and clean the columns so that
# age and weight are numeric and ready for aggregation.
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

prefix = "https://s3.amazonaws.com/codecademy-content/courses/beautifulsoup/"

# Seconds to wait for each HTTP response; requests never times out by default,
# so a stalled connection would otherwise hang the script indefinitely.
REQUEST_TIMEOUT = 10

# Names for the stat fields, in the order they are listed on each turtle page.
STAT_COLUMNS = ["years_old", "weight_lbs", "gender", "breed", "source"]

# Text label that precedes the value in each purely textual stat column.
LABEL_PREFIXES = {"gender": "SEX: ", "breed": "BREED: ", "source": "SOURCE: "}


def fetch_soup(url):
    """Download *url* and return its content parsed with ``html.parser``.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        A ``BeautifulSoup`` document built from the response body.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``REQUEST_TIMEOUT``.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Fail here rather than silently parsing an error page as turtle data.
    response.raise_for_status()
    return BeautifulSoup(response.content, "html.parser")


soup = fetch_soup(urljoin(prefix, "shellter.html"))

# Go through all of the <a> tags and build the absolute URL for each one.
# href=True skips anchors that carry no href attribute (a["href"] would raise).
links = [urljoin(prefix, a["href"]) for a in soup.find_all("a", href=True)]

# Follow each link and map the turtle's name to its list of raw stat strings.
turtle_data = {}
for link in links:
    turtle = fetch_soup(link)
    turtle_name = turtle.select(".name")[0].get_text()
    stats = turtle.find("ul")
    # strip=True trims each text fragment and drops the whitespace-only ones,
    # so the newline fragments between list items never reach the DataFrame.
    turtle_data[turtle_name] = stats.get_text("|", strip=True).split("|")

# One row per turtle. Naming the columns here replaces the old
# "drop every even-numbered column, then rename the odd ones" steps.
turtle_df = pd.DataFrame.from_dict(turtle_data, orient="index", columns=STAT_COLUMNS)

# Provide a numerical index instead of the name; the name becomes a column.
turtle_df = turtle_df.rename_axis("name").reset_index()

# Remove the leading labels. The pattern is anchored with ^ so that only the
# label at the start of the value is removed, never matching text inside it.
for column, label in LABEL_PREFIXES.items():
    turtle_df[column] = turtle_df[column].replace(f"^{label}", "", regex=True)

# Age and weight benefit from being numerical. As in the original cleaning
# step, the number is taken to be the second space-separated token of the
# value (label first, number second) -- assumed page format, not validated.
for column in ("years_old", "weight_lbs"):
    turtle_df[column] = pd.to_numeric(turtle_df[column].str.split(" ").str[1])

# Aggregate functions can now be performed on age/weight!
print(turtle_df["years_old"].mean())