Data Cleaning a Beautiful Soup Scrape in Python

Date: May 19, 2020 | Author: MrN00b0t | 0 Comments

This is the code I produced for the “extra credit” section of the Codecademy Pro Beautiful Soup review. This exercise really helped me get to grips with the basics of data cleaning: not just getting the data into the right columns, but also ensuring each column has the right type for future analysis.

import requests
from bs4 import BeautifulSoup
import pandas as pd

# Base URL shared by the index page and by every page it links to.
prefix = "https://s3.amazonaws.com/codecademy-content/courses/beautifulsoup/"

# Fetch the index page. Its URL is built from `prefix` so the base address is
# written in one place only. The timeout stops the script hanging forever on
# a stalled connection, and raise_for_status() fails fast on an HTTP error
# instead of parsing an error page as if it were the index.
webpage_response = requests.get(prefix + "shellter.html", timeout=30)
webpage_response.raise_for_status()

webpage = webpage_response.content
soup = BeautifulSoup(webpage, "html.parser")

# Keep only the <a> tags that actually carry an href; indexing a["href"] on
# an anchor without one would raise KeyError.
turtle_links = soup.find_all("a", href=True)
# Go through all of the a tags and get the links associated with them.
# Each href is appended to `prefix`, i.e. it is assumed to be relative to it.
links = [prefix + a["href"] for a in turtle_links]
    
# Define turtle_data: maps each turtle's name to the list of text fragments
# taken from the first <ul> on that turtle's page.
turtle_data = {}

# Follow each link:
for link in links:
    # The timeout keeps one stalled request from blocking the whole scrape.
    webpage = requests.get(link, timeout=30)
    # Stop on an HTTP error here; otherwise an error page would only show up
    # as an unrelated IndexError on the ".name" lookup below.
    webpage.raise_for_status()
    turtle = BeautifulSoup(webpage.content, "html.parser")
    turtle_name = turtle.select(".name")[0].get_text()

    stats = turtle.find("ul")
    # Join the list's text nodes with "|" so they can be split back into
    # individual fragments (whitespace-only nodes between items are kept).
    stats_text = stats.get_text("|")
    turtle_data[turtle_name] = stats_text.split("|")

# One row per turtle (the dict keys become the index) and one positional
# column per scraped text fragment.
turtle_df = pd.DataFrame.from_dict(turtle_data, orient='index')
# All newlines are even columns, drop those. The labels are taken from the
# frame itself rather than a hard-coded range(0, 12, 2), so the drop neither
# raises KeyError when a page yields fewer fragments nor leaves separator
# columns behind when it yields more.
drop_columns = list(turtle_df.columns[::2])
turtle_df = turtle_df.drop(drop_columns, axis=1)
# Provide numerical index instead of name (the names move to an 'index' column):
turtle_df = turtle_df.reset_index()

# Give appropriate names to remaining columns:
turtle_df = turtle_df.rename(columns={'index': 'name', 1: 'years_old', 3: 'weight_lbs', 5: 'gender', 7: 'breed', 9: 'source'})

# Strip the leading label from each descriptive column so only the value
# itself remains.
for column, label in (('source', 'SOURCE: '), ('breed', 'BREED: '), ('gender', 'SEX: ')):
    turtle_df[column] = turtle_df[column].replace(label, '', regex=True)

# Age and weight are more useful as numbers: split each entry on spaces,
# keep the second token (the one following the label) and convert it to a
# numeric dtype.
for column in ('weight_lbs', 'years_old'):
    tokens = turtle_df[column].str.split(' ', expand=True)
    turtle_df[column] = pd.to_numeric(tokens[1])

# Aggregate functions can now be performed on age/weight!
average_age = turtle_df['years_old'].mean()
print(average_age)

Leave a comment Cancel reply