Skip to content

Instantly share code, notes, and snippets.

@pybokeh
Last active August 29, 2015 13:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pybokeh/8806446 to your computer and use it in GitHub Desktop.
Save pybokeh/8806446 to your computer and use it in GitHub Desktop.
CarComplaintsDotCom
{
"metadata": {
"name": ""
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"from bs4 import BeautifulSoup\n",
"from collections import OrderedDict\n",
"import time\n",
"import urllib.request as request\n",
"import re\n",
"\n",
"def getMakes():\n",
"    \"\"\"Return a list of all the makes available at carcomplaints.com.\n",
"\n",
"    Every li element inside a section element whose id contains 'makes'\n",
"    contributes the href of its link, with all forward slashes removed.\n",
"    \"\"\"\n",
"\n",
"    url = 'http://www.carcomplaints.com/'\n",
"    # Parse inside the with-block so the HTTP response is always closed\n",
"    with request.urlopen(url) as html:\n",
"        soup = BeautifulSoup(html)\n",
"\n",
"    make_list = []\n",
"    for section in soup.find_all('section', id=re.compile('makes')):\n",
"        for li in section.find_all('li'):\n",
"            make_list.append(li.a['href'].replace('/',''))\n",
"\n",
"    return make_list\n",
"\n",
"\n",
"def getYearCounts(make, model):\n",
"    \"\"\"Return a dict that maps each model year (int) to its complaint qty (int).\n",
"\n",
"    make, model -- path segments of the site URL, e.g. 'Tesla' and 'Model_S'\n",
"    \"\"\"\n",
"\n",
"    url = 'http://www.carcomplaints.com/'+make+'/'+model+'/'\n",
"    # Parse inside the with-block so the HTTP response is always closed\n",
"    with request.urlopen(url) as html:\n",
"        soup = BeautifulSoup(html)\n",
"\n",
"    # Anchor the pattern so only ids starting with 'bar' match; as a regex,\n",
"    # 'bar*' would match any id that merely contains 'ba'\n",
"    bars = soup.find_all('li', id=re.compile('^bar'))\n",
"\n",
"    year_counts_dict = {}\n",
"    for item in bars:\n",
"        year = int(item.find('span', class_='label').get_text())\n",
"        # Counts may contain thousands separators, e.g. '1,234'\n",
"        qty = int(item.find('span', class_='count').get_text().replace(',',''))\n",
"        year_counts_dict[year] = qty\n",
"\n",
"    return year_counts_dict\n",
"\n",
"\n",
"def getCountsByModel(make):\n",
"    \"\"\"Return the number of complaints for each model of a vehicle make.\n",
"\n",
"    make -- make name as used in the site URL, e.g. 'Honda', 'Acura', 'Ford'\n",
"\n",
"    Returns an OrderedDict (in page order) where the key is the model name\n",
"    with spaces replaced by underscores and the value is the qty of complaints.\n",
"    \"\"\"\n",
"\n",
"    url_make = 'http://www.carcomplaints.com/'+make+'/'\n",
"    # Parse inside the with-block so the HTTP response is always closed\n",
"    with request.urlopen(url_make) as html_make:\n",
"        soup = BeautifulSoup(html_make)\n",
"\n",
"    # The data is divided up in an arbitrary number of columns per page.\n",
"    # Anchor the pattern so only ids starting with 'c' match; as a regex,\n",
"    # 'c*' also matches the empty string and therefore every id.\n",
"    columns = soup.find_all('ul', class_='column bar', id=re.compile('^c'))\n",
"\n",
"    make_model_counts_dict = OrderedDict()\n",
"    for column in columns: # For each column of data...\n",
"        for row in column.find_all('li'):\n",
"            model = row.a.get_text().replace(' ','_')\n",
"            make_model_counts_dict[model] = int(row.span.get_text().replace(',',''))\n",
"\n",
"    return make_model_counts_dict\n",
"\n",
"\n",
"def getTopSystemsQty(make, model, year):\n",
"    \"\"\"Return an OrderedDict of system problems and their complaint qty.\n",
"\n",
"    make, model, year -- path segments of the site URL (year may be an int)\n",
"\n",
"    The key is the href of each bar's link without its last character, the\n",
"    value is the count as an int. Bars that lack a link, a span or a numeric\n",
"    count are skipped.\n",
"    \"\"\"\n",
"\n",
"    url = 'http://www.carcomplaints.com/'+make+'/'+model+'/'+str(year)+'/'\n",
"    # Parse inside the with-block so the HTTP response is always closed\n",
"    with request.urlopen(url) as html:\n",
"        soup = BeautifulSoup(html)\n",
"\n",
"    # Anchor the pattern so only ids starting with 'bar' match; as a regex,\n",
"    # 'bar*' would match any id that merely contains 'ba'\n",
"    bars = soup.find_all('li', id=re.compile('^bar'))\n",
"\n",
"    problem_counts_dict = OrderedDict() # We want to maintain insertion order\n",
"    for item in bars:\n",
"        try:\n",
"            system = item.a['href'][:-1]\n",
"            problem_counts_dict[system] = int(item.span.get_text().replace(',',''))\n",
"        except (AttributeError, KeyError, TypeError, ValueError):\n",
"            # Best effort: ignore bars with a missing link/span or a\n",
"            # non-numeric count instead of hiding every possible error\n",
"            continue\n",
"\n",
"    return problem_counts_dict\n",
"\n",
"\n",
"def getNhtsaSystemsQty(make, model, year):\n",
"    \"\"\"Return an OrderedDict containing the qty of NHTSA complaints by system.\n",
"\n",
"    make, model, year -- path segments of the site URL (year may be an int)\n",
"\n",
"    The systems (href of each bar's link without its last character) are\n",
"    paired with the NHTSA counts by position; if the two lists differ in\n",
"    length, the surplus entries of the longer one are dropped by zip().\n",
"    \"\"\"\n",
"\n",
"    url = 'http://www.carcomplaints.com/'+make+'/'+model+'/'+str(year)+'/'\n",
"    # Parse inside the with-block so the HTTP response is always closed\n",
"    with request.urlopen(url) as html:\n",
"        soup = BeautifulSoup(html)\n",
"\n",
"    nhtsa_counts = []\n",
"    for item in soup.find_all('em', class_='nhtsa'):\n",
"        tokens = item.span.get_text().split()\n",
"        try:\n",
"            # There are 3 string tokens separated by whitespace, the 3rd one is the qty\n",
"            nhtsa_counts.append(int(tokens[2]))\n",
"        except (IndexError, ValueError):\n",
"            # Unfortunately, some only have 2 tokens\n",
"            nhtsa_counts.append(int(tokens[1]))\n",
"\n",
"    # Anchor the pattern so only ids starting with 'bar' match; as a regex,\n",
"    # 'bar*' would match any id that merely contains 'ba'\n",
"    systems = soup.find_all('li', id=re.compile('^bar'))\n",
"\n",
"    # [:-1] removes the ending forward slash of the href\n",
"    systems_list = [item.a['href'][:-1] for item in systems]\n",
"\n",
"    return OrderedDict(zip(systems_list, nhtsa_counts))\n",
"\n",
"\n",
"def getSubSystemsQty(make, model, year, system):\n",
"    \"\"\"Return an OrderedDict of the # of complaints by sub-system.\n",
"\n",
"    make, model, year, system -- path segments of the site URL\n",
"    (year may be an int)\n",
"\n",
"    The key is the part of each bar's link href before the first '.', the\n",
"    value is the count as an int.\n",
"    \"\"\"\n",
"\n",
"    url = 'http://www.carcomplaints.com/'+make+'/'+model+'/'+str(year)+'/'+system+'/'\n",
"    # Parse inside the with-block so the HTTP response is always closed\n",
"    with request.urlopen(url) as html:\n",
"        soup = BeautifulSoup(html)\n",
"\n",
"    # Anchor the pattern so only ids starting with 'bar' match; as a regex,\n",
"    # 'bar*' would match any id that merely contains 'ba'\n",
"    bars = soup.find_all('li', id=re.compile('^bar'))\n",
"\n",
"    subsystem_counts_dict = OrderedDict() # We want to maintain insertion order\n",
"    for item in bars:\n",
"        subsystem = item.a['href'].split('.')[0]\n",
"        subsystem_counts_dict[subsystem] = int(item.span.get_text().replace(',',''))\n",
"\n",
"    return subsystem_counts_dict\n",
"\n",
"\n",
"def getReviews(make, model, year, system, subsystem):\n",
"    \"\"\"Return a list of the customer review texts for one sub-system problem.\n",
"\n",
"    make, model, year, system, subsystem -- path segments of the site URL\n",
"    (year may be an int)\n",
"\n",
"    NOTE: If the page subtitle contains 'Page 1', the reviews are spread out\n",
"    over multiple pages and every following page is read as well. A page that\n",
"    cannot be parsed is reported and skipped as a whole, so the list may not\n",
"    contain all reviews. Progress messages are printed to stdout.\n",
"    \"\"\"\n",
"\n",
"    base_url = 'http://www.carcomplaints.com/'+make+'/'+model+'/'+str(year)+'/'+system+'/'+subsystem\n",
"    # Parse inside the with-block so the HTTP response is always closed\n",
"    with request.urlopen(base_url+'.shtml') as html:\n",
"        soup = BeautifulSoup(html)\n",
"\n",
"    complaints = [review.p.get_text() for review in soup.find_all('div', itemprop='reviewBody')]\n",
"\n",
"    ##### Read the first page, now check if there are 2 or more pages #####\n",
"    # Get the subtitle so we can then figure out if there are multiple pages\n",
"    page_count_text = soup.find('div', id='subtitle').span.get_text()\n",
"\n",
"    if 'Page 1' not in page_count_text:\n",
"        print(\"There was only 1 page to parse. Retrieval of review text completed.\")\n",
"        return complaints\n",
"\n",
"    # 'Page 1 of' exists, so there is more than one page: the 4th token holds\n",
"    # the total number of pages followed by a closing parenthesis\n",
"    num_pages = int(page_count_text.split()[3].replace(\")\",\"\"))\n",
"    print(\"Page 1 of\",num_pages,\"parsed\")\n",
"\n",
"    for page in range(2,num_pages+1):\n",
"        with request.urlopen(base_url+'-'+str(page)+'.shtml') as html:\n",
"            try: # Thru testing, found page(s) that BeautifulSoup could not parse due to bad markup syntax\n",
"                soup = BeautifulSoup(html)\n",
"                # Collect into a temporary list so a page that fails midway\n",
"                # contributes nothing, as the message below states\n",
"                page_complaints = [review.p.get_text() for review in soup.find_all('div', itemprop='reviewBody')]\n",
"            except Exception:\n",
"                # Exception (not a bare except) keeps Ctrl-C working during long runs\n",
"                print(\"Page\", page,\"has severely bad markup!\",\"No data from this page was parsed.\")\n",
"            else:\n",
"                complaints.extend(page_complaints)\n",
"                print(\"Page\",page,\"of\",num_pages,\"parsed\")\n",
"        # Delay after every request, successful or not, to prevent Connection Refused error\n",
"        time.sleep(5)\n",
"\n",
"    print(\"Retrieval of review text completed\")\n",
"    return complaints"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 57
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"getMakes()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 62,
"text": [
"['Acura',\n",
" 'Audi',\n",
" 'BMW',\n",
" 'Buick',\n",
" 'Cadillac',\n",
" 'Chevrolet',\n",
" 'Chrysler',\n",
" 'Dodge',\n",
" 'Ford',\n",
" 'GMC',\n",
" 'Honda',\n",
" 'Hyundai',\n",
" 'Infiniti',\n",
" 'Isuzu',\n",
" 'Jeep',\n",
" 'Kia',\n",
" 'Lexus',\n",
" 'Lincoln',\n",
" 'Mazda',\n",
" 'Mercedes-Benz',\n",
" 'Mercury',\n",
" 'Mini',\n",
" 'Mitsubishi',\n",
" 'Nissan',\n",
" 'Oldsmobile',\n",
" 'Plymouth',\n",
" 'Pontiac',\n",
" 'Porsche',\n",
" 'Ram',\n",
" 'Saab',\n",
" 'Saturn',\n",
" 'Scion',\n",
" 'Subaru',\n",
" 'Toyota',\n",
" 'Volvo',\n",
" 'Volkswagen',\n",
" 'Alfa_Romeo',\n",
" 'AMC',\n",
" 'Bentley',\n",
" 'Chery',\n",
" 'Daewoo',\n",
" 'Datsun',\n",
" 'Eagle',\n",
" 'Ferrari',\n",
" 'Fiat',\n",
" 'Geo',\n",
" 'Holden',\n",
" 'HSV',\n",
" 'Hummer',\n",
" 'Jaguar',\n",
" 'Kenworth',\n",
" 'Lamborghini',\n",
" 'Land_Rover',\n",
" 'Mahindra',\n",
" 'Maruti',\n",
" 'Opel',\n",
" 'Peugeot',\n",
" 'Renault',\n",
" 'Rover',\n",
" 'Seat',\n",
" 'Skoda',\n",
" 'Suzuki',\n",
" 'Tata',\n",
" 'Tesla',\n",
" 'Vauxhall',\n",
" 'Zimmer']"
]
}
],
"prompt_number": 62
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"getCountsByModel('Tesla')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 58,
"text": [
"OrderedDict([('Model_S', 1)])"
]
}
],
"prompt_number": 58
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"getYearCounts('Tesla', 'Model_S')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 61,
"text": [
"{2013: 1, 2014: 0}"
]
}
],
"prompt_number": 61
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"getTopSystemsQty('Tesla', 'Model_S', 2013)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 55,
"text": [
"OrderedDict([('accessories-exterior', 1)])"
]
}
],
"prompt_number": 55
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"getSubSystemsQty('Tesla', 'Model_S', 2013, 'accessories-exterior')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 59,
"text": [
"OrderedDict([('door_handles_hot', 1)])"
]
}
],
"prompt_number": 59
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"getReviews('Tesla', 'Model_S', 2013, 'accessories-exterior','door_handles_hot')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"There was only 1 page to parse. Retrieval of review text completed.\n"
]
},
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 60,
"text": [
"[\"I HAVE PURCHASED THE TESLA S IN LATE 2013 AND EVER SINCE, EVERY TIME I TRY TO GET INTO THE CAR I ALMOST BURN MY HAND DUE TO DOOR HANDLE BEING SO HOT UNDER THE SUN OR BY OUTSIDE TEMPERATURE. NOW WE ARE IN NOVEMBER AND THE CAR GIVES UNPLESANT HOT SHOCK TO MY HAND EVERY TIME AND I CAN'T IMAGINE HOW HARD IT WILL BE TO GET INTO THE CAR WHEN REAL HOT DAYS COME NEXT SUMMER. TESLA SHOULD HAVE THOUGHT OF DRIVER SAFETY A LITTLE FURTHER BY TREATING ITS METAL DOOR HANDLES SO THAT IT IS SAFER TO GRAB THE DOOR HANDLE BEFORE GRABBING THE STEERING WHEEL. I DON'T THINK ANYONE CAN DRIVE SAFELY WITH BURNT HAND.\"]"
]
}
],
"prompt_number": 60
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment