Skip to content

Instantly share code, notes, and snippets.

@ashtom84
Created March 24, 2016 13:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ashtom84/ff18f571e19955f08dc0 to your computer and use it in GitHub Desktop.
Save ashtom84/ff18f571e19955f08dc0 to your computer and use it in GitHub Desktop.
import time
# Start the wall-clock timer immediately (before the heavier imports below)
# so the final "--- %s seconds ---" report covers the whole run.
start_time = time.time()
import math
import re
from bs4 import BeautifulSoup  # HTML parsing of the scraped pages
import requests                # HTTP client for fetching search/detail pages
import unicodedata             # used by conv_var() for NFKD normalization
import numpy as np
import pandas as pd            # used by the (commented-out) CSV export at the end
# Convert unicode to string
def conv_var(var):
    """Normalize a unicode string down to a plain ASCII byte string.

    Accented characters are decomposed (NFKD) and any remaining non-ASCII
    code points are dropped.  Values that are not text (ints, None, Py2
    byte strings, ...) are returned unchanged.
    """
    # Resolve the text type once so the helper behaves the same on
    # Python 2 (`unicode`) and Python 3 (`str`); the original's
    # `type(var) == unicode` raised NameError on Python 3 and also
    # missed unicode subclasses.
    try:
        text_type = unicode  # Python 2
    except NameError:
        text_type = str      # Python 3
    if isinstance(var, text_type):
        var = unicodedata.normalize('NFKD', var).encode('ascii', 'ignore')
    return var
# --- Scrape configuration -------------------------------------------------
# Root of the site and the search path for the first results page.
site_path = "https://www.cyclecrunch.com"
search_path = "/Search/All_Makes/All_Models/Motorcycles"
page = 1        # current results-page counter
nbpage = 2501   # stop before this page number

# --- Per-listing accumulator lists, filled while crawling ------------------
# NOTE(review): the original initialized `loc` twice; kept once here.
yr, pr, Id, name, loc, col, listed, model, description = [], [], [], [], [], [], [], [], []
condit, mile, ttl, city, state, make, pr2 = [], [], [], [], [], [], []

# Result dictionary: one key per scraped field, each holding a parallel list
# (all keys are reassigned to the accumulator lists as pages are crawled).
dic = {"year" : [], "price" : [], "make" : [], "name" : [], "color" : [], "condition" : [],
       "km" : [], "city" : [], "state" : [], "Id" : [], "model" : [], "listed" : [],
       "price2" : [], "description" : []}

# Path of the page currently being scraped (updated from the "next" link).
current_path = search_path
### while loop to sift through the pages
# Crawl the paginated search results and every listing's detail page.
# NOTE(review): network-bound scraper; tag/attribute names assume the 2016
# CycleCrunch markup and cannot be verified offline.

def _dd_text(section, label):
    # Return the string of the <dd> following the <dt> whose text equals
    # `label` inside `section`, or None when either tag is missing.
    dt = section.find("dt", text=label)
    if dt is None:
        return None
    dd = dt.find_next_sibling("dd")
    if dd is None:
        return None
    return dd.string

while page < nbpage:
    # Parse the current search-results page.  "html.parser" makes the
    # parser choice explicit instead of depending on what bs4 finds.
    outpg = BeautifulSoup(requests.get(site_path + current_path).text, "html.parser")

    # Collect every listing <div> inside the results wrapper.
    opg2 = outpg.find("div", class_ = "search-result-wrapper")
    opg = [opg2.find("div")]
    while opg[-1].find_next_sibling("div") is not None:
        opg.append(opg[-1].find_next_sibling("div"))

    # --- Grid-level fields: year, price, id, name, location, title --------
    for x in opg:
        tempyr = x.get("data-year")
        yr.append(tempyr if tempyr is not None else "NaN")

        temppr = x.get("data-price")
        pr.append(temppr if temppr is not None else "NaN")

        # TODO(review): the original read "data-year" here too; "data-id"
        # looks intended but cannot be confirmed offline, so the original
        # behavior is kept.
        tempId = x.get("data-year")
        Id.append(tempId if tempId is not None else "NaN")

        tempname = x.div.find("div", {"itemprop" : "name"}).string
        if tempname is not None:
            name.append(conv_var(re.sub('\xae', '', tempname)))  # drop (R) sign
        else:
            name.append("NaN")

        # BUG FIX: the original called str() before the None check, so the
        # check could never fire and missing locations became "None".
        temploc = x.div.find("div", {"class" : "location"}).string
        loc.append(str(temploc) if temploc is not None else "NaN")

        tempttl = conv_var(x.div.find("div", {"class" : "ttl"}).get("title"))
        ttl.append(tempttl if tempttl is not None else "NaN")

    # --- Absolute URL of each listing's detail page -----------------------
    ip = outpg.find_all("div", {"class" : "search-result-wrapper"})
    url_current = []
    for child in ip[0].contents:
        # conv_var() performs the unicode -> ascii normalization that the
        # original duplicated inline in two identical if/else branches.
        url_current.append(site_path + conv_var(child.a.get("href")))

    # --- Detail-page fields -----------------------------------------------
    for url in url_current:
        inpage = BeautifulSoup(requests.get(url).text, "html.parser")
        wd = inpage.find("section", class_ = "widget detail")
        bw = inpage.find("div", class_ = "bike-wrapper")
        ta = inpage.find("article", class_ = "text-area")

        if wd is not None:
            # BUG FIX: the original appended `temppr` — the grid price of
            # the LAST grid listing — instead of this page's own price.
            temppr2 = _dd_text(wd, "Price:")
            pr2.append(conv_var(temppr2) if temppr2 else "NaN")

            # BUG FIX: the original crashed with AttributeError when the
            # "Color:" <dt> had no <dd> sibling.
            tempcol = _dd_text(wd, "Color:")
            col.append(conv_var(tempcol.replace("/", '.')) if tempcol else "NaN")

            tempcondit = _dd_text(wd, "Condition:")
            condit.append(str(tempcondit) if tempcondit is not None else "NaN")

            tempmile = _dd_text(wd, "Mileage:")
            mile.append(str(tempmile) if tempmile is not None else "NaN")

            # "City, State": split once; guard against a missing comma,
            # which made the original raise IndexError on the state part.
            temploc2 = _dd_text(wd, "Location:")
            if temploc2 is not None:
                parts = str(temploc2).split(",")
                city.append(parts[0])
                state.append(parts[1] if len(parts) > 1 else "NaN")
            else:
                city.append("NaN")
                state.append("NaN")

            # NOTE(review): trailing space in "Listed: " kept verbatim — it
            # may be what the site's markup contains; confirm before fixing.
            templisted = _dd_text(wd, "Listed: ")
            listed.append(conv_var(templisted.replace("/", '.')) if templisted else "NaN")
        else:
            # BUG FIX: the original forgot pr2 here, letting the parallel
            # lists drift out of alignment whenever a detail widget was
            # missing.
            pr2.append("NaN")
            col.append("NaN")
            condit.append("NaN")
            mile.append("NaN")
            city.append("NaN")
            state.append("NaN")
            listed.append("NaN")

        # Make of the bike.
        if bw is not None and bw.get("data-make") is not None:
            make.append(conv_var(bw.get("data-make")))
        else:
            make.append("NaN")

        # Free-text description.
        if ta is not None and ta.p is not None and ta.p.string:
            description.append(conv_var(ta.p.string.replace("/", '.')))
        else:
            description.append("NaN")

        # Model is the second-to-last URL path segment.
        model.append(url.split("/")[-2])

    # --- Mileage strings ("12,345" / "NaN") -> integer kilometers ---------
    # Hoisted out of the per-URL loop: the original rebuilt the entire km
    # list for every listing (O(n^2) overall) with the same final result.
    km = []
    for m in mile:
        digits = m.replace(",", "")  # same as joining the split(",") parts
        try:
            value = float(digits)
        except ValueError:
            # Non-numeric text (e.g. "None") crashed the original float().
            km.append(np.nan)
            continue
        if math.isnan(value):
            km.append(np.nan)
        else:
            km.append(int(value * 1.60934))  # miles -> kilometers

    # Refresh the output dictionary with everything scraped so far.
    dic["year"] = yr
    dic["price"] = pr
    dic["price2"] = pr2
    dic["make"] = make
    dic["name"] = name
    dic["color"] = col
    dic["condition"] = condit
    dic["km"] = km
    dic["city"] = city
    dic["state"] = state
    dic["Id"] = Id
    dic["listed"] = listed
    dic["model"] = model
    dic["description"] = description

    # url for the next page: 7th child of the pager <span class="pgs">.
    next_page = outpg.find_all("span", {"class" : "pgs"})[0].contents[6].get("href")
    current_path = next_page
    print(page)  # progress indicator; parenthesized form works on Py2 and Py3
    page += 1

#bike = pd.DataFrame(dic)
#bike.to_csv('bike400.csv')
print("--- %s seconds ---" % (time.time() - start_time))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment