-
-
Save ashtom84/ff18f571e19955f08dc0 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# --- timing + imports (Python 2 script: see bare `unicode` and `print page` below) ---
import time
start_time = time.time()  # wall-clock start; elapsed time is printed at the end of the run
import math
import re
from bs4 import BeautifulSoup
import requests
import unicodedata
import numpy as np
import pandas as pd
# Convert unicode to string
def conv_var(var):
    """Return *var* as a plain ASCII byte string if it is unicode text.

    Python 2 only: ``unicode`` is the builtin text type. Accented characters
    are decomposed via NFKD normalization, then any non-ASCII code points are
    dropped by ``encode('ascii', 'ignore')``. Any non-unicode value (already a
    ``str``, ``None``, numbers, ...) passes through unchanged.
    """
    # isinstance() is the idiomatic type test (was: type(var) == unicode)
    if isinstance(var, unicode):
        var = unicodedata.normalize('NFKD', var).encode('ascii', 'ignore')
    return var
# --- Initialization of the website search path ---
site_path = "https://www.cyclecrunch.com"
search_path = "/Search/All_Makes/All_Models/Motorcycles"
page = 1       # first results page to scrape
nbpage = 2501  # loop runs while page < nbpage, i.e. pages 1..2500

# Per-field accumulators; each listing appends exactly one value to each.
# NOTE(review): the original initialized `loc` twice (once in each tuple
# assignment); the redundant second initialization is dropped here.
yr, pr, Id, name, loc, col, listed, model, description = [], [], [], [], [], [], [], [], []
condit, mile, ttl, city, state, make, pr2 = [], [], [], [], [], [], []

# Result dictionary: every key is rebound to its accumulator list after each
# page is scraped, so the empty lists here only document the schema.
dic = {"year" : [], "price" : [], "make" : [], "name" : [], "color" : [], "condition" : [],
       "km" : [], "city" : [], "state" : [], "Id" : [], "model" : [], "listed" : [],
       "price2" : [], "description" : []}

current_path = search_path  # path of the page about to be fetched
### while loop to sift through the pages
# sift through the first pages of the website
#
# NOTE(review): all leading indentation was lost in this paste; the structure
# below is reconstructed from the code's syntax — verify against the original
# before running.
while page < nbpage:
    # output for the first page
    # (no explicit parser passed to BeautifulSoup — relies on bs4's default)
    outpg = BeautifulSoup(requests.get(site_path + current_path).text)
    # NOTE(review): `res` is never used afterwards — dead variable.
    res = ["bike bikeresult-grid feat", "bike bikeresult-grid feat hglt"]
    # details contained in the current page: collect the first result <div>
    # and then every following sibling <div> under the results wrapper
    opg2 = outpg.find("div", class_ = "search-result-wrapper")
    opg = []
    opg.append(opg2.find("div"))
    while opg[-1].find_next_sibling("div") != None:
        opg.append(opg[-1].find_next_sibling("div"))
    # --- listing-level attributes scraped from each result <div> ---
    for x in opg:
        tempyr = x.get("data-year")
        if tempyr != None: yr.append(tempyr)
        else: yr.append("NaN")
        temppr = x.get("data-price")
        if temppr != None: pr.append(temppr)
        else: pr.append("NaN")
        # NOTE(review): Id is read from "data-year", so Id duplicates yr —
        # almost certainly meant to read a "data-id"-style attribute.
        tempId = x.get("data-year")
        if tempId != None: Id.append(tempId)
        else: Id.append("NaN")
        tempname = x.div.find("div", {"itemprop" : "name"}).string
        if tempname != None:
            # strip the registered-trademark char (\xae) before ASCII-folding
            name.append(conv_var(re.sub('\xae', '', tempname)))
        else:
            name.append("NaN")
        # NOTE(review): str() is applied BEFORE the None test, so a missing
        # location becomes the string "None" and the "NaN" branch is dead.
        temploc = str(x.div.find("div", {"class" : "location"}).string)
        if temploc != None:
            loc.append(temploc)
        else:
            loc.append("NaN")
        tempttl = conv_var(x.div.find("div", {"class" : "ttl"}).get("title"))
        if tempttl != None:
            ttl.append(tempttl)
        else:
            ttl.append("NaN")
    # --- build the detail-page URL for every listing on this page ---
    ip = outpg.find_all("div", {"class" : "search-result-wrapper"})
    url_current = ["" for x in range(0, len(ip[0].contents))]
    for i in range(0, len(url_current)):
        tt = ip[0].contents[i].a.get("href")
        # NOTE(review): both branches assign the same expression; the if/else
        # only differs in the ASCII-folding of `tt` before concatenation.
        if type(tt) == unicode:
            tt = unicodedata.normalize('NFKD', tt).encode('ascii','ignore')
            url_current[i] = site_path + tt
        else:
            url_current[i] = site_path + tt
    # inner page results: fetch each detail page and scrape its spec fields
    for url in url_current:
        inpage = BeautifulSoup(requests.get(url).text)
        wd = inpage.find("section", class_ = "widget detail")   # spec <dt>/<dd> widget
        bw = inpage.find("div", class_ = "bike-wrapper")        # carries data-make
        ta = inpage.find("article", class_ = "text-area")       # free-text description
        if wd != None:
            temppr2 = wd.find("dt", text = "Price:")
            if temppr2 != None:
                # NOTE(review): appends `temppr` — the list-page price of the
                # LAST listing scanned above — not the detail-page price;
                # `temppr2` is located but never dereferenced. Likely a bug.
                pr2.append(temppr)
            else:
                pr2.append("NaN")
            # color: the <dd> sibling of the "Color:" <dt>
            tempcol = wd.find("dt", text = "Color:")
            if tempcol != None:
                tempcol = tempcol.find_next_sibling("dd")
                if tempcol.string:
                    # "/" replaced by "." to keep values CSV-friendly
                    col.append(conv_var(tempcol.string.replace("/", '.')))
                else:
                    col.append("NaN")
            else:
                col.append("NaN")
            # condition: the <dd> sibling of the "Condition:" <dt>
            tempcondit = wd.find("dt", text = "Condition:")
            if tempcondit != None:
                tempcondit = tempcondit.find_next_sibling("dd")
                if tempcondit != None:
                    condit.append(str(tempcondit.string))
                else:
                    condit.append("NaN")
            else:
                condit.append("NaN")
            # mileage: raw string like "12,345" — converted to km further down
            tempmile = wd.find("dt", text = "Mileage:")
            if tempmile != None:
                tempmile = tempmile.find_next_sibling("dd")
                if tempmile != None:
                    mile.append(str(tempmile.string))
                else:
                    mile.append("NaN")
            else:
                mile.append("NaN")
            # city: text before the comma in the "Location:" <dd>
            tempcity = wd.find("dt", text = "Location:")
            if tempcity != None:
                tempcity = tempcity.find_next_sibling("dd")
                if tempcity != None:
                    city.append(str(tempcity.string.split(",")[0]))
                else:
                    city.append("NaN")
            else:
                city.append("NaN")
            # state: text after the comma of the same "Location:" <dd>
            # (re-found from scratch rather than reusing tempcity)
            tempstate = wd.find("dt", text = "Location:")
            if tempstate != None:
                tempstate = tempstate.find_next_sibling("dd")
                if tempstate != None:
                    state.append(str(tempstate.string.split(",")[1]))
                else:
                    state.append("NaN")
            else:
                state.append("NaN")
            # listing date — note the trailing space in "Listed: " is part of
            # the matched text on the site
            templisted = wd.find("dt", text = "Listed: ")
            if templisted != None:
                templisted = templisted.find_next_sibling("dd")
                if templisted.string:
                    listed.append(conv_var(templisted.string.replace("/", '.')))
                else:
                    listed.append("NaN")
            else:
                listed.append("NaN")
        else:
            # no spec widget on this detail page: keep all accumulators aligned
            col.append("NaN")
            condit.append("NaN")
            mile.append("NaN")
            city.append("NaN")
            state.append("NaN")
            listed.append("NaN")
        if bw != None:
            if bw.get("data-make") != None:
                make.append(conv_var(bw.get("data-make")))
            else:
                make.append("NaN")
        else: make.append("NaN")
        if ta != None:
            if ta.p != None:
                if ta.p.string:
                    description.append(conv_var(ta.p.string.replace("/", '.')))
                else:
                    description.append("NaN")
            else:
                description.append("NaN")
        else: description.append("NaN")
        # --- convert mileage strings (thousands-separated) to integer km ---
        # NOTE(review): rebuilt from scratch over ALL listings gathered so far
        # on every detail page — O(n^2) over the whole run, though harmless;
        # "NaN" strings parse as float nan and survive as np.nan.
        km = ["" for x in range(0, len(mile))]
        for i in range(0, len(mile)):
            for st in mile[i].split(","):
                km[i] = str(km[i])
                km[i] += st
            if math.isnan(float(km[i])) == False:
                km[i] = int(float(km[i])*1.60934)  # miles -> km
            else:
                km[i] = np.nan
        model.append(url.split("/")[-2])  # model slug taken from the URL path
    # rebind the result dict to the accumulators after each page so `dic`
    # always reflects everything scraped so far
    dic["year"] = yr
    dic["price"] = pr
    dic["price2"] = pr2
    dic["make"] = make
    dic["name"] = name
    dic["color"] = col
    dic["condition"] = condit
    dic["km"] = km
    dic["city"] = city
    dic["state"] = state
    dic["Id"] = Id
    dic["listed"] = listed
    dic["model"] = model
    dic["description"] = description
    #url for the next page
    # NOTE(review): assumes contents[6] of the first span.pgs is the "next"
    # anchor — brittle positional lookup; confirm against the live markup.
    next_page = outpg.find_all("span", {"class" : "pgs"})[0].contents[6].get("href")
    current_path = next_page
    print page
    page += 1
#bike = pd.DataFrame(dic)
#bike.to_csv('bike400.csv')
print("--- %s seconds ---" % (time.time() - start_time))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment