-
-
Save ashtom84/ff18f571e19955f08dc0 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# --- timing + imports (Python 2 script: see bare `unicode` and `print page` below) ---
import time
start_time = time.time()  # wall-clock start; elapsed time is printed at the end of the run
import math
import re
from bs4 import BeautifulSoup
import requests
import unicodedata
import numpy as np
import pandas as pd
# Convert unicode to string
def conv_var(var):
    """Return *var* as a plain ASCII byte string if it is unicode text.

    Python 2 only: ``unicode`` is the builtin text type. Accented characters
    are decomposed via NFKD normalization, then any non-ASCII code points are
    dropped by ``encode('ascii', 'ignore')``. Any non-unicode value (already a
    ``str``, ``None``, numbers, ...) passes through unchanged.
    """
    # isinstance() is the idiomatic type test (was: type(var) == unicode)
    if isinstance(var, unicode):
        var = unicodedata.normalize('NFKD', var).encode('ascii', 'ignore')
    return var
# --- Initialization of the website search path ---
site_path = "https://www.cyclecrunch.com"
search_path = "/Search/All_Makes/All_Models/Motorcycles"
page = 1       # first results page to scrape
nbpage = 2501  # loop runs while page < nbpage, i.e. pages 1..2500

# Per-field accumulators; each listing appends exactly one value to each.
# NOTE(review): the original initialized `loc` twice (once in each tuple
# assignment); the redundant second initialization is dropped here.
yr, pr, Id, name, loc, col, listed, model, description = [], [], [], [], [], [], [], [], []
condit, mile, ttl, city, state, make, pr2 = [], [], [], [], [], [], []

# Result dictionary: every key is rebound to its accumulator list after each
# page is scraped, so the empty lists here only document the schema.
dic = {"year" : [], "price" : [], "make" : [], "name" : [], "color" : [], "condition" : [],
       "km" : [], "city" : [], "state" : [], "Id" : [], "model" : [], "listed" : [],
       "price2" : [], "description" : []}

current_path = search_path  # path of the page about to be fetched
### while loop to sift through the pages
# sift through the first pages of the website
#
# NOTE(review): all leading indentation was lost in this paste; the structure
# below is reconstructed from the code's syntax — verify against the original
# before running.
while page < nbpage:
    # output for the first page
    # (no explicit parser passed to BeautifulSoup — relies on bs4's default)
    outpg = BeautifulSoup(requests.get(site_path + current_path).text)
    # NOTE(review): `res` is never used afterwards — dead variable.
    res = ["bike bikeresult-grid feat", "bike bikeresult-grid feat hglt"]
    # details contained in the current page: collect the first result <div>
    # and then every following sibling <div> under the results wrapper
    opg2 = outpg.find("div", class_ = "search-result-wrapper")
    opg = []
    opg.append(opg2.find("div"))
    while opg[-1].find_next_sibling("div") != None:
        opg.append(opg[-1].find_next_sibling("div"))
    # --- listing-level attributes scraped from each result <div> ---
    for x in opg:
        tempyr = x.get("data-year")
        if tempyr != None: yr.append(tempyr)
        else: yr.append("NaN")
        temppr = x.get("data-price")
        if temppr != None: pr.append(temppr)
        else: pr.append("NaN")
        # NOTE(review): Id is read from "data-year", so Id duplicates yr —
        # almost certainly meant to read a "data-id"-style attribute.
        tempId = x.get("data-year")
        if tempId != None: Id.append(tempId)
        else: Id.append("NaN")
        tempname = x.div.find("div", {"itemprop" : "name"}).string
        if tempname != None:
            # strip the registered-trademark char (\xae) before ASCII-folding
            name.append(conv_var(re.sub('\xae', '', tempname)))
        else:
            name.append("NaN")
        # NOTE(review): str() is applied BEFORE the None test, so a missing
        # location becomes the string "None" and the "NaN" branch is dead.
        temploc = str(x.div.find("div", {"class" : "location"}).string)
        if temploc != None:
            loc.append(temploc)
        else:
            loc.append("NaN")
        tempttl = conv_var(x.div.find("div", {"class" : "ttl"}).get("title"))
        if tempttl != None:
            ttl.append(tempttl)
        else:
            ttl.append("NaN")
    # --- build the detail-page URL for every listing on this page ---
    ip = outpg.find_all("div", {"class" : "search-result-wrapper"})
    url_current = ["" for x in range(0, len(ip[0].contents))]
    for i in range(0, len(url_current)):
        tt = ip[0].contents[i].a.get("href")
        # NOTE(review): both branches assign the same expression; the if/else
        # only differs in the ASCII-folding of `tt` before concatenation.
        if type(tt) == unicode:
            tt = unicodedata.normalize('NFKD', tt).encode('ascii','ignore')
            url_current[i] = site_path + tt
        else:
            url_current[i] = site_path + tt
    # inner page results: fetch each detail page and scrape its spec fields
    for url in url_current:
        inpage = BeautifulSoup(requests.get(url).text)
        wd = inpage.find("section", class_ = "widget detail")   # spec <dt>/<dd> widget
        bw = inpage.find("div", class_ = "bike-wrapper")        # carries data-make
        ta = inpage.find("article", class_ = "text-area")       # free-text description
        if wd != None:
            temppr2 = wd.find("dt", text = "Price:")
            if temppr2 != None:
                # NOTE(review): appends `temppr` — the list-page price of the
                # LAST listing scanned above — not the detail-page price;
                # `temppr2` is located but never dereferenced. Likely a bug.
                pr2.append(temppr)
            else:
                pr2.append("NaN")
            # color: the <dd> sibling of the "Color:" <dt>
            tempcol = wd.find("dt", text = "Color:")
            if tempcol != None:
                tempcol = tempcol.find_next_sibling("dd")
                if tempcol.string:
                    # "/" replaced by "." to keep values CSV-friendly
                    col.append(conv_var(tempcol.string.replace("/", '.')))
                else:
                    col.append("NaN")
            else:
                col.append("NaN")
            # condition: the <dd> sibling of the "Condition:" <dt>
            tempcondit = wd.find("dt", text = "Condition:")
            if tempcondit != None:
                tempcondit = tempcondit.find_next_sibling("dd")
                if tempcondit != None:
                    condit.append(str(tempcondit.string))
                else:
                    condit.append("NaN")
            else:
                condit.append("NaN")
            # mileage: raw string like "12,345" — converted to km further down
            tempmile = wd.find("dt", text = "Mileage:")
            if tempmile != None:
                tempmile = tempmile.find_next_sibling("dd")
                if tempmile != None:
                    mile.append(str(tempmile.string))
                else:
                    mile.append("NaN")
            else:
                mile.append("NaN")
            # city: text before the comma in the "Location:" <dd>
            tempcity = wd.find("dt", text = "Location:")
            if tempcity != None:
                tempcity = tempcity.find_next_sibling("dd")
                if tempcity != None:
                    city.append(str(tempcity.string.split(",")[0]))
                else:
                    city.append("NaN")
            else:
                city.append("NaN")
            # state: text after the comma of the same "Location:" <dd>
            # (re-found from scratch rather than reusing tempcity)
            tempstate = wd.find("dt", text = "Location:")
            if tempstate != None:
                tempstate = tempstate.find_next_sibling("dd")
                if tempstate != None:
                    state.append(str(tempstate.string.split(",")[1]))
                else:
                    state.append("NaN")
            else:
                state.append("NaN")
            # listing date — note the trailing space in "Listed: " is part of
            # the matched text on the site
            templisted = wd.find("dt", text = "Listed: ")
            if templisted != None:
                templisted = templisted.find_next_sibling("dd")
                if templisted.string:
                    listed.append(conv_var(templisted.string.replace("/", '.')))
                else:
                    listed.append("NaN")
            else:
                listed.append("NaN")
        else:
            # no spec widget on this detail page: keep all accumulators aligned
            col.append("NaN")
            condit.append("NaN")
            mile.append("NaN")
            city.append("NaN")
            state.append("NaN")
            listed.append("NaN")
        if bw != None:
            if bw.get("data-make") != None:
                make.append(conv_var(bw.get("data-make")))
            else:
                make.append("NaN")
        else: make.append("NaN")
        if ta != None:
            if ta.p != None:
                if ta.p.string:
                    description.append(conv_var(ta.p.string.replace("/", '.')))
                else:
                    description.append("NaN")
            else:
                description.append("NaN")
        else: description.append("NaN")
        # --- convert mileage strings (thousands-separated) to integer km ---
        # NOTE(review): rebuilt from scratch over ALL listings gathered so far
        # on every detail page — O(n^2) over the whole run, though harmless;
        # "NaN" strings parse as float nan and survive as np.nan.
        km = ["" for x in range(0, len(mile))]
        for i in range(0, len(mile)):
            for st in mile[i].split(","):
                km[i] = str(km[i])
                km[i] += st
            if math.isnan(float(km[i])) == False:
                km[i] = int(float(km[i])*1.60934)  # miles -> km
            else:
                km[i] = np.nan
        model.append(url.split("/")[-2])  # model slug taken from the URL path
    # rebind the result dict to the accumulators after each page so `dic`
    # always reflects everything scraped so far
    dic["year"] = yr
    dic["price"] = pr
    dic["price2"] = pr2
    dic["make"] = make
    dic["name"] = name
    dic["color"] = col
    dic["condition"] = condit
    dic["km"] = km
    dic["city"] = city
    dic["state"] = state
    dic["Id"] = Id
    dic["listed"] = listed
    dic["model"] = model
    dic["description"] = description
    #url for the next page
    # NOTE(review): assumes contents[6] of the first span.pgs is the "next"
    # anchor — brittle positional lookup; confirm against the live markup.
    next_page = outpg.find_all("span", {"class" : "pgs"})[0].contents[6].get("href")
    current_path = next_page
    print page
    page += 1
#bike = pd.DataFrame(dic)
#bike.to_csv('bike400.csv')
print("--- %s seconds ---" % (time.time() - start_time))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment