# used cars
import csv

import requests
from bs4 import BeautifulSoup

USED_CARS_URL = 'https://www.sgcarmart.com/used_cars/info.php?ID={}&DL={}'
LISTING_URL = 'https://www.sgcarmart.com/used_cars/listing.php?BRSR={}&VEH={}&RPG=100&AVL=2'
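# Quick sanity check of the listing template (values are illustrative):
#   LISTING_URL.format(0, 7)
#   -> 'https://www.sgcarmart.com/used_cars/listing.php?BRSR=0&VEH=7&RPG=100&AVL=2'
# BRSR appears to be the result offset, VEH the vehicle-category code,
# and RPG the number of results per page.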
def get_links(page_url):
    r = requests.get(page_url)
    response = r.text
    print('got page')
    soup = BeautifulSoup(response, 'lxml')
    links = soup.find_all('a')
    return links
def get_codes(links):
    # Pull the ID and DL query parameters out of every info.php link.
    codes = []
    for link in links:
        href = link.get('href')
        if href and 'info.php' in href:
            ID = href.split('?')[1].split('&')[0].split('=')[1]
            try:
                DL = href.split('?')[1].split('&')[1].split('=')[1]
            except IndexError:
                # No DL parameter on the link; fall back to a sentinel value.
                DL = '8000'
            codes.append([ID, DL])
    return codes
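# Illustrative parse (hypothetical hrefs):
#   'info.php?ID=1234567&DL=2000'  -> ['1234567', '2000']
#   'info.php?ID=1234567'          -> ['1234567', '8000']  (no DL, sentinel default)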
def fetch_and_save(codes, vehicle_type):
    index = 0
    for code in codes:
        # Each listing link appears twice on the page, so only process
        # every other code.
        if index % 2 == 0:
            url = USED_CARS_URL.format(code[0], code[1])
            owner_type = "Dealer"
            if int(code[1]) == 1000:
                owner_type = "Direct Owner Sale"
            try:
                r = requests.get(url, timeout=1)
                response = r.text.replace("\r", "").replace("\t", "").replace("\n", "")
                soup = BeautifulSoup(response, 'lxml')
                vehicle = soup.find("a", {"class": "link_redbanner"}).text
                try:
                    price = soup.find(string="Price").find_next('td').contents[0].text
                except (AttributeError, IndexError):
                    price = ""
                try:
                    dep = soup.find(string="Depreciation").find_next('td').contents[0].text
                except (AttributeError, IndexError):
                    dep = ""
                try:
                    reg_date = soup.find(string="Reg Date").find_next('td').contents[0].strip()
                except (AttributeError, IndexError):
                    reg_date = ''
                try:
                    man_f = soup.find(string="Manufactured").find_next('td').contents[0].strip()
                except (AttributeError, IndexError):
                    man_f = ""
                try:
                    mil = soup.find(string="Mileage").find_next('td').contents[0]
                except (AttributeError, IndexError):
                    mil = ''
                try:
                    road_tax = soup.find(string="Road Tax").find_next('td').contents[0]
                except (AttributeError, IndexError):
                    road_tax = '-'
                try:
                    coe = soup.find(string="COE").find_next('td').contents[0]
                except (AttributeError, IndexError):
                    coe = ''
                try:
                    arf = soup.find(string="ARF").find_next('td').contents[0]
                except (AttributeError, IndexError):
                    arf = ''
                try:
                    omv = soup.find(string="OMV").find_next('td').contents[0]
                except (AttributeError, IndexError):
                    omv = ''
                posted = soup.find("div", {"id": "usedcar_postdate"}).text
                posted = str(posted).split('|')[0].split(':')[1]
                row = [vehicle, vehicle_type, posted, owner_type, price,
                       dep, reg_date, man_f, mil, road_tax, coe, omv, arf]
                print(row)
                with open('task2.csv', 'a', newline='') as csv_file:
                    writer = csv.writer(csv_file)
                    writer.writerow(row)
                print('Saved {}'.format(vehicle))
            except Exception:
                # Request timed out or the page failed to parse; skip this car.
                print("Taking too long... skipped")
        index += 1
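# Optional sketch: write a header row once before scraping so task2.csv is
# self-describing. The column names here are assumptions mirroring `row` above.
# with open('task2.csv', 'w', newline='') as csv_file:
#     csv.writer(csv_file).writerow(['vehicle', 'type', 'posted', 'owner',
#                                    'price', 'depreciation', 'reg_date',
#                                    'manufactured', 'mileage', 'road_tax',
#                                    'coe', 'omv', 'arf'])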
# Don't try this at home... use a dictionary instead ;)
# Flat list of triples: VEH category code, category name, approximate listing count.
categories = [7, 'Station Wagon', 200, 13, 'Mid Sized Sedan', 1900,
              12, 'Luxury Sedan', 2600, 11, 'Hatchback', 1400,
              10, 'MPV', 1500, 9, 'SUV', 1900, 8, 'Sports car', 1500,
              5, 'Truck', 600, 4, 'Van', 600, 14, 'Bus/Mini Bus', 100,
              3, 'Others', 0]
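# The dictionary hinted at above might look like this (hypothetical shape):
#   categories = {7: ('Station Wagon', 200), 13: ('Mid Sized Sedan', 1900), ...}
# which would replace the index arithmetic below with a plain .items() loop.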
all_links = []
index = 0
for i in categories:
    # Every third element (index 0, 3, 6, ...) is a VEH code; the next two
    # entries are the category name and its approximate listing count.
    if isinstance(i, int) and index % 3 == 0:
        category = categories[index + 1]
        limit = int(categories[index + 2] / 100)
        for k in range(0, limit):
            cursor = k * 100
            page_links = get_links(LISTING_URL.format(str(cursor), str(i)))
            print("Category {}: page {} fetched".format(category, k))
            all_links = all_links + page_links
        all_codes = get_codes(all_links)
        # print(all_codes)
        fetch_and_save(all_codes, category)
        # Reset before the next category so earlier listings aren't re-scraped.
        all_links = []
    index += 1
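# Running this script end to end walks every category's listing pages,
# collects the info.php links, and appends one CSV row per car to task2.csv.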