Skip to content

Instantly share code, notes, and snippets.

@ttresslar
Created August 16, 2019 09:26
Show Gist options
  • Save ttresslar/650f0768b514446711e9b2c766c27e96 to your computer and use it in GitHub Desktop.
Save ttresslar/650f0768b514446711e9b2c766c27e96 to your computer and use it in GitHub Desktop.
import pandas as pd
import requests, re, datetime, urllib.parse
from bs4 import BeautifulSoup as bs
from multiprocessing.pool import ThreadPool
def get_datelist(days=14, start=None):
    """Return consecutive calendar dates as ``YYYY-MM-DD`` strings.

    The list begins on the day *after* ``start`` and covers ``days`` days.
    Called with no arguments it returns the 14 days following today.

    Args:
        days: Number of dates to produce. Zero or a negative value gives an
            empty list.
        start: ``datetime.date`` or ``datetime.datetime`` the window is
            anchored on. Defaults to today's local date.

    Returns:
        A list of ``days`` date strings in ascending order.
    """
    if start is None:
        start = datetime.date.today()
    one_day = datetime.timedelta(days=1)
    return [
        (start + one_day * offset).strftime("%Y-%m-%d")
        for offset in range(1, days + 1)
    ]
# Build the departure dates that every later search is run against.
print("getting date list")
date_list = get_datelist()
def get_soup(origin, destination, date, timeout=30):
    """Fetch the BuuPass search page for one route and date and parse it.

    Args:
        origin: Departure town, inserted into the query string as given
            (no URL-encoding is applied here).
        destination: Arrival town, inserted as given.
        date: Departure date string placed in ``departure_date``.
        timeout: Seconds to wait on the server before ``requests`` raises,
            so a stalled connection cannot block the caller indefinitely.

    Returns:
        A BeautifulSoup document built from the response body with the
        ``lxml`` parser.
    """
    base_url = "https://buupass.com/Booking/search?from={}&to={}&departure_date={}"
    url = base_url.format(origin, destination, date)
    # Do not pass a parser name to requests.get: its second positional
    # argument is ``params``, and a string there is appended to the query.
    response = requests.get(url, timeout=timeout)
    return bs(response.content, features="lxml")
def get_ticket_info(soup):
    """Extract one record per ``<article>`` element of a search results page.

    Args:
        soup: Parsed results page (BeautifulSoup document).

    Returns:
        A list of dicts, one per article, with the keys ``bus_co``,
        ``origin``, ``destination``, ``price`` (digit string),
        ``seats_remaining`` (int), ``created_at`` (``datetime``) plus one
        key per label found in the article's ``div.time`` block.

    Raises:
        AttributeError: If an article lacks the ``h4``, price, action or
            time element the parser looks up.
        ValueError: If an article heading has fewer than three
            ``" - "``-separated parts.
        IndexError: If the price text contains no digits.
    """
    return [_parse_article(article) for article in soup.findAll("article")]


def _parse_article(article):
    """Turn a single ``<article>`` search result into a flat dict."""
    # maxsplit=2 yields at most three parts, so any further " - " stays in
    # the last part and the three-way unpacking cannot overflow.
    origin, destination, bus_co = article.h4.get_text().split(" - ", 2)

    price_text = article.find("span", {"class": "price listprice"}).get_text()
    price = re.findall(r"\d+", price_text)[0]

    action_text = article.find("div", {"class": "action"}).get_text()
    # A sold-out listing is rewritten to "0" so it parses like a seat count.
    action_text = action_text.strip().replace("\n", "").replace("SOLD OUT", "0")
    seat_digits = re.findall(r"\d+", action_text)
    seats_remaining = int(seat_digits[0]) if seat_digits else 0

    time_block = article.find("div", {"class": "time"})
    labels = [
        span.get_text().strip()
        for span in time_block.findAll("span", {"class": "skin-color"})
    ]
    values = [
        span.get_text().strip()
        for span in time_block.findAll("span", {"class": "search_data_values"})
    ]

    record = {
        "bus_co": bus_co,
        "origin": origin,
        "destination": destination,
        "price": price,
        "seats_remaining": seats_remaining,
        "created_at": datetime.datetime.today(),
    }
    # Labels and values are paired by position; a scraped label that equals
    # one of the keys above overrides it.
    record.update(zip(labels, values))
    return record
# Seed the result table with one fixed search: Nairobi -> Arusha on the
# first date in date_list, tagged with the search parameters that produced it.
print("First Iteration")
df = pd.DataFrame(
    get_ticket_info(get_soup("Nairobi", "Arusha", date_list[0]))
).assign(
    search_origin="Nairobi",
    search_destination="Arusha",
    search_date=date_list[0],
)
def get_places(soup):
    """List every origin/destination route offered by the page's search form.

    The first ``<select>`` on the page is read as the origin list and the
    second as the destination list. Options with a missing or empty
    ``value`` are skipped, and the remaining values are URL-quoted.

    Args:
        soup: Parsed page (BeautifulSoup document) containing the form.

    Returns:
        A list of ``[origin, destination]`` pairs covering every combination
        in which the two quoted values differ.

    Raises:
        IndexError: If the page has fewer than two ``<select>`` elements.
    """
    def quoted_values(select):
        raw_values = (option.get("value") for option in select.find_all("option"))
        return [urllib.parse.quote(value) for value in raw_values if value]

    selects = soup.find_all("select")
    origins = quoted_values(selects[0])
    destinations = quoted_values(selects[1])
    # Compare by equality, not identity: two equal strings are generally
    # distinct objects, so an ``is not`` test never filters same-town pairs.
    return [
        [origin, destination]
        for origin in origins
        for destination in destinations
        if origin != destination
    ]
# Discover the routes from the search form, then pair every route with every
# date. Each work item is [origin, destination, date], the shape df_loops
# unpacks. get_places returns a plain list, so the items are built directly
# instead of through DataFrame-style column assignment.
_from_to = get_places(get_soup("Nairobi", "Arusha", date_list[0]))
_from_to_date = [
    [origin, destination, date]
    for date in date_list
    for origin, destination in _from_to
]
csv_name = "./scraped" + datetime.datetime.today().strftime("%Y-%m-%d") + ".csv"
startTime = datetime.datetime.now()
def df_loops(_list):
    """Scrape one route/date combination and return its tickets as a frame.

    Args:
        _list: Three-item sequence ``(origin, destination, date)``.

    Returns:
        A DataFrame of the tickets found, with ``search_origin``,
        ``search_destination`` and ``search_date`` columns recording the
        query that produced them.
    """
    origin, destination, date = _list
    tickets = get_ticket_info(get_soup(origin, destination, date))
    return pd.DataFrame(tickets).assign(
        search_origin=origin,
        search_destination=destination,
        search_date=date,
    )
print("Starting loop at " + str(startTime))
with ThreadPool(10) as pool:
    # pool.map blocks until every search has finished and returns the
    # frames in the same order as _from_to_date.
    results = pool.map(df_loops, _from_to_date)

# DataFrame.append was removed in pandas 2.0, and appending frame by frame
# copies the whole table each time; concatenate everything in one pass.
# Searches that returned no tickets contribute no rows and are left out.
frames = [df] + [result for result in results if not result.empty]
df = pd.concat(frames, ignore_index=True, sort=True)
# Checkpoint the raw scrape before any post-processing.
df.to_csv(csv_name)
print("It took " + str(datetime.datetime.now() - startTime) + " to run this script")

df.drop_duplicates(keep="first", inplace=True)
# Make columns lowercase so they can easily be put in a database if needed.
df.columns = [column.lower() for column in df.columns]
df.to_csv(csv_name)
print("Final write of csv")
# Optional database export (requires a SQLAlchemy ``engine``):
# df.to_sql('buupass', engine, if_exists='append', index=False)
# engine.dispose()
# print("Saved to DB")
print("Finished")
@javix64
Copy link

javix64 commented Aug 20, 2020

I was searching for web scraping examples and found this gem. I will study it more deeply. I'm new to Python, and I love scraping everything. Multithreading is a good idea!

@ttresslar
Copy link
Author

Thanks @javix64, I'm glad it could help!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment