Created
December 18, 2017 19:02
-
-
Save thusoy/b22320f9f3ab46048ed5012ec5bf7a2c to your computer and use it in GitHub Desktop.
Scrape Uber trips and output them as CSV.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import argparse | |
import csv | |
import os | |
import time | |
import sys | |
import traceback | |
import datetime | |
from collections import OrderedDict | |
import requests | |
from bs4 import BeautifulSoup | |
# True when running under a Python 2 interpreter; used to work around the
# Python 2 csv module not encoding unicode values itself.
PY2 = sys.version_info[0] < 3

# Paste the cookie from your riders.uber.com browser session here.
# (Deliberately not a CLI argument, so it is never persisted in bash history.)
cookie = ''
def main():
    """Read the command-line options and scrape trips back to the cutoff."""
    cutoff_time = get_args().cutoff_time
    get_pages(cutoff_time)
def get_args():
    """Parse the command line and validate the script configuration.

    Returns:
        The argparse namespace. ``cutoff_time`` is a ``datetime.datetime``
        (midnight of the given day) when ``-c/--cutoff-time`` was supplied,
        otherwise ``None``.

    Exits with status 1 when the module-level ``cookie`` has not been filled
    in, and with a usage error (status 2) when the cutoff is not a valid
    YYYY-MM-DD date.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--cutoff-time', help="How far back to scan for trips. Give a "
        "date in YYYY-MM-DD format to set the cutoff point (inclusive)")
    args = parser.parse_args()

    if not cookie:
        # Trailing newline so the shell prompt is not glued to the message.
        sys.stderr.write("You haven't entered a cookie in the script\n")
        sys.exit(1)

    # `is not None` (rather than truthiness) so that an empty value is
    # rejected below instead of leaking through as a string.
    if args.cutoff_time is not None:
        try:
            args.cutoff_time = datetime.datetime.strptime(args.cutoff_time, '%Y-%m-%d')
        except ValueError:
            # Report bad input as a normal usage error, not a traceback.
            parser.error('--cutoff-time must be a date in YYYY-MM-DD format, got "%s"'
                % args.cutoff_time)

    return args
def get_pages(cutoff_time):
    """Download and parse trip pages one by one, starting at page 1.

    Each page is fetched exactly once and handed to ``parse_page``. Paging
    stops when ``parse_page`` reports a trip older than ``cutoff_time``, or
    when the page contains the 'btn--inactive pagination__next' marker
    (presumably the disabled "next page" button, i.e. the last page).

    Args:
        cutoff_time: Passed through unchanged to ``parse_page``.
    """
    page = 1
    while True:
        response = do_request(page)
        print('Downloaded page %d (%d)' % (page, response.status_code))

        # Parse before checking for the last-page marker so that the final
        # page (which may also be the first one) is never skipped.
        past_cutoff = parse_page(response.text, cutoff_time)
        if past_cutoff or 'btn--inactive pagination__next' in response.text:
            break

        # Pause between requests to go easy on the server.
        time.sleep(2)
        page += 1
def do_request(page, timeout=30):
    """Fetch one page of the trip history from riders.uber.com.

    Args:
        page: 1-based page number, sent as the ``page`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the script forever.

    Returns:
        The ``requests`` response object for the page.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:55.0) Gecko/20100101 Firefox/55.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        # Session cookie configured at the top of the script.
        'Cookie': cookie,
    }
    url = 'https://riders.uber.com/trips?page=%d' % page
    return requests.get(url, headers=headers, timeout=timeout)
def parse_page(page_html, cutoff_time):
    """Extract the trips from one page of HTML and append them to result.csv.

    Rows of the trip table are handled in two flavours: a row with a
    ``data-target`` attribute starts a new trip (driver and ride type are
    read from its cells), and any other row is treated as the detail row of
    a trip (date, addresses, arrival time, price and card).

    Args:
        page_html: HTML source of the trips page.
        cutoff_time: ``datetime.datetime`` lower bound, or ``None`` for no
            limit. Parsing stops at the first trip dated before it.

    Returns:
        True if a trip older than ``cutoff_time`` was encountered, else False.
    """
    soup = BeautifulSoup(page_html, 'html.parser')
    trips = OrderedDict()
    found_older_than_cutoff = False
    # Id of the most recently seen summary row; the detail row that follows
    # it describes the same trip.
    trip_id = None

    for element in soup.select('tbody tr'):
        target = element.get('data-target')
        if target:
            trip_id = target[len('#trip-'):]
            cells = element.select('td')
            trips[trip_id] = {}
            trips[trip_id]['driver'] = cells[2].text
            trips[trip_id]['ride_type'] = cells[4].text
            trips[trip_id]['trip_id'] = trip_id
        else:
            try:
                date = element.select('h6')[0].text
                parsed_date = datetime.datetime.strptime(date, "%A, %B %d, %Y %I:%M %p")
                # Only compare when a cutoff was requested; comparing a
                # datetime against None raises TypeError.
                if cutoff_time is not None and parsed_date < cutoff_time:
                    found_older_than_cutoff = True
                    # Drop the summary already recorded for this too-old trip.
                    trips.pop(trip_id, None)
                    break
                trip_id = element.select_one('.collapse')['id'][len('trip-'):]
                trips[trip_id]['date'] = date
                trips[trip_id]['from_address'] = element.select('h6')[1].text
                trips[trip_id]['to_address'] = element.select('h6')[2].text
                arrived_at = element.select('p.flush')
                # The code treats exactly two 'p.flush' paragraphs as a
                # cancelled trip; otherwise the third holds the arrival time.
                if len(arrived_at) == 2:
                    trips[trip_id]['ride_type'] += ' - CANCELLED'
                else:
                    trips[trip_id]['arrived_at'] = arrived_at[2].text
                trips[trip_id]['price'] = element.select('h3')[0].text
                trips[trip_id]['card'] = element.select('p.soft--bottom')[0].text
            except Exception:
                # Best effort: report the malformed row and keep going, but
                # let KeyboardInterrupt/SystemExit propagate.
                traceback.print_exc(file=sys.stderr)

    _append_trips_to_csv(trips)
    return found_older_than_cutoff


def _append_trips_to_csv(trips):
    """Append the given trips to result.csv, writing a header for a new file."""
    # for consistent ordering
    field_names = [
        'trip_id',
        'date',
        'from_address',
        'to_address',
        'driver',
        'ride_type',
        'arrived_at',
        'price',
        'card',
    ]
    target = 'result.csv'
    exists = os.path.exists(target)

    if PY2:
        # The Python 2 csv module expects a binary-mode file.
        fh = open(target, 'ab')
    else:
        # newline='' lets the csv module control line endings; utf-8 matches
        # the encoding used on Python 2.
        fh = open(target, 'a', newline='', encoding='utf-8')

    with fh:
        csv_writer = csv.DictWriter(fh, fieldnames=field_names)
        if not exists:
            csv_writer.writeheader()
        for trip in trips.values():
            if PY2:
                # csv on Python2 doesn't encode as utf8
                trip = dict(
                    (key, value.encode('utf-8') if isinstance(value, unicode) else value)  # noqa: F821
                    for key, value in trip.items()
                )
            csv_writer.writerow(trip)
# Only start scraping when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment