@thusoy
Created December 18, 2017 19:02
Scrape Uber trips and output as csv.
#!/usr/bin/env python
import argparse
import csv
import os
import time
import sys
import traceback
import datetime
from collections import OrderedDict
import requests
from bs4 import BeautifulSoup

PY2 = sys.version_info < (3, 0, 0)
# Get the cookie from the web site and add it here:
# (not putting this as a CLI argument to prevent it from being persisted in bash history)
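# One way to get the cookie (just a pointer, adapt to your browser): log in to
# https://riders.uber.com, open the developer tools' Network tab, reload the
# trips page and copy the value of the "Cookie" request header into the
# variable below.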
cookie = ''

def main():
    args = get_args()
    get_pages(args.cutoff_time)

def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--cutoff-time', help="How far back to scan for trips. Give a "
        "date in YYYY-MM-DD format to set the cutoff point (inclusive)")
    args = parser.parse_args()
    if not cookie:
        sys.stderr.write("You haven't entered a cookie in the script\n")
        sys.exit(1)
    if args.cutoff_time:
        args.cutoff_time = datetime.datetime.strptime(args.cutoff_time, '%Y-%m-%d')
    return args

def get_pages(cutoff_time):
    page = 1
    while True:
        response = do_request(page)
        print('Downloaded page %d (%d)' % (page, response.status_code))
        past_cutoff = parse_page(response.text, cutoff_time)
        # Stop when the cutoff date has been reached, or when the "next"
        # button is rendered as inactive, which marks the last page of results
        if past_cutoff or 'btn--inactive pagination__next' in response.text:
            break
        time.sleep(2)
        page += 1

def do_request(page):
    return requests.get('https://riders.uber.com/trips?page=%d' % page, headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:55.0) Gecko/20100101 Firefox/55.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Cookie': cookie,
    })

def parse_page(page_html, cutoff_time):
    soup = BeautifulSoup(page_html, 'html.parser')
    trips = OrderedDict()
    found_older_than_cutoff = False
    for element in soup.select('tbody tr'):
        target = element.get('data-target')
        if target:
            # Summary row: carries the trip id, driver and ride type
            trip_id = target[len('#trip-'):]
            trips[trip_id] = {}
            driver = element.select('td')[2].text
            trips[trip_id]['driver'] = driver
            trips[trip_id]['ride_type'] = element.select('td')[4].text
            trips[trip_id]['trip_id'] = trip_id
        else:
            # Detail row: carries date, addresses, arrival time, price and card
            try:
                date = element.select('h6')[0].text
                parsed_date = datetime.datetime.strptime(date, "%A, %B %d, %Y %I:%M %p")
                # Only compare when a cutoff was given (-c is optional)
                if cutoff_time and parsed_date < cutoff_time:
                    found_older_than_cutoff = True
                    del trips[trip_id]
                    break
                trip_id = element.select_one('.collapse')['id'][len('trip-'):]
                trips[trip_id]['date'] = date
                trips[trip_id]['from_address'] = element.select('h6')[1].text
                trips[trip_id]['to_address'] = element.select('h6')[2].text
                arrived_at = element.select('p.flush')
                if len(arrived_at) == 2:
                    # Cancelled trips don't have an arrival time
                    trips[trip_id]['ride_type'] += ' - CANCELLED'
                else:
                    trips[trip_id]['arrived_at'] = arrived_at[2].text
                trips[trip_id]['price'] = element.select('h3')[0].text
                trips[trip_id]['card'] = element.select('p.soft--bottom')[0].text
            except Exception:
                traceback.print_exc(file=sys.stderr)

    # for consistent ordering
    field_names = [
        'trip_id',
        'date',
        'from_address',
        'to_address',
        'driver',
        'ride_type',
        'arrived_at',
        'price',
        'card',
    ]
    target = 'result.csv'
    exists = os.path.exists(target)
    with open(target, 'a') as fh:
        csv_writer = csv.DictWriter(fh, fieldnames=field_names)
        if not exists:
            csv_writer.writeheader()
        for trip in trips.values():
            if PY2:
                # csv on Python 2 doesn't encode as utf-8
                for key, value in trip.items():
                    trip[key] = value.encode('utf-8') if isinstance(value, unicode) else value
            csv_writer.writerow(trip)
    return found_older_than_cutoff

if __name__ == '__main__':
    main()
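
Usage sketch (the filename uber_trips.py is only an assumption about how you save the gist): paste a logged-in riders.uber.com session cookie into the cookie variable at the top of the script, then run

    python uber_trips.py --cutoff-time 2017-01-01

to fetch trips back to and including that date, or omit the flag to fetch all trips. Each run appends the scraped trips to result.csv in the current working directory.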