Skip to content

Instantly share code, notes, and snippets.

@jwhendy
Last active April 7, 2020 23:40
Show Gist options
  • Save jwhendy/d28f17fd1837c1a3679768244969ef2b to your computer and use it in GitHub Desktop.
Save jwhendy/d28f17fd1837c1a3679768244969ef2b to your computer and use it in GitHub Desktop.
import datetime
import pandas as pd
import os
import re
import subprocess
import time
### assumes you have run:
# cd path/mobility-report-data-extractor
# python ./mobility.py download -p US-foo
# python ./mobility.py proc svgs/US-foo.svg ./output
# The six mobility categories, in the order the reports list them.  The
# sequence is stored twice over (12 entries in total); presumably this mirrors
# the two-locations-per-page layout of the county pages -- confirm before
# shortening it.
seg_list = [
    'Retail & recreation',
    'Grocery & pharmacy',
    'Parks',
    'Transit stations',
    'Workplace',
    'Residential',
] * 2
### set these paths
path = '/home/uname/foo' # base path containing mobility-report-data-extractor
dir_mob = 'mobility-report-data-extractor'
# One entry per item in <path>/<dir_mob>/output whose name starts with 'US'.
# os.listdir() returns names in arbitrary order, so sort them to keep the
# order in which areas are processed (and hence the final row order)
# reproducible between runs and machines.
areas = sorted(d for d in os.listdir(os.path.join(path, dir_mob, 'output'))
               if d.startswith('US'))
# start = time.time()
# Convert every area's PDF report to text, scrape the headline summary and the
# per-location figures out of it, and collect everything into the DataFrame
# `df` (one row per area / location / category).
# Relies on `areas`, `path`, `dir_mob` and `seg_list` defined earlier in the file.
data_all = []
for area_dir in areas:
    ### process pdf to text
    base = os.path.join(path, dir_mob, 'pdfs', area_dir)
    subprocess.call(['/usr/bin/pdftotext', '-layout', '-raw', f'{base}.pdf', f'{base}.txt'])
    # poppler's pdftotext writes UTF-8 by default; read it back as UTF-8
    # instead of depending on the locale's preferred encoding
    with open(f'{base}.txt', encoding='utf-8') as txt:
        # keep only non-blank lines; all offsets below count non-blank lines
        lines = [l for l in txt.read().split('\n') if l.strip()]

    # lines[1] is parsed as "<area name> <Month> <day>, <year>"
    header = re.split(', | ', lines[1])
    date = f'{header[-1]}-{header[-3]}-{header[-2]}'
    # NOTE: `date` is normalised to YYYY-MM-DD here but is not stored in the rows
    date = datetime.datetime.strptime(date, '%Y-%B-%d').strftime('%Y-%m-%d')
    area = ' '.join(header[:-3])

    data = []
    for i, line in enumerate(lines):
        ### headline summary: anchored on the first category title near the top
        if 'Retail & recreation' in line and i < 20:
            # the six summary values sit at fixed offsets below the anchor line
            vals = [re.sub(r'%|\+', '', lines[i+x]) for x in [1, 13, 26, 38, 49, 59]]
            # zip stops after the six values, so only the first six categories are used
            data.extend({'area': area, 'loc': 'summary', 'seg': seg, 'ast': None, 'value': val}
                        for seg, val in zip(seg_list, vals))

        ### per-location pages: a line containing a form feed starts a new page
        if '\f' in line and i > 50:
            # up to two locations per page (at offsets 0 and 13), six categories each
            locs = [x.strip() for x in [lines[i], lines[i+13]] for _ in range(6)]
            # drop candidates made of four or more space-separated words
            # (presumably text that is not a location name -- confirm)
            locs = [l for l in locs if len(l.split(' ')) < 4]
            # for each line in the next 110 that starts with 'Sun', look at the
            # line just before it: a lone '*' there is recorded as the marker
            asts = [lines[i+n-1] for n, x in enumerate(lines[i:i+110]) if x.startswith('Sun')]
            asts = [ast if ast == '*' else None for ast in asts]
            # each value line directly follows its category line (see `segs`)
            vals = [re.sub(r'%|\+|compared to baseline', '', lines[i+x])
                    for x in [2, 4, 6, 8, 10, 12, 15, 17, 19, 21, 23, 25]]
            vals = [val.strip(' ') if val != 'Not enough data for this date' else None
                    for val in vals]
            segs = [lines[i+n+1] for n in [0, 2, 4, 6, 8, 10, 13, 15, 17, 19, 21, 23]]
            # separate index name so the outer line index `i` is not clobbered
            for j, loc in enumerate(locs):
                # skip slots whose text is not one of the known category titles
                if segs[j] not in seg_list:
                    continue
                data.append({'area': area, 'loc': loc, 'seg': segs[j],
                             'ast': asts[j], 'value': vals[j]})

    # number this area's rows from 1 and record the matching CSV path
    for seq, d in enumerate(data, start=1):
        d['i'] = seq
        d['path'] = f'output/US-{d["area"]}/{seq}.csv'
    data_all.extend(data)

# naming the columns keeps them present (and `df['value']` valid) even when
# no area produced any rows
df = pd.DataFrame(data_all, columns=['area', 'loc', 'seg', 'ast', 'value', 'i', 'path'])
df['value'] = pd.to_numeric(df['value']) # fix missing values (None becomes NaN)
# end = time.time()
# timing on above
# print(end-start) # 8.454864130020142
### example output (run on directory of all 50 states)
# df
# area loc seg ast value i path
#0 Alabama summary Retail & recreation None -41 1 output/US-Alabama/1.csv
#1 Alabama summary Grocery & pharmacy None -13 2 output/US-Alabama/2.csv
#2 Alabama summary Parks None 19 3 output/US-Alabama/3.csv
#3 Alabama summary Transit stations None -30 4 output/US-Alabama/4.csv
#4 Alabama summary Workplace None -32 5 output/US-Alabama/5.csv
#... ... ... ... ... ... ... ...
#17095 Wyoming Weston County Grocery & pharmacy * -24 134 output/US-Wyoming/134.csv
#17096 Wyoming Weston County Parks * None 135 output/US-Wyoming/135.csv
#17097 Wyoming Weston County Transit stations * None 136 output/US-Wyoming/136.csv
#17098 Wyoming Weston County Workplace * -34 137 output/US-Wyoming/137.csv
#17099 Wyoming Weston County Residential * None 138 output/US-Wyoming/138.csv
#
#[17100 rows x 7 columns]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment