Skip to content

Instantly share code, notes, and snippets.

@mobbarley78110
Created February 26, 2022 14:54
Show Gist options
  • Save mobbarley78110/c87518b2ac0d2e64c8c294cdec5e2f7e to your computer and use it in GitHub Desktop.
Save mobbarley78110/c87518b2ac0d2e64c8c294cdec5e2f7e to your computer and use it in GitHub Desktop.
Scrape Fedex data and geolocation
import pandas as pd
import requests
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
# helper functions
def get_raw(a):
    """Return the raw geocode payload of *a*, or None when *a* is not a geocode result.

    geopy returns None for locations it cannot resolve; attribute access on
    that None raises AttributeError, which we treat as "no data".
    """
    try:
        return a.raw
    except AttributeError:  # was a bare except: — catch only the expected failure
        return None
def get_lat(a):
    """Return the latitude of geocode result *a*, or None when *a* is None/invalid."""
    try:
        return a.latitude
    except AttributeError:  # was a bare except: — catch only the expected failure
        return None
def get_lon(a):
    """Return the longitude of geocode result *a*, or None when *a* is None/invalid."""
    try:
        return a.longitude
    except AttributeError:  # was a bare except: — catch only the expected failure
        return None
# creating the FedEx web scraper function, returns a dictionary
def get_package_details_fedex(track_no):
    """Scrape FedEx tracking details for *track_no* from the public tracking endpoint.

    Returns a dict with the delivery date plus origin/destination city, state,
    zip and country, or None when the request fails or the tracking number is
    unknown.
    """
    try:
        track_no = str(track_no)
        header = {
            'Origin': 'https://www.fedex.com',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/59.0.3071.115 Safari/537.36',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Accept': '*/*',
            'X-Requested-With': 'XMLHttpRequest',
            'Referer': 'https://www.fedex.com/apps/fedextrack/?tracknumbers=%s&locale=en_CA&cntry_code=ca_english' % track_no,
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'en-US,en;q=0.8,fr;q=0.6,ta;q=0.4,bn;q=0.2'
        }
        data = {
            'action': 'trackpackages',
            'data': '{"TrackPackagesRequest":{"appType":"WTRK","appDeviceType":"DESKTOP","uniqueKey":"",'
                    '"processingParameters":{},"trackingInfoList":[{"trackNumberInfo":{"trackingNumber":"%s",'
                    '"trackingQualifier":"","trackingCarrier":""}}]}}' % track_no,
            'format': 'json',
            'locale': 'en_CA',
            'version': '1'
        }
        url = "https://www.fedex.com/trackingCal/track"
        # timeout prevents the whole script from hanging on a stalled connection
        response = requests.post(url, data=data, headers=header, timeout=30)
        if response.status_code != 200:
            return None
        # all fields of interest live under the first (and only) package entry
        package = response.json()['TrackPackagesResponse']['packageList'][0]
        if package['errorList'][0]['message'] != "":
            # a non-empty error message means the tracking number is unknown
            return None
        return {
            'delivery_date': package['displayActDeliveryDt'],
            'dest_city': package['destLocationCity'],
            'dest_state': package['destLocationStateCD'],
            'dest_zip': package['destLocationZip'],
            'dest_country': package['destLocationCntryCD'],
            'orig_city': package['originCity'],
            'orig_state': package['originStateCD'],
            'orig_zip': package['originZip'],
            'orig_country': package['originCntryCD'],
        }
    except Exception as e:
        # best-effort scraper: log the failure and fall through to an implicit None
        print(f'Error occurred on awb: {track_no}. \n Error Message : ' + str(e))
# build a small demo table of letters/gifts with messy, hand-typed tracking numbers
tracking = pd.DataFrame({
    'LETTER_ID': ['L0001', 'L0002', 'L0003', 'L0004'],
    'LOVER': ['Michael', 'Pam', 'Angela', 'Jim'],
    'GIFT_VALUE': [2000, 15, 5, 50],
    'TRACKING_NO': ['FDX GRND 289724287655', 'AWB 9128 4510 1597',
                    'FEDEX 288542243907', 'RECEIVED ON 511593037823'],
})
# keep only the digits: prefixes and stray spaces were added at data-entry time
tracking['TRACKING_NO_CLEAN'] = tracking['TRACKING_NO'].str.replace(r'[^0-9]', '', regex=True)
# adding columns to our data frame, and using the scraper function by iterating on rows
tracking = tracking.reindex(columns=tracking.columns.tolist() + ['STATUS', 'DELIVERY_DATE', 'ORIGIN', 'DESTINATION'])
for row in tracking.itertuples():
    index = row.Index
    track_id = row.TRACKING_NO_CLEAN
    result_dict = get_package_details_fedex(track_id)
    # BUG FIX: the original tested `track_id is not None`, which is always true,
    # so a failed lookup (result_dict is None) crashed on the subscripts below.
    if result_dict is not None:
        tracking.loc[index, 'STATUS'] = 'data complete'
        tracking.loc[index, 'DELIVERY_DATE'] = result_dict['delivery_date']
        tracking.loc[index, 'ORIGIN'] = result_dict['orig_city'] + ', ' + result_dict['orig_state'] + ', ' + result_dict['orig_country']
        tracking.loc[index, 'DESTINATION'] = result_dict['dest_city'] + ', ' + result_dict['dest_state'] + ', ' + result_dict['dest_country']
    else:
        tracking.loc[index, 'STATUS'] = 'no data found'
# strip punctuation from the city strings and build an index of unique cities
for clean_col, src_col in (('ORIGIN_CLEAN', 'ORIGIN'), ('DESTINATION_CLEAN', 'DESTINATION')):
    tracking[clean_col] = tracking[src_col].str.replace(r'[^a-zA-Z0-9 ]', '', regex=True).str.strip()
cities = list(set(tracking['ORIGIN_CLEAN'].unique()) | set(tracking['DESTINATION_CLEAN'].unique()))
# one row per distinct city; the geocode/lat/lon columns start empty
places = pd.DataFrame(columns=['city', 'geocode', 'raw_geocode', 'lat', 'lon'])
places['city'] = cities
# create the geolocation function.  Nominatim requires an identifying user
# agent and polite request pacing, hence the RateLimiter wrapper.
geolocator = Nominatim(user_agent='myapplication')
limited_geolocator = RateLimiter(geolocator.geocode, min_delay_seconds=5)
# geocode each unique city once.
# BUG FIX: the original applied the unthrottled `geolocator.geocode` here,
# leaving `limited_geolocator` unused and hammering the Nominatim service.
places['geocode'] = places['city'].apply(limited_geolocator)
places['raw_geocode'] = places['geocode'].apply(get_raw)
places['lat'] = places['geocode'].apply(get_lat)
places['lon'] = places['geocode'].apply(get_lon)
# attach the coordinates to the main table, ready to be used in viz.
# NOTE(review): only ORIGIN coordinates are joined; destination lat/lon are
# never merged — confirm this is intended.
tracking = tracking.merge(
    places[['city', 'lon', 'lat']],
    left_on='ORIGIN_CLEAN',
    right_on='city',
    how='left',
)
# drop the intermediate working columns before export
tracking.drop(
    columns=['city', 'TRACKING_NO', 'TRACKING_NO_CLEAN',
             'STATUS', 'DELIVERY_DATE', 'ORIGIN', 'DESTINATION'],
    inplace=True,
)
# save file as csv
#tracking.to_csv('letters.csv')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment