Skip to content

Instantly share code, notes, and snippets.

@shaunagm
Last active May 4, 2022 21:17
Embed
What would you like to do?
Script for "Intro to ETL" Training (Part 1)
#-----------------------------------------------------------
# Import Necessary Packages
#-----------------------------------------------------------
import json
from parsons import Table, MobilizeAmerica, GoogleSheets
from datetime import datetime
#-----------------------------------------------------------
# Instantiate Classes/Connect to Mobilize and Google Sheets
#-----------------------------------------------------------
# Parsons looks for environmental variables MOBILIZE_AMERICA_API_KEY
# and GOOGLE_DRIVE_CREDENTIALS when you run the following lines
mobilize = MobilizeAmerica()
google_sheets = GoogleSheets()
#-----------------------------------------------------------
# Extract data from Mobilize America and explore the table
#-----------------------------------------------------------
attendance_records = mobilize.get_attendances()
# Let's take a look ...
# Parsons Table documentation: https://move-coop.github.io/parsons/html/table
# You can think of it as similar to a pandas df! But it's faster
attendance_records
# Each row can be treated as a dictionary
for index, attendance in enumerate(attendance_records):
print(attendance['person'])
if index == 5:
break
# Check the number of rows
attendance_records.num_rows
# Check out what columns exist in the table
attendance_records.columns
#-----------------------------------------------------------
# Transform Mobilize data
#-----------------------------------------------------------
# Convert date column from unix to EST.
# Quickly define a function that converts input to a legible date string
def convert_to_legible_date(unix_date):
return datetime.utcfromtimestamp(int(unix_date)).strftime('%Y-%m-%d %H:%M:%S')
# Now convert all columns in timeslot_start_date column using that function
attendance_records.convert_column('timeslot_start_date', convert_to_legible_date)
# Each persons contact info is crammed into one column as a JSON string. Ew!
# Let's fix that
# First convert column value from json string to python dictionary
attendance_records.convert_column('person', json.loads)
# Now use fun ETL function to unpack the dictionary into multiple columns!
attendance_records.unpack_dict('person', prepend=False)
# Finally, we're going to break the event attendances up by
# the month that they happened.
jan_attendances = attendance_records.select_rows("'2022-01' in {timeslot_start_date}")
feb_attendances = attendance_records.select_rows("'2022-02' in {timeslot_start_date}")
mar_attendances = attendance_records.select_rows("'2022-03' in {timeslot_start_date}")
### Side note for advanced usage!
# Parsons tables are built from PETL tables, which means you can use any PETL function on a parsons table!
# https://petl.readthedocs.io/en/stable/
# .table method converts into a petl table
petl_table = attendance_records.table
petl_table
# Example of a useful petl function to count signups by event -- Aggregate()
sign_ups_by_event_petl = petl_table.aggregate('event_title', len)
# Table() function converts back to parsons table
sign_ups_by_event_parsons = Table(sign_ups_by_event_petl)
#-----------------------------------------------------------
# Load data into Google Sheets
#-----------------------------------------------------------
spreadsheet_name = "Volunteer Attendance Records"
folder_id = "1y1jgygK5YUQLVrgRgNw7A8Hf2ppqOJJZ" # get from URL
# Creates a new spreadsheet
sheet_id = google_sheets.create_spreadsheet(spreadsheet_name, folder_id=folder_id)
# sheet_id = # get from command line for re-running script
# Use the ID to update the spreadsheet
google_sheets.overwrite_sheet(sheet_id, jan_attendances)
google_sheets.overwrite_sheet(sheet_id, feb_attendances)
# Note: if you want to be fancy, the optional parameter value_input_option = True
# allows you to supply data as formulas, not just raw values
# Overwrite erases any existing data. You can also append to the end of a sheet
google_sheets.append_to_sheet(sheet_id, feb_attendances)
google_sheets.append_to_sheet(sheet_id, mar_attendances)
# You can even format cells!
red = {"red": 1.0, "green": 0.0, "blue": 0.0}
google_sheets.format_cells(sheet_id, "A1", {"backgroundColor": red}, worksheet=0)
# Let's mark all the people who failed to attend in red.
google_sheets.overwrite_sheet(sheet_id, attendance_records) # overwrite sheet
for index, row in enumerate(attendance_records):
adjusted_index = index + 2 # accounts for python zero-indexing and header row
if row["attended"] == "false":
cell_range = f"A{adjusted_index}:N{adjusted_index}"
google_sheets.format_cells(sheet_id, cell_range, {"backgroundColor": red}, worksheet=0)
# NOTE: the Parsons connector is more limited than the Google Sheets API. If you want to use something on the Google Sheets API,
# you can access the client. Let's mark the people who failed to attend in red again, this time using the underlying client
# to do so.
google_sheets.overwrite_sheet(sheet_id, attendance_records) # overwrite sheet
worksheet = google_sheets.gspread_client.open(spreadsheet_name).sheet1 # get client's worksheet object
for index, row in enumerate(attendance_records):
adjusted_index = index + 2 # accounts for python zero-indexing and header row
if row["attended"] == "false":
cell_range = f"A{adjusted_index}:N{adjusted_index}"
worksheet.format(cell_range, {"backgroundColor": red})
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment