shaunagm/etl_training_script.py

## etl_training_script.py
#-----------------------------------------------------------
# Import Necessary Packages
#-----------------------------------------------------------
import json
from parsons import Table, MobilizeAmerica, GoogleSheets
from datetime import datetime


#-----------------------------------------------------------
# Instantiate Classes/Connect to Mobilize and Google Sheets
#-----------------------------------------------------------
# Parsons looks for environmental variables MOBILIZE_AMERICA_API_KEY
# and GOOGLE_DRIVE_CREDENTIALS when you run the following lines
mobilize = MobilizeAmerica()
google_sheets = GoogleSheets()


#-----------------------------------------------------------
# Extract data from Mobilize America and explore the table
#-----------------------------------------------------------
attendance_records = mobilize.get_attendances()

# Let's take a look ...
# Parsons Table documentation: https://move-coop.github.io/parsons/html/table
# You can think of it as similar to a pandas df! But it's faster
attendance_records

# Each row can be treated as a dictionary
for index, attendance in enumerate(attendance_records):
  print(attendance['person'])
  if index == 5:
    break

# Check the number of rows
attendance_records.num_rows

# Check out what columns exist in the table
attendance_records.columns


#-----------------------------------------------------------
# Transform Mobilize data
#-----------------------------------------------------------
# Convert date column from unix to EST.
# Quickly define a function that converts input to a legible date string
def convert_to_legible_date(unix_date):
    return datetime.utcfromtimestamp(int(unix_date)).strftime('%Y-%m-%d %H:%M:%S')

# Now convert all columns in timeslot_start_date column using that function
attendance_records.convert_column('timeslot_start_date', convert_to_legible_date)

# Each persons contact info is crammed into one column as a JSON string. Ew!
# Let's fix that
# First convert column value from json string to python dictionary
attendance_records.convert_column('person', json.loads)

# Now use fun ETL function to unpack the dictionary into multiple columns!
attendance_records.unpack_dict('person', prepend=False)

# Finally, we're going to break the event attendances up by
# the month that they happened.
jan_attendances = attendance_records.select_rows("'2022-01' in {timeslot_start_date}")
feb_attendances = attendance_records.select_rows("'2022-02' in {timeslot_start_date}")
mar_attendances = attendance_records.select_rows("'2022-03' in {timeslot_start_date}")

### Side note for advanced usage!
# Parsons tables are built from PETL tables, which means you can use any PETL function on a parsons table!
# https://petl.readthedocs.io/en/stable/
# .table method converts into a petl table
petl_table = attendance_records.table
petl_table
# Example of a useful petl function to count signups by event -- Aggregate()
sign_ups_by_event_petl = petl_table.aggregate('event_title', len)
# Table() function converts back to parsons table
sign_ups_by_event_parsons = Table(sign_ups_by_event_petl)


#-----------------------------------------------------------
# Load data into Google Sheets
#-----------------------------------------------------------

spreadsheet_name = "Volunteer Attendance Records"
folder_id = "1y1jgygK5YUQLVrgRgNw7A8Hf2ppqOJJZ"  # get from URL

# Creates a new spreadsheet
sheet_id = google_sheets.create_spreadsheet(spreadsheet_name, folder_id=folder_id)
# sheet_id =        # get from command line for re-running script

# Use the ID to update the spreadsheet
google_sheets.overwrite_sheet(sheet_id, jan_attendances)
google_sheets.overwrite_sheet(sheet_id, feb_attendances)

# Note: if you want to be fancy, the optional parameter value_input_option = True
# allows you to supply data as formulas, not just raw values

# Overwrite erases any existing data. You can also append to the end of a sheet
google_sheets.append_to_sheet(sheet_id, feb_attendances)
google_sheets.append_to_sheet(sheet_id, mar_attendances)

# You can even format cells!
red = {"red": 1.0, "green": 0.0, "blue": 0.0}
google_sheets.format_cells(sheet_id, "A1",  {"backgroundColor": red}, worksheet=0)

# Let's mark all the people who failed to attend in red.
google_sheets.overwrite_sheet(sheet_id, attendance_records)  # overwrite sheet

for index, row in enumerate(attendance_records):
    adjusted_index = index + 2   # accounts for python zero-indexing and header row
    if row["attended"] == "false":
        cell_range = f"A{adjusted_index}:N{adjusted_index}"
        google_sheets.format_cells(sheet_id, cell_range, {"backgroundColor": red}, worksheet=0)

# NOTE: the Parsons connector is more limited than the Google Sheets API. If you want to use something on the Google Sheets API,
# you can access the client. Let's mark the people who failed to attend in red again, this time using the underlying client
# to do so.

google_sheets.overwrite_sheet(sheet_id, attendance_records)  # overwrite sheet
worksheet = google_sheets.gspread_client.open(spreadsheet_name).sheet1  # get client's worksheet object

for index, row in enumerate(attendance_records):
    adjusted_index = index + 2   # accounts for python zero-indexing and header row
    if row["attended"] == "false":
        cell_range = f"A{adjusted_index}:N{adjusted_index}"
        worksheet.format(cell_range, {"backgroundColor": red})
	#-----------------------------------------------------------
	# Import Necessary Packages
	#-----------------------------------------------------------
	import json
	from parsons import Table, MobilizeAmerica, GoogleSheets
	from datetime import datetime


	#-----------------------------------------------------------
	# Instantiate Classes/Connect to Mobilize and Google Sheets
	#-----------------------------------------------------------
	# Parsons looks for environmental variables MOBILIZE_AMERICA_API_KEY
	# and GOOGLE_DRIVE_CREDENTIALS when you run the following lines
	mobilize = MobilizeAmerica()
	google_sheets = GoogleSheets()


	#-----------------------------------------------------------
	# Extract data from Mobilize America and explore the table
	#-----------------------------------------------------------
	attendance_records = mobilize.get_attendances()

	# Let's take a look ...
	# Parsons Table documentation: https://move-coop.github.io/parsons/html/table
	# You can think of it as similar to a pandas df! But it's faster
	attendance_records

	# Each row can be treated as a dictionary
	for index, attendance in enumerate(attendance_records):
	print(attendance['person'])
	if index == 5:
	break

	# Check the number of rows
	attendance_records.num_rows

	# Check out what columns exist in the table
	attendance_records.columns


	#-----------------------------------------------------------
	# Transform Mobilize data
	#-----------------------------------------------------------
	# Convert date column from unix to EST.
	# Quickly define a function that converts input to a legible date string
	def convert_to_legible_date(unix_date):
	return datetime.utcfromtimestamp(int(unix_date)).strftime('%Y-%m-%d %H:%M:%S')

	# Now convert all columns in timeslot_start_date column using that function
	attendance_records.convert_column('timeslot_start_date', convert_to_legible_date)

	# Each persons contact info is crammed into one column as a JSON string. Ew!
	# Let's fix that
	# First convert column value from json string to python dictionary
	attendance_records.convert_column('person', json.loads)

	# Now use fun ETL function to unpack the dictionary into multiple columns!
	attendance_records.unpack_dict('person', prepend=False)

	# Finally, we're going to break the event attendances up by
	# the month that they happened.
	jan_attendances = attendance_records.select_rows("'2022-01' in {timeslot_start_date}")
	feb_attendances = attendance_records.select_rows("'2022-02' in {timeslot_start_date}")
	mar_attendances = attendance_records.select_rows("'2022-03' in {timeslot_start_date}")

	### Side note for advanced usage!
	# Parsons tables are built from PETL tables, which means you can use any PETL function on a parsons table!
	# https://petl.readthedocs.io/en/stable/
	# .table method converts into a petl table
	petl_table = attendance_records.table
	petl_table
	# Example of a useful petl function to count signups by event -- Aggregate()
	sign_ups_by_event_petl = petl_table.aggregate('event_title', len)
	# Table() function converts back to parsons table
	sign_ups_by_event_parsons = Table(sign_ups_by_event_petl)


	#-----------------------------------------------------------
	# Load data into Google Sheets
	#-----------------------------------------------------------

	spreadsheet_name = "Volunteer Attendance Records"
	folder_id = "1y1jgygK5YUQLVrgRgNw7A8Hf2ppqOJJZ" # get from URL

	# Creates a new spreadsheet
	sheet_id = google_sheets.create_spreadsheet(spreadsheet_name, folder_id=folder_id)
	# sheet_id = # get from command line for re-running script

	# Use the ID to update the spreadsheet
	google_sheets.overwrite_sheet(sheet_id, jan_attendances)
	google_sheets.overwrite_sheet(sheet_id, feb_attendances)

	# Note: if you want to be fancy, the optional parameter value_input_option = True
	# allows you to supply data as formulas, not just raw values

	# Overwrite erases any existing data. You can also append to the end of a sheet
	google_sheets.append_to_sheet(sheet_id, feb_attendances)
	google_sheets.append_to_sheet(sheet_id, mar_attendances)

	# You can even format cells!
	red = {"red": 1.0, "green": 0.0, "blue": 0.0}
	google_sheets.format_cells(sheet_id, "A1", {"backgroundColor": red}, worksheet=0)

	# Let's mark all the people who failed to attend in red.
	google_sheets.overwrite_sheet(sheet_id, attendance_records) # overwrite sheet

	for index, row in enumerate(attendance_records):
	adjusted_index = index + 2 # accounts for python zero-indexing and header row
	if row["attended"] == "false":
	cell_range = f"A{adjusted_index}:N{adjusted_index}"
	google_sheets.format_cells(sheet_id, cell_range, {"backgroundColor": red}, worksheet=0)

	# NOTE: the Parsons connector is more limited than the Google Sheets API. If you want to use something on the Google Sheets API,
	# you can access the client. Let's mark the people who failed to attend in red again, this time using the underlying client
	# to do so.

	google_sheets.overwrite_sheet(sheet_id, attendance_records) # overwrite sheet
	worksheet = google_sheets.gspread_client.open(spreadsheet_name).sheet1 # get client's worksheet object

	for index, row in enumerate(attendance_records):
	adjusted_index = index + 2 # accounts for python zero-indexing and header row
	if row["attended"] == "false":
	cell_range = f"A{adjusted_index}:N{adjusted_index}"
	worksheet.format(cell_range, {"backgroundColor": red})