"""Script for Day 3 of Intro to Parsons Training"""
from parsons import MobilizeAmerica, ActionNetwork, GoogleSheets, Table
#########################
### Getting your data ###
#########################
"""Typically you'd load data directly from your third party platform:"""
# mobilize = MobilizeAmerica()
# mobilize_data = mobilize.get_people()
# action_network = ActionNetwork()
# actionnetwork_data = action_network.get_people()
"""However, to make things easier (and to sneakily put in some data problems to fix), we've created a
CSV for everyone to use. These are already preformatted to flatted out some of the nested fields with
functions like get_primary_contact_or_first() that we showed you last session."""
mobilize_data = Table.from_csv("mobilize_data.csv")
an_data = Table.from_csv("actionnetwork_data.csv")
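"""For reference, the flattening helper from last session might look roughly like this (a sketch of
the general idea; the real helper's signature and field names may differ):"""
# def get_primary_contact_or_first(contacts):
#     """Given a list of contact dicts, return the one flagged primary, else the first one."""
#     if not contacts:
#         return None
#     for contact in contacts:
#         if contact.get("primary"):
#             return contact
#     return contacts[0]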
####################################
### Transforming + Cleaning data ###
####################################
"""Cleaning phones"""
import re
def clean_phones(old_number):
    """Given a phone number, standardizes to a 10 digit format with no spaces or extra characters.
    Assumes a North American phone number and thus strips the country code. For international numbers,
    consider the python-phonenumbers package: https://github.com/daviddrysdale/python-phonenumbers"""
    if not old_number:
        return None  # some rows have no phone number at all
    numbers = re.compile(r'\d+').findall(old_number)  # extract the digit groups from "old_number" as a list
    number = "".join(numbers)  # join the digit groups back into a single string
    if len(number) == 11 and number[0] == "1":  # compare against the string "1", since number is a string
        number = number[1:]  # strip the leading country code
    return number if len(number) == 10 else None
mobilize_data.convert_column('phone_numbers', clean_phones) # apply cleaning function
an_data.convert_column('phone_numbers', clean_phones) # apply cleaning function
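"""A few spot checks of the expected behavior (illustrative inputs):"""
print(clean_phones("(555) 123-4567"))   # -> "5551234567"
print(clean_phones("+1 555.123.4567"))  # -> "5551234567" (country code stripped)
print(clean_phones("12345"))            # -> None (not a 10 digit number after cleanup)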
"""Cleaning timestamps"""
from datetime import datetime
def convert_to_legible_date(unix_date):
    """Convert a Unix timestamp to a human-readable UTC datetime string."""
    return datetime.utcfromtimestamp(int(unix_date)).strftime('%Y-%m-%d %H:%M:%S')
mobilize_data.convert_column("created_date", convert_to_legible_date)
an_data.convert_column("created_date", convert_to_legible_date)
##################################
### De-duplicating and merging ###
##################################
"""We're now going to merge our two sets of contacts together. Some people are in both datasets, some
are in only one."""
"""The first thing we'll do is rename the ID columns in the two datasets so that we can have separate
ID columns for Action Network and Mobilize."""
mobilize_data.rename_column("id", "mobilize_id")
an_data.rename_column("id", "actionnetwork_id")
"""Next, we'll combine the two datasets into a single Parsons table using the `concat` helper method."""
mobilize_data.concat(an_data)
combined_data = mobilize_data # rename dataset
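"""To see how concat handles mismatched columns, here's a toy example (illustrative data; columns
that exist in only one table get padded with None):"""
# t1 = Table([{"mobilize_id": 1, "email_addresses": "a@example.com"}])
# t2 = Table([{"actionnetwork_id": 9, "email_addresses": "b@example.com"}])
# t1.concat(t2)
# print(t1.columns)  # ['mobilize_id', 'email_addresses', 'actionnetwork_id']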
"""The petl library, which Parsons is a wrapper around, has a function to merge duplicates. Here, we use
the keys for postal_addresses (zip code) and email addresses"""
merged_data = combined_data.table.mergeduplicates(key=("postal_addresses", "email_addresses"))
merged_data = Table(merged_data) # turn back into Parsons Table (from petl object)
"""Conflicting data is stored by petl in the row under a Conflict() function. Probably the easiest way
to inspect our data is by outputting it to CSV and opening it in an editor. When we're done, we'll come
back to the script to resolve problems programatically."""
merged_data.to_csv("debug_merged_data.csv") # note we have to turn the petl table back into a Parsons Table
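"""To build intuition for what a Conflict looks like, here's a toy petl example (illustrative data):"""
import petl
example = petl.wrap([
    ["email_addresses", "phone_numbers"],
    ["a@example.com", "5551234567"],
    ["a@example.com", "5559876543"],
])
print(petl.mergeduplicates(example, key="email_addresses"))
# phone_numbers for a@example.com shows as Conflict({'5551234567', '5559876543'})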
"""We can also get the subset of the data that has conflicts with the conflicts method:"""
conflict_table = combined_data.table.conflicts(key=("postal_addresses", "email_addresses"))
conflict_table = Table(conflict_table)
"""There are many ways to resolve conflicts—what you chose will depend on your context. Here, we chose the
option of going with the data associated with a more recent timestamp (modified_data). So we'll use the
conflicts table to figure out which source value comes from an account that has been more recently updated."""
resolutions = {}
for row in conflict_table:
    key = (row["email_addresses"], row["postal_addresses"])  # use merge keys to create a unique key
    if key not in resolutions:
        resolutions[key] = {"modified_date": None, "data": {}}
    if not resolutions[key]["modified_date"] or row["modified_date"] > resolutions[key]["modified_date"]:
        # if this is the first row found, or its timestamp is more recent, copy all data to the data field
        resolutions[key]["data"] = row
        resolutions[key]["modified_date"] = row["modified_date"]
print(resolutions)
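"""For reference, resolutions now maps each merge key to the most recently modified row, roughly
like this (with illustrative values):
    {("a@example.com", "02139"): {"modified_date": "2022-07-01 12:00:00", "data": <row>}}"""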
"""We now have a dictionary that has the most recent data for the individual rows with conflicts. We can
use the keys to find and overwrite the data in the merged data table.
Parsons Tables are optimized for big bulk transformations, so we're going to turn our Parsons Table into
a regular old Python object while we fiddle with the cell values. Then we'll turn it back into a Parsons
Table when we're ready to upload.
"""
combined_data = combined_data.to_dicts()
for index, row in enumerate(combined_data):
    key = (row["email_addresses"], row["postal_addresses"])
    if key in resolutions:  # if key matches a row with conflicting data
        """Note that the following code only overwrites columns that are not null, so for instance, it
        won't overwrite the mobilize_id for a row if it's being overwritten with the same person's
        Action Network data, because the Action Network row won't have a value for the mobilize_id.
        Alternatively, if you know the conflicts will only be in specific columns, you can specify
        those here."""
        for row_key, row_data in resolutions[key]["data"].items():
            if row_data:
                combined_data[index][row_key] = row_data
combined_data = Table(combined_data) # transform it back
"""Now, if we try to find conflicts in our combined data, there aren't any:"""
conflict_table = combined_data.table.conflicts(key=("postal_addresses", "email_addresses"))
print(conflict_table)
"""And if we run merge duplicates again, it merges without any Conflicts() created:"""
merged_data = combined_data.table.mergeduplicates(key=("postal_addresses", "email_addresses"))
merged_data = Table(merged_data) # turn back into Parsons Table (from petl object)
merged_data.to_csv("debug_merged_data.csv")  # write out the now conflict-free merge for inspection
#####################################
### Pushing Data to Google Sheets ###
#####################################
google_sheets = GoogleSheets()
sheet_id = google_sheets.create_spreadsheet('Combined Data', editor_email="example@example.com")
# sheet_id = ""  # alternatively, paste the ID of an existing spreadsheet here
google_sheets.overwrite_sheet(sheet_id, combined_data)
"""This script has gotten data from Mobilize and Action Network, merged them, and uploaded them to
Google Sheets. It works fine as a one-time script, but what if we wanted to run it every day, or every
hour, and only handle new additions to Mobilize and/or Action Network?
TODO: add discussion of idempotency, how you might approach it
(Not going to show example in Google Sheets because you can't really do indexes or lookups in Google Sheets
the way you could in a database, making these kinds of operations hard - you're better off overwriting
the sheet each time.)
"""