Code lib created to house all the functions needed to process member list data for this use case. A short usage sketch follows the code.
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 18 11:56:27 2018
@author: jeffrey.garcia
"""
import pandas as pd
import numpy as np
import csv
import glob as gb


def ml_validation(renamed_columns_data):
    # Reorder columns to the expected order for import
    reordered_columns_data = renamed_columns_data[
        ["Rec. Type", "Member ID", "Loc. Name", "Loc. Address", "Loc. Address 2", "Loc. City", "Loc. State",
         "Loc. Zip", "Operator ID", "Distributor", "DC", "PLG", "Dist Acct Num.", "Sector", "Org. Level 2",
         "Org. Level 3", "Org. Level 4", "Segment", "Food Service ?", "GLN", "Loc. Country", "Member Status",
         "Start Date", "End Date", "Contract Num.", "Contract Name", "Contact Name", "Contact Title",
         "Contact Phone", "Contact Fax", "Contact Email"]]
    # First test output
    # reordered_columns_data.to_csv('testoutput.csv', index=False)
    # Remove all duplicate rows except for the first occurrence
    deduped_data = reordered_columns_data.drop_duplicates()
    # Dedupe test output
    # deduped_data.to_csv('dedupetestoutput.csv', index=False)
    # Drop rows with an empty 'Loc. Name' field
    no_empty_locations_data = deduped_data.replace({'Loc. Name': {'': np.nan}}).dropna(subset=['Loc. Name'])
    # Empty 'Loc. Name' drop test
    # no_empty_locations_data.to_csv('blanklocname.csv', index=False)
    # Strip carriage returns and newlines from fields where embedded line breaks
    # have caused problems during ingestion
    no_empty_locations_data['Loc. Address'] = no_empty_locations_data['Loc. Address'].str.replace(r'[\r\n]', ' ', regex=True)
    no_empty_locations_data['Loc. Address 2'] = no_empty_locations_data['Loc. Address 2'].str.replace(r'[\r\n]', ' ', regex=True)
    no_empty_locations_data['Loc. Zip'] = no_empty_locations_data['Loc. Zip'].str.replace(r'[\r\n]', ' ', regex=True)
    # Proper state abbreviations for validation later
    states = ["AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DC", "DE", "FL", "GA",
              "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD",
              "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ",
              "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC",
              "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"]
    # Rows with incorrect state abbreviations are set aside in a separate CSV for manual review
    incorrect_state_abbr_rows = no_empty_locations_data[
        ~no_empty_locations_data["Loc. State"].str.upper().isin(states)]
    if len(incorrect_state_abbr_rows) > 0:
        incorrect_state_abbr_rows.to_csv('incorrect_state_abbreviations.csv', index=False)
    # Keep only rows with valid state abbreviations
    state_abbr_filtered_data = no_empty_locations_data[
        no_empty_locations_data["Loc. State"].str.upper().isin(states)].copy()
    # Add a leading zero to four-digit zip codes for states in (CT, MA, ME, NH, NJ, RI, VT)
    state_abbr_filtered_data['Loc. Zip'] = state_abbr_filtered_data['Loc. Zip'].str.strip()
    state_abbr_filtered_data.loc[
        (state_abbr_filtered_data['Loc. Zip'].str.len() == 4) &
        (state_abbr_filtered_data['Loc. State'].str.upper().isin(['CT', 'MA', 'ME', 'NH', 'NJ', 'RI', 'VT'])),
        'Loc. Zip'] = '0' + state_abbr_filtered_data['Loc. Zip'].astype(str)
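    # Example of the padding above: a Connecticut row whose zip was read as
    # "6103" (leading zero lost in Excel) becomes "06103"; five-digit and
    # ZIP+4 values, and rows outside the listed states, are left untouched.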
    # Rows whose 'Loc. Zip' is shorter than 5 or longer than 10 characters are
    # exported to CSV for manual review
    bad_zip_lengths = state_abbr_filtered_data.loc[
        ~state_abbr_filtered_data["Loc. Zip"].str.len().between(5, 10), :]
    if len(bad_zip_lengths) > 0:
        bad_zip_lengths.to_csv('incorrect_zip_code_lengths.csv', index=False)
    # Keep only rows whose 'Loc. Zip' is 5 to 10 characters long
    zip_length_filtered_data = state_abbr_filtered_data.loc[
        state_abbr_filtered_data["Loc. Zip"].str.len().between(5, 10), :]
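    # Example: "1234" is set aside for review, while "06103" and
    # "06103-1234" (10 characters) both pass through.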
    # Rows whose 'Loc. Address' or 'Loc. Address 2' is longer than 50 characters
    # are exported to CSV for manual review
    zip_length_filtered_data = zip_length_filtered_data.copy()
    zip_length_filtered_data['Loc. Address'] = zip_length_filtered_data['Loc. Address'].str.strip()
    zip_length_filtered_data['Loc. Address 2'] = zip_length_filtered_data['Loc. Address 2'].str.strip()
    long_address_data = zip_length_filtered_data.loc[
        (zip_length_filtered_data["Loc. Address"].str.len() > 50) |
        (zip_length_filtered_data["Loc. Address 2"].str.len() > 50), :]
    if len(long_address_data) > 0:
        long_address_data.to_csv('long_address_lengths.csv', index=False)
    # Keep only rows whose address fields are 50 characters or shorter
    long_address_filtered_data = zip_length_filtered_data.loc[
        ~((zip_length_filtered_data["Loc. Address"].str.len() > 50) |
          (zip_length_filtered_data["Loc. Address 2"].str.len() > 50)), :]
    return long_address_filtered_data


def ml_validation_sector(renamed_columns_data):
    # Reorder columns to the expected order for import
    reordered_columns_data = renamed_columns_data[
        ["Rec. Type", "Member ID", "Loc. Name", "Loc. Address", "Loc. Address 2", "Loc. City", "Loc. State",
         "Loc. Zip", "Operator ID", "Distributor", "DC", "PLG", "Dist Acct Num.", "Sector", "Org. Level 2",
         "Org. Level 3", "Org. Level 4", "Segment", "Food Service ?", "GLN", "Loc. Country", "Member Status",
         "Start Date", "End Date", "Contract Num.", "Contract Name", "Contact Name", "Contact Title",
         "Contact Phone", "Contact Fax", "Contact Email"]]
    # First test output
    # reordered_columns_data.to_csv('testoutput.csv', index=False)
    # Remove all duplicate rows except for the first occurrence
    deduped_data = reordered_columns_data.drop_duplicates()
    # Dedupe test output
    # deduped_data.to_csv('dedupetestoutput.csv', index=False)
    # Drop rows with an empty 'Loc. Name' field
    no_empty_locations_data = deduped_data.replace({'Loc. Name': {'': np.nan}}).dropna(subset=['Loc. Name'])
    # Empty 'Loc. Name' drop test
    # no_empty_locations_data.to_csv('blanklocname.csv', index=False)
    # Strip carriage returns and newlines from the dataset
    no_empty_locations_data['Loc. Address'] = no_empty_locations_data['Loc. Address'].str.replace(r'[\r\n]', ' ', regex=True)
    no_empty_locations_data['Loc. Address 2'] = no_empty_locations_data['Loc. Address 2'].str.replace(r'[\r\n]', ' ', regex=True)
    no_empty_locations_data['Loc. Zip'] = no_empty_locations_data['Loc. Zip'].str.replace(r'[\r\n]', ' ', regex=True)
    # Excluded-sector list for validation later; entries are fully uppercase
    # because they are matched against an uppercased 'Sector' column
    sector_filter_list = ["CLINIC", "EMERGENCY SERVICES", "HME/DME", "HOME HEALTH CARE",
                          "INTEGRATED DELIVERY NETWORK", "LABORATORY",
                          "NON-OWN USE (RETAIL/PHARMACY)", "PHYSICIAN PRACTICE",
                          "SPECIALTY PHARMACY", "VETERINARY"]
    # Proper state abbreviations for validation later
    states = ["AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DC", "DE", "FL", "GA",
              "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD",
              "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ",
              "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC",
              "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"]
    # Filter out rows whose sector is on the excluded list
    sector_filtered_data = no_empty_locations_data[
        ~no_empty_locations_data["Sector"].str.upper().isin(sector_filter_list)]
    # Rows with incorrect state abbreviations are set aside in a separate CSV for manual review
    incorrect_state_abbr_rows = sector_filtered_data[
        ~sector_filtered_data["Loc. State"].str.upper().isin(states)]
    if len(incorrect_state_abbr_rows) > 0:
        incorrect_state_abbr_rows.to_csv('incorrect_state_abbreviations.csv', index=False)
    # Keep only rows with valid state abbreviations
    state_abbr_filtered_data = sector_filtered_data[
        sector_filtered_data["Loc. State"].str.upper().isin(states)].copy()
    # Add a leading zero to four-digit zip codes for states in (CT, MA, ME, NH, NJ, RI, VT)
    state_abbr_filtered_data['Loc. Zip'] = state_abbr_filtered_data['Loc. Zip'].str.strip()
    state_abbr_filtered_data.loc[
        (state_abbr_filtered_data['Loc. Zip'].str.len() == 4) &
        (state_abbr_filtered_data['Loc. State'].str.upper().isin(['CT', 'MA', 'ME', 'NH', 'NJ', 'RI', 'VT'])),
        'Loc. Zip'] = '0' + state_abbr_filtered_data['Loc. Zip'].astype(str)
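    # (See the worked padding example in ml_validation above; the same rule applies here.)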
    # Rows whose 'Loc. Zip' is shorter than 5 or longer than 10 characters are
    # exported to CSV for manual review
    bad_zip_lengths = state_abbr_filtered_data.loc[
        ~state_abbr_filtered_data["Loc. Zip"].str.len().between(5, 10), :]
    if len(bad_zip_lengths) > 0:
        bad_zip_lengths.to_csv('incorrect_zip_code_lengths.csv', index=False)
    # Keep only rows whose 'Loc. Zip' is 5 to 10 characters long
    zip_length_filtered_data = state_abbr_filtered_data.loc[
        state_abbr_filtered_data["Loc. Zip"].str.len().between(5, 10), :]
    # Rows whose 'Loc. Address' or 'Loc. Address 2' is longer than 50 characters
    # are exported to CSV for manual review
    zip_length_filtered_data = zip_length_filtered_data.copy()
    zip_length_filtered_data['Loc. Address'] = zip_length_filtered_data['Loc. Address'].str.strip()
    zip_length_filtered_data['Loc. Address 2'] = zip_length_filtered_data['Loc. Address 2'].str.strip()
    long_address_data = zip_length_filtered_data.loc[
        (zip_length_filtered_data["Loc. Address"].str.len() > 50) |
        (zip_length_filtered_data["Loc. Address 2"].str.len() > 50), :]
    if len(long_address_data) > 0:
        long_address_data.to_csv('long_address_lengths.csv', index=False)
    # Keep only rows whose address fields are 50 characters or shorter
    long_address_filtered_data = zip_length_filtered_data.loc[
        ~((zip_length_filtered_data["Loc. Address"].str.len() > 50) |
          (zip_length_filtered_data["Loc. Address 2"].str.len() > 50)), :]
    return long_address_filtered_data


def lw_hilton():
    try:
        # Search for any files that follow the naming convention established below
        files = gb.glob("hiltonproperties*.xlsx")
        # Create an empty dataframe
        raw_data = pd.DataFrame()
        # Read every matching file and append it, building one dataframe from one or more files;
        # ignore_index avoids duplicate index labels when several files are combined
        for file in files:
            stage_data = pd.read_excel(file)
            raw_data = pd.concat([raw_data, stage_data], ignore_index=True)
        # Add missing columns necessary for the memberlist import
        raw_data["Rec. Type"] = "600"
        raw_data["Distributor"] = "Hilton"
        raw_data["DC"] = "Hilton"
        raw_data["PLG"] = "Hilton Member List"
        raw_data["Operator ID"] = ""
        raw_data["Food Service ?"] = ""
        raw_data["Member Status"] = ""
        raw_data["Start Date"] = ""
        raw_data["End Date"] = ""
        raw_data["Contract Num."] = ""
        raw_data["Contract Name"] = ""
        raw_data["Contact Title"] = ""
        raw_data["Dist Acct Num."] = ""
        raw_data["Org. Level 4"] = ""
        raw_data["GLN"] = ""
        # Delete unnecessary columns
        del raw_data["Facilityid"]
        del raw_data["Addressline3"]
        del raw_data["Foreigncity2"]
        del raw_data["Province2"]
        del raw_data["Rooms"]
        del raw_data["Ownercompany Name"]
        del raw_data["ManagementCat"]
        del raw_data["Mgmtcompany Name"]
        del raw_data["Rponame"]
        # Rename remaining columns to the expected values for import
        renamed_columns_data = raw_data.rename(
            columns={"Inncode": "Member ID", "Full Hotel Name": "Loc. Name",
                     "Addressline1": "Loc. Address", "Addressline2": "Loc. Address 2",
                     "City2": "Loc. City", "State": "Loc. State", "ZipCode": "Loc. Zip",
                     "Managementcategory": "Sector", "Subbrand": "Org. Level 2",
                     "Brandname": "Org. Level 3", "Facilitytypedescription": "Segment",
                     "Country": "Loc. Country", "Gm Name": "Contact Name",
                     "PhoneNumber": "Contact Phone", "FaxNumber": "Contact Fax",
                     "Propertyemail": "Contact Email"})
        return renamed_columns_data
    except Exception as e:
        # Log the exception to a CSV error log for later review
        errorinfo = [repr(e)]
        error_log_filename = 'LWHiltonPreErrorLog.csv'
        with open(error_log_filename, 'a', newline='') as errorlog:
            mywriter = csv.writer(errorlog)
            mywriter.writerow(errorinfo)


def lw_inta():
    try:
        # Search for any files that follow the naming convention established below
        files = gb.glob("intalereproperties*.xlsx")
        # Create an empty dataframe
        raw_data = pd.DataFrame()
        # Read every matching file and append it, building one dataframe from one or more files;
        # ignore_index avoids duplicate index labels when several files are combined
        for file in files:
            stage_data = pd.read_excel(file, header=1)
            raw_data = pd.concat([raw_data, stage_data], ignore_index=True)
        # Add missing columns necessary for the memberlist import
        raw_data["Rec. Type"] = "600"
        raw_data["Distributor"] = "Amerinet"
        raw_data["DC"] = "Amerinet"
        raw_data["PLG"] = "Amerinet MemberList"
        raw_data["Operator ID"] = ""
        raw_data["Food Service ?"] = ""
        raw_data["Loc. Country"] = ""
        raw_data["Member Status"] = ""
        raw_data["Start Date"] = ""
        raw_data["End Date"] = ""
        raw_data["Contract Num."] = ""
        raw_data["Contract Name"] = ""
        raw_data["Contact Name"] = ""
        raw_data["Contact Title"] = ""
        raw_data["Contact Phone"] = ""
        raw_data["Contact Fax"] = ""
        raw_data["Contact Email"] = ""
        raw_data["Dist Acct Num."] = ""
        raw_data["Org. Level 4"] = ""
        # Delete unnecessary columns
        del raw_data["Shareholder Name"]
        del raw_data["Membership blank=No Update A=Add C=Change D=Delete"]
        del raw_data["Update Effective Date"]
        del raw_data["Previous Facility Name"]
        del raw_data["Facility Name Extension"]
        del raw_data["Area Code"]
        del raw_data["Phone #"]
        del raw_data["Profit Status Y/N"]
        del raw_data["Member Since Date"]
        del raw_data["DEA #"]
        del raw_data["HIN"]
        del raw_data["Mail Code"]
        del raw_data["# of Beds"]
        del raw_data["IDN"]
        del raw_data["Sole Source Mbr Y/N"]
        del raw_data["Pharmacy Member Y/N"]
        del raw_data["Pharmacy Start Date"]
        del raw_data["Retail Pharmacy Member Y/N"]
        del raw_data["Retail Pharmacy Start Date"]
        del raw_data["Pharmacy Associate"]
        del raw_data["Pharmacy Associate Start Date"]
        del raw_data["Parent Number"]
        del raw_data["Family Parent Facility Number"]
        del raw_data["Alliance"]
        # Rename remaining columns to the expected values for import
        renamed_columns_data = raw_data.rename(
            columns={"Facility Number": "Member ID", "Facility Name": "Loc. Name",
                     "Address 1": "Loc. Address", "Address 2": "Loc. Address 2",
                     "City": "Loc. City", "State": "Loc. State", "Zip Code": "Loc. Zip",
                     "Facility Type": "Sector", "Parent Facility": "Org. Level 2",
                     "Family Parent Facility Name": "Org. Level 3",
                     "Sub-Class": "Segment", "GLN Number": "GLN"})
        return renamed_columns_data
    except Exception as e:
        # Log the exception to a CSV error log for later review
        errorinfo = [repr(e)]
        error_log_filename = 'LWIntalerePreErrorLog.csv'
        with open(error_log_filename, 'a', newline='') as errorlog:
            mywriter = csv.writer(errorlog)
            mywriter.writerow(errorinfo)


def rp_fb():
    try:
        # Search for any files that follow the naming convention established below
        files = gb.glob("Subscribed Units List Rich*.csv")
        # Create an empty dataframe
        raw_data = pd.DataFrame()
        # Read every matching file and append it, building one dataframe from one or more files;
        # ignore_index avoids duplicate index labels when several files are combined
        for file in files:
            stage_data = pd.read_csv(file)
            raw_data = pd.concat([raw_data, stage_data], ignore_index=True)
        # Add missing columns necessary for the memberlist import
        raw_data["Rec. Type"] = "600"
        raw_data["Distributor"] = "Foodbuy"
        raw_data["DC"] = "Foodbuy"
        raw_data["PLG"] = "Foodbuy MemberList"
        raw_data["Operator ID"] = ""
        raw_data["Food Service ?"] = ""
        raw_data["Loc. Country"] = ""
        raw_data["Member Status"] = ""
        raw_data["End Date"] = ""
        raw_data["Contract Num."] = ""
        raw_data["Contract Name"] = ""
        raw_data["Contact Name"] = ""
        raw_data["Contact Title"] = ""
        raw_data["Contact Phone"] = ""
        raw_data["Contact Fax"] = ""
        raw_data["Contact Email"] = ""
        raw_data["Dist Acct Num."] = ""
        raw_data["GLN"] = ""
        # Delete unnecessary columns
        del raw_data["owner_id"]
        del raw_data["PERIOD"]
        del raw_data["ORG_GROUP_ID"]
        del raw_data["ORG_LEVEL2_ID"]
        del raw_data["ORG_LEVEL3_ID"]
        del raw_data["ORG_LEVEL4_ID"]
        del raw_data["UNIT_STATUS"]
        del raw_data["Primary_Distributor"]
        del raw_data["Primary_Dist_Org_Name"]
        del raw_data["DISTRIBUTOR_CUSTOMER_ID"]
        del raw_data["DISTRIBUTOR_CUSTOMER_NAME"]
        del raw_data["LAST_TRANSACTION_DATE"]
        # Rename remaining columns to the expected values for import
        renamed_columns_data = raw_data.rename(
            columns={"CUSTOMER_ID": "Member ID", "CUSTOMER_NAME": "Loc. Name",
                     "MAIN_ADDRESS_LINE_1": "Loc. Address", "MAIN_ADDRESS_LINE_2": "Loc. Address 2",
                     "CITY": "Loc. City", "STATE": "Loc. State", "ZIP": "Loc. Zip",
                     "ORG_GROUP_NAME": "Sector", "ORG_LEVEL2_NAME": "Org. Level 2",
                     "ORG_LEVEL3_NAME": "Org. Level 3", "ORG_LEVEL4_NAME": "Org. Level 4",
                     "CHANNEL": "Segment", "CREATE_DATE": "Start Date"})
        return renamed_columns_data
    except Exception as e:
        # Log the exception to a CSV error log for later review
        errorinfo = [repr(e)]
        error_log_filename = 'RPFoodbuyPreErrorLog.csv'
        with open(error_log_filename, 'a', newline='') as errorlog:
            mywriter = csv.writer(errorlog)
            mywriter.writerow(errorinfo)


def con_fb():
    try:
        # Search for any files that follow the naming convention established below
        files = gb.glob("Subscribed Units List ConAgra*.csv")
        # Create an empty dataframe
        raw_data = pd.DataFrame()
        # Read every matching file and append it, building one dataframe from one or more files;
        # ignore_index avoids duplicate index labels when several files are combined
        for file in files:
            stage_data = pd.read_csv(file)
            raw_data = pd.concat([raw_data, stage_data], ignore_index=True)
        # Add missing columns necessary for the memberlist import
        raw_data["Rec. Type"] = "600"
        raw_data["Distributor"] = "Foodbuy"
        raw_data["DC"] = "Foodbuy"
        raw_data["PLG"] = "Foodbuy MemberList"
        raw_data["Operator ID"] = ""
        raw_data["Food Service ?"] = ""
        raw_data["Loc. Country"] = ""
        raw_data["Member Status"] = ""
        raw_data["End Date"] = ""
        raw_data["Contract Num."] = ""
        raw_data["Contract Name"] = ""
        raw_data["Contact Name"] = ""
        raw_data["Contact Title"] = ""
        raw_data["Contact Phone"] = ""
        raw_data["Contact Fax"] = ""
        raw_data["Contact Email"] = ""
        raw_data["Dist Acct Num."] = ""
        raw_data["GLN"] = ""
        # Delete unnecessary columns
        del raw_data["owner_id"]
        del raw_data["PERIOD"]
        del raw_data["ORG_GROUP_ID"]
        del raw_data["ORG_LEVEL2_ID"]
        del raw_data["ORG_LEVEL3_ID"]
        del raw_data["ORG_LEVEL4_ID"]
        del raw_data["UNIT_STATUS"]
        del raw_data["Primary_Distributor"]
        del raw_data["Primary_Dist_Org_Name"]
        del raw_data["DISTRIBUTOR_CUSTOMER_ID"]
        del raw_data["DISTRIBUTOR_CUSTOMER_NAME"]
        del raw_data["LAST_TRANSACTION_DATE"]
        # Rename remaining columns to the expected values for import
        renamed_columns_data = raw_data.rename(
            columns={"CUSTOMER_ID": "Member ID", "CUSTOMER_NAME": "Loc. Name",
                     "MAIN_ADDRESS_LINE_1": "Loc. Address", "MAIN_ADDRESS_LINE_2": "Loc. Address 2",
                     "CITY": "Loc. City", "STATE": "Loc. State", "ZIP": "Loc. Zip",
                     "ORG_GROUP_NAME": "Sector", "ORG_LEVEL2_NAME": "Org. Level 2",
                     "ORG_LEVEL3_NAME": "Org. Level 3", "ORG_LEVEL4_NAME": "Org. Level 4",
                     "CHANNEL": "Segment", "CREATE_DATE": "Start Date"})
        return renamed_columns_data
    except Exception as e:
        # Log the exception to a CSV error log for later review
        errorinfo = [repr(e)]
        error_log_filename = 'ConFoodbuyPreErrorLog.csv'
        with open(error_log_filename, 'a', newline='') as errorlog:
            mywriter = csv.writer(errorlog)
            mywriter.writerow(errorinfo)


def kel_com():
    try:
        # Search for any files that follow the naming convention established below
        files = gb.glob("Compass-Foodbuy*Kellogg.xlsx")
        # Create an empty dataframe
        raw_data = pd.DataFrame()
        # Read every matching file and append it, building one dataframe from one or more files;
        # ignore_index avoids duplicate index labels when several files are combined
        for file in files:
            stage_data = pd.read_excel(file, header=3)
            raw_data = pd.concat([raw_data, stage_data], ignore_index=True)
        # Add missing columns necessary for the memberlist import
        raw_data["Rec. Type"] = "600"
        raw_data["Distributor"] = "Foodbuy"
        raw_data["DC"] = "Foodbuy"
        raw_data["PLG"] = "Compass MemberList"
        raw_data["Food Service ?"] = ""
        raw_data["Start Date"] = ""
        raw_data["End Date"] = ""
        raw_data["Contract Num."] = ""
        raw_data["Contract Name"] = ""
        raw_data["Contact Name"] = ""
        raw_data["Contact Title"] = ""
        raw_data["Contact Phone"] = ""
        raw_data["Contact Fax"] = ""
        raw_data["Contact Email"] = ""
        raw_data["Dist Acct Num."] = ""
        raw_data["GLN"] = ""
        raw_data["Loc. Address 2"] = ""
        raw_data["Sector"] = ""
        raw_data["Member Status"] = ""
        # Delete unnecessary columns
        del raw_data["Company"]
        del raw_data["Oper Org Level 2 ID"]
        del raw_data["Oper Org Level 3 ID"]
        del raw_data["Oper Org Level 4 ID"]
        del raw_data["Oper Org Level 4 Name"]
        del raw_data["Oper Org Level 5 ID"]
        del raw_data["Oper Org Level 5 Name"]
        # Rename remaining columns to the expected values for import
        renamed_columns_data = raw_data.rename(
            columns={"Oper Org Group ID": "Member ID", "Operator Name": "Loc. Name",
                     "Main Contact Address": "Loc. Address", "Main Contact City": "Loc. City",
                     "Main Contact State": "Loc. State", "Main Contact Zip": "Loc. Zip",
                     "Oper Org Group Name": "Org. Level 2", "Oper Org Level 2 Name": "Org. Level 3",
                     "Oper Org Level 3 Name": "Org. Level 4", "CHANNEL": "Segment",
                     "MAIN COUNTRY NAME": "Loc. Country"})
        return renamed_columns_data
    except Exception as e:
        # Log the exception to a CSV error log for later review
        errorinfo = [repr(e)]
        error_log_filename = 'KelCompassPreErrorLog.csv'
        with open(error_log_filename, 'a', newline='') as errorlog:
            mywriter = csv.writer(errorlog)
            mywriter.writerow(errorinfo)
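
A minimal usage sketch, assuming the script is run from the directory containing the source spreadsheets. The __main__ guard and the output filename are illustrative additions, not part of the original library:

if __name__ == '__main__':
    # Build and rename the raw Hilton dataframe, then validate it for import
    renamed = lw_hilton()
    if renamed is not None:  # loaders return None after logging an error
        validated = ml_validation(renamed)
        # 'hilton_memberlist_import.csv' is a hypothetical output name
        validated.to_csv('hilton_memberlist_import.csv', index=False)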