Peter Laudenslager Plaudenslager

## dateregex.py
# These are for a Pandas Series, but regex portion works for any text

# Match numeric dates like MM/DD/YYYY: 1 or 2 digit month and day, a 2 or 4 digit year, and either / or - separators.
# The year is written as (?:\d{4}|\d{2}) because \d{2,4} also accepts 3 digits; the (?!\d) lookahead
# keeps a longer digit run from matching on just its first 2 or 4 digits.
df.str.findall(r'\d{1,2}[-/]\d{1,2}[-/](?:\d{4}|\d{2})(?!\d)')

# Match dates like 24 Jan 2001 with 1 or 2 digit day, and 2 or 4 digit year, full or abbreviated month
# with a possible period and / or comma after the month.
# The year is written as (?:\d{4}|\d{2}) because \d{2,4} also accepts 3 digits; the (?!\d) lookahead
# keeps a longer digit run from matching on just its first 2 or 4 digits.
df.str.findall(r'\d{1,2} (?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z.,]* (?:\d{4}|\d{2})(?!\d)')

# Same as above, but with possibility for date to be before or after month, like Jan 24, 2001

## gist:7db7f52e87457ff6f9e5877142d81422
time = default 0
fill = time > 1 ? 'blue' : time > 0 ? 'red' : 'white'

tea:
	-> hot-water
	-> leaves
	-> cup
	-> logo-cup
	stage = 3
cup:

## gist:e1075858d71a90ad9edcbd36a21b6059
cost0 = default 0 usd
cost = cost0 + sum(needs.cost)
fill = cost > .3 usd ? 'red' : cost > .1 usd ? 'yellow' : 'white'

tea:
	-> hot-water
	-> leaves
	stage = 3
hot-water:
	-> kettle

## fix_time.py
# Import times with a variety of broken formatting
# g_time_string is a character string
# g_time is the datetime object to produce

# A trailing 'n' is rewritten to 'm' (presumably a mistyped am/pm such as "pn" -> "pm").
# endswith() replaces indexing with [-1] so an empty string no longer raises IndexError.
if g_time_string.endswith('n'):
    g_time_string = g_time_string[:-1] + "m"
# No parsed result yet.
g_time = None

# Various formatting for time, including common mistakes
time_formats = ["%I:%M %p", "%I:%M%p", "%I::%M %p", "%I;%M %p", "%I: %M %p", "%I:%M", "%I;%M%p",

## state_names.py
_state_list = ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware',
               'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois',  'Indiana', 'Iowa', 'Kansas', 'Kentucky',
               'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri',
               'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York',
               'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island',
               'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington',
               'West Virginia', 'Wisconsin', 'Wyoming']

## NAto0.R
# Replace every missing value in clean_data with 0
clean_data <- replace(clean_data, is.na(clean_data), 0)

## CleanCurrencyField.R
# Build Software_Amount_USD as a numeric copy of Software_Amount___USD.
# gsub() removes every comma; sub() removed only the first one, so a value such as
# "1,234,567" became "1234,567" and as.numeric() turned it into NA.
# fixed = TRUE treats "," as a literal character rather than a regular expression.
clean_data <- mutate(
  raw_data,
  Software_Amount_USD = as.numeric(gsub(",", "", Software_Amount___USD, fixed = TRUE))
)

## RemoveExtraHeaders.R
# Look for a duplicate header in the rows: a row counts as a repeated header when its
# first-column value starts with the first 5 characters of the first column's name.
# [[1]] plus as.character() always hands startsWith() a character vector, including
# for tibbles and factor columns.
header_in_rows <- startsWith(as.character(raw_data[[1]]), substr(colnames(raw_data)[1], 1, 5))
# startsWith() gives NA for missing values; treat those rows as "not a header" so that
# filter() below does not silently drop them together with the real duplicates.
header_in_rows[is.na(header_in_rows)] <- FALSE
if (any(header_in_rows)) {
  # Show the rows about to be removed, then drop them
  print(raw_data[header_in_rows, ])
  raw_data <- filter(raw_data, !header_in_rows)
}

## cleanup_names.r
library(dplyr)
library(tidyr)
# sum_trialname holds product names, including three different versions of one product.
# Keeping only the leading alphanumeric run (everything before the first space or other
# non-alphanumeric character) gives one consistent name per product.
# extract() captures that leading run by default. It would also drop the original column
# (sum_trialname) by default, so remove = FALSE is passed to keep it.

# Create a new column with clean, consistent product names
clean_data <- clean_data %>%
  extract(col = sum_trialname, into = "Product", remove = FALSE)

## Group_by_Date.r
library(dplyr)
# Signup.Date holds the original signup date (Date format).
# Signup.Week identifies the week containing Signup.Date by the date that week begins.
# Besides "weeks", cut() accepts other interval names: "day", "week", "month", "quarter" and "year".
# NOTE(review): cut() on a Date yields a factor rather than a Date column; confirm later steps expect that.

# Bucket each row into a week-long interval, stored as a new column
clean_data <- mutate(clean_data, Signup.Week = cut(Signup.Date, breaks = "weeks"))
	# These are for a Pandas Series, but regex portion works for any text

	# Match numeric dates like MM/DD/YYYY: 1 or 2 digit month and day, a 2 or 4 digit year, and either / or - separators.
	# The year is written as (?:\d{4}|\d{2}) because \d{2,4} also accepts 3 digits; the (?!\d) lookahead
	# keeps a longer digit run from matching on just its first 2 or 4 digits.
	df.str.findall(r'\d{1,2}[-/]\d{1,2}[-/](?:\d{4}|\d{2})(?!\d)')

	# Match dates like 24 Jan 2001 with 1 or 2 digit day, and 2 or 4 digit year, full or abbreviated month
	# with a possible period and / or comma after the month.
	# The month alternatives are separated by plain | characters: an escaped \| is a literal bar,
	# which would make the group match only the text "Jan|Feb|...|Dec" and never a month name.
	# The year is written as (?:\d{4}|\d{2}) because \d{2,4} also accepts 3 digits; the (?!\d) lookahead
	# keeps a longer digit run from matching on just its first 2 or 4 digits.
	df.str.findall(r'\d{1,2} (?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z.,]* (?:\d{4}|\d{2})(?!\d)')

	# Same as above, but with possibility for date to be before or after month, like Jan 24, 2001
	time = default 0
	fill = time > 1 ? 'blue' : time > 0 ? 'red' : 'white'

	tea:
	-> hot-water
	-> leaves
	-> cup
	-> logo-cup
	stage = 3
	cup:
	cost0 = default 0 usd
	cost = cost0 + sum(needs.cost)
	fill = cost > .3 usd ? 'red' : cost > .1 usd ? 'yellow' : 'white'

	tea:
	-> hot-water
	-> leaves
	stage = 3
	hot-water:
	-> kettle
	# Import times with a variety of broken formatting
	# g_time_string is a character string
	# g_time is the datetime object to produce

	# A trailing 'n' is rewritten to 'm' (presumably a mistyped am/pm such as "pn" -> "pm").
	# endswith() replaces indexing with [-1] so an empty string no longer raises IndexError.
	# The assignment is indented under the if so it only runs when the condition holds.
	if g_time_string.endswith('n'):
		g_time_string = g_time_string[:-1] + "m"
	# No parsed result yet.
	g_time = None

	# Various formatting for time, including common mistakes
	time_formats = ["%I:%M %p", "%I:%M%p", "%I::%M %p", "%I;%M %p", "%I: %M %p", "%I:%M", "%I;%M%p",
	_state_list = ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware',
	'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky',
	'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri',
	'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York',
	'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island',
	'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington',
	'West Virginia', 'Wisconsin', 'Wyoming']
	# Look for a duplicate header in the rows: a row counts as a repeated header when its
	# first-column value starts with the first 5 characters of the first column's name.
	# [[1]] plus as.character() always hands startsWith() a character vector, including
	# for tibbles and factor columns.
	header_in_rows <- startsWith(as.character(raw_data[[1]]), substr(colnames(raw_data)[1], 1, 5))
	# startsWith() gives NA for missing values; treat those rows as "not a header" so that
	# filter() below does not silently drop them together with the real duplicates.
	header_in_rows[is.na(header_in_rows)] <- FALSE
	if (any(header_in_rows)) {
	  # Show the rows about to be removed, then drop them
	  print(raw_data[header_in_rows, ])
	  raw_data <- filter(raw_data, !header_in_rows)
	}
	library(dplyr)
	library(tidyr)
	# sum_trialname holds product names, including three different versions of one product.
	# Keeping only the leading alphanumeric run (everything before the first space or other
	# non-alphanumeric character) gives one consistent name per product.
	# extract() captures that leading run by default. It would also drop the original column
	# (sum_trialname) by default, so remove = FALSE is passed to keep it.

	# Create a new column with clean, consistent product names
	clean_data <- clean_data %>%
	  extract(col = sum_trialname, into = "Product", remove = FALSE)
	library(dplyr)
	# Signup.Date holds the original signup date (Date format).
	# Signup.Week identifies the week containing Signup.Date by the date that week begins.
	# Besides "weeks", cut() accepts other interval names: "day", "week", "month", "quarter" and "year".
	# NOTE(review): cut() on a Date yields a factor rather than a Date column; confirm later steps expect that.

	# Bucket each row into a week-long interval, stored as a new column
	clean_data <- mutate(clean_data, Signup.Week = cut(Signup.Date, breaks = "weeks"))