This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# These are for a Pandas Series, but the regex portion works for any text.
# Match dates like MM/DD/YYYY with a 1 or 2 digit month and day, a 2 or 4 digit
# year, and either / or - separators. The year is spelled as "4 digits or 2
# digits, not followed by another digit" so that 3-digit (or longer) numbers
# are not accepted as years.
df.str.findall(r'\d{1,2}[-/]\d{1,2}[-/](?:\d{4}|\d{2})(?!\d)')
# Match dates like 24 Jan 2001 with a 1 or 2 digit day, a 2 or 4 digit year,
# and a full or abbreviated month, with a possible period and/or comma after
# the month.
df.str.findall(r'\d{1,2} (?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z.,]* (?:\d{4}|\d{2})(?!\d)')
# Same as above, but with possibility for date to be before or after month, like Jan 24, 2001 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
time = default 0 | |
fill = time > 1 ? 'blue' : time > 0 ? 'red' : 'white' | |
tea: | |
-> hot-water | |
-> leaves | |
-> cup | |
-> logo-cup | |
stage = 3 | |
cup: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
cost0 = default 0 usd | |
cost = cost0 + sum(needs.cost) | |
fill = cost > .3 usd ? 'red' : cost > .1 usd ? 'yellow' : 'white' | |
tea: | |
-> hot-water | |
-> leaves | |
stage = 3 | |
hot-water: | |
-> kettle |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Import times with a variety of broken formatting | |
# g_time_string is a character string | |
# g_time is the datetime object to produce | |
# Repair a trailing "n" by replacing it with "m" (presumably an am/pm typo
# such as "pn" for "pm" -- confirm against the data). endswith() is used
# instead of indexing [-1] so an empty string does not raise IndexError.
if g_time_string.endswith('n'):
    g_time_string = g_time_string[0:-1] + "m"
# No datetime has been produced yet.
g_time = None
# Various formatting for time, including common mistakes | |
time_formats = ["%I:%M %p", "%I:%M%p", "%I::%M %p", "%I;%M %p", "%I: %M %p", "%I:%M", "%I;%M%p", |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
_state_list = ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware', | |
'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', | |
'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', | |
'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York', | |
'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', | |
'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington', | |
'West Virginia', 'Wisconsin', 'Wyoming'] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Replace every missing value in clean_data with 0.
clean_data <- replace(clean_data, is.na(clean_data), 0)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Convert the comma-formatted amount text to numeric. gsub() strips every
# thousands separator; sub() would remove only the first comma, so a value
# such as "1,234,567" would fail to parse and become NA.
clean_data <- mutate(
  raw_data,
  Software_Amount_USD = as.numeric(
    gsub(",", "", Software_Amount___USD, fixed = TRUE)
  )
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Look for a duplicate header in the rows: a row is flagged when its first
# cell starts with the first five characters of the first column's name.
# `[[1]]` returns a plain vector for both data.frames and tibbles, and
# as.character() lets startsWith() accept non-character columns.
first_col <- as.character(raw_data[[1]])
header_prefix <- substr(colnames(raw_data)[1], 1, 5)
# startsWith() yields NA for missing cells; `%in% TRUE` maps those to FALSE so
# filter() below does not silently drop rows whose first cell is NA.
header_in_rows <- startsWith(first_col, header_prefix) %in% TRUE
if (any(header_in_rows)) {
  # Show the offending rows before removing them.
  print(raw_data[header_in_rows, ])
  raw_data <- filter(raw_data, !header_in_rows)
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(dplyr) | |
library(tidyr) | |
# sum_trialname contains product names, including three different versions of one product | |
# dropping everything after the first space gets me to a consistent product naming | |
# the extract function, by default, captures the initial alphanumeric data, and drops everything after the first non-alpha character | |
# by default, the extract function also drops the original column (sum_trialname, in this case)
# Add a "Product" column holding the first alphanumeric run of sum_trialname
# (extract()'s default pattern), giving clean, consistent product names.
# remove = FALSE keeps the original sum_trialname column alongside it.
clean_data <- clean_data %>%
  extract(col = sum_trialname, into = "Product", remove = FALSE)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(dplyr) | |
# Signup.Date is original date information in Date format | |
# Signup.Week is beginning date of week that contains Signup.Date | |
# cut function can take other parameters instead of "weeks": "day", "week", "month", "quarter" and "year" | |
# Add Signup.Week, which groups each Signup.Date into a week-long bucket.
# Note: cut() on a Date returns a factor labelled by each week's start date,
# not a Date column.
clean_data <- mutate(
  clean_data,
  Signup.Week = cut(Signup.Date, breaks = "weeks")
)
NewerOlder