Skip to content

Instantly share code, notes, and snippets.

@dggoldst
Last active July 31, 2017 17:32
Show Gist options
  • Save dggoldst/23acb326cb7ed3125bb7f172fb639a80 to your computer and use it in GitHub Desktop.
Save dggoldst/23acb326cb7ed3125bb7f172fb639a80 to your computer and use it in GitHub Desktop.
library(tidyverse)
library(weatherData)
library(viridis)
library(lubridate)
library(maps)
library(ggmap)
retList = vector('list', 10)
for (i in 2:6) {
retList[[i]] =
read_csv(
paste("data/allstationsalldays.201", i, ".csv.gz", sep = ""),
col_types = cols(Date = col_date(format = "%Y-%m-%d")))
}
df = do.call('rbind', retList)
relevant_states = subset(data.frame(State = state.abb),!State %in%
c("HI", "AK"))
relevant_stations = left_join(relevant_states,
USAirportWeatherStations)
df = left_join(relevant_stations, df)
#Add useful categories
df = df %>% mutate(
Year = year(Date),
Quarter = quarter(Date),
Month = month(Date),
Week = week(Date),
Day = day(Date)
)
#EDA reveals a few bad data points
ggplot(subset(df, !is.na(Max_TemperatureF)),
aes(x = "", group = Quarter, y = Max_TemperatureF)) + geom_boxplot()
#Delete bad points and stations w/o all 5 years of data
df = df %>%
filter(Max_TemperatureF < 200) %>%
filter(Max_TemperatureF > -200) %>%
group_by(airportCode) %>%
mutate(num_years = length(unique(Year)),
num_obs = length(!is.na(Max_TemperatureF))) %>%
filter(num_years == 5) %>%
ungroup()
#Create wide data frame for clustering
clust_df = df %>%
group_by(airportCode, Week) %>%
mutate(weekly_median = median(Max_TemperatureF, na.rm = TRUE)) %>%
slice(1) %>%
ungroup() %>%
dplyr::select(Week, airportCode, weekly_median) %>%
spread(key = Week, value = weekly_median)
set.seed(3)
NCLUST = 5 #Number of clusters. 5 captures the big picture
#Run K means clustering
kmean_results = kmeans(clust_df[, 2:ncol(clust_df)], NCLUST)
kmean_results$cluster
kmean_df = data.frame(airportCode = clust_df$airportCode,
cluster = kmean_results$cluster)
kmean_df = left_join(kmean_df, relevant_stations)
#Compute the weekly average weather in each cluster
t_df = left_join(df, kmean_df, by = "airportCode")
plot_data = t_df %>% group_by(Week, cluster) %>%
summarize(mu = mean(Max_TemperatureF),
d2clustmu2 = mean(abs(Max_TemperatureF - mean(Max_TemperatureF,na.rm=TRUE)))) %>%
mutate(cluster = as.factor(cluster)) %>%
ungroup()
#Get the December weather in each cluster to relabel them intelligently
new_order = plot_data %>% filter(Week == 52) %>% arrange(desc(mu))
plot_data = mutate(plot_data,
newclust = factor(cluster,
levels = as.character(new_order$cluster),
labels = rev(LETTERS[1:NCLUST])))
kmean_df = mutate(kmean_df,
newclust = factor(cluster,
levels = as.character(new_order$cluster),
labels = rev(LETTERS[1:NCLUST])))
write.csv(kmean_df,
file = paste("final_stations_", NCLUST, "clusters.csv", sep = ""),
row.names = FALSE)
#Plot the clusters
#Plot station locations
p = ggplot(kmean_df, aes(x = Lon, y = Lat, color = newclust)) +
geom_point(size = 2) + scale_color_discrete()
p
ggsave( plot = p, file = paste("point_overview_", NCLUST, "clust.png", sep = ""), width = 6, height = 4)
########plot regional averages
p = ggplot(plot_data, aes( x = Week, y = mu, group = newclust, color = newclust)) +
geom_smooth(se = FALSE) + scale_color_discrete() +
scale_x_continuous(breaks = seq(1, 52, by = 10)) +
scale_y_continuous(breaks = seq(20, 105, by = 10)) +
labs(x = "Week of Year", y = "Average Temperature",color = "") + theme_bw() +
theme(legend.position = "bottom")
p
ggsave( plot = p, file = paste("seaonal_trends_weekly_", NCLUST, "clust.png", sep = ""), width = 6, height = 4)
########plot cluster of each station on map
us <- map_data("state")
us <- fortify(us, region = "region")
gg <- ggplot()
gg <- gg + geom_map(
data = us,
map = us,
aes( x = long, y = lat, map_id = region, group = group),
fill = "#ffffff",
color = "#7f7f7f",
size = 0.25)
gg = gg + theme_nothing() +
geom_text( data = kmean_df, size = 2.75, alpha = .9, aes( x = Lon, y = Lat, label = newclust, color = newclust)) +
scale_color_discrete() +
theme(legend.position = "none")
ggsave( plot = gg, file = paste("geom_map_weekly_", NCLUST, "clust.png", sep = ""), width = 6, height = 4)
####How far are temperatures from the cluster averages?
p = ggplot(plot_data, aes(x = Week, y = d2clustmu2,group=newclust,color=newclust)) +
geom_smooth(se=FALSE)
ggsave( plot = p, file = paste("dist_from_cluster_mean_", NCLUST, "clust.png", sep = ""), width = 6, height = 4)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment