@Quantisan
Created February 12, 2012 11:43
A weather station data scraper
# Scrapes Wunderground weather records
#
# Based on http://casoilresource.lawr.ucdavis.edu/drupal/node/991
#
# Usage:
# fetch.temperature('ILONDONL8', '2012-01-01', '2012-01-30')
#
fetch.temperature <- function(station, start.date, end.date) {
  date.range <- seq.Date(from=as.Date(start.date), to=as.Date(end.date), by='1 day')
  # pre-allocate numeric result vector, one entry per day
  C <- vector(mode='numeric', length=length(date.range))
  # loop over dates and fetch the median temperature for each day
  for(i in seq_along(date.range)) {
    print(date.range[i])
    C[i] <- fetch.temperature.one(station, date.range[i])
  }
  return(data.frame(C, row.names=date.range))
}
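# Example (illustrative; the station ID and date range come from the usage note
# above and are not tested values):
# temps <- fetch.temperature('ILONDONL8', '2012-01-01', '2012-01-30')
# plot(as.Date(row.names(temps)), temps$C, type='l',
#      xlab='Date', ylab='Median temperature (C)')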
# Median temperature (degrees C) recorded at a station on a single date
fetch.temperature.one <- function(station, date) {
  w <- fetch.weather.raw(station, date)
  return(median(w$TemperatureC))
}
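# Example (illustrative; same station ID as above, arbitrary date):
# fetch.temperature.one('ILONDONL8', as.Date('2012-01-15'))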
# Work in progress
# Usage:
# fetch.airport.weather('EGLL', '2012-02-01')
#
fetch.airport.weather <- function(station, date) {
  base_url <- 'http://www.wunderground.com/history/airport'
  date <- as.Date(date)
  # parse date
  m <- as.integer(format(date, '%m'))
  d <- as.integer(format(date, '%d'))
  y <- format(date, '%Y')
  # compose final url
  final_url <- paste(base_url,
                     '/', station,
                     '/', y,
                     '/', m,
                     '/', d,
                     '/', 'DailyHistory.html?&format=1',
                     sep='')
  print(final_url)
  # read raw lines from the web server;
  # the response contains <br> tags on every other line
  u <- url(final_url)
  the_data <- readLines(u)
  close(u)
  # only keep responses with more than 3 lines of data
  if(length(the_data) > 3) {
    # remove the first and last lines
    the_data <- the_data[-c(1, length(the_data))]
    # remove odd-numbered lines from 3 --> end
    # the_data <- the_data[-seq(3, length(the_data), by=2)]
    # extract header and clean up
    the_header <- the_data[1]
    the_header <- make.names(strsplit(the_header, ',')[[1]])
    # convert to CSV, without header
    tC <- textConnection(paste(the_data, collapse='\n'))
    the_data <- read.csv(tC, as.is=TRUE, row.names=NULL, header=FALSE, skip=1)
    close(tC)
    # remove the last column, created by the trailing comma
    the_data <- the_data[, -ncol(the_data)]
    # assign column names
    names(the_data) <- the_header
    # convert Time column into a properly encoded date-time
    the_data$Time <- as.POSIXct(strptime(the_data$Time, format='%Y-%m-%d %H:%M:%S'))
    # remove UTC column
    the_data$DateUTC.br... <- NULL
    # sort and fix rownames
    the_data <- the_data[order(the_data$Time), ]
    row.names(the_data) <- 1:nrow(the_data)
    # done
    return(the_data)
  }
}
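# Example (illustrative; fetch.airport.weather is still a work in progress, so
# no assumptions are made here about the airport column names):
# heathrow <- fetch.airport.weather('EGLL', '2012-02-01')
# str(heathrow)  # inspect the returned columns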
#
# Usage:
# fetch.weather.raw('IKENSING2', '2012-02-01')
#
fetch.weather.raw <- function(station, date) {
  base_url <- 'http://www.wunderground.com/weatherstation/WXDailyHistory.asp?'
  date <- as.Date(date)
  # parse date
  m <- as.integer(format(date, '%m'))
  d <- as.integer(format(date, '%d'))
  y <- format(date, '%Y')
  # compose final url
  final_url <- paste(base_url,
                     'ID=', station,
                     '&month=', m,
                     '&day=', d,
                     '&year=', y,
                     '&format=1', sep='')
  # read raw lines from the web server;
  # the response contains <br> tags on every other line
  u <- url(final_url)
  the_data <- readLines(u)
  close(u)
  # only keep responses with more than 5 lines of data
  if(length(the_data) > 5) {
    # remove the first and last lines
    the_data <- the_data[-c(1, length(the_data))]
    # remove odd-numbered lines from 3 --> end (the <br> separators)
    the_data <- the_data[-seq(3, length(the_data), by=2)]
    # extract header and clean up
    the_header <- the_data[1]
    the_header <- make.names(strsplit(the_header, ',')[[1]])
    # convert to CSV, without header
    tC <- textConnection(paste(the_data, collapse='\n'))
    the_data <- read.csv(tC, as.is=TRUE, row.names=NULL, header=FALSE, skip=1)
    close(tC)
    # remove the last column, created by the trailing comma
    the_data <- the_data[, -ncol(the_data)]
    # assign column names
    names(the_data) <- the_header
    # convert Time column into a properly encoded date-time
    the_data$Time <- as.POSIXct(strptime(the_data$Time, format='%Y-%m-%d %H:%M:%S'))
    # remove UTC and software type columns
    the_data$DateUTC.br. <- NULL
    the_data$SoftwareType <- NULL
    # sort and fix rownames
    the_data <- the_data[order(the_data$Time), ]
    row.names(the_data) <- 1:nrow(the_data)
    # done
    return(the_data)
  }
}
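# Example (illustrative): raw observations for one day from a personal station,
# plotting the TemperatureC column that fetch.temperature.one() relies on.
# obs <- fetch.weather.raw('IKENSING2', '2012-02-01')
# plot(obs$Time, obs$TemperatureC, type='l',
#      xlab='Time', ylab='Temperature (C)')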