Aaron Ou aoyh

  • R, SQL, Google Analytics
  • Shanghai, China | Berkeley, US
@aoyh
aoyh / gist:e75812282c5b830f4c97
Created October 27, 2014 10:13
scrape_shmwl.R
library(rvest)
from <- 248
to <- 366
# page 245 plus the consecutive run of pages 248-366
pages <- paste0("http://www.shanhaimiwenlu.com/", c(245, seq(from, to)), ".html")
system.time(
  thelist <- lapply(pages, function(p) {
    # log the current page number (URL characters 31-33) as a progress marker
    writeLines(substr(p, 31, 33), "break.txt")
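    # the gist preview truncates the body here; the rest is a hedged sketch of
    # how it might continue, assuming the story text sits in <p> nodes (a guess)
    doc <- read_html(p)
    paste(html_text(html_nodes(doc, "p")), collapse = "\n")
  })
)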
@aoyh
aoyh / style_with_color.R
Last active August 29, 2015 14:00
charts_code_ChinaRConference_2014_May
require(ggplot2)
require(reshape2)
# populate some sample data
d <- data.frame(month    = c("Jan", "Feb", "Mar", "Apr", "May", "Jun"),
                visitors = c(156898, 187456, 238456, 256789, 228764, 185632),
                ratio    = c(1.45, 1.32, 1.29, 1.22, 1.14, 1.01))
# keep the months in calendar order rather than alphabetical
d$month <- factor(d$month, levels = c("Jan", "Feb", "Mar", "Apr", "May", "Jun"))
d$visits <- round(d$visitors * d$ratio, 0)
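The preview stops before any plotting code. A minimal sketch of a styled chart from this data, assuming a bar chart of visits by month (the fill color and theme are illustrative, not the conference originals):
ggplot(d, aes(x = month, y = visits)) +
  geom_bar(stat = "identity", fill = "steelblue") +  # illustrative color choice
  labs(title = "Monthly visits", x = NULL, y = "visits") +
  theme_minimal()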
###############################################################################
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# Approach: draw the base map centered on the midpoint of the two points >> get the point coordinates >> plot the points on the base map >> draw the connecting line
library(ggmap)
# get the longitude/latitude coordinates of Fudan University
fd_geo <- geocode("Fudan University")
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Shanghai&sensor=false
## Google Maps API Terms of Service : http://developers.google.com/maps/terms
## lon lat
## 1 121.4737 31.23039
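A minimal sketch of the approach stated above, with a hypothetical second point (People's Square) standing in for whatever the gist actually connects; note that current ggmap releases require a Google API key via register_google() before geocode()/get_map() will respond:
pt2 <- data.frame(lon = 121.4692, lat = 31.2322)  # hypothetical second point
pts <- rbind(fd_geo, pt2)
# base map centered on the midpoint of the two points
center <- c(lon = mean(pts$lon), lat = mean(pts$lat))
basemap <- get_map(location = center, zoom = 13)
seg <- data.frame(x = fd_geo$lon, y = fd_geo$lat, xend = pt2$lon, yend = pt2$lat)
ggmap(basemap) +
  geom_point(data = pts, aes(x = lon, y = lat), colour = "red", size = 3) +
  geom_segment(data = seg, aes(x = x, y = y, xend = xend, yend = yend), colour = "blue")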
@aoyh
aoyh / dance_world_ranking.r
Last active December 16, 2015 15:39
23*12 + 3*1 + 4*1 = 276 + 3 + 4 = 283 combinations of dance division, discipline and category (ddc). Final deliverable: world.txt
# Adult Standard 成人标准舞
# Adult Latin 成人拉丁舞
# Adult Ten Dance 成人十项舞
# Youth Standard 青年标准舞
# Youth Latin 青年拉丁舞
# Senior I Standard 中青一组标准舞
# Senior I Latin 中青一组拉丁舞
# Senior II Standard 中青二组标准舞
# Senior II Latin 中青二组拉丁舞
# Senior III Standard 中青三组标准舞
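The 283 ddc combinations counted in the description can be enumerated mechanically. A sketch with placeholder names, assuming 23 division-discipline pairs carry 12 categories each and the remaining 3 + 4 pairs carry one category apiece:
dd    <- paste0("dd", 1:23)   # placeholder names for the 23 regular pairs
cat12 <- paste0("cat", 1:12)  # the 12 categories each of them has
ddc <- expand.grid(dd = dd, category = cat12, stringsAsFactors = FALSE)
nrow(ddc)                     # 276
276 + 3 + 4                   # the 3 + 4 single-category specials bring it to 283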
@aoyh
aoyh / dance_split_excelworkbook_into_csv.vba
Last active December 16, 2015 14:48
WDSF World Ranking data were stored in 23 Excel workbook files, one per month from 2011-06 to 2013-03. Each workbook contains 13 sheets, 12 of which are dance-discipline rankings (or 13 out of 14 for some years). The following VBA code splits one workbook into 13 CSV files; repeating it for all 23 workbooks yields 13*23 = 299 CSVs.
Private Sub splitworkbook()
    Dim sht As Worksheet
    Dim MyBook As Workbook
    Set MyBook = ActiveWorkbook
    For Each sht In MyBook.Sheets
        sht.Copy
        ' Filename: the workbook name (e.g. "w201106") plus the lower-cased sheet name becomes the split CSV's file name
        ' xlCSV: save the copied sheet in the default CSV format
        ActiveWorkbook.SaveAs Filename:=MyBook.Path & "\" & Left(ThisWorkbook.Name, 7) & "_" & LCase(Trim(sht.Name)), FileFormat:=xlCSV
        ActiveWorkbook.Close savechanges:=True 'savechanges:=True suppresses the save-confirmation dialog for every sheet
    Next sht
End Sub
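Once the macro has been run on all 23 workbooks, the resulting CSVs can be pulled back into R in one pass. A minimal sketch, assuming the 299 files sit in the working directory and share one layout:
files <- list.files(pattern = "\\.csv$")
rankings <- lapply(files, read.csv, stringsAsFactors = FALSE)
names(rankings) <- sub("\\.csv$", "", files)  # label each table by its file name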
@aoyh
aoyh / dance_athlete.r
Last active December 16, 2015 14:29
Read from 22 webpages to form the male and female athletes table
# generate the htm file names of male athletes to read from
library(XML)  # readHTMLTable() comes from the XML package
male_adult  <- paste("male_adult_",  1:3, ".htm", sep = "")
male_youth  <- paste("male_youth_",  1:2, ".htm", sep = "")
male_senior <- paste("male_senior_", 1:3, ".htm", sep = "")
male_junior <- paste("male_junior_", 1:2, ".htm", sep = "")
male <- c(male_adult, male_youth, male_senior, male_junior, "male_juvenile.htm")
male <- matrix(male, nrow = length(male))  # one file name per row, for sapply()
male <- sapply(male, function(x) readHTMLTable(x, encoding = "utf-8", stringsAsFactors = F))
head(male[[1]])
# Name Surname Country Category Status Member #
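The description counts 22 pages and the male set above covers 11, so the female side presumably mirrors it. A sketch under that assumption (the file names are guessed by symmetry):
female <- c(paste("female_adult_",  1:3, ".htm", sep = ""),
            paste("female_youth_",  1:2, ".htm", sep = ""),
            paste("female_senior_", 1:3, ".htm", sep = ""),
            paste("female_junior_", 1:2, ".htm", sep = ""),
            "female_juvenile.htm")
female <- sapply(matrix(female, nrow = length(female)),
                 function(x) readHTMLTable(x, encoding = "utf-8", stringsAsFactors = F))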
@aoyh
aoyh / dance_couple_list.r
Last active December 16, 2015 14:29
Collect the couple list in 6 pages into a data frame
library(XML)
url <- paste("couple_list_", 1:6, ".htm", sep = "")
url <- matrix(url, nrow = 6)
couple <- sapply(url, function(x) readHTMLTable(x, stringsAsFactors = F, encoding = "utf-8"))
head(couple[[1]])
head(couple[[1]][,2:6])
couplelist <- data.frame()
for (i in 1:6) {
  couplepage <- couple[[i]][, 2:6]  # keep columns 2-6 of each page's table
  couplelist <- rbind(couplelist, couplepage)
}
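Growing a data frame with rbind inside a loop is fine at six pages; an equivalent one-liner, assuming all six tables share the same five columns:
couplelist <- do.call(rbind, lapply(couple, function(p) p[, 2:6]))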
@aoyh
aoyh / clean_weather_data.r
Last active December 16, 2015 12:28
another way to clean the raw weather data scraped from the web
# let's do the data cleansing in R
# read the scraped txt back into R as a character vector, one token at a time
d <- scan("weather.txt", what = "", sep = "")
# replace tokens that end in a digit with a newline
d[grep("[0123456789]$", d)] <- "\n"
i <- length(d)
wt1 <- c()
for (j in 1:i) {
  wt1 <- paste(wt1, d[j], sep = ",")  # concatenate everything, comma-separated
}
write.table(wt1, "wt1.txt")  # please go check wt1.txt
# import back wt1.txt as data frame
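A sketch of that import step, assuming the comma-separated lines in wt1.txt now line up into records (fill = TRUE guards against ragged rows):
wt <- read.table("wt1.txt", sep = ",", header = FALSE,
                 stringsAsFactors = FALSE, fill = TRUE)
head(wt)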
# R for web scraping - weather project
# get the raw web data via readHTMLTable() from the XML package
library(XML)
# generate the url set in matrix format, to be applied with sapply() later on
y1 <- paste("20110", 1:12, sep = "")          # months 10-12 come out wrong ("2011010"...)
y1[10:12] <- c("201110", "201111", "201112")  # ...so patch them by hand
y2 <- paste("20120", 1:12, sep = "")
y2[10:12] <- c("201210", "201211", "201212")
y3 <- c("201301", "201302", "201303")
y <- c(y1, y2, y3)
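The preview ends before the URLs are assembled. A sketch of the readHTMLTable() step the comments point to, with a placeholder URL pattern since the real site path is not shown:
url <- paste0("http://example.com/weather/", y, ".html")  # placeholder pattern
url <- matrix(url, nrow = length(url))
weather <- sapply(url, function(x) readHTMLTable(x, stringsAsFactors = FALSE, encoding = "utf-8"))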