## yanping/test.R

## test.R
## Scrape the index pages under /tjxx/tjsj/ydzyzb/ on www.wxtj.gov.cn and load
## the first HTML table of every linked article into a variable named
## data_<year>_<month>.
library(XML)  # unlike require(), stops immediately when the package is missing

site_root = 'http://www.wxtj.gov.cn'
index_pages = paste0(site_root, '/tjxx/tjsj/ydzyzb/',
                     c('index.shtml', 'index_2.shtml',
                       'index_3.shtml', 'index_4.shtml'))
link_xpath = "//tr//td[@class='newstitle']//a"

# Value of attribute `attr` for every link matched by link_xpath in `doc`.
# A missing attribute yields NA instead of being dropped, so the href and
# title vectors taken from one page always line up element by element.
link_attr = function(doc, attr) {
  vals = xpathApply(doc, link_xpath, function(node) {
    xmlGetAttr(node, attr, default = NA_character_)
  })
  as.character(unlist(vals))
}

# One data frame (href, title) per index page, stacked in page order.
page_links = lapply(index_pages, function(pg) {
  doc = htmlTreeParse(pg, useInternalNodes = TRUE)
  data.frame(href = link_attr(doc, 'href'),
             title = link_attr(doc, 'title'),
             stringsAsFactors = FALSE)
})
links = do.call(rbind, page_links)
# A link lacking either its address or its title cannot be fetched or named.
links = links[!is.na(links$href) & !is.na(links$title), , drop = FALSE]

urls = paste0(site_root, links$href)
title = links$title

# The first four characters of the title are taken as the year.
year = substr(title, 1, 4)
# Keep the text between the year marker and the month marker of the title.
tmp = sub('^.*年([^ ]+)月.*$', '\\1', title)
# When that text is a range starting at 1 (written with any of three dash
# characters), keep only what follows the dash.
for (dash in c('－', '―', '-')) {
  tmp = sub(paste0('^.*1', dash, '([^ ]+).*$'), '\\1', tmp)
}
# Left-pad single-character months with a zero.
month = tmp
one_digit = nchar(month) == 1
month[one_digit] = paste0('0', month[one_digit])
name = paste('data', year, month, sep = '_')

# paste() turns zero-length input into one element, so without this reset an
# empty scrape would still try to download the bare site root as "data__".
if (nrow(links) == 0) {
  urls = character(0)
  name = character(0)
  warning('no article links found on the index pages', call. = FALSE)
}

# assign() binds each table under its computed name without building R source
# text from scraped strings (which breaks on quotes and can run injected code).
for (i in seq_along(urls)) assign(name[i], readHTMLTable(urls[i])[[1]])
	## Scrape the index pages under /tjxx/tjsj/ydzyzb/ on www.wxtj.gov.cn and
	## load the first HTML table of every linked article into a variable named
	## data_<year>_<month>.
	## NOTE(review): this section repeats the scrape performed earlier in this
	## file and re-downloads every page; consider keeping only one copy.
	library(XML)  # unlike require(), stops immediately when the package is missing

	site_root = 'http://www.wxtj.gov.cn'
	index_pages = paste0(site_root, '/tjxx/tjsj/ydzyzb/',
	                     c('index.shtml', 'index_2.shtml',
	                       'index_3.shtml', 'index_4.shtml'))
	link_xpath = "//tr//td[@class='newstitle']//a"

	# Value of attribute `attr` for every link matched by link_xpath in `doc`.
	# A missing attribute yields NA instead of being dropped, so the href and
	# title vectors taken from one page always line up element by element.
	link_attr = function(doc, attr) {
	  vals = xpathApply(doc, link_xpath, function(node) {
	    xmlGetAttr(node, attr, default = NA_character_)
	  })
	  as.character(unlist(vals))
	}

	# One data frame (href, title) per index page, stacked in page order.
	page_links = lapply(index_pages, function(pg) {
	  doc = htmlTreeParse(pg, useInternalNodes = TRUE)
	  data.frame(href = link_attr(doc, 'href'),
	             title = link_attr(doc, 'title'),
	             stringsAsFactors = FALSE)
	})
	links = do.call(rbind, page_links)
	# A link lacking either its address or its title cannot be fetched or named.
	links = links[!is.na(links$href) & !is.na(links$title), , drop = FALSE]

	urls = paste0(site_root, links$href)
	title = links$title

	# The first four characters of the title are taken as the year.
	year = substr(title, 1, 4)
	# Keep the text between the year marker and the month marker of the title.
	# The patterns need `.*` (any run of characters) on both sides; a bare `.`
	# only matches titles with exactly one character there, so nothing is
	# extracted and the variable names below come out wrong.
	tmp = sub('^.*年([^ ]+)月.*$', '\\1', title)
	# When that text is a range starting at 1 (written with any of three dash
	# characters), keep only what follows the dash.
	for (dash in c('－', '―', '-')) {
	  tmp = sub(paste0('^.*1', dash, '([^ ]+).*$'), '\\1', tmp)
	}
	# Left-pad single-character months with a zero.
	month = tmp
	one_digit = nchar(month) == 1
	month[one_digit] = paste0('0', month[one_digit])
	name = paste('data', year, month, sep = '_')

	# paste() turns zero-length input into one element, so without this reset an
	# empty scrape would still try to download the bare site root as "data__".
	if (nrow(links) == 0) {
	  urls = character(0)
	  name = character(0)
	  warning('no article links found on the index pages', call. = FALSE)
	}

	# assign() binds each table under its computed name without building R source
	# text from scraped strings (which breaks on quotes and can run injected code).
	for (i in seq_along(urls)) assign(name[i], readHTMLTable(urls[i])[[1]])