jackysee/MySinaBlogHtmlBackup.groovy

## MySinaBlogHtmlBackup.groovy
/**
This script crawls your MySinaBlog content and saves it to your hard disk as HTML files.
It can handle entries whose comments span more than one page.

Setup
1. You need to install groovy (http://groovy.codehaus.org/).
   See the setup here (http://groovy.codehaus.org/Tutorial+1+-+Getting+started)
2. You need to install nekoParser to your groovy lib
   2.1 download from http://sourceforge.net/projects/nekohtml/
   2.2 copy lib\xercesImpl.jar and nekohtml.jar from the zip file to e.g. c:\Documents And Settings\User\.groovy\lib

Step:
1. fill the blogUrl below.
2. run the script
3. Your blog content will be at 'backup' folder.
   Each single entry will be named as {articleId}.html
   If the entry has comments, the name will be {articleId}_0.html, {articleId}_1.html....
*/

// Address of the blog to back up -- the only value that needs editing.
def blogUrl = "http://jackysee.mysinablog.com/"

/*======== do not modify below ====*/
// NekoHTML's SAX parser with the SAX "namespaces" feature switched off; it feeds the XmlParser below.
def nekoParser = new org.cyberneko.html.parsers.SAXParser()
nekoParser.setFeature('http://xml.org/sax/features/namespaces', false)

// Scheme and host part of blogUrl, e.g. "http://jackysee.mysinablog.com".
def host = (blogUrl =~ /(http:\/\/[^\/]+)\/?.*/)[0][1]
// Prefix that the relative links found in the pages are appended to: blogUrl up to and
// including its last '/'. When blogUrl is only the host without a trailing slash, the last
// '/' belongs to "http://", so the slash is appended to the host instead.
def base = (blogUrl.length() > host.length()) ? blogUrl[0..blogUrl.lastIndexOf('/')] : host + '/'
def parser = new XmlParser(nekoParser)
def frontPage = parser.parse(blogUrl)
// The crawl starts at the first link inside the first DIV of class 'c_title' on the front
// page; nextUrl stays null (nothing to crawl) when the page has no such DIV.
def firstTitle = frontPage.depthFirst().DIV.findAll{it.'@class'=='c_title'}[0]
def nextUrl = (firstTitle != null) ? firstTitle.A.'@href'[0] : null

new File("backup").mkdir()
while(nextUrl != null){
    def permaLink = base + nextUrl
    println "Getting...  ${permaLink}"
    def articleId = (nextUrl =~ /articleId=(\d*)/)[0][1]
    def page = parser.parse(permaLink).depthFirst()

    // The link in the 'posted' DIV whose text contains "留言" (comments) carries the comment
    // count in parentheses. A missing link, a missing "(n)" or an empty "()" all count as 0
    // instead of raising an exception.
    def commentText = page.DIV.findAll{ it.'@class'=='posted'}.A.findAll{ it.text().contains("留言")}[0]?.text()
    def countMatcher = (commentText) ? (commentText =~ /\((\d+)\)/) : null
    def commentCount = (countMatcher != null && countMatcher.find()) ? Integer.parseInt(countMatcher.group(1)) : 0
    if(commentCount > 0){
        // The element with id 'blogPager' holds one OPTION per comment page; when it is
        // absent the entry is saved as a single page.
        def blogPage = page.findAll{it.'@id'=='blogPager'}[0]
        def pagelist = (blogPage)? blogPage.OPTION.collect{ "${base}index.php${it.'@value'}"}:[permaLink]
        pagelist.eachWithIndex{pp,ii->
            download(pp, "${articleId}_${ii}")
        }
    }
    else{
        download(permaLink, articleId)
    }
    // The link containing "上一篇" (previous entry) inside the 'post_index' DIV is the next page to crawl.
    def prevLink = page.DIV.findAll{it.'@id'=='post_index'}.A.findAll{it.text().contains("上一篇")}[0]?.'@href'
    // '@href' read from a single node is the attribute string itself, so indexing it with [0]
    // would keep only its first character; a list of values is still accepted for safety.
    if(prevLink instanceof CharSequence){
        nextUrl = prevLink.toString() ?: null
    }
    else{
        nextUrl = prevLink ? prevLink[0] : null
    }
    // Pause one second between entries.
    sleep 1000
}

/**
 * Copies the content found at the given URL into backup/{fileName}.html.
 * The 'backup' folder must already exist.
 *
 * @param address  URL to fetch (any value java.net.URL accepts once converted to a String)
 * @param fileName target file name, without extension, inside the 'backup' folder
 */
def download(address, fileName){
    def out = new BufferedOutputStream(new FileOutputStream("backup/${fileName}.html"))
    println "... writing to ${fileName}.html"
    // withStream closes each stream when its closure exits, also when the copy throws;
    // previously the URL stream was never closed and the file stayed open on failure.
    out.withStream{ sink ->
        new URL(address.toString()).openStream().withStream{ source ->
            sink << source
        }
    }
}
	/**
	This script crawls your MySinaBlog content and saves it to your hard disk as HTML files.
	It can handle entries whose comments span more than one page.

	Setup
	1. You need to install groovy (http://groovy.codehaus.org/).
	See the setup here (http://groovy.codehaus.org/Tutorial+1+-+Getting+started)
	2. You need to install nekoParser to your groovy lib
	2.1 download from http://sourceforge.net/projects/nekohtml/
	2.2 copy lib\xercesImpl.jar and nekohtml.jar from the zip file to e.g. c:\Documents And Settings\User\.groovy\lib

	Step:
	1. fill the blogUrl below.
	2. run the script
	3. Your blog content will be at 'backup' folder.
	Each single entry will be named as {articleId}.html
	If the entry has comments, the name will be {articleId}_0.html, {articleId}_1.html....
	*/

	// Address of the blog to back up -- the only value that needs editing.
	def blogUrl = "http://jackysee.mysinablog.com/"

	/*======== do not modify below ====*/
	// NekoHTML's SAX parser with the SAX "namespaces" feature switched off; it feeds the XmlParser below.
	def nekoParser = new org.cyberneko.html.parsers.SAXParser()
	nekoParser.setFeature('http://xml.org/sax/features/namespaces', false)

	// Scheme and host part of blogUrl, e.g. "http://jackysee.mysinablog.com".
	def host = (blogUrl =~ /(http:\/\/[^\/]+)\/?.*/)[0][1]
	// Prefix that the relative links found in the pages are appended to: blogUrl up to and
	// including its last '/'. When blogUrl is only the host without a trailing slash, the last
	// '/' belongs to "http://", so the slash is appended to the host instead.
	def base = (blogUrl.length() > host.length()) ? blogUrl[0..blogUrl.lastIndexOf('/')] : host + '/'
	def parser = new XmlParser(nekoParser)
	def frontPage = parser.parse(blogUrl)
	// The crawl starts at the first link inside the first DIV of class 'c_title' on the front
	// page; nextUrl stays null (nothing to crawl) when the page has no such DIV.
	def firstTitle = frontPage.depthFirst().DIV.findAll{it.'@class'=='c_title'}[0]
	def nextUrl = (firstTitle != null) ? firstTitle.A.'@href'[0] : null

	new File("backup").mkdir()
	while(nextUrl != null){
		def permaLink = base + nextUrl
		println "Getting... ${permaLink}"
		def articleId = (nextUrl =~ /articleId=(\d*)/)[0][1]
		def page = parser.parse(permaLink).depthFirst()

		// The link in the 'posted' DIV whose text contains "留言" (comments) carries the comment
		// count in parentheses. A missing link, a missing "(n)" or an empty "()" all count as 0
		// instead of raising an exception.
		def commentText = page.DIV.findAll{ it.'@class'=='posted'}.A.findAll{ it.text().contains("留言")}[0]?.text()
		def countMatcher = (commentText) ? (commentText =~ /\((\d+)\)/) : null
		def commentCount = (countMatcher != null && countMatcher.find()) ? Integer.parseInt(countMatcher.group(1)) : 0
		if(commentCount > 0){
			// The element with id 'blogPager' holds one OPTION per comment page; when it is
			// absent the entry is saved as a single page.
			def blogPage = page.findAll{it.'@id'=='blogPager'}[0]
			def pagelist = (blogPage)? blogPage.OPTION.collect{ "${base}index.php${it.'@value'}"}:[permaLink]
			pagelist.eachWithIndex{pp,ii->
				download(pp, "${articleId}_${ii}")
			}
		}
		else{
			download(permaLink, articleId)
		}
		// The link containing "上一篇" (previous entry) inside the 'post_index' DIV is the next page to crawl.
		def prevLink = page.DIV.findAll{it.'@id'=='post_index'}.A.findAll{it.text().contains("上一篇")}[0]?.'@href'
		// '@href' read from a single node is the attribute string itself, so indexing it with [0]
		// would keep only its first character; a list of values is still accepted for safety.
		if(prevLink instanceof CharSequence){
			nextUrl = prevLink.toString() ?: null
		}
		else{
			nextUrl = prevLink ? prevLink[0] : null
		}
		// Pause one second between entries.
		sleep 1000
	}

	/**
	 * Copies the content found at the given URL into backup/{fileName}.html.
	 * The 'backup' folder must already exist.
	 *
	 * @param address  URL to fetch (any value java.net.URL accepts once converted to a String)
	 * @param fileName target file name, without extension, inside the 'backup' folder
	 */
	def download(address, fileName){
		def out = new BufferedOutputStream(new FileOutputStream("backup/${fileName}.html"))
		println "... writing to ${fileName}.html"
		// withStream closes each stream when its closure exits, also when the copy throws;
		// previously the URL stream was never closed and the file stayed open on failure.
		out.withStream{ sink ->
			new URL(address.toString()).openStream().withStream{ source ->
				sink << source
			}
		}
	}