Skip to content

Instantly share code, notes, and snippets.

@jackysee
Created December 30, 2009 18:17
Show Gist options
  • Save jackysee/266261 to your computer and use it in GitHub Desktop.
Save jackysee/266261 to your computer and use it in GitHub Desktop.
/**
This script crawls your MySinaBlog content and saves it to your hard disk as HTML files.
It can also handle entries whose comments span more than one page.
Setup
1. You need to install groovy (http://groovy.codehaus.org/).
See the setup here (http://groovy.codehaus.org/Tutorial+1+-+Getting+started)
2. You need to install the NekoHTML parser into your groovy lib folder
2.1 download it from http://sourceforge.net/projects/nekohtml/
2.2 copy lib\xercesImpl.jar and nekohtml.jar from the zip file to e.g. c:\Documents And Settings\User\.groovy\lib
Steps:
1. Fill in the blogUrl below.
2. Run the script.
3. Your blog content will be in the 'backup' folder.
Each single entry will be named as {articleId}.html
If the entry has comments, the name will be {articleId}_0.html, {articleId}_1.html....
*/
// Address of the blog to back up. A trailing slash is optional.
def blogUrl = "http://jackysee.mysinablog.com/"
/*======== do not modify below ====*/
// SAX parser from NekoHTML; the namespaces feature is switched off before it is
// handed to XmlParser.
def nekoParser = new org.cyberneko.html.parsers.SAXParser()
nekoParser.setFeature('http://xml.org/sax/features/namespaces', false)
// base is the prefix that the links found on the pages are appended to:
// everything up to and including the last '/' of blogUrl. When the only slashes
// are the two of the "//" after the scheme (no trailing slash was given), a '/'
// is appended instead, so the address is not truncated to just the scheme.
def lastSlash = blogUrl.lastIndexOf('/')
def base = (lastSlash > blogUrl.indexOf('//') + 1) ? blogUrl[0..lastSlash] : blogUrl + '/'
def parser = new XmlParser(nekoParser)
def frontPage = parser.parse(blogUrl)
// Starting point of the crawl: the href of the first link inside the first DIV
// of class 'c_title' on the front page. Stays null when no such DIV exists.
def titleDiv = frontPage.depthFirst().DIV.findAll{it.'@class'=='c_title'}[0]
def nextUrl = (titleDiv != null) ? titleDiv.A.'@href'[0] : null
if(nextUrl == null){
    println "No article link found on ${blogUrl}"
}
// Output folder for the downloaded pages.
new File("backup").mkdir()
// Crawl loop: save the entry at nextUrl, then continue with the link whose text
// contains "上一篇" ("previous entry") until no such link is found.
while(nextUrl != null){
    println "Getting... ${base + nextUrl}"
    def permaLink = base + nextUrl
    // The article id names the output file(s); without one the entry cannot be
    // saved, so the crawl stops with a message instead of an exception.
    def idMatcher = (nextUrl =~ /articleId=(\d+)/)
    if(!idMatcher.find()){
        println "No articleId found in ${nextUrl}, stopping."
        break
    }
    def articleId = idMatcher.group(1)
    def page = parser.parse(permaLink).depthFirst()
    // Text of the link inside a DIV of class 'posted' that contains "留言" ("comments").
    def commentText = page.DIV.findAll{ it.'@class'=='posted'}.A.findAll{ it.text().contains("留言")}[0]?.text()
    // The comment count is the number in parentheses within that text; it is
    // treated as 0 when the link or the number is missing.
    def commentCount = 0
    if(commentText){
        def countMatcher = (commentText =~ /\((\d+)\)/)
        if(countMatcher.find()){
            commentCount = Integer.parseInt(countMatcher.group(1))
        }
    }
    if(commentCount > 0){
        //#blogPager
        // One URL per OPTION of the element with id 'blogPager'; just the
        // permalink when that element is absent.
        def blogPage = page.findAll{it.'@id'=='blogPager'}[0]
        def pagelist = (blogPage)? blogPage.OPTION.collect{ "${base}index.php${it.'@value'}"}:[permaLink]
        // A pager without options would otherwise save nothing for this entry.
        if(!pagelist){
            pagelist = [permaLink]
        }
        pagelist.eachWithIndex{pp,ii->
            download(pp, "${articleId}_${ii}")
        }
    }
    else{
        download(permaLink, articleId);
    }
    def prevLink = page.DIV.findAll{it.'@id'=='post_index'}.A.findAll{it.text().contains("上一篇")}[0]?.'@href'
    nextUrl = prevLink?prevLink[0]:null
    // Pause one second between entries.
    sleep 1000
}
/**
 * Copies the content found at a URL into backup/{fileName}.html.
 *
 * Both streams are closed when the copy finishes or fails, and the target
 * folder is created when it does not exist yet.
 *
 * @param address  URL of the resource to fetch
 * @param fileName name of the target file, without folder or ".html" extension
 */
def download(address, fileName){
    def target = new File("backup", "${fileName}.html")
    // Lets the function work even when the caller has not created the folder.
    target.parentFile.mkdirs()
    println "... writing to ${fileName}.html"
    // withInputStream / withOutputStream close their streams even if the copy throws.
    new URL(address).withInputStream { input ->
        target.withOutputStream { output ->
            output << input
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment