Created
December 30, 2009 18:17
-
-
Save jackysee/266261 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 This script crawls a MySinaBlog blog and saves its content to your hard disk as HTML files.
 It can handle entries whose comments span more than one page.

 Setup
 1. Install Groovy (http://groovy.codehaus.org/).
    See the setup guide here (http://groovy.codehaus.org/Tutorial+1+-+Getting+started)
 2. Install the NekoHTML parser into your Groovy lib directory:
    2.1 Download it from http://sourceforge.net/projects/nekohtml/
    2.2 Copy lib\xercesImpl.jar and nekohtml.jar from the zip file to e.g. c:\Documents And Settings\User\.groovy\lib

 Steps:
 1. Fill in the blogUrl below.
 2. Run the script.
 3. Your blog content will be in the 'backup' folder.
    Each single entry will be named {articleId}.html
    If the entry has comments, the files will be named {articleId}_0.html, {articleId}_1.html, ...
*/
def blogUrl = "http://jackysee.mysinablog.com/"
/*======== do not modify below ====*/

// The blog's HTML is read through the NekoHTML SAX parser. Namespace processing
// is switched off; the lookups below use bare tag names (DIV, A, OPTION).
def nekoParser = new org.cyberneko.html.parsers.SAXParser()
nekoParser.setFeature('http://xml.org/sax/features/namespaces', false)
def parser = new XmlParser(nekoParser)

// Everything up to and including the last '/' of blogUrl; the links found in
// the pages are appended to this prefix.
def base = blogUrl[0..blogUrl.lastIndexOf('/')]

// Start from the first link inside the first DIV of class 'c_title' on the front page.
def frontPage = parser.parse(blogUrl)
def firstTitle = frontPage.depthFirst().DIV.findAll { it.'@class' == 'c_title' }[0]
def firstHrefs = firstTitle ? firstTitle.A.'@href' : []
def nextUrl = firstHrefs ? firstHrefs[0] : null
if (nextUrl == null) {
    println "No entry link found on ${blogUrl}"
}

new File("backup").mkdirs()

// Links already processed; guards against a cycle in the "previous entry"
// chain, which would otherwise keep the loop running forever.
def visited = [] as Set

while (nextUrl != null) {
    if (!visited.add(nextUrl)) {
        println "Stopping: ${nextUrl} was already downloaded"
        break
    }

    def permaLink = base + nextUrl
    println "Getting... ${permaLink}"

    // The numeric articleId from the link names the output file(s).
    def idMatcher = (nextUrl =~ /articleId=(\d+)/)
    if (!idMatcher.find()) {
        throw new IllegalStateException("No articleId found in link: ${nextUrl}")
    }
    def articleId = idMatcher.group(1)

    def page = parser.parse(permaLink).depthFirst()

    // Text of the links in the 'posted' DIV that mention "留言" (comments);
    // the comment count is the first "(digits)" in that text, 0 when absent.
    def commentText = page.DIV.findAll { it.'@class' == 'posted' }.A.findAll { it.text().contains("留言") }[0]?.text()
    def countMatcher = commentText ? (commentText =~ /\((\d+)\)/) : null
    // Explicit null check: Groovy truth on a Matcher would itself call find().
    def commentCount = (countMatcher != null && countMatcher.find()) ? Integer.parseInt(countMatcher.group(1)) : 0

    if (commentCount > 0) {
        // The element with id 'blogPager' carries one OPTION per comment page;
        // when it is missing or empty only the permalink page is saved.
        def blogPager = page.findAll { it.'@id' == 'blogPager' }[0]
        def pagerUrls = blogPager ? blogPager.OPTION.collect { "${base}index.php${it.'@value'}" } : []
        def pageList = pagerUrls ?: [permaLink]
        pageList.eachWithIndex { pageUrl, index ->
            download(pageUrl, "${articleId}_${index}")
        }
    } else {
        download(permaLink, articleId)
    }

    // Follow the links of the 'post_index' DIV whose text mentions "上一篇"
    // (previous entry); the first href found there is the next page to fetch.
    def prevLink = page.DIV.findAll { it.'@id' == 'post_index' }.A.findAll { it.text().contains("上一篇") }[0]?.'@href'
    nextUrl = prevLink ? prevLink[0] : null

    // Pause one second between entries.
    sleep 1000
}
/**
 * Copies the raw bytes served at the given address into backup/{fileName}.html.
 *
 * The URL is opened before the target file is created, so a failed request
 * does not leave an empty file behind, and both streams are closed even when
 * the copy fails part-way.
 *
 * @param address  URL of the page to fetch
 * @param fileName name of the output file, without the ".html" extension
 */
def download(address, fileName){
    def target = new File("backup/${fileName}.html")
    // Make sure the output directory exists so the method also works on its own.
    target.parentFile.mkdirs()
    println "... writing to ${fileName}.html"
    new URL(address).withInputStream { input ->
        target.withOutputStream { output ->
            output << input
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment