Created
June 27, 2011 23:46
-
-
Save yongboy/1050139 to your computer and use it in GitHub Desktop.
DownPage.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.BufferedWriter; | |
import java.io.File; | |
import java.io.FileWriter; | |
import java.io.IOException; | |
import java.io.UnsupportedEncodingException; | |
import java.net.URLEncoder; | |
import java.util.HashSet; | |
import java.util.List; | |
import java.util.concurrent.ConcurrentLinkedQueue; | |
import java.util.concurrent.ExecutorService; | |
import java.util.concurrent.Executors; | |
import org.apache.commons.io.FileUtils; | |
import org.apache.commons.lang.StringUtils; | |
public class DownPage { | |
public static void main(String[] args) throws IOException { | |
// String url = | |
// "http://www.ibm.com/developerworks/cn/views/web/libraryview.jsp?site_id=10&contentarea_by=Web development&sort_by=&sort_order=2&start=101&end=200&topic_by=&product_by=&type_by=所有类别&show_abstract=true&search_by=&industry_by=&series_title_by="; | |
// String nextPageUrl = initPageUrl(url); | |
// System.out.println(NetUtils.readUrlContent(nextPageUrl)); | |
// System.out.println(StringUtils.substringBetween(url, "/views/", | |
// "/")); | |
String[] urls = { | |
"http://www.ibm.com/developerworks/cn/views/web/libraryview.jsp", | |
"http://www.ibm.com/developerworks/cn/views/java/libraryview.jsp", | |
"http://www.ibm.com/developerworks/cn/views/opensource/libraryview.jsp", | |
"http://www.ibm.com/developerworks/cn/views/cloud/libraryview.jsp" }; | |
for (String url : urls) { | |
new DownPage(url).doAction(); | |
} | |
} | |
private static String initPageUrl(String url) { | |
String nextPageUrl = null; | |
try { | |
nextPageUrl = StringUtils.replace(url, "所有类别", | |
URLEncoder.encode("所有类别", "UTF-8")).replace(" ", "%20"); | |
} catch (UnsupportedEncodingException e) { | |
e.printStackTrace(); | |
} | |
return nextPageUrl; | |
} | |
private String name; | |
private String url; | |
private static String domain = "http://www.ibm.com"; | |
private final ConcurrentLinkedQueue<List<String>> htmlDownQueue = new ConcurrentLinkedQueue<List<String>>(); | |
private static String savePath = "E:/developerworks/"; | |
/** | |
* 添加列表页面 | |
* | |
* @param name | |
* @param url | |
*/ | |
public DownPage(String url) { | |
this.url = url; | |
name = StringUtils.substringBetween(url, "/views/", "/"); | |
} | |
public void doAction() throws IOException { | |
String pageHtmlSource = NetUtils.readUrlContent(this.url); | |
while (StringUtils.isNotBlank(pageHtmlSource)) { | |
analyticsHtmlPage(pageHtmlSource); | |
String nextPageUrl = getNextPageUrl(pageHtmlSource); | |
if (StringUtils.isBlank(nextPageUrl)) { | |
break; | |
} | |
pageHtmlSource = NetUtils.readUrlContent(initPageUrl(nextPageUrl)); | |
} | |
if (htmlDownQueue.isEmpty()) | |
return; | |
ExecutorService pool = Executors.newFixedThreadPool(10); | |
List<String> strList = null; | |
File indexFile = new File(savePath + name + ".html"); | |
if (!indexFile.exists()) { | |
indexFile.createNewFile(); | |
} | |
FileWriter writer = new FileWriter(indexFile, true); | |
BufferedWriter out = new BufferedWriter(writer); | |
while ((strList = htmlDownQueue.peek()) != null) { | |
out.write("<a href=\"" + strList.get(0) + "\">" + strList.get(1) | |
+ "</a><br/>\n"); | |
pool.execute(new DownRunnable(strList.get(0))); | |
} | |
out.flush(); | |
writer.close(); | |
pool.shutdown(); | |
System.out.println("done !"); | |
} | |
class DownRunnable implements Runnable { | |
private String s; | |
public DownRunnable(String s) { | |
this.s = s; | |
} | |
@Override | |
public void run() { | |
String htmlSource = NetUtils.readUrlContent(s); | |
if (StringUtils.isBlank(htmlSource)) | |
return; | |
List<String> fileNames = RegUtils | |
.getValues(s, | |
"http://www.ibm.com/developerworks/cn/(.*?)/(.*?)/index.html"); | |
if (fileNames.get(0).startsWith("data")) { | |
return; | |
} | |
try { | |
File saveFile = new File(savePath + fileNames.get(0) + "/" | |
+ fileNames.get(1) + "/index.html"); | |
if (saveFile.exists()) { | |
return; | |
} | |
if (!saveFile.getParentFile().exists()) { | |
saveFile.getParentFile().mkdirs(); | |
} | |
String content = getContent(htmlSource); | |
FileUtils.writeStringToFile(saveFile, content); | |
HashSet<String> imgDownQueue = initImg(content, s); | |
for (String imgUrl : imgDownQueue) { | |
String imgPath = StringUtils.substringAfter(imgUrl, | |
"http://www.ibm.com/developerworks/cn/"); | |
System.out.println("down img : " + imgUrl + " with path : " | |
+ savePath + imgPath); | |
NetUtils.downImageUrl(imgUrl, savePath + imgPath); | |
} | |
} catch (IOException e) { | |
e.printStackTrace(); | |
} | |
} | |
} | |
@SuppressWarnings("deprecation") | |
private void analyticsHtmlPage(String pageHtmlSource) { | |
if (StringUtils.isBlank(pageHtmlSource)) | |
return; | |
String htmlLinkeReg = "<a href=\"(http://www.ibm.com/developerworks/cn/.*?/index.html)\"><strong>(.*?)</strong></a>"; | |
List<List<String>> listList = RegUtils.getValuesList(pageHtmlSource, | |
htmlLinkeReg, 2); | |
if (listList == null || listList.isEmpty()) | |
return; | |
for (List<String> listStr : listList) { | |
System.out.println("down page url : " + listStr.get(0)); | |
htmlDownQueue.add(listStr); | |
} | |
} | |
private String getNextPageUrl(String pageHtmlSource) { | |
if (StringUtils.isBlank(pageHtmlSource)) | |
return null; | |
String regStr = "<a.*?href=\"(.*?)\">下一页</a>"; | |
String url = RegUtils.getValue(pageHtmlSource, regStr); | |
if (StringUtils.isBlank(url)) | |
return null; | |
if (url.startsWith("/")) { | |
return domain + url; | |
} | |
return url; | |
} | |
String start = "<div id=\"ibm-pcon\">"; | |
String end = "<div id=\"ibm-footer\">"; | |
private String getContent(String htmlSource) { | |
String content = start | |
+ StringUtils.substringBetween(htmlSource, start, end); | |
return content; | |
} | |
@SuppressWarnings("deprecation") | |
private HashSet<String> initImg(String content, String url) { | |
HashSet<String> imgDownQueue = new HashSet<String>(); | |
String imgReg = "<img.*?src=\"(.*?)\"[^>]*>"; | |
List<List<String>> imgList = RegUtils.getValuesList(content, imgReg, 1); | |
if (imgList == null || imgList.isEmpty()) { | |
System.out.println("imgList is null !"); | |
return imgDownQueue; | |
} | |
for (List<String> imgStrs : imgList) { | |
String imgPath = imgStrs.get(0); | |
if (StringUtils.isBlank(imgPath)) { | |
continue; | |
} | |
if (imgPath.startsWith("/") || imgPath.startsWith("http")) { | |
continue; | |
} | |
String imgPrefixUrl = StringUtils.substringBeforeLast(url, "/"); | |
if (!imgPrefixUrl.endsWith("/")) { | |
imgPrefixUrl += "/"; | |
} | |
imgDownQueue.add(imgPrefixUrl + imgStrs.get(0)); | |
} | |
return imgDownQueue; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment