Skip to content

Instantly share code, notes, and snippets.

@yongboy
Created June 27, 2011 23:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save yongboy/1050139 to your computer and use it in GitHub Desktop.
Save yongboy/1050139 to your computer and use it in GitHub Desktop.
DownPage.java
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.HashSet;
import java.util.List;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.StringUtils;
/**
 * Downloads the articles linked from one developerWorks (cn) "libraryview"
 * listing.
 *
 * <p>For a listing URL the crawler follows the "next page" links, collects
 * every article link it finds, appends those links to a local index file
 * named after the listing section, and then downloads each article (plus the
 * images it references with relative paths) on a fixed-size thread pool.
 *
 * <p>Fetching and regular-expression extraction are delegated to the
 * project helpers {@code NetUtils} and {@code RegUtils}, whose exact
 * contracts are not visible from this file.
 */
public class DownPage {

    /** Prefix prepended to site-relative "next page" links. */
    private static final String DOMAIN = "http://www.ibm.com";

    /** Common URL prefix of every article; stripped to build local paths. */
    private static final String ARTICLE_ROOT = "http://www.ibm.com/developerworks/cn/";

    /** Root directory under which the index files, pages and images are stored. */
    private static final String SAVE_PATH = "E:/developerworks/";

    /** Number of worker threads used to download articles. */
    private static final int DOWNLOAD_THREADS = 10;

    /** Section name taken from the listing URL (the path segment after "/views/"). */
    private String name;

    /** URL of the first listing page. */
    private String url;

    /**
     * Article links collected from the listing pages. Each entry holds the
     * article URL at index 0 and the link title at index 1.
     */
    private final ConcurrentLinkedQueue<List<String>> htmlDownQueue = new ConcurrentLinkedQueue<List<String>>();

    /** Marker that opens the article body inside a downloaded page. */
    String start = "<div id=\"ibm-pcon\">";

    /** Marker that follows the article body inside a downloaded page. */
    String end = "<div id=\"ibm-footer\">";

    /**
     * Crawls the web, java, opensource and cloud listings one after another.
     *
     * @param args ignored
     * @throws IOException if an index file cannot be written
     */
    public static void main(String[] args) throws IOException {
        String[] urls = {
                "http://www.ibm.com/developerworks/cn/views/web/libraryview.jsp",
                "http://www.ibm.com/developerworks/cn/views/java/libraryview.jsp",
                "http://www.ibm.com/developerworks/cn/views/opensource/libraryview.jsp",
                "http://www.ibm.com/developerworks/cn/views/cloud/libraryview.jsp" };
        for (String url : urls) {
            new DownPage(url).doAction();
        }
    }

    /**
     * Makes a "next page" link usable as a request URL: the literal
     * {@code type_by} value "所有类别" is URL-encoded as UTF-8 and every space
     * (for example in {@code contentarea_by=Web development}) becomes
     * {@code %20}. Nothing else in the URL is touched.
     *
     * @param url the raw link taken from the listing page
     * @return the escaped URL
     */
    private static String initPageUrl(String url) {
        try {
            return StringUtils.replace(url, "所有类别",
                    URLEncoder.encode("所有类别", "UTF-8")).replace(" ", "%20");
        } catch (UnsupportedEncodingException e) {
            // Every Java platform is required to support UTF-8, so this
            // cannot happen; fail loudly rather than hand back a null URL.
            throw new IllegalStateException("UTF-8 encoding is not available", e);
        }
    }

    /**
     * Creates a crawler for one listing page.
     *
     * @param url URL of the first listing page; the section name is the path
     *            segment between "/views/" and the following "/"
     */
    public DownPage(String url) {
        this.url = url;
        name = StringUtils.substringBetween(url, "/views/", "/");
    }

    /**
     * Walks the listing pages, appends every collected article link to
     * {@code <SAVE_PATH><name>.html} and schedules each article for download.
     *
     * <p>The method returns once all downloads have been submitted; the
     * worker threads may still be running when "done !" is printed.
     *
     * @throws IOException if the index file cannot be opened or written
     */
    public void doAction() throws IOException {
        String pageHtmlSource = NetUtils.readUrlContent(this.url);
        while (StringUtils.isNotBlank(pageHtmlSource)) {
            analyticsHtmlPage(pageHtmlSource);
            String nextPageUrl = getNextPageUrl(pageHtmlSource);
            if (StringUtils.isBlank(nextPageUrl)) {
                break;
            }
            pageHtmlSource = NetUtils.readUrlContent(initPageUrl(nextPageUrl));
        }
        if (htmlDownQueue.isEmpty()) {
            return;
        }

        File indexFile = new File(SAVE_PATH + name + ".html");
        File indexDir = indexFile.getParentFile();
        if (indexDir != null && !indexDir.exists()) {
            // FileWriter creates the file itself but not missing directories.
            indexDir.mkdirs();
        }

        ExecutorService pool = Executors.newFixedThreadPool(DOWNLOAD_THREADS);
        try {
            BufferedWriter out = new BufferedWriter(new FileWriter(indexFile, true));
            try {
                List<String> entry;
                // poll() removes the head; peek() would return the same
                // entry forever and the loop would never terminate.
                while ((entry = htmlDownQueue.poll()) != null) {
                    out.write("<a href=\"" + entry.get(0) + "\">" + entry.get(1)
                            + "</a><br/>\n");
                    pool.execute(new DownRunnable(entry.get(0)));
                }
            } finally {
                // Closing the buffered writer flushes it and closes the file.
                out.close();
            }
        } finally {
            // Always release the pool, otherwise its non-daemon threads keep
            // the JVM alive. Already submitted downloads still run to the end.
            pool.shutdown();
        }
        System.out.println("done !");
    }

    /**
     * Downloads one article page, stores its content section locally and
     * fetches the images the content references with relative paths.
     */
    class DownRunnable implements Runnable {

        /** URL of the article to download. */
        private String s;

        /**
         * @param s URL of the article to download
         */
        public DownRunnable(String s) {
            this.s = s;
        }

        @Override
        public void run() {
            // The two captured path segments become the local directory.
            List<String> fileNames = RegUtils
                    .getValues(s,
                            "http://www.ibm.com/developerworks/cn/(.*?)/(.*?)/index.html");
            if (fileNames == null || fileNames.size() < 2) {
                // The listing pattern also accepts URLs with a single path
                // segment, which this pattern does not match.
                return;
            }
            if (fileNames.get(0).startsWith("data")) {
                return;
            }
            File saveFile = new File(SAVE_PATH + fileNames.get(0) + "/"
                    + fileNames.get(1) + "/index.html");
            if (saveFile.exists()) {
                // Already downloaded; checked before fetching to save a request.
                return;
            }

            String htmlSource = NetUtils.readUrlContent(s);
            if (StringUtils.isBlank(htmlSource)) {
                return;
            }
            try {
                if (!saveFile.getParentFile().exists()) {
                    saveFile.getParentFile().mkdirs();
                }
                String content = getContent(htmlSource);
                FileUtils.writeStringToFile(saveFile, content);
                HashSet<String> imgDownQueue = initImg(content, s);
                for (String imgUrl : imgDownQueue) {
                    // Mirror the remote layout below the save directory.
                    String imgPath = StringUtils.substringAfter(imgUrl,
                            ARTICLE_ROOT);
                    System.out.println("down img : " + imgUrl + " with path : "
                            + SAVE_PATH + imgPath);
                    NetUtils.downImageUrl(imgUrl, SAVE_PATH + imgPath);
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Extracts the article links from one listing page and queues them for
     * download.
     *
     * @param pageHtmlSource HTML of the listing page; blank input is ignored
     */
    @SuppressWarnings("deprecation")
    private void analyticsHtmlPage(String pageHtmlSource) {
        if (StringUtils.isBlank(pageHtmlSource)) {
            return;
        }
        // Group 1 is the article URL, group 2 the link title.
        String htmlLinkReg = "<a href=\"(http://www.ibm.com/developerworks/cn/.*?/index.html)\"><strong>(.*?)</strong></a>";
        List<List<String>> listList = RegUtils.getValuesList(pageHtmlSource,
                htmlLinkReg, 2);
        if (listList == null || listList.isEmpty()) {
            return;
        }
        for (List<String> listStr : listList) {
            System.out.println("down page url : " + listStr.get(0));
            htmlDownQueue.add(listStr);
        }
    }

    /**
     * Finds the link labelled "下一页" (next page) on a listing page.
     *
     * @param pageHtmlSource HTML of the listing page
     * @return the absolute URL of the next page, or {@code null} when the
     *         page is blank or has no such link
     */
    private String getNextPageUrl(String pageHtmlSource) {
        if (StringUtils.isBlank(pageHtmlSource)) {
            return null;
        }
        String regStr = "<a.*?href=\"(.*?)\">下一页</a>";
        String url = RegUtils.getValue(pageHtmlSource, regStr);
        if (StringUtils.isBlank(url)) {
            return null;
        }
        if (url.startsWith("/")) {
            // Site-relative link: make it absolute.
            return DOMAIN + url;
        }
        return url;
    }

    /**
     * Cuts the article body out of a downloaded page.
     *
     * @param htmlSource full HTML of the article page
     * @return the text from the {@link #start} marker up to (not including)
     *         the {@link #end} marker; the whole page when the markers are
     *         not both present
     */
    private String getContent(String htmlSource) {
        String body = StringUtils.substringBetween(htmlSource, start, end);
        if (body == null) {
            // substringBetween yields null when a marker is missing;
            // concatenating it would store the literal text "null".
            return htmlSource;
        }
        return start + body;
    }

    /**
     * Collects the URLs of the images that the article content references
     * with relative paths.
     *
     * @param content article HTML to scan for {@code <img>} tags
     * @param url     URL of the article; its directory part is the base for
     *                the relative image paths
     * @return absolute image URLs, empty when no image qualifies
     */
    @SuppressWarnings("deprecation")
    private HashSet<String> initImg(String content, String url) {
        HashSet<String> imgDownQueue = new HashSet<String>();
        String imgReg = "<img.*?src=\"(.*?)\"[^>]*>";
        List<List<String>> imgList = RegUtils.getValuesList(content, imgReg, 1);
        if (imgList == null || imgList.isEmpty()) {
            System.out.println("imgList is null !");
            return imgDownQueue;
        }
        // The base is the same for every image: the article URL without its
        // last path segment, terminated by a slash.
        String imgPrefixUrl = StringUtils.substringBeforeLast(url, "/");
        if (!imgPrefixUrl.endsWith("/")) {
            imgPrefixUrl += "/";
        }
        for (List<String> imgStrs : imgList) {
            String imgPath = imgStrs.get(0);
            if (StringUtils.isBlank(imgPath)) {
                continue;
            }
            // Site-absolute and fully qualified sources are not downloaded.
            if (imgPath.startsWith("/") || imgPath.startsWith("http")) {
                continue;
            }
            imgDownQueue.add(imgPrefixUrl + imgPath);
        }
        return imgDownQueue;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment