yongboy/DownPage.java

## DownPage.java
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.HashSet;
import java.util.List;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.StringUtils;

/**
 * Crawls the article list pages of one IBM developerWorks (cn) content area.
 * <p>
 * Starting from a list page URL, {@link #doAction()} follows the "next page"
 * links, collects every article link found on the way, appends the links to an
 * HTML index file named after the content area and downloads each article
 * (plus the images it references with relative paths) below
 * {@link #SAVE_PATH} using a fixed pool of worker threads.
 */
public class DownPage {

	/** Host prepended to "next page" links that start with "/". */
	private static final String DOMAIN = "http://www.ibm.com";

	/** Root directory that receives the index files, articles and images. */
	private static final String SAVE_PATH = "E:/developerworks/";

	/** URL prefix shared by all articles; the remainder is mirrored below {@link #SAVE_PATH}. */
	private static final String ARTICLE_PREFIX = "http://www.ibm.com/developerworks/cn/";

	/** Number of worker threads used to download articles. */
	private static final int DOWNLOAD_THREADS = 10;

	/** Content area name: the path segment that follows "/views/" in the list URL. */
	private String name;

	/** URL of the first list page. */
	private String url;

	/**
	 * Articles still to be downloaded. Each entry is used as
	 * [article URL, article title] by {@link #doAction()}.
	 */
	private final ConcurrentLinkedQueue<List<String>> htmlDownQueue = new ConcurrentLinkedQueue<List<String>>();

	/** Marker that opens the article body inside a downloaded page. */
	String start = "<div id=\"ibm-pcon\">";

	/** Marker that follows the article body inside a downloaded page. */
	String end = "<div id=\"ibm-footer\">";

	/**
	 * Downloads the web, java, opensource and cloud content areas one after
	 * the other.
	 *
	 * @param args
	 *            ignored
	 * @throws IOException
	 *             if an index file cannot be written
	 */
	public static void main(String[] args) throws IOException {
		String[] urls = {
				"http://www.ibm.com/developerworks/cn/views/web/libraryview.jsp",
				"http://www.ibm.com/developerworks/cn/views/java/libraryview.jsp",
				"http://www.ibm.com/developerworks/cn/views/opensource/libraryview.jsp",
				"http://www.ibm.com/developerworks/cn/views/cloud/libraryview.jsp" };

		for (String url : urls) {
			new DownPage(url).doAction();
		}
	}

	/**
	 * Makes a "next page" URL safe to request: the literal query value
	 * "所有类别" is URL-encoded as UTF-8 and every space becomes "%20".
	 * <p>
	 * Example of a raw URL that needs this treatment:
	 * {@code ...libraryview.jsp?site_id=10&contentarea_by=Web development&...&type_by=所有类别&...}
	 *
	 * @param url
	 *            raw URL taken from a list page
	 * @return the escaped URL, or {@code null} if UTF-8 is reported as
	 *         unsupported
	 */
	private static String initPageUrl(String url) {
		try {
			String encodedCategory = URLEncoder.encode("所有类别", "UTF-8");
			return StringUtils.replace(url, "所有类别", encodedCategory)
					.replace(" ", "%20");
		} catch (UnsupportedEncodingException e) {
			e.printStackTrace();
			return null;
		}
	}

	/**
	 * Creates a downloader for one list page.
	 *
	 * @param url
	 *            URL of the first list page; the text between "/views/" and
	 *            the next "/" becomes the name of the index file
	 */
	public DownPage(String url) {
		this.url = url;
		name = StringUtils.substringBetween(url, "/views/", "/");
	}

	/**
	 * Walks all list pages, then writes the index file and schedules the
	 * article downloads.
	 * <p>
	 * The method returns as soon as all downloads are submitted; the worker
	 * threads keep running until their tasks are finished.
	 *
	 * @throws IOException
	 *             if the index file cannot be opened or written
	 */
	public void doAction() throws IOException {
		String pageHtmlSource = NetUtils.readUrlContent(this.url);

		// Follow the "next page" links until a page is blank or has no successor.
		while (StringUtils.isNotBlank(pageHtmlSource)) {
			analyticsHtmlPage(pageHtmlSource);
			String nextPageUrl = getNextPageUrl(pageHtmlSource);

			if (StringUtils.isBlank(nextPageUrl)) {
				break;
			}

			pageHtmlSource = NetUtils.readUrlContent(initPageUrl(nextPageUrl));
		}

		if (htmlDownQueue.isEmpty()) {
			return;
		}

		File indexFile = new File(SAVE_PATH + name + ".html");
		File indexDir = indexFile.getParentFile();
		if (indexDir != null && !indexDir.exists()) {
			// FileWriter creates the file itself but not missing directories.
			indexDir.mkdirs();
		}

		ExecutorService pool = Executors.newFixedThreadPool(DOWNLOAD_THREADS);
		BufferedWriter out = null;
		try {
			// Append mode: links of earlier runs are kept.
			out = new BufferedWriter(new FileWriter(indexFile, true));

			// poll() removes the head; peek() would hand back the same entry
			// forever and the loop would never terminate.
			List<String> strList;
			while ((strList = htmlDownQueue.poll()) != null) {
				out.write("<a href=\"" + strList.get(0) + "\">" + strList.get(1)
						+ "</a><br/>\n");
				pool.execute(new DownRunnable(strList.get(0)));
			}
			out.flush();
		} finally {
			try {
				if (out != null) {
					// Closing the buffered writer also closes the FileWriter.
					out.close();
				}
			} finally {
				// Always release the pool; already submitted tasks still run.
				pool.shutdown();
			}
		}

		System.out.println("done !");
	}

	/**
	 * Downloads one article page, stores its body below {@link #SAVE_PATH} and
	 * fetches the images the body references with relative paths.
	 */
	class DownRunnable implements Runnable {
		/** URL of the article page. */
		private String s;

		/**
		 * @param s
		 *            URL of the article page to download
		 */
		public DownRunnable(String s) {
			this.s = s;
		}

		@Override
		public void run() {
			// NOTE(review): presumably returns the two captured path parts of
			// the URL - confirm against RegUtils.getValues.
			List<String> fileNames = RegUtils
					.getValues(s,
							"http://www.ibm.com/developerworks/cn/(.*?)/(.*?)/index.html");

			if (fileNames == null || fileNames.size() < 2) {
				// URL does not have the expected shape; nothing to derive a
				// target path from.
				System.out.println("skip page with unexpected url : " + s);
				return;
			}

			if (fileNames.get(0).startsWith("data")) {
				return;
			}

			File saveFile = new File(SAVE_PATH + fileNames.get(0) + "/"
					+ fileNames.get(1) + "/index.html");
			if (saveFile.exists()) {
				// Already downloaded; checked before the network request so
				// that a re-run does not fetch the page again.
				return;
			}

			String htmlSource = NetUtils.readUrlContent(s);
			if (StringUtils.isBlank(htmlSource)) {
				return;
			}

			String content = getContent(htmlSource);
			if (content == null) {
				// Do not create a file for a page without article body,
				// otherwise it would be treated as downloaded next time.
				System.out.println("no article body found : " + s);
				return;
			}

			try {
				File saveDir = saveFile.getParentFile();
				if (!saveDir.exists()) {
					saveDir.mkdirs();
				}

				FileUtils.writeStringToFile(saveFile, content);

				for (String imgUrl : initImg(content, s)) {
					// Mirror the remote path of the image below SAVE_PATH.
					String imgPath = StringUtils.substringAfter(imgUrl,
							ARTICLE_PREFIX);

					System.out.println("down img : " + imgUrl + " with path : "
							+ SAVE_PATH + imgPath);
					NetUtils.downImageUrl(imgUrl, SAVE_PATH + imgPath);
				}
			} catch (IOException e) {
				e.printStackTrace();
			}
		}
	}

	/**
	 * Extracts the article links of one list page and adds them to
	 * {@link #htmlDownQueue}.
	 *
	 * @param pageHtmlSource
	 *            HTML of a list page; blank input is ignored
	 */
	@SuppressWarnings("deprecation")
	private void analyticsHtmlPage(String pageHtmlSource) {
		if (StringUtils.isBlank(pageHtmlSource)) {
			return;
		}

		// Group 1: article URL, group 2: article title.
		String htmlLinkeReg = "<a href=\"(http://www.ibm.com/developerworks/cn/.*?/index.html)\"><strong>(.*?)</strong></a>";

		List<List<String>> listList = RegUtils.getValuesList(pageHtmlSource,
				htmlLinkeReg, 2);

		if (listList == null || listList.isEmpty()) {
			return;
		}

		for (List<String> listStr : listList) {
			System.out.println("down page url : " + listStr.get(0));
			htmlDownQueue.add(listStr);
		}
	}

	/**
	 * Finds the link labelled "下一页" (next page) on a list page.
	 *
	 * @param pageHtmlSource
	 *            HTML of a list page
	 * @return the link target, prefixed with {@link #DOMAIN} when it starts
	 *         with "/", or {@code null} if the page is blank or has no such
	 *         link
	 */
	private String getNextPageUrl(String pageHtmlSource) {
		if (StringUtils.isBlank(pageHtmlSource)) {
			return null;
		}

		String regStr = "<a.*?href=\"(.*?)\">下一页</a>";
		String url = RegUtils.getValue(pageHtmlSource, regStr);

		if (StringUtils.isBlank(url)) {
			return null;
		}

		return url.startsWith("/") ? DOMAIN + url : url;
	}

	/**
	 * Cuts the article body out of a downloaded page.
	 *
	 * @param htmlSource
	 *            HTML of an article page
	 * @return the {@link #start} marker followed by everything up to the
	 *         {@link #end} marker, or {@code null} when the markers are not
	 *         found (instead of the text "null" glued to the start marker)
	 */
	private String getContent(String htmlSource) {
		String body = StringUtils.substringBetween(htmlSource, start, end);
		if (body == null) {
			return null;
		}

		return start + body;
	}

	/**
	 * Collects the absolute URLs of all images that the article references
	 * with a relative path.
	 *
	 * @param content
	 *            article body to scan for {@code <img>} tags
	 * @param url
	 *            URL of the article; relative image paths are resolved against
	 *            its directory
	 * @return the image URLs; empty when there are none. Images whose
	 *         {@code src} starts with "/" or "http" are not included.
	 */
	@SuppressWarnings("deprecation")
	private HashSet<String> initImg(String content, String url) {
		HashSet<String> imgDownQueue = new HashSet<String>();

		String imgReg = "<img.*?src=\"(.*?)\"[^>]*>";

		List<List<String>> imgList = RegUtils.getValuesList(content, imgReg, 1);
		if (imgList == null || imgList.isEmpty()) {
			System.out.println("imgList is null !");
			return imgDownQueue;
		}

		// The directory of the article is the same for every image.
		String imgPrefixUrl = StringUtils.substringBeforeLast(url, "/");
		if (!imgPrefixUrl.endsWith("/")) {
			imgPrefixUrl += "/";
		}

		for (List<String> imgStrs : imgList) {
			String imgPath = imgStrs.get(0);
			if (StringUtils.isBlank(imgPath)) {
				continue;
			}
			if (imgPath.startsWith("/") || imgPath.startsWith("http")) {
				continue;
			}

			imgDownQueue.add(imgPrefixUrl + imgPath);
		}

		return imgDownQueue;
	}
}
	import java.io.BufferedWriter;
	import java.io.File;
	import java.io.FileWriter;
	import java.io.IOException;
	import java.io.UnsupportedEncodingException;
	import java.net.URLEncoder;
	import java.util.HashSet;
	import java.util.List;
	import java.util.concurrent.ConcurrentLinkedQueue;
	import java.util.concurrent.ExecutorService;
	import java.util.concurrent.Executors;

	import org.apache.commons.io.FileUtils;
	import org.apache.commons.lang.StringUtils;

	public class DownPage {

	public static void main(String[] args) throws IOException {
	// String url =
	// "http://www.ibm.com/developerworks/cn/views/web/libraryview.jsp?site_id=10&contentarea_by=Web development&sort_by=&sort_order=2&start=101&end=200&topic_by=&product_by=&type_by=所有类别&show_abstract=true&search_by=&industry_by=&series_title_by=";

	// String nextPageUrl = initPageUrl(url);
	// System.out.println(NetUtils.readUrlContent(nextPageUrl));
	// System.out.println(StringUtils.substringBetween(url, "/views/",
	// "/"));

	String[] urls = {
	"http://www.ibm.com/developerworks/cn/views/web/libraryview.jsp",
	"http://www.ibm.com/developerworks/cn/views/java/libraryview.jsp",
	"http://www.ibm.com/developerworks/cn/views/opensource/libraryview.jsp",
	"http://www.ibm.com/developerworks/cn/views/cloud/libraryview.jsp" };

	for (String url : urls) {
	new DownPage(url).doAction();
	}
	}

	private static String initPageUrl(String url) {
	String nextPageUrl = null;
	try {
	nextPageUrl = StringUtils.replace(url, "所有类别",
	URLEncoder.encode("所有类别", "UTF-8")).replace(" ", "%20");
	} catch (UnsupportedEncodingException e) {
	e.printStackTrace();
	}

	return nextPageUrl;
	}

	private String name;
	private String url;
	private static String domain = "http://www.ibm.com";
	private final ConcurrentLinkedQueue<List<String>> htmlDownQueue = new ConcurrentLinkedQueue<List<String>>();
	private static String savePath = "E:/developerworks/";

	/**
	* 添加列表页面
	*
	* @param name
	* @param url
	*/
	public DownPage(String url) {
	this.url = url;
	name = StringUtils.substringBetween(url, "/views/", "/");
	}

	public void doAction() throws IOException {
	String pageHtmlSource = NetUtils.readUrlContent(this.url);

	while (StringUtils.isNotBlank(pageHtmlSource)) {
	analyticsHtmlPage(pageHtmlSource);
	String nextPageUrl = getNextPageUrl(pageHtmlSource);

	if (StringUtils.isBlank(nextPageUrl)) {
	break;
	}

	pageHtmlSource = NetUtils.readUrlContent(initPageUrl(nextPageUrl));
	}

	if (htmlDownQueue.isEmpty())
	return;

	ExecutorService pool = Executors.newFixedThreadPool(10);
	List<String> strList = null;
	File indexFile = new File(savePath + name + ".html");
	if (!indexFile.exists()) {
	indexFile.createNewFile();
	}
	FileWriter writer = new FileWriter(indexFile, true);
	BufferedWriter out = new BufferedWriter(writer);

	while ((strList = htmlDownQueue.peek()) != null) {
	out.write("<a href=\"" + strList.get(0) + "\">" + strList.get(1)
	+ "</a><br/>\n");
	pool.execute(new DownRunnable(strList.get(0)));
	}
	out.flush();
	writer.close();

	pool.shutdown();

	System.out.println("done !");
	}

	class DownRunnable implements Runnable {
	private String s;

	public DownRunnable(String s) {
	this.s = s;
	}

	@Override
	public void run() {
	String htmlSource = NetUtils.readUrlContent(s);
	if (StringUtils.isBlank(htmlSource))
	return;

	List<String> fileNames = RegUtils
	.getValues(s,
	"http://www.ibm.com/developerworks/cn/(.?)/(.?)/index.html");

	if (fileNames.get(0).startsWith("data")) {
	return;
	}

	try {
	File saveFile = new File(savePath + fileNames.get(0) + "/"
	+ fileNames.get(1) + "/index.html");
	if (saveFile.exists()) {
	return;
	}

	if (!saveFile.getParentFile().exists()) {
	saveFile.getParentFile().mkdirs();
	}

	String content = getContent(htmlSource);
	FileUtils.writeStringToFile(saveFile, content);

	HashSet<String> imgDownQueue = initImg(content, s);

	for (String imgUrl : imgDownQueue) {
	String imgPath = StringUtils.substringAfter(imgUrl,
	"http://www.ibm.com/developerworks/cn/");

	System.out.println("down img : " + imgUrl + " with path : "
	+ savePath + imgPath);
	NetUtils.downImageUrl(imgUrl, savePath + imgPath);
	}
	} catch (IOException e) {
	e.printStackTrace();
	}
	}
	}

	@SuppressWarnings("deprecation")
	private void analyticsHtmlPage(String pageHtmlSource) {
	if (StringUtils.isBlank(pageHtmlSource))
	return;

	String htmlLinkeReg = "<a href=\"(http://www.ibm.com/developerworks/cn/.?/index.html)\"><strong>(.?)</strong></a>";

	List<List<String>> listList = RegUtils.getValuesList(pageHtmlSource,
	htmlLinkeReg, 2);

	if (listList == null \|\| listList.isEmpty())
	return;

	for (List<String> listStr : listList) {
	System.out.println("down page url : " + listStr.get(0));
	htmlDownQueue.add(listStr);
	}
	}

	private String getNextPageUrl(String pageHtmlSource) {
	if (StringUtils.isBlank(pageHtmlSource))
	return null;
	String regStr = "<a.?href=\"(.?)\">下一页</a>";
	String url = RegUtils.getValue(pageHtmlSource, regStr);

	if (StringUtils.isBlank(url))
	return null;

	if (url.startsWith("/")) {
	return domain + url;
	}

	return url;
	}

	String start = "<div id=\"ibm-pcon\">";
	String end = "<div id=\"ibm-footer\">";

	private String getContent(String htmlSource) {
	String content = start
	+ StringUtils.substringBetween(htmlSource, start, end);

	return content;
	}

	@SuppressWarnings("deprecation")
	private HashSet<String> initImg(String content, String url) {
	HashSet<String> imgDownQueue = new HashSet<String>();

	String imgReg = "<img.?src=\"(.?)\"[^>]*>";

	List<List<String>> imgList = RegUtils.getValuesList(content, imgReg, 1);
	if (imgList == null \|\| imgList.isEmpty()) {
	System.out.println("imgList is null !");
	return imgDownQueue;
	}

	for (List<String> imgStrs : imgList) {
	String imgPath = imgStrs.get(0);
	if (StringUtils.isBlank(imgPath)) {
	continue;
	}
	if (imgPath.startsWith("/") \|\| imgPath.startsWith("http")) {
	continue;
	}

	String imgPrefixUrl = StringUtils.substringBeforeLast(url, "/");
	if (!imgPrefixUrl.endsWith("/")) {
	imgPrefixUrl += "/";
	}

	imgDownQueue.add(imgPrefixUrl + imgStrs.get(0));
	}

	return imgDownQueue;
	}
	}