Skip to content

Instantly share code, notes, and snippets.

@bluishoul
Created December 29, 2012 08:09
Show Gist options
  • Save bluishoul/4405363 to your computer and use it in GitHub Desktop.
Save bluishoul/4405363 to your computer and use it in GitHub Desktop.
Spider
package common;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import org.apache.commons.codec.StringDecoder;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.httpclient.methods.PostMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.apache.commons.lang.SystemUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Small static helper for fetching web pages over HTTP, parsing them with
 * jsoup, and posting scraped articles to a configurable endpoint.
 *
 * <p>Proxy settings are held in static fields and published through JVM-wide
 * system properties, so they affect every HTTP connection in the process.
 * The class performs no synchronization on that shared state.
 *
 * @author Administrator
 */
public class Spider {

    /** Endpoint that {@link #postToOschina(int, Article, String)} posts to. Non-final so it can be reconfigured. */
    public static String OSC_ARTICLE_POST_URL = "http://localhost/action/spider/postArticle";

    /**
     * User-Agent sent by {@link #postToOschina(int, Article, String)}.
     * The name is misspelled ("AGNET"); it is kept as-is because it is public.
     */
    public final static String USER_AGNET = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.19 (KHTML, like Gecko) Chrome/25.0.1321.0 Safari/537.19";

    /** Marker ("[pinned]") stripped from article titles before posting. */
    private static final String REMOVE_PREFIX = "[置顶]";

    /** User-Agent sent when fetching pages; imitates a browser so the request is less likely to be blocked. */
    private static final String FETCH_USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.52 Safari/536.5";

    /** Charset used by {@link #GetDocument(String...)} when none is supplied. */
    private static final String DEFAULT_DOCUMENT_CHARSET = "UTF-8";

    /** Proxy server address. */
    private static String proxyHost;

    /** Proxy server port. */
    private static String proxyPort;

    /** Proxy server user name. */
    private static String proxyUser;

    /** Proxy server password. */
    private static String proxyPassword;

    /**
     * Fetches a web page and returns its body as text.
     *
     * <p>If the URL has no {@code http://} or {@code https://} prefix,
     * {@code http://} is prepended. Line terminators in the response are
     * normalized to {@code "\r\n"}, and every line (including the last) is
     * followed by one.
     *
     * @param urlString URL to fetch
     * @param charset   charset used to decode the response body
     * @param timeout   connect and read timeout in milliseconds (0 means no timeout)
     * @return the page content; {@code null} if {@code urlString} is null or
     *         empty, or if the response status could not be obtained; the
     *         literal message string below if the status is not 200
     * @throws IOException if the URL is malformed, the connection cannot be
     *                     opened, or reading the body fails
     */
    public static String GetWebContent(String urlString, final String charset,
            int timeout) throws IOException {
        if (isEmpty(urlString)) {
            return null;
        }
        if (!urlString.startsWith("http://") && !urlString.startsWith("https://")) {
            urlString = "http://" + urlString;
        }

        // Publish the proxy configuration before the connection is made.
        GetProxy();

        URL url = new URL(urlString);
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        try {
            conn.setRequestProperty("User-Agent", FETCH_USER_AGENT);
            // Only accept text/html; other types (images, pdf, */*) could be listed here too.
            conn.setRequestProperty("Accept", "text/html");
            conn.setConnectTimeout(timeout);
            // Without a read timeout a server that accepts the connection but
            // never answers would block the caller indefinitely.
            conn.setReadTimeout(timeout);

            int status;
            try {
                status = conn.getResponseCode();
            } catch (IOException e) {
                System.err.println("网络出现问题");
                return null;
            }
            if (status != HttpURLConnection.HTTP_OK) {
                // Existing contract: non-200 responses yield this message as the "content".
                return "请坚持网络连接";
            }

            InputStream input = conn.getInputStream();
            BufferedReader reader = new BufferedReader(new InputStreamReader(input, charset));
            try {
                StringBuilder content = new StringBuilder();
                String line;
                while ((line = reader.readLine()) != null) {
                    content.append(line).append("\r\n");
                }
                return content.toString();
            } finally {
                reader.close();
            }
        } finally {
            // Runs on every exit path, including the early returns above.
            conn.disconnect();
        }
    }

    /**
     * Fetches a page and parses it into a jsoup {@link Document}.
     *
     * <p>Because this delegates to {@link #GetWebContent(String, String)}, a
     * failed fetch does not raise an error: the fallback text returned by that
     * method is parsed instead.
     *
     * @param url {@code url[0]} is the address to fetch; the optional
     *            {@code url[1]} is the page charset (defaults to UTF-8)
     * @return the parsed document
     * @throws IOException              if fetching the page fails
     * @throws IllegalArgumentException if no URL is supplied
     */
    public static Document GetDocument(String... url) throws IOException {
        if (url == null || url.length == 0 || url[0] == null) {
            throw new IllegalArgumentException("GetDocument requires a URL as its first argument");
        }
        String charset = (url.length > 1 && url[1] != null) ? url[1] : DEFAULT_DOCUMENT_CHARSET;
        String html = Spider.GetWebContent(url[0], charset);
        return Jsoup.parse(html);
    }

    /**
     * Fetches a web page, decoding it as ISO-8859-1 with a 5 second timeout.
     *
     * @param urlString URL to fetch
     * @return the page content, as described by
     *         {@link #GetWebContent(String, String, int)}
     * @throws IOException if fetching fails
     */
    public static String GetWebContent(String urlString) throws IOException {
        return GetWebContent(urlString, "iso-8859-1", 5000);
    }

    /**
     * Fetches a web page with the given charset and a 5 second timeout.
     *
     * @param urlString   URL to fetch
     * @param pageCharset charset of the target page
     * @return the page content, or {@code urlString} itself when no content
     *         was obtained (null or empty result)
     * @throws IOException if fetching fails
     */
    public static String GetWebContent(String urlString, String pageCharset)
            throws IOException {
        String strHTML = GetWebContent(urlString, pageCharset, 5000);
        if (isEmpty(strHTML)) {
            return urlString;
        }
        return strHTML;
    }

    /**
     * Configures the proxy server without credentials.
     *
     * @param proxyHost proxy server address
     * @param proxyPort proxy server port
     */
    public static void SetProxy(String proxyHost, String proxyPort) {
        SetProxy(proxyHost, proxyPort, null, null);
    }

    /**
     * Configures the proxy server.
     *
     * <p>Credentials are stored only when a non-empty password is given;
     * otherwise any previously stored credentials are cleared so they are not
     * reused with the newly configured proxy.
     *
     * @param sproxyHost     proxy server address
     * @param sproxyPort     proxy server port
     * @param sproxyUser     proxy server user name
     * @param sproxyPassword proxy server password
     */
    public static void SetProxy(String sproxyHost, String sproxyPort,
            String sproxyUser, String sproxyPassword) {
        proxyHost = sproxyHost;
        proxyPort = sproxyPort;
        if (!isEmpty(sproxyPassword)) {
            proxyUser = sproxyUser;
            proxyPassword = sproxyPassword;
        } else {
            proxyUser = null;
            proxyPassword = null;
        }
    }

    /**
     * Publishes the configured proxy as JVM system properties.
     *
     * <p>Does nothing when no proxy host is configured. When a host is
     * configured, the port and credential properties are either set or
     * removed so that values from an earlier configuration do not linger.
     *
     * <p>NOTE(review): {@code http.proxyUser}/{@code http.proxyPassword} are
     * not among the JDK's documented networking properties; proxy
     * authentication probably needs a {@code java.net.Authenticator} - confirm.
     * Only {@code http.*} properties are written, so {@code https://} URLs are
     * presumably not proxied - confirm.
     *
     * @return the system properties after the update, or {@code null} when no
     *         proxy host is configured
     */
    private static Properties GetProxy() {
        if (isEmpty(proxyHost)) {
            return null;
        }
        Properties props = System.getProperties();
        // Address of the proxy server used for http access.
        props.setProperty("http.proxyHost", proxyHost);
        // Port of the proxy server; Properties rejects null values, so an
        // unset port is removed rather than stored.
        setOrClear(props, "http.proxyPort", proxyPort);
        // User name and password.
        boolean hasCredentials = !isEmpty(proxyUser) && proxyPassword != null;
        setOrClear(props, "http.proxyUser", hasCredentials ? proxyUser : null);
        setOrClear(props, "http.proxyPassword", hasCredentials ? proxyPassword : null);
        return props;
    }

    /** Sets {@code key} to {@code value}, or removes {@code key} when the value is null or empty. */
    private static void setOrClear(Properties props, String key, String value) {
        if (isEmpty(value)) {
            props.remove(key);
        } else {
            props.setProperty(key, value);
        }
    }

    /** Returns true when {@code s} is null or has zero length. */
    private static boolean isEmpty(String s) {
        return s == null || s.length() == 0;
    }

    /**
     * Parses an HTML string and returns the elements carrying a CSS class.
     *
     * @param html      HTML markup to parse
     * @param className class name to look for
     * @return a new mutable list of the matching elements, in the order jsoup
     *         reports them
     */
    public static List<Element> GetElementByClassName(String html,
            String className) {
        Document doc = Jsoup.parse(html);
        return new ArrayList<Element>(doc.getElementsByClass(className));
    }

    /**
     * Posts an article to {@link #OSC_ARTICLE_POST_URL} as a form submission.
     *
     * <p>The title is passed through {@code HtmlUtil.filterUserInputContent}
     * and has the {@link #REMOVE_PREFIX} marker removed before sending.
     *
     * @param uid    id of the user the article is posted as
     * @param art    article providing title, link and content
     * @param module value sent as the {@code module} form field
     * @return the response body when the server answers 200; an empty string
     *         for any other status, an empty body, or any failure
     */
    public static String postToOschina(int uid, Article art, String module) {
        HttpClient client = new HttpClient();
        client.getParams().setParameter(HttpMethodParams.HTTP_CONTENT_CHARSET,
                "UTF-8");
        client.getParams().setParameter(HttpMethodParams.USER_AGENT, USER_AGNET);

        String title = HtmlUtil.filterUserInputContent(art.getTitle())
                .replace(REMOVE_PREFIX, "");
        NameValuePair[] article = {
                new NameValuePair("uid", String.valueOf(uid)),
                new NameValuePair("title", title),
                new NameValuePair("link", art.getLink()),
                new NameValuePair("content", art.getContent()),
                new NameValuePair("module", module)};

        PostMethod pm = new PostMethod(OSC_ARTICLE_POST_URL);
        pm.setRequestBody(article);
        String info = "";
        try {
            client.executeMethod(pm);
            int code = pm.getStatusCode();
            if (code == HttpStatus.SC_OK) {
                String body = pm.getResponseBodyAsString();
                // A missing body is reported as "" so callers never see null.
                info = (body == null) ? "" : body;
                System.err.println("Spider:" + info);
            }
            System.out.println("Spider:the post return value"
                    + pm.getStatusLine());
        } catch (Exception e) {
            // Best effort: any failure is reported and mapped to an empty result.
            e.printStackTrace();
            return "";
        } finally {
            // Return the connection even when the request or body read fails.
            pm.releaseConnection();
        }
        return info;
    }

    /** Demo entry point: fetches a sample page and prints the parsed document. */
    public static void main(String[] args) throws Exception {
        System.out.println(Spider.GetDocument("http://tieba.baidu.com/index.html"));
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment