Last active
October 25, 2017 01:39
-
-
Save AhianZhang/51f545998eb698c32d00bd92804126f2 to your computer and use it in GitHub Desktop.
jsoup爬虫简单演示
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import org.apache.http.HttpHost; | |
import org.apache.http.HttpResponse; | |
import org.apache.http.client.HttpClient; | |
import org.apache.http.client.methods.HttpGet; | |
import org.apache.http.conn.params.ConnRouteParams; | |
import org.apache.http.impl.client.DefaultHttpClient; | |
import org.apache.http.params.CoreConnectionPNames; | |
import org.apache.http.util.EntityUtils; | |
/**利用httpclient爬取相关数据 | |
* Created by 张巍瀚 on 2017/6/4. | |
*/ | |
public class demo | |
{ | |
public static void main(String[] args) throws Exception | |
{ //创建HttpClient对象 | |
HttpClient httpClient=new DefaultHttpClient(); | |
//创建爬虫所需要的Get请求地址 本案例只是一个演示流程,所以不涉及有关网络协议问题以及证书问题 | |
HttpGet httpGet =new HttpGet("http://www.12306.cn/mormhweb/"); | |
//设置响应时间 传输时间 设置代理服务器 | |
// httpClient.getParams().setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT,10000);//连接时间 | |
// httpClient.getParams().setParameter(CoreConnectionPNames.SO_TIMEOUT,10000); | |
// httpClient.getParams().setParameter(ConnRouteParams.DEFAULT_PROXY,new HttpHost("222.218.208.104",8998)); | |
//向服务器请求获取相应代码 | |
HttpResponse httpResponse= httpClient.execute(httpGet); | |
// 利用工具将所获得的代码转换成字符串 | |
String content= EntityUtils.toString(httpResponse.getEntity(),"utf-8"); | |
System.out.println(content); | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import org.jsoup.Jsoup; | |
import org.jsoup.nodes.Document; | |
import org.jsoup.nodes.Element; | |
import org.jsoup.select.Elements; | |
/**利用Jsoup解析网页 | |
* Created by 张巍瀚 on 2017/6/5. | |
*/ | |
public class jsoupDemo | |
{ | |
public static void main(String[] args)throws Exception | |
{ | |
Document doc=Jsoup.connect("http://www.12306.cn/mormhweb/").get(); | |
// Elements element= doc.getElementsByTag("a"); | |
// System.out.println(element.text()); | |
Elements element= doc.select("ul.menubg a"); | |
// System.out.println(element); | |
for(Element elements:element){ | |
System.out.println(elements.text()+":"+elements.attr("href")); | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
对于HttpClient版本不同,写法也不同,本人刚入门,代码也是有所参考的