Skip to content

Instantly share code, notes, and snippets.

@v5tech
Created January 21, 2015 09:53
Show Gist options
  • Save v5tech/5145059b89d8d040c8c9 to your computer and use it in GitHub Desktop.
Save v5tech/5145059b89d8d040c8c9 to your computer and use it in GitHub Desktop.
webmagic爬虫示例
import java.util.ArrayList;
import java.util.List;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
/**
* 爬取http://www.tianyisw.com/ 政策法规数据
* @author welcome
*
*/
/**
 * Crawls the "policies and regulations" (政策法规) section of http://www.tianyisw.com/.
 *
 * <p>List pages are matched by {@link #URL_LIST}; detail pages by {@link #URL_POST}.
 * Pagination links found in page bodies contain raw (unencoded) Chinese in the query
 * string, so they are re-encoded to the GBK percent-encoded form before being queued.
 *
 * @author welcome
 */
public class CrawlZCFG implements PageProcessor {

    /** Crawler settings: 3 retries, 1s delay between requests, desktop Chrome UA. */
    private Site site = Site.me()/*.setDomain("www.tianyisw.com")*/
            .setRetryTimes(3)
            .setSleepTime(1000)
            .setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36");

    // Matches pagination links embedded in the page body (query string holds raw Chinese text).
    private static final String PAGE_LINK = "http://www\\.tianyisw\\.com/Policies\\.asp\\?owen1=政策法规&owen2=&page=\\d+";
    // List-page request URL pattern (Chinese value already GBK percent-encoded).
    // Note: dot in "tianyisw.com" is escaped here; the original left it as a regex wildcard.
    private static final String URL_LIST = "http://www\\.tianyisw\\.com/Policies\\.asp\\?owen1=%D5%FE%B2%DF%B7%A8%B9%E6&owen2=&page=\\d+";
    // Detail-page request URL pattern.
    private static final String URL_POST = "http://www\\.tianyisw\\.com/show_Policies\\.asp\\?id=\\d+";

    /**
     * Dispatches on URL shape: list pages enqueue detail and pagination links;
     * anything else is treated as a detail page and scraped.
     */
    @Override
    public void process(Page page) {
        if (page.getUrl().regex(URL_LIST).match()) {
            // Queue every detail-page link found on this list page.
            page.addTargetRequests(page.getHtml().links().regex(URL_POST).all());
            // Pagination links carry raw Chinese in the query string; re-encode
            // to the GBK percent-encoded form the server expects before queueing.
            List<String> pageLinks = page.getHtml().links().regex(PAGE_LINK).all();
            List<String> encoded = new ArrayList<String>(pageLinks.size());
            for (String link : pageLinks) {
                encoded.add(link.replace("政策法规", "%D5%FE%B2%DF%B7%A8%B9%E6"));
            }
            page.addTargetRequests(encoded);
        } else {
            // Detail page: extract the article title and body markup.
            page.putField("title", page.getHtml().xpath("//table/tbody/tr/td[@class=\"tit\"]/strong/text()").toString());
            page.putField("body", page.getHtml().xpath("//td[@class=\"hanggao\"]").toString());
        }
    }

    @Override
    public Site getSite() {
        return site;
    }

    public static void main(String[] args) {
        Spider.create(new CrawlZCFG())
                // Seed URL: first list page of the section.
                .addUrl("http://www.tianyisw.com/Policies.asp?owen1=%D5%FE%B2%DF%B7%A8%B9%E6&owen2=&page=1")
                .thread(5)
                .run();
    }
}
import java.util.ArrayList;
import java.util.List;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
/**
* 爬取http://www.tianyisw.com/ 政策解读数据
* @author welcome
*
*/
/**
 * Crawls the "policy interpretation" (政策解读) sub-section of http://www.tianyisw.com/.
 *
 * <p>List pages are matched by {@link #URL_LIST}; detail pages by {@link #URL_POST}.
 * Pagination links found in page bodies contain raw (unencoded) Chinese in the query
 * string, so both query values are re-encoded to GBK percent-encoded form before
 * being queued.
 *
 * @author welcome
 */
public class CrawlZCJD implements PageProcessor {

    /** Crawler settings: 3 retries, 1s delay between requests, desktop Chrome UA. */
    private Site site = Site.me()/*.setDomain("www.tianyisw.com")*/
            .setRetryTimes(3)
            .setSleepTime(1000)
            .setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36");

    // Matches pagination links embedded in the page body (query string holds raw Chinese text).
    private static final String PAGE_LINK = "http://www\\.tianyisw\\.com/Policies\\.asp\\?owen1=政策法规&owen2=政策解读&page=\\d+";
    // List-page request URL pattern (Chinese values already GBK percent-encoded).
    private static final String URL_LIST = "http://www\\.tianyisw\\.com/Policies\\.asp\\?owen1=%D5%FE%B2%DF%B7%A8%B9%E6&owen2=%D5%FE%B2%DF%BD%E2%B6%C1&page=\\d+";
    // Detail-page request URL pattern.
    private static final String URL_POST = "http://www\\.tianyisw\\.com/show_Policies\\.asp\\?id=\\d+";

    /**
     * Dispatches on URL shape: list pages enqueue detail and pagination links;
     * anything else is treated as a detail page and scraped.
     */
    @Override
    public void process(Page page) {
        if (page.getUrl().regex(URL_LIST).match()) {
            // Queue every detail-page link found on this list page.
            page.addTargetRequests(page.getHtml().links().regex(URL_POST).all());
            // Pagination links carry raw Chinese in both query values; re-encode
            // to the GBK percent-encoded form the server expects before queueing.
            List<String> pageLinks = page.getHtml().links().regex(PAGE_LINK).all();
            List<String> encoded = new ArrayList<String>(pageLinks.size());
            for (String link : pageLinks) {
                String fixed = link.replace("政策法规", "%D5%FE%B2%DF%B7%A8%B9%E6");
                fixed = fixed.replace("政策解读", "%D5%FE%B2%DF%BD%E2%B6%C1");
                encoded.add(fixed);
            }
            page.addTargetRequests(encoded);
        } else {
            // Detail page: extract the article title and body markup.
            page.putField("title", page.getHtml().xpath("//table/tbody/tr/td[@class=\"tit\"]/strong/text()").toString());
            page.putField("body", page.getHtml().xpath("//td[@class=\"hanggao\"]").toString());
        }
    }

    @Override
    public Site getSite() {
        return site;
    }

    public static void main(String[] args) {
        Spider.create(new CrawlZCJD())
                // Seed URL: first list page of the sub-section.
                .addUrl("http://www.tianyisw.com/Policies.asp?owen1=%D5%FE%B2%DF%B7%A8%B9%E6&owen2=%D5%FE%B2%DF%BD%E2%B6%C1&page=1")
                .thread(5)
                .run();
    }
}
import java.util.List;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
/**
* webmagic垂直爬虫爬取深圳医院数据
* 测试类
* @author welcome
*
*/
/**
 * webmagic vertical crawler for Shenzhen hospital data (sz.91160.com).
 * Demo/test class: scraped fields go to a {@link ConsolePipeline}.
 *
 * @author welcome
 */
public class HospitalCrawl implements PageProcessor {

    public static final String URL_LIST = "http://sz\\.91160\\.com/search/index/p-\\d+\\.html";
    public static final String URL_POST = "http://sz\\.91160\\.com/unit/show/uid-\\w+\\.html";

    /** Crawler settings: 3s delay between requests, desktop Chrome UA. */
    private Site site = Site
            .me()
            .setDomain("sz.91160.com")
            .setSleepTime(3000)
            .setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36");

    @Override
    public void process(Page page) {
        boolean isListPage = page.getUrl().regex(URL_LIST).match();
        if (isListPage) {
            // Queue hospital detail pages from the search-result list, then follow pagination.
            page.addTargetRequests(page.getHtml().xpath("//div[@class=\"search_list layout\"]//ul").links().regex(URL_POST).all());
            page.addTargetRequests(page.getHtml().links().regex(URL_LIST).all());
        } else {
            extractHospital(page);
        }
    }

    /** Scrapes one hospital detail page: name, alias, address, phone, website, departments. */
    private void extractHospital(Page page) {
        page.putField("医院名称", page.getHtml().xpath("//div[2]/div[1]/h1/allText()"));
        page.putField("别名", page.getHtml().xpath("//div[2]/div[2]/div/div/ul[1]/allText()"));
        page.putField("地址", page.getHtml().xpath("//div[2]/div[2]/div/div/ul[2]/allText()"));
        page.putField("电话", page.getHtml().xpath("//div[2]/div[2]/div/div/ul[3]/allText()"));
        page.putField("官网", page.getHtml().xpath("//div[2]/div[2]/div/div/ul[4]/allText()"));
        page.putField("科室", page.getHtml().xpath("//div[3]/div/div[1]/div[2]/div[2]"));
        // Dump department links for inspection (debug output only).
        List<String> departmentLinks = page.getHtml().xpath("//div[3]/div/div[1]/div[2]/div[2]").links().regex("http://sz\\.91160\\.com/dep/show/depid-\\w+\\.html").all();
        for (String departmentLink : departmentLinks) {
            System.out.println(departmentLink);
        }
    }

    @Override
    public Site getSite() {
        return site;
    }

    public static void main(String[] args) {
        Spider.create(new HospitalCrawl())
                .addUrl("http://sz.91160.com/search/index/p-2.html")
                .addPipeline(new ConsolePipeline())
                .run();
    }
}
@v5tech
Copy link
Author

v5tech commented Jan 21, 2015

依赖的jar如下:

commons-lang3-3.3.2.jar
slf4j-log4j12-1.7.10.jar
webmagic-core-0.5.2.jar
webmagic-extension-0.5.2.jar
xsoup-0.2.4.jar

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment