Skip to content

Instantly share code, notes, and snippets.

@shikaiwen
Created February 26, 2017 14:54
Show Gist options
  • Save shikaiwen/29421bba3ea3aae2424a18869f622f2e to your computer and use it in GitHub Desktop.
Save shikaiwen/29421bba3ea3aae2424a18869f622f2e to your computer and use it in GitHub Desktop.
MongoWorker 文件
package com.kevin.spider;
import com.mongodb.DB;
import com.mongodb.Mongo;
import com.mongodb.MongoClient;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoDatabase;
import org.bson.Document;
/**
 * Small helper around the MongoDB Java driver used by the spider.
 *
 * <p>Background reading: http://www.cnblogs.com/yjmyzz/p/3865175.html
 *
 * <p>Created by root on 2/26/2017.
 */
public class MongoWorker {

    /** Host name of the MongoDB server that every database handle is opened against. */
    private static final String HOST = "localhost";

    /** Port of the MongoDB server. */
    private static final int PORT = 27017;

    /**
     * Shared client, created on first use. A {@code MongoClient} owns its own
     * connection pool, so building a fresh one per call (as this class used to)
     * leaked a pool and its background threads on every lookup.
     */
    private static MongoClient client;

    /**
     * Returns a handle to the named database on {@code localhost:27017}.
     *
     * <p>All calls share a single lazily created {@link MongoClient}; the method is
     * synchronized so that concurrent first calls cannot create two clients.
     *
     * @param db name of the database to open
     * @return the driver's handle for that database
     */
    public static synchronized MongoDatabase getDB(String db) {
        if (client == null) {
            client = new MongoClient(HOST, PORT);
        }
        return client.getDatabase(db);
    }

    /**
     * Resolves the {@code province} collection of the {@code test} database.
     *
     * <p>NOTE(review): despite the name, nothing is written yet; the collection
     * handle is obtained and then discarded.
     */
    public void insert() {
        MongoDatabase test = getDB("test");
        MongoCollection<Document> province = test.getCollection("province");
    }
}
package com.kevin.spider;
import com.alibaba.fastjson.JSON;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.*;
/**
 * Scrapes the region tables reachable from the stats.gov.cn 2015 index page.
 *
 * <p>Each {@code get*List} method downloads one page, picks the table rows that
 * match a CSS selector and turns every row into a {@code Map} with the keys
 * {@code "name"}, {@code "code"} and, when the row links to a further page,
 * {@code "nextUrl"}.
 *
 * <p>Created by root on 2/26/2017.
 */
public class ProvinceCityCountySpider {

    /** Entry page from which the province links are read. */
    private static final String INDEX_URL =
            "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2015/index.html";

    /**
     * Fetches the province list and then the city list of the first province.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        ProvinceCityCountySpider spider = new ProvinceCityCountySpider();
        List<Map> provinces = spider.getProvince();
        if (provinces.isEmpty()) {
            // Happens when the index page could not be fetched or matched no cells.
            System.err.println("No provinces found at " + INDEX_URL);
            return;
        }
        Map first = provinces.get(0);
        spider.getCityList(first.get("nextUrl").toString());
    }

    /**
     * Downloads and parses the page at {@code url}.
     *
     * @param url address of the page to fetch
     * @return the parsed document, or {@code null} when the request fails with an
     *         {@link IOException} (the failure is reported on standard error)
     */
    public Document getDocument(String url) {
        try {
            return Jsoup.connect(url).get();
        } catch (IOException e) {
            // Best effort: report and let the caller decide what to do with null.
            System.err.println("Failed to fetch " + url);
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Placeholder with no implementation yet.
     *
     * @param url currently ignored
     */
    public void getCity(String url) {
    }

    /**
     * Reads the rows matched by {@code .villagetable .villagetr} on the given page.
     *
     * @param url page to scrape
     * @return one {@code Map} per row; empty when the page could not be fetched
     */
    public List getVillageList(String url) {
        return scrapeRows(url, ".villagetable .villagetr");
    }

    /**
     * Reads the rows matched by {@code .towntable .towntr} on the given page.
     *
     * @param url page to scrape
     * @return one {@code Map} per row; empty when the page could not be fetched
     */
    public List getTownList(String url) {
        return scrapeRows(url, ".towntable .towntr");
    }

    /**
     * Reads the rows matched by {@code .countytable .countytr} on the given page.
     *
     * @param url page to scrape
     * @return one {@code Map} per row; empty when the page could not be fetched
     */
    public List getCountyList(String url) {
        return scrapeRows(url, ".countytable .countytr");
    }

    /**
     * Reads the rows matched by {@code .citytable .citytr} on the given page.
     *
     * @param url page to scrape
     * @return one {@code Map} per row; empty when the page could not be fetched
     */
    public List getCityList(String url) {
        return scrapeRows(url, ".citytable .citytr");
    }

    /**
     * Reads every province link from the index page.
     *
     * @return one {@code Map} per linked cell with the keys {@code "name"} and
     *         {@code "nextUrl"}; empty when the index page could not be fetched
     */
    public List<Map> getProvince() {
        List<Map> provList = new ArrayList<>();
        Document document = getDocument(INDEX_URL);
        if (document == null) {
            return provList;
        }
        String baseUrl = getUrlPath(INDEX_URL);
        Elements cells = document.select(".provincetable .provincetr td");
        for (Element cell : cells) {
            if (cell.children().isEmpty()) {
                // A cell without a nested element has no link to follow.
                continue;
            }
            Element link = cell.child(0);
            Map<String, String> province = new HashMap<>();
            province.put("name", link.text());
            province.put("nextUrl", baseUrl + link.attr("href"));
            provList.add(province);
        }
        return provList;
    }

    /**
     * Returns {@code allUrl} up to and including its last {@code '/'}.
     *
     * <p>A URL that already ends in {@code '/'} is returned unchanged; a string
     * without any {@code '/'} yields the empty string.
     *
     * @param allUrl full URL of a page
     * @return the directory part of the URL, used as prefix for relative links
     */
    String getUrlPath(String allUrl) {
        return allUrl.substring(0, allUrl.lastIndexOf('/') + 1);
    }

    /**
     * Shared implementation of the {@code get*List} methods: fetches {@code url},
     * selects rows with {@code rowSelector} and maps the first cell to
     * {@code "code"} and the second cell to {@code "name"}.
     *
     * <p>Rows with fewer than two cells are skipped. {@code "nextUrl"} is only
     * present when the name cell wraps its text in a nested element.
     */
    private List<Map> scrapeRows(String url, String rowSelector) {
        List<Map> result = new ArrayList<>();
        Document document = getDocument(url);
        if (document == null) {
            return result;
        }
        String basePath = getUrlPath(url);
        Elements rows = document.select(rowSelector);
        for (Element row : rows) {
            if (row.children().size() < 2) {
                continue;
            }
            Element codeCell = row.child(0);
            Element nameCell = row.child(1);

            Map<String, String> entry = new HashMap<>();
            entry.put("name", firstChildOrSelf(nameCell).text());
            entry.put("code", firstChildOrSelf(codeCell).text());
            if (!nameCell.children().isEmpty()) {
                entry.put("nextUrl", basePath + nameCell.child(0).attr("href"));
            }
            result.add(entry);
        }
        return result;
    }

    /**
     * Returns the cell's first nested element, or the cell itself when it has
     * none, so that plain-text cells no longer make {@code child(0)} throw.
     */
    private static Element firstChildOrSelf(Element cell) {
        return cell.children().isEmpty() ? cell : cell.child(0);
    }
}
<!-- Crawler (WebMagic) -->
<!-- NOTE(review): ProvinceCityCountySpider imports org.jsoup and com.alibaba.fastjson directly, yet neither is declared here; presumably they resolve transitively through webmagic. Confirm, or declare them explicitly. -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.6.1</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.6.1</version>
</dependency>
<!-- MongoDB Java driver -->
<dependency>
<groupId>org.mongodb</groupId>
<artifactId>mongo-java-driver</artifactId>
<version>3.4.0</version>
</dependency>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment