Created
February 26, 2017 14:54
-
-
Save shikaiwen/29421bba3ea3aae2424a18869f622f2e to your computer and use it in GitHub Desktop.
MongoWorker 文件
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package com.kevin.spider; | |
import com.mongodb.DB; | |
import com.mongodb.Mongo; | |
import com.mongodb.MongoClient; | |
import com.mongodb.client.MongoCollection; | |
import com.mongodb.client.MongoDatabase; | |
import org.bson.Document; | |
/** | |
* http://www.cnblogs.com/yjmyzz/p/3865175.html | |
* | |
* Created by root on 2/26/2017. | |
*/ | |
public class MongoWorker { | |
public static MongoDatabase getDB(String db){ | |
// Mongo mongo = new Mongo("localhost", 27017); | |
MongoClient client = new MongoClient("localhost", 27017); | |
MongoDatabase database = client.getDatabase(db); | |
return database; | |
} | |
public void insert(){ | |
MongoDatabase test = getDB("test"); | |
MongoCollection<Document> province = test.getCollection("province"); | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package com.kevin.spider; | |
import com.alibaba.fastjson.JSON; | |
import org.jsoup.Jsoup; | |
import org.jsoup.nodes.Document; | |
import org.jsoup.nodes.Element; | |
import org.jsoup.select.Elements; | |
import java.io.IOException; | |
import java.util.*; | |
/** | |
* Created by root on 2/26/2017. | |
*/ | |
public class ProvinceCityCountySpider { | |
public static void main(String[] args) { | |
ProvinceCityCountySpider spider = new ProvinceCityCountySpider(); | |
Map map = spider.getProvince().get(0); | |
spider.getCityList(map.get("nextUrl").toString()); | |
} | |
public Document getDocument(String url){ | |
Document doc = null; | |
try { | |
doc = Jsoup.connect(url).get(); | |
} catch (IOException e) { | |
e.printStackTrace(); | |
} | |
// Elements newsHeadlines = doc.select("#mp-itn b a"); | |
return doc; | |
} | |
public void getCity(String url){ | |
} | |
public List getVillageList(String url) { | |
List countyList = new ArrayList(); | |
Document document = getDocument(url); | |
Elements cityTrs = document.select(".villagetable .villagetr"); | |
for (Iterator<Element> iterator = cityTrs.iterator(); iterator.hasNext(); ) { | |
Element next = iterator.next(); | |
Element codeTd = next.child(0); | |
String code = codeTd.child(0).text(); | |
Element nameTd = next.child(1); | |
String name = nameTd.child(0).text(); | |
String href = nameTd.child(0).attr("href"); | |
String nextUrl = getUrlPath(url) + href; | |
Map map = new HashMap(); | |
map.put("name", name); | |
map.put("code", code); | |
map.put("nextUrl", nextUrl); | |
countyList.add(map); | |
} | |
return countyList; | |
} | |
public List getTownList(String url) { | |
List countyList = new ArrayList(); | |
Document document = getDocument(url); | |
Elements cityTrs = document.select(".towntable .towntr"); | |
for (Iterator<Element> iterator = cityTrs.iterator(); iterator.hasNext(); ) { | |
Element next = iterator.next(); | |
Element codeTd = next.child(0); | |
String code = codeTd.child(0).text(); | |
Element nameTd = next.child(1); | |
String name = nameTd.child(0).text(); | |
String href = nameTd.child(0).attr("href"); | |
String nextUrl = getUrlPath(url) + href; | |
Map map = new HashMap(); | |
map.put("name", name); | |
map.put("code", code); | |
map.put("nextUrl", nextUrl); | |
countyList.add(map); | |
} | |
return countyList; | |
} | |
public List getCountyList(String url) { | |
List countyList = new ArrayList(); | |
Document document = getDocument(url); | |
Elements cityTrs = document.select(".countytable .countytr"); | |
for (Iterator<Element> iterator = cityTrs.iterator(); iterator.hasNext(); ) { | |
Element next = iterator.next(); | |
Element codeTd = next.child(0); | |
String code = codeTd.child(0).text(); | |
Element nameTd = next.child(1); | |
String name = nameTd.child(0).text(); | |
String href = nameTd.child(0).attr("href"); | |
String nextUrl = getUrlPath(url) + href; | |
Map map = new HashMap(); | |
map.put("name", name); | |
map.put("code", code); | |
map.put("nextUrl", nextUrl); | |
countyList.add(map); | |
} | |
return countyList; | |
} | |
public List getCityList(String url) { | |
List cityList = new ArrayList(); | |
Document document = getDocument(url); | |
Elements cityTrs = document.select(".citytable .citytr"); | |
for (Iterator<Element> iterator = cityTrs.iterator(); iterator.hasNext(); ) { | |
Element next = iterator.next(); | |
Element codeTd = next.child(0); | |
String code = codeTd.child(0).text(); | |
Element nameTd = next.child(1); | |
String name = nameTd.child(0).text(); | |
String href = nameTd.child(0).attr("href"); | |
String nextUrl = getUrlPath(url) + href; | |
Map map = new HashMap(); | |
map.put("name", name); | |
map.put("code", code); | |
map.put("nextUrl", nextUrl); | |
cityList.add(map); | |
} | |
return cityList; | |
} | |
public List<Map> getProvince(){ | |
String allUrl = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2015/index.html"; | |
Document document = getDocument(allUrl); | |
Elements select = document.select(".provincetable .provincetr td"); | |
List<Map> provList = new ArrayList<>(); | |
for(Iterator iter = select.iterator();iter.hasNext();) { | |
Element next = (Element) iter.next(); | |
Element aLink = next.child(0); | |
String href = aLink.attr("href"); | |
Map map = new HashMap<>(); | |
map.put("name", aLink.text()); | |
String baseUrl = | |
allUrl.lastIndexOf("/") == allUrl.length() - 1 ? allUrl : allUrl.substring(0,allUrl.lastIndexOf("/")+1); | |
map.put("nextUrl", baseUrl + href); | |
provList.add(map); | |
} | |
// System.out.println(JSON.toJSONString(provList)); | |
return provList; | |
} | |
String getUrlPath(String allUrl){ | |
String str = allUrl.lastIndexOf("/") | |
== allUrl.length() - 1 ? allUrl : allUrl.substring(0,allUrl.lastIndexOf("/")+1); | |
return str; | |
} | |
} | |
<!-- crawler -->
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>0.6.1</version>
</dependency>
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.6.1</version>
</dependency>
<!-- mongo -->
<dependency>
    <groupId>org.mongodb</groupId>
    <artifactId>mongo-java-driver</artifactId>
    <version>3.4.0</version>
</dependency>
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment