Created
June 10, 2019 15:16
-
-
Save chinacaozheng/7f65f71aebc121e44bf505ea0af21f94 to your computer and use it in GitHub Desktop.
通过Java爬虫技术爬取大众点评的数据,再存入数据库中
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
学习Java一段时间后,我们可以通过实例来展示Java语言的魅力,下面我通过爬取大众点评的数据来演示。
编写爬虫时要注意抓取位置的选择器写法:按 id 选取用 # 表示,按 class 选取用 . 表示。
另外,当遇到反爬虫机制时,我们应该这样处理:
 | |
使用双线程可以提高爬虫的性能 | |
 | |
 | |
这是 loadData1 线程方法
public void loadData1() | |
{ | |
while (num <= 50) | |
{ | |
synchronized (this) | |
{ | |
if (flag) | |
{ | |
try | |
{ | |
//清除之前保存的数据 | |
lists.clear(); | |
System.out.println(Thread.currentThread().getName() + ",线程被阻塞"); | |
this.wait(); | |
} catch (InterruptedException e) | |
{ | |
// TODO Auto-generated catch block | |
e.printStackTrace(); | |
} | |
} else | |
{ | |
String url = "http://www.dianping.com/beijing/ch45/p" + num + "?aid=32595162&cpt=32595162"; | |
System.out.println(Thread.currentThread().getName() + "-->" + url); | |
Connection conn = Jsoup.connect(url).userAgent( | |
"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36") | |
.cookie("Cookie", | |
"_hc.v=\"\\\"a3ce3144-92e8-4b42-9bcf-0beeda47dd7d.1545659043\\\"\"; _lxsdk_cuid=167e0754f36c8-040d66753e16a3-5d1f3b1c-144000-167e0754f36c8; _lxsdk=167e0754f36c8-040d66753e16a3-5d1f3b1c-144000-167e0754f36c8; __utma=1.1282234895.1545659043.1545659043.1545659043.1; __utmz=1.1545659043.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; s_ViewType=10; _lx_utm=utm_source%3Dbaidu%26utm_medium%3Dorganic; _lxsdk_s=16b2a3c8ab0-8a0-49a-2c3%7C%7C42"); | |
try | |
{ | |
Document doc = conn.get(); | |
//System.out.println(doc); | |
Elements allliObjs = doc.select("#shop-all-list ul li"); | |
System.out.println(Thread.currentThread().getName() + "--->数量为:-->" + allliObjs.size()); | |
for (Element e : allliObjs) | |
{ | |
// System.out.println(e.nodeName()); | |
String title = e.select(".tit h4").text(); | |
String flag = e.select(".comment span").attr("title"); | |
String addr = e.select(".operate a").attr("data-address"); | |
String imgurl = e.select(".pic img").attr("src"); | |
//System.out.println(title + "," + flag + "," + addr + "," + imgurl); | |
// 连接数据库,插入数据 增加子线程的工作量 | |
DZ dz = new DZ(); | |
dz.setTitle(title); | |
dz.setComment(flag); | |
dz.setAddr(addr); | |
dz.setImgurl(imgurl); | |
lists.add(dz);//15 | |
} | |
db.add(lists); | |
Thread.sleep(2000); | |
} catch (InterruptedException e) | |
{ | |
// TODO Auto-generated catch block | |
e.printStackTrace(); | |
} catch (IOException e) | |
{ | |
// TODO Auto-generated catch block | |
e.printStackTrace(); | |
} | |
num++; | |
flag = true; | |
this.notify(); | |
} | |
} | |
} | |
} | |
这是将数据载入数据库的代码 | |
public class DB | |
{ | |
Connection conn = null; | |
public void connDB() | |
{ | |
try | |
{ | |
Class.forName("com.mysql.jdbc.Driver"); | |
conn = DriverManager.getConnection("jdbc:mysql://127.0.0.1:3306/d_thzm_two", "root", "123456"); | |
} catch (ClassNotFoundException e) | |
{ | |
// TODO Auto-generated catch block | |
e.printStackTrace(); | |
} catch (SQLException e) | |
{ | |
// TODO Auto-generated catch block | |
e.printStackTrace(); | |
} | |
} | |
public void add(List lists) | |
{ | |
this.connDB(); | |
try | |
{ | |
PreparedStatement pstmt = conn.prepareStatement("insert into t_dz(title,com,addre,imgurl) values(?,?,?,?)"); | |
for (int i = 0; i < lists.size(); i++) | |
{ | |
DZ dz = (DZ) lists.get(i); | |
pstmt.setString(1, dz.getTitle()); | |
pstmt.setString(2, dz.getComment()); | |
pstmt.setString(3, dz.getAddr()); | |
pstmt.setString(4, dz.getImgurl()); | |
pstmt.execute(); | |
} | |
} | |
catch(Exception e) | |
{ | |
e.printStackTrace(); | |
} | |
finally | |
{ | |
if(null!=conn) | |
{ | |
try | |
{ | |
conn.close(); | |
} catch (SQLException e) | |
{ | |
// TODO Auto-generated catch block | |
e.printStackTrace(); | |
} | |
} | |
} | |
} | |
} | |
这是保存爬取到的各个字段的实体类
public class DZ | |
{ | |
private String title; | |
private String comment; | |
private String addr; | |
private String imgurl; | |
public String getTitle() | |
其余的 getter 和 setter 可以通过右键菜单 Source 中的 Generate Getters and Setters 自动生成
这是运行main函数的代码
public class Test | |
{ | |
public static void main(String[] args) | |
{ | |
DataPrase dp = new DataPrase(); | |
Thread t1 = new Thread1(dp); | |
Thread t2 = new Thread2(dp); | |
t1.start(); | |
t2.start(); | |
} | |
} | |
这里只有一个对象dp,
将它传给两个新的线程对象,使两个线程共享其中的数据
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment