chuansong.me (传送门): scrape articles of a specified WeChat official account && take webpage screenshots with phantomjs && crawl Sogou WeChat search
#!/usr/bin/python2.7
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import urllib2
import time
import csv
import sys, os
import pymysql

def get_cur_file_dir():
    # Directory containing the running script
    path = sys.path[0]
    if os.path.isdir(path):
        return path
    elif os.path.isfile(path):
        return os.path.dirname(path)
def down_content(content_url, path_url):
    xhtml = open_url(content_url)
    if False == xhtml:
        return False

    soup = BeautifulSoup(xhtml, "html5lib")

    titleH2 = soup.find("h2", id="activity-name")
    if None == titleH2:
        return False
    title = titleH2.string.encode('utf-8')
    string_time = soup.find("em", id="post-date").string.encode('utf-8')
    num_time = int(time.mktime(time.strptime(string_time, '%Y-%m-%d')))
    keywords = str(soup.find(attrs={"name": "keywords"})['content'].encode('utf8', 'ignore'))
    description = str(soup.find(attrs={"name": "description"})['content'].encode('utf8', 'ignore'))
    content = soup.find_all("div", class_="rich_media_content")
    if len(content) < 1:
        print(" " + "no content")
        return False

    html = """
<!doctype html>
<html>
<head>
    <meta charset="utf-8">
    <title>""" + title + """</title>
    <meta name="keywords" content=\"""" + keywords + """\">
    <meta name="description" content=\"""" + description + """\">
</head>
<body>
    <div id="body">
        <h1>""" + title + """</h1>
        <div id="string_time">""" + string_time + """ </div><div id="num_time">""" + str(num_time) + """</div>
        <div id="content">
            """ + str(content[0]) + """
        </div>
    </div>
</body>
<script type="text/javascript" src="js/reimg.js"></script>
</html>
"""
    f = open(path_url, "w+")
    f.write(html)
    f.close()

    cur.execute("INSERT INTO archive (category,category_parents,title,summary,addtime,uptime) VALUES (27,\"0,12,27,\",%s,%s,%s,%s)", (title.strip(), description.strip(), num_time, num_time))
    # cur.lastrowid is the primary key of the row just inserted; if you use
    # conn.insert_id() instead, read it before conn.commit() or it returns 0.
    lastid = int(cur.lastrowid)
    cur.execute("INSERT INTO archive_article (archive,content) VALUES (%s,%s)", (lastid, str(content[0])))
    cur.connection.commit()
    return True
def insert_content(path_url):
    f = open(path_url, 'rb')
    xhtml = f.read()
    f.close()

    soup = BeautifulSoup(xhtml, "html5lib")

    titleH1 = soup.find("h1")
    if None == titleH1:
        return False
    title = titleH1.string.encode('utf-8')
    num_time = int(soup.find("div", id="num_time").string.encode('utf-8'))
    keywords = str(soup.find(attrs={"name": "keywords"})['content'].encode('utf8', 'ignore'))
    description = str(soup.find(attrs={"name": "description"})['content'].encode('utf8', 'ignore'))
    content = soup.find_all("div", class_="rich_media_content")
    if len(content) < 1:
        print(" " + "no content")
        return False

    cur.execute("INSERT INTO archive (category,category_parents,title,summary,addtime,uptime) VALUES (27,\"0,12,27,\",%s,%s,%s,%s)", (title.strip(), description.strip(), num_time, num_time))
    # See down_content(): read lastrowid/insert_id() before the commit.
    lastid = int(cur.lastrowid)
    cur.execute("INSERT INTO archive_article (archive,content) VALUES (%s,%s)", (lastid, str(content[0])))
    cur.connection.commit()
    return True
def open_url(url):
    req = urllib2.Request(url)
    req.add_header('User-agent', 'Mozilla 5.10')
    for i in range(0, 3):
        try:
            xhtml = urllib2.urlopen(req)
            return xhtml
        except urllib2.HTTPError as e:  # HTTPError must be caught before URLError
            print "The server couldn't fulfill the request"
            print "Error code:", e.code
            if e.code != 503:
                return False
            time.sleep(5)
            print("try again")
        except urllib2.URLError as e:  # URLError has no code attribute, so always retry
            print "Failed to reach the server"
            print "The reason:", e.reason
            time.sleep(5)
            print("try again")
    return False
def down_list(list_url):
    xhtml = open_url(list_url)
    if False == xhtml:
        return False

    soup = BeautifulSoup(xhtml, "html5lib")
    title = soup.title.string.encode('utf-8')
    li_a = soup.find_all("a", class_="question_link")
    next_list = soup.find_all("a", text="下一页")  # "下一页" = "next page" link

    writer = csv.writer(open(datapath + 'list.csv', 'a+b'))
    x = 0
    for i in range(0, len(li_a)):
        content_id = li_a[i]['href'].encode('utf-8')[3:]
        content_title = li_a[i].string.encode('utf-8')
        content_url = "http://chuansong.me" + li_a[i]['href'].encode('utf-8')
        path_url = datapath + content_id + ".html"
        if not os.path.exists(path_url):
            if False == down_content(content_url, path_url):
                print(" " + str(x) + content_url + " down fail")
                continue
            print(" " + str(x) + content_url + " down end")
            writer.writerow([content_id, content_title, content_url])
            x = x + 1
            if x % 2 == 1:
                time.sleep(3)
            time.sleep(1)
        else:
            # insert_content(path_url)
            print(" " + content_url + " exist")
            return False  # already-saved article reached, stop paging
    print(list_url + " end")

    if len(next_list) < 1:
        return False
    print("next " + next_list[0]['href'].encode('utf-8') + "\n")
    return True
def get_list():
    start = 0
    while True:
        if start == 0:
            url = 'http://chuansong.me/account/xingdongpai77'
        else:
            url = 'http://chuansong.me/account/xingdongpai77?start=' + str(start)
        if False == down_list(url) or start > 2000:
            break
        start += 12
        time.sleep(1)
    print("get_list end")
if __name__ == "__main__":
    datapath = get_cur_file_dir() + '/data/'
    if not os.path.exists(datapath):
        os.makedirs(datapath)

    conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd="123456", db='mysql')
    cur = conn.cursor()
    cur.execute("SET NAMES utf8")
    cur.execute("USE x")

    get_list()

    cur.close()
    conn.close()

    # xtime = time.strftime("%Y-%m-%d %H:%M:%S")
    # xday = time.strftime("%Y-%m-%d")
    # f = open(datapath + xtime + ".html", "w+")
    # f.write(body)
    # f.close()
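The two INSERT statements above assume pre-existing archive and archive_article tables in database x. The gist does not include the schema; a minimal sketch that would satisfy those queries (every column type here is an assumption) could be:

# -*- coding: utf-8 -*-
# Hypothetical schema setup matching the INSERT statements above.
# Column types are assumptions; adjust to the real "x" database.
import pymysql

conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd="123456", db='x')
cur = conn.cursor()
cur.execute("""
CREATE TABLE IF NOT EXISTS archive (
    id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
    category INT NOT NULL,
    category_parents VARCHAR(64) NOT NULL,
    title VARCHAR(255) NOT NULL,
    summary TEXT,
    addtime INT UNSIGNED NOT NULL,
    uptime INT UNSIGNED NOT NULL
) DEFAULT CHARSET=utf8
""")
cur.execute("""
CREATE TABLE IF NOT EXISTS archive_article (
    archive INT UNSIGNED NOT NULL PRIMARY KEY,
    content MEDIUMTEXT
) DEFAULT CHARSET=utf8
""")
conn.commit()
cur.close()
conn.close()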
//[root@vps3 work]# wget https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-2.1.1-linux-x86_64.tar.bz2
//[root@vps3 work]# tar jxvf phantomjs-2.1.1-linux-x86_64.tar.bz2
//[root@vps3 work]# vim screenshots.js

var page = require('webpage').create();
var args = require('system').args;
var url = args[1];
var filename = args[2];

page.open(url, function(status) {
    console.log("Status: " + status);
    if (status === "success") {
        // run js inside the page context
        var title = page.evaluate(function() {
            // scroll down to trigger lazy-loaded images
            window.scrollTo(0, 10000);
            // return the page title
            return document.title;
        });
        // debug output
        console.log('Page title is ' + title);
        // wait so images can finish loading and scripts can run
        window.setTimeout(function() {
            // render the screenshot
            page.render(filename);
            // quit
            phantom.exit();
        }, 5000);
    } else {
        phantom.exit();
    }
});

// Install the Microsoft YaHei font
//[root@vps3 work]# yum -y install bitmap-fonts bitmap-fonts-cjk mkfontscale fontconfig
//[root@vps3 work]# mkdir /usr/share/fonts/win/
//[root@vps3 work]# wget https://nipao.googlecode.com/files/msyh.ttf -O /usr/share/fonts/win/msyh.ttf
//[root@vps3 work]# mkfontscale
//[root@vps3 work]# mkfontdir
//[root@vps3 work]# fc-cache

// Take the screenshot
//[root@vps3 work]# rm -rf /home/wwwroot/default/joke.png && phantomjs-2.1.1-linux-x86_64/bin/phantomjs screenshots.js http://joke.4399pk.com /home/wwwroot/default/joke.png
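If you would rather drive screenshots.js from Python, consistent with the other scripts in this gist, a thin subprocess wrapper is enough; the binary and script paths below are assumptions copied from the shell commands above:

# -*- coding: utf-8 -*-
# Minimal sketch: invoke screenshots.js from Python. The phantomjs
# binary and script paths are assumptions taken from the commands above.
import subprocess

PHANTOMJS = "phantomjs-2.1.1-linux-x86_64/bin/phantomjs"

def screenshot(url, filename):
    # screenshots.js expects: phantomjs screenshots.js <url> <output file>
    ret = subprocess.call([PHANTOMJS, "screenshots.js", url, filename])
    return ret == 0

if __name__ == "__main__":
    screenshot("http://joke.4399pk.com", "/home/wwwroot/default/joke.png")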
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# https://github.com/lining0806/WechatSearchProjects
import datetime
import requests
import pymongo
from bs4 import BeautifulSoup
import multiprocessing as mp
class MongoDBIO:
    # Declare the connection settings
    def __init__(self, host, port, name, password, database, collection):
        self.host = host
        self.port = port
        self.name = name
        self.password = password
        self.database = database
        self.collection = collection

    # Connect; db and posts are the database and collection handles
    def Connection(self):
        # connection = pymongo.Connection() # connect to the local database
        connection = pymongo.Connection(host=self.host, port=self.port)
        # db = connection.datas
        db = connection[self.database]
        if self.name or self.password:
            db.authenticate(name=self.name, password=self.password)  # check username/password
        # print "Database:", db.name
        # posts = db.cn_live_news
        posts = db[self.collection]
        # print "Collection:", posts.name
        return posts
# # Batch save
# def ResultSave(save_host, save_port, save_name, save_password, save_database, save_collection, save_contents):
#     posts = MongoDBIO(save_host, save_port, save_name, save_password, save_database, save_collection).Connection()
#     for save_content in save_contents:
#         posts.save(save_content)

# Save a single document
def ResultSave(save_host, save_port, save_name, save_password, save_database, save_collection, save_content):
    posts = MongoDBIO(save_host, save_port, save_name, save_password, save_database, save_collection).Connection()
    posts.save(save_content)
def GetTitleUrl(url, data):
    content = requests.get(url=url, params=data).content  # send the GET request
    soup = BeautifulSoup(content, "html.parser")
    tags = soup.findAll("h4")
    titleurl = []
    for tag in tags:
        item = {"title": tag.text.strip(), "link": tag.find("a").get("href"), "content": ""}
        titleurl.append(item)
    return titleurl

def GetContent(url):
    soup = BeautifulSoup(requests.get(url=url).content, "html.parser")
    tag = soup.find("div", attrs={"class": "rich_media_content", "id": "js_content"})  # grab the first matching tag
    content_list = [tag_i.text for tag_i in tag.findAll("p")]
    content = "".join(content_list)
    return content
def ContentSave(item):
    # storage settings
    save_host = "localhost"
    save_port = 27017
    save_name = ""
    save_password = ""
    save_database = "testwechat"
    save_collection = "result"

    save_content = {
        "title": item["title"],
        "link": item["link"],
        "content": item["content"]
    }

    ResultSave(save_host, save_port, save_name, save_password, save_database, save_collection, save_content)
def func(args):  # renamed from "tuple" to avoid shadowing the builtin
    querystring, type, page = args[0], args[1], args[2]
    url = "http://weixin.sogou.com/weixin"
    # GET parameters
    data = {
        "query": querystring,
        "type": type,
        "page": page
    }
    titleurl = GetTitleUrl(url, data)
    for item in titleurl:
        url = item["link"]
        print "url:", url
        content = GetContent(url)
        item["content"] = content
        ContentSave(item)
if __name__ == '__main__':
    start = datetime.datetime.now()

    querystring = u"清华"
    type = 2  # 2 = articles, 1 = official accounts

    # multi-process crawl
    p = mp.Pool()
    p.map_async(func, [(querystring, type, page) for page in range(1, 50, 1)])
    p.close()
    p.join()

    # # single-process crawl
    # for page in range(1, 50, 1):
    #     func((querystring, type, page))

    end = datetime.datetime.now()
    print "elapsed time: ", end - start