# Web crawler for huajiao.com: scrapes live-stream user profiles and past
# lives, and stores them in a local MySQL database ("wanghong").
from bs4 import BeautifulSoup
from urllib.request import urlopen
from urllib.error import URLError
import mysql.connector
import time
import json
import sys
import re
# Reference: "Web crawling for Python beginners" - Python - Jobbole http://python.jobbole.com/86872/?utm_source=blog.jobbole.com&utm_medium=relatedPosts
# Extract live-stream ids from a category page URL.
def filterLiveIds(url):
    html = urlopen(url)
    liveIds = set()
    soup = BeautifulSoup(html, "html.parser")
    # Live pages are linked as /l/<numeric id>; collect the numeric part.
    for link in soup.findAll("a", href=re.compile("^(/l/)")):
        if 'href' in link.attrs:
            newPage = link.attrs['href']
            liveId = re.findall("[0-9]+", newPage)
            liveIds.add(liveId[0])
    return liveIds
# Get the user id from a live page (the page title contains it).
def getUserId(liveId):
    html = urlopen("http://www.huajiao.com/l/" + str(liveId), timeout=1)
    soup = BeautifulSoup(html, "html.parser")
    text = soup.title.get_text()
    res = re.findall("[0-9]+", text)
    return res[0]
# Get user profile data from the user page; returns a dict on success, 0 on failure.
def getUserData(userId):
    data = dict()
    try:
        html = urlopen("http://www.huajiao.com/user/" + str(userId), timeout=3)
        soup = BeautifulSoup(html, "html.parser")
        userInfoObj = soup.find("div", {"id": "userInfo"})
        data['FAvatar'] = userInfoObj.find("div", {"class": "avatar"}).img.attrs['src']
        userId = userInfoObj.find("p", {"class": "user_id"}).get_text()
        data['FUserId'] = re.findall("[0-9]+", userId)[0]
        # The h3 holds "<name>|<level>" once separated.
        tmp = userInfoObj.h3.get_text('|', strip=True).split('|')
        data['FUserName'] = tmp[0]
        data['FLevel'] = tmp[1]
        # The stats list interleaves values and labels; values sit at the even indices.
        tmp = userInfoObj.find("ul", {"class": "clearfix"}).get_text("|", strip=True).split("|")
        data['FFollow'] = tmp[0]
        data['FFollowed'] = tmp[2]
        data['FSupported'] = tmp[4]
        data['FExperience'] = tmp[6]
        return data
    except URLError:
        print(str(userId) + ": network error in getUserData")
        return 0
    except Exception:
        print(str(userId) + ": html parse error in getUserData")
        return 0
def getNowTime():
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))
# Update (upsert) a single user's data.
def replaceUserData(data):
    conn = mysql.connector.connect(user='root', password='toor', database='wanghong')
    cur = conn.cursor()
    try:
        cur.execute("use wanghong")
        cur.execute("set names utf8mb4")
        cur.execute(
            "replace into Tbl_Huajiao_User(FUserId,FUserName,FLevel,FFollow,FFollowed,FSupported,FExperience,FAvatar,FScrapedTime) values (%s,%s,%s,%s,%s,%s,%s,%s,%s)",
            (int(data['FUserId']), data['FUserName'], int(data['FLevel']), int(data['FFollow']), int(data['FFollowed']),
             int(data['FSupported']), int(data['FExperience']), data['FAvatar'], getNowTime()))
        conn.commit()
    except mysql.connector.Error as e:
        print(e)
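# The REPLACE statements above and below assume a Tbl_Huajiao_User table already
# exists. The helper below is a hypothetical sketch reconstructed from the
# columns this script writes; the types and sizes are assumptions, not the
# original schema.
def createUserTable():
    conn = mysql.connector.connect(user='root', password='toor', database='wanghong')
    cur = conn.cursor()
    cur.execute("""
        create table if not exists Tbl_Huajiao_User (
            FUserId      bigint       not null primary key,
            FUserName    varchar(255) not null,
            FLevel       int          not null,
            FFollow      int          not null,
            FFollowed    int          not null,
            FSupported   int          not null,
            FExperience  int          not null,
            FAvatar      varchar(1024),
            FScrapedTime datetime     not null
        ) default charset=utf8mb4""")
    conn.commit()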
# Update (upsert) a batch of users over one connection.
def replaceUserDatas(datas):
    conn = mysql.connector.connect(user='root', password='toor', database='wanghong')
    cur = conn.cursor()
    try:
        cur.execute("use wanghong")
        cur.execute("set names utf8mb4")
        for data in datas:
            print(data)
            cur.execute(
                "replace into Tbl_Huajiao_User(FUserId,FUserName,FLevel,FFollow,FFollowed,FSupported,FExperience,FAvatar,FScrapedTime) values (%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                (int(data['FUserId']), data['FUserName'], int(data['FLevel']), int(data['FFollow']), int(data['FFollowed']),
                 int(data['FSupported']), int(data['FExperience']), data['FAvatar'], getNowTime()))
        conn.commit()
    except mysql.connector.Error as e:
        print(e)
# Get a user's history lives from the JSON feed endpoint; returns the feed list, or 0 on failure.
def getUserLives(userId):
    try:
        url = "http://webh.huajiao.com/User/getUserFeeds?fmt=json&uid=" + str(userId)
        html = urlopen(url).read().decode('utf-8')
        jsonData = json.loads(html)
        if jsonData['error'] != 0:
            print(str(userId) + ': error occurred in getUserFeeds: ' + jsonData['msg'])
            return 0
        return jsonData['data']['feeds']
    except Exception as e:
        print(e)
        return 0
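# For reference, the getUserFeeds response is assumed to be shaped roughly like
# the sketch below. This is reconstructed from the fields accessed above, not
# taken from any official API documentation.
# {
#     "error": 0,
#     "msg": "...",
#     "data": {
#         "feeds": [
#             {"feed": {...}},
#             ...
#         ]
#     }
# }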
# Collect live ids from the first two pages of the recommendation category.
def getLiveIdsFromRecommendPage():
    url = "http://www.huajiao.com/category/1000"
    liveIds = filterLiveIds(url)
    url = "http://www.huajiao.com/category/1000?pageno=2"
    liveIds.update(filterLiveIds(url))
    return liveIds
# Spider user data for every user found on the recommendation pages.
def spiderUserDatas():
    userDatas = []
    liveIds = getLiveIdsFromRecommendPage()
    for liveId in liveIds:
        userId = getUserId(liveId)
        print(userId)
        userData = None
        # getUserData returns a dict on success and 0 on failure, so this loop
        # only retries while an exception left userData unset.
        while userData is None:
            try:
                userData = getUserData(userId)
            except URLError as e:
                print(e)
            except Exception as e:
                print(e)
        if userData:
            userDatas.append(userData)
    print(userDatas)
    replaceUserDatas(userDatas)
    return 1
# Spider the history lives of stored users.
def spiderUserLives():
    userIds = selectUserIds(100)
    for userId in userIds:
        liveDatas = getUserLives(userId[0])
        if not liveDatas:
            # getUserLives returns 0 on failure; skip this user.
            continue
        for liveData in liveDatas:
            liveData['feed']['FUserId'] = userId[0]
            replaceUserLive(liveData['feed'])
    return 1
# Update (upsert) a single live record.
def replaceUserLive(live):
    conn = mysql.connector.connect(user="root", password="toor", database="wanghong")
    cur = conn.cursor()
    try:
        cur.execute("set names utf8mb4")
        # The original REPLACE statement was left incomplete (no column list or
        # values). The columns used here are an assumption based on the fields
        # this script sets on the feed; adjust them to the real table schema.
        cur.execute(
            "replace into Tbl_Huajiao_Live(FUserId,FScrapedTime) values (%s,%s)",
            (int(live['FUserId']), getNowTime()))
        conn.commit()
    except mysql.connector.Error as e:
        print(e)
# Select up to num user ids from the user table.
def selectUserIds(num):
    conn = mysql.connector.connect(user='root', password='toor', database='wanghong')
    cursor = conn.cursor()
    cursor.execute('select FUserId from tbl_huajiao_user limit %s', (int(num),))
    return cursor.fetchall()
def main(argv):
    if len(argv) < 2:
        print("Usage: python3 huajiao.py [spiderUserDatas|spiderUserLives|getUserCount|getLiveCount]")
        sys.exit(1)
    if argv[1] == 'spiderUserDatas':
        spiderUserDatas()
    elif argv[1] == 'spiderUserLives':
        spiderUserLives()
    elif argv[1] == 'getUserCount':
        print(getUserCount())
    elif argv[1] == 'getLiveCount':
        print(getLiveCount())
    else:
        print("Usage: python3 huajiao.py [spiderUserDatas|spiderUserLives|getUserCount|getLiveCount]")
# TODO completed with a minimal count query (a sketch of the intended behavior).
def getUserCount():
    conn = mysql.connector.connect(user='root', password='toor', database='wanghong')
    cursor = conn.cursor()
    cursor.execute('select count(*) from tbl_huajiao_user')
    return cursor.fetchone()[0]
# Same sketch for the live table.
def getLiveCount():
    conn = mysql.connector.connect(user='root', password='toor', database='wanghong')
    cursor = conn.cursor()
    cursor.execute('select count(*) from Tbl_Huajiao_Live')
    return cursor.fetchone()[0]
if __name__ == '__main__':
    main(sys.argv)
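# Example invocations (the script is driven by its first CLI argument):
#   python3 huajiao.py spiderUserDatas   # scrape user profiles from recommended lives
#   python3 huajiao.py spiderUserLives   # scrape stored users' past lives
#   python3 huajiao.py getUserCount      # print the number of stored users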