Parse Weibo search result pages and extract post content and blogger information.
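A usage sketch, assuming the script is saved as parse_weibo_search_page.py; the search URL below is only illustrative (any Weibo search-result page URL works), and the second argument is the output file:

    python parse_weibo_search_page.py "http://s.weibo.com/weibo/some_keyword" result.tsv

Each output line holds nickname, profile URL, post text, post time and, when found, location and gender, separated by tabs.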
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import sys
import urllib
import urllib2
from lxml import etree
import time
import random
import logging
# request headers; the Cookie is a logged-in session cookie and should be replaced with your own
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.87 Safari/537.36',
    'Cookie': 'SINAGLOBAL=3405332213604.6436.1471069596406; un=jintao_jt@163.com; wvr=6; SWB=usrmdinst_1; SCF=AunhJILVe0zscPofTah6Lg5-Rekj9hI4zR6YRk2I9gUN6oN9uLAdC_m-8Se8ZFhdtvjGB-vk0BDZWXlcLM-jHYE.; SUB=_2A251tsH6DeTxGeRK71IR-SrEzziIHXVWxbQyrDV8PUNbmtBeLVmgkW90tABRj2hae-k0m8mxGm5hb4VMCQ..; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWDZ8PVcd.RL1ddcn_r134X5JpX5KMhUgL.FozXSh571KBRShB2dJLoIpxDdCH8Sb-ReE-RBCH8SbHFSb-4Bntt; SUHB=0ft4NiAlOevBaq; ALF=1519641897; SSOLoginState=1488105899; _s_tentry=s.weibo.com; Apache=2390812289834.8115.1488105914049; ULV=1488105915047:15:5:1:2390812289834.8115.1488105914049:1487931254048; UOR=www.vpsee.com,widget.weibo.com,login.sina.com.cn'
}


def get_logger(logname):
    logger = logging.getLogger(logname)
    logger.setLevel(logging.DEBUG)
    # create file handler
    log_path = r'./' + logname + '.log'
    fh = logging.FileHandler(log_path)
    fh.setLevel(logging.DEBUG)
    # create stream handler to console
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    # create formatter shared by both handlers
    fmt = "%(asctime)s %(levelname)s %(filename)s %(lineno)d %(process)d %(message)s"
    date_fmt = "%a %d %b %Y %H:%M:%S"
    formatter = logging.Formatter(fmt, date_fmt)
    fh.setFormatter(formatter)
    logger.addHandler(fh)
    ch.setFormatter(formatter)
    logger.addHandler(ch)
    return logger


class SearchResultPage:
    'one search result page of weibo'

    def __init__(self, url):
        self.url = url
        self.all_blogger_items = []
        self.logger = get_logger('weibo_search_result')

    def __download(self, weibo_url):
        req = urllib2.Request(url=weibo_url, headers=header)
        data = None
        for i in range(5):
            try:
                # wait a random interval between requests to look less like a bot
                sleeptime = random.randint(2, 7)
                time.sleep(sleeptime)
                html = urllib2.urlopen(req, timeout=12)
                data = html.read()
                break
            except:
                data = None
        return data

    def __parse_search_result_page_html(self, data):
        lines = data.splitlines()
        more = True
        for line in lines:
            # this pagelet only appears when the page really contains results,
            # i.e. the request was not classified as coming from a robot
            if line.startswith('<script>STK && STK.pageletM && STK.pageletM.view({"pid":"pl_weibo_direct"'):
                n = line.find('html":"')
                if n > 0:
                    j = line[n + 7: -12].decode('unicode_escape').encode("utf-8").replace("\\", "")
                    # no more results
                    if j.find('<div class="search_noresult">') > 0:
                        more = False
                    else:
                        myparser = etree.HTMLParser(encoding="utf-8")
                        page = etree.HTML(j, parser=myparser)
                        self.logger.debug(etree.tostring(page))
                        ps = page.xpath("//p[@node-type='feed_list_content']")  # post contents
                        addrs = page.xpath("//a[@class='W_texta W_fb']")  # blogger profile links
                        times = page.xpath("//a[@class='W_textb' and @node-type='feed_list_item_date']")  # post timestamps
                        index = 0
                        # collect nickname, profile link, post text and time for every post
                        for p in ps:
                            name = p.attrib.get('nick-name')
                            weibo_text = p.xpath('string(.)')
                            addr = addrs[index].attrib.get('href')
                            i = weibo_text.find('|')
                            if i > 0:
                                weibo_text = weibo_text[0:i]
                            weibo_time = times[index].text
                            name = name.encode("utf-8")
                            addr = addr.encode('utf-8').strip()
                            weibo_text = weibo_text.encode('utf-8').strip('\r\n\t').replace('\n', '')
                            weibo_time = weibo_time.encode('utf-8')
                            # timestamps without a date part belong to the current year
                            if weibo_time.find('-') == -1:
                                localtime = time.localtime(time.time())
                                weibo_time = str(localtime.tm_year) + '年' + weibo_time
                                print 'weibo_time: ', weibo_time
                                t = time.strftime("%Y-%m-%d %H:%M:%S", time.strptime(weibo_time, '%Y年%m月%d日 %H:%M'))
                            else:
                                t = weibo_time
                            self.all_blogger_items.append([name, addr, weibo_text, t])
                            print 'name: ', name, 'addr: ', addr, ' text: ', weibo_text, ' time: ', t
                            index += 1

    def __get_blogger_pedit_more_addr(self, data):
        lines = data.splitlines()
        morepage = None
        for line in lines:
            if line.startswith('<script>FM.view({"ns":"pl.content.homeFeed.index","domid":"Pl_Core_UserInfo'):
                n = line.find('html":"')
                if n > 0:
                    j = line[n + 7:-12].replace("\\/", "/").replace('\\"', '"')
                    e = etree.HTML(j)
                    node = e.xpath('//a[@class="WB_cardmore S_txt1 S_line1 clearfix"]')
                    if len(node) == 1:
                        morepage = node[0].attrib['href']
                        # an "about" link means there is no extra profile page
                        if morepage.find('about') != -1:
                            morepage = None
                        else:
                            morepage = 'http://weibo.com/' + morepage
                    else:
                        print "cannot find more link"
                        morepage = None
                break
        return morepage

    def __parse_blogger_pedit_more(self, data):
        lines = data.splitlines()
        for line in lines:
            if not line.startswith('<script>FM.view({"ns":"","domid":"Pl_Official_PersonalInfo'):
                continue
            n = line.find('html":"')
            if n > 0:
                j = line[n + 7:-12].replace("\\/", "/").replace('\\"', '"')
                e = etree.HTML(j.decode('utf-8'))
                nodes = e.xpath('//span')
                location_flag = False
                location = None
                gender_flag = False
                gender = None
                # the value span follows its label span, so remember which label was just seen
                for node in nodes:
                    text = node.xpath('string(.)')
                    text = text.strip()
                    if location_flag:
                        location = text
                        location_flag = False
                    if gender_flag:
                        gender = text
                        gender_flag = False
                    if text == u'所在地:':  # "Location:"
                        location_flag = True
                    if text == u'性别:':  # "Gender:"
                        gender_flag = True
                if location and gender:
                    return [location.encode('utf-8'), gender.encode('utf-8')]
        return []

    def __parse_blogger_detail(self, data):
        more_page = self.__get_blogger_pedit_more_addr(data)
        if more_page is None:
            print 'no morepage'
            return []
        pedit_more_page = self.__download(more_page)
        if pedit_more_page is None:
            return None
        more_detail = self.__parse_blogger_pedit_more(pedit_more_page)
        return more_detail

    def __get_blogger_detail(self, blogger_info):
        tryNum = 0
        blogger_detail = []
        while (len(blogger_detail) == 0) and (tryNum < 3):
            tryNum += 1
            page_html = self.__download(blogger_info[1])
            if page_html is None:
                continue
            # __parse_blogger_detail may return None when the "more info" page fails to download
            blogger_detail = self.__parse_blogger_detail(page_html) or []
        blogger_info.extend(blogger_detail)

    def parse_page(self):
        self.logger.info('start to parse url:%s ' % self.url)
        page_html = self.__download(self.url)
        if page_html is None:
            self.logger.critical('download %s fail' % self.url)
            return
        # keep a copy of the raw page for debugging
        open('page.html', 'w').write(page_html)
        self.__parse_search_result_page_html(page_html)
        for blogger in self.all_blogger_items:
            self.__get_blogger_detail(blogger)
            self.logger.debug(' '.join(blogger))

    def dump_to_file(self, dst):
        fo = open(dst, 'w')
        for blogger in self.all_blogger_items:
            s = '\t'.join(blogger)
            fo.write(s)
            fo.write('\n')
        fo.close()


if __name__ == '__main__':
    if len(sys.argv) < 3:
        print 'usage: ./parse_weibo_search_page url dst_file'
        exit(0)
    page = SearchResultPage(sys.argv[1])
    page.parse_page()
    page.dump_to_file(sys.argv[2])