Skip to content

Instantly share code, notes, and snippets.

@zii
Last active December 7, 2017 10:20
Show Gist options
  • Save zii/231bbc2c5f8ac0562972d16ea7defbbc to your computer and use it in GitHub Desktop.
Save zii/231bbc2c5f8ac0562972d16ea7defbbc to your computer and use it in GitHub Desktop.
#coding: utf-8
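#TODO: the occupation still needs to be confirmed; search with the occupation name included first
#TODO: avatar (profile picture)
#TODO: if there is no description, build one from job and the other fields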
from scrapy import Selector
import requests
from lorm import Struct
PROXY_ADDR = '127.0.0.1:5555'
def google_search(keyword, hl=None):
    """
    Fetch the Google web-search results page for a keyword.

    The request is sent through PROXY_ADDR (for both http and https) when
    that constant is non-empty. Network-level failures such as timeouts or
    connection errors are not caught here and propagate from
    ``requests.get``.

    :param keyword: search keyword
    :param hl: language, en/zh; sent as the ``hl`` query parameter when given
    :return: (html, error) -- the raw response body and None on HTTP 200,
        otherwise None and the HTTP status code
    """
    url = u"https://www.google.com/search"
    proxies = None
    if PROXY_ADDR:
        proxies = {'http': PROXY_ADDR, 'https': PROXY_ADDR}
    headers = {}
    # Optional desktop-browser user agent, currently disabled:
    #headers['user-agent'] = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    params = {
        'q': keyword,
        'ie': 'UTF-8',
    }
    if hl:
        params['hl'] = hl
    r = requests.get(url, params=params, timeout=10, proxies=proxies,
                     headers=headers)
    if r.status_code != 200:
        # requests.Response exposes the HTTP code as `status_code`; it has no
        # `status` attribute, so report the code through the right name.
        return None, r.status_code
    return r.content, None
def google_kg(keyword, hl=None):
    """
    Grab the Google knowledge graph panel for a keyword.

    Runs a Google search, locates the ``#rhs_block`` element of the results
    page and prints the fields found inside it: name, job, description,
    key/value facts and any table entries. The extracted values are only
    printed for now; the returned Struct is not populated yet.

    :param keyword: keyword
    :param hl: language, en/zh
    :return: (data, error) -- an empty Struct and None on success; otherwise
        None and either the error reported by google_search or a message
        saying ``#rhs_block`` was not found
    """
    html, error = google_search(keyword, hl)
    if not html:
        return None, error
    sel = Selector(text=html, type='html')
    rhs_block = sel.css('#rhs_block')
    if not rhs_block:
        return None, 'not find #rhs_block'

    name = ''.join(rhs_block.css('div._B5d::text').extract())
    print('name: %s' % name)
    job = rhs_block.css('div._zdb::text').extract_first()
    print('job: %s' % job)
    desc = ''.join(rhs_block.css('div._tXc span::text').extract())
    print('desc: %s' % desc)

    keys = rhs_block.css('span._gS')
    vals = rhs_block.css('span._tA')
    # Pair labels with values positionally. zip stops at the shorter list, so
    # a page with more labels than values no longer raises IndexError.
    for key_el, val_el in zip(keys, vals):
        key = key_el.css('::text').extract_first()
        val = ''.join(val_el.css('::text').extract())
        print('key: %s' % key)
        print('val: %s' % val)

    # Second approach (there may be several profiles, which needs confirming;
    # incidentally, "Actress" can be used to determine gender).
    for table in rhs_block.css('table'):
        title = ''.join(table.css('td div._fce::text').extract())
        print('title: %s' % title)
        for el in table.css('td div._Vbe'):
            info = ''.join(el.css('::text').extract())
            print('info: %s' % info)

    data = Struct()
    return data, None
if __name__ == '__main__':
    # Manual smoke test: look up one person and print either the result or
    # the error. To dump the raw results page for offline inspection instead:
    #     html, error = google_search(u'Zoë Cooper', 'zh')
    #     with open('5.html', 'wb') as f:
    #         f.write(html)
    result, failure = google_kg(u"沈炜竣", 'zh')
    print(result or failure)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment