zii/google_kg.py

## google_kg.py
#coding: utf-8
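#TODO: still need to confirm the occupation; search with the occupation name included first
#TODO: avatar image
#TODO: if there is no description, compose one from the job and other fields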
from scrapy import Selector
import requests
from lorm import Struct

PROXY_ADDR = '127.0.0.1:5555'

def google_search(keyword, hl=None):
    """
    Fetch the Google search result page for a keyword.

    The request is sent through the proxy named by PROXY_ADDR when that
    constant is non-empty, with a 10 second timeout.

    :param keyword: search keyword
    :param hl: language, en/zh; left out of the query when falsy
    :return: (html, error) -- (response body, None) on HTTP 200,
        otherwise (None, HTTP status code)
    """
    url = u"https://www.google.com/search"
    proxies = None
    if PROXY_ADDR:
        proxies = {'http': PROXY_ADDR, 'https': PROXY_ADDR}
    headers = {}
    #headers['user-agent'] = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    params = {
        'q': keyword,
        'ie': 'UTF-8'
    }
    if hl:
        params['hl'] = hl
    r = requests.get(url, params=params, timeout=10, proxies=proxies,
        headers=headers)
    if r.status_code != 200:
        # requests.Response exposes the HTTP status as `status_code`;
        # it has no `status` attribute, so report the code that was checked.
        return None, r.status_code
    return r.content, None


def google_kg(keyword, hl=None):
    """
    grab google knowledge gragh
    :param keyword: keyword
    :param hl: language, en/zh
    :return: (data, error)
    """
    html, error = google_search(keyword, hl)
    if not html:
        return None, error
    sel = Selector(text=html, type='html')
    rhs_block = sel.css('#rhs_block')
    if not rhs_block:
        return None, 'not find #rhs_block'
    name = rhs_block.css('div._B5d::text').extract()
    name = ''.join(name)
    print 'name:', name
    job = rhs_block.css('div._zdb::text').extract_first()
    print 'job:', job
    desc = rhs_block.css('div._tXc span::text').extract()
    desc = ''.join(desc)
    print 'desc:', desc
    keys = rhs_block.css('span._gS')
    vals = rhs_block.css('span._tA')
    for i, key in enumerate(keys):
        val = vals[i]
        key = key.css('::text').extract_first()
        val = val.css('::text').extract()
        val = ''.join(val)
        print 'key:', key
        print 'val:', val
    # 第二套方案(有可能有多个资料,需要确认;顺便通过Actress确定性别)
    tables = rhs_block.css('table')
    if tables:
        for table in tables:
            title = table.css('td div._fce::text').extract()
            title = ''.join(title)
            print 'title:', title
            for el in table.css('td div._Vbe'):
                info = el.css('::text').extract()
                info = ''.join(info)
                print 'info:', info
    data = Struct()
    return data, None

if __name__ == '__main__':
    # html, error = google_search(u'Zoë Cooper', 'zh')
    # with open('5.html', 'wb') as f:
    #    f.write(html)
    data, error = google_kg(u"沈炜竣", 'zh')
    print data or error
	#coding: utf-8
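	#TODO: still need to confirm the occupation; search with the occupation name included first
	#TODO: avatar image
	#TODO: if there is no description, compose one from the job and other fields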
	from scrapy import Selector
	import requests
	from lorm import Struct

	PROXY_ADDR = '127.0.0.1:5555'

	def google_search(keyword, hl=None):
	    """
	    Fetch the Google search result page for a keyword.

	    The request is sent through the proxy named by PROXY_ADDR when that
	    constant is non-empty, with a 10 second timeout.

	    :param keyword: search keyword
	    :param hl: language, en/zh; left out of the query when falsy
	    :return: (html, error) -- (response body, None) on HTTP 200,
	        otherwise (None, HTTP status code)
	    """
	    url = u"https://www.google.com/search"
	    proxies = None
	    if PROXY_ADDR:
	        proxies = {'http': PROXY_ADDR, 'https': PROXY_ADDR}
	    headers = {}
	    #headers['user-agent'] = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
	    params = {
	        'q': keyword,
	        'ie': 'UTF-8'
	    }
	    if hl:
	        params['hl'] = hl
	    r = requests.get(url, params=params, timeout=10, proxies=proxies,
	        headers=headers)
	    if r.status_code != 200:
	        # requests.Response exposes the HTTP status as `status_code`;
	        # it has no `status` attribute, so report the code that was checked.
	        return None, r.status_code
	    return r.content, None


	def google_kg(keyword, hl=None):
	"""
	grab google knowledge gragh
	:param keyword: keyword
	:param hl: language, en/zh
	:return: (data, error)
	"""
	html, error = google_search(keyword, hl)
	if not html:
	return None, error
	sel = Selector(text=html, type='html')
	rhs_block = sel.css('#rhs_block')
	if not rhs_block:
	return None, 'not find #rhs_block'
	name = rhs_block.css('div._B5d::text').extract()
	name = ''.join(name)
	print 'name:', name
	job = rhs_block.css('div._zdb::text').extract_first()
	print 'job:', job
	desc = rhs_block.css('div._tXc span::text').extract()
	desc = ''.join(desc)
	print 'desc:', desc
	keys = rhs_block.css('span._gS')
	vals = rhs_block.css('span._tA')
	for i, key in enumerate(keys):
	val = vals[i]
	key = key.css('::text').extract_first()
	val = val.css('::text').extract()
	val = ''.join(val)
	print 'key:', key
	print 'val:', val
	# 第二套方案(有可能有多个资料,需要确认;顺便通过Actress确定性别)
	tables = rhs_block.css('table')
	if tables:
	for table in tables:
	title = table.css('td div._fce::text').extract()
	title = ''.join(title)
	print 'title:', title
	for el in table.css('td div._Vbe'):
	info = el.css('::text').extract()
	info = ''.join(info)
	print 'info:', info
	data = Struct()
	return data, None

	if __name__ == '__main__':
	# html, error = google_search(u'Zoë Cooper', 'zh')
	# with open('5.html', 'wb') as f:
	# f.write(html)
	data, error = google_kg(u"沈炜竣", 'zh')
	print data or error