@123789987
Forked from mugbya/facebook_crawler.py
Created June 9, 2019 09:02
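
A Scrapy spider that scrapes a Facebook profile page: display name, profile and cover photos, bio, favorite quotes, work history, education, professional skills and hometown, assembled into a single JSON-like record.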
# -*- coding: utf-8 -*-
import logging
from uuid import uuid1

import scrapy

logger = logging.getLogger(__name__)


class TiebaSpider(scrapy.Spider):
    name = "facebook"
    allowed_domains = ['www.facebook.com']
    start_urls = ['https://www.facebook.com/diana.liu.31521']
    # start_urls = ['https://www.facebook.com/yang.liu.96']

    def parse(self, response):
        # Assemble one profile record keyed by a random hex id.
        res = {'id': str(uuid1()).replace('-', "")}

        # Display name from the timeline cover.
        username = response.xpath('//*[@id="fb-timeline-cover-name"]/a/text()').extract()
        if username:
            res.update({'username': username[0]})

        # Profile picture
        profile_pic_url_hd = response.xpath('//*[@id="fbTimelineHeadline"]/div[3]/div/div/div/img/@src').extract()
        if profile_pic_url_hd:
            res.update({'profile_pic_url_hd': profile_pic_url_hd[0]})

        # Cover (background) image
        profile_pic_url_bg = response.xpath('//*[@id="fbCoverImageContainer"]/img[1]/@src').extract()
        if profile_pic_url_bg:
            res.update({'profile_pic_url_bg': profile_pic_url_bg[0]})

        # Personal bio
        person_desc = response.xpath('//*[@id="pagelet_bio"]/div/ul/li/div/div/span/text()').extract()
        if person_desc:
            res.update({'person_desc': ' '.join(person_desc)})

        # Favorite quotes
        quotes = response.xpath('//*[@id="pagelet_quotes"]/div/ul/li/div/div/span/text()').extract()
        if quotes:
            res.update({'quotes': ' '.join(quotes)})

        education_list = []
        work_list = []
        hometownlist = []
        skill_list = []
        work_nodes = None
        skills_node = None
        education_nodes = None

        hometown_nodes = response.xpath('//*[@id="pagelet_hometown"]/div/div/ul/li')

        # The "eduwork" pagelet mixes work, skills and education blocks;
        # dispatch on each block's heading text.
        eduwork_nodes = response.xpath('//*[@id="pagelet_eduwork"]/div/div')
        for node in eduwork_nodes:
            sign = node.xpath('div/span/text()').extract()
            if 'Work' in sign:
                work_nodes = node.xpath('ul/li')
            if 'Professional Skills' in sign:
                skills_node = node.xpath('ul/li/div')
            if 'Education' in sign:
                education_nodes = node.xpath('ul/li')

        # Favorites table (Music / Other); collected but not used further below.
        favorites_nodes = response.xpath('//*[@id="favorites"]/div[2]/table/tbody')
        for node in favorites_nodes:
            sign = node.xpath('tr[1]/th/div/text()').extract()
            if 'Music' in sign:
                music_nodes = node.xpath('ul/li')
            if 'Other' in sign:
                others_node = node.xpath('ul/li/div')

        if work_nodes:
            for node in work_nodes:
                title = ""
                company = node.css('a::text').extract()
                other_info = node.css('div::text').extract()
                if other_info:
                    title = other_info[0]
                    other_info = other_info[1:]
                work_dict = {
                    'company': company[0] if company else '',
                    'title': title,
                    'other_info': other_info,
                }
                work_list.append(work_dict)

        if skills_node:
            skill_list = skills_node.xpath('a/text()').extract()

        if education_nodes:
            for node in education_nodes:
                school = node.css('a::text').extract()
                other_info = node.css('div::text').extract()
                education_dict = {
                    "school": school[0] if school else '',
                    "other_desc": other_info,
                }
                education_list.append(education_dict)

        if hometown_nodes:
            for node in hometown_nodes:
                addr = node.css('a::text').extract()
                desc = node.css('div::text').extract()
                hometown_dict = {
                    'addr': addr[0] if addr else '',
                    'desc': desc[0] if desc else '',
                }
                hometownlist.append(hometown_dict)

        res.update({
            'work_list': work_list,
            'hometownlist': hometownlist,
            'skill_list': skill_list,
            'education_list': education_list,
        })
        # Dump a JSON-ish string to stdout, then hand the record to Scrapy's
        # item machinery so feed exports can pick it up.
        print(str(res).replace("'", '"'))
        yield res
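
For reference, a minimal sketch of running the spider programmatically, assuming the code above is saved as facebook_crawler.py and a recent Scrapy is installed (the FEEDS setting needs Scrapy 2.1+). The file name, settings and output path are illustrative, not part of the original gist, and facebook.com serves profile pages only to authenticated sessions, so in practice login cookies would also have to be supplied (for example via a custom start_requests).

from scrapy.crawler import CrawlerProcess

from facebook_crawler import TiebaSpider  # the spider defined above

process = CrawlerProcess(settings={
    'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64)',
    'ROBOTSTXT_OBEY': False,  # facebook.com disallows crawlers in robots.txt
    'FEEDS': {'profiles.json': {'format': 'json'}},  # write yielded records as JSON
})
process.crawl(TiebaSpider)
process.start()  # blocks until the crawl finishes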