import json

import requests

from facebook_user_crawler import FbBaseCrawler

"""
FbBaseCrawler can be found here: https://gist.github.com/gearazk/7e5a7178dfdee70222bdeb9d8e8d155e
"""
class FbUserListCrawler(FbBaseCrawler):

    pages_crawl = 10  # number of result pages to fetch
    # _keyword = 'football'
    _fbuser_id = '100003307297044'
    API_URL = 'https://www.facebook.com/ajax/pagelet/generic.php/BrowseScrollingSetPagelet'

    def __init__(self, keyword):
        self.r = requests.Session()
        self.r.cookies.update({
            ...<your fb account cookies>
        })
        self._keyword = keyword
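    # The cookies dict above should hold your logged-in Facebook session
    # cookies. A hypothetical example of the keys Facebook sets (values are
    # placeholders; copy the real ones from your browser's dev tools):
    #
    #   self.r.cookies.update({
    #       'c_user': '<your numeric FB user id>',
    #       'xs': '<session secret>',
    #       'datr': '<browser token>',
    #   })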
    def crawl_now(self):
        self._next_page_params = {}
        user_list = {}
        # Crawl `self.pages_crawl` pages
        for i in range(self.pages_crawl):
            # Make the request
            resp = self._get(self.API_URL, params=self._search_keyword_payload(keyword=self._keyword))
            # Extract the JSON data. The response is prefixed with the JS guard
            # `for (;;);` (9 characters), so we cut it off manually.
            json_data = json.loads(resp.content[9:])
            # Get the paging cursor data to prepare for the next page.
            self._next_page_params = self._search_cursor_dict(json_data.get('jsmods', {}).get('require'))
            # If no HTML data is returned, it means an error.
            if not json_data.get('payload'):
                print('response-data-error')
                return list(user_list.keys())
            # Extract the user FB ids from the JSON response
            _user_list = self._extract_post_info(json_data)
            # For demo purposes
            print(_user_list)
            print('Page %s completed' % (i + 1))
            # Gather the results
            user_list = {**user_list, **_user_list}
            # If the cursor for the next page is not found, stop crawling
            if not isinstance(self._next_page_params, dict):
                print('Stopped at page %d' % (i + 1))
                print('next-page-error')
                break
        # Return a list of FB ids for the next crawler to get contact info from each profile
        return list(user_list.keys())
    def _extract_post_info(self, json_data):
        post_dict = {}
        attr_list = json_data.get('jsmods', {}).get('require') or []
        for _list in attr_list:
            if _list[0] == 'UFIController':
                _root = _list[3]
                _id = _root[2].get('feedbacktarget', {}).get('ownerid')
                post_dict[_id] = {
                    'owner_name': _root[1].get('ownerName'),
                }
        return post_dict
    def _search_cursor_dict(self, dict_list):
        if dict_list is None:
            return None
        for arr in dict_list:
            if len(arr) >= 4 and arr[1] == 'pageletComplete':
                return arr[3][0]
        return None
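
    # A matching `require` entry looks roughly like this (hypothetical shape,
    # heavily trimmed; arr[3][0] is the dict of cursor params that gets merged
    # into the next request's payload):
    #
    #   ["BrowseScrollingSetPagelet", "pageletComplete", [],
    #    [{"cursor": "...", "page_number": 2}]]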
    def _search_keyword_payload(self, keyword):
        """
        Basically just prepares the payload data in FB's format,
        to replicate the request.
        :return: dict
        """
        sub_query = {
            "bqf": "keywords_blended_posts(%s)" % keyword,
            "vertical": "content",
            "post_search_vertical": None,
            "filters": {
                "filter_author": "stories-feed-friends",
                "filter_author_enabled": "true"
            },
            "has_chrono_sort": False,
            "query_analysis": None,
            "subrequest_disabled": False,
            "token_role": "NONE",
            "preloaded_story_ids": [],
            "extra_data": None,
            "disable_main_browse_unicorn": False,
            "entry_point_scope": None,
            "entry_point_surface": None,
            "squashed_ent_ids": [],
            "source_session_id": None,
            "preloaded_entity_ids": [],
            "preloaded_entity_type": None,
            "query_source": None
        }
        enc_q = {
            "view": "list",
            "encoded_query": json.dumps(sub_query),
            "encoded_title": "",
            "ref": "unknown",
            "logger_source": "www_main",
            "typeahead_sid": "",
            "tl_log": False,
            # "impression_id": "c02624f9",
            "experience_type": "grammar",
            "exclude_ids": None,
            "browse_location": "browse_location:browse",
            "trending_source": None,
            "reaction_surface": None,
            "reaction_session_id": None,
            "ref_path": "/search/str/%s/keywords_blended_posts" % keyword,
            "is_trending": False,
            "topic_id": None,
            "place_id": None,
            "story_id": None,
            "callsite": "browse_ui:init_result_set",
            "has_top_pagelet": True,
            "display_params": {
                "crct": "none",
                "mrss": True
            },
        }
        # Merge in the cursor params from the previous page (empty on the first call)
        enc_q.update(self._next_page_params)
        return {
            'dpr': '1',
            'data': json.dumps(enc_q),
            '__user': self._fbuser_id,
            '__a': '1',
            '__be': '1',
            '__pc': 'PHASED:DEFAULT',
        }
if __name__ == '__main__':
    keyword = input('What industry do you want to find? : ')
    keyword = keyword.strip()

    crawler = FbUserListCrawler(keyword=keyword)
    ids = crawler.crawl_now()
    print(ids)

    # Feed the collected ids into the base crawler to get contact info from each profile
    contact_crawler = FbBaseCrawler(
        email='thienkhiemx',
        password='xxx',
        users_fbid=ids
    )
    contact_crawler.crawl_now()