Skip to content

Instantly share code, notes, and snippets.

@south1907
Created May 12, 2020 17:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save south1907/9ad26e860f78d452b836f431d634e682 to your computer and use it in GitHub Desktop.
Save south1907/9ad26e860f78d452b836f431d634e682 to your computer and use it in GitHub Desktop.
Fetch all comments from the Facebook comments plugin for every film in a given category on phimmoi.
import json
import os
import time

import requests
from bs4 import BeautifulSoup
# Wall-clock start, used to compute total runtime at the end of the script.
start_time = time.time()

# Cookie sent with every Facebook comments-plugin request.
# NOTE(review): this looks like a captured session cookie — presumably it
# must be refreshed when Facebook rotates it; verify before relying on it.
headers = {
    'cookie': 'fr=0shZ5eUbOjaYtgLs0..BeuWpJ...1.0.BeuWpJ.'
}
# Number of comments requested per pager call.
limit = 20
def get_id_film(url_film):
    """Resolve the Facebook object ID used by the comments plugin for a film.

    Loads Facebook's feedback plugin page with ``href=url_film`` and scrapes
    the ``targetFBID`` value out of the returned HTML/JS.

    Parameters:
        url_film: absolute URL of the film page embedding the plugin.

    Returns:
        The Facebook object ID as a string, or '' when the marker is not
        found in the response (e.g. the plugin page failed to render).
    """
    plugin_comment_root = 'https://www.facebook.com/plugins/feedback.php'
    # Request the plugin page with param href=url_film.
    r = requests.get(plugin_comment_root, params={'href': url_film},
                     timeout=30)
    resp = r.text
    marker = '"targetFBID":"'
    start_index = resp.find(marker)
    if start_index == -1:
        # Original code added 14 to find()'s -1 and returned a garbage
        # slice; return '' explicitly instead.
        return ''
    start_index += len(marker)
    # The ID runs up to the next double quote.
    end_index = resp.find('"', start_index)
    return resp[start_index:end_index]
def get_page(film_id, after_cursor=''):
    """Fetch one page of Facebook-plugin comments for a film.

    Parameters:
        film_id: Facebook object ID (from get_id_film).
        after_cursor: pagination cursor from the previous page; '' requests
            the first page.

    Returns:
        dict with keys:
            'data': list of dicts with 'comment_user', 'comment_content',
                    'comment_timestamp' and 'film_id',
            'next': cursor string for the following page,
            'film_name': film name reported by Facebook ('' if absent).

    Raises:
        KeyError / json.JSONDecodeError if Facebook's response does not
        have the expected payload shape.
    """
    data = {
        'after_cursor': after_cursor,
        'limit': limit,
        '__a': '1',
    }
    url = ('https://www.facebook.com/plugins/comments/async/'
           + film_id + '/pager/reverse_time/')
    resp = requests.post(url, headers=headers, data=data, timeout=30)
    # Facebook prefixes its JSON payload with a "for (;;);" guard — the
    # first 9 characters are stripped before parsing.
    res_obj = json.loads(resp.text[9:])
    id_map = res_obj['payload']['idMap']
    comments = []
    film_name = ''
    for item in id_map.values():
        # Entries come in three types: 'user', 'ogobject' (the film itself)
        # and 'comment'. The original code also unpacked 'user' entries into
        # locals that were never used — that dead code is removed here.
        if item['type'] == 'ogobject':
            if film_name == '':
                film_name = item['name']
        elif item['type'] == 'comment':
            comments.append({
                'comment_user': item['authorID'],
                'comment_content': item['body']['text'],
                'comment_timestamp': item['timestamp'],
                'film_id': film_id,
            })
    return {
        'data': comments,
        'next': res_obj['payload']['afterCursor'],
        'film_name': film_name,
    }
def get_all_of_film(url_film):
    """Collect every comment for a single film by paging until exhausted.

    Parameters:
        url_film: absolute URL of the film page.

    Returns:
        dict with 'data' (list of comment dicts, see get_page) and
        'film_id' (the resolved Facebook object ID).
    """
    film_id = get_id_film(url_film)
    collected = []
    cursor = ''
    while True:
        page = get_page(film_id, cursor)
        # An empty page means there are no further comments — stop paging.
        if not page['data']:
            break
        # Accumulate this page and advance the cursor for the next request.
        collected.extend(page['data'])
        cursor = page['next']
    return {
        'data': collected,
        'film_id': film_id,
    }
def get_film_of_cate(cate, page):
    """List the films shown on one phimmoi category page.

    Parameters:
        cate: category slug, e.g. 'phim-hanh-dong'.
        page: 1-based page number within the category.

    Returns:
        list of dicts with 'href' (film URL prefixed with the site domain)
        and 'title' (the anchor title with '/' replaced, so it is safe to
        use in a filename later).
    """
    domain = 'http://phimmoi.net/'
    url_cate_film = ('http://www.phimmoi.net/the-loai/' + cate
                     + '/page-' + str(page) + '.html')
    response = requests.get(url_cate_film)
    soup = BeautifulSoup(response.text, features="html.parser")
    results = []
    for anchor in soup.findAll('a', {'class': 'block-wrapper'}):
        results.append({
            'href': domain + anchor['href'],
            'title': anchor['title'].replace('/', ' '),
        })
    return results
# --- Script driver: scrape one category page and dump each film's comments ---
cate_film = 'phim-hanh-dong'
page_film = 1
list_film = get_film_of_cate(cate_film, page_film)
# The original script assumed data/ already existed and crashed with
# FileNotFoundError otherwise; create it up front.
os.makedirs('data', exist_ok=True)
for film in list_film:
    url_film = film['href']
    title_film = film['title']
    print(title_film)
    all_data = get_all_of_film(url_film)
    out_path = 'data/' + title_film + '_' + all_data['film_id'] + '.json'
    # encoding='utf-8' is required: json.dump with ensure_ascii=False writes
    # non-ASCII (Vietnamese) text verbatim, which would raise
    # UnicodeEncodeError under a non-UTF-8 platform default codec.
    with open(out_path, 'w', encoding='utf-8') as outfile:
        json.dump(all_data['data'], outfile, indent=4, ensure_ascii=False)
end_time = time.time()
total_time = end_time - start_time
# print(total_time)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment