Skip to content

Instantly share code, notes, and snippets.

@south1907
Created May 12, 2020 17:36
Embed
What would you like to do?
Get all Facebook comment-plugin comments for every film in a phimmoi category
import json
import os
import time

import requests
from bs4 import BeautifulSoup
# Wall-clock timestamp taken at start-up; the end of the script subtracts it
# from the finish time to get the total run time.
start_time = time.time()

# Value sent as the 'limit' form field of every comment-pager request.
limit = 20

# HTTP headers sent with every comment-pager POST request.
headers = {
    'cookie': 'fr=0shZ5eUbOjaYtgLs0..BeuWpJ...1.0.BeuWpJ.',
}
def get_id_film(url_film):
    """Return the Facebook comment-plugin target ID for a film page URL.

    Requests the Facebook feedback plugin page with ``href`` set to
    *url_film* and extracts the value that follows ``"targetFBID":"`` in the
    response text, up to the next double quote.

    Args:
        url_film: URL of the film page, passed as the ``href`` query parameter.

    Returns:
        The extracted target ID as a string.

    Raises:
        ValueError: If the ``targetFBID`` marker, or the quote that closes its
            value, is not present in the response.
    """
    marker = '"targetFBID":"'
    plugin_comment_root = 'https://www.facebook.com/plugins/feedback.php'
    # Request the plugin page with param href = url_film.
    r = requests.get(plugin_comment_root, params={'href': url_film})
    resp = r.text

    # Locate the marker. str.find() returns -1 when it is absent; adding the
    # marker length to that would silently yield a garbage slice, so check
    # before computing the start offset.
    marker_index = resp.find(marker)
    if marker_index == -1:
        raise ValueError('targetFBID not found in plugin response for ' + url_film)
    start_index = marker_index + len(marker)

    # The value ends at the next double quote.
    end_index = resp.find('"', start_index)
    if end_index == -1:
        raise ValueError('unterminated targetFBID in plugin response for ' + url_film)

    return resp[start_index:end_index]
def get_page(film_id, after_cursor=''):
    """Fetch one page of comments for a film from the comment pager endpoint.

    POSTs to the ``plugins/comments/async/<film_id>/pager/reverse_time/``
    endpoint using the module-level ``headers`` and ``limit``, parses the JSON
    reply and collects every ``idMap`` entry whose ``type`` is ``'comment'``.

    Args:
        film_id: Target ID of the film's comment thread (string; it is
            concatenated into the request URL).
        after_cursor: Paging cursor sent as the ``after_cursor`` form field;
            the empty string is the value used for the first request.

    Returns:
        A dict with three keys:
            ``'data'``: list of dicts with ``comment_user``,
                ``comment_content``, ``comment_timestamp`` and ``film_id``;
            ``'next'``: the ``afterCursor`` value from the reply payload;
            ``'film_name'``: the first non-empty ``name`` among entries of
                type ``'ogobject'``, or ``''`` when there is none.
    """
    url = 'https://www.facebook.com/plugins/comments/async/' + film_id + '/pager/reverse_time/'
    form = {
        'after_cursor': after_cursor,
        'limit': limit,
        '__a': '1',
    }
    reply = requests.post(url, headers=headers, data=form)

    # The first 9 characters of the body are skipped before JSON parsing.
    # NOTE(review): presumably the "for (;;);" guard prefix -- confirm.
    res_obj = json.loads(reply.text[9:])
    id_map = res_obj['payload']['idMap']

    comments = []
    film_name = ''
    # Iterate by key rather than .values() so an empty list-shaped idMap
    # (no entries) is still handled without error.
    for key in id_map:
        item = id_map[key]
        item_type = item['type']
        # Entries of other types (e.g. 'user') carry nothing this function
        # returns, so they are skipped.
        if item_type == 'ogobject':
            if film_name == '':
                film_name = item['name']
        elif item_type == 'comment':
            comments.append({
                'comment_user': item['authorID'],
                'comment_content': item['body']['text'],
                'comment_timestamp': item['timestamp'],
                'film_id': film_id,
            })

    return {
        'data': comments,
        'next': res_obj['payload']['afterCursor'],
        'film_name': film_name,
    }
def get_all_of_film(url_film):
    """Collect every comment of a film by walking the comment pager.

    Resolves the film's target ID with ``get_id_film`` and then calls
    ``get_page`` repeatedly, feeding each page's ``'next'`` cursor into the
    following request.

    Args:
        url_film: URL of the film page.

    Returns:
        A dict with ``'data'`` (list of all comment dicts gathered, in the
        order the pages returned them) and ``'film_id'`` (the resolved ID).
    """
    film_id = get_id_film(url_film)
    results = []
    after_cursor = ''
    while True:
        res = get_page(film_id, after_cursor)
        # No more comments: stop.
        if not res['data']:
            break
        # Otherwise accumulate and move on to the next page.
        results += res['data']
        after_cursor = res['next']
        # An empty/None cursor is the same value used for the very first
        # request, so continuing would start over from the first page and
        # never terminate.
        if not after_cursor:
            break
    return {
        'data': results,
        'film_id': film_id,
    }
def get_film_of_cate(cate, page):
    """List the films shown on one page of a phimmoi category.

    Downloads the category listing page and reads every ``<a>`` element with
    class ``block-wrapper``.

    Args:
        cate: Category slug used in the listing URL (e.g. ``'phim-hanh-dong'``).
        page: Page number of the listing.

    Returns:
        A list of dicts, one per film link, with ``'href'`` (the link's
        ``href`` prefixed with the site domain) and ``'title'`` (the link's
        ``title`` with every ``/`` replaced by a space).
    """
    domain = 'http://phimmoi.net/'
    url_cate_film = 'http://www.phimmoi.net/the-loai/' + cate + '/page-' + str(page) + '.html'
    listing = requests.get(url_cate_film)
    soup = BeautifulSoup(listing.text, features="html.parser")
    # find_all is the current bs4 name; findAll is a legacy alias.
    film_links = soup.find_all('a', {'class': 'block-wrapper'})
    return [
        {
            'href': domain + link['href'],
            # '/' is replaced because the title is later used in a file name.
            'title': link['title'].replace('/', ' '),
        }
        for link in film_links
    ]
# Script entry: dump the comments of every film on one category page into
# data/<title>_<film_id>.json, one file per film.
cate_film = 'phim-hanh-dong'
page_film = 1

# open(..., 'w') does not create missing directories, so make sure the
# output directory exists before the first write.
os.makedirs('data', exist_ok=True)

list_film = get_film_of_cate(cate_film, page_film)
for film in list_film:
    url_film = film['href']
    title_film = film['title']
    print(title_film)
    all_data = get_all_of_film(url_film)
    out_path = 'data/' + title_film + '_' + all_data['film_id'] + '.json'
    # ensure_ascii=False writes non-ASCII characters verbatim, so the file
    # must be opened with an explicit UTF-8 encoding instead of relying on
    # the platform default.
    with open(out_path, 'w', encoding='utf-8') as outfile:
        json.dump(all_data['data'], outfile, indent=4, ensure_ascii=False)

end_time = time.time()
total_time = end_time - start_time
# print(total_time)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment