Skip to content

Instantly share code, notes, and snippets.

@south1907
Created May 12, 2020 17:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save south1907/9ad26e860f78d452b836f431d634e682 to your computer and use it in GitHub Desktop.
Save south1907/9ad26e860f78d452b836f431d634e682 to your computer and use it in GitHub Desktop.
Fetch all comments from the Facebook comments plugin for every film in a given category on phimmoi.
import json
import os
import time

import requests
from bs4 import BeautifulSoup
# Wall-clock start, used to compute total runtime at the end of the script.
start_time = time.time()

# Cookie sent with every Facebook comments-plugin request.
# NOTE(review): this looks like a captured session cookie — presumably it
# must be refreshed when Facebook rotates it; verify before relying on it.
headers = {
    'cookie': 'fr=0shZ5eUbOjaYtgLs0..BeuWpJ...1.0.BeuWpJ.'
}
# Number of comments requested per pager call.
limit = 20
def get_id_film(url_film):
    """Resolve the Facebook object ID used by the comments plugin for a film.

    Loads Facebook's feedback plugin page with ``href=url_film`` and scrapes
    the ``targetFBID`` value out of the returned HTML/JS.

    Parameters:
        url_film: absolute URL of the film page embedding the plugin.

    Returns:
        The Facebook object ID as a string, or '' when the marker is not
        found in the response (e.g. the plugin page failed to render).
    """
    plugin_comment_root = 'https://www.facebook.com/plugins/feedback.php'
    # Request the plugin page with param href=url_film.
    r = requests.get(plugin_comment_root, params={'href': url_film},
                     timeout=30)
    resp = r.text
    marker = '"targetFBID":"'
    start_index = resp.find(marker)
    if start_index == -1:
        # Original code added 14 to find()'s -1 and returned a garbage
        # slice; return '' explicitly instead.
        return ''
    start_index += len(marker)
    # The ID runs up to the next double quote.
    end_index = resp.find('"', start_index)
    return resp[start_index:end_index]
def get_page(film_id, after_cursor=''):
    """Fetch one page of Facebook-plugin comments for a film.

    Parameters:
        film_id: Facebook object ID (from get_id_film).
        after_cursor: pagination cursor from the previous page; '' requests
            the first page.

    Returns:
        dict with keys:
            'data': list of dicts with 'comment_user', 'comment_content',
                    'comment_timestamp' and 'film_id',
            'next': cursor string for the following page,
            'film_name': film name reported by Facebook ('' if absent).

    Raises:
        KeyError / json.JSONDecodeError if Facebook's response does not
        have the expected payload shape.
    """
    data = {
        'after_cursor': after_cursor,
        'limit': limit,
        '__a': '1',
    }
    url = ('https://www.facebook.com/plugins/comments/async/'
           + film_id + '/pager/reverse_time/')
    resp = requests.post(url, headers=headers, data=data, timeout=30)
    # Facebook prefixes its JSON payload with a "for (;;);" guard — the
    # first 9 characters are stripped before parsing.
    res_obj = json.loads(resp.text[9:])
    id_map = res_obj['payload']['idMap']
    comments = []
    film_name = ''
    for item in id_map.values():
        # Entries come in three types: 'user', 'ogobject' (the film itself)
        # and 'comment'. The original code also unpacked 'user' entries into
        # locals that were never used — that dead code is removed here.
        if item['type'] == 'ogobject':
            if film_name == '':
                film_name = item['name']
        elif item['type'] == 'comment':
            comments.append({
                'comment_user': item['authorID'],
                'comment_content': item['body']['text'],
                'comment_timestamp': item['timestamp'],
                'film_id': film_id,
            })
    return {
        'data': comments,
        'next': res_obj['payload']['afterCursor'],
        'film_name': film_name,
    }
def get_all_of_film(url_film):
    """Collect every comment for a single film by paging until exhausted.

    Parameters:
        url_film: absolute URL of the film page.

    Returns:
        dict with 'data' (list of comment dicts, see get_page) and
        'film_id' (the resolved Facebook object ID).
    """
    film_id = get_id_film(url_film)
    collected = []
    cursor = ''
    while True:
        page = get_page(film_id, cursor)
        # An empty page means there are no further comments — stop paging.
        if not page['data']:
            break
        # Accumulate this page and advance the cursor for the next request.
        collected.extend(page['data'])
        cursor = page['next']
    return {
        'data': collected,
        'film_id': film_id,
    }
def get_film_of_cate(cate, page):
    """List the films shown on one phimmoi category page.

    Parameters:
        cate: category slug, e.g. 'phim-hanh-dong'.
        page: 1-based page number within the category.

    Returns:
        list of dicts with 'href' (film URL prefixed with the site domain)
        and 'title' (the anchor title with '/' replaced, so it is safe to
        use in a filename later).
    """
    domain = 'http://phimmoi.net/'
    url_cate_film = ('http://www.phimmoi.net/the-loai/' + cate
                     + '/page-' + str(page) + '.html')
    response = requests.get(url_cate_film)
    soup = BeautifulSoup(response.text, features="html.parser")
    results = []
    for anchor in soup.findAll('a', {'class': 'block-wrapper'}):
        results.append({
            'href': domain + anchor['href'],
            'title': anchor['title'].replace('/', ' '),
        })
    return results
# --- Script driver: scrape one category page and dump each film's comments ---
cate_film = 'phim-hanh-dong'
page_film = 1
list_film = get_film_of_cate(cate_film, page_film)
# The original script assumed data/ already existed and crashed with
# FileNotFoundError otherwise; create it up front.
os.makedirs('data', exist_ok=True)
for film in list_film:
    url_film = film['href']
    title_film = film['title']
    print(title_film)
    all_data = get_all_of_film(url_film)
    out_path = 'data/' + title_film + '_' + all_data['film_id'] + '.json'
    # encoding='utf-8' is required: json.dump with ensure_ascii=False writes
    # non-ASCII (Vietnamese) text verbatim, which would raise
    # UnicodeEncodeError under a non-UTF-8 platform default codec.
    with open(out_path, 'w', encoding='utf-8') as outfile:
        json.dump(all_data['data'], outfile, indent=4, ensure_ascii=False)
end_time = time.time()
total_time = end_time - start_time
# print(total_time)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment