@PhanDuc
Created February 18, 2017 01:54
from bs4 import BeautifulSoup
import requests
import re
import csv
import pickle
from gevent.pool import Pool

id_data_list = []  # Facebook page IDs scraped from the listing pages
data_csv = []      # [name, href] rows collected in the main loop and written by write_csv()
csv_columns = ['Name', 'FB ID']
def write_csv(data_list):
    # Write the collected rows to socialbakers.csv with a header row.
    try:
        with open('socialbakers.csv', 'w+', newline='', encoding='utf-8-sig') as my_csv:
            writer = csv.writer(my_csv, dialect='excel', quoting=csv.QUOTE_NONNUMERIC)
            writer.writerow(csv_columns)
            for data in data_list:
                writer.writerow(data)
    except OSError as exc:
        print("Error writing CSV:", exc)
def get_1000_id():
    # Get the IDs of 1000 brand pages in Vietnam
    # (100 listing pages fetched through the site's AJAX endpoint).
    session = requests.Session()
    for number in range(100):
        link = ('https://www.socialbakers.com/statistics/facebook/pages/total/vietnam/brands/'
                'page-{0}/?showMoreList-from=1&do=platformList-renderAjax&json='.format(number + 1))
        try:
            source = session.get(link)
        except requests.RequestException:
            continue
        soup = BeautifulSoup(source.text, 'html.parser')
        # Every anchor except the trailing one links to a brand page; the first
        # run of digits in its href is the page ID.
        class_brand = soup.find_all('a')[:-1]
        pattern = r'(\d+)'
        for item in class_brand:
            match = re.search(pattern, item.attrs.get('href', ''))
            if match:
                id_data_list.append(match.group(1))
    # Cache the IDs so the listing pages do not have to be scraped again.
    with open("All_id", "wb") as f:
        pickle.dump(id_data_list, f)
if __name__ == '__main__':
    count_ = 0
    get_1000_id()
    pool = Pool(25)
    try:
        for id in id_data_list:
            count_ += 1
            if count_ % 100 == 0:
                print("{0}/{1}".format(count_, len(id_data_list)))
            url = 'https://www.socialbakers.com/statistics/facebook/pages/detail/{0}'.format(id)
            # pool.spawn() returns a Greenlet; pool.join() returns None, so the
            # response must be read from the greenlet's .value after joining it.
            job = pool.spawn(requests.get, url)
            job.join()
            source = job.value
            soup = BeautifulSoup(source.text, 'html.parser')
            # The tooltip anchor on the detail page carries the page name (text);
            # its href is what ends up in the 'FB ID' column.
            link_tag = soup.find('a', attrs={'class': 'blank show-tooltip'})
            name = link_tag.text
            name_page = link_tag['href']
            data_csv.append([name, name_page])
        write_csv(data_csv)
    except Exception as exc:
        print("Error:", exc)