# Bot Scraping by @Kurokami1006 (forked from kieuchinh1012/Bot Scraping, created November 9, 2021)
from selenium import webdriver
from bs4 import BeautifulSoup
from time import sleep
import csv

print('- Finished importing packages')
# 1. Open a Chrome browser and go to the target site
options = webdriver.ChromeOptions()
options.add_experimental_option("excludeSwitches", ["enable-logging"])
driver = webdriver.Chrome(options=options, executable_path=r"C://Users/Dell.T/Documents/Code/chromedriver.exe")
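# Note: Selenium 4 deprecates the executable_path argument. A sketch of the
# newer form, assuming Selenium 4 and the same chromedriver location:
#   from selenium.webdriver.chrome.service import Service
#   driver = webdriver.Chrome(service=Service(r"C://Users/Dell.T/Documents/Code/chromedriver.exe"), options=options)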
def GetURL():
    # Parse the current listing page and collect every unique company profile link
    page_source = BeautifulSoup(driver.page_source, 'html.parser')
    profile_range = page_source.find('div', class_="col-sm-8 table-striped")
    profiles = profile_range.find_all('a')
    all_profile_URL = []
    for profile in profiles:
        profile_URL = profile.get('href')
        if profile_URL not in all_profile_URL:
            all_profile_URL.append(profile_URL)
    return all_profile_URL
def GetURLallPages():
    # Walk the paginated listing and gather profile URLs from every requested page
    page_number = 1
    URLs_all_page = []
    input_page = int(input('How many pages do you want to scrape: '))
    while page_number <= input_page:  # <= so exactly input_page pages are scraped
        url = 'https://ha-noi.congtydoanhnghiep.com/trang-' + str(page_number)
        driver.get(url)
        sleep(1)  # short pause so the page can load and to go easy on the server
        URLs_one_page = GetURL()
        URLs_all_page = URLs_all_page + URLs_one_page
        page_number = page_number + 1
    return URLs_all_page
URLs_all_page = GetURLallPages()
with open('output.csv', 'w', newline='', encoding='utf-8') as file_output:
    headers = ['Company name', 'English name', 'CEO name', 'Tax code', 'Address', 'Phone', 'Status', 'Register address', 'Register date', 'URL']
    writer = csv.DictWriter(file_output, delimiter=",", lineterminator='\n', fieldnames=headers)
    writer.writeheader()
    for Business_URL in URLs_all_page:
        driver.get(Business_URL)
        page_source = driver.page_source
        soup = BeautifulSoup(page_source, 'html.parser')
        # The company details live in the page's responsive table
        info_div = soup.find('div', class_="table-responsive")
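        # Added guard (an assumption, not in the original script): some profile
        # pages may lack the expected table, so skip them instead of crashing.
        if info_div is None:
            continue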
        info_loc = info_div.find_all('td')
        company_name = info_loc[0].get_text().strip()
        english_name = info_loc[1].get_text().strip()
        ceo_name = info_loc[2].get_text().strip()
        tax_code = info_loc[3].get_text().strip()
        address = info_loc[4].get_text().strip()
        phone = info_loc[5].get_text().strip()
        status = info_loc[6].get_text().strip()
        register_add = info_loc[7].get_text().strip()
        register_date = info_loc[8].get_text().strip()
        # Echo the record so progress is visible while the scraper runs
        print(company_name)
        print(english_name)
        print(ceo_name)
        print(tax_code)
        print(address)
        print(phone)
        print(status)
        print(register_add)
        print(register_date)
        print('\n')
        writer.writerow({headers[0]: company_name, headers[1]: english_name, headers[2]: ceo_name, headers[3]: tax_code, headers[4]: address, headers[5]: phone, headers[6]: status, headers[7]: register_add, headers[8]: register_date, headers[9]: Business_URL})
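
# Close the browser once all pages have been scraped and written out
driver.quit()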