# Gist by @kieuchinh1012, created August 9, 2021
from selenium import webdriver
from bs4 import BeautifulSoup
from time import sleep  # unused in the original; see the throttling note after GetURLallPages
import csv

print('- Finished importing packages')

# 1. Open a Chrome browser and visit the website
options = webdriver.ChromeOptions()
options.add_experimental_option("excludeSwitches", ["enable-logging"])
driver = webdriver.Chrome(options=options, executable_path=r"C:/Users/Dell.T/Documents/Code/chromedriver.exe")
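# NOTE: Selenium 4 deprecates the executable_path argument. A minimal sketch of
# the newer Service-based setup (same local chromedriver path assumed):
#     from selenium.webdriver.chrome.service import Service
#     driver = webdriver.Chrome(service=Service(r"C:/Users/Dell.T/Documents/Code/chromedriver.exe"), options=options)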

def GetURL():
    """Collect the unique company-profile URLs on the current listing page."""
    page_source = BeautifulSoup(driver.page_source, 'html.parser')
    profile_range = page_source.find('div', class_="col-sm-8 table-striped")
    profiles = profile_range.find_all('a')
    all_profile_URL = []
    for profile in profiles:
        profile_URL = profile.get('href')
        if profile_URL not in all_profile_URL:
            all_profile_URL.append(profile_URL)
    return all_profile_URL
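# Two assumptions in GetURL worth guarding against (sketch only):
# - Every listing page is assumed to contain the "col-sm-8 table-striped" div;
#   if it is missing, find() returns None and find_all() raises. Returning []
#   when profile_range is None would make the function fail soft.
# - The hrefs are passed straight to driver.get(), so they are assumed to be
#   absolute URLs; if they turn out to be relative, resolve them first, e.g.:
#       from urllib.parse import urljoin
#       profile_URL = urljoin('https://ha-noi.congtydoanhnghiep.com/', profile.get('href'))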

def GetURLallPages():
    """Step through the paginated listing and gather profile URLs from every page."""
    page_number = 1
    URLs_all_page = []
    input_page = int(input('How many pages do you want to scrape: '))
    # <= fixes an off-by-one: the original `<` scraped one page fewer than requested
    while page_number <= input_page:
        url = 'https://ha-noi.congtydoanhnghiep.com/trang-' + str(page_number)
        driver.get(url)
        URLs_one_page = GetURL()
        URLs_all_page = URLs_all_page + URLs_one_page
        page_number = page_number + 1
    return URLs_all_page
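# sleep is imported but never called; a polite delay inside the while loop,
# right after driver.get(url), would throttle the crawl (the interval below
# is a guess, not from the original):
#     sleep(1)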
URLs_all_page = GetURLallPages()

with open('output.csv', 'w', newline='') as file_output:
    headers = ['Company name', 'english name', 'ceo name', 'tax code', 'address', 'phone', 'status', 'register add', 'register date', 'URL']
    writer = csv.DictWriter(file_output, delimiter=",", lineterminator='\n', fieldnames=headers)
    writer.writeheader()
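    # Assumption: the scraped fields contain Vietnamese text. On Windows the
    # default locale codec can raise UnicodeEncodeError when writing it, so
    # consider opening the file with an explicit encoding, e.g.
    #     open('output.csv', 'w', newline='', encoding='utf-8-sig')
    # (utf-8-sig also lets Excel detect the encoding automatically).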
    for Business_URL in URLs_all_page:
        driver.get(Business_URL)
        page_source = driver.page_source
        soup = BeautifulSoup(page_source, 'html.parser')
        info_div = soup.find('div', class_="table-responsive")
        # The detail table is assumed to list the fields in this fixed order
        info_loc = info_div.find_all('td')
        company_name = info_loc[0].get_text().strip()
        english_name = info_loc[1].get_text().strip()
        ceo_name = info_loc[2].get_text().strip()
        tax_code = info_loc[3].get_text().strip()
        address = info_loc[4].get_text().strip()
        phone = info_loc[5].get_text().strip()
        status = info_loc[6].get_text().strip()
        register_add = info_loc[7].get_text().strip()
        register_date = info_loc[8].get_text().strip()
        print(company_name)
        print(english_name)
        print(ceo_name)
        print(tax_code)
        print(address)
        print(phone)
        print(status)
        print(register_add)
        print(register_date)
        print('\n')
        # Bug fix: the original called write.writerow (a NameError); the object is writer.
        # headers[9] ('URL') was declared but never written, so record Business_URL in it too.
        writer.writerow({headers[0]: company_name, headers[1]: english_name, headers[2]: ceo_name, headers[3]: tax_code, headers[4]: address, headers[5]: phone, headers[6]: status, headers[7]: register_add, headers[8]: register_date, headers[9]: Business_URL})