# import packages (only the ones actually used below)
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
import time
import pandas as pd
import spacy

# load the small English spaCy pipeline for named-entity recognition
nlp = spacy.load('en_core_web_sm')
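# the pipeline can be installed once with: python -m spacy download en_core_web_sm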
# create a new instance of Google Chrome
options = webdriver.ChromeOptions()
options.add_argument("--window-size=1920,1080")
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
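# optionally run Chrome headless, e.g. when scraping on a machine without a display
# options.add_argument('--headless')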
# Selenium 4 expects the driver path to be passed via a Service object
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
driver.implicitly_wait(0.5)
# open the first page of the ICAC press-release index
link = "https://www.icac.org.hk/en/press/index_p1.html"
driver.get(link)
# wait for 3 seconds to ensure the page is loaded
time.sleep(3)

# initialize the data frame columns
all_content = {"Ref": [],
               "Date": [],
               "Title": [],
               "Content": [],
               "Person_list": [],
               "Org_list": [],
               "GPE_list": [],
               "Law_list": [],
               "Money_list": [],
               "Link": []}
# the total number of pages is shown in the fifth link of the pagination bar
page_no = driver.find_element(By.CSS_SELECTOR, ".pagenav a:nth-of-type(5)").get_attribute("textContent")
for p in range(1, int(page_no) + 1):
    # collect the article links of this page for the second loop below
    all_links = []
    # each index page follows the format index_p{page number}.html
    link1 = f"https://www.icac.org.hk/en/press/index_p{p}.html"
    driver.get(link1)
    # each page lists around 10 articles, each stored in a <div class="pressItem clearfix">
    all_posts = driver.find_elements(By.CSS_SELECTOR, ".pressItem.clearfix")
    for post in all_posts:
        # the article link is the href of the <a> inside the "details" element
        link2 = post.find_element(By.CSS_SELECTOR, ".details a").get_attribute("href")
        all_links.append(link2)
    for l in all_links:
        time.sleep(2.5)
        # open the article link collected above
        driver.get(l)
        # web link of the article (also used to derive the reference number)
        linkRef = l
        # reference number of the article, taken from the file name in the URL
        ref = linkRef.rsplit('/', 1)[-1].replace("index_id_", "").replace(".html", "")
        all_content["Ref"].append(ref)
        # date of the article is stored in class "date"
        date = driver.find_element(By.CSS_SELECTOR, ".date").get_attribute("textContent")
        all_content["Date"].append(date)
        # title of the article is stored in the <h2> inside class "contentWrap"
        title = driver.find_element(By.CSS_SELECTOR, ".contentWrap h2").get_attribute("textContent")
        all_content["Title"].append(title)
        # content of the article is stored in class "pressContent full"
        content = driver.find_element(By.CSS_SELECTOR, ".pressContent.full").get_attribute("textContent")
        all_content["Content"].append(content)
        all_content["Link"].append(linkRef)
        # run spaCy NER over the article text and bucket the entities by label
        doc = nlp(content)
        Person_list = []
        Org_list = []
        Gpe_list = []
        Law_list = []
        Money_list = []
        for ent in doc.ents:
            if ent.label_ == "PERSON":
                Person_list.append(ent.text)
            elif ent.label_ == "ORG":
                Org_list.append(ent.text)
            elif ent.label_ == "GPE":
                Gpe_list.append(ent.text)
            elif ent.label_ == "LAW":
                Law_list.append(ent.text)
            elif ent.label_ == "MONEY":
                Money_list.append(ent.text)
        # persons
        persons = ", ".join(set(Person_list))
        all_content["Person_list"].append(persons)
        # organisations
        org = ", ".join(set(Org_list))
        all_content["Org_list"].append(org)
        # geopolitical entities
        gpe = ", ".join(set(Gpe_list))
        all_content["GPE_list"].append(gpe)
        # laws
        law = ", ".join(set(Law_list))
        all_content["Law_list"].append(law)
        # monetary amounts
        money = ", ".join(set(Money_list))
        all_content["Money_list"].append(money)

# convert the collected dictionary into a data frame
all_content = pd.DataFrame(all_content)
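# a minimal follow-up sketch: persist the scraped table and release the browser
# (the file name "icac_press_releases.csv" is only an example)
all_content.to_csv("icac_press_releases.csv", index=False)
driver.quit()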