Skip to content

Instantly share code, notes, and snippets.

@hoyinli1211
Created June 22, 2022 16:14
Show Gist options
  • Save hoyinli1211/9ecaa75ee57f9d3ac50eab24bf80cbec to your computer and use it in GitHub Desktop.
Save hoyinli1211/9ecaa75ee57f9d3ac50eab24bf80cbec to your computer and use it in GitHub Desktop.
#import packages
# Standard library
import math
import time
from datetime import datetime
from datetime import datetime, timedelta

# Third-party
import numpy as np
import pandas as pd
import requests
import spacy
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.firefox import GeckoDriverManager
# Load the small English spaCy model used for named-entity recognition below.
nlp = spacy.load('en_core_web_sm')

# Create a new headless-friendly instance of Google Chrome.
options = webdriver.ChromeOptions()
options.add_argument("--window-size=1920,1080")
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
# FIX: Selenium 4.10+ removed the positional executable-path argument, so
# webdriver.Chrome(ChromeDriverManager().install(), ...) raises TypeError.
# Wrap the downloaded driver binary in a Service object instead.
driver = webdriver.Chrome(
    service=ChromeService(ChromeDriverManager().install()),
    options=options,
)
driver.implicitly_wait(0.5)

# Open the first page of the ICAC press-release index.
link = "https://www.icac.org.hk/en/press/index_p1.html"
driver.get(link)
# Wait 3 seconds to ensure the page is fully loaded.
time.sleep(3)
#initialize the data frame
all_content = {"Ref":[],
"Date":[],
"Title":[],
"Content":[],
"Person_list":[],
"Org_list":[],
"GPE_list":[],
"Law_list":[],
"Money_list":[],
"Link":[]}
# Total page count: the 5th anchor in the pagination bar holds the last page number.
page_no = driver.find_element(By.CSS_SELECTOR, ".pagenav a:nth-of-type(5)").get_attribute("textContent")

for p in range(1, int(page_no) + 1):
    # Collect every article link on index page p before visiting them.
    all_links = []
    # Each index page uses the URL format index_p{page number}.html.
    link1 = f"https://www.icac.org.hk/en/press/index_p{p}.html"
    driver.get(link1)
    # There are roughly 10 articles per page, each inside
    # <div class="pressItem clearfix">.
    all_posts = driver.find_elements(By.CSS_SELECTOR, ".pressItem.clearfix")
    for post in all_posts:
        # The article URL is the href of the <a> inside class "details".
        link2 = post.find_element(By.CSS_SELECTOR, ".details a").get_attribute("href")
        all_links.append(link2)

    for l in all_links:
        # Small delay between article requests to avoid hammering the server.
        time.sleep(2.5)
        driver.get(l)

        # BUG FIX: the original computed the reference from `linkRef`, which was
        # only assigned further down the loop body — so the first article raised
        # NameError, and every later article took its Ref from the PREVIOUS
        # article's URL. Derive it from the current link `l` instead.
        ref = l.rsplit('/', 1)[-1].replace("index_id_", "").replace(".html", "")
        all_content["Ref"].append(ref)

        # Date of the article is stored in class "date".
        date = driver.find_element(By.CSS_SELECTOR, ".date").get_attribute("textContent")
        all_content["Date"].append(date)
        # Title of the article is in <h2> inside class "contentWrap".
        title = driver.find_element(By.CSS_SELECTOR, ".contentWrap h2").get_attribute("textContent")
        all_content["Title"].append(title)
        # Body text of the article is in class "pressContent full".
        content = driver.find_element(By.CSS_SELECTOR, ".pressContent.full").get_attribute("textContent")
        all_content["Content"].append(content)
        # Web link of the article.
        all_content["Link"].append(l)

        # Run NER once over the article body and bucket the entity types we keep.
        doc = nlp(content)
        buckets = {"PERSON": [], "ORG": [], "GPE": [], "LAW": [], "MONEY": []}
        for ent in doc.ents:
            if ent.label_ in buckets:
                buckets[ent.label_].append(ent)

        # De-duplicate each entity type and store it as a comma-separated string.
        for label, column in (
            ("PERSON", "Person_list"),
            ("ORG", "Org_list"),
            ("GPE", "GPE_list"),
            ("LAW", "Law_list"),
            ("MONEY", "Money_list"),
        ):
            all_content[column].append(", ".join(map(str, set(buckets[label]))))

# Convert the accumulated columns into a single DataFrame.
all_content = pd.DataFrame(all_content)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment