# import packages (only the ones actually used below)
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
import time
import pandas as pd
import spacy

# load the small English spaCy pipeline for named-entity recognition
nlp = spacy.load('en_core_web_sm')
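# the pipeline can be installed once with: python -m spacy download en_core_web_sm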
# create a new instance of Google Chrome
options = webdriver.ChromeOptions()
options.add_argument("--window-size=1920,1080")
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
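# optionally run Chrome headless, e.g. when scraping on a machine without a display
# options.add_argument('--headless')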
# Selenium 4 expects the driver path to be passed via a Service object
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
driver.implicitly_wait(0.5)
# open the first page of the ICAC press-release index
link = "https://www.icac.org.hk/en/press/index_p1.html"
driver.get(link)
# wait for 3 seconds to ensure the page is loaded
time.sleep(3)

# initialize the data frame columns
all_content = {"Ref": [],
               "Date": [],
               "Title": [],
               "Content": [],
               "Person_list": [],
               "Org_list": [],
               "GPE_list": [],
               "Law_list": [],
               "Money_list": [],
               "Link": []}
# the total number of pages is shown in the fifth link of the pagination bar
page_no = driver.find_element(By.CSS_SELECTOR, ".pagenav a:nth-of-type(5)").get_attribute("textContent")
for p in range(1, int(page_no) + 1):
    # collect the article links of this page for the second loop below
    all_links = []
    # each index page follows the format index_p{page number}.html
    link1 = f"https://www.icac.org.hk/en/press/index_p{p}.html"
    driver.get(link1)
    # each page lists around 10 articles, each stored in a <div class="pressItem clearfix">
    all_posts = driver.find_elements(By.CSS_SELECTOR, ".pressItem.clearfix")
    for post in all_posts:
        # the article link is the href of the <a> inside the "details" element
        link2 = post.find_element(By.CSS_SELECTOR, ".details a").get_attribute("href")
        all_links.append(link2)
    for l in all_links:
        time.sleep(2.5)
        # open the article link collected above
        driver.get(l)
        # web link of the article (also used to derive the reference number)
        linkRef = l
        # reference number of the article, taken from the file name in the URL
        ref = linkRef.rsplit('/', 1)[-1].replace("index_id_", "").replace(".html", "")
        all_content["Ref"].append(ref)
        # date of the article is stored in class "date"
        date = driver.find_element(By.CSS_SELECTOR, ".date").get_attribute("textContent")
        all_content["Date"].append(date)
        # title of the article is stored in the <h2> inside class "contentWrap"
        title = driver.find_element(By.CSS_SELECTOR, ".contentWrap h2").get_attribute("textContent")
        all_content["Title"].append(title)
        # content of the article is stored in class "pressContent full"
        content = driver.find_element(By.CSS_SELECTOR, ".pressContent.full").get_attribute("textContent")
        all_content["Content"].append(content)
        all_content["Link"].append(linkRef)
        # run spaCy NER over the article text and bucket the entities by label
        doc = nlp(content)
        Person_list = []
        Org_list = []
        Gpe_list = []
        Law_list = []
        Money_list = []
        for ent in doc.ents:
            if ent.label_ == "PERSON":
                Person_list.append(ent.text)
            elif ent.label_ == "ORG":
                Org_list.append(ent.text)
            elif ent.label_ == "GPE":
                Gpe_list.append(ent.text)
            elif ent.label_ == "LAW":
                Law_list.append(ent.text)
            elif ent.label_ == "MONEY":
                Money_list.append(ent.text)
        # persons
        persons = ", ".join(set(Person_list))
        all_content["Person_list"].append(persons)
        # organisations
        org = ", ".join(set(Org_list))
        all_content["Org_list"].append(org)
        # geopolitical entities
        gpe = ", ".join(set(Gpe_list))
        all_content["GPE_list"].append(gpe)
        # laws
        law = ", ".join(set(Law_list))
        all_content["Law_list"].append(law)
        # monetary amounts
        money = ", ".join(set(Money_list))
        all_content["Money_list"].append(money)

# convert the collected dictionary into a data frame
all_content = pd.DataFrame(all_content)
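# a minimal follow-up sketch: persist the scraped table and release the browser
# (the file name "icac_press_releases.csv" is only an example)
all_content.to_csv("icac_press_releases.csv", index=False)
driver.quit()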