Skip to content

Instantly share code, notes, and snippets.

@Jiali-Qi
Created October 1, 2019 02:50
Show Gist options
  • Save Jiali-Qi/e6ffe256a60b382da80cc6db49dda856 to your computer and use it in GitHub Desktop.
Save Jiali-Qi/e6ffe256a60b382da80cc6db49dda856 to your computer and use it in GitHub Desktop.
Assignment 3
# -*- coding: utf-8 -*-
"""
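Crawl pages starting from https://www.stevens.edu/ and collect the
@stevens.edu e-mail addresses found on them into email.txt.

Created on Mon Sep 23 11:06:35 2019
@author: qijia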
"""
import pathlib
import queue
import re
from urllib.parse import urljoin
from urllib.parse import urlparse

import requests
import selenium
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
def is_absolute(url):
    """Determine whether URL is absolute.

    A URL counts as absolute when it carries a network location (host),
    e.g. ``https://host/path`` or the scheme-relative ``//host/path``.
    Relative paths, fragment-only links and host-less links such as
    ``mailto:...`` have no network location and are reported as not absolute.

    Args:
        url: The URL string to inspect. ``None`` or an empty string (for
            example an ``<a>`` tag without an ``href``) is treated as not
            absolute.

    Returns:
        ``True`` if *url* contains a network location, ``False`` otherwise.
    """
    # Guard explicitly instead of relying on urlparse's handling of
    # falsy non-string input.
    if not url:
        return False
    return bool(urlparse(url).netloc)
# --- Crawl configuration -------------------------------------------------
START_URL = "https://www.stevens.edu/"
MAX_PAGES = 10000
CHROMEDRIVER_PATH = 'C:/Users/qijia/Downloads/chromedriver_win32/chromedriver.exe'
OUTPUT_FILE = "email.txt"
# Raw string with an escaped dot; the local part is limited to address
# characters so surrounding text and neighbouring addresses are not
# swallowed into one match.
EMAIL_PATTERN = re.compile(r"[\w.+-]+@stevens\.edu\b")

options = webdriver.ChromeOptions()
options.add_argument("headless")
# NOTE(review): `executable_path` / `chrome_options` are the keyword names
# this script was written against; confirm they are still accepted by the
# installed Selenium release.
driver = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH, chrome_options=options)

email_addresses = []
q = queue.Queue()
q.put(START_URL)
# Every URL that has ever been enqueued, so no page is queued twice.
seen_urls = {START_URL}

try:
    for _ in range(MAX_PAGES):
        # Queue.get() would block forever on an empty queue.
        if q.empty():
            break
        url = q.get()

        # One unreachable or malformed page must not abort the whole crawl.
        try:
            driver.get(url)
            page_source = driver.page_source
        except WebDriverException as exc:
            print("Skipping {}: {}".format(url, exc))
            continue
        soup = BeautifulSoup(page_source, 'html.parser')

        # Extract all email addresses, keeping the list free of duplicates.
        found = EMAIL_PATTERN.findall(soup.get_text())
        email_addresses = sorted(set(email_addresses).union(found))

        for link in soup.find_all('a'):
            href = link.get('href')
            if not href:
                # <a> tag without a usable href.
                continue
            # urljoin resolves relative and scheme-relative hrefs against
            # the current page and returns absolute hrefs as they are.
            parts = urlparse(urljoin(url, href))
            if parts.scheme not in ("http", "https"):
                # mailto:, javascript:, tel: ... are not pages to visit.
                continue
            # Drop the fragment: "#section" variants are the same page.
            u = parts._replace(fragment="").geturl()
            if u not in seen_urls:
                seen_urls.add(u)
                q.put(u)

        print("Queue size: {}".format(q.qsize()))
        print("# email addresses: {}".format(len(email_addresses)))
finally:
    # Save whatever was collected even if the crawl was interrupted, and
    # always shut the browser down so no Chrome process is left behind.
    try:
        with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
            f.writelines(e + "\n" for e in email_addresses)
    finally:
        driver.quit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment