Created
June 16, 2022 09:19
-
-
Save CognitiveDave/4c685b4bcb449db5cf07b9614777fcac to your computer and use it in GitHub Desktop.
Cleaning my Medium reading list
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
"""
Reading List exploration file
Exploring concepts and approaches to solving the problem
"""
# Step 1 - load the browser and prepare for automation.
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

chrome_options = webdriver.ChromeOptions()
# Uncomment the next line to hide the browser window; leaving it
# commented lets you watch what selenium does during execution.
#chrome_options.add_argument('--headless')
for flag in ('--no-sandbox', '--disable-dev-shm-usage'):
    chrome_options.add_argument(flag)

# Launch Chrome; webdriver_manager downloads a matching driver binary.
wd = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=chrome_options,
)

# Step 2 - extra libraries: bs4 for HTML parsing, time for pauses.
from bs4 import BeautifulSoup
import time

action = ActionChains(wd)
def clicker(elements, delay=3):
    """Click every element in *elements*, pausing after each click.

    NOTE(review): defined but never called in the visible script.

    Parameters
    ----------
    elements : iterable
        Web elements (anything exposing a ``.click()`` method).
    delay : float, optional
        Seconds to sleep after each click so the page can react.
        Defaults to 3, the pause that was previously hard-coded.
    """
    for element in elements:
        element.click()
        time.sleep(delay)
# Step 3 - key variables.
# urlSignIn should hold the one-time sign-in link Medium emails you.
urlSignIn = "get one of these from medium by signing into medium via email"
# The reading-list page to be cleaned.
url = "https://medium.com/@cognitivedave/list/reading-list"

wd.get(urlSignIn)  # authenticate first
time.sleep(5)      # allow pre/post JS events and a full page load
wd.get(url)        # then open the reading list
time.sleep(20)     # the list renders slowly; give it plenty of time
# ------------------------------------------------------------
# Scroll the page repeatedly until the document stops growing, so every
# lazily-loaded article is present in the DOM.
# https://medium.com/@dreamferus/how-to-scrape-code-from-medium-using-python-f51d68f91bd1
height = 0
latest_height = 1
while height < latest_height:
    latest_height = wd.execute_script('return document.body.scrollHeight')
    # Step down the page 200px at a time, pausing so content can load.
    for y in range(height, latest_height, 200):
        wd.execute_script(f"window.scrollTo(0, {y})")
        time.sleep(.4)
    # Remember where we got to, then re-measure: if more content was
    # appended the loop runs again from the old height.
    height, latest_height = (
        latest_height,
        wd.execute_script('return document.body.scrollHeight'),
    )
# ------------------------------------------------------------
# Hand the fully-expanded page to BeautifulSoup.
# NOTE(review): `soup` is never used later in this file — confirm before
# removing.
soup = BeautifulSoup(wd.page_source, features="lxml")
# Step 4 - tick the checkbox for every article in the list.
# The checkbox for the i-th input (0-based) sits at odd div index
# 2*i + 1 inside the list container, so compute the index directly
# instead of maintaining a separate counter (the old code also looped
# with enumerate() but never used either loop variable).
# NOTE(review): this absolute XPath is brittle and will break if Medium
# changes its page layout.
xtemplateleft = """/html/body/div/div/div[3]/div/div/main/div/div[3]/div[1]/div[2]/div/div/div["""
xtemplateright = """]/label/div[1]"""
elements = wd.find_elements(By.TAG_NAME, value="input")
for i in range(len(elements)):
    path = xtemplateleft + str(2 * i + 1) + xtemplateright
    inputbox = wd.find_element(By.XPATH, value=path)
    print(path)    # debug: show which checkbox is about to be clicked
    time.sleep(1)  # short pauses around the click so the UI keeps up
    inputbox.click()
    time.sleep(1)
wd.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment