Cleaning my Medium list
# -*- coding: utf-8 -*-
"""
Reading List exploration file
Exploring concepts and approaches to solving the problem
"""
# Step 1 - load the browser and prepare for automation
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
chrome_options = webdriver.ChromeOptions()
# Uncomment the next line to run Chrome headless, i.e. without
# a visible browser window during the selenium run
#chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
wd = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
# Step 2 - add extra libraries (bs4 and time) for page interaction
from bs4 import BeautifulSoup
import time
action = ActionChains(wd)
def clicker(elements):
    for e in elements:
        e.click()
        time.sleep(3)
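# Note: clicker() is never called below; once the checkbox elements have
# been collected, the per-element XPath loop at the end could arguably be
# replaced with a single clicker(elements) call.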
# Step 3 - define some key variables
urlSignIn = "paste the sign-in link that Medium emails you" # magic link obtained by signing in to Medium via email
url = "https://medium.com/@cognitivedave/list/reading-list" #url for the reading list
wd.get(urlSignIn) #Sign in first
time.sleep(5) # wait a few seconds for the sign-in JS to run and the page to fully load
wd.get(url) #navigate to the reading list and wait
time.sleep(20)
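# WebDriverWait (imported above) goes unused; as a hedged sketch, the fixed
# 20-second sleep could instead wait explicitly for the list's checkbox
# inputs to appear (the By.TAG_NAME target here is an assumption):
try:
    WebDriverWait(wd, 20).until(
        EC.presence_of_element_located((By.TAG_NAME, "input"))
    )
except TimeoutException:
    print("reading list did not finish loading in time")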
# ------------------------------------------------------------
# scroll the page in steps to its full height so every saved article is loaded
# https://medium.com/@dreamferus/how-to-scrape-code-from-medium-using-python-f51d68f91bd1
height = 0
latest_height = 1
# scroll through the page iteratively until we reach the max height
while height < latest_height:
    latest_height = wd.execute_script('return document.body.scrollHeight')
    for y in range(height, latest_height, 200):
        wd.execute_script(f"window.scrollTo(0, {y})")
        # wait a little bit between scroll steps
        time.sleep(.4)
    height = latest_height
    latest_height = wd.execute_script('return document.body.scrollHeight')
# ------------------------------------------------------------
# hand the fully scrolled page's HTML to BeautifulSoup
soup = BeautifulSoup(wd.page_source, features="lxml")
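# quick sanity check on the parsed page: print the saved article titles
# (that titles sit in <h2> tags is an assumption about Medium's markup)
for h2 in soup.find_all("h2"):
    print(h2.get_text(strip=True))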
# The absolute XPath below is brittle: it assumes Medium renders each list
# item's remove-checkbox at odd child positions, so the index steps through
# 1, 3, 5, ... to hit one checkbox per saved article
xtemplateleft = """/html/body/div/div/div[3]/div/div/main/div/div[3]/div[1]/div[2]/div/div/div["""
xtemplateright = """]/label/div[1]"""
counter = -1
elements = wd.find_elements(By.TAG_NAME, value="input")
for e, elem in enumerate(elements):
    counter = counter + 2
    path = xtemplateleft + str(counter) + xtemplateright
    inputbox = wd.find_element(By.XPATH, value=path)
    print(path)
    time.sleep(1)
    inputbox.click()
    time.sleep(1)
wd.quit() # shut down the browser window and the chromedriver process