Scraping cheat sheet

BeautifulSoup weather app

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Download the forecast page and parse it
page = requests.get("http://forecast.weather.gov/MapClick.php?lat=37.7772&lon=-122.4168")
soup = BeautifulSoup(page.content, 'html.parser')

# The seven-day forecast lives in a container with id "seven-day-forecast"
seven_day = soup.find(id="seven-day-forecast")
forecast_items = seven_day.find_all(class_="tombstone-container")
tonight = forecast_items[0]
print(tonight.prettify())
period = tonight.find(class_="period-name").get_text()
short_desc = tonight.find(class_="short-desc").get_text()
temp = tonight.find(class_="temp").get_text()
print(period)
print(short_desc)
print(temp)
img = tonight.find("img")
desc = img['title']
print(desc)
# CSS selectors grab all matching items at once
period_tags = seven_day.select(".tombstone-container .period-name")
periods = [pt.get_text() for pt in period_tags]
print(periods)
short_descs = [sd.get_text() for sd in seven_day.select(".tombstone-container .short-desc")]
temps = [t.get_text() for t in seven_day.select(".tombstone-container .temp")]
descs = [d["title"] for d in seven_day.select(".tombstone-container img")]
print(short_descs)
print(temps)
print(descs)
weather = pd.DataFrame({
    "period": periods,
    "short_desc": short_descs,
    "temp": temps,
    "desc":descs
})
print(weather)
# Pull the numeric part out of each temperature string (e.g. "Low: 49 °F" -> 49)
temp_nums = weather["temp"].str.extract(r"(?P<temp_num>\d+)", expand=False)
weather["temp_num"] = temp_nums.astype('int')
print(temp_nums)
print(weather["temp_num"].mean())
# Rows whose temp string contains "Low" are night-time forecasts
is_night = weather["temp"].str.contains("Low")
weather["is_night"] = is_night
print(is_night)
print(weather[is_night])
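
The assembled frame can then be written out for later analysis; a minimal sketch (the filename is an arbitrary choice):

# Persist the scraped forecast to disk
weather.to_csv("weather.csv", index=False)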

BeautifulSoup again

import requests
import csv
from bs4 import BeautifulSoup


# newline='' prevents blank rows appearing in the CSV on Windows
csv_file = open('z-artist-names.csv', 'w', newline='')
f = csv.writer(csv_file)
f.writerow(['Name', 'Link'])

pages = []

for i in range(1, 5):
    url = 'https://web.archive.org/web/20121007172955/https://www.nga.gov/collection/anZ' + str(i) + '.htm'
    pages.append(url)


for item in pages:
    page = requests.get(item)
    soup = BeautifulSoup(page.text, 'html.parser')

    last_links = soup.find(class_='AlphaNav')
    last_links.decompose()

    artist_name_list = soup.find(class_='BodyText')
    artist_name_list_items = artist_name_list.find_all('a')

    for artist_name in artist_name_list_items:
        names = artist_name.contents[0]
        links = 'https://web.archive.org' + artist_name.get('href')

        f.writerow([names, links])

csv_file.close()

BeautifulSoup with regex

import re

# Assuming `soup` is an existing BeautifulSoup object:
# match every tag whose name starts with "b" (b, body, blockquote, ...)
for tag in soup.find_all(re.compile("^b")):
    print(tag.name)
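
A self-contained version of the same idea, run against an inline document (the HTML string here is made up for illustration):

import re
from bs4 import BeautifulSoup

html = "<html><body><b>bold</b><blockquote>quoted</blockquote><p>plain</p></body></html>"
soup = BeautifulSoup(html, 'html.parser')

# Prints body, b, blockquote (in document order)
for tag in soup.find_all(re.compile("^b")):
    print(tag.name)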

Search

from selenium import webdriver
from selenium.webdriver.common.keys import Keys

driver = webdriver.Chrome('./chromedriver')
driver.get("https://www.python.org")
print(driver.title)
search_bar = driver.find_element_by_name("q")
search_bar.clear()
search_bar.send_keys("getting started with python")
search_bar.send_keys(Keys.RETURN)
print(driver.current_url)
driver.close()
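
Note that Selenium 4 removed the find_element_by_* helpers; a sketch of the same search with By locators, assuming Selenium 4.6+ (which resolves a matching driver automatically):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

driver = webdriver.Chrome()
driver.get("https://www.python.org")
search_bar = driver.find_element(By.NAME, "q")
search_bar.clear()
search_bar.send_keys("getting started with python")
search_bar.send_keys(Keys.RETURN)
print(driver.current_url)
driver.quit()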

Get attribute

# Assuming `browser` is an existing webdriver instance
username = browser.find_element_by_id('user_full_name')
if "error" in username.get_attribute('outerHTML'):
    pass  # handle the validation error here

Get text

>>> results = browser.find_elements_by_class_name('result')
>>> print(results[0].text)
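
To collect the text of every match rather than just the first, a list comprehension works:

>>> texts = [result.text for result in results]
>>> print(texts)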

Log in to Facebook

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
user_name = "YOUR EMAILID"
password = "YOUR PASSWORD"
driver = webdriver.Firefox()
driver.get("https://www.facebook.com")
element = driver.find_element_by_id("email")
element.send_keys(user_name)
element = driver.find_element_by_id("pass")
element.send_keys(password)
element.send_keys(Keys.RETURN)
driver.close()

Log in to Instagram

from time import sleep

from selenium.common.exceptions import NoSuchElementException

import Constants  # local module holding INST_USER and INST_PASS

def login(webdriver):
    #Open the instagram login page
    webdriver.get('https://www.instagram.com/accounts/login/?source=auth_switcher')
    #sleep for 3 seconds to prevent issues with the server
    sleep(3)
    #Find username and password fields and set their input using our constants
    username = webdriver.find_element_by_name('username')
    username.send_keys(Constants.INST_USER)
    password = webdriver.find_element_by_name('password')
    password.send_keys(Constants.INST_PASS)
    #Get the login button
    try:
        button_login = webdriver.find_element_by_xpath(
            '//*[@id="react-root"]/section/main/div/article/div/div[1]/div/form/div[4]/button')
    except NoSuchElementException:
        button_login = webdriver.find_element_by_xpath(
            '//*[@id="react-root"]/section/main/div/article/div/div[1]/div/form/div[6]/button/div')
    #sleep again
    sleep(2)
    #click login
    button_login.click()
    sleep(3)
    #In case you get a popup after logging in, press not now.
    #If not, then just return
    try:
        notnow = webdriver.find_element_by_css_selector(
            'body > div.RnEpo.Yx5HN > div > div > div.mt3GC > button.aOOlW.HoLwm')
        notnow.click()
    except NoSuchElementException:
        return
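
One way to drive the function above, assuming chromedriver sits in the working directory as in the other examples:

from selenium import webdriver

driver = webdriver.Chrome('./chromedriver')
login(driver)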

Scraping a SPA

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

options = Options()
options.headless = True
options.add_argument("--window-size=1920,1200")

driver = webdriver.Chrome(options=options, executable_path=r'/usr/local/bin/chromedriver')
driver.get("https://www.nintendo.com/")

delay = 10  # seconds to wait for the JS-rendered element

try:
    elem = WebDriverWait(driver, delay).until(EC.presence_of_element_located((By.NAME, 'chart')))
    print("Page is ready!")

except TimeoutException:
    print("Timeout")

driver.save_screenshot('screenshot.png')
driver.quit()
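
To parse the rendered DOM rather than just screenshot it, driver.page_source can be handed to BeautifulSoup before driver.quit() is called; a minimal sketch:

from bs4 import BeautifulSoup

# page_source holds the JavaScript-rendered markup, not the raw HTTP response
soup = BeautifulSoup(driver.page_source, 'html.parser')
print(soup.title.get_text())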

To pandas

from selenium.webdriver import Chrome
import pandas as pd

webdriver = "path_to_installed_driver_location"

driver = Chrome(webdriver)
pages = 10

for page in range(1,pages):

    url = "http://quotes.toscrape.com/js/page/" + str(page) + "/"

    driver.get(url)

    items = len(driver.find_elements_by_class_name("quote"))

    total = []
    for item in range(items):
        quotes = driver.find_elements_by_class_name("quote")
        for quote in quotes:
            quote_text = quote.find_element_by_class_name('text').text
            author = quote.find_element_by_class_name('author').text
            new = ((quote_text,author))
            total.append(new)
    df = pd.DataFrame(total,columns=['quote','author'])
    df.to_csv('quoted.csv')
driver.close()

Slow typing

import time

def slow_typing(element, text):
    for character in text:
        element.send_keys(character)
        time.sleep(0.3)
      
# Fill the user's email ID (assumes `browser` and EMAIL_ID from the signup example below)
email = browser.find_element_by_id('user_email_login')
slow_typing(email, EMAIL_ID)
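
A variant with randomized delays tends to look more human than a fixed interval; a sketch using random.uniform (the bounds are arbitrary):

import random
import time

def slow_typing_random(element, text, low=0.05, high=0.3):
    # Pause a random amount between keystrokes
    for character in text:
        element.send_keys(character)
        time.sleep(random.uniform(low, high))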

Twitter bot

from selenium import webdriver
from selenium.webdriver.common.keys import Keys



class TwitterBot:
    def __init__(self, email, password):
        self.email = email
        self.password = password
        self.bot = webdriver.Chrome('./chromedriver')

    def login(self):
        bot = self.bot
        bot.get('https://twitter.com/login')
        bot.implicitly_wait(10)  # seconds
        email = bot.find_element_by_class_name('js-username-field')
        password = bot.find_element_by_class_name('js-password-field')
        email.clear()
        password.clear()
        email.send_keys(self.email)
        password.send_keys(self.password)
        password.send_keys(Keys.RETURN)
        bot.implicitly_wait(10)  # seconds

    def like_tweet(self, hashtag):
        bot = self.bot
        bot.get('https://twitter.com/search?q=' + hashtag + '&src=typd')
        bot.implicitly_wait(10)  # seconds
        for i in range(1, 3):
            bot.execute_script(
                'window.scrollTo(0, document.body.scrollHeight)')
            bot.implicitly_wait(2)  # seconds
            tweets = bot.find_elements_by_class_name('tweet')
            links = [
                elem.get_attribute('data-permalink-path') for elem in tweets
            ]
            for link in links:
                if link:
                    bot.get('https://twitter.com' + link)
                    try:
                        bot.find_element_by_class_name(
                            'HeartAnimation').click()
                        bot.implicitly_wait(10)  # seconds
                    except Exception:
                        # Like button missing (already liked or layout changed); skip it
                        bot.implicitly_wait(10)  # seconds


if __name__ == "__main__":

    ed = TwitterBot('xxxxxxxxxxx@xxxxxxxx.xxxx', 'xxxxxxxxxxxxxx')
    ed.login()
    ed.like_tweet('webdevelopment')

Signup

from selenium.webdriver import Chrome, ChromeOptions
import time

EMAIL_ID = "<your email ID>"

def slow_typing(element, text):
    for character in text:
        element.send_keys(character)
        time.sleep(0.3)

# Visit chrome://version/ and copy profile path in place of '<chrome user profile>'
# Note: add_argument returns None, so don't chain it off the constructor
options = ChromeOptions()
options.add_argument("--user-data-dir=<chrome user profile>")

browser = Chrome(options=options)
browser.get('https://www.browserstack.com')

time.sleep(2)

# to accept cookie notification so that it doesn't interfere
cookie_cta = browser.find_element_by_id('accept-cookie-notification')
cookie_cta.click()

# Navigate to Signup Page
button = browser.find_element_by_id('signupModalButton')
button.click()

time.sleep(2)

# Fill user's full name
username = browser.find_element_by_id('user_full_name')
# username.send_keys('John Doe')
slow_typing(username, 'John Doe')

time.sleep(1)
# Fill user's email ID
email = browser.find_element_by_id('user_email_login')
slow_typing(email, EMAIL_ID)

time.sleep(2)
# Fill user's password
password = browser.find_element_by_id('user_password')

# Read the password from a text file because
# it's silly to hard-code the password in the script.
with open('password.txt', 'r') as myfile:
    saved_password = myfile.read().replace('\n', '')
slow_typing(password, saved_password)

time.sleep(1)
# click on Terms and Conditions
toc = browser.find_element_by_name('terms_and_conditions')
toc.click()

# click the signup button
signupbutton = browser.find_element_by_id('user_submit')
signupbutton.click()

time.sleep(20)

browser.close()
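
The fixed time.sleep calls above work, but explicit waits proceed as soon as the target element is ready; a sketch for the signup button, reusing the 'user_submit' id from the script:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

signupbutton = WebDriverWait(browser, 10).until(
    EC.element_to_be_clickable((By.ID, 'user_submit'))
)
signupbutton.click()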

Wait

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Assuming `driver` is an existing webdriver instance
try:
    element = WebDriverWait(driver, 5).until(
        EC.presence_of_element_located((By.ID, "id-of-new-element"))
    )
finally:
    driver.quit()
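
presence_of_element_located only checks that the element is in the DOM; the same expected_conditions module also covers visibility and clickability (the IDs here are placeholders):

# Wait until the element is visible, not merely present in the DOM
element = WebDriverWait(driver, 5).until(
    EC.visibility_of_element_located((By.ID, "id-of-new-element"))
)

# Wait until a button is enabled and can be clicked
button = WebDriverWait(driver, 5).until(
    EC.element_to_be_clickable((By.ID, "submit-button"))
)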

Unit test

import unittest
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

class ChromeSearch(unittest.TestCase):

    def setUp(self):
        self.driver = webdriver.Chrome('./chromedriver')

    def test_search_in_python_org(self):
        driver = self.driver
        driver.get("https://www.python.org")
        self.assertIn("Python", driver.title)
        elem = driver.find_element_by_name("q")
        elem.send_keys("getting started with python")
        elem.send_keys(Keys.RETURN)
        assert "https://www.python.org/search/?q=getting+started+with+python&submit=" == driver.current_url

    def tearDown(self):
        self.driver.close()

if __name__ == "__main__":
    unittest.main()