Python BeautifulSoup
import lxml.etree
import requests
# target location
url = 'https://raw.githubusercontent.com/DataFinnovation/public-talks/master/pugs-scraping/example2.html'
# get the page
page = requests.get(url)
# parse it
tree = lxml.etree.fromstring(page.content)
# what elements we care about
theXPath = '//a[text()="link"]/@href'
# grab all of them
resultList = tree.xpath(theXPath)
# now grab one of those links
page2 = requests.get(resultList[0])
# and dump the headers for this next link
print(page2.headers)
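For comparison, the same hrefs can be collected with BeautifulSoup instead of an XPath query. A minimal sketch, reusing the `page` response fetched above and assuming the target anchors carry the literal text "link" as in example2.html:

from bs4 import BeautifulSoup
# parse the same response with BeautifulSoup
bsoup = BeautifulSoup(page.content, 'html.parser')
# collect the href of every anchor whose visible text is exactly "link"
hrefs = [a.get('href') for a in bsoup.find_all('a') if a.get_text(strip=True) == 'link']
print(hrefs)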
from bs4 import BeautifulSoup as soup # HTML data structure
from urllib.request import urlopen as uReq # Web client
# URL to scrape from.
# in this example we scrape graphics cards from Newegg.com
page_url = "http://www.newegg.com/Product/ProductList.aspx?Submit=ENE&N=-1&IsNodeId=1&Description=GTX&bop=And&Page=1&PageSize=36&order=BESTMATCH"
# opens the connection and downloads html page from url
uClient = uReq(page_url)
# parses html into a soup data structure to traverse html
# as if it were a json data type.
page_soup = soup(uClient.read(), "html.parser")
uClient.close()
# finds each product from the store page
containers = page_soup.findAll("div", {"class": "item-container"})
# name the output file to write to local disk
out_filename = "graphics_cards.csv"
# header of csv file to be written
headers = "brand,product_name,shippingn"
# opens file, and writes headers
f = open(out_filename, "w")
f.write(headers)
# loops over each product and grabs attributes about
# each product
for container in containers:
    # Finds all link tags "a" from within the first div.
    make_rating_sp = container.div.select("a")
    # Grabs the title from the image title attribute,
    # then applies proper casing using .title()
    brand = make_rating_sp[0].img["title"].title()
    # Grabs the text within the third "a" tag from within
    # the list of queries.
    product_name = container.div.select("a")[2].text
    # Grabs the product shipping information by searching
    # all list items with the class "price-ship",
    # then cleans the text of whitespace with strip()
    # and removes "$" and " Shipping" to leave just the number
    shipping = container.findAll("li", {"class": "price-ship"})[0].text.strip().replace("$", "").replace(" Shipping", "")
    # prints the dataset to console
    print("brand: " + brand + "\n")
    print("product_name: " + product_name + "\n")
    print("shipping: " + shipping + "\n")
    # writes the dataset to file
    f.write(brand + ", " + product_name.replace(",", "|") + ", " + shipping + "\n")
f.close() # Close the file
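As an alternative to the manual string concatenation above, the csv module handles separators and quoting itself, so the comma-to-pipe replacement in product names is no longer needed. A sketch reusing the `containers` and `out_filename` defined above, with the same per-product extraction:

import csv

with open(out_filename, 'w', newline='') as csv_out:
    writer = csv.writer(csv_out)
    writer.writerow(['brand', 'product_name', 'shipping'])
    for container in containers:
        brand = container.div.select('a')[0].img['title'].title()
        product_name = container.div.select('a')[2].text
        shipping = container.findAll('li', {'class': 'price-ship'})[0].text.strip().replace('$', '').replace(' Shipping', '')
        # csv.writer quotes fields as needed, so commas in product names are safe
        writer.writerow([brand, product_name, shipping])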
from bs4 import BeautifulSoup
import requests
import csv
source = requests.get('http://coreyms.com').text
soup = BeautifulSoup(source, 'lxml')
csv_file = open('cms_scrape.csv', 'w')
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['headline', 'summary', 'video_link'])
for article in soup.find_all('article'):
    headline = article.h2.a.text
    print(headline)

    summary = article.find('div', class_='entry-content').p.text
    print(summary)

    try:
        vid_src = article.find('iframe', class_='youtube-player')['src']
        vid_id = vid_src.split('/')[4]
        vid_id = vid_id.split('?')[0]
        yt_link = f'https://youtube.com/watch?v={vid_id}'
    except Exception:
        yt_link = None

    print(yt_link)
    print()

    csv_writer.writerow([headline, summary, yt_link])
csv_file.close()
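A narrower variant of the try/except above: instead of catching every exception, check explicitly whether the article embeds a YouTube player. The helper name youtube_link is only for illustration and assumes the same coreyms.com article markup:

def youtube_link(article):
    # returns the watch URL if the article embeds a YouTube player, else None
    iframe = article.find('iframe', class_='youtube-player')
    if iframe is None or not iframe.get('src'):
        return None
    vid_id = iframe['src'].split('/')[4].split('?')[0]
    return f'https://youtube.com/watch?v={vid_id}'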
import os
import tempfile
from time import sleep
from selenium import webdriver
from selenium.webdriver.support.ui import Select
# make a temporary download directory
downloadDir = tempfile.mkdtemp()
# and set up chrome to use it
prefs = { 'download.default_directory' : downloadDir }
chromeOptions = webdriver.ChromeOptions()
chromeOptions.add_experimental_option("prefs",prefs)
# kick off a chrome
driver = webdriver.Chrome(options=chromeOptions)
# back to our url
url = 'https://www.demo.com/DownloadBulkData.aspx'
driver.get(url)
# fill out the forms and buttons
productForm = driver.find_element_by_xpath("//select[contains(@name,'ctl00$MainContentHolder$ListBox1')]")
formSelect = Select(productForm)
formSelect.select_by_visible_text('Call Reports -- Single Period')
radioButton = driver.find_element_by_id('XBRLRadiobutton')
radioButton.click()
# find the download button
button = driver.find_element_by_name('ctl00$MainContentHolder$TabStrip1$Download_0')
# get listing of files in the download directory
startFiles = os.listdir(downloadDir)
# kick off the download
button.click()
# loop looking for a finished file
found = False
while not found:
    print('searching...')
    sleep(1)
    # what files do we have now
    newFiles = os.listdir(downloadDir)
    for f in newFiles:
        # any new files ending in .zip?
        if f not in startFiles and f[-4:] == '.zip':
            found = f
            print('found!')
# wait a bit for demo reasons
sleep(15)
driver.quit()
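The polling loop above runs forever if the download never finishes. One possible refinement is to poll with a timeout; wait_for_zip below is a hypothetical helper that takes the download directory and the pre-download file listing used above:

import time

def wait_for_zip(directory, before, timeout=120):
    # poll the download directory until a new .zip appears or the timeout passes
    deadline = time.time() + timeout
    while time.time() < deadline:
        for name in os.listdir(directory):
            if name not in before and name.endswith('.zip'):
                return name
        time.sleep(1)
    return None

# e.g. finished = wait_for_zip(downloadDir, startFiles)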
<!doctype html>
<html class="no-js" lang="">
<head>
  <title>Test - A Sample Website</title>
  <meta charset="utf-8">
  <link rel="stylesheet" href="css/normalize.css">
  <link rel="stylesheet" href="css/main.css">
</head>
<body>
  <h1 id='site_title'>Test Website</h1>
  <hr>
  <div class="article">
    <h2><a href="article_1.html">Article 1 Headline</a></h2>
    <p>This is a summary of article 1</p>
  </div>
  <hr>
  <div class="article">
    <h2><a href="article_2.html">Article 2 Headline</a></h2>
    <p>This is a summary of article 2</p>
  </div>
  <hr>
  <div class='footer'>
    <p>Footer Information</p>
  </div>
  <script src="js/vendor/modernizr-3.5.0.min.js"></script>
  <script src="js/plugins.js"></script>
  <script src="js/main.js"></script>
</body>
</html>
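The sample page above can be scraped locally with BeautifulSoup in the same way as the live sites in the earlier snippets. A minimal sketch, assuming the markup is saved next to the script as simple.html (the filename is an assumption):

from bs4 import BeautifulSoup

with open('simple.html') as html_file:
    local_soup = BeautifulSoup(html_file, 'lxml')

# each article div holds one headline link and one summary paragraph
for article in local_soup.find_all('div', class_='article'):
    headline = article.h2.a.text
    summary = article.p.text
    print(headline, '-', summary)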