Last active
January 31, 2018 03:50
-
-
Save calistatee/04666c45fd94bd5064b5db7b2371811b to your computer and use it in GitHub Desktop.
Python: Web Scraping with BeautifulSoup
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Scrape every <article> element on a web page and save its headline,
# first summary paragraph and YouTube link (when present) as one CSV row.
import csv

import requests
from bs4 import BeautifulSoup

# Placeholders: replace with the page to scrape and the output file name.
URL = 'insert your url here'
CSV_PATH = 'name your csv file'

# Request the page source. The timeout keeps the script from hanging on an
# unresponsive server, and raise_for_status() stops us from parsing (and
# saving) an HTTP error page as if it were real content.
response = requests.get(URL, timeout=30)
response.raise_for_status()
source = response.text

# Parse the HTML with the lxml parser (standard BeautifulSoup usage).
soup = BeautifulSoup(source, 'lxml')

# To inspect the page source with indentation while developing:
# print(soup.prettify())

# newline='' is what the csv module expects for files it writes (otherwise
# blank rows appear on Windows); utf-8 keeps non-ASCII headlines writable
# regardless of the platform default encoding. The `with` block guarantees
# the file is closed even if an article below fails to parse.
with open(CSV_PATH, 'w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['Headlines', 'Summary', 'YouTube link'])

    # find_all (rather than find) grabs every post shown on the page.
    for article in soup.find_all('article'):
        # Headline: attribute access walks to the first matching descendant,
        # so not every parent tag has to be spelled out.
        headline = article.h2.a.text
        print(headline)

        # Summary: just the first paragraph of the article body.
        # `class_` is used because `class` is a reserved word in Python.
        summary = article.find('div', class_='entry-content').p.text
        print(summary)

        # Some posts do not embed a video; those get an empty link instead
        # of crashing the whole run.
        try:
            # The embedded player is an <iframe>; its 'src' attribute is
            # read like a dictionary key and holds the embed URL.
            vid_src = article.find('iframe', class_='youtube-player')['src']

            # Split the URL on '/': the video id is the element at index 4
            # (the fifth piece), followed by an optional '?query' part
            # that is stripped off.
            vid_id = vid_src.split('/')[4]
            vid_id = vid_id.split('?')[0]

            yt_link = f'https://youtube.com/watch?v={vid_id}'
        except (TypeError, KeyError, IndexError):
            # TypeError: no matching iframe (find() returned None);
            # KeyError: iframe without a 'src' attribute;
            # IndexError: URL with fewer path segments than expected.
            # None is written to the CSV as an empty cell.
            yt_link = None

        print(yt_link)
        print()

        csv_writer.writerow([headline, summary, yt_link])

# check your CSV file for results!
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment