Last active
March 2, 2018 20:46
-
-
Save pvanallen/d5aef0accb30f8e28b7717d90806d835 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Scrape the New York Times front page and render "nyt.html" from
# "template.html". The template is filled with str.format(), receiving the
# lead headline (as HTML) and the lead image URL as positional arguments,
# in that order.
from bs4 import BeautifulSoup
import requests

# Open and read the template file. The context manager closes the handle
# even if the read raises; UTF-8 is explicit so the result does not depend
# on the platform's default encoding.
with open("template.html", "r", encoding="utf-8") as fo:
    html_template = fo.read()

# get the webpage. Without a timeout requests may wait forever on a stalled
# connection, and without the status check an HTTP error page would be
# parsed as if it were the front page.
r = requests.get("http://www.nytimes.com", timeout=30)
r.raise_for_status()

# get the HTML source from that page
html_doc = r.text

# turn the source into a bs4 "soup" object
soup = BeautifulSoup(html_doc, 'lxml')

# narrow down to the div on the page that contains our content.
# find() and tag-attribute access return None when nothing matches, so
# check each step and fail with a readable message instead of an
# AttributeError on None.
section = soup.find("div", class_="a-column")
if section is None or section.h2 is None or section.h2.a is None:
    raise SystemExit(
        'Could not find a headline link (div.a-column > h2 > a); '
        'the page layout may have changed.'
    )

# get the first h2, and the link text within that h2
firstHeading = section.h2.a.get_text()

# turn the text back into proper HTML
firstHeading_out = BeautifulSoup(firstHeading, 'lxml').prettify(formatter="html")

# find the section that contains our image
section2 = soup.find("section", class_="top-news")
if section2 is None or section2.img is None:
    raise SystemExit(
        'Could not find an image (section.top-news img); '
        'the page layout may have changed.'
    )

# get the first img, and the src within that section. Tag.get() returns
# None rather than raising KeyError when the attribute is absent.
image_src = section2.img.get('src')
if image_src is None:
    raise SystemExit('The first image in section.top-news has no src attribute.')

# fill the template: headline first, image URL second
html_file = html_template.format(firstHeading_out, image_src)

# write out a file; UTF-8 so any non-ASCII characters in the scraped text
# cannot trigger an encoding error on non-UTF-8 locales.
with open("nyt.html", "w", encoding="utf-8") as fo:
    fo.write(html_file)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment