# pvanallen/bs4_nyt_scrape_to_html.py

## bs4_nyt_scrape_to_html.py
from bs4 import BeautifulSoup
import requests

# Scrape the first headline and the first top-news image URL from the
# New York Times front page and substitute them into template.html,
# writing the result to nyt.html.
#
# template.html is consumed with str.format(): positional field {0} receives
# the headline HTML and {1} the image URL.

# Open and read the template file; `with` closes the handle even if read() raises
with open("template.html", "r") as fo:
    html_template = fo.read()

# get the webpage; the timeout keeps a stalled connection from hanging the script
r = requests.get("http://www.nytimes.com", timeout=30)

# stop on an HTTP error status instead of scraping an error page
r.raise_for_status()

# get the HTML source from that page
html_doc = r.text

# turn the source into a bs4 "soup" object
soup = BeautifulSoup(html_doc, 'lxml')

# narrow down to the div on the page that contains our content
section = soup.find("div", class_="a-column")

# find() and tag-name lookups give None when nothing matches, so fail with a
# readable message rather than an AttributeError on None
if section is None or section.h2 is None or section.h2.a is None:
    raise SystemExit('Could not find a linked <h2> inside <div class="a-column">; '
                     'the page layout may have changed.')

# get the first h2, and the link text within that h2
firstHeading = section.h2.a.get_text()

# turn the text back into proper HTML (formatter="html" emits HTML entities)
firstHeading_out = BeautifulSoup(firstHeading, 'lxml').prettify(formatter="html")

# find the section that contains our image
section2 = soup.find("section", class_="top-news")

# same guard as above: the section, its first img, or the src may be missing
if section2 is None or section2.img is None or section2.img.get('src') is None:
    raise SystemExit('Could not find an <img src=...> inside <section class="top-news">; '
                     'the page layout may have changed.')

# get the first img, and the src within that section
image_src = section2.img['src']

html_file = html_template.format(firstHeading_out, image_src)

# write out a file; `with` flushes and closes it even if write() raises
with open("nyt.html", "w") as fo:
    fo.write(html_file)
	from bs4 import BeautifulSoup
	import requests

	# Scrape the first headline and the first top-news image URL from the
	# New York Times front page and substitute them into template.html,
	# writing the result to nyt.html.
	#
	# template.html is consumed with str.format(): positional field {0} receives
	# the headline HTML and {1} the image URL.

	# Open and read the template file; `with` closes the handle even if read() raises
	with open("template.html", "r") as fo:
		html_template = fo.read()

	# get the webpage; the timeout keeps a stalled connection from hanging the script
	r = requests.get("http://www.nytimes.com", timeout=30)

	# stop on an HTTP error status instead of scraping an error page
	r.raise_for_status()

	# get the HTML source from that page
	html_doc = r.text

	# turn the source into a bs4 "soup" object
	soup = BeautifulSoup(html_doc, 'lxml')

	# narrow down to the div on the page that contains our content
	section = soup.find("div", class_="a-column")

	# find() and tag-name lookups give None when nothing matches, so fail with a
	# readable message rather than an AttributeError on None
	if section is None or section.h2 is None or section.h2.a is None:
		raise SystemExit('Could not find a linked <h2> inside <div class="a-column">; '
			'the page layout may have changed.')

	# get the first h2, and the link text within that h2
	firstHeading = section.h2.a.get_text()

	# turn the text back into proper HTML (formatter="html" emits HTML entities)
	firstHeading_out = BeautifulSoup(firstHeading, 'lxml').prettify(formatter="html")

	# find the section that contains our image
	section2 = soup.find("section", class_="top-news")

	# same guard as above: the section, its first img, or the src may be missing
	if section2 is None or section2.img is None or section2.img.get('src') is None:
		raise SystemExit('Could not find an <img src=...> inside <section class="top-news">; '
			'the page layout may have changed.')

	# get the first img, and the src within that section
	image_src = section2.img['src']

	html_file = html_template.format(firstHeading_out, image_src)

	# write out a file; `with` flushes and closes it even if write() raises
	with open("nyt.html", "w") as fo:
		fo.write(html_file)