Scraping CENTCOM's PRs with Python
@basilesimon, created January 22, 2015
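A quick scraper: it walks CENTCOM's paginated news index, follows each press-release link, extracts the article paragraphs, and dumps them all to press-releases.json. Written for Python 2 (urllib2/urlparse) with BeautifulSoup 4.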
import json

from bs4 import BeautifulSoup
from urllib2 import urlopen   # Python 2; merged into urllib.request in Python 3
from urlparse import urljoin  # Python 2; merged into urllib.parse in Python 3

BASE_URL = "http://www.centcom.mil"
BASE_NEWS_URL = "http://www.centcom.mil/en/news"
NEWS_PAGE_URL = BASE_NEWS_URL + "/P"  # paginated index pages: /P11, /P22, ...
def make_soup(url):
    """Fetch a URL and return it parsed by BeautifulSoup."""
    html = urlopen(url).read()
    # Naming the parser explicitly avoids bs4's "no parser specified" warning
    return BeautifulSoup(html, "html.parser")
def get_links(section_url):
    """Collect the press-release URLs listed on one index page."""
    print('Scraping %s for press release URLs...' % section_url)
    soup = make_soup(section_url)
    table = soup.find("table", "blog")  # second argument matches the CSS class
    tds = table.findAll("td", "contentheading")
    return [urljoin(BASE_URL, td.a["href"]) for td in tds]
def get_content(link):
    """Scrape the body text of a single press release."""
    print('Scraping press release from %s...' % link)
    soup = make_soup(link)
    # The second "contentpaneopen" table holds the article body
    table = soup.findAll("table", "contentpaneopen")[1]
    paras = table.findAll("p")
    content = [p.text for p in paras]
    return '\n\n'.join(content)
if __name__ == '__main__':
    links, releases = [], []

    # Index pages are paginated as /P11, /P22, ... in steps of 11;
    # start at 11 so that /P0 does not duplicate the first page.
    urls = [BASE_NEWS_URL] + [NEWS_PAGE_URL + str(i) for i in range(11, 165, 11)]

    # Scrape each index page for press-release links
    for url in urls:
        links.extend(get_links(url))

    # Scrape the press releases themselves
    for link in links:
        releases.append(get_content(link))

    with open('press-releases.json', 'w') as f:
        json.dump(releases, f, indent=4)

    print('Output written to press-releases.json')