Scraping CENTCOM's PRs with Python
@basilesimon, created January 22, 2015
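A quick scraper: it walks CENTCOM's paginated news index, follows each press-release link, extracts the article paragraphs, and dumps them all to press-releases.json. Written for Python 2 (urllib2/urlparse) with BeautifulSoup 4.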
import json

from bs4 import BeautifulSoup
from urllib2 import urlopen   # Python 2; merged into urllib.request in Python 3
from urlparse import urljoin  # Python 2; merged into urllib.parse in Python 3

BASE_URL = "http://www.centcom.mil"
BASE_NEWS_URL = "http://www.centcom.mil/en/news"
NEWS_PAGE_URL = BASE_NEWS_URL + "/P"  # paginated index pages: /P11, /P22, ...
def make_soup(url):
    """Fetch a URL and return it parsed by BeautifulSoup."""
    html = urlopen(url).read()
    # Naming the parser explicitly avoids bs4's "no parser specified" warning
    return BeautifulSoup(html, "html.parser")
def get_links(section_url):
    """Collect the press-release URLs listed on one index page."""
    print('Scraping %s for press release URLs...' % section_url)
    soup = make_soup(section_url)
    table = soup.find("table", "blog")  # second argument matches the CSS class
    tds = table.findAll("td", "contentheading")
    return [urljoin(BASE_URL, td.a["href"]) for td in tds]
def get_content(link):
    """Scrape the body text of a single press release."""
    print('Scraping press release from %s...' % link)
    soup = make_soup(link)
    # The second "contentpaneopen" table holds the article body
    table = soup.findAll("table", "contentpaneopen")[1]
    paras = table.findAll("p")
    content = [p.text for p in paras]
    return '\n\n'.join(content)
if __name__ == '__main__':
    links, releases = [], []

    # Index pages are paginated as /P11, /P22, ... in steps of 11;
    # start at 11 so that /P0 does not duplicate the first page.
    urls = [BASE_NEWS_URL] + [NEWS_PAGE_URL + str(i) for i in range(11, 165, 11)]

    # Scrape each index page for press-release links
    for url in urls:
        links.extend(get_links(url))

    # Scrape the press releases themselves
    for link in links:
        releases.append(get_content(link))

    with open('press-releases.json', 'w') as f:
        json.dump(releases, f, indent=4)

    print('Output written to press-releases.json')