
@basilesimon
Created January 14, 2015 17:40
Scraping CENTCOM's PRs with Python
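
This script walks CENTCOM's press-release index pages, pulls the URL of every release headline out of the listing table, and writes the full list to a JSON file for later scraping.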
from bs4 import BeautifulSoup
from urllib2 import urlopen
import json

base_url = "http://www.centcom.mil"
links_collection = []


def make_soup(url):
    # Fetch a page and parse it with lxml
    html = urlopen(url).read()
    return BeautifulSoup(html, "lxml")


def get_links(section_url):
    soup = make_soup(section_url)

    # Define where the data is in the page: the listing table
    # and the cells holding the headlines
    table = soup.find("table", "blog")
    tds = table.findAll("td", "contentheading")

    # Grab all the links
    for td in tds:
        links_collection.append(base_url + td.a["href"])

    # Output the whole thing to a JSON file
    # (rewritten with the collection so far on every call)
    with open("links_collections.json", "w") as outfile:
        json.dump(links_collection, outfile)


if __name__ == '__main__':
    # Scrape the first news page
    url_to_scrape = "http://www.centcom.mil/en/news"
    get_links(url_to_scrape)

    # Scrape the following pages (11 items per page)
    for i in range(0, 165, 11):
        url_to_scrape = "http://www.centcom.mil/en/news/P" + str(i)
        get_links(url_to_scrape)
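
The gist targets Python 2 (urllib2). A rough Python 3 equivalent might look like the sketch below, assuming the same table.blog markup and P<offset> pagination scheme still hold; it also writes the JSON once at the end rather than rewriting the file on every call:

from bs4 import BeautifulSoup
from urllib.request import urlopen
import json

BASE_URL = "http://www.centcom.mil"


def get_links(section_url):
    # Fetch and parse one index page
    html = urlopen(section_url).read()
    soup = BeautifulSoup(html, "lxml")
    # Same selectors as the original gist; assumes the markup is unchanged
    table = soup.find("table", "blog")
    return [BASE_URL + td.a["href"]
            for td in table.find_all("td", "contentheading")]


if __name__ == "__main__":
    links = get_links("http://www.centcom.mil/en/news")
    for i in range(0, 165, 11):
        links.extend(get_links("http://www.centcom.mil/en/news/P" + str(i)))
    # Write the collected links once, after all pages are scraped
    with open("links_collections.json", "w") as outfile:
        json.dump(links, outfile)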