Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Scraping CENTCOM's press releases (PRs) with Python
from bs4 import BeautifulSoup
from urllib2 import urlopen
import json
# Site root, used to turn the relative hrefs found in the listing
# tables into absolute article URLs.
base_url = "http://www.centcom.mil"
# Accumulates every scraped article URL across all pages; get_links()
# appends to it and dumps it to links_collections.json.
links_collection = []
def make_soup(url):
    """Download *url* and return its HTML parsed as a BeautifulSoup tree (lxml)."""
    page_source = urlopen(url).read()
    soup = BeautifulSoup(page_source, "lxml")
    return soup
def get_links(section_url):
    """Collect press-release links from one listing page.

    Parses *section_url*, pulls the article links out of the
    "contentheading" cells of the page's "blog" table, appends their
    absolute URLs to the module-level ``links_collection`` list, and
    rewrites the whole accumulated collection to
    ``links_collections.json``.
    """
    soup = make_soup(section_url)
    # Define where the data is in the page
    table = soup.find("table", "blog")
    tds = table.findAll("td", "contentheading")
    # Grab all the links. Guard against cells without an anchor:
    # td.a is None there, and td.a["href"] would raise and abort the
    # whole scrape on one malformed row.
    for td in tds:
        if td.a is not None and td.a.get("href"):
            links_collection.append(base_url + td.a["href"])
    # Output the whole thing in a JSON file (rewritten after every
    # page so a crash mid-run still leaves the links gathered so far).
    with open("links_collections.json", 'w') as outfile:
        json.dump(links_collection, outfile)
if __name__ == '__main__':
    # Scrape the home listing page first. get_links() returns None and
    # works via side effects (the global list + JSON file), so its
    # result is deliberately not bound to anything.
    url_to_scrape = ("http://www.centcom.mil/en/news")
    get_links(url_to_scrape)
    # Scrape the following pages: the site paginates with /P<offset>
    # URLs, 11 articles per page, up to offset 154.
    # NOTE(review): /en/news/P0 may serve the same page as /en/news,
    # which would duplicate the first page's links — confirm against
    # the live site.
    for i in range(0, 165, 11):
        url_to_scrape = ("http://www.centcom.mil/en/news/P" + str(i))
        get_links(url_to_scrape)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment