# revox/scrape_ons_release_start.py

## scrape_ons_release_start.py
'''A script to scrape the ONS release schedule'''
import urllib
import bs4

from contextlib import closing

try:  # Python 3
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib import urlopen

# Release-calendar search URL. Only the ``newoffset`` value changes between
# requests; every other query parameter is fixed.
URL_TEMPLATE = (
    "http://www.statistics.gov.uk/hub/release-calendar/index.html"
    "?newquery=*&newoffset={page}"
    "&theme=%22%22&source-agency=%22%22"
    "&uday=0&umonth=0&uyear=0&lday=-29&lmonth=0&lyear=0"
    "&coverage=%22%22&designation=&geographic-breakdown=%22%22&title=%22%22"
    "&pagetype=calendar-entry&sortBy=releaseDate&sortDirection=EITHER"
)


def build_url(page):
    """Return the release-calendar URL with ``page`` as its ``newoffset`` value."""
    return URL_TEMPLATE.format(page=page)


def parse_page_count(count_text):
    """Extract the total page count from the pager text.

    The number is taken from between the first ``'of'`` and the next ``'|'``
    (or the end of the text when there is no ``'|'``), e.g.
    ``"Page 1 of 25 | Next"`` gives ``25``.

    Raises:
        ValueError: if ``count_text`` is ``None``, has no ``'of'`` marker, or
            the text after the marker is not an integer.
    """
    if count_text is None:
        raise ValueError("page-count element has no plain text content")

    marker = 'of'
    start = count_text.find(marker)
    if start == -1:
        raise ValueError("no 'of' marker in page-count text: %r" % (count_text,))

    # Look for the separator only after the marker; a missing separator means
    # the number runs to the end of the text rather than losing its last char.
    end = count_text.find('|', start)
    if end == -1:
        end = len(count_text)

    number = count_text[start + len(marker):end].strip()
    try:
        return int(number)
    except ValueError:
        raise ValueError(
            "page-count text does not contain a number: %r" % (count_text,)
        )


def fetch_page_count(page=0):
    """Download one results page and return the total number of pages it reports.

    The count is read from the first ``<span class="count">`` element.

    Raises:
        ValueError: if that element is missing or its text cannot be parsed.
    """
    # closing() works for both the Python 2 and Python 3 response objects and
    # guarantees the connection is released.
    with closing(urlopen(build_url(page))) as response:
        webpage = response.read()

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = bs4.BeautifulSoup(webpage, 'html.parser')
    count_span = soup.find('span', attrs={'class': 'count'})
    if count_span is None:
        raise ValueError("no <span class='count'> element found in the page")
    return parse_page_count(count_span.string)


def main():
    """Print the total page count, then the URL of every results page."""
    total_pages = fetch_page_count()

    # Two spaces after the colon reproduce the output of the original
    # ``print 'total pages: ', n`` statement.
    print('total pages:  %d' % total_pages)

    for page in range(total_pages):
        print(build_url(page))


if __name__ == "__main__":
    main()