# yarikc/Loading data from http

## Loading data from http
from bs4 import BeautifulSoup
import requests
import urllib
import gzip
import os
import time
import re

# URL template for the per-year directory listings; get_files() fills the
# {} placeholder with a year.
url_base = 'http://www1.ncdc.noaa.gov/pub/data/noaa/{}'
# Suffix a link's href must end with for get_files() to list it.
ext = 'gz'

# NOTE(review): testfile is not referenced anywhere else in the visible code,
# and urllib.URLopener looks like the Python 2 location of this class --
# confirm whether this line is still needed.
testfile = urllib.URLopener()

def get_files():
	"""Print the local destination path of every archive listed for each year.

	For each of the 116 years starting at 1901, requests the listing page at
	``url_base.format(year)``, collects the links whose ``href`` ends with
	``ext`` and prints ``./files/<archive name>`` for each of them.  Only the
	listing pages are requested; the archives themselves are neither fetched
	nor saved.
	"""
	for year in range(1901, 1901 + 116):
		url = url_base.format(year)
		page = requests.get(url).text
		soup = BeautifulSoup(page, 'html.parser')

		# An <a> tag without an href attribute yields None from .get(); fall
		# back to '' so such anchors are skipped instead of raising
		# AttributeError on .endswith().
		hrefs = [node.get('href') or '' for node in soup.find_all('a')]
		archive_urls = [url + '/' + href for href in hrefs if href.endswith(ext)]

		for archive_url in archive_urls:
			# The parenthesised single-argument form prints the same text on
			# Python 2 and Python 3.
			print('./files/{}'.format(archive_url.split('/')[-1]))

def calculate():
	"""Print the highest accepted temperature reading per year from ./files/.

	Every entry of ``./files/`` is opened as a gzip archive and read as
	fixed-width records.  From each record the year (characters 15-18,
	0-based), the temperature field (characters 87-91) and a one-character
	quality flag (character 92) are sliced out.  A reading is used only when
	the temperature field is not ``+9999`` and the quality flag is one of
	``0``, ``1``, ``4``, ``5`` or ``9``.

	Printed, in order: the elapsed seconds, the number of distinct years, one
	``year<TAB>maximum`` line per year in ascending key order, and finally the
	number of directory entries followed by the total number of records read.

	Raises:
		ValueError: if an accepted record's temperature field is not an integer.
	"""
	files_dir = './files/'
	# gzip yields bytes on Python 3 and str on Python 2; byte literals compare
	# correctly against the sliced fields on both.
	skipped_temp = b"+9999"
	accepted_flags = frozenset((b"0", b"1", b"4", b"5", b"9"))

	max_by_year = {}
	count = 0
	start = time.time()

	# List the directory once so the file count reported at the end describes
	# exactly the files that were processed.
	names = os.listdir(files_dir)
	for name in names:
		with gzip.open(os.path.join(files_dir, name), 'rb') as f:
			# splitlines() is kept (rather than iterating f) so records are
			# delimited exactly as before, including on bare "\r" endings.
			records = f.read().splitlines()

		for line in records:
			count += 1
			temp, flag = line[87:92], line[92:93]
			# Short records slice to empty fields; an empty flag is not in
			# the accepted set, so they are counted but otherwise ignored.
			if temp == skipped_temp or flag not in accepted_flags:
				continue
			year = line[15:19]
			temp = int(temp)
			if year not in max_by_year or max_by_year[year] < temp:
				max_by_year[year] = temp

	end = time.time()

	print(end - start)
	print(len(max_by_year))
	for year, temp in sorted(max_by_year.items()):
		# On Python 3 the keys are bytes; decode so the year prints as plain
		# text rather than as a b'...' literal.
		if not isinstance(year, str):
			year = year.decode('latin-1')
		print("%s\t%s" % (year, temp))

	print("%s %s" % (len(names), count))


if __name__ == '__main__':
    # Run the analysis only when executed as a script, not on import.
    calculate()