Skip to content

Instantly share code, notes, and snippets.

@yarikc
Created April 11, 2016 11:58
Show Gist options
  • Save yarikc/884aa1755b357175571d06a3122d41c0 to your computer and use it in GitHub Desktop.
Save yarikc/884aa1755b357175571d06a3122d41c0 to your computer and use it in GitHub Desktop.
from bs4 import BeautifulSoup
import requests
import urllib
import gzip
import os
import time
import re
# URL template for one yearly NOAA data directory; get_files() fills the
# `{}` placeholder with a year.
url_base = 'http://www1.ncdc.noaa.gov/pub/data/noaa/{}'
# get_files() keeps only links whose href ends with this suffix.
ext = 'gz'
# NOTE(review): `testfile` is not referenced anywhere else in this file --
# presumably a leftover for downloading the archives; confirm before removing.
# `urllib.URLopener` is the Python 2 spelling of this class.
testfile = urllib.URLopener()
def get_files(start_year=1901, n_years=116):
    """Print the local target path of every archive listed in the yearly indexes.

    For each year in ``start_year .. start_year + n_years - 1`` the directory
    page at ``url_base.format(year)`` is fetched and parsed, and for every
    ``<a>`` whose ``href`` ends with ``ext`` the line ``./files/<name>`` is
    printed, where ``<name>`` is the last ``/``-separated segment of the link.
    Nothing is downloaded or written to disk.

    Args:
        start_year: First year whose index page is requested.
        n_years: Number of consecutive years to walk through.
    """
    for year in range(start_year, start_year + n_years):
        url = url_base.format(year)
        page = requests.get(url).text
        soup = BeautifulSoup(page, 'html.parser')
        for node in soup.find_all('a'):
            href = node.get('href')
            # Anchors without an href yield None; skip them instead of
            # crashing on None.endswith().
            if not href or not href.endswith(ext):
                continue
            # Only the file name is needed: the last path segment of
            # `url + '/' + href` is always the last segment of `href`.
            print('./files/{}'.format(href.split('/')[-1]))
def calculate(files_dir='./files/'):
    """Print the highest accepted temperature reading for each year.

    Every entry of ``files_dir`` is opened as a gzip file and split into
    lines. Three fixed-width fields are sliced from each line: the year
    (offsets 15-18), the temperature (offsets 87-91) and a one-character
    quality code (offset 92). A reading is accepted when the temperature
    field is not ``+9999`` (presumably the "missing" sentinel -- confirm
    against the data format spec) and the quality code is one of
    ``0 1 4 5 9``. Lines too short to contain the fields are counted but
    otherwise ignored.

    Written to stdout, in order:
      1. elapsed wall-clock seconds for the scan,
      2. the number of distinct years found,
      3. one ``<year>\\t<max>`` line per year, sorted by year,
      4. ``<number of directory entries> <number of lines read>``.

    Args:
        files_dir: Directory holding the gzip archives to scan.

    Raises:
        ValueError: If an accepted temperature field is not an integer.
    """
    # Byte strings so the comparisons behave the same on Python 2 and 3
    # (gzip in 'rb' mode yields bytes on Python 3).
    missing_temp = b"+9999"
    accepted_quality = {b"0", b"1", b"4", b"5", b"9"}

    max_by_year = {}
    count = 0
    start = time.time()

    names = os.listdir(files_dir)
    for name in names:
        with gzip.open(os.path.join(files_dir, name), 'rb') as f:
            content = f.read()
        for line in content.splitlines():
            count += 1
            temp = line[87:92]
            # Slicing (not indexing) keeps the quality code a length-0/1
            # byte string, so short lines simply fail the membership test.
            if temp == missing_temp or line[92:93] not in accepted_quality:
                continue
            # Decode the key so it prints as plain text rather than b'...'.
            year = line[15:19].decode('latin-1')
            value = int(temp)
            if year not in max_by_year or max_by_year[year] < value:
                max_by_year[year] = value

    end = time.time()
    print(end - start)
    print(len(max_by_year))
    for year, value in sorted(max_by_year.items()):
        print("%s\t%s" % (year, value))
    print("%s %s" % (len(names), count))
# Script entry point: running the file directly scans the local archives and
# prints the per-year maxima. get_files() is not invoked from here.
if __name__ == '__main__':
    calculate()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment