Created
September 29, 2011 09:39
-
-
Save tecoholic/1250405 to your computer and use it in GitHub Desktop.
An automated downloader and HTML-to-CSV converter for AWS data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import sys | |
import urllib2 | |
import time | |
from datetime import datetime | |
from BeautifulSoup import BeautifulSoup | |
# The following parameters can be changed to suit the requirements of the user
# url = "http://www.imd.gov.in/section/nhac/aws/aws00.htm" | |
# Pause between two download rounds, given as hours / minutes / seconds.
hr, mi, sec = 1, 0, 0
# The same pause expressed in seconds, as passed to time.sleep() by the
# polling loop at the bottom of the script.
interval = sec + (60 * mi) + (3600 * hr)
def createFolder(folder):
    """Make sure the directory *folder* exists and return its path.

    Missing intermediate directories are created as well.  The argument is
    returned unchanged so the call can be used inline.
    """
    if os.path.isdir(folder):
        return folder
    os.makedirs(folder)
    return folder
def constructFilename(url):
    """Return the local path at which the page behind *url* is stored.

    As a side effect this creates a folder named after the current local
    date (``YYYY_MM_DD``) and a ``csv`` sub-folder inside it.  The returned
    path is that dated folder joined with the last ``/``-separated component
    of *url*.
    """
    day_dir = createFolder(datetime.now().strftime('%Y_%m_%d'))
    # The csv sub-folder is only created here; its path is not returned.
    createFolder(os.path.join(day_dir, 'csv'))
    page_name = url.rsplit("/", 1)[-1]
    return os.path.join(day_dir, page_name)
def constructUrl():
    """Build the URL of the hourly AWS page to fetch at this moment.

    The two-digit page number is the current local hour minus nine, wrapped
    into the range 00-23.
    """
    # NOTE(review): the code does not say why the offset is nine hours;
    # presumably a timezone / publication-lag correction -- confirm.
    page_hour = (datetime.now().hour - 9) % 24
    prefix = "http://www.imd.gov.in/section/nhac/aws/aws"
    return prefix + ("%02d" % page_hour) + ".htm"
def html2csv(filename):
    """Convert the table rows of a saved HTML page into a CSV file.

    The first four lines of *filename* are skipped, the remainder is parsed
    with BeautifulSoup, and every ``<tr>`` becomes one output line holding
    the comma-joined text of its ``<td>`` cells.

    The CSV is written to ``<dir>/csv/<name>.csv``, where ``<dir>`` is the
    directory of *filename* and ``<name>`` its base name without extension.
    The ``csv`` directory is created if it does not exist yet.

    filename -- path of the downloaded HTML page.
    """
    # Strings are immutable: the original bare
    # ``filename.replace('/', '/csv/')`` threw its result away, so the CSV
    # never reached the csv folder.  Build the target path explicitly.
    folder, page = os.path.split(filename)
    csv_dir = os.path.join(folder, 'csv')
    if not os.path.isdir(csv_dir):
        os.makedirs(csv_dir)
    # splitext swaps only the extension; ``replace('htm', 'csv')`` also
    # rewrote any other 'htm' in the path and turned '.html' into '.csvl'.
    csv_path = os.path.join(csv_dir, os.path.splitext(page)[0] + '.csv')

    # ``with`` closes both files even if parsing or writing raises.
    with open(filename, 'r') as page_file:
        print(filename)
        # Skip the first four lines of the page before parsing.
        # NOTE(review): presumably a header the parser should not see --
        # confirm against a real download.
        for _ in range(4):
            page_file.readline()
        soup = BeautifulSoup(page_file.read())

    with open(csv_path, 'w') as out:
        for row in soup.findAll('tr'):
            line = ','.join([cell.text for cell in row.findAll('td')])
            # NOTE(review): as transcribed this replaces a space with a
            # space; the first literal was presumably '&nbsp;' or a
            # non-breaking space lost in transcription -- confirm against
            # the original file before changing it.
            out.write(line.replace(' ', ' '))
            out.write('\n')
def main():
    """Download the current hourly AWS page and convert it to CSV.

    Builds the URL with constructUrl(), streams the response into the path
    given by constructFilename() while printing progress each time it has
    advanced by more than five percentage points, reports whether the number
    of bytes read matches the Content-Length header, and finally runs
    html2csv() on the saved file.

    If the response carries no Content-Length header, "FileSize Error" is
    printed and nothing is downloaded or converted.
    """
    url = constructUrl()
    print("Opening URL: %s" % (url))
    print("Date & Time: " + str(datetime.now()))
    f = urllib2.urlopen(url)
    try:
        # Inspect the header *before* converting it: float(None) raises
        # TypeError, which made the original "FileSize Error" branch
        # unreachable and crashed the script instead.
        contentLength = f.info().getheader("Content-Length")
        if contentLength is None:
            print("FileSize Error")
            return
        filesize = float(contentLength)
        filename = constructFilename(url)
        bytesRead = 0.0
        oldpercent = 0
        with open(filename, "wb") as local:
            for line in f:
                bytesRead += len(line)
                newpercent = 100 * bytesRead / filesize
                # Report only when progress moved by more than 5 points.
                if newpercent - oldpercent > 5:
                    print("%s: %.02f/%.02f kb (%d%%)" % (
                        filename,
                        bytesRead / 1024.0,
                        filesize / 1024.0,
                        newpercent
                    ))
                    oldpercent = newpercent
                local.write(line)
    finally:
        # Close the HTTP response on every path, including errors.
        f.close()
    if bytesRead == filesize:
        print("File download Done!")
    else:
        print("Error: File download interupted")
    print('Converting HTML to CSV....')
    html2csv(filename)
if __name__ == '__main__':
    # Poll forever: fetch and convert one page, then pause for `interval`
    # seconds before the next round.
    while True:
        main()
        time.sleep(interval)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import sys | |
import urllib2 | |
def createFolder(folder):
    """Create the directory *folder* when it is missing; return *folder*.

    Intermediate directories are created too.  An existing directory is
    left untouched.
    """
    already_there = os.path.isdir(folder)
    if not already_there:
        os.makedirs(folder)
    return folder
def constructFilename(yr, mon):
    """Return the local path ``<yr>/<MM>.txt`` for one monthly file.

    The ``<yr>`` folder is created through createFolder() if it does not
    exist; the month is zero-padded to two digits.
    """
    year_dir = createFolder(str(yr))
    month_file = "%02d.txt" % (mon)
    return os.path.join(year_dir, month_file)
def downloadFile(yr, mon):
    """Download the monthly text file for year *yr* and month *mon*.

    The file is fetched from ``http://www.kea.metsite.com/YYYY_MM.txt`` and
    stored at the path returned by constructFilename(yr, mon).  Progress is
    printed each time it has advanced by more than ten percentage points,
    and a final line reports whether the number of bytes read matches the
    Content-Length header.

    If the response carries no Content-Length header, "FileSize Error" is
    printed and nothing is written.

    yr  -- four-digit year (int).
    mon -- month number (int).
    """
    baseUrl = "http://www.kea.metsite.com/"
    url = baseUrl + "%04d_%02d" % (yr, mon) + ".txt"
    f = urllib2.urlopen(url)
    try:
        print("Opening " + url)
        # Inspect the header *before* converting it: float(None) raises
        # TypeError, which made the original "FileSize Error" branch
        # unreachable and crashed the script instead.
        contentLength = f.info().getheader("Content-Length")
        if contentLength is None:
            print("FileSize Error")
            return
        filesize = float(contentLength)
        filename = constructFilename(yr, mon)
        bytesRead = 0.0
        oldpercent = 0
        with open(filename, "wb") as local:
            for line in f:
                bytesRead += len(line)
                newpercent = 100 * bytesRead / filesize
                # Report only when progress moved by more than 10 points.
                if newpercent - oldpercent > 10:
                    print("%s: %.02f/%.02f kb (%d%%)" % (
                        filename,
                        bytesRead / 1024.0,
                        filesize / 1024.0,
                        newpercent
                    ))
                    oldpercent = newpercent
                local.write(line)
    finally:
        # Close the HTTP response on every path, including errors.
        f.close()
    if bytesRead == filesize:
        print(filename + " download Done!")
    else:
        print("Error: File download interupted")
def main():
    """Download every monthly file from September 2004 through January 2014.

    The first year starts at month 9 and the last year stops after month 1;
    all years in between cover months 1-12.  Each (year, month) pair is
    handed to downloadFile().
    """
    startYr = 2004
    endYr = 2014
    # range() excludes its stop value.  The original range(startYr, endYr)
    # never reached endYr, so the ``yr == endYr`` case was dead code and
    # January of the final year was never downloaded.
    for yr in range(startYr, endYr + 1):
        strMonth = 9 if yr == startYr else 1
        # Month ranges are half-open as well: 2 means "January only".
        endMonth = 2 if yr == endYr else 13
        for month in range(strMonth, endMonth):
            downloadFile(yr, month)
# Start the bulk download only when run as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment