Skip to content

Instantly share code, notes, and snippets.

@tecoholic
Created September 29, 2011 09:39
Show Gist options
  • Save tecoholic/1250405 to your computer and use it in GitHub Desktop.
Save tecoholic/1250405 to your computer and use it in GitHub Desktop.
An automated downloader and html2csv converter for AWS data
import os
import sys
import urllib2
import time
from datetime import datetime
from BeautifulSoup import BeautifulSoup
# The following parameters can be changed to suit the requirements of the user
# url = "http://www.imd.gov.in/section/nhac/aws/aws00.htm"
# Pause between two download cycles, given as hours / minutes / seconds and
# folded into a single number of seconds in `interval`.
hr = 1
mi = 0
sec = 0
interval = sum((hr * 3600, mi * 60, sec))
def createFolder(folder):
    """Make sure the directory *folder* exists and hand its path back.

    The directory (and any missing parents) is created with os.makedirs
    when it is not already present.
    """
    if os.path.isdir(folder):
        return folder
    os.makedirs(folder)
    return folder
def constructFilename(url):
    """Return the local path, inside today's folder, for the page at *url*.

    A folder named after the current date (YYYY_MM_DD) and a 'csv'
    sub-folder inside it are created when missing. The returned path is
    the dated folder joined with the last '/'-separated part of *url*.
    """
    dayFolder = createFolder(datetime.now().strftime('%Y_%m_%d'))
    # Called only for its side effect: the 'csv' sub-folder gets created.
    createFolder(os.path.join(dayFolder, 'csv'))
    pageName = url.rsplit("/", 1)[-1]
    return os.path.join(dayFolder, pageName)
def constructUrl():
    """Build the page URL for the hour lying nine hours before the current one.

    The current local hour minus 9 is wrapped into the range 0-23,
    zero-padded to two digits and placed between the fixed base address
    and the ".htm" suffix.
    """
    # NOTE(review): the reason for the 9-hour offset is not stated in this file.
    hour = (datetime.now().hour - 9) % 24
    return "http://www.imd.gov.in/section/nhac/aws/aws" + ("%02d" % hour) + ".htm"
def html2csv(filename):
    """Convert the table rows of the HTML file *filename* into a CSV file.

    The CSV is written to a 'csv' sub-folder next to the HTML file, under
    the same base name with a '.csv' extension. Every <tr> becomes one
    line; the text of its <td> cells is joined with commas and '&nbsp;'
    entities are replaced by spaces. Cell text is not quoted or escaped.

    Parameters:
        filename -- path of the downloaded HTML file.
    """
    # The original computed filename.replace('/', '/csv/') and threw the
    # result away, so the CSV never reached the 'csv' folder; it also
    # replaced every 'htm' in the whole path instead of only the extension.
    srcDir, srcName = os.path.split(filename)
    csvDir = os.path.join(srcDir, 'csv')
    if not os.path.isdir(csvDir):
        os.makedirs(csvDir)
    csvPath = os.path.join(csvDir, os.path.splitext(srcName)[0] + '.csv')

    print(filename)
    with open(filename, 'r') as htmlFile:
        # The first four lines are skipped before parsing.
        # NOTE(review): presumably a preamble the parser should not see - confirm.
        for _ in range(4):
            htmlFile.readline()
        soup = BeautifulSoup(htmlFile.read())

    # 'with' closes the output even if writing a row fails.
    with open(csvPath, 'w') as csvFile:
        for row in soup.findAll('tr'):
            cells = row.findAll('td')
            line = ','.join([cell.text for cell in cells])
            csvFile.write(line.replace('&nbsp;', ' '))
            csvFile.write('\n')
def main():
    """Fetch the current AWS page, store it on disk and convert it to CSV.

    The URL comes from constructUrl() and the local path from
    constructFilename(). While the response is copied to disk, progress is
    printed each time it has advanced by more than 5 percent. When the
    response carries no Content-Length header, "FileSize Error" is printed
    and nothing is saved or converted.
    """
    url = constructUrl()
    print("Opening URL: %s" % (url))
    print("Date & Time: " + datetime.now().__str__())
    f = urllib2.urlopen(url)
    try:
        # getheader() returns None for a missing header. Test for that before
        # calling float(), which would raise TypeError on None and make the
        # "FileSize Error" branch unreachable.
        lengthHeader = f.info().getheader("Content-Length")
        if lengthHeader is None:
            print("FileSize Error")
            return
        filesize = float(lengthHeader)
        filename = constructFilename(url)
        bytesRead = 0.0
        oldpercent = 0
        local = open(filename, "wb")
        try:
            for line in f:
                bytesRead += len(line)
                newpercent = 100*bytesRead/filesize
                if newpercent-oldpercent > 5:
                    print("%s: %.02f/%.02f kb (%d%%)" % (
                        filename,
                        bytesRead/1024.0,
                        filesize/1024.0,
                        newpercent
                    ))
                    oldpercent = newpercent
                local.write(line)
        finally:
            # Close the local file even when the transfer raises.
            local.close()
    finally:
        f.close()
    if bytesRead == filesize:
        print("File download Done!")
    else:
        print("Error: File download interupted")
    # Conversion runs whether or not the byte count matched, as before.
    print('Converting HTML to CSV....')
    html2csv(filename)
if __name__ == '__main__':
    # Repeat forever: one download/convert cycle, then wait `interval` seconds.
    while True:
        main()
        time.sleep(interval)
import os
import sys
import urllib2
def createFolder(folder):
    """Return *folder*, creating it (and missing parents) if it is not a directory yet."""
    if os.path.isdir(folder):
        return folder
    os.makedirs(folder)
    return folder
def constructFilename(yr,mon):
    """Return the local path for one month's file, creating the year folder.

    The folder is named str(yr); the file name is the month zero-padded to
    two digits followed by ".txt". Both are combined with os.path.join.
    """
    yearDir = createFolder(str(yr))
    monthFile = "%02d.txt" % (mon)
    return os.path.join(yearDir, monthFile)
def downloadFile( yr, mon ):
    """Download the text file for year *yr* and month *mon* and save it locally.

    The remote name is "<YYYY>_<MM>.txt" under the fixed base URL; the local
    path comes from constructFilename(yr, mon). Progress is printed each
    time the transfer has advanced by more than 10 percent. When the
    response carries no Content-Length header, "FileSize Error" is printed
    and nothing is saved.

    Parameters:
        yr  -- year, formatted as four digits in the URL.
        mon -- month number, formatted as two digits.
    """
    baseUrl = "http://www.kea.metsite.com/"
    url = baseUrl + "%04d_%02d" % (yr, mon) + ".txt"
    f = urllib2.urlopen(url)
    try:
        print("Opening " + url)
        # getheader() returns None for a missing header. Test for that before
        # calling float(), which would raise TypeError on None and make the
        # "FileSize Error" branch unreachable.
        lengthHeader = f.info().getheader("Content-Length")
        if lengthHeader is None:
            print("FileSize Error")
            return
        filesize = float(lengthHeader)
        filename = constructFilename(yr, mon)
        bytesRead = 0.0
        oldpercent = 0
        local = open(filename, "wb")
        try:
            for line in f:
                bytesRead += len(line)
                newpercent = 100*bytesRead/filesize
                if newpercent-oldpercent > 10:
                    print("%s: %.02f/%.02f kb (%d%%)" % (
                        filename,
                        bytesRead/1024.0,
                        filesize/1024.0,
                        newpercent
                    ))
                    oldpercent = newpercent
                local.write(line)
        finally:
            # Close the local file even when the transfer raises.
            local.close()
    finally:
        f.close()
    if bytesRead == filesize:
        print(filename + " download Done!")
    else:
        print("Error: File download interupted")
def main():
    """Download every monthly file from September 2004 through January 2014.

    The first year starts at month 9, the last year stops after month 1,
    and every year in between covers months 1-12. Each (year, month) pair
    is passed to downloadFile().
    """
    startYr = 2004
    endYr = 2014
    # range() excludes its upper bound. With range(startYr, endYr) the
    # `yr == endYr` case could never occur and the final year was skipped.
    for yr in range(startYr, endYr + 1):
        strMonth = 9 if yr == startYr else 1
        endMonth = 2 if yr == endYr else 13
        for month in range(strMonth, endMonth):
            downloadFile(yr, month)
if __name__ == "__main__":
    # Start the bulk download only when this file is run as a script.
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment