webscraping
### cpcb.py ===> this is the library; save it as cpcb.py
#!/usr/bin/env python | |
''' | |
Copyright 2017 Mrityunjai Kumar | |
Licensed under the Apache License, Version 2.0 (the "License"); | |
you may not use this file except in compliance with the License. | |
You may obtain a copy of the License at | |
http://www.apache.org/licenses/LICENSE-2.0 | |
Unless required by applicable law or agreed to in writing, software | |
distributed under the License is distributed on an "AS IS" BASIS, | |
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
See the License for the specific language governing permissions and | |
limitations under the License. | |
''' | |
# created by :- mrityunjai.kmr@gmail.com, www.mrityunjai.in | |
import urllib2 | |
import re | |
from bs4 import BeautifulSoup | |
class CPCB:
    'Parameters, Nitric Oxide, Nitrogen Dioxide, NOx, Sulfur Dioxide, Ozone, PM 2.5, Benzene, Toluene, Ethyl Benzene, Temperature, Relative Humidity, Wind Speed, Wind Direction, Vertical Wind Speed, Solar Radiation'

    # NOTE: the class docstring above is printed verbatim by the script entry
    # point (``CPCB.__doc__``), so it is runtime text and is kept unchanged.
    #
    # This class scrapes the current-readings table from the CPCB CAAQM page
    # addressed by CPCB_URL (the query string selects the "Dwarka" station)
    # and exposes the rows as plain dictionaries.

    # Page that holds the readings table.
    CPCB_URL = "http://www.cpcb.gov.in/CAAQM/frmCurrentDataNew.aspx?StationName=Dwarka&StateId=6&CityId=85"

    # Extracts the inner text of a ``<span style="name:value;">...</span>``
    # element from its serialized HTML. Compiled once instead of once per row.
    _VALUE_PATTERN = re.compile(r'<span style="\w+:\w+;">(.*?)</span>', re.IGNORECASE)

    def __init__(self):
        print("cpcb class instanciated")

    def getData(self):
        """Fetch the page and return every reading found in the table.

        Returns:
            dict mapping the text of a row's first cell to a dict with the
            keys "Parameters", "Date", "Time", "Concentration" and "Unit".
            "Concentration" is None when the fourth cell has no blue span
            matching the expected markup. Rows with six or fewer cells are
            ignored.

        Raises:
            AttributeError: if the span with id ``lblReportCurrentData`` or
                its table is absent (``find`` then returns None).
            Whatever ``urllib2.urlopen`` raises on network failure.
        """
        page = urllib2.urlopen(self.CPCB_URL)
        try:
            # BeautifulSoup consumes the response while building the tree,
            # so the connection can be released immediately afterwards.
            soup = BeautifulSoup(page, 'html.parser')
        finally:
            page.close()

        report = soup.find('span', attrs={'id': 'lblReportCurrentData'})
        tableRows = report.find('table').find_all('tr')

        airContents = {}
        for row in tableRows:
            cols = row.find_all('td')
            if len(cols) <= 6:
                # Not a data row (too few cells).
                continue

            subjectCase = cols[3].find('span', attrs={'style': 'color:Blue;'})
            # str(None) is 'None', which simply yields no match below.
            matches = self._VALUE_PATTERN.findall(str(subjectCase))
            concentrationValue = matches[0] if matches else None

            parameterName = cols[0].string
            airContents[parameterName] = {
                "Parameters": parameterName,
                "Date": cols[1].string,
                "Time": cols[2].string,
                "Concentration": concentrationValue,
                "Unit": cols[4].string,
            }
        return airContents

    def getDataOf(self, airParameter):
        """Return the reading dict for a single parameter.

        Performs a fresh fetch through ``getData`` and looks the parameter
        up by ``str(airParameter)``.

        Raises:
            KeyError: if the page does not list that parameter.
        """
        return self.getData()[str(airParameter)]
if __name__ == "__main__":
    # Demo run: show the supported parameter names, then the full scrape,
    # then the single-parameter lookup, each followed by a separator line.
    separator = "##################"
    scraper = CPCB()
    print(CPCB.__doc__)
    print(separator)
    allReadings = scraper.getData()
    print(allReadings.items())
    print(separator)
    noxReading = scraper.getDataOf('NOx')
    print(noxReading)
    print(separator)
#### | |
# Main script: imports cpcb.py and appends the scraped readings to data/cpcbData.csv
''' | |
Copyright 2017 Mrityunjai Kumar | |
Licensed under the Apache License, Version 2.0 (the "License"); | |
you may not use this file except in compliance with the License. | |
You may obtain a copy of the License at | |
http://www.apache.org/licenses/LICENSE-2.0 | |
Unless required by applicable law or agreed to in writing, software | |
distributed under the License is distributed on an "AS IS" BASIS, | |
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
See the License for the specific language governing permissions and | |
limitations under the License. | |
''' | |
# created by :- mrityunjai.kmr@gmail.com, www.mrityunjai.in | |
import csv | |
import os | |
import datetime | |
import time | |
import sys | |
reload(sys) | |
sys.setdefaultencoding('utf-8') | |
from time import gmtime, strftime | |
import cpcb as cpcbModule | |
# Read the clock once and derive every timestamp field from that single
# reading, so the Unix timestamp, GMT date and GMT time always describe the
# same instant (separate gmtime() calls could straddle a second or midnight).
_snapshot = time.time()
_snapshot_gmt = gmtime(_snapshot)
unixTime = int(_snapshot)
gmtDate = strftime("%d-%m-%Y", _snapshot_gmt)
gmtTime = strftime("%H:%M:%S", _snapshot_gmt)

# Output CSV lives in a "data" directory next to this script.
filename = 'cpcbData.csv'
data_file = os.path.abspath(
    os.path.join(os.path.dirname(__file__), 'data', filename))
# 1 if the CSV already existed when the script started, else 0.
file_existence_flag = int(os.path.isfile(data_file))

# Scrape once at start-up; the functions below read this module-level dict.
cpcb = cpcbModule.CPCB()
cpcbData = cpcb.getData()
def cpcbToCsv():
    """Append one CSV row per exported air parameter to ``data_file``.

    Reads the module-level ``cpcbData`` scrape and formats each row with
    ``makeStringForCSV``. The header row is written when the file does not
    exist yet or is empty. The target directory is created if missing.
    Parameters absent from ``cpcbData`` are reported and skipped, so one
    missing reading no longer aborts the export after partial rows.
    """
    header = ('GMT Time', 'Unix TimeStamp', 'Date', 'Time', 'Parameters',
              'Concentration', 'Unit', 'RecordDate', 'RecordTime')
    # 'Wind Speed' is deliberately not exported (it was disabled in the
    # original hand-written list).
    parameters = (
        'Nitric Oxide',
        'Nitrogen Dioxide',
        'NOx',
        'Sulfur Dioxide',
        'Ozone',
        'PM 2.5',
        'Benzene',
        'Toluene',
        'Ethyl Benzene',
        'Temperature',
        'Relative Humidity',
        'Wind Direction',
        'Vertical Wind Speed',
        'Solar Radiation',
    )

    # open() does not create intermediate directories.
    data_dir = os.path.dirname(data_file)
    if data_dir and not os.path.isdir(data_dir):
        os.makedirs(data_dir)

    # Decide at call time, before open() creates the file.
    needs_header = (not os.path.isfile(data_file)
                    or os.path.getsize(data_file) == 0)

    with open(data_file, 'a+') as f:
        writer = csv.writer(f, lineterminator='\n')
        if needs_header:
            print(data_file + " is created")
            writer.writerow(header)
        else:
            print(data_file + " is opened")

        for name in parameters:
            if name not in cpcbData:
                print('%s is missing from the scraped data; row skipped' % name)
                continue
            writer.writerow(makeStringForCSV(name))
def makeStringForCSV(x):
    """Build the CSV row for the air parameter named ``x``.

    The row is, in order: combined GMT date-time, Unix timestamp, GMT date,
    GMT time, then the parameter's name, concentration, unit, record date
    and record time taken from the module-level ``cpcbData``.

    The combined date-time is derived from ``unixTime`` rather than a fresh
    clock read, so all four timestamp columns of a row describe the same
    instant.

    Raises:
        KeyError: if ``x`` is not present in ``cpcbData``.
    """
    record = cpcbData[x]
    print('%s <= data' % str(record['Parameters']))
    return (
        strftime("%d-%m-%Y %H-%M-%S", gmtime(unixTime)),
        unixTime,
        gmtDate,
        gmtTime,
        record['Parameters'],
        record['Concentration'],
        record['Unit'],
        record['Date'],
        record['Time'],
    )
if __name__ == "__main__":
    # Export the scrape, then echo the record time of the Ozone reading.
    cpcbToCsv()
    ozoneRecordTime = cpcbData['Ozone']['Time']
    print('%s <= Time' % str(ozoneRecordTime))
    print("done")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment