Last active
December 12, 2015 07:28
-
-
Save hrwgc/4736519 to your computer and use it in GitHub Desktop.
Scraper for NOAA VIIRS Combustion Source CSVs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import urllib2, urllib | |
import re, os | |
import sqlite3 | |
import uuid | |
import contextlib | |
from bs4 import * | |
import csv | |
import unicodedata | |
base_url = "http://www.ngdc.noaa.gov/dmsp/data/viirs_fire/" | |
con = sqlite3.connect('noaa_fires.sqlite') | |
cur = con.cursor() | |
cur.execute('DROP TABLE IF EXISTS noaa_scraped;') | |
cur.execute('DROP TABLE IF EXISTS fires;') | |
cur.execute('DROP TABLE IF EXISTS spatial_ref_sys;') | |
cur.execute('DROP TABLE IF EXISTS geometry_columns;') | |
cur.execute('CREATE TABLE IF NOT EXISTS noaa_scraped (`url` TEXT PRIMARY KEY);') | |
cur.execute('CREATE TABLE IF NOT EXISTS geometry_columns ( f_table_name VARCHAR, f_geometry_column VARCHAR, geometry_type INTEGER, coord_dimension INTEGER, srid INTEGER, geometry_format VARCHAR);') | |
cur.execute("INSERT INTO geometry_columns ( f_table_name, f_geometry_column, geometry_type, coord_dimension, srid, geometry_format) values ('fires', 'GEOMETRY', '1', '2', '4326', 'WKT');") | |
cur.execute('CREATE TABLE IF NOT EXISTS spatial_ref_sys ( srid INTEGER UNIQUE, auth_name TEXT, auth_srid TEXT, srtext TEXT);') | |
cur.execute("INSERT INTO spatial_ref_sys ( srid, auth_name, auth_srid, srtext) values ('4326', 'EPSG', '4326', 'GEOGCS[\"WGS 84\",DATUM[\"WGS_1984\",SPHEROID[\"WGS 84\",6378137,298.257223563,AUTHORITY[\"EPSG\",\"7030\"]],AUTHORITY[\"EPSG\",\"6326\"]],PRIMEM[\"Greenwich\",0,AUTHORITY[\"EPSG\",\"8901\"]],UNIT[\"degree\",0.01745329251994328,AUTHORITY[\"EPSG\",\"9122\"]],AUTHORITY[\"EPSG\",\"4326\"]]');") | |
cur.execute('CREATE TABLE IF NOT EXISTS `fires` (`SOLA_GMTCO` TEXT, `DNB_Rad` TEXT, `SCVY_GMTCO` TEXT, `SCAY_GMTCO` TEXT, `COT_IVCOP` TEXT, `QF2_IICMO` TEXT, `M14_Rad` TEXT, `M07_Rad` TEXT, `QF1_GMTCO` TEXT, `M10_Line` TEXT, `SOLZ_GMTCO` TEXT, `QF3_IVCOP` TEXT, `SCPZ_GMTCO` TEXT, `Proc_Date` TEXT, `QF2_GMTCO` TEXT, `QF6_IICMO` TEXT, `QF4_IICMO` TEXT, `SCVZ_GMTCO` TEXT, `M10_Center` TEXT, `M10_Std` TEXT, `SCVX_GMTCO` TEXT, `M10_Nsigma` TEXT, `M10_File` TEXT, `M16_Rad` TEXT, `DNB_Line` TEXT, `M10_Sample` TEXT, `SATA_GMTCO` TEXT, `SCAZ_GMTCO` TEXT, `DNB_Dist` TEXT, `M10_Avg` TEXT, `DNB_Lat` TEXT, `DNB_Sample` TEXT, `Date_Mscan` TEXT, `M10_Rad` TEXT, `EPS_IVCOP` TEXT, `SATZ_GMTCO` TEXT, `M10_DN` TEXT, `BB_Temp` TEXT, `ID` VARCHAR primary key, `SCPX_GMTCO` TEXT, `M15_Rad` TEXT, `Lon_GMTCO` REAL, `SCAX_GMTCO` TEXT, `QF3_IICMO` TEXT, `QF1_IICMO` TEXT, `QF2_IVCOP` TEXT, `M13_Rad` TEXT, `QF1_IVCOP` TEXT, `Total_Rad` TEXT, `Lat_GMTCO` REAL, `CM_IICMO` TEXT, `DNB_Lon` TEXT, `SCPY_GMTCO` TEXT, `QF5_IICMO` TEXT, `M12_Rad` TEXT, `M08_Rad` TEXT, `GEOMETRY` BLOB);') | |
with contextlib.closing(urllib.urlopen(base_url)) as x: | |
html = x.read() | |
soup = BeautifulSoup(html) | |
table = soup.body.find('div',attrs={"id":"main_text"}).table | |
rows = table.find_all('tr') | |
for x in range(8,len(rows)): | |
if rows[x].find(href=re.compile('\.csv')) != None: | |
data_stub_url = rows[x].find(href=re.compile('\.csv')) | |
data_url = data_stub_url['href'] | |
cur.execute('INSERT into noaa_scraped values(?)',(str(data_url),)) | |
with contextlib.closing(urllib.urlopen(base_url + data_url)) as x: | |
data = x.read() | |
reader = csv.DictReader(data.splitlines()) | |
for row in reader: | |
row['ID']=uuid.uuid1() | |
row['GEOMETRY'] = "POINT(" + row['Lon_GMTCO'] + ", " + row['Lat_GMTCO'] + ")" | |
try: | |
cur.execute('INSERT into fires values( ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? )', (row["SOLA_GMTCO"], row["DNB_Rad"], row["SCVY_GMTCO"], row["SCAY_GMTCO"], row["COT_IVCOP"], row["QF2_IICMO"], row["M14_Rad"], row["M07_Rad"], row["QF1_GMTCO"], row["M10_Line"], row["SOLZ_GMTCO"], row["QF3_IVCOP"], row["SCPZ_GMTCO"], row["Proc_Date"], row["QF2_GMTCO"], row["QF6_IICMO"], row["QF4_IICMO"], row["SCVZ_GMTCO"], row["M10_Center"], row["M10_Std"], row["SCVX_GMTCO"], row["M10_Nsigma"], row["M10_File"], row["M16_Rad"], row["DNB_Line"], row["M10_Sample"], row["SATA_GMTCO"], row["SCAZ_GMTCO"], row["DNB_Dist"], row["M10_Avg"], row["DNB_Lat"], row["DNB_Sample"], row["Date_Mscan"], row["M10_Rad"], row["EPS_IVCOP"], row["SATZ_GMTCO"], row["M10_DN"], row["BB_Temp"], unicode(row["ID"]), row["SCPX_GMTCO"], row["M15_Rad"], float(row["Lon_GMTCO"]), row["SCAX_GMTCO"], row["QF3_IICMO"], row["QF1_IICMO"], row["QF2_IVCOP"], row["M13_Rad"], row["QF1_IVCOP"], row["Total_Rad"], float(row["Lat_GMTCO"]), row["CM_IICMO"], row["DNB_Lon"], row["SCPY_GMTCO"], row["QF5_IICMO"], row["M12_Rad"], row["M08_Rad"], row["GEOMETRY"])) | |
con.commit() | |
except: | |
print ("Fail:") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment