Skip to content

Instantly share code, notes, and snippets.

@hrwgc
Last active December 12, 2015 07:28
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hrwgc/4736519 to your computer and use it in GitHub Desktop.
Save hrwgc/4736519 to your computer and use it in GitHub Desktop.
Scraper for NOAA VIIRS Combustion Source CSVs
import urllib2, urllib
import re, os
import sqlite3
import uuid
import contextlib
from bs4 import *
import csv
import unicodedata
base_url = "http://www.ngdc.noaa.gov/dmsp/data/viirs_fire/"
con = sqlite3.connect('noaa_fires.sqlite')
cur = con.cursor()
cur.execute('DROP TABLE IF EXISTS noaa_scraped;')
cur.execute('DROP TABLE IF EXISTS fires;')
cur.execute('DROP TABLE IF EXISTS spatial_ref_sys;')
cur.execute('DROP TABLE IF EXISTS geometry_columns;')
cur.execute('CREATE TABLE IF NOT EXISTS noaa_scraped (`url` TEXT PRIMARY KEY);')
cur.execute('CREATE TABLE IF NOT EXISTS geometry_columns ( f_table_name VARCHAR, f_geometry_column VARCHAR, geometry_type INTEGER, coord_dimension INTEGER, srid INTEGER, geometry_format VARCHAR);')
cur.execute("INSERT INTO geometry_columns ( f_table_name, f_geometry_column, geometry_type, coord_dimension, srid, geometry_format) values ('fires', 'GEOMETRY', '1', '2', '4326', 'WKT');")
cur.execute('CREATE TABLE IF NOT EXISTS spatial_ref_sys ( srid INTEGER UNIQUE, auth_name TEXT, auth_srid TEXT, srtext TEXT);')
cur.execute("INSERT INTO spatial_ref_sys ( srid, auth_name, auth_srid, srtext) values ('4326', 'EPSG', '4326', 'GEOGCS[\"WGS 84\",DATUM[\"WGS_1984\",SPHEROID[\"WGS 84\",6378137,298.257223563,AUTHORITY[\"EPSG\",\"7030\"]],AUTHORITY[\"EPSG\",\"6326\"]],PRIMEM[\"Greenwich\",0,AUTHORITY[\"EPSG\",\"8901\"]],UNIT[\"degree\",0.01745329251994328,AUTHORITY[\"EPSG\",\"9122\"]],AUTHORITY[\"EPSG\",\"4326\"]]');")
cur.execute('CREATE TABLE IF NOT EXISTS `fires` (`SOLA_GMTCO` TEXT, `DNB_Rad` TEXT, `SCVY_GMTCO` TEXT, `SCAY_GMTCO` TEXT, `COT_IVCOP` TEXT, `QF2_IICMO` TEXT, `M14_Rad` TEXT, `M07_Rad` TEXT, `QF1_GMTCO` TEXT, `M10_Line` TEXT, `SOLZ_GMTCO` TEXT, `QF3_IVCOP` TEXT, `SCPZ_GMTCO` TEXT, `Proc_Date` TEXT, `QF2_GMTCO` TEXT, `QF6_IICMO` TEXT, `QF4_IICMO` TEXT, `SCVZ_GMTCO` TEXT, `M10_Center` TEXT, `M10_Std` TEXT, `SCVX_GMTCO` TEXT, `M10_Nsigma` TEXT, `M10_File` TEXT, `M16_Rad` TEXT, `DNB_Line` TEXT, `M10_Sample` TEXT, `SATA_GMTCO` TEXT, `SCAZ_GMTCO` TEXT, `DNB_Dist` TEXT, `M10_Avg` TEXT, `DNB_Lat` TEXT, `DNB_Sample` TEXT, `Date_Mscan` TEXT, `M10_Rad` TEXT, `EPS_IVCOP` TEXT, `SATZ_GMTCO` TEXT, `M10_DN` TEXT, `BB_Temp` TEXT, `ID` VARCHAR primary key, `SCPX_GMTCO` TEXT, `M15_Rad` TEXT, `Lon_GMTCO` REAL, `SCAX_GMTCO` TEXT, `QF3_IICMO` TEXT, `QF1_IICMO` TEXT, `QF2_IVCOP` TEXT, `M13_Rad` TEXT, `QF1_IVCOP` TEXT, `Total_Rad` TEXT, `Lat_GMTCO` REAL, `CM_IICMO` TEXT, `DNB_Lon` TEXT, `SCPY_GMTCO` TEXT, `QF5_IICMO` TEXT, `M12_Rad` TEXT, `M08_Rad` TEXT, `GEOMETRY` BLOB);')
with contextlib.closing(urllib.urlopen(base_url)) as x:
html = x.read()
soup = BeautifulSoup(html)
table = soup.body.find('div',attrs={"id":"main_text"}).table
rows = table.find_all('tr')
for x in range(8,len(rows)):
if rows[x].find(href=re.compile('\.csv')) != None:
data_stub_url = rows[x].find(href=re.compile('\.csv'))
data_url = data_stub_url['href']
cur.execute('INSERT into noaa_scraped values(?)',(str(data_url),))
with contextlib.closing(urllib.urlopen(base_url + data_url)) as x:
data = x.read()
reader = csv.DictReader(data.splitlines())
for row in reader:
row['ID']=uuid.uuid1()
row['GEOMETRY'] = "POINT(" + row['Lon_GMTCO'] + ", " + row['Lat_GMTCO'] + ")"
try:
cur.execute('INSERT into fires values( ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? )', (row["SOLA_GMTCO"], row["DNB_Rad"], row["SCVY_GMTCO"], row["SCAY_GMTCO"], row["COT_IVCOP"], row["QF2_IICMO"], row["M14_Rad"], row["M07_Rad"], row["QF1_GMTCO"], row["M10_Line"], row["SOLZ_GMTCO"], row["QF3_IVCOP"], row["SCPZ_GMTCO"], row["Proc_Date"], row["QF2_GMTCO"], row["QF6_IICMO"], row["QF4_IICMO"], row["SCVZ_GMTCO"], row["M10_Center"], row["M10_Std"], row["SCVX_GMTCO"], row["M10_Nsigma"], row["M10_File"], row["M16_Rad"], row["DNB_Line"], row["M10_Sample"], row["SATA_GMTCO"], row["SCAZ_GMTCO"], row["DNB_Dist"], row["M10_Avg"], row["DNB_Lat"], row["DNB_Sample"], row["Date_Mscan"], row["M10_Rad"], row["EPS_IVCOP"], row["SATZ_GMTCO"], row["M10_DN"], row["BB_Temp"], unicode(row["ID"]), row["SCPX_GMTCO"], row["M15_Rad"], float(row["Lon_GMTCO"]), row["SCAX_GMTCO"], row["QF3_IICMO"], row["QF1_IICMO"], row["QF2_IVCOP"], row["M13_Rad"], row["QF1_IVCOP"], row["Total_Rad"], float(row["Lat_GMTCO"]), row["CM_IICMO"], row["DNB_Lon"], row["SCPY_GMTCO"], row["QF5_IICMO"], row["M12_Rad"], row["M08_Rad"], row["GEOMETRY"]))
con.commit()
except:
print ("Fail:")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment