Skip to content

Instantly share code, notes, and snippets.

@mtigas
Forked from brianboyer/import_usat
Created February 18, 2011 19:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mtigas/834285 to your computer and use it in GitHub Desktop.
Save mtigas/834285 to your computer and use it in GitHub Desktop.
#!/bin/bash
#
# usat_import
# Fork by Mike Tigas
# Based on work by Ryan Nagle, Chris Groskopf and Brian Boyer
#
# Updated to allow import of multiple states into the same database.
# Unlike the original script, this *only* generates super-wide tables.
#
# Shapes are stored in WGS84 (EPSG:4326) for portability.
#
# This is excessive, I know.
# =====
#
# How-to:
#
# 0: Requires Python, PostgreSQL+PostGIS, and a GIS stack.
# Build your own with Homebrew on OSX (what I do):
# http://blog.apps.chicagotribune.com/2010/02/17/quick-install-pythonpostgis-geo-stack-on-snow-leopard/
#
# Also requires usat_table.py helper script, which generates the create statement
# for those extra-wide tables.
#
# 1: Get data.
#
# Join IRE (it's cheap!), and download the data for your state:
# http://www.ire.org/getcensus/
#
# Download the state, county, and tract shapefiles for your state, from the US Census:
# http://www.census.gov/cgi-bin/geo/shapefiles2010/main
#
# 3: Unzip all those files in the same directory.
#
# 4: Make sure the settings below (DATABASE_* and CENSUS_*_TABLE) are what you want.
#
# 5: Run this.
#
# cd into the directory your data is in.
#
# /path/to/import_usat.sh (state_abbr) (state_number)
# where `state_number` is the middle bit in the shapefile file names: "46" if you got tl_2010_46_state10
# i.e.: /path/to/import_usat.sh NJ 34
#
# DB info
DATABASE_NAME=census2010geo
DATABASE_USER=mtigas
DATABASE_HOST="127.0.0.1"
# Where to create the extra-huge tables
CENSUS_STATE_TABLE="census2010_state"
CENSUS_COUNTY_TABLE="census2010_county"
CENSUS_TRACT_TABLE="census2010_censustract"
# =====
#dropdb census2010geo
#createdb census2010geo -T template_postgis
# =====
#STATE="NJ"
#STATE_NUM="34"
#STATE="TX"
#STATE_NUM="48"
#STATE="SD"
#STATE_NUM="46"
STATE="$1"
STATE_NUM="$2"
BINDIR="`dirname $0`"
# =====
# Do our target state/county/tract tables exist?
psql -d $DATABASE_NAME -h $DATABASE_HOST -U $DATABASE_USER -c "select 111 from ${CENSUS_STATE_TABLE}" > /dev/null 2>&1
CREATE_STATE_TABLE=$?
psql -d $DATABASE_NAME -h $DATABASE_HOST -U $DATABASE_USER -c "select 111 from $CENSUS_COUNTY_TABLE" > /dev/null 2>&1
CREATE_COUNTY_TABLE=$?
psql -d $DATABASE_NAME -h $DATABASE_HOST -U $DATABASE_USER -c "select 111 from $CENSUS_TRACT_TABLE" > /dev/null 2>&1
CREATE_TRACT_TABLE=$?
# =====
# If we have leftover temporary tables (from a previously-failed import), drop 'em.
psql -d $DATABASE_NAME -h $DATABASE_HOST -U $DATABASE_USER -c "drop table if exists ${CENSUS_STATE_TABLE}_tempdata; drop table if exists ${CENSUS_STATE_TABLE}_tempgeo;"> /dev/null 2>&1
psql -d $DATABASE_NAME -h $DATABASE_HOST -U $DATABASE_USER -c "drop table if exists ${CENSUS_COUNTY_TABLE}_tempdata; drop table if exists ${CENSUS_COUNTY_TABLE}_tempgeo;"> /dev/null 2>&1
psql -d $DATABASE_NAME -h $DATABASE_HOST -U $DATABASE_USER -c "drop table if exists ${CENSUS_TRACT_TABLE}_tempdata; drop table if exists ${CENSUS_TRACT_TABLE}_tempgeo;"> /dev/null 2>&1
# =====
# State-level table
ogr2ogr -f PostgreSQL PG:"host=$DATABASE_HOST user=$DATABASE_USER dbname=$DATABASE_NAME" tl_2010_${STATE_NUM}_state10/tl_2010_${STATE_NUM}_state10.shp -a_srs EPSG:4326 -s_srs EPSG:4326 -nlt multipolygon -nln ${CENSUS_STATE_TABLE}_tempgeo
python $BINDIR/usat_table.py ${CENSUS_STATE_TABLE}_tempdata $STATE/State$STATE.csv | psql -d $DATABASE_NAME -h $DATABASE_HOST -U $DATABASE_USER
psql -d $DATABASE_NAME -h $DATABASE_HOST -U $DATABASE_USER -c "copy ${CENSUS_STATE_TABLE}_tempdata from '$PWD/$STATE/State$STATE.csv' delimiters ',' CSV HEADER;"
if [ "$CREATE_STATE_TABLE" = "1" ]; then
echo "===== CREATING STATE TABLE WITH $STATE ====="
psql -d $DATABASE_NAME -h $DATABASE_HOST -U $DATABASE_USER -c "select * into $CENSUS_STATE_TABLE from ${CENSUS_STATE_TABLE}_tempdata inner join ${CENSUS_STATE_TABLE}_tempgeo on ${CENSUS_STATE_TABLE}_tempdata.state = cast(${CENSUS_STATE_TABLE}_tempgeo.statefp10 as int);"
CREATE_STATE_TABLE=0
else
echo "===== IMPORTING $STATE INTO STATE TABLE ====="
psql -d $DATABASE_NAME -h $DATABASE_HOST -U $DATABASE_USER -c "INSERT INTO $CENSUS_STATE_TABLE SELECT * FROM ${CENSUS_STATE_TABLE}_tempdata inner join ${CENSUS_STATE_TABLE}_tempgeo on ${CENSUS_STATE_TABLE}_tempdata.state = cast(${CENSUS_STATE_TABLE}_tempgeo.statefp10 as int);"
fi
psql -d $DATABASE_NAME -h $DATABASE_HOST -U $DATABASE_USER -c "drop table ${CENSUS_STATE_TABLE}_tempdata; drop table ${CENSUS_STATE_TABLE}_tempgeo;"
# =====
# County-level table
ogr2ogr -f PostgreSQL PG:"host=$DATABASE_HOST user=$DATABASE_USER dbname=$DATABASE_NAME" tl_2010_${STATE_NUM}_county10/tl_2010_${STATE_NUM}_county10.shp -a_srs EPSG:4326 -s_srs EPSG:4326 -nlt multipolygon -nln ${CENSUS_COUNTY_TABLE}_tempgeo
python $BINDIR/usat_table.py ${CENSUS_COUNTY_TABLE}_tempdata $STATE/County$STATE.csv | psql -d $DATABASE_NAME -h $DATABASE_HOST -U $DATABASE_USER
psql -d $DATABASE_NAME -h $DATABASE_HOST -U $DATABASE_USER -c "copy ${CENSUS_COUNTY_TABLE}_tempdata from '$PWD/$STATE/County$STATE.csv' delimiters ',' CSV HEADER;"
if [ "$CREATE_COUNTY_TABLE" = "1" ]; then
echo "===== CREATING COUNTY TABLE WITH $STATE ====="
psql -d $DATABASE_NAME -h $DATABASE_HOST -U $DATABASE_USER -c "select * into $CENSUS_COUNTY_TABLE from ${CENSUS_COUNTY_TABLE}_tempdata inner join ${CENSUS_COUNTY_TABLE}_tempgeo on ${CENSUS_COUNTY_TABLE}_tempdata.state = cast(${CENSUS_COUNTY_TABLE}_tempgeo.statefp10 as int) and ${CENSUS_COUNTY_TABLE}_tempdata.county = cast(${CENSUS_COUNTY_TABLE}_tempgeo.countyfp10 as int);"
else
echo "===== IMPORTING $STATE INTO COUNTY TABLE ====="
psql -d $DATABASE_NAME -h $DATABASE_HOST -U $DATABASE_USER -c "INSERT INTO $CENSUS_COUNTY_TABLE SELECT * FROM ${CENSUS_COUNTY_TABLE}_tempdata inner join ${CENSUS_COUNTY_TABLE}_tempgeo on ${CENSUS_COUNTY_TABLE}_tempdata.state = cast(${CENSUS_COUNTY_TABLE}_tempgeo.statefp10 as int) and ${CENSUS_COUNTY_TABLE}_tempdata.county = cast(${CENSUS_COUNTY_TABLE}_tempgeo.countyfp10 as int);"
fi
psql -d $DATABASE_NAME -h $DATABASE_HOST -U $DATABASE_USER -c "drop table ${CENSUS_COUNTY_TABLE}_tempdata; drop table ${CENSUS_COUNTY_TABLE}_tempgeo;"
# =====
# Census Tract-level table
ogr2ogr -f PostgreSQL PG:"host=$DATABASE_HOST user=$DATABASE_USER dbname=$DATABASE_NAME" tl_2010_${STATE_NUM}_tract10/tl_2010_${STATE_NUM}_tract10.shp -a_srs EPSG:4326 -s_srs EPSG:4326 -nlt multipolygon -nln ${CENSUS_TRACT_TABLE}_tempgeo
python $BINDIR/usat_table.py ${CENSUS_TRACT_TABLE}_tempdata $STATE/Tract$STATE.csv | psql -d $DATABASE_NAME -h $DATABASE_HOST -U $DATABASE_USER
psql -d $DATABASE_NAME -h $DATABASE_HOST -U $DATABASE_USER -c "copy ${CENSUS_TRACT_TABLE}_tempdata from '$PWD/$STATE/Tract$STATE.csv' delimiters ',' CSV HEADER;"
if [ "$CREATE_TRACT_TABLE" = "1" ]; then
echo "===== CREATING TRACT TABLE WITH $STATE ====="
psql -d $DATABASE_NAME -h $DATABASE_HOST -U $DATABASE_USER -c "select * into $CENSUS_TRACT_TABLE from ${CENSUS_TRACT_TABLE}_tempdata inner join ${CENSUS_TRACT_TABLE}_tempgeo on ${CENSUS_TRACT_TABLE}_tempdata.state = cast(${CENSUS_TRACT_TABLE}_tempgeo.statefp10 as int) and ${CENSUS_TRACT_TABLE}_tempdata.county = cast(${CENSUS_TRACT_TABLE}_tempgeo.countyfp10 as int) and ${CENSUS_TRACT_TABLE}_tempdata.tract = cast(${CENSUS_TRACT_TABLE}_tempgeo.tractce10 as int);"
else
echo "===== IMPORTING $STATE INTO TRACT TABLE ====="
psql -d $DATABASE_NAME -h $DATABASE_HOST -U $DATABASE_USER -c "INSERT INTO $CENSUS_TRACT_TABLE SELECT * FROM ${CENSUS_TRACT_TABLE}_tempdata inner join ${CENSUS_TRACT_TABLE}_tempgeo on ${CENSUS_TRACT_TABLE}_tempdata.state = cast(${CENSUS_TRACT_TABLE}_tempgeo.statefp10 as int) and ${CENSUS_TRACT_TABLE}_tempdata.county = cast(${CENSUS_TRACT_TABLE}_tempgeo.countyfp10 as int) and ${CENSUS_TRACT_TABLE}_tempdata.tract = cast(${CENSUS_TRACT_TABLE}_tempgeo.tractce10 as int);"
fi
psql -d $DATABASE_NAME -h $DATABASE_HOST -U $DATABASE_USER -c "drop table ${CENSUS_TRACT_TABLE}_tempdata; drop table ${CENSUS_TRACT_TABLE}_tempgeo;"
# =====
psql -d $DATABASE_NAME -h $DATABASE_HOST -U $DATABASE_USER -c "VACUUM"
exit
# tests
# SELECT name FROM "census2010_state" WHERE ST_Contains("census2010_state"."wkb_geometry", ST_GeomFromEWKB(E'\\001\\001\\000\\000 \\346\\020\\000\\000\\000\\000\\000\\000\\200fY\\3005\\201\\377\\310\\363\\014@@'::bytea));
# explain SELECT name FROM "census2010_state" WHERE ST_Contains("census2010_state"."wkb_geometry", ST_GeomFromEWKB(E'\\001\\001\\000\\000 \\346\\020\\000\\000\\000\\000\\000\\000\\200fY\\3005\\201\\377\\310\\363\\014@@'::bytea));
#
# SELECT name, _2000pop, p0010001 as _2010pop, pctchgtotpop2010_2000 from "census2010_state" WHERE "name" = 'Texas';
# EXPLAIN SELECT name, _2000pop, p0010001 as _2010pop, pctchgtotpop2010_2000 from "census2010_state" WHERE "name" = 'Texas';
#
# SELECT s.name, t._2000pop, t.p0010001 as _2010pop, t.pctchgtotpop2010_2000 from "census2010_state" s, "census2010_censustract" t WHERE t."state" = '34' AND t."state"=s."state";
# EXPLAIN SELECT s.name, t._2000pop, t.p0010001 as _2010pop, t.pctchgtotpop2010_2000 from "census2010_state" s, "census2010_censustract" t WHERE t."state" = '34' AND t."state"=s."state";
# create_table.py
#
# Python helper for PosegreSQL importer of pre-processed census files
# provided to IRE members by Paul Overberg and Anthony DeBarros from USA Today.
#
# See usat_import for usage
#
import csv
import sys
table_name = sys.argv[1]
file_name = sys.argv[2]
r = csv.reader(open(file_name,'rb'))
headers = r.next()
values = r.next()
s = "create table %s (\n" % (table_name)
for i,val in enumerate(headers):
col = headers[i]
value = values[i]
if value == '': #blank, skip it!
col += " numeric"
else:
try:
float(value)
col += " numeric"
except ValueError:
col += " varchar(50)"
if len(headers) - 1 != i:
col += ",\n"
s += col
s += "\n);"
print s
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment