charleslparker/gist:3037105

## gistfile1.sh
#!/bin/bash

# Credentials for BigML and for the US Census API.
# A value already exported in the environment wins; otherwise the defaults
# written here are used, so the script no longer has to be edited to run it
# with different keys.
# NOTE(review): the **** defaults look like redacted placeholders - supply
# real keys via the environment or by replacing them here.
BIGML_USERNAME="${BIGML_USERNAME:-bigml}"
BIGML_API_KEY="${BIGML_API_KEY:-****}"
CENSUS_API_KEY="${CENSUS_API_KEY:-****}"

# Query-string fragment carrying the BigML username and API key.
BIGML_AUTH="username=$BIGML_USERNAME;api_key=$BIGML_API_KEY"

# Download demographic data into files.  Have to do it this way
# because the census API doesn't seem to allow returns of more
# than eight columns (at least, not with this many rows).

# fetch_acs5 VARIABLES OUTFILE
#   Requests the comma-separated list of 2010 ACS 5-year VARIABLES for every
#   county and writes the raw response body to OUTFILE.  Reads CENSUS_API_KEY.
#   --fail makes curl report HTTP error statuses as failures, so an error
#   page is never saved and parsed as data by the later steps.  On any curl
#   failure a message goes to stderr and the whole script exits with status 1.
fetch_acs5() {
  local variables=$1
  local outfile=$2

  if ! curl --fail "http://thedataweb.rm.census.gov/data/2010/acs5?get=${variables}&for=county:*&key=${CENSUS_API_KEY}" > "$outfile"; then
    echo "Download of ${variables} into ${outfile} failed" >&2
    exit 1
  fi
}

# Median income for various levels of education
fetch_acs5 "B20004_002E,B20004_003E,B20004_004E,B20004_005E,B20004_006E" inc.tmp

# Female educational attainment figures, up to associate's degrees
# (normalized into shares of the population further down)
fetch_acs5 "B15002_019E,B15002_028E,B15002_029E,B15002_030E,B15002_031E" fem1.tmp

# Female educational attainment figures, bachelor's degrees and above
fetch_acs5 "B15002_032E,B15002_033E,B15002_034E,B15002_035E" fem2.tmp

# The number of people in poverty along with the total number
# for whom poverty was determined.
fetch_acs5 "B17001_001E,B17001_002E" pov.tmp

# Male educational attainment figures, up to associate's degrees
# (normalized into shares of the population further down)
fetch_acs5 "B15002_002E,B15002_011E,B15002_012E,B15002_013E,B15002_014E" male1.tmp

# Male educational attainment figures, bachelor's degrees and above
fetch_acs5 "B15002_015E,B15002_016E,B15002_017E,B15002_018E" male2.tmp

# Each gender's education data arrived as two downloads.  Join the two
# files side by side (line by line, separated by a comma) so that every
# gender ends up with a single combined file.
for gender in fem male; do
  paste -d, "${gender}1.tmp" "${gender}2.tmp" > "${gender}-edu-cat.tmp"
done

# Turn the raw downloads into plain comma-separated columns.
#
# Every awk call splits fields on runs of brackets, commas and double
# quotes.  The separator is single-quoted so the shell can never expand it
# as a filename pattern.  Presumably each row starts with a bracket, which
# leaves $1 empty and makes $2 the first value - confirm against a response.
#
# All helpers skip the first line (the header).  A new header is put in
# at the end.

# education_shares FILE
#   Normalizes the education columns of FILE by the total in $2 and prints
#   four cumulative shares per row: high school or greater, associates or
#   greater, bachelors or greater, and graduate or greater.
#   A share is left empty when the total is zero: awk implementations may
#   abort on division by zero, which would cut the column short and
#   misalign the rows when the columns are pasted together.
education_shares() {
  awk -F '[][,"]+' '
    function share(count, total) {
      return (total + 0 == 0) ? "" : count / total
    }
    NR > 1 {
      graduate    = $10 + $11 + $12
      bachelors   = $9 + graduate
      associates  = $6 + bachelors
      high_school = $3 + $4 + $5 + associates
      print share(high_school, $2) "," share(associates, $2) "," share(bachelors, $2) "," share(graduate, $2)
    }
  ' "$1"
}

# income_columns FILE
#   Prints fields 2-6 of every data row of FILE, comma separated; this just
#   cleans the brackets and quotes out of the income download.
income_columns() {
  awk -F '[][,"]+' 'NR > 1 { print $2 "," $3 "," $4 "," $5 "," $6 }' "$1"
}

# poverty_rate FILE
#   Prints people in poverty ($3) divided by the total ($2) for every data
#   row of FILE, or an empty line when the total is zero.
poverty_rate() {
  awk -F '[][,"]+' 'NR > 1 { rate = ($2 + 0 == 0) ? "" : $3 / $2; print rate }' "$1"
}

# Female education shares
education_shares fem-edu-cat.tmp > fem-edu.tmp

# Do the same thing with the male education stats.
education_shares male-edu-cat.tmp > male-edu.tmp

# Income columns
income_columns inc.tmp > income.tmp

# Poverty variable
poverty_rate pov.tmp > poverty.tmp

# Friendlier column names for the final CSV, written as its first line
csv_header="Income: Less than High School,Income: High School,Income: Associates,Income: Bachelors,Income: Graduate,Education: Female - High School,Education: Female - Associate's,Education: Female - Bachelor's,Education: Female - Graduate,Education: Male - High School,Education: Male - Associate's,Education: Male - Bachelor's,Education: Male - Graduate,Poverty"
printf '%s\n' "$csv_header" > head.tmp

# Glue the per-topic column files together line by line, comma separated
paste -d, income.tmp fem-edu.tmp male-edu.tmp poverty.tmp > data.tmp

# Header line first, data rows after it
cat head.tmp data.tmp > census-data.csv

# remove_temp_files
#   Deletes exactly the temporary files this script writes.  A blanket
#   *.tmp glob would also delete unrelated .tmp files that happen to live in
#   the working directory.  -f keeps the call quiet when a file is already
#   gone; -- stops option parsing before the file names.
remove_temp_files() {
  rm -f -- \
    inc.tmp fem1.tmp fem2.tmp pov.tmp male1.tmp male2.tmp \
    fem-edu-cat.tmp male-edu-cat.tmp \
    fem-edu.tmp male-edu.tmp income.tmp poverty.tmp \
    head.tmp data.tmp
}

# Remove temporary files
remove_temp_files

# Create a BigML datasource with the resulting file.
# The URL is quoted so that the "?" is never treated as a glob character and
# the expanded credentials are never word-split by the shell.
curl "https://bigml.io/andromeda/source?$BIGML_AUTH" -F "file=@census-data.csv"
	#!/bin/bash

	# Credentials for BigML and for the US Census API.
	# A value already exported in the environment wins; otherwise the defaults
	# written here are used, so the script no longer has to be edited to run it
	# with different keys.
	# NOTE(review): the **** defaults look like redacted placeholders - supply
	# real keys via the environment or by replacing them here.
	BIGML_USERNAME="${BIGML_USERNAME:-bigml}"
	BIGML_API_KEY="${BIGML_API_KEY:-****}"
	CENSUS_API_KEY="${CENSUS_API_KEY:-****}"

	# Query-string fragment carrying the BigML username and API key.
	BIGML_AUTH="username=$BIGML_USERNAME;api_key=$BIGML_API_KEY"

	# Download demographic data into files. Have to do it this way
	# because the census API doesn't seem to allow returns of more
	# than eight columns (at least, not with this many rows).

	# fetch_acs5 VARIABLES OUTFILE
	#   Requests the comma-separated list of 2010 ACS 5-year VARIABLES for every
	#   county and writes the raw response body to OUTFILE.  Reads CENSUS_API_KEY.
	#   --fail makes curl report HTTP error statuses as failures, so an error
	#   page is never saved and parsed as data by the later steps.  On any curl
	#   failure a message goes to stderr and the whole script exits with status 1.
	fetch_acs5() {
	  local variables=$1
	  local outfile=$2

	  if ! curl --fail "http://thedataweb.rm.census.gov/data/2010/acs5?get=${variables}&for=county:*&key=${CENSUS_API_KEY}" > "$outfile"; then
	    echo "Download of ${variables} into ${outfile} failed" >&2
	    exit 1
	  fi
	}

	# Median income for various levels of education
	fetch_acs5 "B20004_002E,B20004_003E,B20004_004E,B20004_005E,B20004_006E" inc.tmp

	# Female educational attainment figures, up to associate's degrees
	# (normalized into shares of the population further down)
	fetch_acs5 "B15002_019E,B15002_028E,B15002_029E,B15002_030E,B15002_031E" fem1.tmp

	# Female educational attainment figures, bachelor's degrees and above
	fetch_acs5 "B15002_032E,B15002_033E,B15002_034E,B15002_035E" fem2.tmp

	# The number of people in poverty along with the total number
	# for whom poverty was determined.
	fetch_acs5 "B17001_001E,B17001_002E" pov.tmp

	# Male educational attainment figures, up to associate's degrees
	# (normalized into shares of the population further down)
	fetch_acs5 "B15002_002E,B15002_011E,B15002_012E,B15002_013E,B15002_014E" male1.tmp

	# Male educational attainment figures, bachelor's degrees and above
	fetch_acs5 "B15002_015E,B15002_016E,B15002_017E,B15002_018E" male2.tmp

	# Each gender's education data arrived as two downloads.  Join the two
	# files side by side (line by line, separated by a comma) so that every
	# gender ends up with a single combined file.
	for gender in fem male; do
	  paste -d, "${gender}1.tmp" "${gender}2.tmp" > "${gender}-edu-cat.tmp"
	done

	# Turn the raw downloads into plain comma-separated columns.
	#
	# Every awk call splits fields on runs of brackets, commas and double
	# quotes.  The separator is single-quoted so the shell can never expand it
	# as a filename pattern.  Presumably each row starts with a bracket, which
	# leaves $1 empty and makes $2 the first value - confirm against a response.
	#
	# All helpers skip the first line (the header).  A new header is put in
	# at the end.

	# education_shares FILE
	#   Normalizes the education columns of FILE by the total in $2 and prints
	#   four cumulative shares per row: high school or greater, associates or
	#   greater, bachelors or greater, and graduate or greater.
	#   A share is left empty when the total is zero: awk implementations may
	#   abort on division by zero, which would cut the column short and
	#   misalign the rows when the columns are pasted together.
	education_shares() {
	  awk -F '[][,"]+' '
	    function share(count, total) {
	      return (total + 0 == 0) ? "" : count / total
	    }
	    NR > 1 {
	      graduate    = $10 + $11 + $12
	      bachelors   = $9 + graduate
	      associates  = $6 + bachelors
	      high_school = $3 + $4 + $5 + associates
	      print share(high_school, $2) "," share(associates, $2) "," share(bachelors, $2) "," share(graduate, $2)
	    }
	  ' "$1"
	}

	# income_columns FILE
	#   Prints fields 2-6 of every data row of FILE, comma separated; this just
	#   cleans the brackets and quotes out of the income download.
	income_columns() {
	  awk -F '[][,"]+' 'NR > 1 { print $2 "," $3 "," $4 "," $5 "," $6 }' "$1"
	}

	# poverty_rate FILE
	#   Prints people in poverty ($3) divided by the total ($2) for every data
	#   row of FILE, or an empty line when the total is zero.
	poverty_rate() {
	  awk -F '[][,"]+' 'NR > 1 { rate = ($2 + 0 == 0) ? "" : $3 / $2; print rate }' "$1"
	}

	# Female education shares
	education_shares fem-edu-cat.tmp > fem-edu.tmp

	# Do the same thing with the male education stats.
	education_shares male-edu-cat.tmp > male-edu.tmp

	# Income columns
	income_columns inc.tmp > income.tmp

	# Poverty variable
	poverty_rate pov.tmp > poverty.tmp

	# Friendlier column names for the final CSV, written as its first line
	csv_header="Income: Less than High School,Income: High School,Income: Associates,Income: Bachelors,Income: Graduate,Education: Female - High School,Education: Female - Associate's,Education: Female - Bachelor's,Education: Female - Graduate,Education: Male - High School,Education: Male - Associate's,Education: Male - Bachelor's,Education: Male - Graduate,Poverty"
	printf '%s\n' "$csv_header" > head.tmp

	# Glue the per-topic column files together line by line, comma separated
	paste -d, income.tmp fem-edu.tmp male-edu.tmp poverty.tmp > data.tmp

	# Header line first, data rows after it
	cat head.tmp data.tmp > census-data.csv

	# remove_temp_files
	#   Deletes exactly the temporary files this script writes.  A blanket
	#   *.tmp glob would also delete unrelated .tmp files that happen to live in
	#   the working directory.  -f keeps the call quiet when a file is already
	#   gone; -- stops option parsing before the file names.
	remove_temp_files() {
	  rm -f -- \
	    inc.tmp fem1.tmp fem2.tmp pov.tmp male1.tmp male2.tmp \
	    fem-edu-cat.tmp male-edu-cat.tmp \
	    fem-edu.tmp male-edu.tmp income.tmp poverty.tmp \
	    head.tmp data.tmp
	}

	# Remove temporary files
	remove_temp_files

	# Create a BigML datasource with the resulting file.
	# The URL is quoted so that the "?" is never treated as a glob character and
	# the expanded credentials are never word-split by the shell.
	curl "https://bigml.io/andromeda/source?$BIGML_AUTH" -F "file=@census-data.csv"