Rio517/process_open_secrets.sh

## process_open_secrets.sh
#### README ######
#
#### Overview
# This script prepares FEC data for use in a 3rd party search service.Info
# on search service can be found here:
# https://developer.ibm.com/clouddataservices/2016/01/21/introducing-simple-faceted-search-service/
#
# This was script was created pretty quickly, but should mostly work on any
# generic *nix box with the dependencies listed below.
#
# This bash script will create a combined output of FEC donors, with joined info
# on who they donated to. This is for MAYDAY.US's upcoming donor strike.  To use,
# you need to:
#
#   1. Visit opensecrets.org http://www.opensecrets.org/myos/bulk.php to download
#      the election data you want. (Sign up and log in to access the url.)
#   2. Look for the heading "Campaign Finance Data", and download the zip files
#      of "cycle tables" you're looking for (Example: 2016 Cycle tables).
#   3. Unzip all the files you downloaded so that all the unzipped files are in
#      the same single directory together.
#   4. Place this file in that same folder. Then run it in your terminal with:
#      `bash process_open_secrets.sh`. The whole script will take 10-25 minutes
#      to run, depending on your machine.  It will attempt to "say" and audio alert
#      when done.

#### NOTE: To use csvkit, you'll need to install it.
#   sudo apt-get install python-dev python-pip python-setuptools build-essential
#   sudo pip install csvkit

#### NOTE: Ensure you have uconv installed.
#   sudo apt-get install uconv

#### OPEN SECRETS TABLE HEADERS  (for general info)
# candidates
# Cycle, FECCandID, CID, FirstLastP, Party, DistIDRunFor, DistIDCurr, CurrCand, CycleCand, CRPICO, RecipCode, NoPacs
# committees
# Cycle, CmteID, PACShort, Affiliate, Ultorg, RecipID, RecipCode, FECCandID, Party, PrimCode, Source, Sensitive, Foreign, Active
# individuals
# FECTransID, ContribID, Contrib, RecipID, Orgname, UltOrg, RealCode, Date, Amount, Street, City, State, Zip, RecipCode, Type, CmteID, OtherID, Gender, Microfilm, Occupation, Employer, Source
#
## We don't need the Pacs data, so I didn't gather it. Look in the users guide PDF.


###### AFTER GENERATING
# This data needs to be uploaded into our bluemix Simple Search Service (SSS) instance here:
#   http://simple-search-service-mayday-2040.mybluemix.net/#/settings
# If the app is in "lockdown mode", you will have to disable that, read more here: https://github.com/ibm-cds-labs/simple-search-service
# FInally, you can log in to change those environment variables in our app, the console to start at is here: https://console.ng.bluemix.net/?direct=classic/#/resources
# PW, should be in lastpass, but you may need to reset it.  email is admin@mayday.us


#bash commands after unzip all the files from opensecrets
echo 'Starting... This full process will take about 15 minutes on a well speced laptop.'
echo 'Removing files that will remain unused...'
rm pac*.txt
rm CF_*_Read_Me.txt
echo 'combining downloaded years into single files... '
cat cands*.txt > all_cands.txt
cat cmtes*.txt > all_cmtes.txt
cat indivs*.txt > all_indivs.txt
rm cands*.txt cmtes*.txt indivs*.txt


echo 'Cleaning up opensecrets data to our liking...'
echo 'This will take about 7wc - minutes, depending on your machine and how many files you combined...'
date
sed 's/\"/\\\"/g' all_*.txt -i
sed 's/|/\"/g' all_*.txt -i
uconv --from-code ISO_8859-1 --to-code UTF8 all_indivs.txt -o all_indivs_utf8.txt
uconv --from-code ISO_8859-1 --to-code UTF8 all_cands.txt -o all_cands_utf8.txt
uconv --from-code ISO_8859-1 --to-code UTF8 all_cmtes.txt -o all_cmtes_utf8.txt
echo 'Filter out committee contributions...'
cat all_indivs_utf8.txt  | awk -F, '$3 ~ !/" "/ {print}' > all_filtered_indivs_utf8.txt
date
echo 'Ok, now, let us join those puppies... This is over 6 million records so this will take a while... 20 minutes or so...'
join -t, <(csvcut -c 5,2,4,19,21-22,1,9-10,12-14 all_filtered_indivs_utf8.txt | sort) <(csvcut -c 2,3 all_cmtes_utf8.txt | sort) > matched_committees.csv
join -t, <(csvcut -c 5,2,4,19,21-22,1,9-10,12-14 all_filtered_indivs_utf8.txt | sort) <(csvcut -c 3,4-7 all_cands_utf8.txt | sort) > matched_candidates.csv
awk -F, '!_[$2]++' matched_committees.csv  > deduped_matched_committees.csv
awk -F, '!_[$2]++' matched_candidates.csv  > deduped_matched_candidates.csv
date
echo 'Ok, now let us put that all in one file.  Look for opensecrets_aggregated_data.csv'
echo 'recipient_id,contribution_id,name,gender,occupation,employer,cycle,contributed_at,amount,city,state,zip,recipient,party,office' > opensecrets_aggregated_data.csv
cat deduped_matched_c*.csv >> opensecrets_aggregated_data.csv
echo 'Cleaning up intermediary files...'
rm matched_c*.csv all_*.txt deduped_matched_c*.csv
echo 'We are done processing open secrets data! Whoo hoo!'
say 'We are done processing open secrets data! Whoo hoo!'
echo 'We are done processing open secrets data! Whoo hoo!' | espeak -s 120 2>/dev/null
	#### README ######
	#
	#### Overview
	# This script prepares FEC data for use in a 3rd party search service.Info
	# on search service can be found here:
	# https://developer.ibm.com/clouddataservices/2016/01/21/introducing-simple-faceted-search-service/
	#
	# This was script was created pretty quickly, but should mostly work on any
	# generic *nix box with the dependencies listed below.
	#
	# This bash script will create a combined output of FEC donors, with joined info
	# on who they donated to. This is for MAYDAY.US's upcoming donor strike. To use,
	# you need to:
	#
	# 1. Visit opensecrets.org http://www.opensecrets.org/myos/bulk.php to download
	# the election data you want. (Sign up and log in to access the url.)
	# 2. Look for the heading "Campaign Finance Data", and download the zip files
	# of "cycle tables" you're looking for (Example: 2016 Cycle tables).
	# 3. Unzip all the files you downloaded so that all the unzipped files are in
	# the same single directory together.
	# 4. Place this file in that same folder. Then run it in your terminal with:
	# `bash process_open_secrets.sh`. The whole script will take 10-25 minutes
	# to run, depending on your machine. It will attempt to "say" and audio alert
	# when done.

	#### NOTE: To use csvkit, you'll need to install it.
	# sudo apt-get install python-dev python-pip python-setuptools build-essential
	# sudo pip install csvkit

	#### NOTE: Ensure you have uconv installed.
	# sudo apt-get install uconv

	#### OPEN SECRETS TABLE HEADERS (for general info)
	# candidates
	# Cycle, FECCandID, CID, FirstLastP, Party, DistIDRunFor, DistIDCurr, CurrCand, CycleCand, CRPICO, RecipCode, NoPacs
	# committees
	# Cycle, CmteID, PACShort, Affiliate, Ultorg, RecipID, RecipCode, FECCandID, Party, PrimCode, Source, Sensitive, Foreign, Active
	# individuals
	# FECTransID, ContribID, Contrib, RecipID, Orgname, UltOrg, RealCode, Date, Amount, Street, City, State, Zip, RecipCode, Type, CmteID, OtherID, Gender, Microfilm, Occupation, Employer, Source
	#
	## We don't need the Pacs data, so I didn't gather it. Look in the users guide PDF.


	###### AFTER GENERATING
	# This data needs to be uploaded into our bluemix Simple Search Service (SSS) instance here:
	# http://simple-search-service-mayday-2040.mybluemix.net/#/settings
	# If the app is in "lockdown mode", you will have to disable that, read more here: https://github.com/ibm-cds-labs/simple-search-service
	# FInally, you can log in to change those environment variables in our app, the console to start at is here: https://console.ng.bluemix.net/?direct=classic/#/resources
	# PW, should be in lastpass, but you may need to reset it. email is admin@mayday.us


	#bash commands after unzip all the files from opensecrets
	echo 'Starting... This full process will take about 15 minutes on a well speced laptop.'
	echo 'Removing files that will remain unused...'
	rm pac*.txt
	rm CF_*_Read_Me.txt
	echo 'combining downloaded years into single files... '
	cat cands*.txt > all_cands.txt
	cat cmtes*.txt > all_cmtes.txt
	cat indivs*.txt > all_indivs.txt
	rm cands.txt cmtes.txt indivs*.txt


	echo 'Cleaning up opensecrets data to our liking...'
	echo 'This will take about 7wc - minutes, depending on your machine and how many files you combined...'
	date
	sed 's/\"/\\\"/g' all_*.txt -i
	sed 's/\|/\"/g' all_*.txt -i
	uconv --from-code ISO_8859-1 --to-code UTF8 all_indivs.txt -o all_indivs_utf8.txt
	uconv --from-code ISO_8859-1 --to-code UTF8 all_cands.txt -o all_cands_utf8.txt
	uconv --from-code ISO_8859-1 --to-code UTF8 all_cmtes.txt -o all_cmtes_utf8.txt
	echo 'Filter out committee contributions...'
	cat all_indivs_utf8.txt \| awk -F, '$3 ~ !/" "/ {print}' > all_filtered_indivs_utf8.txt
	date
	echo 'Ok, now, let us join those puppies... This is over 6 million records so this will take a while... 20 minutes or so...'
	join -t, <(csvcut -c 5,2,4,19,21-22,1,9-10,12-14 all_filtered_indivs_utf8.txt \| sort) <(csvcut -c 2,3 all_cmtes_utf8.txt \| sort) > matched_committees.csv
	join -t, <(csvcut -c 5,2,4,19,21-22,1,9-10,12-14 all_filtered_indivs_utf8.txt \| sort) <(csvcut -c 3,4-7 all_cands_utf8.txt \| sort) > matched_candidates.csv
	awk -F, '!_[$2]++' matched_committees.csv > deduped_matched_committees.csv
	awk -F, '!_[$2]++' matched_candidates.csv > deduped_matched_candidates.csv
	date
	echo 'Ok, now let us put that all in one file. Look for opensecrets_aggregated_data.csv'
	echo 'recipient_id,contribution_id,name,gender,occupation,employer,cycle,contributed_at,amount,city,state,zip,recipient,party,office' > opensecrets_aggregated_data.csv
	cat deduped_matched_c*.csv >> opensecrets_aggregated_data.csv
	echo 'Cleaning up intermediary files...'
	rm matched_c.csv all_.txt deduped_matched_c*.csv
	echo 'We are done processing open secrets data! Whoo hoo!'
	say 'We are done processing open secrets data! Whoo hoo!'
	echo 'We are done processing open secrets data! Whoo hoo!' \| espeak -s 120 2>/dev/null