Created
June 27, 2016 19:53
-
-
Save wboykinm/2d3782a5c90de5ef4bece2971e69c710 to your computer and use it in GitHub Desktop.
A scratchpad for dealing with the FEC whole-cycle data dumps
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# look at the top 20 rows of the data | |
head -n 20 itcont.txt | csvlook -d '|' | |
# combine a header file with a csv-formatted ful table | |
cp indiv_header_file.csv individual_2016.csv | |
csvformat -d '|' itcont.txt >> individual_2016.csv | |
# pull out all the records where the donor is from Vermont | |
csvgrep -c 10 -m VT individual_2016.csv > individual_2016_vt.csv | |
# join committee information to donors | |
csvjoin -c "CMTE_ID,CMTE_ID" individual_2016.csv committees_2016.csv --left > individual_2016_joined.csv |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- build the shell of the table with the right schema for both individual contributions and committee info | |
DROP TABLE IF EXISTS individual_2016; | |
CREATE TABLE individual_2016 ( | |
cmte_id text, | |
amndt_ind text, | |
rpt_tp text, | |
transaction_pgi text, | |
image_num text, | |
transaction_tp text, | |
entity_tp text, | |
name text, | |
city text, | |
state text, | |
zip_code text, | |
employer text, | |
occupation text, | |
transaction_dt text, | |
transaction_amt int, | |
other_id text, | |
tran_id text, | |
file_num text, | |
memo_cd text, | |
memo_text text, | |
sub_id text, | |
cmte_nm text, | |
tres_nm text, | |
cmte_st1 text, | |
cmte_st2 text, | |
cmte_city text, | |
cmte_st text, | |
cmte_zip text, | |
cmte_dsgn text, | |
cmte_tp text, | |
cmte_pty_affiliation text, | |
cmte_filing_freq text, | |
org_tp text, | |
connected_org_nm text, | |
cand_id text | |
); | |
-- pull in TEH DATAZ | |
\COPY individual_2016 FROM 'usa_2016.csv' DELIMITER ',' CSV HEADER; | |
-- add a few indexes to speed things up | |
CREATE INDEX IF NOT EXISTS ind_name_idx ON individual_2016 USING GIN(to_tsvector('english', name)); | |
CREATE INDEX IF NOT EXISTS state_idx ON individual_2016 (state); | |
CREATE INDEX IF NOT EXISTS zip_code_idx ON individual_2016 (zip_code); | |
CREATE INDEX IF NOT EXISTS occupation_idx ON individual_2016 USING GIN(to_tsvector('english', occupation)); | |
CREATE INDEX IF NOT EXISTS employer_idx ON individual_2016 USING GIN(to_tsvector('english', employer)); | |
CREATE INDEX IF NOT EXISTS cmte_nm_idx ON individual_2016 USING GIN(to_tsvector('english', cmte_nm)); | |
CREATE INDEX IF NOT EXISTS cmte_tp_idx ON individual_2016 (cmte_tp); | |
CREATE INDEX IF NOT EXISTS transaction_amt_idx ON individual_2016 (transaction_amt); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment