wboykinm/process.sh

## process.sh
# look at the top 20 rows of the data
head -n 20 itcont.txt | csvlook -d '|'

# combine a header file with a csv-formatted ful table
cp indiv_header_file.csv individual_2016.csv
csvformat -d '|' itcont.txt >> individual_2016.csv

# pull out all the records where the donor is from Vermont
csvgrep -c 10 -m VT individual_2016.csv > individual_2016_vt.csv

# join committee information to donors
csvjoin -c "CMTE_ID,CMTE_ID" individual_2016.csv committees_2016.csv --left > individual_2016_joined.csv

## process.sql

-- build the shell of the table with the right schema for both individual contributions and committee info
DROP TABLE IF EXISTS individual_2016;
CREATE TABLE individual_2016 (
  cmte_id text,
  amndt_ind text,
  rpt_tp text,
  transaction_pgi text,
  image_num text,
  transaction_tp text,
  entity_tp text,
  name text,
  city text,
  state text,
  zip_code text,
  employer text,
  occupation text,
  transaction_dt text,
  transaction_amt int,
  other_id text,
  tran_id text,
  file_num text,
  memo_cd text,
  memo_text text,
  sub_id text,
  cmte_nm text,
  tres_nm text,
  cmte_st1 text,
  cmte_st2 text,
  cmte_city text,
  cmte_st text,
  cmte_zip text,
  cmte_dsgn text,
  cmte_tp text,
  cmte_pty_affiliation text,
  cmte_filing_freq text,
  org_tp text,
  connected_org_nm text,
  cand_id text
);

-- pull in TEH DATAZ
\COPY individual_2016 FROM 'usa_2016.csv' DELIMITER ',' CSV HEADER;

-- add a few indexes to speed things up
CREATE INDEX IF NOT EXISTS ind_name_idx ON individual_2016 USING GIN(to_tsvector('english', name));
CREATE INDEX IF NOT EXISTS state_idx ON individual_2016 (state);
CREATE INDEX IF NOT EXISTS zip_code_idx ON individual_2016 (zip_code);
CREATE INDEX IF NOT EXISTS occupation_idx ON individual_2016 USING GIN(to_tsvector('english', occupation));
CREATE INDEX IF NOT EXISTS employer_idx ON individual_2016 USING GIN(to_tsvector('english', employer));
CREATE INDEX IF NOT EXISTS cmte_nm_idx ON individual_2016 USING GIN(to_tsvector('english', cmte_nm));
CREATE INDEX IF NOT EXISTS cmte_tp_idx ON individual_2016 (cmte_tp);
CREATE INDEX IF NOT EXISTS transaction_amt_idx ON individual_2016 (transaction_amt);
	# look at the top 20 rows of the data
	head -n 20 itcont.txt \| csvlook -d '\|'

	# combine a header file with a csv-formatted ful table
	cp indiv_header_file.csv individual_2016.csv
	csvformat -d '\|' itcont.txt >> individual_2016.csv

	# pull out all the records where the donor is from Vermont
	csvgrep -c 10 -m VT individual_2016.csv > individual_2016_vt.csv

	# join committee information to donors
	csvjoin -c "CMTE_ID,CMTE_ID" individual_2016.csv committees_2016.csv --left > individual_2016_joined.csv

	-- build the shell of the table with the right schema for both individual contributions and committee info
	DROP TABLE IF EXISTS individual_2016;
	CREATE TABLE individual_2016 (
	cmte_id text,
	amndt_ind text,
	rpt_tp text,
	transaction_pgi text,
	image_num text,
	transaction_tp text,
	entity_tp text,
	name text,
	city text,
	state text,
	zip_code text,
	employer text,
	occupation text,
	transaction_dt text,
	transaction_amt int,
	other_id text,
	tran_id text,
	file_num text,
	memo_cd text,
	memo_text text,
	sub_id text,
	cmte_nm text,
	tres_nm text,
	cmte_st1 text,
	cmte_st2 text,
	cmte_city text,
	cmte_st text,
	cmte_zip text,
	cmte_dsgn text,
	cmte_tp text,
	cmte_pty_affiliation text,
	cmte_filing_freq text,
	org_tp text,
	connected_org_nm text,
	cand_id text
	);

	-- pull in TEH DATAZ
	\COPY individual_2016 FROM 'usa_2016.csv' DELIMITER ',' CSV HEADER;

	-- add a few indexes to speed things up
	CREATE INDEX IF NOT EXISTS ind_name_idx ON individual_2016 USING GIN(to_tsvector('english', name));
	CREATE INDEX IF NOT EXISTS state_idx ON individual_2016 (state);
	CREATE INDEX IF NOT EXISTS zip_code_idx ON individual_2016 (zip_code);
	CREATE INDEX IF NOT EXISTS occupation_idx ON individual_2016 USING GIN(to_tsvector('english', occupation));
	CREATE INDEX IF NOT EXISTS employer_idx ON individual_2016 USING GIN(to_tsvector('english', employer));
	CREATE INDEX IF NOT EXISTS cmte_nm_idx ON individual_2016 USING GIN(to_tsvector('english', cmte_nm));
	CREATE INDEX IF NOT EXISTS cmte_tp_idx ON individual_2016 (cmte_tp);
	CREATE INDEX IF NOT EXISTS transaction_amt_idx ON individual_2016 (transaction_amt);