Skip to content

Instantly share code, notes, and snippets.

@jmealo
Created August 15, 2017 15:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jmealo/f652e380e3edd5521313dbbc49e79287 to your computer and use it in GitHub Desktop.
Save jmealo/f652e380e3edd5521313dbbc49e79287 to your computer and use it in GitHub Desktop.
Download CCD schools and load the CSV file into PostgreSQL with reasonable full text searching enabled
#!/bin/bash -e
apt-get install -y wget unzip recode
wget -nc https://nces.ed.gov/ccd/Data/zip/ccd_sch_029_1516_txt_prel_tab.zip
unzip -o ccd_sch_029_1516_txt_prel_tab.zip
recode ISO-8859-15..UTF8 /tmp/ccd_sch_029_1516_txt_prel.tab
#EXPLAIN ANALYZE select sch_name, lea_name, ncessch, website, updated_status, charter_text, ts_rank(search, to_tsquery('simple', 'Michi:*')) AS rank FROM ccd_schools ORDER BY rank DESC LIMIT 100;
#EXPLAIN ANALYZE WITH results AS (select sch_name, lea_name, ncessch, website, updated_status, charter_text, search FROM ccd_schools WHERE search @@ to_tsquery('simple', 'Michi:*')) SELECT *, ts_rank(search, to_tsquery('simple', 'Michi:*')) AS rank FROM results ORDER BY rank DESC LIMIT 100;
psql spark <<EOSQL
CREATE TABLE IF NOT EXISTS public.ccd_schools (
survyear TEXT,
fipst TEXT,
stabr TEXT,
statename TEXT,
seaname TEXT,
leaid TEXT,
st_leaid TEXT,
lea_name TEXT,
schid TEXT,
st_schid TEXT,
ncessch TEXT,
sch_name TEXT,
mstreet1 TEXT,
mstreet2 TEXT,
mstreet3 TEXT,
mcity TEXT,
mstate TEXT,
mzip TEXT,
mzip4 TEXT,
phone TEXT,
lstreet1 TEXT,
lstreet2 TEXT,
lstreet3 TEXT,
lcity TEXT,
lstate TEXT,
lzip TEXT,
lzip4 TEXT,
website TEXT,
"union" TEXT,
out_of_state_flag TEXT,
sch_type_text TEXT,
sch_type TEXT,
recon_status TEXT,
gslo TEXT,
gshi TEXT,
level TEXT,
virtual TEXT,
bies TEXT,
sy_status_text TEXT,
sy_status TEXT,
updated_status_text TEXT,
updated_status TEXT,
effective_date DATE,
charter_text TEXT,
pkoffered TEXT,
kgoffered TEXT,
g1offered TEXT,
g2offered TEXT,
g3offered TEXT,
g4offered TEXT,
g5offered TEXT,
g6offered TEXT,
g7offered TEXT,
g8offered TEXT,
g9offered TEXT,
g10offered TEXT,
g11offered TEXT,
g12offered TEXT,
g13offered TEXT,
aeoffered TEXT,
ugoffered TEXT,
nogrades TEXT,
igoffered TEXT,
chartauth1 TEXT,
chartauthn1 TEXT,
chartauth2 TEXT,
chartauthn2 TEXT
);
TRUNCATE TABLE public.ccd_schools;
EOSQL
psql spark <<EOSQL
COPY ccd_schools(
survyear,
fipst,
stabr,
statename,
seaname,
leaid,
st_leaid,
lea_name,
schid,
st_schid,
ncessch,
sch_name,
mstreet1,
mstreet2,
mstreet3,
mcity,
mstate,
mzip,
mzip4,
phone,
lstreet1,
lstreet2,
lstreet3,
lcity,
lstate,
lzip,
lzip4,
website,
"union",
out_of_state_flag,
sch_type_text,
sch_type,
recon_status,
gslo,
gshi,
level,
virtual,
bies,
sy_status_text,
sy_status,
updated_status_text,
updated_status,
effective_date,
charter_text,
pkoffered,
kgoffered,
g1offered,
g2offered,
g3offered,
g4offered,
g5offered,
g6offered,
g7offered,
g8offered,
g9offered,
g10offered,
g11offered,
g12offered,
g13offered,
aeoffered,
ugoffered,
nogrades,
igoffered,
chartauth1,
chartauthn1,
chartauth2,
chartauthn2
)
FROM '/tmp/ccd_sch_029_1516_txt_prel.tab'
EOSQL
psql spark <<EOSQL
DROP INDEX IF EXISTS ccd_schools_idx_fts_search;
ALTER TABLE public.ccd_schools ADD COLUMN IF NOT EXISTS "search" tsvector;
UPDATE public.ccd_schools SET search =
setweight(to_tsvector(coalesce(sch_name,'')), 'A') ||
setweight(to_tsvector(coalesce(lea_name,'')), 'B') ||
setweight(to_tsvector(coalesce(mstreet1,'')), 'C') ||
setweight(to_tsvector(coalesce(mcity,'')), 'D');
CREATE INDEX IF NOT EXISTS ccd_schools_idx_fts_search ON public.ccd_schools USING gin(search);
EOSQL
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment