Last active
December 5, 2022 21:59
-
-
Save mikesparr/e90c406f246a7f015e582c8847cdf975 to your computer and use it in GitHub Desktop.
Quick data loader for national animal shelter (and pet type/breed) data for research
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# FETCH AND BUILD DATA LOCALLY
#############################################################
# "FETCH" MARKET RESEARCH DATA :-)
# - https://www.petfinder.com/developers/v2/docs
# - TIP: store CLIENT_ID and CLIENT_SECRET in .env-local file
#############################################################
# fetch token for building org DB
export DATA_DIR="data"
export ORG_DIR="orgs"
export PET_DIR="pets"
export API_BASE="https://api.petfinder.com"
source .env-local # provides CLIENT_ID and CLIENT_SECRET
export API_TOKEN=$(curl -d "grant_type=client_credentials&client_id=$CLIENT_ID&client_secret=$CLIENT_SECRET" "$API_BASE/v2/oauth2/token")
export API_HEADER="Authorization: $(echo "$API_TOKEN" | jq -r .token_type) $(echo "$API_TOKEN" | jq -r .access_token)"

# fetch organizations from every state
mkdir -p "$DATA_DIR/$ORG_DIR"

# create array from states file (with 1 abbr state per line)
states=()
while IFS= read -r line; do
  states+=("$line")
done < us_states.txt

# loop through states array and fetch records from API
# NOTE: "${states[@]}" expands to every element (a bare $states is only the
# first element in bash), and the URL must be quoted so the literal '&' is
# not treated as the shell's background operator.
for state in "${states[@]}"; do
  curl -H "$API_HEADER" "$API_BASE/v2/organizations?state=$state&limit=100" > "$DATA_DIR/$ORG_DIR/$state.json"
  sleep 3 # avoid rate-limit block
done
# fetch animal types
mkdir -p "$DATA_DIR/$PET_DIR"
curl -H "$API_HEADER" "$API_BASE/v2/types" > "$DATA_DIR/$PET_DIR/types.json"

# loop through types and fetch breeds for each
jq -r '.types[]._links.breeds.href' "$DATA_DIR/$PET_DIR/types.json" | while read -r link; do
  # link looks like /v2/types/<type>/breeds; the type name is the 4th path
  # element. (${link:h4:t} is zsh-only modifier syntax and fails in bash.)
  type=$(echo "$link" | cut -d'/' -f4)
  echo "Fetching breeds for type $type"
  curl -H "$API_HEADER" "$API_BASE$link" > "$DATA_DIR/$PET_DIR/breeds-$type.json"
done
#############################################################
# BUILD SHELTER & PET DATABASE
# - https://launchschool.com/books/sql_first_edition/read/create_table#createtable
# - https://hasura.io/learn/database/postgresql/core-concepts/3-postgresql-data-types-columns/
# - https://www.cockroachlabs.com/docs/stable/uuid.html
# - https://www.enterprisedb.com/postgres-tutorials/how-load-sample-database-schemas-and-data-postgresql
# - https://www.cockroachlabs.com/docs/cockroachcloud/quickstart.html?filters=local
#############################################################
# install cockroachdb on Mac
brew tap cockroachdb/tap
brew install cockroach
brew services start cockroachdb/tap/cockroach # read console for URL:PORT
brew install libpq      # install psql client
brew link --force libpq # add psql to path

# create database schema
export SCHEMA_FILE="schema.sql"
cat > "$SCHEMA_FILE" << EOF
-- IF EXISTS keeps the first-ever run from failing before the DB exists
DROP DATABASE IF EXISTS pet_publish;
CREATE DATABASE IF NOT EXISTS pet_publish;
DROP TABLE IF EXISTS organizations;
CREATE TABLE IF NOT EXISTS organizations (
    id serial,
    uid uuid DEFAULT gen_random_uuid(),
    name VARCHAR(255) NOT NULL,
    description text,
    address1 text,
    address2 text,
    city text,
    state varchar(2),
    postal_code varchar(10),
    country varchar(2),
    email text,
    phone varchar(60),
    website text,
    created timestamp NOT NULL DEFAULT NOW(),
    modified timestamp,
    status smallint DEFAULT 1,
    sort_order smallint,
    extra text,
    PRIMARY KEY (uid),
    UNIQUE (id)
);
DROP TABLE IF EXISTS animal_types;
CREATE TABLE IF NOT EXISTS animal_types (
    id serial,
    uid uuid DEFAULT gen_random_uuid(),
    name VARCHAR(100) NOT NULL,
    code VARCHAR(100) NOT NULL,
    description text,
    coats jsonb,
    colors jsonb,
    genders jsonb,
    created timestamp NOT NULL DEFAULT NOW(),
    modified timestamp,
    status smallint DEFAULT 1,
    sort_order smallint,
    extra text,
    PRIMARY KEY (uid),
    UNIQUE (id)
);
DROP TABLE IF EXISTS animal_breeds;
CREATE TABLE IF NOT EXISTS animal_breeds (
    id serial,
    animal_type_id uuid NOT NULL,
    uid uuid DEFAULT gen_random_uuid(),
    name VARCHAR(100) NOT NULL,
    created timestamp NOT NULL DEFAULT NOW(),
    modified timestamp,
    status smallint DEFAULT 1,
    sort_order smallint,
    extra text,
    PRIMARY KEY (uid),
    UNIQUE (id),
    CONSTRAINT fk_animal_type_id FOREIGN KEY (animal_type_id) REFERENCES animal_types (uid) ON DELETE CASCADE
);
EOF
# load schema.sql file into database
psql -f "$SCHEMA_FILE" -h localhost -p 26257 -U root pet_publish
#############################################################
# LOAD FILES INTO DATABASE
#############################################################
export DATABASE_URL="postgresql://root@localhost:26257/pet_publish?sslmode=disable"

# setup python environment (quote ${PWD} so paths with spaces survive)
python3 -m venv "${PWD}/env"

# activate environment and upgrade pip
source "${PWD}/env/bin/activate"
# `python3 -m pip` guarantees the venv's pip rather than whichever pip3 is
# first on PATH
python3 -m pip install --upgrade pip

# install python dependencies
python3 -m pip install requests beautifulsoup4 "psycopg[binary]"

# run data loader
python3 data_loader.py
# cheer!
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
""" | |
Loads JSON file data into RDBMS. | |
""" | |
import logging | |
import os | |
import sys | |
import glob | |
import json | |
import psycopg | |
def get_animal_types_from_files():
    """Build a list of animal types (with nested breed lists) from the
    JSON files fetched into data/pets/ by the shell script.

    Returns:
        list[dict]: one dict per animal type with keys ``type_id``,
        ``name``, ``coats``, ``colors``, ``genders`` and ``breeds``
        (a list of ``{'name': ...}`` dicts).
    """
    results = []
    with open('data/pets/types.json', 'r') as f:
        try:
            raw_types = json.load(f)['types']
        except (KeyError, json.JSONDecodeError) as err:
            print(err)
            return results
    # NOTE: the original appended derived dicts to the very list it was
    # iterating, so appended entries were re-visited and raised KeyError.
    # Build a separate results list instead.
    for animal_type in raw_types:
        # the type id is the last path element of the self link,
        # e.g. /v2/types/dog -> "dog"
        type_id = os.path.basename(os.path.normpath(animal_type['_links']['self']['href']))
        # build nested breeds list for this animal type
        breeds_list = []
        breeds_file = f'data/pets/breeds-{type_id}.json'
        try:
            with open(breeds_file) as b:
                breeds_json = json.load(b)
            breeds_list = [{'name': breed['name']} for breed in breeds_json['breeds']]
        except (OSError, KeyError, json.JSONDecodeError) as err:
            # a missing/malformed breeds file leaves breeds empty instead
            # of aborting the whole run
            print(err)
        results.append({
            'type_id': type_id,
            'name': animal_type['name'],
            'coats': animal_type['coats'],
            'colors': animal_type['colors'],
            'genders': animal_type['genders'],
            'breeds': breeds_list,
        })
    print(f'Parsed files with {len(results)} types')
    return results
def get_organizations_from_files():
    """Build a flat list of organization dicts from every per-state JSON
    file under data/orgs/.

    A malformed file is skipped with a message; an individual organization
    missing optional fields no longer discards the rest of its file —
    absent values come back as None via dict.get.

    Returns:
        list[dict]: flattened organization records ready for DB load.
    """
    organizations = []
    for data_file in glob.glob('data/orgs/*', recursive=True):
        with open(data_file, 'r') as f:
            try:
                orgs = json.load(f)['organizations']
            except (KeyError, json.JSONDecodeError) as e:
                print(f'Skipping {data_file}')
                print(e)
                continue
        for org in orgs:
            address = org.get('address') or {}
            organizations.append({
                'name': org.get('name'),
                'description': org.get('mission_statement'),
                'address1': address.get('address1'),
                'address2': address.get('address2'),
                'city': address.get('city'),
                'state': address.get('state'),
                'postal_code': address.get('postcode'),
                'country': address.get('country'),
                'email': org.get('email'),
                'phone': org.get('phone'),
                'website': org.get('website'),
            })
    print(f'Parsed files with {len(organizations)} rows')
    return organizations
def load_animal_types(conn):
    """Builds animal types/breeds dictionary and loads in database.

    NOTE(review): not implemented yet — presumably intended to mirror
    load_organizations() using get_animal_types_from_files(); its call
    site in main() is commented out.
    """
    pass
def load_organizations(conn):
    """Builds national organizations dictionary and loads in database.

    Args:
        conn: an open psycopg connection (CockroachDB cluster — the
            statement uses CockroachDB's UPSERT syntax).
    """
    orgs = get_organizations_from_files()
    # column order must match the statement's column list below
    columns = ('name', 'description', 'address1', 'address2', 'city',
               'state', 'postal_code', 'country', 'email', 'phone', 'website')
    try:
        with conn.cursor() as cur:
            for org in orgs:
                # lazy %-args: the message is only formatted if emitted
                logging.info("Loading %s into database", org['name'])
                # values are parameterized, never string-interpolated
                cur.execute(
                    """UPSERT INTO organizations
                    (name, description, address1, address2, city, state, postal_code,
                    country, email, phone, website)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                    """,
                    tuple(org[col] for col in columns),
                )
                logging.debug("load_organizations(): status message: %s", cur.statusmessage)
    except Exception:
        # logging.exception records the full traceback, not just str(e)
        logging.exception("load_organizations() failed")
def main():
    """Configure DEBUG-level logging to stdout and run the data load."""
    root = logging.getLogger()
    root.setLevel(logging.DEBUG)
    handler = logging.StreamHandler(sys.stdout)
    handler.setLevel(logging.DEBUG)
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    handler.setFormatter(formatter)
    root.addHandler(handler)
    try:
        logging.info("connecting to database")
        # the connection context manager commits on success and closes the
        # connection in every case, so no explicit conn.close() is needed.
        # (The old `finally: conn.close()` raised NameError whenever
        # connect() itself failed, because conn was never bound.)
        with psycopg.connect(os.environ['DATABASE_URL']) as conn:
            logging.info("loading organizations")
            load_organizations(conn)
            #load_animal_types(conn)
    except Exception:
        # record the traceback alongside the message
        logging.exception("database connection failed")
if __name__ == "__main__":
    # script entry point — run the loader only when executed directly
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Overview
Kicking off research for a new startup idea my wife had: a mobile app for animal shelter staff that simplifies promoting pets online. Leveraged online resources to build a database of shelters for initial research to flesh out designs for the MVP app.
DB of nearly 5,000 shelters and organizations in US
Registered domain and contracted logo design in a day