Last active
December 5, 2022 21:59
-
-
Save mikesparr/e90c406f246a7f015e582c8847cdf975 to your computer and use it in GitHub Desktop.
Quick data loader for national animal shelter (and pet type/breed) data for research
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# FETCH AND BUILD DATA LOCALLY
#############################################################
# "FETCH" MARKET RESEARCH DATA :-)
# - https://www.petfinder.com/developers/v2/docs
# - TIP: store CLIENT_ID and CLIENT_SECRET in .env-local file
#############################################################
# fetch token for building org DB
export DATA_DIR="data"
export ORG_DIR="orgs"
export PET_DIR="pets"
export API_BASE="https://api.petfinder.com"
source .env-local # provides CLIENT_ID and CLIENT_SECRET
export API_TOKEN=$(curl -d "grant_type=client_credentials&client_id=$CLIENT_ID&client_secret=$CLIENT_SECRET" "$API_BASE/v2/oauth2/token")
export API_HEADER="Authorization: $(echo "$API_TOKEN" | jq -r .token_type) $(echo "$API_TOKEN" | jq -r .access_token)"

# fetch organizations from every state
mkdir -p "$DATA_DIR/$ORG_DIR"

# create array from states file (with 1 abbr state per line)
states=()
while IFS= read -r line; do
  states+=("$line")
done < us_states.txt

# loop through states array and fetch records from API
# NOTE: "${states[@]}" expands to every element (a bare $states is only the
# first element in bash), and the URL must be quoted so the literal '&' is
# not treated as the shell's background operator.
for state in "${states[@]}"; do
  curl -H "$API_HEADER" "$API_BASE/v2/organizations?state=$state&limit=100" > "$DATA_DIR/$ORG_DIR/$state.json"
  sleep 3 # avoid rate-limit block
done
# fetch animal types
mkdir -p "$DATA_DIR/$PET_DIR"
curl -H "$API_HEADER" "$API_BASE/v2/types" > "$DATA_DIR/$PET_DIR/types.json"

# loop through types and fetch breeds for each
jq -r '.types[]._links.breeds.href' "$DATA_DIR/$PET_DIR/types.json" | while read -r link; do
  # link looks like /v2/types/<type>/breeds; the type name is the 4th path
  # element. (${link:h4:t} is zsh-only modifier syntax and fails in bash.)
  type=$(echo "$link" | cut -d'/' -f4)
  echo "Fetching breeds for type $type"
  curl -H "$API_HEADER" "$API_BASE$link" > "$DATA_DIR/$PET_DIR/breeds-$type.json"
done
#############################################################
# BUILD SHELTER & PET DATABASE
# - https://launchschool.com/books/sql_first_edition/read/create_table#createtable
# - https://hasura.io/learn/database/postgresql/core-concepts/3-postgresql-data-types-columns/
# - https://www.cockroachlabs.com/docs/stable/uuid.html
# - https://www.enterprisedb.com/postgres-tutorials/how-load-sample-database-schemas-and-data-postgresql
# - https://www.cockroachlabs.com/docs/cockroachcloud/quickstart.html?filters=local
#############################################################
# install cockroachdb on Mac
brew tap cockroachdb/tap
brew install cockroach
brew services start cockroachdb/tap/cockroach # read console for URL:PORT
brew install libpq      # install psql client
brew link --force libpq # add psql to path

# create database schema
export SCHEMA_FILE="schema.sql"
cat > "$SCHEMA_FILE" << EOF
-- IF EXISTS keeps the first-ever run from failing before the DB exists
DROP DATABASE IF EXISTS pet_publish;
CREATE DATABASE IF NOT EXISTS pet_publish;
DROP TABLE IF EXISTS organizations;
CREATE TABLE IF NOT EXISTS organizations (
    id serial,
    uid uuid DEFAULT gen_random_uuid(),
    name VARCHAR(255) NOT NULL,
    description text,
    address1 text,
    address2 text,
    city text,
    state varchar(2),
    postal_code varchar(10),
    country varchar(2),
    email text,
    phone varchar(60),
    website text,
    created timestamp NOT NULL DEFAULT NOW(),
    modified timestamp,
    status smallint DEFAULT 1,
    sort_order smallint,
    extra text,
    PRIMARY KEY (uid),
    UNIQUE (id)
);
DROP TABLE IF EXISTS animal_types;
CREATE TABLE IF NOT EXISTS animal_types (
    id serial,
    uid uuid DEFAULT gen_random_uuid(),
    name VARCHAR(100) NOT NULL,
    code VARCHAR(100) NOT NULL,
    description text,
    coats jsonb,
    colors jsonb,
    genders jsonb,
    created timestamp NOT NULL DEFAULT NOW(),
    modified timestamp,
    status smallint DEFAULT 1,
    sort_order smallint,
    extra text,
    PRIMARY KEY (uid),
    UNIQUE (id)
);
DROP TABLE IF EXISTS animal_breeds;
CREATE TABLE IF NOT EXISTS animal_breeds (
    id serial,
    animal_type_id uuid NOT NULL,
    uid uuid DEFAULT gen_random_uuid(),
    name VARCHAR(100) NOT NULL,
    created timestamp NOT NULL DEFAULT NOW(),
    modified timestamp,
    status smallint DEFAULT 1,
    sort_order smallint,
    extra text,
    PRIMARY KEY (uid),
    UNIQUE (id),
    CONSTRAINT fk_animal_type_id FOREIGN KEY (animal_type_id) REFERENCES animal_types (uid) ON DELETE CASCADE
);
EOF
# load schema.sql file into database
psql -f "$SCHEMA_FILE" -h localhost -p 26257 -U root pet_publish
#############################################################
# LOAD FILES INTO DATABASE
#############################################################
export DATABASE_URL="postgresql://root@localhost:26257/pet_publish?sslmode=disable"

# setup python environment (quote ${PWD} so paths with spaces survive)
python3 -m venv "${PWD}/env"

# activate environment and upgrade pip
source "${PWD}/env/bin/activate"
# `python3 -m pip` guarantees the venv's pip rather than whichever pip3 is
# first on PATH
python3 -m pip install --upgrade pip

# install python dependencies
python3 -m pip install requests beautifulsoup4 "psycopg[binary]"

# run data loader
python3 data_loader.py
# cheer!
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
""" | |
Loads JSON file data into RDBMS. | |
""" | |
import logging | |
import os | |
import sys | |
import glob | |
import json | |
import psycopg | |
def get_animal_types_from_files():
    """Build a list of animal types (with nested breed lists) from the
    JSON files fetched into data/pets/ by the shell script.

    Returns:
        list[dict]: one dict per animal type with keys ``type_id``,
        ``name``, ``coats``, ``colors``, ``genders`` and ``breeds``
        (a list of ``{'name': ...}`` dicts).
    """
    results = []
    with open('data/pets/types.json', 'r') as f:
        try:
            raw_types = json.load(f)['types']
        except (KeyError, json.JSONDecodeError) as err:
            print(err)
            return results
    # NOTE: the original appended derived dicts to the very list it was
    # iterating, so appended entries were re-visited and raised KeyError.
    # Build a separate results list instead.
    for animal_type in raw_types:
        # the type id is the last path element of the self link,
        # e.g. /v2/types/dog -> "dog"
        type_id = os.path.basename(os.path.normpath(animal_type['_links']['self']['href']))
        # build nested breeds list for this animal type
        breeds_list = []
        breeds_file = f'data/pets/breeds-{type_id}.json'
        try:
            with open(breeds_file) as b:
                breeds_json = json.load(b)
            breeds_list = [{'name': breed['name']} for breed in breeds_json['breeds']]
        except (OSError, KeyError, json.JSONDecodeError) as err:
            # a missing/malformed breeds file leaves breeds empty instead
            # of aborting the whole run
            print(err)
        results.append({
            'type_id': type_id,
            'name': animal_type['name'],
            'coats': animal_type['coats'],
            'colors': animal_type['colors'],
            'genders': animal_type['genders'],
            'breeds': breeds_list,
        })
    print(f'Parsed files with {len(results)} types')
    return results
def get_organizations_from_files():
    """Build a flat list of organization dicts from every per-state JSON
    file under data/orgs/.

    A malformed file is skipped with a message; an individual organization
    missing optional fields no longer discards the rest of its file —
    absent values come back as None via dict.get.

    Returns:
        list[dict]: flattened organization records ready for DB load.
    """
    organizations = []
    for data_file in glob.glob('data/orgs/*', recursive=True):
        with open(data_file, 'r') as f:
            try:
                orgs = json.load(f)['organizations']
            except (KeyError, json.JSONDecodeError) as e:
                print(f'Skipping {data_file}')
                print(e)
                continue
        for org in orgs:
            address = org.get('address') or {}
            organizations.append({
                'name': org.get('name'),
                'description': org.get('mission_statement'),
                'address1': address.get('address1'),
                'address2': address.get('address2'),
                'city': address.get('city'),
                'state': address.get('state'),
                'postal_code': address.get('postcode'),
                'country': address.get('country'),
                'email': org.get('email'),
                'phone': org.get('phone'),
                'website': org.get('website'),
            })
    print(f'Parsed files with {len(organizations)} rows')
    return organizations
def load_animal_types(conn):
    """Builds animal types/breeds dictionary and loads in database.

    NOTE(review): not implemented yet — presumably intended to mirror
    load_organizations() using get_animal_types_from_files(); its call
    site in main() is commented out.
    """
    pass
def load_organizations(conn):
    """Builds national organizations dictionary and loads in database.

    Args:
        conn: an open psycopg connection (CockroachDB cluster — the
            statement uses CockroachDB's UPSERT syntax).
    """
    orgs = get_organizations_from_files()
    # column order must match the statement's column list below
    columns = ('name', 'description', 'address1', 'address2', 'city',
               'state', 'postal_code', 'country', 'email', 'phone', 'website')
    try:
        with conn.cursor() as cur:
            for org in orgs:
                # lazy %-args: the message is only formatted if emitted
                logging.info("Loading %s into database", org['name'])
                # values are parameterized, never string-interpolated
                cur.execute(
                    """UPSERT INTO organizations
                    (name, description, address1, address2, city, state, postal_code,
                    country, email, phone, website)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                    """,
                    tuple(org[col] for col in columns),
                )
                logging.debug("load_organizations(): status message: %s", cur.statusmessage)
    except Exception:
        # logging.exception records the full traceback, not just str(e)
        logging.exception("load_organizations() failed")
def main():
    """Configure DEBUG-level logging to stdout and run the data load."""
    root = logging.getLogger()
    root.setLevel(logging.DEBUG)
    handler = logging.StreamHandler(sys.stdout)
    handler.setLevel(logging.DEBUG)
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    handler.setFormatter(formatter)
    root.addHandler(handler)
    try:
        logging.info("connecting to database")
        # the connection context manager commits on success and closes the
        # connection in every case, so no explicit conn.close() is needed.
        # (The old `finally: conn.close()` raised NameError whenever
        # connect() itself failed, because conn was never bound.)
        with psycopg.connect(os.environ['DATABASE_URL']) as conn:
            logging.info("loading organizations")
            load_organizations(conn)
            #load_animal_types(conn)
    except Exception:
        # record the traceback alongside the message
        logging.exception("database connection failed")
if __name__ == "__main__":
    # script entry point — run the loader only when executed directly
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Overview
Kicking off research for a new startup idea my wife had: a mobile app for animal shelter staff that simplifies promoting pets online. Leveraged online resources to build a database of shelters for initial research to flesh out designs for the MVP app.
DB of nearly 5,000 shelters and organizations in US
Registered domain and contracted logo design in a day