Last active
May 20, 2024 12:37
-
-
Save 1mehal/13c85e108cbc906f5ec34d28d75b1968 to your computer and use it in GitHub Desktop.
IMDB to PostgreSQL database import script + SQL fixer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
# please make sure to install | |
# sudo pip install csvkit | |
# also make sure you have pv utility installed | |
# sudo apt-get install pv # sudo port install pv | |
function readinput() { | |
local CLEAN_ARGS="" | |
while [[ $# -gt 0 ]]; do | |
local i="$1" | |
case "$i" in | |
"-i") | |
if read -i "default" 2>/dev/null <<< "test"; then | |
CLEAN_ARGS="$CLEAN_ARGS -i \"$2\"" | |
fi | |
shift | |
shift | |
;; | |
"-p") | |
CLEAN_ARGS="$CLEAN_ARGS -p \"$2\"" | |
shift | |
shift | |
;; | |
*) | |
CLEAN_ARGS="$CLEAN_ARGS $1" | |
shift | |
;; | |
esac | |
done | |
eval read $CLEAN_ARGS | |
} | |
readinput -e -p "Enter the path to the local imdb repository: " -i "./datasets.imdbws.com" REPPATH | |
REPPATH=${REPPATH:-"./datasets.imdbws.com"} | |
if [[ ! -e $REPPATH ]]; then | |
echo "Downloading IMDB repository to ${REPPATH}"; | |
wget -A "*tsv.gz" --mirror "https://datasets.imdbws.com/"; | |
for f in datasets.imdbws.com/*gz; do | |
echo "Extracting ${f%.*}"; | |
pv $f | gunzip > ${f%.*} || break; | |
done | |
else | |
echo "${REPPATH} already exists, using it as a IMDB repository" | |
fi | |
if [ -z ${PGDATABASE} ]; | |
then readinput -e -p "Enter your database name : " -i 'imdb' PGDATABASE | |
PGDATABASE=${PGDATABASE:-"imdb"} | |
export PGDATABASE=$PGDATABASE | |
fi | |
if [ -z ${PGHOST} ]; | |
then readinput -e -p "Enter your database host : " -i 'localhost' PGHOST | |
PGHOST=${PGHOST:-"localhost"} | |
export PGHOST=$PGHOST | |
fi | |
if [ -z ${PGPORT} ]; | |
then readinput -e -p "Enter your database port number : " -i '5432' PGPORT | |
PGPORT=${PGPORT:-"5342"} | |
export PGPORT=$PGPORT | |
fi | |
if [ -z ${PGUSER} ]; | |
then readinput -e -p "Enter your database username : " -i 'postgres' PGUSER | |
PGUSER=${PGUSER:-"postgres"} | |
export PGUSER=$PGUSER | |
fi | |
if [ -z ${PGPASSWORD} ]; | |
then readinput -e -p "Enter your database password : " -i '' PGPASSWORD | |
PGPASSWORD=${PGPASSWORD:-""} | |
export PGPASSWORD=$PGPASSWORD | |
fi | |
echo "Importing datasets into specified database" | |
echo " [PGDATABASE=${PGDATABASE} | |
PGHOST=${PGHOST} | |
PGPORT=${PGPORT} | |
PGUSER=${PGUSER}]" | |
for file in datasets.imdbws.com/*tsv | |
do | |
table_name=$(basename $file .tsv | tr '.' '_') | |
echo "Creating table in PostgreSQL database [${file%.*}]" | |
head -n 20 $file | csvsql --no-constraints --blanks --table $table_name | sed "s/\(\"\)//g" | sed "s/\(\Year VARCHAR\)/Year DECIMAL/g" | psql > /dev/null || break | |
filesize=$(wc -c < "$file") | |
echo "Importing data ${table_name}" | |
tail +2 $file | pv -s $filesize | psql -c \ "COPY \"${table_name}\" FROM STDIN WITH DELIMITER E'\t' QUOTE E'\b' NULL AS '\N' CSV" || break | |
done | |
echo 'Preparing database (adding indexes, relationships, etc)' | |
wget https://gist.githubusercontent.com/1mehal/13c85e108cbc906f5ec34d28d75b1968/raw/imdb_postgresql_setup.sql | |
psql -f imdb_postgresql_setup.sql | |
rm imdb_postgresql_setup.sql |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
alter table name_basics add constraint name_basics_pk primary key (nconst); | |
alter table title_basics add constraint title_basics_pk primary key (tconst); | |
alter table title_crew add constraint title_crew_title_basics_fk foreign key (tconst) references title_basics(tconst) not valid; | |
alter table title_episode add constraint title_episode_title_basics_fk foreign key (parentTconst) references title_basics(tconst) not valid; | |
alter table title_principals add constraint title_principals_title_basics_fk foreign key (tconst) references title_basics(tconst) not valid; | |
alter table title_principals add constraint title_principals_name_basics_fk foreign key (nconst) references name_basics(nconst) not valid; | |
alter table title_akas add constraint title_akas_title_basics_fk foreign key (titleId) references title_basics(tconst) not valid; | |
alter table title_ratings add constraint title_ratings_title_basics_fk foreign key (tconst) references title_basics(tconst) not valid; | |
ALTER TABLE title_basics ADD COLUMN "titleSearchCol" tsvector; | |
UPDATE title_basics SET "titleSearchCol" = | |
to_tsvector('english', coalesce(primaryTitle,'') || ' ' || coalesce(originalTitle,'')); | |
CREATE INDEX title_idx ON title_basics USING GIN ("titleSearchCol"); | |
UPDATE title_basics SET "titleSearchCol" = | |
setweight(to_tsvector(coalesce(primaryTitle,'')), 'A') || | |
setweight(to_tsvector(coalesce(originalTitle,'')), 'C'); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment