Skip to content

Instantly share code, notes, and snippets.

@1mehal
Last active May 20, 2024 12:37
Show Gist options
  • Save 1mehal/13c85e108cbc906f5ec34d28d75b1968 to your computer and use it in GitHub Desktop.
Save 1mehal/13c85e108cbc906f5ec34d28d75b1968 to your computer and use it in GitHub Desktop.
IMDB to PostgreSQL database import script + SQL fixer
#!/bin/bash
# please make sure to install
# sudo pip install csvkit
# also make sure you have pv utility installed
# sudo apt-get install pv # sudo port install pv
function readinput() {
local CLEAN_ARGS=""
while [[ $# -gt 0 ]]; do
local i="$1"
case "$i" in
"-i")
if read -i "default" 2>/dev/null <<< "test"; then
CLEAN_ARGS="$CLEAN_ARGS -i \"$2\""
fi
shift
shift
;;
"-p")
CLEAN_ARGS="$CLEAN_ARGS -p \"$2\""
shift
shift
;;
*)
CLEAN_ARGS="$CLEAN_ARGS $1"
shift
;;
esac
done
eval read $CLEAN_ARGS
}
readinput -e -p "Enter the path to the local imdb repository: " -i "./datasets.imdbws.com" REPPATH
REPPATH=${REPPATH:-"./datasets.imdbws.com"}
if [[ ! -e $REPPATH ]]; then
echo "Downloading IMDB repository to ${REPPATH}";
wget -A "*tsv.gz" --mirror "https://datasets.imdbws.com/";
for f in datasets.imdbws.com/*gz; do
echo "Extracting ${f%.*}";
pv $f | gunzip > ${f%.*} || break;
done
else
echo "${REPPATH} already exists, using it as a IMDB repository"
fi
if [ -z ${PGDATABASE} ];
then readinput -e -p "Enter your database name : " -i 'imdb' PGDATABASE
PGDATABASE=${PGDATABASE:-"imdb"}
export PGDATABASE=$PGDATABASE
fi
if [ -z ${PGHOST} ];
then readinput -e -p "Enter your database host : " -i 'localhost' PGHOST
PGHOST=${PGHOST:-"localhost"}
export PGHOST=$PGHOST
fi
if [ -z ${PGPORT} ];
then readinput -e -p "Enter your database port number : " -i '5432' PGPORT
PGPORT=${PGPORT:-"5342"}
export PGPORT=$PGPORT
fi
if [ -z ${PGUSER} ];
then readinput -e -p "Enter your database username : " -i 'postgres' PGUSER
PGUSER=${PGUSER:-"postgres"}
export PGUSER=$PGUSER
fi
if [ -z ${PGPASSWORD} ];
then readinput -e -p "Enter your database password : " -i '' PGPASSWORD
PGPASSWORD=${PGPASSWORD:-""}
export PGPASSWORD=$PGPASSWORD
fi
echo "Importing datasets into specified database"
echo " [PGDATABASE=${PGDATABASE}
PGHOST=${PGHOST}
PGPORT=${PGPORT}
PGUSER=${PGUSER}]"
for file in datasets.imdbws.com/*tsv
do
table_name=$(basename $file .tsv | tr '.' '_')
echo "Creating table in PostgreSQL database [${file%.*}]"
head -n 20 $file | csvsql --no-constraints --blanks --table $table_name | sed "s/\(\"\)//g" | sed "s/\(\Year VARCHAR\)/Year DECIMAL/g" | psql > /dev/null || break
filesize=$(wc -c < "$file")
echo "Importing data ${table_name}"
tail +2 $file | pv -s $filesize | psql -c \ "COPY \"${table_name}\" FROM STDIN WITH DELIMITER E'\t' QUOTE E'\b' NULL AS '\N' CSV" || break
done
echo 'Preparing database (adding indexes, relationships, etc)'
wget https://gist.githubusercontent.com/1mehal/13c85e108cbc906f5ec34d28d75b1968/raw/imdb_postgresql_setup.sql
psql -f imdb_postgresql_setup.sql
rm imdb_postgresql_setup.sql
alter table name_basics add constraint name_basics_pk primary key (nconst);
alter table title_basics add constraint title_basics_pk primary key (tconst);
alter table title_crew add constraint title_crew_title_basics_fk foreign key (tconst) references title_basics(tconst) not valid;
alter table title_episode add constraint title_episode_title_basics_fk foreign key (parentTconst) references title_basics(tconst) not valid;
alter table title_principals add constraint title_principals_title_basics_fk foreign key (tconst) references title_basics(tconst) not valid;
alter table title_principals add constraint title_principals_name_basics_fk foreign key (nconst) references name_basics(nconst) not valid;
alter table title_akas add constraint title_akas_title_basics_fk foreign key (titleId) references title_basics(tconst) not valid;
alter table title_ratings add constraint title_ratings_title_basics_fk foreign key (tconst) references title_basics(tconst) not valid;
ALTER TABLE title_basics ADD COLUMN "titleSearchCol" tsvector;
UPDATE title_basics SET "titleSearchCol" =
to_tsvector('english', coalesce(primaryTitle,'') || ' ' || coalesce(originalTitle,''));
CREATE INDEX title_idx ON title_basics USING GIN ("titleSearchCol");
UPDATE title_basics SET "titleSearchCol" =
setweight(to_tsvector(coalesce(primaryTitle,'')), 'A') ||
setweight(to_tsvector(coalesce(originalTitle,'')), 'C');
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment