Skip to content

Instantly share code, notes, and snippets.

@yunpengn
Last active February 6, 2020 10:33
Show Gist options
  • Save yunpengn/fa9d04a63abaa42a93b291f70c49c8b8 to your computer and use it in GitHub Desktop.
Save yunpengn/fa9d04a63abaa42a93b291f70c49c8b8 to your computer and use it in GitHub Desktop.
A simple script to setup IMDb movie database
printf "Program starts at %s.\n" "$(date)"
printf "Going to clean and create new folder ...\n"
rm -rf imdb-datasets-ftp/ imdb-queries/
mkdir imdb-datasets-ftp/
printf "Going to download datasets from FTP server ...\n"
wget http://homepages.cwi.nl/~boncz/job/imdb.tgz
tar -xvzf imdb.tgz -C imdb-datasets-ftp/
rm -f imdb.tgz
printf "Going to download JOB query set ...\n"
wget http://www-db.in.tum.de/~leis/qo/job.tgz
tar -xvzf job.tgz
mv job/ imdb-queries/
rm -f job.tgz
printf "Going to create a new database ...\n"
psql -c "DROP DATABASE IF EXISTS imdb"
psql -c "CREATE DATABASE imdb"
printf "Going to initialize the database ...\n"
psql -d imdb -f imdb-queries/schema.sql
psql -d imdb -f imdb-queries/fkindexes.sql
printf "Going to populate the database ...\n"
psql -d imdb -c "COPY aka_name FROM '$(pwd)/imdb-datasets-ftp/aka_name.csv' WITH (FORMAT csv, DELIMITER ',', QUOTE '\"', ESCAPE '\\')";
psql -d imdb -c "COPY aka_title FROM '$(pwd)/imdb-datasets-ftp/aka_title.csv' WITH (FORMAT csv, DELIMITER ',', QUOTE '\"', ESCAPE '\\')";
psql -d imdb -c "COPY cast_info FROM '$(pwd)/imdb-datasets-ftp/cast_info.csv' WITH (FORMAT csv, DELIMITER ',', QUOTE '\"', ESCAPE '\\')";
psql -d imdb -c "COPY char_name FROM '$(pwd)/imdb-datasets-ftp/char_name.csv' WITH (FORMAT csv, DELIMITER ',', QUOTE '\"', ESCAPE '\\')";
psql -d imdb -c "COPY comp_cast_type FROM '$(pwd)/imdb-datasets-ftp/comp_cast_type.csv' WITH (FORMAT csv, DELIMITER ',', QUOTE '\"', ESCAPE '\\')";
psql -d imdb -c "COPY company_name FROM '$(pwd)/imdb-datasets-ftp/company_name.csv' WITH (FORMAT csv, DELIMITER ',', QUOTE '\"', ESCAPE '\\')";
psql -d imdb -c "COPY company_type FROM '$(pwd)/imdb-datasets-ftp/company_type.csv' WITH (FORMAT csv, DELIMITER ',', QUOTE '\"', ESCAPE '\\')";
psql -d imdb -c "COPY complete_cast FROM '$(pwd)/imdb-datasets-ftp/complete_cast.csv' WITH (FORMAT csv, DELIMITER ',', QUOTE '\"', ESCAPE '\\')";
psql -d imdb -c "COPY info_type FROM '$(pwd)/imdb-datasets-ftp/info_type.csv' WITH (FORMAT csv, DELIMITER ',', QUOTE '\"', ESCAPE '\\')";
psql -d imdb -c "COPY keyword FROM '$(pwd)/imdb-datasets-ftp/keyword.csv' WITH (FORMAT csv, DELIMITER ',', QUOTE '\"', ESCAPE '\\')";
psql -d imdb -c "COPY kind_type FROM '$(pwd)/imdb-datasets-ftp/kind_type.csv' WITH (FORMAT csv, DELIMITER ',', QUOTE '\"', ESCAPE '\\')";
psql -d imdb -c "COPY link_type FROM '$(pwd)/imdb-datasets-ftp/link_type.csv' WITH (FORMAT csv, DELIMITER ',', QUOTE '\"', ESCAPE '\\')";
psql -d imdb -c "COPY movie_companies FROM '$(pwd)/imdb-datasets-ftp/movie_companies.csv' WITH (FORMAT csv, DELIMITER ',', QUOTE '\"', ESCAPE '\\')";
psql -d imdb -c "COPY movie_info FROM '$(pwd)/imdb-datasets-ftp/movie_info.csv' WITH (FORMAT csv, DELIMITER ',', QUOTE '\"', ESCAPE '\\')";
psql -d imdb -c "COPY movie_info_idx FROM '$(pwd)/imdb-datasets-ftp/movie_info_idx.csv' WITH (FORMAT csv, DELIMITER ',', QUOTE '\"', ESCAPE '\\')";
psql -d imdb -c "COPY movie_keyword FROM '$(pwd)/imdb-datasets-ftp/movie_keyword.csv' WITH (FORMAT csv, DELIMITER ',', QUOTE '\"', ESCAPE '\\')";
psql -d imdb -c "COPY movie_link FROM '$(pwd)/imdb-datasets-ftp/movie_link.csv' WITH (FORMAT csv, DELIMITER ',', QUOTE '\"', ESCAPE '\\')";
psql -d imdb -c "COPY name FROM '$(pwd)/imdb-datasets-ftp/name.csv' WITH (FORMAT csv, DELIMITER ',', QUOTE '\"', ESCAPE '\\')";
psql -d imdb -c "COPY person_info FROM '$(pwd)/imdb-datasets-ftp/person_info.csv' WITH (FORMAT csv, DELIMITER ',', QUOTE '\"', ESCAPE '\\')";
psql -d imdb -c "COPY role_type FROM '$(pwd)/imdb-datasets-ftp/role_type.csv' WITH (FORMAT csv, DELIMITER ',', QUOTE '\"', ESCAPE '\\')";
psql -d imdb -c "COPY title FROM '$(pwd)/imdb-datasets-ftp/title.csv' WITH (FORMAT csv, DELIMITER ',', QUOTE '\"', ESCAPE '\\')";
printf "Program ends at %s.\n" "$(date)"
printf "Program starts at %s.\n" "$(date)"
printf "Going to clean and create new folder ...\n"
rm -rf imdb-datasets/ imdbpy-master/
mkdir imdb-datasets/
printf "Going to download datasets from FTP server ...\n"
cd imdb-datasets/
wget https://datasets.imdbws.com/name.basics.tsv.gz
wget https://datasets.imdbws.com/title.akas.tsv.gz
wget https://datasets.imdbws.com/title.basics.tsv.gz
wget https://datasets.imdbws.com/title.crew.tsv.gz
wget https://datasets.imdbws.com/title.episode.tsv.gz
wget https://datasets.imdbws.com/title.principals.tsv.gz
wget https://datasets.imdbws.com/title.ratings.tsv.gz
cd ..
printf "Going to download IMDbPy ...\n"
wget https://github.com/alberanid/imdbpy/archive/master.zip
unzip master.zip
rm -f master.zip
printf "Going to create virtual environment ...\n"
cd imdbpy-master/
virtualenv .venv
source .venv/bin/activate
printf "Going to install dependencies ...\n"
pip3 install -r requirements.txt
pip3 install psycopg2
printf "Going to create a new database ...\n"
psql -c "DROP DATABASE IF EXISTS imdb_new"
psql -c "CREATE DATABASE imdb_new"
printf "Going to populate the database ...\n"
mv bin/s32imdbpy.py .
./s32imdbpy.py --verbose ../imdb-datasets/ postgres://yunpeng:<your_password_here>@127.0.0.1/imdb_new
printf "Done!\n"
deactivate
printf "Program ends at %s.\n" "$(date)"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment