Skip to content

Instantly share code, notes, and snippets.

@alllexx88
Created December 3, 2016 19:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save alllexx88/c4b2c1b96d2f6addbfffed4d763c05c6 to your computer and use it in GitHub Desktop.
Save alllexx88/c4b2c1b96d2f6addbfffed4d763c05c6 to your computer and use it in GitHub Desktop.
Convert myrulib database to calibre format database. Contains database-specific parts
#!/bin/bash
SOURCE_DB="myrulib.db"
TARGET_DB="metadata_my.db"
#an empty Calibre database, with books and series
# create/update triggers removed
EMPTY_DB="metadata_empty.db"
LIB_ROOT="./"
function esc_q {
# double quotes escape themselves in SQL
sed 's/"/""/g'
}
function esc_sed {
# sed special characters escape
sed -e 's/\\/\\\\"/g' -e 's~[\.\*\^\$/]\|\[\|\]~\\&~g'
}
function my_cut {
local i="$1"
i=$((i - 1))
sed -n "s/\([^¶]*¶\)\{$i\}\([^¶]*\).*/\2/p"
}
function get_date {
local arg=""
if [ -n "$1" ]; then
arg="--date=$1"
fi
date -u $arg "+%Y-%m-%d %H:%M:%S.%N%z" | sed 's/[0-9]\{2\}+0000$/+0000/'
}
function get_stamp {
get_date $(stat "${LIB_ROOT}$1" -c%y)
}
# turn comma-separated list of authors into
# newline separated one. Also remove some trash
# from authors names
function sanitize_authors {
sed \
-e 's/, \?\([[:upper:]]\|d[ie][[:upper:]]\|v[ao]n \|d[ea] \|de[ls] \|te[nr]\?'`
`' \|tom \|де \|ван \|ди \|фон \)/\n\1/g' | sed \
-e 's/, \?[ae]t al$//' -e 's/, ред$//' \
-e 's/\([[:upper:]]\)\.\?[, ]*$/\1./'
}
#filter get ids from authors names (case-insensetive)
function auth_ids {
local auths=$(echo "$1" | esc_sed | sed ':a;N;$!ba;s/\n/\\|/g')
(sed -n -e "/^[^¶]*¶\(${auths}\)\$/Ip" | my_cut 1) <<< "${AUTHORS_FULL_NAMES}"
}
#same as auth_ids, but for series
function series_ids {
local series=$(echo "$1" | esc_sed | sed ':a;N;$!ba;s/\n/\\|/g')
(sed -n -e "/^[^¶]*¶\(${series}\)\$/Ip" | my_cut 1) <<< "${SERIES}"
}
#same as auth_ids, but for tags
function tag_ids {
local tags=$(echo "$1" | esc_sed | sed ':a;N;$!ba;s/\n/\\|/g')
(sed -n -e "/^[^¶]*¶\(${tags}\)\$/Ip" | my_cut 1) <<< "${TAGS}"
}
#get hierarchical tag from file path, e.g.,
# "M_Mathematics/MC_Calculus/MCat_Advanced calculus/<file>.pdf" ->
# "Mathematics.Calculus.Advanced calculus"
function tag_from_filepath {
sed -e 's|^[^/]*_||' -e 's|/[^/]*_|/|g' -e 's|/[^/]*$||' -e 's|/|.|g'
}
#same as auth_ids, but for langs
function lang_ids {
local langs=$(echo "$1" | esc_sed | sed ':a;N;$!ba;s/\n/\\|/g')
(sed -n -e "/^[^¶]*¶\(${langs}\)\$/Ip" | my_cut 1) <<< "${LANGS}"
}
#feed stdin to sqlite, as well as log to "$1"
function feed_sql_and_log {
exec 3>>"$1"
rm -f "$1"
tee /dev/fd/3 | sqlite3 "${TARGET_DB}"
}
#the most painful part: process book entries and collect/feed the respective
# books, books_authors_link, books_series_link, books_tags_link,
# books_languages_link, comments SQL INSERT commands
function books_insert {
# BOOKS:
#<id>¶<title>¶<auths>¶<series>¶<file_path>¶<file_type>¶<md5sum>¶<lang>¶<year>¶<desc>
local book
local book_count=$(echo "${BOOKS}" | wc -l)
local n=$(echo -n "${book_count}" | wc -c)
(while IFS='' read -r book || [ -n "$book" ]; do
local book_id=$(echo "$book" | my_cut 1)
(awk "{printf \"[%07.4f]%% Inserting book %${n}d out of ${book_count}"`
`"\n\", (\$0*100)/${book_count}, \$0}" >&2) <<< "${book_id}"
local title=$(echo "$book" | my_cut 2 | esc_q)
local authors=$(echo "$book" | my_cut 3 | sanitize_authors)
local author_sort=$(echo "$authors" | \
sed ':a;N;$!ba;s/\([^\n]*\)\n/\1 \& /g;s/.*/\L&/' | esc_q)
local series=$(echo "$book" | my_cut 4)
local file_path=$(echo "$book" | my_cut 5)
local file_type=$(echo "$book" | my_cut 6)
local md5sum=$(echo "$book" | my_cut 7)
local langs=$(echo "$book" | my_cut 8 | sed -e 's/,/\n/g')
local year=$(echo "$book" | my_cut 9)
local description=$(echo "$book" | my_cut 10 | esc_q | \
sed 's/\\newline/\n/g')
local timestamp=$(get_stamp "${file_path}")
local ISBN=$(echo "${file_path}" | \
sed -n 's|.*(ISBN \([0-9]*\))[^/]*$|\1|p')
#book hierarchical classification, extracted from file path
local tag=$(echo "${file_path}" | tag_from_filepath)
#books INSERT
echo -e "INSERT INTO books (id,title,sort,timestamp,pubdate,"`
`"author_sort,isbn,path,uuid,has_cover,"`
`"last_modified)\n"`
`"VALUES (${book_id},\"$title\",\"$title\",\"$timestamp\","`
`"\"$year\",\"${author_sort}\",\"$ISBN\",\"${file_path}\""`
`",\"$md5sum\",0,\"$timestamp\");"
#books_authors_link INSERT
auth_ids "$authors" | sed -n -e 's/.\+/'`
`"INSERT INTO books_authors_link (book,author)\n"`
`"VALUES (${book_id},&);/p"
#books_series_link INSERT
series_ids "$series" | sed -n -e 's/.\+/'`
`"INSERT INTO books_series_link (book,series)\n"`
`"VALUES (${book_id},&);/p"
#books_tags_link INSERT
tag_ids "$tag" | sed -n -e 's/.\+/'`
`"INSERT INTO books_tags_link (book,tag)\n"`
`"VALUES (${book_id},&);/p"
#books_languages_link INSERT
lang_ids "$langs" | sed -n -e 's/.\+/'`
`"INSERT INTO books_languages_link (book,lang_code,item_order)\n"`
`"VALUES (${book_id},&,0);/p"
#comments INSERT
if [ -n "${description}" ]; then
echo -e "INSERT INTO comments (book,text)\n"`
`"VALUES (${book_id},\"${description}\");"
fi
done <<< "${BOOKS}") | feed_sql_and_log "$1"
}
cp -f "${EMPTY_DB}" "${TARGET_DB}"
# AUTHORS_FULL_NAMES: '<id>¶<name>' lines
AUTHORS_FULL_NAMES=$(sqlite3 "${SOURCE_DB}" 'SELECT full_name FROM authors;' | \
sanitize_authors | sort -uf | awk '{printf("%d¶%s\n", NR,$0)}')
# TAGS: '<id>¶<tag>' lines
TAGS=$(sqlite3 "${SOURCE_DB}" 'SELECT file_name FROM books;' | tag_from_filepath | \
sort -uf | awk '{printf("%d¶%s\n", NR,$0)}')
# LANGS: '<id>¶<lang>' lines
LANGS=$(sqlite3 "${SOURCE_DB}" 'SELECT DISTINCT lang FROM books;' | \
sed -e 's/,/\n/g' | sort -uf | awk '{printf("%d¶%s\n", NR,$0)}')
# BOOKS:
#'<id>¶<title>¶<authors>¶<sequence>¶<file_path>¶<file_type>¶<md5sum>¶<lang>¶<year>¶<description>'
BOOKS=$(sqlite3 -separator '<S_e^Pa_r^At_oR>' "${SOURCE_DB}" \
'SELECT printf("<S_e^Pa_r^At_oR>%s",books.title),authors.full_name,'`
`'sequences.value,books.file_name,books.file_type,'`
`'books.md5sum,books.lang,books.year,books.description '`
`'FROM books,authors,sequences '`
`'WHERE books.id_author=authors.id AND books.id_sequence=sequences.id;' | \
sed -e 's/<S_e^Pa_r^At_oR>/¶/g' | sed ':a;N;$!ba;s/\n\([^¶]\)/\\newline\1/g' | \
awk '{printf("%d%s\n", NR,$0)}')
# SERIES: '<id>¶<name>'
SERIES=$(sqlite3 "${SOURCE_DB}" 'SELECT value FROM sequences;' | sed '/^\s*$/d' | \
sort -uf | awk '{printf("%d¶%s\n", NR,$0)}')
#INSERT authors
echo "${AUTHORS_FULL_NAMES}" | esc_q | sed 's/\([^¶]*\)¶\(.*\)/'`
`'INSERT INTO authors (id,name,sort)\n'`
`'VALUES (\1,\"\2\",\"\L\2\");/' | \
feed_sql_and_log authors_insert_command.txt
#INSERT tags
echo "${TAGS}" | esc_q | sed 's/\([^¶]*\)¶\(.*\)/'`
`'INSERT INTO tags (id,name)\nVALUES (\1,\"\2\");/' | \
feed_sql_and_log tags_insert_command.txt
#INSERT languages
echo "${LANGS}" | sed 's/\([^¶]*\)¶\(.*\)/'`
`'INSERT INTO languages (id,lang_code)\n'`
`'VALUES (\1,\"\2\");/' | \
feed_sql_and_log languages_insert_command.txt
#INSERT series
echo "${SERIES}" | esc_q | sed 's/\([^¶]*\)¶\(.*\)/'`
`'INSERT INTO series (id,name,sort)\n'`
`'VALUES (\1,\"\2\",\"\2\");/' | \
feed_sql_and_log series_insert_command.txt
#INSERT books, books_authors_link, books_series_link,
# books_tags_link, books_languages_link, comments
books_insert books_insert_command.txt
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment