alllexx88/myrulib_to_calibre.sh

## myrulib_to_calibre.sh
#!/bin/bash

SOURCE_DB="myrulib.db"

TARGET_DB="metadata_my.db"

#an empty Calibre database, with books and series
# create/update triggers removed
EMPTY_DB="metadata_empty.db"

LIB_ROOT="./"

function esc_q {
# double quotes  escape themselves in SQL
    sed 's/"/""/g'
}

function esc_sed {
# sed special characters escape
    sed -e 's/\\/\\\\"/g' -e 's~[\.\*\^\$/]\|\[\|\]~\\&~g'
}

function my_cut {
    local i="$1"
    i=$((i - 1))
    sed -n "s/\([^¶]*¶\)\{$i\}\([^¶]*\).*/\2/p"
}

function get_date {
    local arg=""
    if [ -n "$1" ]; then
        arg="--date=$1"
    fi
    date -u $arg "+%Y-%m-%d %H:%M:%S.%N%z" | sed 's/[0-9]\{2\}+0000$/+0000/'
}

function get_stamp {
    get_date $(stat "${LIB_ROOT}$1" -c%y)
}

# turn comma-separated list of authors into
# newline separated one. Also remove some trash
# from authors names
function sanitize_authors {
    sed \
-e 's/, \?\([[:upper:]]\|d[ie][[:upper:]]\|v[ao]n \|d[ea] \|de[ls] \|te[nr]\?'`
`' \|tom \|де \|ван \|ди \|фон \)/\n\1/g' | sed \
-e 's/, \?[ae]t al$//'  -e 's/, ред$//' \
-e 's/\([[:upper:]]\)\.\?[, ]*$/\1./'
}

#filter get ids from authors names (case-insensetive)
function auth_ids {
    local auths=$(echo "$1" | esc_sed | sed ':a;N;$!ba;s/\n/\\|/g')
    (sed -n -e "/^[^¶]*¶\(${auths}\)\$/Ip" | my_cut 1) <<< "${AUTHORS_FULL_NAMES}"
}

#same as auth_ids, but for series
function series_ids {
    local series=$(echo "$1" | esc_sed | sed ':a;N;$!ba;s/\n/\\|/g')
    (sed -n -e "/^[^¶]*¶\(${series}\)\$/Ip" | my_cut 1) <<< "${SERIES}"
}

#same as auth_ids, but for tags
function tag_ids {
    local tags=$(echo "$1" | esc_sed | sed ':a;N;$!ba;s/\n/\\|/g')
    (sed -n -e "/^[^¶]*¶\(${tags}\)\$/Ip" | my_cut 1) <<< "${TAGS}"
}

#get hierarchical tag from file path, e.g.,
# "M_Mathematics/MC_Calculus/MCat_Advanced calculus/<file>.pdf" ->
# "Mathematics.Calculus.Advanced calculus"
function tag_from_filepath {
    sed -e 's|^[^/]*_||' -e 's|/[^/]*_|/|g' -e 's|/[^/]*$||' -e 's|/|.|g'
}

#same as auth_ids, but for langs
function lang_ids {
    local langs=$(echo "$1" | esc_sed | sed ':a;N;$!ba;s/\n/\\|/g')
    (sed -n -e "/^[^¶]*¶\(${langs}\)\$/Ip" | my_cut 1) <<< "${LANGS}"
}

#feed stdin to sqlite, as well as log to "$1"
function feed_sql_and_log {
    exec 3>>"$1"
    rm -f "$1"
    tee /dev/fd/3 | sqlite3 "${TARGET_DB}"
}

#the most painful part: process book entries and collect/feed the respective
# books, books_authors_link, books_series_link, books_tags_link,
# books_languages_link, comments SQL INSERT commands
function books_insert {
# BOOKS:
#<id>¶<title>¶<auths>¶<series>¶<file_path>¶<file_type>¶<md5sum>¶<lang>¶<year>¶<desc>

    local book
    local book_count=$(echo "${BOOKS}" | wc -l)
    local n=$(echo -n "${book_count}" | wc -c)
    (while IFS='' read -r book || [ -n "$book" ]; do
        local book_id=$(echo "$book" | my_cut 1)
        (awk "{printf \"[%07.4f]%% Inserting book %${n}d out of ${book_count}"`
                  `"\n\", (\$0*100)/${book_count}, \$0}" >&2) <<< "${book_id}"
        local title=$(echo "$book" | my_cut 2 | esc_q)
        local authors=$(echo "$book" | my_cut 3 | sanitize_authors)
        local author_sort=$(echo "$authors" | \
                            sed ':a;N;$!ba;s/\([^\n]*\)\n/\1 \& /g;s/.*/\L&/' | esc_q)
        local series=$(echo "$book" | my_cut 4)
        local file_path=$(echo "$book" | my_cut 5)
        local file_type=$(echo "$book" | my_cut 6)
        local md5sum=$(echo "$book" | my_cut 7)
        local langs=$(echo "$book" | my_cut 8 | sed -e 's/,/\n/g')
        local year=$(echo "$book" | my_cut 9)
        local description=$(echo "$book" | my_cut 10 | esc_q | \
                            sed 's/\\newline/\n/g')
        local timestamp=$(get_stamp "${file_path}")
        local ISBN=$(echo "${file_path}" | \
                     sed -n 's|.*(ISBN \([0-9]*\))[^/]*$|\1|p')
        #book hierarchical classification, extracted from file path
        local tag=$(echo "${file_path}" | tag_from_filepath)

        #books INSERT
        echo -e "INSERT INTO books (id,title,sort,timestamp,pubdate,"`
                                  `"author_sort,isbn,path,uuid,has_cover,"`
                                  `"last_modified)\n"`
               `"VALUES (${book_id},\"$title\",\"$title\",\"$timestamp\","`
                    `"\"$year\",\"${author_sort}\",\"$ISBN\",\"${file_path}\""`
                    `",\"$md5sum\",0,\"$timestamp\");"

        #books_authors_link INSERT
        auth_ids "$authors" | sed -n -e 's/.\+/'`
             `"INSERT INTO books_authors_link (book,author)\n"`
             `"VALUES (${book_id},&);/p"

        #books_series_link INSERT
         series_ids "$series" | sed -n -e 's/.\+/'`
             `"INSERT INTO books_series_link (book,series)\n"`
             `"VALUES (${book_id},&);/p"

        #books_tags_link INSERT
         tag_ids "$tag" | sed -n -e 's/.\+/'`
             `"INSERT INTO books_tags_link (book,tag)\n"`
             `"VALUES (${book_id},&);/p"

        #books_languages_link INSERT
         lang_ids "$langs" | sed -n -e 's/.\+/'`
             `"INSERT INTO books_languages_link (book,lang_code,item_order)\n"`
             `"VALUES (${book_id},&,0);/p"

        #comments INSERT
         if [ -n "${description}" ]; then
             echo -e "INSERT INTO comments (book,text)\n"`
                    `"VALUES (${book_id},\"${description}\");"
         fi
    done <<< "${BOOKS}") | feed_sql_and_log "$1"
}

cp -f "${EMPTY_DB}" "${TARGET_DB}"

# AUTHORS_FULL_NAMES: '<id>¶<name>' lines
AUTHORS_FULL_NAMES=$(sqlite3 "${SOURCE_DB}" 'SELECT full_name FROM authors;' | \
                     sanitize_authors | sort -uf | awk '{printf("%d¶%s\n", NR,$0)}')

# TAGS: '<id>¶<tag>' lines
TAGS=$(sqlite3 "${SOURCE_DB}" 'SELECT file_name FROM books;' | tag_from_filepath | \
       sort -uf | awk '{printf("%d¶%s\n", NR,$0)}')

# LANGS: '<id>¶<lang>' lines
LANGS=$(sqlite3 "${SOURCE_DB}" 'SELECT DISTINCT lang FROM books;' | \
        sed -e 's/,/\n/g' | sort -uf | awk '{printf("%d¶%s\n", NR,$0)}')

# BOOKS:
#'<id>¶<title>¶<authors>¶<sequence>¶<file_path>¶<file_type>¶<md5sum>¶<lang>¶<year>¶<description>'
BOOKS=$(sqlite3  -separator '<S_e^Pa_r^At_oR>' "${SOURCE_DB}" \
    'SELECT printf("<S_e^Pa_r^At_oR>%s",books.title),authors.full_name,'`
                     `'sequences.value,books.file_name,books.file_type,'`
                `'books.md5sum,books.lang,books.year,books.description '`
   `'FROM books,authors,sequences '`
   `'WHERE books.id_author=authors.id AND books.id_sequence=sequences.id;' | \
  sed -e 's/<S_e^Pa_r^At_oR>/¶/g' | sed ':a;N;$!ba;s/\n\([^¶]\)/\\newline\1/g' | \
  awk '{printf("%d%s\n", NR,$0)}')

# SERIES: '<id>¶<name>'
SERIES=$(sqlite3 "${SOURCE_DB}" 'SELECT value FROM sequences;' | sed '/^\s*$/d' | \
         sort -uf | awk '{printf("%d¶%s\n", NR,$0)}')

#INSERT authors
echo "${AUTHORS_FULL_NAMES}" | esc_q | sed 's/\([^¶]*\)¶\(.*\)/'`
     `'INSERT INTO authors (id,name,sort)\n'`
     `'VALUES (\1,\"\2\",\"\L\2\");/' | \
     feed_sql_and_log authors_insert_command.txt

#INSERT tags
echo "${TAGS}" | esc_q | sed 's/\([^¶]*\)¶\(.*\)/'`
     `'INSERT INTO tags (id,name)\nVALUES (\1,\"\2\");/' | \
     feed_sql_and_log tags_insert_command.txt

#INSERT languages
echo "${LANGS}" | sed 's/\([^¶]*\)¶\(.*\)/'`
     `'INSERT INTO languages (id,lang_code)\n'`
     `'VALUES (\1,\"\2\");/' | \
     feed_sql_and_log languages_insert_command.txt

#INSERT series
echo "${SERIES}" | esc_q | sed 's/\([^¶]*\)¶\(.*\)/'`
     `'INSERT INTO series (id,name,sort)\n'`
     `'VALUES (\1,\"\2\",\"\2\");/' | \
     feed_sql_and_log series_insert_command.txt

#INSERT books, books_authors_link, books_series_link,
# books_tags_link, books_languages_link, comments
books_insert books_insert_command.txt
	#!/bin/bash

	SOURCE_DB="myrulib.db"

	TARGET_DB="metadata_my.db"

	#an empty Calibre database, with books and series
	# create/update triggers removed
	EMPTY_DB="metadata_empty.db"

	LIB_ROOT="./"

	function esc_q {
	# double quotes escape themselves in SQL
	sed 's/"/""/g'
	}

	function esc_sed {
	# sed special characters escape
	sed -e 's/\\/\\\\"/g' -e 's~[\.\*\^\$/]\\|\[\\|\]~\\&~g'
	}

	function my_cut {
	local i="$1"
	i=$((i - 1))
	sed -n "s/\([^¶]¶\)\{$i\}\([^¶]\).*/\2/p"
	}

	function get_date {
	local arg=""
	if [ -n "$1" ]; then
	arg="--date=$1"
	fi
	date -u $arg "+%Y-%m-%d %H:%M:%S.%N%z" \| sed 's/[0-9]\{2\}+0000$/+0000/'
	}

	function get_stamp {
	get_date $(stat "${LIB_ROOT}$1" -c%y)
	}

	# turn comma-separated list of authors into
	# newline separated one. Also remove some trash
	# from authors names
	function sanitize_authors {
	sed \
	-e 's/, \?\([[:upper:]]\\|d[ie][[:upper:]]\\|v[ao]n \\|d[ea] \\|de[ls] \\|te[nr]\?'`
	`' \\|tom \\|де \\|ван \\|ди \\|фон \)/\n\1/g' \| sed \
	-e 's/, \?[ae]t al$//' -e 's/, ред$//' \
	-e 's/\([[:upper:]]\)\.\?[, ]*$/\1./'
	}

	#filter get ids from authors names (case-insensetive)
	function auth_ids {
	local auths=$(echo "$1" \| esc_sed \| sed ':a;N;$!ba;s/\n/\\\|/g')
	(sed -n -e "/^[^¶]*¶\(${auths}\)\$/Ip" \| my_cut 1) <<< "${AUTHORS_FULL_NAMES}"
	}

	#same as auth_ids, but for series
	function series_ids {
	local series=$(echo "$1" \| esc_sed \| sed ':a;N;$!ba;s/\n/\\\|/g')
	(sed -n -e "/^[^¶]*¶\(${series}\)\$/Ip" \| my_cut 1) <<< "${SERIES}"
	}

	#same as auth_ids, but for tags
	function tag_ids {
	local tags=$(echo "$1" \| esc_sed \| sed ':a;N;$!ba;s/\n/\\\|/g')
	(sed -n -e "/^[^¶]*¶\(${tags}\)\$/Ip" \| my_cut 1) <<< "${TAGS}"
	}

	#get hierarchical tag from file path, e.g.,
	# "M_Mathematics/MC_Calculus/MCat_Advanced calculus/<file>.pdf" ->
	# "Mathematics.Calculus.Advanced calculus"
	function tag_from_filepath {
	sed -e 's\|^[^/]_\|\|' -e 's\|/[^/]_\|/\|g' -e 's\|/[^/]*$\|\|' -e 's\|/\|.\|g'
	}

	#same as auth_ids, but for langs
	function lang_ids {
	local langs=$(echo "$1" \| esc_sed \| sed ':a;N;$!ba;s/\n/\\\|/g')
	(sed -n -e "/^[^¶]*¶\(${langs}\)\$/Ip" \| my_cut 1) <<< "${LANGS}"
	}

	#feed stdin to sqlite, as well as log to "$1"
	function feed_sql_and_log {
	exec 3>>"$1"
	rm -f "$1"
	tee /dev/fd/3 \| sqlite3 "${TARGET_DB}"
	}

	#the most painful part: process book entries and collect/feed the respective
	# books, books_authors_link, books_series_link, books_tags_link,
	# books_languages_link, comments SQL INSERT commands
	function books_insert {
	# BOOKS:
	#<id>¶<title>¶<auths>¶<series>¶<file_path>¶<file_type>¶<md5sum>¶<lang>¶<year>¶<desc>

	local book
	local book_count=$(echo "${BOOKS}" \| wc -l)
	local n=$(echo -n "${book_count}" \| wc -c)
	(while IFS='' read -r book \|\| [ -n "$book" ]; do
	local book_id=$(echo "$book" \| my_cut 1)
	(awk "{printf \"[%07.4f]%% Inserting book %${n}d out of ${book_count}"`
	`"\n\", (\$0*100)/${book_count}, \$0}" >&2) <<< "${book_id}"
	local title=$(echo "$book" \| my_cut 2 \| esc_q)
	local authors=$(echo "$book" \| my_cut 3 \| sanitize_authors)
	local author_sort=$(echo "$authors" \| \
	sed ':a;N;$!ba;s/\([^\n]\)\n/\1 \& /g;s/./\L&/' \| esc_q)
	local series=$(echo "$book" \| my_cut 4)
	local file_path=$(echo "$book" \| my_cut 5)
	local file_type=$(echo "$book" \| my_cut 6)
	local md5sum=$(echo "$book" \| my_cut 7)
	local langs=$(echo "$book" \| my_cut 8 \| sed -e 's/,/\n/g')
	local year=$(echo "$book" \| my_cut 9)
	local description=$(echo "$book" \| my_cut 10 \| esc_q \| \
	sed 's/\\newline/\n/g')
	local timestamp=$(get_stamp "${file_path}")
	local ISBN=$(echo "${file_path}" \| \
	sed -n 's\|.(ISBN \([0-9]\))[^/]*$\|\1\|p')
	#book hierarchical classification, extracted from file path
	local tag=$(echo "${file_path}" \| tag_from_filepath)

	#books INSERT
	echo -e "INSERT INTO books (id,title,sort,timestamp,pubdate,"`
	`"author_sort,isbn,path,uuid,has_cover,"`
	`"last_modified)\n"`
	`"VALUES (${book_id},\"$title\",\"$title\",\"$timestamp\","`
	`"\"$year\",\"${author_sort}\",\"$ISBN\",\"${file_path}\""`
	`",\"$md5sum\",0,\"$timestamp\");"

	#books_authors_link INSERT
	auth_ids "$authors" \| sed -n -e 's/.\+/'`
	`"INSERT INTO books_authors_link (book,author)\n"`
	`"VALUES (${book_id},&);/p"

	#books_series_link INSERT
	series_ids "$series" \| sed -n -e 's/.\+/'`
	`"INSERT INTO books_series_link (book,series)\n"`
	`"VALUES (${book_id},&);/p"

	#books_tags_link INSERT
	tag_ids "$tag" \| sed -n -e 's/.\+/'`
	`"INSERT INTO books_tags_link (book,tag)\n"`
	`"VALUES (${book_id},&);/p"

	#books_languages_link INSERT
	lang_ids "$langs" \| sed -n -e 's/.\+/'`
	`"INSERT INTO books_languages_link (book,lang_code,item_order)\n"`
	`"VALUES (${book_id},&,0);/p"

	#comments INSERT
	if [ -n "${description}" ]; then
	echo -e "INSERT INTO comments (book,text)\n"`
	`"VALUES (${book_id},\"${description}\");"
	fi
	done <<< "${BOOKS}") \| feed_sql_and_log "$1"
	}

	cp -f "${EMPTY_DB}" "${TARGET_DB}"

	# AUTHORS_FULL_NAMES: '<id>¶<name>' lines
	AUTHORS_FULL_NAMES=$(sqlite3 "${SOURCE_DB}" 'SELECT full_name FROM authors;' \| \
	sanitize_authors \| sort -uf \| awk '{printf("%d¶%s\n", NR,$0)}')

	# TAGS: '<id>¶<tag>' lines
	TAGS=$(sqlite3 "${SOURCE_DB}" 'SELECT file_name FROM books;' \| tag_from_filepath \| \
	sort -uf \| awk '{printf("%d¶%s\n", NR,$0)}')

	# LANGS: '<id>¶<lang>' lines
	LANGS=$(sqlite3 "${SOURCE_DB}" 'SELECT DISTINCT lang FROM books;' \| \
	sed -e 's/,/\n/g' \| sort -uf \| awk '{printf("%d¶%s\n", NR,$0)}')

	# BOOKS:
	#'<id>¶<title>¶<authors>¶<sequence>¶<file_path>¶<file_type>¶<md5sum>¶<lang>¶<year>¶<description>'
	BOOKS=$(sqlite3 -separator '<S_e^Pa_r^At_oR>' "${SOURCE_DB}" \
	'SELECT printf("<S_e^Pa_r^At_oR>%s",books.title),authors.full_name,'`
	`'sequences.value,books.file_name,books.file_type,'`
	`'books.md5sum,books.lang,books.year,books.description '`
	`'FROM books,authors,sequences '`
	`'WHERE books.id_author=authors.id AND books.id_sequence=sequences.id;' \| \
	sed -e 's/<S_e^Pa_r^At_oR>/¶/g' \| sed ':a;N;$!ba;s/\n\([^¶]\)/\\newline\1/g' \| \
	awk '{printf("%d%s\n", NR,$0)}')

	# SERIES: '<id>¶<name>'
	SERIES=$(sqlite3 "${SOURCE_DB}" 'SELECT value FROM sequences;' \| sed '/^\s*$/d' \| \
	sort -uf \| awk '{printf("%d¶%s\n", NR,$0)}')

	#INSERT authors
	echo "${AUTHORS_FULL_NAMES}" \| esc_q \| sed 's/\([^¶]\)¶\(.\)/'`
	`'INSERT INTO authors (id,name,sort)\n'`
	`'VALUES (\1,\"\2\",\"\L\2\");/' \| \
	feed_sql_and_log authors_insert_command.txt

	#INSERT tags
	echo "${TAGS}" \| esc_q \| sed 's/\([^¶]\)¶\(.\)/'`
	`'INSERT INTO tags (id,name)\nVALUES (\1,\"\2\");/' \| \
	feed_sql_and_log tags_insert_command.txt

	#INSERT languages
	echo "${LANGS}" \| sed 's/\([^¶]\)¶\(.\)/'`
	`'INSERT INTO languages (id,lang_code)\n'`
	`'VALUES (\1,\"\2\");/' \| \
	feed_sql_and_log languages_insert_command.txt

	#INSERT series
	echo "${SERIES}" \| esc_q \| sed 's/\([^¶]\)¶\(.\)/'`
	`'INSERT INTO series (id,name,sort)\n'`
	`'VALUES (\1,\"\2\",\"\2\");/' \| \
	feed_sql_and_log series_insert_command.txt

	#INSERT books, books_authors_link, books_series_link,
	# books_tags_link, books_languages_link, comments
	books_insert books_insert_command.txt