drslump/jqlite

## jqlite
#!/usr/bin/env bash
#
# Copyright (C) 2018 Iván Velencoso Montes <ivan@velencoso.cat>.
# This file is licensed under the MIT license.
#
# Requirements:
#
#  - Bash 3.x or higher
#  - sqlite3
#  - gunzip / bzip2 (for compressed input)
#  - jq or Python (for JSON format)
#  - curl (for URLs)
#
# Usage:
#    jqlite [-v] [-s DBFILE] [-q SQL] [-f FORMAT] [-o OUTPUT] [--] <file>...
#
#    The SQL query can be piped over stdin or as the query argument.
#    When format is sql it can be suffixed with @<table> to specify which
#    table should the insert statements reference, otherwise it defaults
#    to <sql>.
#
# Arguments:
#    <file>  A CSV, TSV, JSON file, optionally compressed, or URL. It can be
#            suffixed with @<alias> to override the name of the generated table.
#            Otherwise the table will reflect the filename without the extension.
#            '-' can be used to read data from stdin which will be available on
#            a table named 'stdin'. In this mode any SQL can only be provided
#            via the --query option or a .sql file.
#
#            Additionally the aliasing can include a format @<alias>.<format>,
#            which is extremely useful when using process substitution to provide
#            the data.
#
#            When a .db or .sqlite file is passed it's attached and available for
#            queries under the prefix of its filename (or its override).
#            Also .sql files are detected and executed regardless of the query
#            option. Note that they are executed in the order in which they were
#            given, so if they depend on previous data files they should be
#            passed after them.
#
# Options:
#    -s, --save DBFILE     When set creates a sqlite database file.
#    -f, --format FORMAT   Outputs on CSV, TSV, TXT, JSON, SQL.
#    -o, --output OUTPUT   Where to store the output (defaults to stdout).
#    -q, --query SQL       What query to execute (- for stdin, default).
#    -v, --verbose         Be verbose.
#    -h, --help            Show this message.
#    -V, --version         Display the version and exit.
#
#
# Changelog:
#   0.2.6 (2024-12-18):
#     - Update author information
#
#   0.2.5 (2020-04-22):
#     - Use python3 if available (better unicode support)
#
#   0.2.4 (2018-08-04):
#     - Allow to force a format per input. Required for process substitution.
#     - Fix bug with url handling and added multiple tests for it.
#     - Fix bug with stdin input being eagerly consumed.
#     - Preliminary support for JSON via Ruby 1.9.3+
#
#   0.2.0 (2018-08-04):
#     - `-d`/`--database` renamed to `-s`/`--save`
#
#   0.1.1 (2018-08-03):
#     - Fix a deadlock when using large compressed files as input
#     - Python is now default over jq for handling JSON (better performance)
#
#   0.1.0 (2018-08-03):
#     - Fallback to Python for handling JSON
#

VERSION="0.2.6"

set -euo pipefail

# Handle the exit of the script from function calls
trap "exit 1" TERM
JQLITE_PID=$$
die() { errcho "ERROR:" "$@"; kill -s TERM $JQLITE_PID; exit 1; }

# Clean up on exit
TEMP_DIR=$(mktemp -d tmp-jqlite.XXXXX)
cleanup() { rv=$?; rm -rf "$TEMP_DIR"; exit $rv; }
trap "cleanup" exit


OUTPUT="/dev/stdout"
QUERY=""
DATABASE=""
FORMAT=""
VERBOSE=""
INPUTS=()
DQL=()

RESERVED=(ADD ALL ALTER AND AS AUTOINCREMENT BETWEEN CASE CHECK COLLATE COMMIT CONSTRAINT CREATE DEFAULT DEFERRABLE DELETE DISTINCT DROP ELSE ESCAPE EXCEPT EXISTS FOREIGN FROM GROUP HAVING IF IN INDEX INSERT INTERSECT INTO IS ISNULL JOIN LIMIT NOT NOTNULL NULL ON OR ORDER PRIMARY REFERENCES SELECT SET TABLE THEN TO TRANSACTION UNION UNIQUE UPDATE USING VALUES WHEN WHERE)

errcho() { >&2 echo "$@"; }
warn() { errcho "WARN:" "$@"; }
dql() { DQL+=("$1"); }
trace() { [ -n "$VERBOSE" ] && errcho "*" "$@" || true; }
trace_dql() { [ -n "$VERBOSE" ] && dql ".print '* $@'" || true; }

# Takes care of uncompressing or downloading files.
# Note that this function might spawn background commands to uncompress a file
# into a named fifo to avoid creating large temporary files. However this means
# that we can't call it in a subshell like in $(preprocess xxx), instead we
# have to use a bashism by reading its output into a variable with process
# substitution, otherwise the subshell will wait until the background task
# completes (which it'll never do).
preprocess() {
    fpath="$1"
    ext="${fpath##*.}"

    if [[ "$fpath" = *"://"* ]]; then
        trace "Downloading $fpath"
        tdir=$(mktemp -d "$TEMP_DIR/XXXXX")
        (
            cd "$tdir"
            curl --silent --fail -L --remote-header-name --remote-name "$fpath" \
            || rm -f "$tdir/*"
        )

        tname=$( ls "$tdir" | head -1 )
        [ -n "$tname" ] || die "Unable to download $fpath"
        read retval < <(preprocess "$tdir/$tname")
        echo "$retval"
    elif [ $ext == "gz" ]; then
        [ -f "$fpath" ] || die "File not found $fpath"
        trace "Uncompressing $fpath"
        fname=$(basename "$fpath")
        tpath="$TEMP_DIR/${fname%.*}"
        mkfifo "$tpath"

        gunzip --stdout "$fpath" >"$tpath" 2>/dev/null &
        read retval < <(preprocess "$tpath")
        echo "$retval"
    elif [ $ext == "bz2" ]; then
        [ -f "$fpath" ] || die "File not found $fpath"
        trace "Uncompressing $fpath"
        fname=$(basename "$fpath")
        tpath="$TEMP_DIR/${fname%.*}"
        mkfifo "$tpath"

        bzip2 -d --stdout "$fpath" >"$tpath" 2>/dev/null &
        read retval < <(preprocess "$tpath")
        echo "$retval"
    else
        [ -p "$fpath" ] || [ -f "$fpath" ] || die "File not found $fpath"
        echo "$fpath"
    fi
}

# Generates import statements for a file based on its format
import() {
    read fpath < <(preprocess "$1")
    table="$2"
    format="${3:-}"

    if [ ! -p "$fpath" ] && [ -f "$fpath" ]; then
        [[ -s $fpath ]] || die "File is empty: $fpath"
    fi

    # Auto-detect format based on file extension
    if [ -z "$format" ]; then
        case "${fpath##*.}" in
        db|sqlite) format="sqlite" ;;
        sql) format="sql" ;;
        json|jsonl) format="json" ;;
        tsv) format="tsv" ;;
        *) format="csv" ;;
        esac
        trace "Assuming format <$format> for $fpath"
    fi

    case $format in
    db|sqlite)
        dql "ATTACH '$fpath' AS $table;"
        ;;
    sql)
        dql ".read '$fpath'"
        ;;
    json*)
        fname=$(basename "$fpath")
        tpath="$TEMP_DIR/${fname%.*}.csv"
        mkfifo "$tpath"

        cat "$fpath" | $from_json > "$tpath" &

        import "$tpath" "$table" "csv"
        ;;
    csv)
        dql "DROP TABLE IF EXISTS [$table];"
        dql ".mode csv"
        dql ".import '$fpath' $table"
        ;;
    tsv)
        dql "DROP TABLE IF EXISTS [$table];"
        dql ".mode tabs"
        dql ".import '$fpath' $table"
        ;;

    *)
        die "Unsupported format '$format'"
        ;;
    esac
}

# Ouput formatters
format_csv() { cat; }
format_tsv() { cat; }
format_txt() { cat; }
format_sql() { cat; }
format_json() { $to_json; }


main() {
    while [[ $# -gt 0 ]]
    do
        case "$1" in
        -s|--save)
            DATABASE="$2"
            shift; shift ;;
        -f|--format)
            FORMAT=$( echo "$2" | tr '[:upper:]' '[:lower:]' )
            shift; shift ;;
        -o|--output)
            OUTPUT="$2"
            shift; shift ;;
        -q|--query)
            if [ "$2" == "-" ]; then QUERY="$(</dev/stdin)"; else QUERY="$2"; fi
            shift; shift ;;
        -v|--verbose)
            VERBOSE="1"
            shift ;;
        -V|--version)
            echo "jqlite v$VERSION -- sql queries for tabular data"
            exit 1 ;;
        -h|--help)
            echo "jqlite v$VERSION -- sql queries for tabular data"
            awk "NR >= 13 && NR <= 49" "$0" | cut -c 3-
            exit 0 ;;
        --test)
            # non-public option to run the unit tests
            shift; test
            ;;

        --)
            shift
            INPUTS+=("$@")
            break ;;
        -|-@*)
            # stdin requires a bit of dance since sqlite will receive the SQL
            # on its stdin, that means that data in the parent process stdin
            # needs to be redirected elsewhere. The approach is to create a named
            # pipe, spawn a background cat to feed it and then pass it to sqlite
            # for importing.
            tpath="$TEMP_DIR/stdin"
            mkfifo "$tpath"
            cat <&0 >"$tpath" &

            [ "$1" != "-" ] && tpath+="@${1#*@}"
            INPUTS+=("$tpath")
            QUERY=${QUERY:-";;"}   # Avoid default of reading query from stdin
            shift ;;
        -*)
            die "Unknown option $1" ;;
        *)
            INPUTS+=("$1")
            shift ;;
        esac
    done

    [[ ! -f "$DATABASE" && ${#INPUTS[@]} -eq 0 ]] && \
        die "No data provided so nothing to do."

    if [[ -z $QUERY && ! -t 0 ]]; then
        QUERY="$(</dev/stdin)"
    fi

    sqlite=$( command -v sqlite3 || true )
    if [ -z "$sqlite" ]; then
        die "Unable to find the sqlite3 command!"
    fi

    # Try to find something to handle the JSON format
    if command -v python3 >/dev/null; then
        trace "Enabling JSON support via python3"
        from_json=from_json_python3
        to_json=to_json_python3
    elif command -v python >/dev/null; then
        trace "Enabling JSON support via python"
        from_json=from_json_python
        to_json=to_json_python
    #TODO: Disabled until to_json_ruby is fixed
    # elif [[ $(check_ruby) == "ok" ]]; then
    #     trace "Enabling JSON support via ruby"
    #     from_json=from_json_ruby
    #     to_json=to_json_ruby
    elif command -v jq >/dev/null; then
        trace "Enabling JSON support via jq"
        from_json=from_json_jq
        to_json=to_json_jq
    else
        trace "Disabling JSON support (neither jq nor python available)"
        from_json=json_unsupported
        to_json=json_unsupported
    fi

    # Preface SQL commands
    dql ".bail on"
    dql ".output stderr"

    if [ -f "$DATABASE" ]; then
        # We need to make a copy otherwise we can't save the openned DB
        tmp_db="$TEMP_DIR/$(basename "$DATABASE"))"
        cp "$DATABASE" "$tmp_db"
        trace_dql "Opening database from $DATABASE"
        dql ".open '$tmp_db'"
    fi

    for fname in "${INPUTS[@]}"; do
        format=""

        # check if we have an alias
        IFS='@' read -r -a parts <<< "$fname"
        if [ "${#parts[@]}" -eq "1" ]; then
            table="${fname##*/}"
            ext="${fname##*.}"
            if [ $ext == "gz" ] || [ $ext == "bz2" ]; then
                table="${table%.*}"
            fi
            table="${table%.*}"
        else
            table="${parts[ ${#parts[@]}-1 ]}"
            unset 'parts[ ${#parts[@]}-1 ]'
            fname=$( IFS=@ ; echo "${parts[*]}" )

            # Now check if there is a format override
            IFS='.' read -r -a parts <<< "$table"
            if [ "${#parts[@]}" -gt "1" ]; then
                format="${parts[ ${#parts[@]}-1 ]}"
                unset 'parts[ ${#parts[@]}-1 ]'
                table=$( IFS=. ; echo "${parts[*]}" )
            fi
        fi

        [ -n "$table" ] || die "No table name defined for $fname"

        table=$( echo -n $table | tr '[:lower:]' '[:upper:]' )
        normalized=$( echo -n "$table" | tr -cs '[:alnum:]' _ )
        if [[ "$normalized" != "$table" ]]; then
            warn "Renaming <$table> to <$normalized> (normalization)"
            table="$normalized"
        fi

        if [[ " ${RESERVED[@]} " =~ " ${table} " ]]; then
            warn "Renaming <$table> to <_$table> (reserved word)"
            table="_$table"
        fi

        [ -z "$format" ] || trace "Forcing format $format for <$table>"

        trace_dql "Importing ${fname} at ${table}...'"
        import "$fname" "$table" "$format"
    done

    trace_dql "Dumping database schema:"
    [ -n "$VERBOSE" ] && dql ".schema"

    if [[ -n "$QUERY" && "$QUERY" != ";;" ]]; then
        # Try to infer the format from the output file
        if [ -z "$FORMAT" ]; then
            case "${OUTPUT##*.}" in
            csv|tsv|txt) FORMAT="${OUTPUT##*.}" ;;
            json*) FORMAT=json ;;
            sql*) FORMAT=sql ;;
            *) FORMAT=csv ;;
            esac
        fi

        case "$FORMAT" in
        csv) dql ".mode csv" ;;
        tsv) dql ".mode tabs" ;;
        txt) dql ".mode columns" ;;
        json) dql ".mode list"; dql ".separator \t \n"; dql ".nullvalue NULL" ;;
        sql*) dql ".mode insert ${FORMAT#*@}"; FORMAT=sql ;;
        *) die "Unsupported format $FORMAT" ;;
        esac
        trace_dql "Formatting results as ${FORMAT}"

        printf '%s;' "$QUERY" > "$TEMP_DIR/query.sql"
        trace_dql "Executing query..."
        dql ".output stdout"
        dql ".headers on"
        dql "BEGIN;"
        dql ".read '$TEMP_DIR/query.sql'"
        dql "COMMIT;"
        dql ".output stderr"
    fi

    if [ -n "$DATABASE" ]; then
        trace_dql "Saving database to $DATABASE"
        dql ".save '$DATABASE'"
    fi

    FORMAT="${FORMAT:-csv}"

    printf "%s\n" "${DQL[@]}" \
    | "$sqlite" -batch \
    | "format_$FORMAT" \
    > "$OUTPUT"

}

# JSON adapters

from_json_jq() {
    jq -r --slurp '
            (.[0] | keys_unsorted) as $keys
            |
                $keys,
                (
                    .[]
                    | [
                        .[ $keys[] ] as $value
                        | if ($value | [type] | inside(["array","object"])) then ($value|tojson) else $value end
                    ]
                )
            | @csv'
}

to_json_jq() {
    jq -cR --slurp '
            split("\n")
            | map( split("\t") )
            | .[0] as $hdrs
            | .[1:][]
            | select(length>0)
            | map( if length < 16 then tonumber? // . else . end )
            | .[] |= if . == "NULL" then null else . end
            | [$hdrs, .]
            | [ transpose[] | {key: .[0], value: .[1]} ]
            | from_entries'
}

from_json_python() {
    ${1:-python} -c "
import sys, csv, json
writer = None
for line in sys.stdin:
    data = dict(
        (k, json.dumps(v) if isinstance(v, (dict, list)) else v)
        for k,v in json.loads(line).items())

    if not writer:
        writer = csv.DictWriter(sys.stdout, list(data.keys()), extrasaction='ignore')
        writer.writeheader()

    writer.writerow(data)
"
}
from_json_python3() {
    from_json_python python3
}

to_json_python() {
    ${1:-python} -c "
import sys, csv, json

for row in csv.DictReader(sys.stdin, delimiter='\\t'):
    for hdr in row.keys():
        if row[hdr] == 'NULL':
            row[hdr] = None
        else:
            try:
                row[hdr] = float(row[hdr])
            except ValueError:
                pass

    print(json.dumps(row, sort_keys=True))
"
}
to_json_python3() {
    to_json_python python3
}

# We require at least Ruby 1.9.3
check_ruby() {
    if command -v ruby >/dev/null 2>&1; then
        local version=$(ruby --version | cut -d' ' -f2 | awk -F. '{ printf("%d%02d%02d\n", $1,$2,$3); }')
        [ $version -gt 10903 ] && echo "ok"
    fi
}

from_json_ruby() {
    ruby -e "
require 'csv'
require 'json'

CSV(\$stdout) do |csv|
    headers = nil
    while line = gets
        row = JSON.parse line
        unless headers
            headers = row.keys.sort
            csv << headers
        end
        csv << headers.map do |k| row[k] end
    end
end
"
}

to_json_ruby() {
#TODO: Breaks with quotes since sqlite doesn't quote fields in list mode
    ruby -e "
require 'csv'
require 'json'

CSV.new(\$stdin, :headers => true, :col_sep => '\t').map do |row|
    data = row.to_hash
    data.each do |k, v|
        if v == 'NULL'
            data[k] = nil if v == 'NULL'
        elsif (Float(v) != nil rescue false)
            data[k] = v.to_f
        end
    end

    puts JSON.generate(data)
end
"
}

json_unsupported() {
    die "JSON support requires for either jq or python to be available"
}


## Unit Tests #############

fail() {
    if ! [ -z "$1" ]; then
        errcho "[$0:$2] $1"
        [ -z "$3" ] || echo "$3"
    fi
    exit 99
}

test() {
    echo "Unit Tests:"

    failed=0
    for fn in $(declare -F | cut -d ' ' -f3); do
        [[ "$fn" =~ ^test_ ]] || continue
        echo -n "  - Test ${fn#*_} "
        {
            set +e

            cp "$0" "$TEMP_DIR/jqlite"
            cd "$TEMP_DIR"
            export PATH=".:$PATH"

            $( $fn >__stdout__ 2>__stderr__ )
            if  [ $? != 0 ]; then
                echo "✗"
                for output in __stdout__ __stderr__; do
                    if [ -s "$output" ]; then
                        echo "=====[ ${output//_} ]====="
                        cat "$output"
                    fi
                done
                echo "===================="
                (( failed++ ))
            else
                echo "✓"
            fi

            rm -rf "$TEMP_DIR"/*
            cd - > /dev/null
        }
    done

    exit $failed
}

test_opt_help() {
    output=$(jqlite --help)
    [[ "$output" = "$(jqlite -h)" ]] || fail "-h should be equal" $LINENO
    [[ "$output" = *"Usage:"* ]] || fail "expected Usage string" $LINENO
    [[ "$output" = *"Arguments:"* ]] || fail "expected Arguments section" $LINENO
    [[ "$output" = *"Options:"* ]] || fail "expected Options section" $LINENO
}

test_opt_version() {
    output=$(jqlite --version)
    [[ "$output" = "$(jqlite -V)" ]] || fail "-V should be equal" $LINENO
    [[ "$output" = *"v$VERSION"* ]] || fail "expected version string" $LINENO "$output"
}

test_json_parse() {
    echo '{"foo":"Foo1", "bar":10}' > "test.json"
    echo '{"foo":"Foo2", "bar":20}' >> "test.json"

    output=$( echo 'SELECT * FROM test' | jqlite test.json )
    [[ $(echo "$output" | wc -l) -eq 3 ]] || fail "expected 3 lines" $LINENO "$output"

    output=$( echo 'SELECT SUM(bar) FROM test' | jqlite test.json )
    [[ "$output" = $'SUM(bar)\r\n30\r' ]] || fail "sum a column" $LINENO "$output"
}

test_stdin() {
    output=$(
        echo '{"foo":"Foo1", "bar":10}' \
        | jqlite -q 'SELECT bar FROM stdin' -@stdin.json
        )
    [[ "$output" = $'bar\r\n10\r' ]] || fail "stdin" $LINENO "$output"
}

samples_urlbase='https://gist.githubusercontent.com/drslump/f018be6d5e00b9b12fdee11c21e79f96/raw'

test_url_csv() {
    url="$samples_urlbase/cities.csv"
    q='SELECT COUNT(*) FROM cities'

    output=$( jqlite -q "$q" -f tsv $url )
    [[ "$output" = $'COUNT(*)\n128' ]] || fail 'expected' $LINENO "$output"
}

test_url_csv_gz() {
    url="$samples_urlbase/cities.csv.gz"
    q='SELECT COUNT(*) FROM cities'

    output=$( jqlite -q "$q" -f tsv $url )
    [[ "$output" = $'COUNT(*)\n128' ]] || fail 'expected' $LINENO "$output"
}

test_url_tsv() {
    url="$samples_urlbase/hurricanes.tsv"
    q='SELECT COUNT(*) FROM hurricanes'

    output=$( jqlite -q "$q" -f tsv $url )
    [[ "$output" = $'COUNT(*)\n8' ]] || fail 'expected' $LINENO "$output"
}

test_url_tsv_bz2() {
    url="$samples_urlbase/hurricanes.tsv"
    q='SELECT COUNT(*) FROM hurricanes'

    output=$( jqlite -q "$q" -f tsv $url )
    [[ "$output" = $'COUNT(*)\n8' ]] || fail 'expected' $LINENO "$output"
}

test_url_jsonl() {
    url="$samples_urlbase/tweets.jsonl"
    q='SELECT COUNT(*) FROM tweets'

    output=$( jqlite -q "$q" -f tsv $url )
    [[ "$output" = $'COUNT(*)\n4' ]] || fail 'expected' $LINENO "$output"
}

test_url_alias() {
    url="$samples_urlbase/tweets.jsonl@alias"
    q='SELECT COUNT(*) FROM alias'

    output=$( jqlite -q "$q" -f tsv $url )
    [[ "$output" = $'COUNT(*)\n4' ]] || fail 'expected' $LINENO "$output"
}

test_url_alias_format() {
    q='SELECT COUNT(*) FROM alias'

    output=$( jqlite -q "$q" -f tsv <(curl --silent $samples_urlbase/tweets.jsonl)@alias.json )
    [[ "$output" = $'COUNT(*)\n4' ]] || fail 'expected' $LINENO "$output"
}


# Run the script
main "$@"
	#!/usr/bin/env bash
	#
	# Copyright (C) 2018 Iván Velencoso Montes <ivan@velencoso.cat>.
	# This file is licensed under the MIT license.
	#
	# Requirements:
	#
	# - Bash 3.x or higher
	# - sqlite3
	# - gunzip / bzip2 (for compressed input)
	# - jq or Python (for JSON format)
	# - curl (for URLs)
	#
	# Usage:
	# jqlite [-v] [-s DBFILE] [-q SQL] [-f FORMAT] [-o OUTPUT] [--] <file>...
	#
	# The SQL query can be piped over stdin or as the query argument.
	# When format is sql it can be suffixed with @<table> to specify which
	# table should the insert statements reference, otherwise it defaults
	# to <sql>.
	#
	# Arguments:
	# <file> A CSV, TSV, JSON file, optionally compressed, or URL. It can be
	# suffixed with @<alias> to override the name of the generated table.
	# Otherwise the table will reflect the filename without the extension.
	# '-' can be used to read data from stdin which will be available on
	# a table named 'stdin'. In this mode any SQL can only be provided
	# via the --query option or a .sql file.
	#
	# Additionally the aliasing can include a format @<alias>.<format>,
	# which is extremely useful when using process substitution to provide
	# the data.
	#
	# When a .db or .sqlite file is passed it's attached and available for
	# queries under the prefix of its filename (or its override).
	# Also .sql files are detected and executed regardless of the query
	# option. Note that they are executed in the order in which they were
	# given, so if they depend on previous data files they should be
	# passed after them.
	#
	# Options:
	# -s, --save DBFILE When set creates a sqlite database file.
	# -f, --format FORMAT Outputs on CSV, TSV, TXT, JSON, SQL.
	# -o, --output OUTPUT Where to store the output (defaults to stdout).
	# -q, --query SQL What query to execute (- for stdin, default).
	# -v, --verbose Be verbose.
	# -h, --help Show this message.
	# -V, --version Display the version and exit.
	#
	#
	# Changelog:
	# 0.2.6 (2024-12-18):
	# - Update author information
	#
	# 0.2.5 (2020-04-22):
	# - Use python3 if available (better unicode support)
	#
	# 0.2.4 (2018-08-04):
	# - Allow to force a format per input. Required for process substitution.
	# - Fix bug with url handling and added multiple tests for it.
	# - Fix bug with stdin input being eagerly consumed.
	# - Preliminary support for JSON via Ruby 1.9.3+
	#
	# 0.2.0 (2018-08-04):
	# - `-d`/`--database` renamed to `-s`/`--save`
	#
	# 0.1.1 (2018-08-03):
	# - Fix a deadlock when using large compressed files as input
	# - Python is now default over jq for handling JSON (better performance)
	#
	# 0.1.0 (2018-08-03):
	# - Fallback to Python for handling JSON
	#

	VERSION="0.2.6"

	set -euo pipefail

	# Handle the exit of the script from function calls
	trap "exit 1" TERM
	JQLITE_PID=$$
	die() { errcho "ERROR:" "$@"; kill -s TERM $JQLITE_PID; exit 1; }

	# Clean up on exit
	TEMP_DIR=$(mktemp -d tmp-jqlite.XXXXX)
	cleanup() { rv=$?; rm -rf "$TEMP_DIR"; exit $rv; }
	trap "cleanup" exit



	OUTPUT="/dev/stdout"
	QUERY=""
	DATABASE=""
	FORMAT=""
	VERBOSE=""
	INPUTS=()
	DQL=()

	RESERVED=(ADD ALL ALTER AND AS AUTOINCREMENT BETWEEN CASE CHECK COLLATE COMMIT CONSTRAINT CREATE DEFAULT DEFERRABLE DELETE DISTINCT DROP ELSE ESCAPE EXCEPT EXISTS FOREIGN FROM GROUP HAVING IF IN INDEX INSERT INTERSECT INTO IS ISNULL JOIN LIMIT NOT NOTNULL NULL ON OR ORDER PRIMARY REFERENCES SELECT SET TABLE THEN TO TRANSACTION UNION UNIQUE UPDATE USING VALUES WHEN WHERE)

	errcho() { >&2 echo "$@"; }
	warn() { errcho "WARN:" "$@"; }
	dql() { DQL+=("$1"); }
	trace() { [ -n "$VERBOSE" ] && errcho "*" "$@" \|\| true; }
	trace_dql() { [ -n "$VERBOSE" ] && dql ".print '* $@'" \|\| true; }

	# Takes care of uncompressing or downloading files.
	# Note that this function might spawn background commands to uncompress a file
	# into a named fifo to avoid creating large temporary files. However this means
	# that we can't call it in a subshell like in $(preprocess xxx), instead we
	# have to use a bashism by reading its output into a variable with process
	# substitution, otherwise the subshell will wait until the background task
	# completes (which it'll never do).
	preprocess() {
	fpath="$1"
	ext="${fpath##*.}"

	if [[ "$fpath" = "://" ]]; then
	trace "Downloading $fpath"
	tdir=$(mktemp -d "$TEMP_DIR/XXXXX")
	(
	cd "$tdir"
	curl --silent --fail -L --remote-header-name --remote-name "$fpath" \
	\|\| rm -f "$tdir/*"
	)

	tname=$( ls "$tdir" \| head -1 )
	[ -n "$tname" ] \|\| die "Unable to download $fpath"
	read retval < <(preprocess "$tdir/$tname")
	echo "$retval"
	elif [ $ext == "gz" ]; then
	[ -f "$fpath" ] \|\| die "File not found $fpath"
	trace "Uncompressing $fpath"
	fname=$(basename "$fpath")
	tpath="$TEMP_DIR/${fname%.*}"
	mkfifo "$tpath"

	gunzip --stdout "$fpath" >"$tpath" 2>/dev/null &
	read retval < <(preprocess "$tpath")
	echo "$retval"
	elif [ $ext == "bz2" ]; then
	[ -f "$fpath" ] \|\| die "File not found $fpath"
	trace "Uncompressing $fpath"
	fname=$(basename "$fpath")
	tpath="$TEMP_DIR/${fname%.*}"
	mkfifo "$tpath"

	bzip2 -d --stdout "$fpath" >"$tpath" 2>/dev/null &
	read retval < <(preprocess "$tpath")
	echo "$retval"
	else
	[ -p "$fpath" ] \|\| [ -f "$fpath" ] \|\| die "File not found $fpath"
	echo "$fpath"
	fi
	}

	# Generates import statements for a file based on its format
	import() {
	read fpath < <(preprocess "$1")
	table="$2"
	format="${3:-}"

	if [ ! -p "$fpath" ] && [ -f "$fpath" ]; then
	[[ -s $fpath ]] \|\| die "File is empty: $fpath"
	fi

	# Auto-detect format based on file extension
	if [ -z "$format" ]; then
	case "${fpath##*.}" in
	db\|sqlite) format="sqlite" ;;
	sql) format="sql" ;;
	json\|jsonl) format="json" ;;
	tsv) format="tsv" ;;
	*) format="csv" ;;
	esac
	trace "Assuming format <$format> for $fpath"
	fi

	case $format in
	db\|sqlite)
	dql "ATTACH '$fpath' AS $table;"
	;;
	sql)
	dql ".read '$fpath'"
	;;
	json*)
	fname=$(basename "$fpath")
	tpath="$TEMP_DIR/${fname%.*}.csv"
	mkfifo "$tpath"

	cat "$fpath" \| $from_json > "$tpath" &

	import "$tpath" "$table" "csv"
	;;
	csv)
	dql "DROP TABLE IF EXISTS [$table];"
	dql ".mode csv"
	dql ".import '$fpath' $table"
	;;
	tsv)
	dql "DROP TABLE IF EXISTS [$table];"
	dql ".mode tabs"
	dql ".import '$fpath' $table"
	;;

	*)
	die "Unsupported format '$format'"
	;;
	esac
	}

	# Ouput formatters
	format_csv() { cat; }
	format_tsv() { cat; }
	format_txt() { cat; }
	format_sql() { cat; }
	format_json() { $to_json; }


	main() {
	while [[ $# -gt 0 ]]
	do
	case "$1" in
	-s\|--save)
	DATABASE="$2"
	shift; shift ;;
	-f\|--format)
	FORMAT=$( echo "$2" \| tr '[:upper:]' '[:lower:]' )
	shift; shift ;;
	-o\|--output)
	OUTPUT="$2"
	shift; shift ;;
	-q\|--query)
	if [ "$2" == "-" ]; then QUERY="$(</dev/stdin)"; else QUERY="$2"; fi
	shift; shift ;;
	-v\|--verbose)
	VERBOSE="1"
	shift ;;
	-V\|--version)
	echo "jqlite v$VERSION -- sql queries for tabular data"
	exit 1 ;;
	-h\|--help)
	echo "jqlite v$VERSION -- sql queries for tabular data"
	awk "NR >= 13 && NR <= 49" "$0" \| cut -c 3-
	exit 0 ;;
	--test)
	# non-public option to run the unit tests
	shift; test
	;;

	--)
	shift
	INPUTS+=("$@")
	break ;;
	-\|-@*)
	# stdin requires a bit of dance since sqlite will receive the SQL
	# on its stdin, that means that data in the parent process stdin
	# needs to be redirected elsewhere. The approach is to create a named
	# pipe, spawn a background cat to feed it and then pass it to sqlite
	# for importing.
	tpath="$TEMP_DIR/stdin"
	mkfifo "$tpath"
	cat <&0 >"$tpath" &

	[ "$1" != "-" ] && tpath+="@${1#*@}"
	INPUTS+=("$tpath")
	QUERY=${QUERY:-";;"} # Avoid default of reading query from stdin
	shift ;;
	-*)
	die "Unknown option $1" ;;
	*)
	INPUTS+=("$1")
	shift ;;
	esac
	done

	[[ ! -f "$DATABASE" && ${#INPUTS[@]} -eq 0 ]] && \
	die "No data provided so nothing to do."

	if [[ -z $QUERY && ! -t 0 ]]; then
	QUERY="$(</dev/stdin)"
	fi

	sqlite=$( command -v sqlite3 \|\| true )
	if [ -z "$sqlite" ]; then
	die "Unable to find the sqlite3 command!"
	fi

	# Try to find something to handle the JSON format
	if command -v python3 >/dev/null; then
	trace "Enabling JSON support via python3"
	from_json=from_json_python3
	to_json=to_json_python3
	elif command -v python >/dev/null; then
	trace "Enabling JSON support via python"
	from_json=from_json_python
	to_json=to_json_python
	#TODO: Disabled until to_json_ruby is fixed
	# elif [[ $(check_ruby) == "ok" ]]; then
	# trace "Enabling JSON support via ruby"
	# from_json=from_json_ruby
	# to_json=to_json_ruby
	elif command -v jq >/dev/null; then
	trace "Enabling JSON support via jq"
	from_json=from_json_jq
	to_json=to_json_jq
	else
	trace "Disabling JSON support (neither jq nor python available)"
	from_json=json_unsupported
	to_json=json_unsupported
	fi

	# Preface SQL commands
	dql ".bail on"
	dql ".output stderr"

	if [ -f "$DATABASE" ]; then
	# We need to make a copy otherwise we can't save the openned DB
	tmp_db="$TEMP_DIR/$(basename "$DATABASE"))"
	cp "$DATABASE" "$tmp_db"
	trace_dql "Opening database from $DATABASE"
	dql ".open '$tmp_db'"
	fi

	for fname in "${INPUTS[@]}"; do
	format=""

	# check if we have an alias
	IFS='@' read -r -a parts <<< "$fname"
	if [ "${#parts[@]}" -eq "1" ]; then
	table="${fname##*/}"
	ext="${fname##*.}"
	if [ $ext == "gz" ] \|\| [ $ext == "bz2" ]; then
	table="${table%.*}"
	fi
	table="${table%.*}"
	else
	table="${parts[ ${#parts[@]}-1 ]}"
	unset 'parts[ ${#parts[@]}-1 ]'
	fname=$( IFS=@ ; echo "${parts[*]}" )

	# Now check if there is a format override
	IFS='.' read -r -a parts <<< "$table"
	if [ "${#parts[@]}" -gt "1" ]; then
	format="${parts[ ${#parts[@]}-1 ]}"
	unset 'parts[ ${#parts[@]}-1 ]'
	table=$( IFS=. ; echo "${parts[*]}" )
	fi
	fi

	[ -n "$table" ] \|\| die "No table name defined for $fname"

	table=$( echo -n $table \| tr '[:lower:]' '[:upper:]' )
	normalized=$( echo -n "$table" \| tr -cs '[:alnum:]' _ )
	if [[ "$normalized" != "$table" ]]; then
	warn "Renaming <$table> to <$normalized> (normalization)"
	table="$normalized"
	fi

	if [[ " ${RESERVED[@]} " =~ " ${table} " ]]; then
	warn "Renaming <$table> to <_$table> (reserved word)"
	table="_$table"
	fi

	[ -z "$format" ] \|\| trace "Forcing format $format for <$table>"

	trace_dql "Importing ${fname} at ${table}...'"
	import "$fname" "$table" "$format"
	done

	trace_dql "Dumping database schema:"
	[ -n "$VERBOSE" ] && dql ".schema"

	if [[ -n "$QUERY" && "$QUERY" != ";;" ]]; then
	# Try to infer the format from the output file
	if [ -z "$FORMAT" ]; then
	case "${OUTPUT##*.}" in
	csv\|tsv\|txt) FORMAT="${OUTPUT##*.}" ;;
	json*) FORMAT=json ;;
	sql*) FORMAT=sql ;;
	*) FORMAT=csv ;;
	esac
	fi

	case "$FORMAT" in
	csv) dql ".mode csv" ;;
	tsv) dql ".mode tabs" ;;
	txt) dql ".mode columns" ;;
	json) dql ".mode list"; dql ".separator \t \n"; dql ".nullvalue NULL" ;;
	sql) dql ".mode insert ${FORMAT#@}"; FORMAT=sql ;;
	*) die "Unsupported format $FORMAT" ;;
	esac
	trace_dql "Formatting results as ${FORMAT}"

	printf '%s;' "$QUERY" > "$TEMP_DIR/query.sql"
	trace_dql "Executing query..."
	dql ".output stdout"
	dql ".headers on"
	dql "BEGIN;"
	dql ".read '$TEMP_DIR/query.sql'"
	dql "COMMIT;"
	dql ".output stderr"
	fi

	if [ -n "$DATABASE" ]; then
	trace_dql "Saving database to $DATABASE"
	dql ".save '$DATABASE'"
	fi

	FORMAT="${FORMAT:-csv}"

	printf "%s\n" "${DQL[@]}" \
	\| "$sqlite" -batch \
	\| "format_$FORMAT" \
	> "$OUTPUT"

	}

	# JSON adapters

	from_json_jq() {
	jq -r --slurp '
	(.[0] \| keys_unsorted) as $keys
	\|
	$keys,
	(
	.[]
	\| [
	.[ $keys[] ] as $value
	\| if ($value \| [type] \| inside(["array","object"])) then ($value\|tojson) else $value end
	]
	)
	\| @csv'
	}

	to_json_jq() {
	jq -cR --slurp '
	split("\n")
	\| map( split("\t") )
	\| .[0] as $hdrs
	\| .[1:][]
	\| select(length>0)
	\| map( if length < 16 then tonumber? // . else . end )
	\| .[] \|= if . == "NULL" then null else . end
	\| [$hdrs, .]
	\| [ transpose[] \| {key: .[0], value: .[1]} ]
	\| from_entries'
	}

	from_json_python() {
	${1:-python} -c "
	import sys, csv, json
	writer = None
	for line in sys.stdin:
	data = dict(
	(k, json.dumps(v) if isinstance(v, (dict, list)) else v)
	for k,v in json.loads(line).items())

	if not writer:
	writer = csv.DictWriter(sys.stdout, list(data.keys()), extrasaction='ignore')
	writer.writeheader()

	writer.writerow(data)
	"
	}
	from_json_python3() {
	from_json_python python3
	}

	to_json_python() {
	${1:-python} -c "
	import sys, csv, json

	for row in csv.DictReader(sys.stdin, delimiter='\\t'):
	for hdr in row.keys():
	if row[hdr] == 'NULL':
	row[hdr] = None
	else:
	try:
	row[hdr] = float(row[hdr])
	except ValueError:
	pass

	print(json.dumps(row, sort_keys=True))
	"
	}
	to_json_python3() {
	to_json_python python3
	}

	# We require at least Ruby 1.9.3
	check_ruby() {
	if command -v ruby >/dev/null 2>&1; then
	local version=$(ruby --version \| cut -d' ' -f2 \| awk -F. '{ printf("%d%02d%02d\n", $1,$2,$3); }')
	[ $version -gt 10903 ] && echo "ok"
	fi
	}

	from_json_ruby() {
	ruby -e "
	require 'csv'
	require 'json'

	CSV(\$stdout) do \|csv\|
	headers = nil
	while line = gets
	row = JSON.parse line
	unless headers
	headers = row.keys.sort
	csv << headers
	end
	csv << headers.map do \|k\| row[k] end
	end
	end
	"
	}

	to_json_ruby() {
	#TODO: Breaks with quotes since sqlite doesn't quote fields in list mode
	ruby -e "
	require 'csv'
	require 'json'

	CSV.new(\$stdin, :headers => true, :col_sep => '\t').map do \|row\|
	data = row.to_hash
	data.each do \|k, v\|
	if v == 'NULL'
	data[k] = nil if v == 'NULL'
	elsif (Float(v) != nil rescue false)
	data[k] = v.to_f
	end
	end

	puts JSON.generate(data)
	end
	"
	}

	json_unsupported() {
	die "JSON support requires for either jq or python to be available"
	}


	## Unit Tests #############

	fail() {
	if ! [ -z "$1" ]; then
	errcho "[$0:$2] $1"
	[ -z "$3" ] \|\| echo "$3"
	fi
	exit 99
	}

	test() {
	echo "Unit Tests:"

	failed=0
	for fn in $(declare -F \| cut -d ' ' -f3); do
	[[ "$fn" =~ ^test_ ]] \|\| continue
	echo -n " - Test ${fn#*_} "
	{
	set +e

	cp "$0" "$TEMP_DIR/jqlite"
	cd "$TEMP_DIR"
	export PATH=".:$PATH"

	$( $fn >__stdout__ 2>__stderr__ )
	if [ $? != 0 ]; then
	echo "✗"
	for output in __stdout__ __stderr__; do
	if [ -s "$output" ]; then
	echo "=====[ ${output//_} ]====="
	cat "$output"
	fi
	done
	echo "===================="
	(( failed++ ))
	else
	echo "✓"
	fi

	rm -rf "$TEMP_DIR"/*
	cd - > /dev/null
	}
	done

	exit $failed
	}

	test_opt_help() {
	output=$(jqlite --help)
	[[ "$output" = "$(jqlite -h)" ]] \|\| fail "-h should be equal" $LINENO
	[[ "$output" = "Usage:" ]] \|\| fail "expected Usage string" $LINENO
	[[ "$output" = "Arguments:" ]] \|\| fail "expected Arguments section" $LINENO
	[[ "$output" = "Options:" ]] \|\| fail "expected Options section" $LINENO
	}

	test_opt_version() {
	output=$(jqlite --version)
	[[ "$output" = "$(jqlite -V)" ]] \|\| fail "-V should be equal" $LINENO
	[[ "$output" = "v$VERSION" ]] \|\| fail "expected version string" $LINENO "$output"
	}

	test_json_parse() {
	echo '{"foo":"Foo1", "bar":10}' > "test.json"
	echo '{"foo":"Foo2", "bar":20}' >> "test.json"

	output=$( echo 'SELECT * FROM test' \| jqlite test.json )
	[[ $(echo "$output" \| wc -l) -eq 3 ]] \|\| fail "expected 3 lines" $LINENO "$output"

	output=$( echo 'SELECT SUM(bar) FROM test' \| jqlite test.json )
	[[ "$output" = $'SUM(bar)\r\n30\r' ]] \|\| fail "sum a column" $LINENO "$output"
	}

	test_stdin() {
	output=$(
	echo '{"foo":"Foo1", "bar":10}' \
	\| jqlite -q 'SELECT bar FROM stdin' -@stdin.json
	)
	[[ "$output" = $'bar\r\n10\r' ]] \|\| fail "stdin" $LINENO "$output"
	}

	samples_urlbase='https://gist.githubusercontent.com/drslump/f018be6d5e00b9b12fdee11c21e79f96/raw'

	test_url_csv() {
	url="$samples_urlbase/cities.csv"
	q='SELECT COUNT(*) FROM cities'

	output=$( jqlite -q "$q" -f tsv $url )
	[[ "$output" = $'COUNT(*)\n128' ]] \|\| fail 'expected' $LINENO "$output"
	}

	test_url_csv_gz() {
	url="$samples_urlbase/cities.csv.gz"
	q='SELECT COUNT(*) FROM cities'

	output=$( jqlite -q "$q" -f tsv $url )
	[[ "$output" = $'COUNT(*)\n128' ]] \|\| fail 'expected' $LINENO "$output"
	}

	test_url_tsv() {
	url="$samples_urlbase/hurricanes.tsv"
	q='SELECT COUNT(*) FROM hurricanes'

	output=$( jqlite -q "$q" -f tsv $url )
	[[ "$output" = $'COUNT(*)\n8' ]] \|\| fail 'expected' $LINENO "$output"
	}

	test_url_tsv_bz2() {
	url="$samples_urlbase/hurricanes.tsv"
	q='SELECT COUNT(*) FROM hurricanes'

	output=$( jqlite -q "$q" -f tsv $url )
	[[ "$output" = $'COUNT(*)\n8' ]] \|\| fail 'expected' $LINENO "$output"
	}

	test_url_jsonl() {
	url="$samples_urlbase/tweets.jsonl"
	q='SELECT COUNT(*) FROM tweets'

	output=$( jqlite -q "$q" -f tsv $url )
	[[ "$output" = $'COUNT(*)\n4' ]] \|\| fail 'expected' $LINENO "$output"
	}

	test_url_alias() {
	url="$samples_urlbase/tweets.jsonl@alias"
	q='SELECT COUNT(*) FROM alias'

	output=$( jqlite -q "$q" -f tsv $url )
	[[ "$output" = $'COUNT(*)\n4' ]] \|\| fail 'expected' $LINENO "$output"
	}

	test_url_alias_format() {
	q='SELECT COUNT(*) FROM alias'

	output=$( jqlite -q "$q" -f tsv <(curl --silent $samples_urlbase/tweets.jsonl)@alias.json )
	[[ "$output" = $'COUNT(*)\n4' ]] \|\| fail 'expected' $LINENO "$output"
	}


	# Run the script
	main "$@"