Skip to content

Instantly share code, notes, and snippets.

@drslump
Last active April 22, 2020 13:07
Show Gist options
  • Save drslump/cdacaa368a1d63bab717efd3414b4467 to your computer and use it in GitHub Desktop.
Save drslump/cdacaa368a1d63bab717efd3414b4467 to your computer and use it in GitHub Desktop.
jqlite: sql queries for tabular data
#!/usr/bin/env bash
#
# Copyright (C) 2018 Iván Montes <drslump@pollinimini.net>.
# This file is licensed under the MIT license.
#
# Requirements:
#
# - Bash 3.x or higher
# - sqlite3
# - gunzip / bzip2 (for compressed input)
# - jq or Python (for JSON format)
# - curl (for URLs)
#
# Usage:
# jqlite [-v] [-s DBFILE] [-q SQL] [-f FORMAT] [-o OUTPUT] [--] <file>...
#
# The SQL query can be piped over stdin or as the query argument.
# When format is sql it can be suffixed with @<table> to specify which
# table should the insert statements reference, otherwise it defaults
# to <sql>.
#
# Arguments:
# <file> A CSV, TSV, JSON file, optionally compressed, or URL. It can be
# suffixed with @<alias> to override the name of the generated table.
# Otherwise the table will reflect the filename without the extension.
# '-' can be used to read data from stdin which will be available on
# a table named 'stdin'. In this mode any SQL can only be provided
# via the --query option or a .sql file.
#
# Additionally the aliasing can include a format @<alias>.<format>,
# which is extremely useful when using process substitution to provide
# the data.
#
# When a .db or .sqlite file is passed it's attached and available for
# queries under the prefix of its filename (or its override).
# Also .sql files are detected and executed regardless of the query
# option. Note that they are executed in the order in which they were
# given, so if they depend on previous data files they should be
# passed after them.
#
# Options:
# -s, --save DBFILE When set creates a sqlite database file.
# -f, --format FORMAT Outputs on CSV, TSV, TXT, JSON, SQL.
# -o, --output OUTPUT Where to store the output (defaults to stdout).
# -q, --query SQL What query to execute (- for stdin, default).
# -v, --verbose Be verbose.
# -h, --help Show this message.
# -V, --version Display the version and exit.
#
#
# Changelog:
# 0.2.5 (2020-04-22):
# - Use python3 if available (better unicode support)
#
# 0.2.4 (2018-08-04):
# - Allow to force a format per input. Required for process substitution.
# - Fix bug with url handling and added multiple tests for it.
# - Fix bug with stdin input being eagerly consumed.
# - Preliminary support for JSON via Ruby 1.9.3+
#
# 0.2.0 (2018-08-04):
# - `-d`/`--database` renamed to `-s`/`--save`
#
# 0.1.1 (2018-08-03):
# - Fix a deadlock when using large compressed files as input
# - Python is now default over jq for handling JSON (better performance)
#
# 0.1.0 (2018-08-03):
# - Fallback to Python for handling JSON
#
VERSION="0.2.4"
set -euo pipefail
# Handle the exit of the script from function calls
trap "exit 1" TERM
JQLITE_PID=$$
die() { errcho "ERROR:" "$@"; kill -s TERM $JQLITE_PID; exit 1; }
# Clean up on exit
TEMP_DIR=$(mktemp -d tmp-jqlite.XXXXX)
cleanup() { rv=$?; rm -rf "$TEMP_DIR"; exit $rv; }
trap "cleanup" exit
OUTPUT="/dev/stdout"
QUERY=""
DATABASE=""
FORMAT=""
VERBOSE=""
INPUTS=()
DQL=()
RESERVED=(ADD ALL ALTER AND AS AUTOINCREMENT BETWEEN CASE CHECK COLLATE COMMIT CONSTRAINT CREATE DEFAULT DEFERRABLE DELETE DISTINCT DROP ELSE ESCAPE EXCEPT EXISTS FOREIGN FROM GROUP HAVING IF IN INDEX INSERT INTERSECT INTO IS ISNULL JOIN LIMIT NOT NOTNULL NULL ON OR ORDER PRIMARY REFERENCES SELECT SET TABLE THEN TO TRANSACTION UNION UNIQUE UPDATE USING VALUES WHEN WHERE)
errcho() { >&2 echo "$@"; }
warn() { errcho "WARN:" "$@"; }
dql() { DQL+=("$1"); }
trace() { [ -n "$VERBOSE" ] && errcho "*" "$@" || true; }
trace_dql() { [ -n "$VERBOSE" ] && dql ".print '* $@'" || true; }
# Takes care of uncompressing or downloading files.
# Note that this function might spawn background commands to uncompress a file
# into a named fifo to avoid creating large temporary files. However this means
# that we can't call it in a subshell like in $(preprocess xxx), instead we
# have to use a bashism by reading its output into a variable with process
# substitution, otherwise the subshell will wait until the background task
# completes (which it'll never do).
preprocess() {
fpath="$1"
ext="${fpath##*.}"
if [[ "$fpath" = *"://"* ]]; then
trace "Downloading $fpath"
tdir=$(mktemp -d "$TEMP_DIR/XXXXX")
(
cd "$tdir"
curl --silent --fail -L --remote-header-name --remote-name "$fpath" \
|| rm -f "$tdir/*"
)
tname=$( ls "$tdir" | head -1 )
[ -n "$tname" ] || die "Unable to download $fpath"
read retval < <(preprocess "$tdir/$tname")
echo "$retval"
elif [ $ext == "gz" ]; then
[ -f "$fpath" ] || die "File not found $fpath"
trace "Uncompressing $fpath"
fname=$(basename "$fpath")
tpath="$TEMP_DIR/${fname%.*}"
mkfifo "$tpath"
gunzip --stdout "$fpath" >"$tpath" 2>/dev/null &
read retval < <(preprocess "$tpath")
echo "$retval"
elif [ $ext == "bz2" ]; then
[ -f "$fpath" ] || die "File not found $fpath"
trace "Uncompressing $fpath"
fname=$(basename "$fpath")
tpath="$TEMP_DIR/${fname%.*}"
mkfifo "$tpath"
bzip2 -d --stdout "$fpath" >"$tpath" 2>/dev/null &
read retval < <(preprocess "$tpath")
echo "$retval"
else
[ -p "$fpath" ] || [ -f "$fpath" ] || die "File not found $fpath"
echo "$fpath"
fi
}
# Generates import statements for a file based on its format
import() {
read fpath < <(preprocess "$1")
table="$2"
format="${3:-}"
if [ ! -p "$fpath" ] && [ -f "$fpath" ]; then
[[ -s $fpath ]] || die "File is empty: $fpath"
fi
# Auto-detect format based on file extension
if [ -z "$format" ]; then
case "${fpath##*.}" in
db|sqlite) format="sqlite" ;;
sql) format="sql" ;;
json|jsonl) format="json" ;;
tsv) format="tsv" ;;
*) format="csv" ;;
esac
trace "Assuming format <$format> for $fpath"
fi
case $format in
db|sqlite)
dql "ATTACH '$fpath' AS $table;"
;;
sql)
dql ".read '$fpath'"
;;
json*)
fname=$(basename "$fpath")
tpath="$TEMP_DIR/${fname%.*}.csv"
mkfifo "$tpath"
cat "$fpath" | $from_json > "$tpath" &
import "$tpath" "$table" "csv"
;;
csv)
dql "DROP TABLE IF EXISTS [$table];"
dql ".mode csv"
dql ".import '$fpath' $table"
;;
tsv)
dql "DROP TABLE IF EXISTS [$table];"
dql ".mode tabs"
dql ".import '$fpath' $table"
;;
*)
die "Unsupported format '$format'"
;;
esac
}
# Ouput formatters
format_csv() { cat; }
format_tsv() { cat; }
format_txt() { cat; }
format_sql() { cat; }
format_json() { $to_json; }
main() {
while [[ $# -gt 0 ]]
do
case "$1" in
-s|--save)
DATABASE="$2"
shift; shift ;;
-f|--format)
FORMAT=$( echo "$2" | tr '[:upper:]' '[:lower:]' )
shift; shift ;;
-o|--output)
OUTPUT="$2"
shift; shift ;;
-q|--query)
if [ "$2" == "-" ]; then QUERY="$(</dev/stdin)"; else QUERY="$2"; fi
shift; shift ;;
-v|--verbose)
VERBOSE="1"
shift ;;
-V|--version)
echo "jqlite v$VERSION -- sql queries for tabular data"
exit 1 ;;
-h|--help)
echo "jqlite v$VERSION -- sql queries for tabular data"
awk "NR >= 13 && NR <= 49" "$0" | cut -c 3-
exit 0 ;;
--test)
# non-public option to run the unit tests
shift; test
;;
--)
shift
INPUTS+=("$@")
break ;;
-|-@*)
# stdin requires a bit of dance since sqlite will receive the SQL
# on its stdin, that means that data in the parent process stdin
# needs to be redirected elsewhere. The approach is to create a named
# pipe, spawn a background cat to feed it and then pass it to sqlite
# for importing.
tpath="$TEMP_DIR/stdin"
mkfifo "$tpath"
cat <&0 >"$tpath" &
[ "$1" != "-" ] && tpath+="@${1#*@}"
INPUTS+=("$tpath")
QUERY=${QUERY:-";;"} # Avoid default of reading query from stdin
shift ;;
-*)
die "Unknown option $1" ;;
*)
INPUTS+=("$1")
shift ;;
esac
done
[[ ! -f "$DATABASE" && ${#INPUTS[@]} -eq 0 ]] && \
die "No data provided so nothing to do."
if [[ -z $QUERY && ! -t 0 ]]; then
QUERY="$(</dev/stdin)"
fi
sqlite=$( command -v sqlite3 || true )
if [ -z "$sqlite" ]; then
die "Unable to find the sqlite3 command!"
fi
# Try to find something to handle the JSON format
if command -v python3 >/dev/null; then
trace "Enabling JSON support via python3"
from_json=from_json_python3
to_json=to_json_python3
elif command -v python >/dev/null; then
trace "Enabling JSON support via python"
from_json=from_json_python
to_json=to_json_python
#TODO: Disabled until to_json_ruby is fixed
# elif [[ $(check_ruby) == "ok" ]]; then
# trace "Enabling JSON support via ruby"
# from_json=from_json_ruby
# to_json=to_json_ruby
elif command -v jq >/dev/null; then
trace "Enabling JSON support via jq"
from_json=from_json_jq
to_json=to_json_jq
else
trace "Disabling JSON support (neither jq nor python available)"
from_json=json_unsupported
to_json=json_unsupported
fi
# Preface SQL commands
dql ".bail on"
dql ".output stderr"
if [ -f "$DATABASE" ]; then
# We need to make a copy otherwise we can't save the openned DB
tmp_db="$TEMP_DIR/$(basename "$DATABASE"))"
cp "$DATABASE" "$tmp_db"
trace_dql "Opening database from $DATABASE"
dql ".open '$tmp_db'"
fi
for fname in "${INPUTS[@]}"; do
format=""
# check if we have an alias
IFS='@' read -r -a parts <<< "$fname"
if [ "${#parts[@]}" -eq "1" ]; then
table="${fname##*/}"
ext="${fname##*.}"
if [ $ext == "gz" ] || [ $ext == "bz2" ]; then
table="${table%.*}"
fi
table="${table%.*}"
else
table="${parts[ ${#parts[@]}-1 ]}"
unset 'parts[ ${#parts[@]}-1 ]'
fname=$( IFS=@ ; echo "${parts[*]}" )
# Now check if there is a format override
IFS='.' read -r -a parts <<< "$table"
if [ "${#parts[@]}" -gt "1" ]; then
format="${parts[ ${#parts[@]}-1 ]}"
unset 'parts[ ${#parts[@]}-1 ]'
table=$( IFS=. ; echo "${parts[*]}" )
fi
fi
[ -n "$table" ] || die "No table name defined for $fname"
table=$( echo -n $table | tr '[:lower:]' '[:upper:]' )
normalized=$( echo -n "$table" | tr -cs '[:alnum:]' _ )
if [[ "$normalized" != "$table" ]]; then
warn "Renaming <$table> to <$normalized> (normalization)"
table="$normalized"
fi
if [[ " ${RESERVED[@]} " =~ " ${table} " ]]; then
warn "Renaming <$table> to <_$table> (reserved word)"
table="_$table"
fi
[ -z "$format" ] || trace "Forcing format $format for <$table>"
trace_dql "Importing ${fname} at ${table}...'"
import "$fname" "$table" "$format"
done
trace_dql "Dumping database schema:"
[ -n "$VERBOSE" ] && dql ".schema"
if [[ -n "$QUERY" && "$QUERY" != ";;" ]]; then
# Try to infer the format from the output file
if [ -z "$FORMAT" ]; then
case "${OUTPUT##*.}" in
csv|tsv|txt) FORMAT="${OUTPUT##*.}" ;;
json*) FORMAT=json ;;
sql*) FORMAT=sql ;;
*) FORMAT=csv ;;
esac
fi
case "$FORMAT" in
csv) dql ".mode csv" ;;
tsv) dql ".mode tabs" ;;
txt) dql ".mode columns" ;;
json) dql ".mode list"; dql ".separator \t \n"; dql ".nullvalue NULL" ;;
sql*) dql ".mode insert ${FORMAT#*@}"; FORMAT=sql ;;
*) die "Unsupported format $FORMAT" ;;
esac
trace_dql "Formatting results as ${FORMAT}"
printf '%s;' "$QUERY" > "$TEMP_DIR/query.sql"
trace_dql "Executing query..."
dql ".output stdout"
dql ".headers on"
dql "BEGIN;"
dql ".read '$TEMP_DIR/query.sql'"
dql "COMMIT;"
dql ".output stderr"
fi
if [ -n "$DATABASE" ]; then
trace_dql "Saving database to $DATABASE"
dql ".save '$DATABASE'"
fi
FORMAT="${FORMAT:-csv}"
printf "%s\n" "${DQL[@]}" \
| "$sqlite" -batch \
| "format_$FORMAT" \
> "$OUTPUT"
}
# JSON adapters
from_json_jq() {
jq -r --slurp '
(.[0] | keys_unsorted) as $keys
|
$keys,
(
.[]
| [
.[ $keys[] ] as $value
| if ($value | [type] | inside(["array","object"])) then ($value|tojson) else $value end
]
)
| @csv'
}
to_json_jq() {
jq -cR --slurp '
split("\n")
| map( split("\t") )
| .[0] as $hdrs
| .[1:][]
| select(length>0)
| map( if length < 16 then tonumber? // . else . end )
| .[] |= if . == "NULL" then null else . end
| [$hdrs, .]
| [ transpose[] | {key: .[0], value: .[1]} ]
| from_entries'
}
from_json_python() {
${1:-python} -c "
import sys, csv, json
writer = None
for line in sys.stdin:
data = dict(
(k, json.dumps(v) if isinstance(v, (dict, list)) else v)
for k,v in json.loads(line).items())
if not writer:
writer = csv.DictWriter(sys.stdout, list(data.keys()), extrasaction='ignore')
writer.writeheader()
writer.writerow(data)
"
}
from_json_python3() {
from_json_python python3
}
to_json_python() {
${1:-python} -c "
import sys, csv, json
for row in csv.DictReader(sys.stdin, delimiter='\\t'):
for hdr in row.keys():
if row[hdr] == 'NULL':
row[hdr] = None
else:
try:
row[hdr] = float(row[hdr])
except ValueError:
pass
print(json.dumps(row, sort_keys=True))
"
}
to_json_python3() {
to_json_python python3
}
# We require at least Ruby 1.9.3
check_ruby() {
if command -v ruby >/dev/null 2>&1; then
local version=$(ruby --version | cut -d' ' -f2 | awk -F. '{ printf("%d%02d%02d\n", $1,$2,$3); }')
[ $version -gt 10903 ] && echo "ok"
fi
}
from_json_ruby() {
ruby -e "
require 'csv'
require 'json'
CSV(\$stdout) do |csv|
headers = nil
while line = gets
row = JSON.parse line
unless headers
headers = row.keys.sort
csv << headers
end
csv << headers.map do |k| row[k] end
end
end
"
}
to_json_ruby() {
#TODO: Breaks with quotes since sqlite doesn't quote fields in list mode
ruby -e "
require 'csv'
require 'json'
CSV.new(\$stdin, :headers => true, :col_sep => '\t').map do |row|
data = row.to_hash
data.each do |k, v|
if v == 'NULL'
data[k] = nil if v == 'NULL'
elsif (Float(v) != nil rescue false)
data[k] = v.to_f
end
end
puts JSON.generate(data)
end
"
}
json_unsupported() {
die "JSON support requires for either jq or python to be available"
}
## Unit Tests #############
fail() {
if ! [ -z "$1" ]; then
errcho "[$0:$2] $1"
[ -z "$3" ] || echo "$3"
fi
exit 99
}
test() {
echo "Unit Tests:"
failed=0
for fn in $(declare -F | cut -d ' ' -f3); do
[[ "$fn" =~ ^test_ ]] || continue
echo -n " - Test ${fn#*_} "
{
set +e
cp "$0" "$TEMP_DIR/jqlite"
cd "$TEMP_DIR"
export PATH=".:$PATH"
$( $fn >__stdout__ 2>__stderr__ )
if [ $? != 0 ]; then
echo "✗"
for output in __stdout__ __stderr__; do
if [ -s "$output" ]; then
echo "=====[ ${output//_} ]====="
cat "$output"
fi
done
echo "===================="
(( failed++ ))
else
echo "✓"
fi
rm -rf "$TEMP_DIR"/*
cd - > /dev/null
}
done
exit $failed
}
test_opt_help() {
output=$(jqlite --help)
[[ "$output" = "$(jqlite -h)" ]] || fail "-h should be equal" $LINENO
[[ "$output" = *"Usage:"* ]] || fail "expected Usage string" $LINENO
[[ "$output" = *"Arguments:"* ]] || fail "expected Arguments section" $LINENO
[[ "$output" = *"Options:"* ]] || fail "expected Options section" $LINENO
}
test_opt_version() {
output=$(jqlite --version)
[[ "$output" = "$(jqlite -V)" ]] || fail "-V should be equal" $LINENO
[[ "$output" = *"v$VERSION"* ]] || fail "expected version string" $LINENO "$output"
}
test_json_parse() {
echo '{"foo":"Foo1", "bar":10}' > "test.json"
echo '{"foo":"Foo2", "bar":20}' >> "test.json"
output=$( echo 'SELECT * FROM test' | jqlite test.json )
[[ $(echo "$output" | wc -l) -eq 3 ]] || fail "expected 3 lines" $LINENO "$output"
output=$( echo 'SELECT SUM(bar) FROM test' | jqlite test.json )
[[ "$output" = $'SUM(bar)\r\n30\r' ]] || fail "sum a column" $LINENO "$output"
}
test_stdin() {
output=$(
echo '{"foo":"Foo1", "bar":10}' \
| jqlite -q 'SELECT bar FROM stdin' -@stdin.json
)
[[ "$output" = $'bar\r\n10\r' ]] || fail "stdin" $LINENO "$output"
}
samples_urlbase='https://gist.githubusercontent.com/drslump/f018be6d5e00b9b12fdee11c21e79f96/raw'
test_url_csv() {
url="$samples_urlbase/cities.csv"
q='SELECT COUNT(*) FROM cities'
output=$( jqlite -q "$q" -f tsv $url )
[[ "$output" = $'COUNT(*)\n128' ]] || fail 'expected' $LINENO "$output"
}
test_url_csv_gz() {
url="$samples_urlbase/cities.csv.gz"
q='SELECT COUNT(*) FROM cities'
output=$( jqlite -q "$q" -f tsv $url )
[[ "$output" = $'COUNT(*)\n128' ]] || fail 'expected' $LINENO "$output"
}
test_url_tsv() {
url="$samples_urlbase/hurricanes.tsv"
q='SELECT COUNT(*) FROM hurricanes'
output=$( jqlite -q "$q" -f tsv $url )
[[ "$output" = $'COUNT(*)\n8' ]] || fail 'expected' $LINENO "$output"
}
test_url_tsv_bz2() {
url="$samples_urlbase/hurricanes.tsv"
q='SELECT COUNT(*) FROM hurricanes'
output=$( jqlite -q "$q" -f tsv $url )
[[ "$output" = $'COUNT(*)\n8' ]] || fail 'expected' $LINENO "$output"
}
test_url_jsonl() {
url="$samples_urlbase/tweets.jsonl"
q='SELECT COUNT(*) FROM tweets'
output=$( jqlite -q "$q" -f tsv $url )
[[ "$output" = $'COUNT(*)\n4' ]] || fail 'expected' $LINENO "$output"
}
test_url_alias() {
url="$samples_urlbase/tweets.jsonl@alias"
q='SELECT COUNT(*) FROM alias'
output=$( jqlite -q "$q" -f tsv $url )
[[ "$output" = $'COUNT(*)\n4' ]] || fail 'expected' $LINENO "$output"
}
test_url_alias_format() {
q='SELECT COUNT(*) FROM alias'
output=$( jqlite -q "$q" -f tsv <(curl --silent $samples_urlbase/tweets.jsonl)@alias.json )
[[ "$output" = $'COUNT(*)\n4' ]] || fail 'expected' $LINENO "$output"
}
# Run the script
main "$@"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment