jqlite: sql queries for tabular data
#!/usr/bin/env bash
#
# Copyright (C) 2018 Iván Montes <drslump@pollinimini.net>.
# This file is licensed under the MIT license.
#
# Requirements:
#
#  - Bash 3.x or higher
#  - sqlite3
#  - gunzip / bzip2 (for compressed input)
#  - jq or Python (for JSON format)
#  - curl (for URLs)
#
# Usage:
#   jqlite [-v] [-s DBFILE] [-q SQL] [-f FORMAT] [-o OUTPUT] [--] <file>...
#
# The SQL query can be piped over stdin or passed as the query argument.
# When the format is sql it can be suffixed with @<table> to specify which
# table the INSERT statements should reference, otherwise it defaults
# to <sql>.
#
# Arguments:
#   <file>  A CSV, TSV or JSON file, optionally compressed, or a URL. It can
#     be suffixed with @<alias> to override the name of the generated table;
#     otherwise the table is named after the filename without its extension.
#     '-' can be used to read data from stdin, which is made available in
#     a table named 'stdin'. In this mode the SQL can only be provided
#     via the --query option or a .sql file.
#
#     Additionally the alias can include a format, @<alias>.<format>,
#     which is extremely useful when using process substitution to provide
#     the data.
#
#     When a .db or .sqlite file is passed it's attached and made available
#     for queries under the prefix of its filename (or its alias).
#     .sql files are detected and executed regardless of the query
#     option. Note that they are executed in the order in which they were
#     given, so if they depend on other data files they should be
#     passed after them.
#
# Options:
#   -s, --save DBFILE     When set, creates a sqlite database file.
#   -f, --format FORMAT   Output format: CSV, TSV, TXT, JSON or SQL.
#   -o, --output OUTPUT   Where to store the output (defaults to stdout).
#   -q, --query SQL       The query to execute (- for stdin, the default).
#   -v, --verbose         Be verbose.
#   -h, --help            Show this message.
#   -V, --version         Display the version and exit.
#
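# Examples (illustrative, with hypothetical file names):
#
#   jqlite -q 'SELECT * FROM people LIMIT 10' people.csv
#   echo 'SELECT COUNT(*) FROM events' | jqlite events.jsonl.gz@events
#   jqlite -q 'SELECT * FROM t' -f sql@mytable -o dump.sql data.csv@t
#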
#
# Changelog:
#   0.2.5 (2020-04-22):
#    - Use python3 if available (better unicode support)
#
#   0.2.4 (2018-08-04):
#    - Allow forcing a format per input. Required for process substitution.
#    - Fix bug with URL handling and add multiple tests for it.
#    - Fix bug with stdin input being eagerly consumed.
#    - Preliminary support for JSON via Ruby 1.9.3+
#
#   0.2.0 (2018-08-04):
#    - `-d`/`--database` renamed to `-s`/`--save`
#
#   0.1.1 (2018-08-03):
#    - Fix a deadlock when using large compressed files as input
#    - Python is now the default over jq for handling JSON (better performance)
#
#   0.1.0 (2018-08-03):
#    - Fallback to Python for handling JSON
#
VERSION="0.2.5"
set -euo pipefail

# Handle the exit of the script from function calls
trap "exit 1" TERM
JQLITE_PID=$$
die() { errcho "ERROR:" "$@"; kill -s TERM $JQLITE_PID; exit 1; }

# Clean up on exit
TEMP_DIR=$(mktemp -d tmp-jqlite.XXXXX)
cleanup() { rv=$?; rm -rf "$TEMP_DIR"; exit $rv; }
trap "cleanup" exit

OUTPUT="/dev/stdout"
QUERY=""
DATABASE=""
FORMAT=""
VERBOSE=""
INPUTS=()
DQL=()
RESERVED=(ADD ALL ALTER AND AS AUTOINCREMENT BETWEEN CASE CHECK COLLATE COMMIT CONSTRAINT CREATE DEFAULT DEFERRABLE DELETE DISTINCT DROP ELSE ESCAPE EXCEPT EXISTS FOREIGN FROM GROUP HAVING IF IN INDEX INSERT INTERSECT INTO IS ISNULL JOIN LIMIT NOT NOTNULL NULL ON OR ORDER PRIMARY REFERENCES SELECT SET TABLE THEN TO TRANSACTION UNION UNIQUE UPDATE USING VALUES WHEN WHERE)

errcho() { >&2 echo "$@"; }
warn() { errcho "WARN:" "$@"; }
dql() { DQL+=("$1"); }
trace() { [ -n "$VERBOSE" ] && errcho "*" "$@" || true; }
trace_dql() { [ -n "$VERBOSE" ] && dql ".print '* $@'" || true; }
# Takes care of uncompressing or downloading files.
# Note that this function might spawn background commands to uncompress a file
# into a named fifo, to avoid creating large temporary files. This means we
# can't call it in a subshell as in $(preprocess xxx); instead we have to use
# a bashism, reading its output into a variable with process substitution.
# Otherwise the subshell would wait for the background task to complete,
# which it never does until the fifo is drained.
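#
# In short, instead of:
#
#   fpath=$(preprocess "$input")        # deadlocks waiting for the background
#                                       # writer that feeds the fifo
#
# callers must use:
#
#   read fpath < <(preprocess "$input")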
preprocess() {
    fpath="$1"
    ext="${fpath##*.}"
    if [[ "$fpath" = *"://"* ]]; then
        trace "Downloading $fpath"
        tdir=$(mktemp -d "$TEMP_DIR/XXXXX")
        (
            cd "$tdir"
            curl --silent --fail -L --remote-header-name --remote-name "$fpath" \
                || rm -f "$tdir"/*
        )
        tname=$( ls "$tdir" | head -1 )
        [ -n "$tname" ] || die "Unable to download $fpath"
        read retval < <(preprocess "$tdir/$tname")
        echo "$retval"
    elif [ "$ext" == "gz" ]; then
        [ -f "$fpath" ] || die "File not found $fpath"
        trace "Uncompressing $fpath"
        fname=$(basename "$fpath")
        tpath="$TEMP_DIR/${fname%.*}"
        mkfifo "$tpath"
        gunzip --stdout "$fpath" >"$tpath" 2>/dev/null &
        read retval < <(preprocess "$tpath")
        echo "$retval"
    elif [ "$ext" == "bz2" ]; then
        [ -f "$fpath" ] || die "File not found $fpath"
        trace "Uncompressing $fpath"
        fname=$(basename "$fpath")
        tpath="$TEMP_DIR/${fname%.*}"
        mkfifo "$tpath"
        bzip2 -d --stdout "$fpath" >"$tpath" 2>/dev/null &
        read retval < <(preprocess "$tpath")
        echo "$retval"
    else
        [ -p "$fpath" ] || [ -f "$fpath" ] || die "File not found $fpath"
        echo "$fpath"
    fi
}
# Generates import statements for a file based on its format
import() {
    read fpath < <(preprocess "$1")
    table="$2"
    format="${3:-}"

    if [ ! -p "$fpath" ] && [ -f "$fpath" ]; then
        [[ -s $fpath ]] || die "File is empty: $fpath"
    fi

    # Auto-detect format based on file extension
    if [ -z "$format" ]; then
        case "${fpath##*.}" in
            db|sqlite) format="sqlite" ;;
            sql) format="sql" ;;
            json|jsonl) format="json" ;;
            tsv) format="tsv" ;;
            *) format="csv" ;;
        esac
        trace "Assuming format <$format> for $fpath"
    fi

    case $format in
        db|sqlite)
            dql "ATTACH '$fpath' AS $table;"
            ;;
        sql)
            dql ".read '$fpath'"
            ;;
        json*)
            fname=$(basename "$fpath")
            tpath="$TEMP_DIR/${fname%.*}.csv"
            mkfifo "$tpath"
            cat "$fpath" | $from_json > "$tpath" &
            import "$tpath" "$table" "csv"
            ;;
        csv)
            dql "DROP TABLE IF EXISTS [$table];"
            dql ".mode csv"
            dql ".import '$fpath' $table"
            ;;
        tsv)
            dql "DROP TABLE IF EXISTS [$table];"
            dql ".mode tabs"
            dql ".import '$fpath' $table"
            ;;
        *)
            die "Unsupported format '$format'"
            ;;
    esac
}
# Output formatters
format_csv() { cat; }
format_tsv() { cat; }
format_txt() { cat; }
format_sql() { cat; }
format_json() { $to_json; }
main() {
    while [[ $# -gt 0 ]]
    do
        case "$1" in
            -s|--save)
                DATABASE="$2"
                shift; shift ;;
            -f|--format)
                FORMAT=$( echo "$2" | tr '[:upper:]' '[:lower:]' )
                shift; shift ;;
            -o|--output)
                OUTPUT="$2"
                shift; shift ;;
            -q|--query)
                if [ "$2" == "-" ]; then QUERY="$(</dev/stdin)"; else QUERY="$2"; fi
                shift; shift ;;
            -v|--verbose)
                VERBOSE="1"
                shift ;;
            -V|--version)
                echo "jqlite v$VERSION -- sql queries for tabular data"
                exit 0 ;;
            -h|--help)
                echo "jqlite v$VERSION -- sql queries for tabular data"
                awk "NR >= 13 && NR <= 49" "$0" | cut -c 3-
                exit 0 ;;
            --test)
                # non-public option to run the unit tests
                shift; test
                ;;
            --)
                shift
                INPUTS+=("$@")
                break ;;
            -|-@*)
                # stdin requires a bit of a dance since sqlite will receive the SQL
                # on its stdin, which means that data on the parent process stdin
                # needs to be redirected elsewhere. The approach is to create a named
                # pipe, spawn a background cat to feed it and then pass it to sqlite
                # for importing.
                tpath="$TEMP_DIR/stdin"
                mkfifo "$tpath"
                cat <&0 >"$tpath" &
                [ "$1" != "-" ] && tpath+="@${1#*@}"
                INPUTS+=("$tpath")
                QUERY=${QUERY:-";;"}  # Avoid default of reading query from stdin
                shift ;;
            -*)
                die "Unknown option $1" ;;
            *)
                INPUTS+=("$1")
                shift ;;
        esac
    done
    [[ ! -f "$DATABASE" && ${#INPUTS[@]} -eq 0 ]] && \
        die "No data provided so nothing to do."

    if [[ -z $QUERY && ! -t 0 ]]; then
        QUERY="$(</dev/stdin)"
    fi

    sqlite=$( command -v sqlite3 || true )
    if [ -z "$sqlite" ]; then
        die "Unable to find the sqlite3 command!"
    fi

    # Try to find something to handle the JSON format
    if command -v python3 >/dev/null; then
        trace "Enabling JSON support via python3"
        from_json=from_json_python3
        to_json=to_json_python3
    elif command -v python >/dev/null; then
        trace "Enabling JSON support via python"
        from_json=from_json_python
        to_json=to_json_python
    #TODO: Disabled until to_json_ruby is fixed
    # elif [[ $(check_ruby) == "ok" ]]; then
    #     trace "Enabling JSON support via ruby"
    #     from_json=from_json_ruby
    #     to_json=to_json_ruby
    elif command -v jq >/dev/null; then
        trace "Enabling JSON support via jq"
        from_json=from_json_jq
        to_json=to_json_jq
    else
        trace "Disabling JSON support (neither jq nor python available)"
        from_json=json_unsupported
        to_json=json_unsupported
    fi

    # Preface SQL commands
    dql ".bail on"
    dql ".output stderr"
    if [ -f "$DATABASE" ]; then
        # We need to make a copy, otherwise we can't save over the opened DB
        tmp_db="$TEMP_DIR/$(basename "$DATABASE")"
        cp "$DATABASE" "$tmp_db"
        trace_dql "Opening database from $DATABASE"
        dql ".open '$tmp_db'"
    fi
    for fname in "${INPUTS[@]}"; do
        format=""
        # check if we have an alias
        IFS='@' read -r -a parts <<< "$fname"
        if [ "${#parts[@]}" -eq "1" ]; then
            table="${fname##*/}"
            ext="${fname##*.}"
            if [ "$ext" == "gz" ] || [ "$ext" == "bz2" ]; then
                table="${table%.*}"
            fi
            table="${table%.*}"
        else
            table="${parts[ ${#parts[@]}-1 ]}"
            unset 'parts[ ${#parts[@]}-1 ]'
            fname=$( IFS=@ ; echo "${parts[*]}" )
            # Now check if there is a format override
            IFS='.' read -r -a parts <<< "$table"
            if [ "${#parts[@]}" -gt "1" ]; then
                format="${parts[ ${#parts[@]}-1 ]}"
                unset 'parts[ ${#parts[@]}-1 ]'
                table=$( IFS=. ; echo "${parts[*]}" )
            fi
        fi

        [ -n "$table" ] || die "No table name defined for $fname"

        table=$( echo -n "$table" | tr '[:lower:]' '[:upper:]' )
        normalized=$( echo -n "$table" | tr -cs '[:alnum:]' _ )
        if [[ "$normalized" != "$table" ]]; then
            warn "Renaming <$table> to <$normalized> (normalization)"
            table="$normalized"
        fi
        if [[ " ${RESERVED[@]} " =~ " ${table} " ]]; then
            warn "Renaming <$table> to <_$table> (reserved word)"
            table="_$table"
        fi

        [ -z "$format" ] || trace "Forcing format $format for <$table>"
        trace_dql "Importing ${fname} at ${table}..."
        import "$fname" "$table" "$format"
    done
    trace_dql "Dumping database schema:"
    [ -n "$VERBOSE" ] && dql ".schema"

    if [[ -n "$QUERY" && "$QUERY" != ";;" ]]; then
        # Try to infer the format from the output file
        if [ -z "$FORMAT" ]; then
            case "${OUTPUT##*.}" in
                csv|tsv|txt) FORMAT="${OUTPUT##*.}" ;;
                json*) FORMAT=json ;;
                sql*) FORMAT=sql ;;
                *) FORMAT=csv ;;
            esac
        fi

        case "$FORMAT" in
            csv) dql ".mode csv" ;;
            tsv) dql ".mode tabs" ;;
            txt) dql ".mode columns" ;;
            json) dql ".mode list"; dql ".separator \t \n"; dql ".nullvalue NULL" ;;
            sql*) dql ".mode insert ${FORMAT#*@}"; FORMAT=sql ;;
            *) die "Unsupported format $FORMAT" ;;
        esac
        trace_dql "Formatting results as ${FORMAT}"

        printf '%s;' "$QUERY" > "$TEMP_DIR/query.sql"

        trace_dql "Executing query..."
        dql ".output stdout"
        dql ".headers on"
        dql "BEGIN;"
        dql ".read '$TEMP_DIR/query.sql'"
        dql "COMMIT;"
        dql ".output stderr"
    fi

    if [ -n "$DATABASE" ]; then
        trace_dql "Saving database to $DATABASE"
        dql ".save '$DATABASE'"
    fi

    FORMAT="${FORMAT:-csv}"
    printf "%s\n" "${DQL[@]}" \
        | "$sqlite" -batch \
        | "format_$FORMAT" \
        > "$OUTPUT"
}
# JSON adapters
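#
# Each from_json adapter converts JSON-lines input into CSV that sqlite can
# import, serializing nested arrays/objects as JSON strings. For example
# (illustrative), the input line
#
#   {"name":"ann","tags":["x","y"]}
#
# becomes
#
#   "name","tags"
#   "ann","[""x"",""y""]"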
from_json_jq() {
    jq -r --slurp '
        (.[0] | keys_unsorted) as $keys
        |
        $keys,
        (
            .[]
            | [
                .[ $keys[] ] as $value
                | if ($value | [type] | inside(["array","object"])) then ($value|tojson) else $value end
            ]
        )
        | @csv'
}
to_json_jq() {
    jq -cR --slurp '
        split("\n")
        | map( split("\t") )
        | .[0] as $hdrs
        | .[1:][]
        | select(length>0)
        | map( if length < 16 then tonumber? // . else . end )
        | .[] |= if . == "NULL" then null else . end
        | [$hdrs, .]
        | [ transpose[] | {key: .[0], value: .[1]} ]
        | from_entries'
}
from_json_python() {
    ${1:-python} -c "
import sys, csv, json

writer = None
for line in sys.stdin:
    data = dict(
        (k, json.dumps(v) if isinstance(v, (dict, list)) else v)
        for k,v in json.loads(line).items())
    if not writer:
        writer = csv.DictWriter(sys.stdout, list(data.keys()), extrasaction='ignore')
        writer.writeheader()
    writer.writerow(data)
"
}

from_json_python3() {
    from_json_python python3
}

to_json_python() {
    ${1:-python} -c "
import sys, csv, json

for row in csv.DictReader(sys.stdin, delimiter='\\t'):
    for hdr in row.keys():
        if row[hdr] == 'NULL':
            row[hdr] = None
        else:
            try:
                row[hdr] = float(row[hdr])
            except ValueError:
                pass
    print(json.dumps(row, sort_keys=True))
"
}

to_json_python3() {
    to_json_python python3
}
# We require at least Ruby 1.9.3
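# The version is packed into a comparable integer, e.g. 1.9.3 -> 10903 and
# 2.6.5 -> 20605.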
check_ruby() {
    if command -v ruby >/dev/null 2>&1; then
        local version=$(ruby --version | cut -d' ' -f2 | awk -F. '{ printf("%d%02d%02d\n", $1,$2,$3); }')
        [ $version -ge 10903 ] && echo "ok"
    fi
}
from_json_ruby() {
    ruby -e "
        require 'csv'
        require 'json'
        CSV(\$stdout) do |csv|
            headers = nil
            while line = gets
                row = JSON.parse line
                unless headers
                    headers = row.keys.sort
                    csv << headers
                end
                csv << headers.map { |k| row[k] }
            end
        end
    "
}
to_json_ruby() {
    #TODO: Breaks with quotes since sqlite doesn't quote fields in list mode
    ruby -e "
        require 'csv'
        require 'json'
        CSV.new(\$stdin, :headers => true, :col_sep => '\t').map do |row|
            data = row.to_hash
            data.each do |k, v|
                if v == 'NULL'
                    data[k] = nil
                elsif (Float(v) != nil rescue false)
                    data[k] = v.to_f
                end
            end
            puts JSON.generate(data)
        end
    "
}

json_unsupported() {
    die "JSON support requires either jq or python to be available"
}
## Unit Tests #############

fail() {
    if [ -n "$1" ]; then
        errcho "[$0:$2] $1"
        [ -z "${3:-}" ] || echo "$3"
    fi
    exit 99
}

test() {
    echo "Unit Tests:"
    failed=0
    for fn in $(declare -F | cut -d ' ' -f3); do
        [[ "$fn" =~ ^test_ ]] || continue
        echo -n " - Test ${fn#*_} "
        {
            set +e
            cp "$0" "$TEMP_DIR/jqlite"
            cd "$TEMP_DIR"
            export PATH=".:$PATH"
            ( $fn >__stdout__ 2>__stderr__ )
            if [ $? != 0 ]; then
                echo "✗"
                for output in __stdout__ __stderr__; do
                    if [ -s "$output" ]; then
                        echo "=====[ ${output//_} ]====="
                        cat "$output"
                    fi
                done
                echo "===================="
                (( failed++ ))
            else
                echo "✓"
            fi
            rm -rf "$TEMP_DIR"/*
            cd - > /dev/null
        }
    done
    exit $failed
}
test_opt_help() {
    output=$(jqlite --help)
    [[ "$output" = "$(jqlite -h)" ]] || fail "-h should be equal" $LINENO
    [[ "$output" = *"Usage:"* ]] || fail "expected Usage string" $LINENO
    [[ "$output" = *"Arguments:"* ]] || fail "expected Arguments section" $LINENO
    [[ "$output" = *"Options:"* ]] || fail "expected Options section" $LINENO
}

test_opt_version() {
    output=$(jqlite --version)
    [[ "$output" = "$(jqlite -V)" ]] || fail "-V should be equal" $LINENO
    [[ "$output" = *"v$VERSION"* ]] || fail "expected version string" $LINENO "$output"
}

test_json_parse() {
    echo '{"foo":"Foo1", "bar":10}' > "test.json"
    echo '{"foo":"Foo2", "bar":20}' >> "test.json"
    output=$( echo 'SELECT * FROM test' | jqlite test.json )
    [[ $(echo "$output" | wc -l) -eq 3 ]] || fail "expected 3 lines" $LINENO "$output"
    output=$( echo 'SELECT SUM(bar) FROM test' | jqlite test.json )
    [[ "$output" = $'SUM(bar)\r\n30\r' ]] || fail "sum a column" $LINENO "$output"
}

test_stdin() {
    output=$(
        echo '{"foo":"Foo1", "bar":10}' \
        | jqlite -q 'SELECT bar FROM stdin' -@stdin.json
    )
    [[ "$output" = $'bar\r\n10\r' ]] || fail "stdin" $LINENO "$output"
}

samples_urlbase='https://gist.githubusercontent.com/drslump/f018be6d5e00b9b12fdee11c21e79f96/raw'

test_url_csv() {
    url="$samples_urlbase/cities.csv"
    q='SELECT COUNT(*) FROM cities'
    output=$( jqlite -q "$q" -f tsv $url )
    [[ "$output" = $'COUNT(*)\n128' ]] || fail 'expected' $LINENO "$output"
}

test_url_csv_gz() {
    url="$samples_urlbase/cities.csv.gz"
    q='SELECT COUNT(*) FROM cities'
    output=$( jqlite -q "$q" -f tsv $url )
    [[ "$output" = $'COUNT(*)\n128' ]] || fail 'expected' $LINENO "$output"
}

test_url_tsv() {
    url="$samples_urlbase/hurricanes.tsv"
    q='SELECT COUNT(*) FROM hurricanes'
    output=$( jqlite -q "$q" -f tsv $url )
    [[ "$output" = $'COUNT(*)\n8' ]] || fail 'expected' $LINENO "$output"
}

test_url_tsv_bz2() {
url="$samples_urlbase/hurricanes.tsv" | |
    q='SELECT COUNT(*) FROM hurricanes'
    output=$( jqlite -q "$q" -f tsv $url )
    [[ "$output" = $'COUNT(*)\n8' ]] || fail 'expected' $LINENO "$output"
}

test_url_jsonl() {
    url="$samples_urlbase/tweets.jsonl"
    q='SELECT COUNT(*) FROM tweets'
    output=$( jqlite -q "$q" -f tsv $url )
    [[ "$output" = $'COUNT(*)\n4' ]] || fail 'expected' $LINENO "$output"
}

test_url_alias() {
    url="$samples_urlbase/tweets.jsonl@alias"
    q='SELECT COUNT(*) FROM alias'
    output=$( jqlite -q "$q" -f tsv $url )
    [[ "$output" = $'COUNT(*)\n4' ]] || fail 'expected' $LINENO "$output"
}

test_url_alias_format() {
    q='SELECT COUNT(*) FROM alias'
    output=$( jqlite -q "$q" -f tsv <(curl --silent $samples_urlbase/tweets.jsonl)@alias.json )
    [[ "$output" = $'COUNT(*)\n4' ]] || fail 'expected' $LINENO "$output"
}

# Run the script
main "$@"