Last active
April 18, 2022 19:24
-
-
Save ernstki/91b427d6714cdd4dd6560e5b4fb961f4 to your computer and use it in GitHub Desktop.
Fetch SNP coordinates from the UCSC MySQL server
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash | |
# | |
# Script to query UCSC MySQL server for SNP coordinates (but could easily be | |
# repurposed to query any arbitrary database/table) | |
# | |
# Author: Kevin Ernst | |
# Date: 2 March 2019; updated 30 August 2021 | |
# Source: https://gist.github.com/ernstki/91b427d6714cdd4dd6560e5b4fb961f4 | |
# License: MIT | |
# | |
# shellcheck disable=SC1117 | |
# TODO: number of simultaneous requests (GNU parallel tasks) | |
#MAX_SIMULT_REQS=5 | |
# set TRACE=1 in the environment to enable 'set -x' for the body of the script | |
# useful for debugging; ref. https://tinyurl.com/lbwcd54 (Bash manual) | |
TRACE=${TRACE:-} | |
ME=$( basename "${BASH_SOURCE[0]}" ) | |
# UCSC MySQL database credentials; | |
# refer to https://genome.ucsc.edu/goldenPath/help/mysql.html | |
DB_HOST=${GETSNPCOORDS_DB_HOST:-genome-mysql.soe.ucsc.edu} | |
DB_USER=${GETSNPCOORDS_DB_USER:-genome} | |
# intentionally blank; the password for the 'genome' user is empty | |
DB_PASSWORD=${GETSNPCOORDS_DB_PASSWORD:-} | |
DEFAULT_DB_SCHEMA=${GETSNPCOORDS_DB_SCHEMA:-hg19} | |
DEFAULT_DB_TABLE=${GETSNPCOORDS_DB_TABLE:-snp142} | |
QUERY_COLUMNS=${GETSNPCOORDS_QUERY_COLUMNS:-chrom,chromStart,chromEnd,name} | |
QUERY_MATCH_FIELD=${GETSNPCOORDS_QUERY_MATCH_FIELD:-name} | |
QUERY_ORDER_BY=${GETSNPCOORDS_QUERY_ORDER_BY:-chrom,chromStart ASC} | |
ISSUES_URL='https://gist.github.com/ernstki/91b427d6714cdd4dd6560e5b4fb961f4' | |
# only *if* stdout is a terminal, use bold/underline font in help/messages | |
if [[ -t 1 ]]; then | |
BOLD=$(tput bold) | |
RESET=$(tput sgr0) | |
UL=$(tput sgr 0 1) | |
else | |
BOLD=; RESET=; UL= | |
fi | |
USAGE="\ | |
${UL}usage${RESET}: | |
$ME [-h|--help] | |
$ME [options] SNPID [SNPID...] | |
$ME [options] < SNPLIST" | |
HELP=" | |
$BOLD$ME$RESET - query SNP IDs from the UCSC MySQL database server | |
$USAGE | |
${UL}options${RESET}: | |
-d, --database DB use database DB instead of default (see below) | |
-t, --snp-table TABLE use table TABLE instead of default (see below) | |
-r, --raw don't escape special characters in the output | |
-c, --column-names print a header row | |
-s, --silent silence printing database and table and warnings | |
about SNP IDs that don't start with 'rs' (but | |
they'll be queried from the database regardless) | |
-e, --exit[-on-invalid] terminate for any ID that doesn't start with 'rs' | |
(default is to print a warning to stderr, but | |
attempt to query them from the database anyway) | |
-v, --verbose print the 'mysql' command line to be executed | |
${UL}where${RESET}: | |
DB is the organism database to use (default: $DEFAULT_DB_SCHEMA) | |
TABLE is the SNP database table name (default: $DEFAULT_DB_TABLE) | |
SNPID is an 'rsXXXXXX'-format SNP ID; can provide multiple | |
Any other options are passed through to the MySQL client; see 'man mysql'. | |
Please report any issues you find at $ISSUES_URL. | |
" | |
# next two are Bash arrays; ref. https://tinyurl.com/llcklu2 (Bash manual) | |
snps=() | |
extra_mysql_opts=() | |
table=$DEFAULT_DB_TABLE | |
database=$DEFAULT_DB_SCHEMA | |
show_column_names= | |
verbose= | |
silent= | |
exit_invalid= | |
bail() { | |
echo -ne "$USAGE\n\n" >&2 | |
echo -e " ${BOLD}OH NOES!${RESET}\n $1. Try '--help'.\n" >&2 | |
exit 1 | |
} | |
warn() { | |
echo "${BOLD}WARNING${RESET}: $1." >&2 | |
} | |
# source: https://stackoverflow.com/a/17841619/785213; see the section | |
# "Parameter Expansion" in the Bash manual, under '${parameter/pattern/string}' | |
join_() { | |
local d=$1; shift | |
echo -n "$1"; shift | |
echo "${@/#/$d}" | |
} | |
# if '$1' looks like a valid rsID, let it pass; warn or exit otherwise | |
check_valid_rsid() { | |
# pattern matching reference: https://tinyurl.com/mgfgos9 ("Conditional | |
# Constructions" section in Bash manual; scroll down to '[[…]]') | |
[[ $1 =~ ^rs[1-9][0-9]+$ ]] && return | |
# '((…))' is how you do tests on math expressions in the shell; the | |
# values of shell variables can be retrieved without including the '$', | |
# and any empty/zero value returns "false", while non-zero returns "true" | |
(( exit_invalid )) && bail "Invalid SNP ID '$1'" | |
(( !silent )) && warn "Invalid SNP ID '$1'" | |
} | |
# print out every statement as it's executed from here on (for debugging) | |
(( TRACE )) && set -x | |
# process command line options | |
# - $# is the total number of input arguments | |
# - '((…))' is how you do tests on math expressions in a Bash loop | |
# construct; any non-zero result is considered "true," so the 'while' loop | |
# continues until there are no more arguments to process | |
# - $1, $2, $3... contain the actual arguments in the order given | |
# - 'shift' (near the end of the 'while' loop) discards '$1' and shifts all | |
# the other ones down by one (so $2 becomes $1 and so on) | |
# - pattern matching within 'case' statements roughly follows shell pattern | |
# matching, so it understands '*', '?', and character classes ('[]') | |
# (ref. https://tinyurl.com/k5nokvt - Bash manual) | |
# - this is the cleanest and most idiomatic way of processing command line | |
# options in Bash that I have found so far; see also 'man getopt' | |
while (( $# )); do | |
case $1 in | |
-d|--db*|--database) | |
shift | |
database=$1 | |
;; | |
-t|--snp-table|--table) | |
shift | |
[[ $1 =~ ^snp[1-9][0-9]+$ ]] || bail "Invalid SNP table '$1'" | |
table=$1 | |
;; | |
-c|--column-names) | |
# the '-c' option to 'mysql' actually means "preserve comments in | |
# statements sent to the server" so no big deal if we mask it here | |
show_column_names=1 | |
;; | |
-v|--verbose) | |
verbose=1 | |
;; | |
-s|--silent) | |
silent=1 | |
;; | |
-e|--exit*) | |
exit_invalid=1 | |
;; | |
-h|--h*) | |
echo "$HELP" | |
exit | |
;; | |
-*) | |
# pass anything else as an argument to 'mysql' | |
warn "Unrecognized option '$1', will pass to 'mysql'" | |
# this is Bash syntax for appending an item to an existing array | |
extra_mysql_opts+=("$1") | |
;; | |
*) | |
check_valid_rsid "$1" | |
snps+=("$1") | |
;; | |
esac | |
shift | |
done | |
# shellcheck disable=SC2191 | |
mysql_opts=( | |
--no-auto-rehash # short version: -A | |
--batch # -B; use tab as delimiter & escape special chars | |
--host="$DB_HOST" # -h | |
--user="$DB_USER" # -u | |
--password="$DB_PASSWORD" # -pPASSWORD (but -p will not work /w null password) | |
--database="$database" # -D; the database name | |
) | |
if (( !show_column_names )); then | |
mysql_opts+=( | |
--skip-column-names # -N; do not print a header row | |
) | |
fi | |
# add any extra options not recognized by this script: | |
mysql_opts+=("${extra_mysql_opts[@]}") | |
# if the list of SNPs so far is empty; that is, if the array, expanded as a | |
# single string separated by whitespace ('${array[*]}'), has zero length... | |
if [[ -z ${snps[*]} ]]; then | |
# ...and stdin (file descriptor 0 in Unix) is a terminal, bail out | |
test -t 0 && bail "Missing SNP list" | |
# ...and stdin is NOT a terminal, read from stdin | |
while read -r snp; do | |
check_valid_rsid "$snp" | |
# append to existing 'snps' array | |
snps+=("$snp") | |
done | |
fi | |
# TOOD: run queries in parallel with GNU Parallel | |
#if (( ${#snps[@]} > MAX_SIMULT_REQS )); then | |
# start putting together the conditions of the 'WHERE' clause | |
wheres=() | |
for snp in "${snps[@]}"; do | |
wheres+=("$QUERY_MATCH_FIELD=\"$snp\"") | |
done | |
# join all the individual 'WHERE' conditions with ' OR ' | |
where_clause="WHERE $(join_ " OR " "${wheres[@]}")" | |
# shellcheck disable=SC2206 | |
query=( | |
SELECT $QUERY_COLUMNS | |
FROM $table | |
$where_clause | |
ORDER BY $QUERY_ORDER_BY\; | |
) | |
if (( !silent )); then | |
echo "# fetching coordinates from '$database' table '$table'..." >&2 | |
fi | |
# -e|--execute option runs SQL query given as string argument | |
if (( verbose )); then | |
( set -x; mysql "${mysql_opts[@]}" --execute "${query[*]}" ) | |
else | |
mysql "${mysql_opts[@]}" --execute "${query[*]}" | |
fi | |
# getsnpcoords |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Requires the command-line
mysql
client. If you typemysql
in the shell and something happens, skip to Installation below.Otherwise, the package supplying this program may be found in Linux repositories under names like
mysql-community-client
ormysql-client-core
ormariadb-client-core
. In most cases simply runningsudo yourpackagemanager install mysql # or sudo yourpackagemanager install mariadb
…will get it for you, with the understanding that they'll probably pull in the server components as well, which you probably don't need.
Installation
Assuming a reasonably default macOS or Linux box:
It's quite rare for it not to be these days, but if
$HOME/bin
is not in your search path, you can update that in your~/.bash_profile
or~/.profile
(whichever one you have):Log (completely) out and back in in order for this change to take effect.