Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@knbknb
Last active July 21, 2022 08:41
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save knbknb/a8c9fa96c13259e1122edac4ca5ca840 to your computer and use it in GitHub Desktop.
Save knbknb/a8c9fa96c13259e1122edac4ca5ca840 to your computer and use it in GitHub Desktop.
BigQuery snippets (for command line tool bq)
/*
--## a longer and more complex query:
--## 6 tags on stackoverflow.com:
--## monthly posts by tagcount per month, since 2008
*/
with cte as (
select
t.tag_name,
LAST_DAY(date(creation_date)) as month
from
`bigquery-public-data.stackoverflow.posts_questions` p
join `bigquery-public-data.stackoverflow.tags` t on t.tag_name = p.Tags
where
Tag_Name in (
/* 'r',
'python',
'javascript',
'java',
'php',*/
'svg', 'xslt', 'xml', 'xpath', 'xsl', 'xsd', 'xml-schema',
'json', 'json-schema', 'json-ld', 'json-ld-schema'
)
)
select
*,
count(*) as tag_cnt
from
cte
group by
tag_name,
month
order by
Month asc,
tag_name asc
#!/usr/bin/env bash
# bigquery-bq-commands--collection.sh
# knb 2022-07
# Tagcount by month on stackoverflow.com
# call: ./bigquery-bq-commands--collection.sh -t r,c,c++
# edit commands as needed.
function query {
local tags=$(perl -E "say join q(,), map {qq('\$_')} split(',', qq($1))")
cd /var/tmp;
echo tags=$tags
# url_gh="https://gist.githubusercontent.com/knbknb"
# # Attention: gist fragment cah change with every save!
# gist="a8c9fa96c13259e1122edac4ca5ca840/raw/3ab980dcca392cf958e0ea28e359c49b01e1ee67/"
# sql="bigquery--stackoverflow-tagcount-per-month.sql"
# tagcount_by_month=$(curl -sL "${url_gh}${gist}${sql}")
# url_sql="${url_gh}${gist}${sql}"
# 'clojure', 'scala', 'haskell', 'erlang', 'common-lisp'
# tagcount_by_month=$(curl -sL "${url_sql}")
tagcount_by_month=$(cat <<EOD
with cte as (
select
t.tag_name,
LAST_DAY(date(creation_date)) as month
from
bigquery-public-data.stackoverflow.posts_questions p
join bigquery-public-data.stackoverflow.tags t on t.tag_name = p.Tags
where
Tag_Name in (
$tags
)
)
select
tag_name,
month,
count(*) as tag_cnt
from
cte
group by
tag_name,
month
order by
Month asc,
tag_name asc
EOD
)
echo "${tagcount_by_month}"
# various important customizations are here: $HOME/.bigqueryrc
# execute query
datafile_csv=bigquery_resultdata.csv
# if [ -f "${datafile_csv}" ]; then
# perl -E "say 'x' x 50;"
# echo "${datafile_csv} already exists, skipping query"
# perl -E "say 'x' x 50;"
# else
# echo "$tagcount_by_month" | bq query > ${datafile_csv}
# fi
query="bigquery--stackoverflow-tagcount-per-month"
echo "$tagcount_by_month" | bq query > ${datafile_csv}
outfile_svg="${query}.svg"
if [ -f "${datafile_csv}" ]; then
if [ ! -s "${datafile_csv}" ]; then
echo rm "${datafile_csv}"
rm "${datafile_csv}"
else
r_code=$(cat <<EOR
theme_set(theme_light());
image <-df %>%
mutate(month=as.Date(month)) %>%
mutate_if(is.character, as.factor) %>%
ggplot() +
geom_line(aes(x=month,y=tag_cnt,group=tag_name,color=tag_name)) +
scale_x_date(date_breaks = '3 months' , date_labels = '%b-%y') +
theme(axis.text.x = element_text(angle=90)) +
labs(title='Tags by month on stackoverflow.com',
subtitle=sprintf('2008 - %s', lubridate::year(lubridate::now())));
ggsave(filename='$outfile_svg', plot=image, width=10, height=8)
EOR
)
# generate the plot and save to disk, as SVG
< ${datafile_csv} Rio -r -d, -g -e "$r_code"
fi
fi
if [ -f "${outfile_svg}" ]; then
if [ "${outfile_svg}" -ef ".svg" ]; then
echo "'${outfile_svg}' -ef '.svg'"
else
nohup google-chrome ${outfile_svg} &
fi
else
echo "no svg file created, '${outfile_svg}' is bad or empty"
fi
}
OPTIONS=`getopt -o t: --long tags: -n 'bigquery-bq-commands--collection.sh' -- $@`
eval set -- "$OPTIONS"
while true; do
case $1 in
-t|--tags)
case $2 in
([a-zA-Z,]*) query $2; shift 2;;
*) >&2 echo Please provide a string of SO tags in the form of "r,python,perl"; shift 2;;
esac;;
--) shift; break;;
*) >&2 echo "[Error] The given command line argument is invalid, please try again"; exit 1;;
esac
done
exit 0;
# see Question on SO
# https://stackoverflow.com/questions/43195143/is-there-a-way-to-export-a-bigquery-tables-schema-as-json
#
bq show --format prettyjson bigquery-public-data:samples.wikipedia
#
bq show --format prettyjson bigquery-public-data:samples.natality
# only column defs, with jq
bq show --format=prettyjson bigquery-public-data:samples.wikipedia | jq '.schema.fields'
# same thing, no need for jq
bq show --schema --format=prettyjson bigquery-public-data:samples.natality
# 2-column output, just name and description, e.g. for data dictionaries
bq show --schema --format=prettyjson bigquery-public-data:samples.natality | jq ".[] | {(.name?): .description}" | sort -u
# similar, but just 1 string per line
bq show --schema --format=prettyjson bigquery-public-data:samples.natality | jq '.. | (.name?) + " : " + (.description?) ' | sort
# with cleanup in perl
bq show --schema --format=prettyjson bigquery-public-data:samples.natality > /tmp/natality.json
< /tmp/natality.json jq '[sort_by(.name) | ..| .name? + ": " + .description? ]' \
| perl -pE "s/\\\n/, /g; s/'/\\\\'/g; " | perl -pE 's/\\"//g'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment