knbknb/bigquery--stackoverflow-tagcount-per-month.sql

## bigquery--stackoverflow-tagcount-per-month.sql
/*
--## a longer and more complex query:
--##  6 tags on stackoverflow.com:
--## monthly posts by tagcount per month, since 2008
*/
with cte as (
    select
        t.tag_name,
        LAST_DAY(date(creation_date)) as month
    from
        `bigquery-public-data.stackoverflow.posts_questions` p
        join `bigquery-public-data.stackoverflow.tags` t on t.tag_name = p.Tags
    where
        Tag_Name in (
            /* 'r',
            'python',
            'javascript',
            'java',
            'php',*/
            'svg', 'xslt', 'xml', 'xpath', 'xsl',  'xsd', 'xml-schema',
            'json', 'json-schema', 'json-ld', 'json-ld-schema'
        )
)
select
    *,
    count(*) as tag_cnt
from
    cte
group by
    tag_name,
    month
order by
    Month asc,
    tag_name asc

## bigquery-bq-commands--collection.sh
#!/usr/bin/env bash
# bigquery-bq-commands--collection.sh
# knb 2022-07
# Tagcount by month on stackoverflow.com
# call: ./bigquery-bq-commands--collection.sh -t r,c,c++

# edit commands as needed.

function query {
 	local tags=$(perl -E "say join q(,), map {qq('\$_')} split(',', qq($1))")
    cd /var/tmp;
    echo tags=$tags
    # url_gh="https://gist.githubusercontent.com/knbknb"
    # # Attention: gist fragment cah change with every save!
    # gist="a8c9fa96c13259e1122edac4ca5ca840/raw/3ab980dcca392cf958e0ea28e359c49b01e1ee67/"
    # sql="bigquery--stackoverflow-tagcount-per-month.sql"
    # tagcount_by_month=$(curl -sL "${url_gh}${gist}${sql}")
    # url_sql="${url_gh}${gist}${sql}"
    #            'clojure', 'scala', 'haskell', 'erlang', 'common-lisp'
    # tagcount_by_month=$(curl -sL "${url_sql}")
    tagcount_by_month=$(cat <<EOD
    with cte as (
        select
            t.tag_name,
            LAST_DAY(date(creation_date)) as month
        from
            bigquery-public-data.stackoverflow.posts_questions p
            join bigquery-public-data.stackoverflow.tags t on t.tag_name = p.Tags
        where
            Tag_Name in (
                $tags
            )
    )
    select
        tag_name,
        month,
        count(*) as tag_cnt
    from
        cte
    group by
        tag_name,
        month
    order by
        Month asc,
        tag_name asc
EOD
    )

    echo "${tagcount_by_month}"
    # various important customizations are here: $HOME/.bigqueryrc
    # execute query
    datafile_csv=bigquery_resultdata.csv
    # if [ -f "${datafile_csv}" ]; then
    #     perl -E "say 'x' x 50;"
    #     echo "${datafile_csv} already exists, skipping query"
    #     perl -E "say 'x' x 50;"
    # else
    #     echo "$tagcount_by_month" | bq query  > ${datafile_csv}
    # fi
    query="bigquery--stackoverflow-tagcount-per-month"
    echo "$tagcount_by_month" | bq query  > ${datafile_csv}
    outfile_svg="${query}.svg"
    if [ -f "${datafile_csv}"  ]; then
        if [ ! -s "${datafile_csv}" ]; then
            echo rm "${datafile_csv}"
            rm "${datafile_csv}"
        else

            r_code=$(cat <<EOR
    theme_set(theme_light());
    image <-df %>%
    mutate(month=as.Date(month)) %>%
    mutate_if(is.character, as.factor)  %>%
    ggplot() +
    geom_line(aes(x=month,y=tag_cnt,group=tag_name,color=tag_name)) +
    scale_x_date(date_breaks = '3 months' , date_labels = '%b-%y') +
    theme(axis.text.x = element_text(angle=90)) +
    labs(title='Tags by month on stackoverflow.com',
        subtitle=sprintf('2008 - %s', lubridate::year(lubridate::now())));
    ggsave(filename='$outfile_svg', plot=image, width=10, height=8)
EOR
    )

            # generate the plot and save to disk, as SVG
            < ${datafile_csv}  Rio -r -d, -g -e  "$r_code"
        fi
    fi
    if [  -f "${outfile_svg}"  ]; then
    if [ "${outfile_svg}" -ef ".svg"  ]; then
        echo "'${outfile_svg}' -ef '.svg'"
    else
        nohup google-chrome ${outfile_svg} &
    fi
    else
        echo "no svg file created, '${outfile_svg}' is bad or empty"
    fi

}

OPTIONS=`getopt -o t: --long tags: -n 'bigquery-bq-commands--collection.sh' -- $@`
eval set -- "$OPTIONS"
while true; do
	case $1 in
		-t|--tags)
			case $2 in
				([a-zA-Z,]*) query $2; shift 2;;
				*) >&2 echo Please provide a string of SO tags in the form of "r,python,perl"; shift 2;;
			esac;;
		--) shift; break;;
		*) >&2 echo "[Error] The given command line argument is invalid, please try again"; exit 1;;
	esac
done

exit 0;


## bigquery-snippets-cmdline.--get-tabledefn-as-json.sh
# see Question on SO
# https://stackoverflow.com/questions/43195143/is-there-a-way-to-export-a-bigquery-tables-schema-as-json
#
bq show --format prettyjson  bigquery-public-data:samples.wikipedia
#
bq show --format prettyjson  bigquery-public-data:samples.natality

# only column defs, with jq
bq show --format=prettyjson bigquery-public-data:samples.wikipedia | jq '.schema.fields'

# same thing, no need for jq
bq show --schema --format=prettyjson bigquery-public-data:samples.natality

# 2-column output, just name and description, e.g. for data dictionaries
bq show --schema --format=prettyjson bigquery-public-data:samples.natality | jq ".[] | {(.name?): .description}"   | sort -u

# similar, but just 1 string per line
bq show --schema --format=prettyjson bigquery-public-data:samples.natality | jq '.. |  (.name?) + " : " + (.description?) ' | sort

# with cleanup in perl
bq show --schema --format=prettyjson bigquery-public-data:samples.natality > /tmp/natality.json
< /tmp/natality.json   jq '[sort_by(.name) |  ..| .name? + ": " + .description? ]' \
| perl -pE "s/\\\n/, /g; s/'/\\\\'/g; " | perl -pE 's/\\"//g'
	/*
	--## a longer and more complex query:
	--## 6 tags on stackoverflow.com:
	--## monthly posts by tagcount per month, since 2008
	*/
	with cte as (
	select
	t.tag_name,
	LAST_DAY(date(creation_date)) as month
	from
	`bigquery-public-data.stackoverflow.posts_questions` p
	join `bigquery-public-data.stackoverflow.tags` t on t.tag_name = p.Tags
	where
	Tag_Name in (
	/* 'r',
	'python',
	'javascript',
	'java',
	'php',*/
	'svg', 'xslt', 'xml', 'xpath', 'xsl', 'xsd', 'xml-schema',
	'json', 'json-schema', 'json-ld', 'json-ld-schema'
	)
	)
	select
	*,
	count(*) as tag_cnt
	from
	cte
	group by
	tag_name,
	month
	order by
	Month asc,
	tag_name asc
	#!/usr/bin/env bash
	# bigquery-bq-commands--collection.sh
	# knb 2022-07
	# Tagcount by month on stackoverflow.com
	# call: ./bigquery-bq-commands--collection.sh -t r,c,c++

	# edit commands as needed.

	function query {
	local tags=$(perl -E "say join q(,), map {qq('\$_')} split(',', qq($1))")
	cd /var/tmp;
	echo tags=$tags
	# url_gh="https://gist.githubusercontent.com/knbknb"
	# # Attention: gist fragment cah change with every save!
	# gist="a8c9fa96c13259e1122edac4ca5ca840/raw/3ab980dcca392cf958e0ea28e359c49b01e1ee67/"
	# sql="bigquery--stackoverflow-tagcount-per-month.sql"
	# tagcount_by_month=$(curl -sL "${url_gh}${gist}${sql}")
	# url_sql="${url_gh}${gist}${sql}"
	# 'clojure', 'scala', 'haskell', 'erlang', 'common-lisp'
	# tagcount_by_month=$(curl -sL "${url_sql}")
	tagcount_by_month=$(cat <<EOD
	with cte as (
	select
	t.tag_name,
	LAST_DAY(date(creation_date)) as month
	from
	bigquery-public-data.stackoverflow.posts_questions p
	join bigquery-public-data.stackoverflow.tags t on t.tag_name = p.Tags
	where
	Tag_Name in (
	$tags
	)
	)
	select
	tag_name,
	month,
	count(*) as tag_cnt
	from
	cte
	group by
	tag_name,
	month
	order by
	Month asc,
	tag_name asc
	EOD
	)

	echo "${tagcount_by_month}"
	# various important customizations are here: $HOME/.bigqueryrc
	# execute query
	datafile_csv=bigquery_resultdata.csv
	# if [ -f "${datafile_csv}" ]; then
	# perl -E "say 'x' x 50;"
	# echo "${datafile_csv} already exists, skipping query"
	# perl -E "say 'x' x 50;"
	# else
	# echo "$tagcount_by_month" \| bq query > ${datafile_csv}
	# fi
	query="bigquery--stackoverflow-tagcount-per-month"
	echo "$tagcount_by_month" \| bq query > ${datafile_csv}
	outfile_svg="${query}.svg"
	if [ -f "${datafile_csv}" ]; then
	if [ ! -s "${datafile_csv}" ]; then
	echo rm "${datafile_csv}"
	rm "${datafile_csv}"
	else

	r_code=$(cat <<EOR
	theme_set(theme_light());
	image <-df %>%
	mutate(month=as.Date(month)) %>%
	mutate_if(is.character, as.factor) %>%
	ggplot() +
	geom_line(aes(x=month,y=tag_cnt,group=tag_name,color=tag_name)) +
	scale_x_date(date_breaks = '3 months' , date_labels = '%b-%y') +
	theme(axis.text.x = element_text(angle=90)) +
	labs(title='Tags by month on stackoverflow.com',
	subtitle=sprintf('2008 - %s', lubridate::year(lubridate::now())));
	ggsave(filename='$outfile_svg', plot=image, width=10, height=8)
	EOR
	)

	# generate the plot and save to disk, as SVG
	< ${datafile_csv} Rio -r -d, -g -e "$r_code"
	fi
	fi
	if [ -f "${outfile_svg}" ]; then
	if [ "${outfile_svg}" -ef ".svg" ]; then
	echo "'${outfile_svg}' -ef '.svg'"
	else
	nohup google-chrome ${outfile_svg} &
	fi
	else
	echo "no svg file created, '${outfile_svg}' is bad or empty"
	fi

	}

	OPTIONS=`getopt -o t: --long tags: -n 'bigquery-bq-commands--collection.sh' -- $@`
	eval set -- "$OPTIONS"
	while true; do
	case $1 in
	-t\|--tags)
	case $2 in
	([a-zA-Z,]*) query $2; shift 2;;
	*) >&2 echo Please provide a string of SO tags in the form of "r,python,perl"; shift 2;;
	esac;;
	--) shift; break;;
	*) >&2 echo "[Error] The given command line argument is invalid, please try again"; exit 1;;
	esac
	done

	exit 0;
	# see Question on SO
	# https://stackoverflow.com/questions/43195143/is-there-a-way-to-export-a-bigquery-tables-schema-as-json
	#
	bq show --format prettyjson bigquery-public-data:samples.wikipedia
	#
	bq show --format prettyjson bigquery-public-data:samples.natality

	# only column defs, with jq
	bq show --format=prettyjson bigquery-public-data:samples.wikipedia \| jq '.schema.fields'

	# same thing, no need for jq
	bq show --schema --format=prettyjson bigquery-public-data:samples.natality

	# 2-column output, just name and description, e.g. for data dictionaries
	bq show --schema --format=prettyjson bigquery-public-data:samples.natality \| jq ".[] \| {(.name?): .description}" \| sort -u

	# similar, but just 1 string per line
	bq show --schema --format=prettyjson bigquery-public-data:samples.natality \| jq '.. \| (.name?) + " : " + (.description?) ' \| sort

	# with cleanup in perl
	bq show --schema --format=prettyjson bigquery-public-data:samples.natality > /tmp/natality.json
	< /tmp/natality.json jq '[sort_by(.name) \| ..\| .name? + ": " + .description? ]' \
	\| perl -pE "s/\\\n/, /g; s/'/\\\\'/g; " \| perl -pE 's/\\"//g'