rtravitz/wiki.sh

## wiki.sh
#!/usr/bin/env bash

# Usage
# wiki.sh <page to search> [subsection]
#
# Examples
# ./wiki.sh walrus - shows a sentence and sections of a page about Walruses.
# ./wiki.sh walrus anatomy - shows a sentence from that section and its subsections.
# ./wiki.sh walrus 3.1 - shows the subsection for "Tusks and dentition." It can also
#                        be accessed by name: ./wiki.sh walrus 'tusks and dentition'.

# Terminal styling sequences used when printing titles and headers.
# tput exits non-zero when the terminal cannot provide a capability and prints
# an error when TERM is unset, so its stderr is discarded and a failure simply
# leaves the variable empty (plain, unstyled output).
bold=$(tput bold 2>/dev/null || true)
blue=$(tput setaf 4 2>/dev/null || true)
underline=$(tput smul 2>/dev/null || true)
italic=$(tput sitm 2>/dev/null || true)
reset=$(tput sgr0 2>/dev/null || true)

# Query the search API for the single best match for a search term. This script
# only uses the result to get the official title and the page ID that the other
# API queries need.
# More on the search API: https://www.mediawiki.org/wiki/API:Search
# Arguments: $1 - search term (may contain spaces, '&', '#', '+', ...)
# Outputs:   the raw JSON response on stdout
remote_search() {
  # -G moves the --data values into the query string; --data-urlencode escapes
  # the whole term, not only spaces, so reserved characters cannot break or
  # extend the query.
  curl -s -G 'https://en.wikipedia.org/w/api.php' \
    --data 'action=query' \
    --data 'format=json' \
    --data 'list=search' \
    --data 'srlimit=1' \
    --data-urlencode "srsearch=$1"
}

# Fetch the section list of a page from the parse API. The fields this script
# reads are 'line' (the human-readable section name) and 'number' (which can
# look like '5' or '5.2.1').
# More on the parsing API: https://www.mediawiki.org/wiki/API:Parsing_wikitext#
# Arguments: $1 - page ID
# Outputs:   the raw JSON response on stdout
remote_sections() {
  local endpoint='https://en.wikipedia.org/w/api.php'
  local query="action=parse&format=json&pageid=$1&prop=sections"

  curl -s "${endpoint}?${query}"
}

# Fetch the content of a page as mostly plain text via the extract extension.
# The only markup left is that section headers are preceded by a number of
# equal signs equivalent to their level in the hierarchy.
# More on the extract extension: https://www.mediawiki.org/wiki/Extension:TextExtracts#API
# Arguments: $1 - page ID
# Outputs:   the raw JSON response on stdout
remote_content() {
  local endpoint='https://en.wikipedia.org/w/api.php'
  local query="format=json&action=query&prop=extracts&explaintext&redirects=1&pageids=$1"

  curl -s "${endpoint}?${query}"
}

# Download a page's extract and print just the text.
# Arguments: $1 - page ID (also the key of the page object in the response)
# Outputs:   the value of .query.pages[<id>].extract on stdout
get_extracted_text_for_page() {
  local id=$1

  jq -r --arg page_id "$id" '.query.pages[$page_id].extract' \
    < <(remote_content "$id")
}

# Print the "Subsections" header followed by one "<number>: <name>" line for
# every section whose level field is the string "2" (the entries this script
# treats as top-level). Prints nothing when there are none.
# Arguments: $1 - JSON response of the parse API (see remote_sections)
get_top_level_sections() {
  # Kept local so the caller's namespace is not polluted.
  local top_level

  top_level=$(jq -r '.parse.sections[]
    | select(.level == "2")
    | "\(.number): \(.line)"' <<< "$1")

  if [[ -n "$top_level" ]]; then
    print_subsections_header
    printf '%s\n' "$top_level"
  fi
}

# Print "<number>: <name>" for one section, wrapped in the italic/reset styling.
# Arguments: $1 - JSON response of the parse API (see remote_sections)
#            $2 - section number to look up, e.g. "3.1"
# Globals:   italic, reset (read)
# Outputs:   the styled label on stdout; "null: null" if the number is unknown
get_subsection_name() {
  local num_with_name

  num_with_name=$(jq -r --arg section_id "$2" '
    [.parse.sections[] | select(.number == $section_id)][0]
    | "\(.number): \(.line)"' <<< "$1")

  # printf '%s' prints the title verbatim; echo -e would turn a literal
  # backslash sequence inside a section title into a control character.
  printf '%s%s%s\n' "$italic" "$num_with_name" "$reset"
}

# Resolve a user-supplied section reference to the section's number and name.
# The reference matches either a section name (case-insensitively) or a
# section number exactly; the first matching section wins.
# Arguments: $1 - JSON response of the parse API (see remote_sections)
#            $2 - section name or number
# Outputs:   a JSON object {number, name}; both fields are null if no match
get_section_name_and_number() {
  local wanted=$2

  jq -r --arg section_name "$wanted" '
    ($section_name | ascii_downcase) as $lowered
    | [ .parse.sections[]
        | select(((.line | ascii_downcase) == $lowered)
                 or (.number == $section_name)) ]
    | .[0] as $hit
    | { number: $hit.number, name: $hit.line }' <<< "$1"
}

# Print the "Subsections" header followed by one "<number>: <name>" line for
# every section nested below the given section. Prints nothing when the
# section has no descendants.
# Arguments: $1 - JSON response of the parse API (see remote_sections)
#            $2 - number of the parent section, e.g. "3" or "3.1"
get_subsections() {
  local children

  # A descendant's number starts with "<parent>.". Matching on that prefix
  # works at any depth (3.1 -> 3.1.1), whereas comparing only the first dotted
  # component to the parent number never matches for a nested parent. The
  # trailing dot keeps "3" from matching "30.1".
  children=$(jq -r --arg section_num "$2" '.parse.sections[]
    | select(.number | startswith($section_num + "."))
    | "\(.number): \(.line)"' <<< "$1")

  if [[ -n "$children" ]]; then
    print_subsections_header
    printf '%s\n' "$children"
  fi
}

# Print the zero-based index of the first occurrence of a substring, or -1 if
# the substring does not occur.
# Arguments: $1 - string to search in
#            $2 - substring to look for (taken literally, not as a pattern)
strindex() {
  local haystack=$1 needle=$2
  # Strip the longest suffix that starts with the needle; what remains is the
  # text in front of the first occurrence.
  local prefix=${haystack%%"$needle"*}

  if [[ "$prefix" == "$haystack" ]]; then
    printf '%s\n' -1
  else
    printf '%s\n' "${#prefix}"
  fi
}

# Print the first sentence of a text, including its closing period.
# A sentence ends at the first period that is followed by a space or a line
# break, or at a period that is the very last character of the text. Nothing
# is printed when no such period exists.
# Arguments: $1 - text to take the sentence from
first_sentence() {
  local text=$1
  local end=-1 terminator head

  for terminator in '. ' $'.\n'; do
    head=${text%%"$terminator"*}
    # An unchanged string means this terminator does not occur at all.
    if [[ "$head" != "$text" ]] && (( end < 0 || ${#head} < end )); then
      end=${#head}
    fi
  done

  # A text that simply ends in a period is one complete sentence.
  if (( end < 0 )) && [[ "$text" == *. ]]; then
    end=$(( ${#text} - 1 ))
  fi

  if (( end >= 0 )); then
    printf '%s\n' "${text:0:end + 1}"
  fi
}

# Print the first sentence of a section's text.
# Arguments: $1 - plain-text page extract (headers look like "== Name ...")
#            $2 - section name; matched case-insensitively
get_section_content() {
  local section_content

  # -F matches the name literally, so titles containing regex characters such
  # as '[' or '.' are found. sed keeps only the two lines that follow the first
  # matching header, dropping the header itself and any later matches (which
  # grep would otherwise append after a "--" separator).
  section_content=$(grep -i -F -A 2 -- "== $2" <<< "$1" | sed -n '2,3p')
  first_sentence "$section_content"
}

# Print a page title in bold blue, surrounded by blank lines.
# Arguments: $1 - title text
# Globals:   bold, blue, reset (read)
print_title() {
  # The styling variables are passed as arguments rather than spliced into the
  # format string, so a '%' or backslash in them is printed as-is.
  printf '\n%s%s%s%s\n\n' "$bold" "$blue" "$1" "$reset"
}

# Print the underlined "Subsections" heading with a blank line before and
# after it.
# Globals: underline, reset (read)
print_subsections_header() {
  local heading="\n${underline}Subsections${reset}\n"

  # %b expands backslash escapes in the argument, as echo -e does.
  printf '%b\n' "$heading"
}

# Entry point: resolve the search term to a page, then print either the page
# overview or one section of it.
# Arguments: $1 - search term (required)
#            $2 - section name or number (optional)
# Outputs:   the formatted summary on stdout; diagnostics on stderr
# Exits:     1 when no search term is given, the search has no hits, or the
#            requested section is not on the page
main() {
  local search_result total_hits page_id page_title sections content
  local name_and_number name number

  if [[ $# -eq 0 ]]; then
    echo 'please provide a search term' >&2
    exit 1
  fi

  search_result=$(remote_search "$1")

  # Anything other than a positive integer (0, "null", or empty output after a
  # failed request) means there is no page to show.
  total_hits=$(jq -r '.query.searchinfo.totalhits' <<< "$search_result")
  if [[ ! "$total_hits" =~ ^[1-9][0-9]*$ ]]; then
    echo "Your search '$1' did not match any wikipedia pages." >&2
    exit 1
  fi

  page_id=$(jq -r '.query.search[0].pageid' <<< "$search_result")
  page_title=$(jq -r '.query.search[0].title' <<< "$search_result")

  sections=$(remote_sections "$page_id")
  content=$(get_extracted_text_for_page "$page_id")

  if [[ $# -gt 1 ]]; then
    name_and_number=$(get_section_name_and_number "$sections" "$2")
    name=$(jq -r '.name' <<< "$name_and_number")
    number=$(jq -r '.number' <<< "$name_and_number")
    # jq -r prints a JSON null as the literal string "null".
    if [[ "$number" == 'null' ]]; then
      echo "The section '$2' does not exist on the page '$1'." >&2
      exit 1
    fi

    print_title "$page_title"
    get_subsection_name "$sections" "$number"
    get_section_content "$content" "$name"
    get_subsections "$sections" "$number"
  else
    print_title "$page_title"
    first_sentence "$content"
    get_top_level_sections "$sections"
  fi
}

# Run the script, forwarding the command-line arguments unchanged.
main "$@"
	#!/usr/bin/env bash

	# Usage
	# wiki.sh <page to search> [subsection]
	#
	# Examples
	# ./wiki.sh walrus - shows a sentence and sections of a page about Walruses.
	# ./wiki.sh walrus anatomy - shows a sentence from that section and its subsections.
	# ./wiki.sh walrus 3.1 - shows the subsection for "Tusks and dentition." It can also
	# be accessed by name: ./wiki.sh walrus 'tusks and dentition'.

	# Terminal styling sequences used when printing titles and headers.
	# tput exits non-zero when the terminal cannot provide a capability and prints
	# an error when TERM is unset, so its stderr is discarded and a failure simply
	# leaves the variable empty (plain, unstyled output).
	bold=$(tput bold 2>/dev/null || true)
	blue=$(tput setaf 4 2>/dev/null || true)
	underline=$(tput smul 2>/dev/null || true)
	italic=$(tput sitm 2>/dev/null || true)
	reset=$(tput sgr0 2>/dev/null || true)

	# Query the search API for the single best match for a search term. This script
	# only uses the result to get the official title and the page ID that the other
	# API queries need.
	# More on the search API: https://www.mediawiki.org/wiki/API:Search
	# Arguments: $1 - search term (may contain spaces, '&', '#', '+', ...)
	# Outputs:   the raw JSON response on stdout
	remote_search() {
	  # -G moves the --data values into the query string; --data-urlencode escapes
	  # the whole term, not only spaces, so reserved characters cannot break or
	  # extend the query.
	  curl -s -G 'https://en.wikipedia.org/w/api.php' \
	    --data 'action=query' \
	    --data 'format=json' \
	    --data 'list=search' \
	    --data 'srlimit=1' \
	    --data-urlencode "srsearch=$1"
	}

	# Fetch the section list of a page from the parse API. The fields this script
	# reads are 'line' (the human-readable section name) and 'number' (which can
	# look like '5' or '5.2.1').
	# More on the parsing API: https://www.mediawiki.org/wiki/API:Parsing_wikitext#
	# Arguments: $1 - page ID
	# Outputs:   the raw JSON response on stdout
	remote_sections() {
	  local endpoint='https://en.wikipedia.org/w/api.php'
	  local query="action=parse&format=json&pageid=$1&prop=sections"

	  curl -s "${endpoint}?${query}"
	}

	# Fetch the content of a page as mostly plain text via the extract extension.
	# The only markup left is that section headers are preceded by a number of
	# equal signs equivalent to their level in the hierarchy.
	# More on the extract extension: https://www.mediawiki.org/wiki/Extension:TextExtracts#API
	# Arguments: $1 - page ID
	# Outputs:   the raw JSON response on stdout
	remote_content() {
	  local endpoint='https://en.wikipedia.org/w/api.php'
	  local query="format=json&action=query&prop=extracts&explaintext&redirects=1&pageids=$1"

	  curl -s "${endpoint}?${query}"
	}

	# Download a page's extract and print just the text.
	# Arguments: $1 - page ID (also the key of the page object in the response)
	# Outputs:   the value of .query.pages[<id>].extract on stdout
	get_extracted_text_for_page() {
	  # A real pipe is required here: an escaped "\|" would be handed to
	  # remote_content as a literal argument and jq would never see the response.
	  remote_content "$1" |
	    jq -r --arg page_id "$1" '.query.pages[$page_id].extract'
	}

	# Print the "Subsections" header followed by one "<number>: <name>" line for
	# every section whose level field is the string "2" (the entries this script
	# treats as top-level). Prints nothing when there are none.
	# Arguments: $1 - JSON response of the parse API (see remote_sections)
	get_top_level_sections() {
	  # Kept local so the caller's namespace is not polluted.
	  local top_level

	  # jq's pipe must be a bare "|"; a backslash-escaped pipe is a filter
	  # syntax error.
	  top_level=$(jq -r '.parse.sections[]
	    | select(.level == "2")
	    | "\(.number): \(.line)"' <<< "$1")

	  if [[ -n "$top_level" ]]; then
	    print_subsections_header
	    printf '%s\n' "$top_level"
	  fi
	}

	# Print "<number>: <name>" for one section, wrapped in the italic/reset styling.
	# Arguments: $1 - JSON response of the parse API (see remote_sections)
	#            $2 - section number to look up, e.g. "3.1"
	# Globals:   italic, reset (read)
	# Outputs:   the styled label on stdout; "null: null" if the number is unknown
	get_subsection_name() {
	  local num_with_name

	  # jq's pipe must be a bare "|"; a backslash-escaped pipe is a filter
	  # syntax error.
	  num_with_name=$(jq -r --arg section_id "$2" '
	    [.parse.sections[] | select(.number == $section_id)][0]
	    | "\(.number): \(.line)"' <<< "$1")

	  # printf '%s' prints the title verbatim; echo -e would turn a literal
	  # backslash sequence inside a section title into a control character.
	  printf '%s%s%s\n' "$italic" "$num_with_name" "$reset"
	}

	# Resolve a user-supplied section reference to the section's number and name.
	# The reference matches either a section name (case-insensitively) or a
	# section number exactly; the first matching section wins.
	# Arguments: $1 - JSON response of the parse API (see remote_sections)
	#            $2 - section name or number
	# Outputs:   a JSON object {number, name}; both fields are null if no match
	get_section_name_and_number() {
	  # jq's pipe must be a bare "|"; a backslash-escaped pipe is a filter
	  # syntax error.
	  jq -r --arg section_name "$2" '
	    ($section_name | ascii_downcase) as $lowered
	    | [ .parse.sections[]
	        | select(((.line | ascii_downcase) == $lowered)
	                 or (.number == $section_name)) ]
	    | .[0] as $hit
	    | { number: $hit.number, name: $hit.line }' <<< "$1"
	}

	# Print the "Subsections" header followed by one "<number>: <name>" line for
	# every section nested below the given section. Prints nothing when the
	# section has no descendants.
	# Arguments: $1 - JSON response of the parse API (see remote_sections)
	#            $2 - number of the parent section, e.g. "3" or "3.1"
	get_subsections() {
	  local children

	  # A descendant's number starts with "<parent>.". Matching on that prefix
	  # works at any depth (3.1 -> 3.1.1), whereas comparing only the first dotted
	  # component to the parent number never matches for a nested parent. The
	  # trailing dot keeps "3" from matching "30.1". The jq pipes are bare "|":
	  # a backslash-escaped pipe is a filter syntax error.
	  children=$(jq -r --arg section_num "$2" '.parse.sections[]
	    | select(.number | startswith($section_num + "."))
	    | "\(.number): \(.line)"' <<< "$1")

	  if [[ -n "$children" ]]; then
	    print_subsections_header
	    printf '%s\n' "$children"
	  fi
	}

	# Print the zero-based index of the first occurrence of a substring, or -1 if
	# the substring does not occur.
	# Arguments: $1 - string to search in
	#            $2 - substring to look for (taken literally, not as a pattern)
	strindex() {
	  local haystack=$1 needle=$2
	  # Strip the longest suffix that starts with the needle; what remains is the
	  # text in front of the first occurrence.
	  local prefix=${haystack%%"$needle"*}

	  # An explicit if/else: with an escaped "\|\|" the fallback branch is never a
	  # branch at all, just extra words printed by the first echo.
	  if [[ "$prefix" == "$haystack" ]]; then
	    printf '%s\n' -1
	  else
	    printf '%s\n' "${#prefix}"
	  fi
	}

	# Print the first sentence of a text, including its closing period.
	# A sentence ends at the first period that is followed by a space or a line
	# break, or at a period that is the very last character of the text. Nothing
	# is printed when no such period exists.
	# Arguments: $1 - text to take the sentence from
	first_sentence() {
	  local text=$1
	  local end=-1 terminator head

	  for terminator in '. ' $'.\n'; do
	    head=${text%%"$terminator"*}
	    # An unchanged string means this terminator does not occur at all.
	    if [[ "$head" != "$text" ]] && (( end < 0 || ${#head} < end )); then
	      end=${#head}
	    fi
	  done

	  # A text that simply ends in a period is one complete sentence.
	  if (( end < 0 )) && [[ "$text" == *. ]]; then
	    end=$(( ${#text} - 1 ))
	  fi

	  if (( end >= 0 )); then
	    printf '%s\n' "${text:0:end + 1}"
	  fi
	}

	# Print the first sentence of a section's text.
	# Arguments: $1 - plain-text page extract (headers look like "== Name ...")
	#            $2 - section name; matched case-insensitively
	get_section_content() {
	  local section_content

	  # -F matches the name literally, so titles containing regex characters such
	  # as '[' or '.' are found. sed keeps only the two lines that follow the first
	  # matching header, dropping the header itself and any later matches (which
	  # grep would otherwise append after a "--" separator). The pipe is a real
	  # "|": an escaped "\|" would just be printed as text.
	  section_content=$(grep -i -F -A 2 -- "== $2" <<< "$1" | sed -n '2,3p')
	  first_sentence "$section_content"
	}

	# Print a page title in bold blue, surrounded by blank lines.
	# Arguments: $1 - title text
	# Globals:   bold, blue, reset (read)
	print_title() {
	  # The styling variables are passed as arguments rather than spliced into the
	  # format string, so a '%' or backslash in them is printed as-is.
	  printf '\n%s%s%s%s\n\n' "$bold" "$blue" "$1" "$reset"
	}

	# Print the underlined "Subsections" heading with a blank line before and
	# after it.
	# Globals: underline, reset (read)
	print_subsections_header() {
	  local heading="\n${underline}Subsections${reset}\n"

	  # %b expands backslash escapes in the argument, as echo -e does.
	  printf '%b\n' "$heading"
	}

	# Entry point: resolve the search term to a page, then print either the page
	# overview or one section of it.
	# Arguments: $1 - search term (required)
	#            $2 - section name or number (optional)
	# Outputs:   the formatted summary on stdout; diagnostics on stderr
	# Exits:     1 when no search term is given, the search has no hits, or the
	#            requested section is not on the page
	main() {
	  local search_result total_hits page_id page_title sections content
	  local name_and_number name number

	  if [[ $# -eq 0 ]]; then
	    echo 'please provide a search term' >&2
	    exit 1
	  fi

	  search_result=$(remote_search "$1")

	  # Anything other than a positive integer (0, "null", or empty output after a
	  # failed request) means there is no page to show.
	  total_hits=$(jq -r '.query.searchinfo.totalhits' <<< "$search_result")
	  if [[ ! "$total_hits" =~ ^[1-9][0-9]*$ ]]; then
	    echo "Your search '$1' did not match any wikipedia pages." >&2
	    exit 1
	  fi

	  page_id=$(jq -r '.query.search[0].pageid' <<< "$search_result")
	  page_title=$(jq -r '.query.search[0].title' <<< "$search_result")

	  sections=$(remote_sections "$page_id")
	  content=$(get_extracted_text_for_page "$page_id")

	  if [[ $# -gt 1 ]]; then
	    name_and_number=$(get_section_name_and_number "$sections" "$2")
	    name=$(jq -r '.name' <<< "$name_and_number")
	    number=$(jq -r '.number' <<< "$name_and_number")
	    # jq -r prints a JSON null as the literal string "null".
	    if [[ "$number" == 'null' ]]; then
	      echo "The section '$2' does not exist on the page '$1'." >&2
	      exit 1
	    fi

	    print_title "$page_title"
	    get_subsection_name "$sections" "$number"
	    get_section_content "$content" "$name"
	    get_subsections "$sections" "$number"
	  else
	    print_title "$page_title"
	    first_sentence "$content"
	    get_top_level_sections "$sections"
	  fi
	}

	# Run the script, forwarding the command-line arguments unchanged.
	main "$@"