Skip to content

Instantly share code, notes, and snippets.

@rtravitz
Created July 17, 2023 01:55
Show Gist options
  • Save rtravitz/f6ed62bf9cb7d185ddf04f2aaf61a77a to your computer and use it in GitHub Desktop.
Save rtravitz/f6ed62bf9cb7d185ddf04f2aaf61a77a to your computer and use it in GitHub Desktop.
Query wikipedia pages
#!/usr/bin/env bash
# Usage
# wiki.sh <page to search> [subsection]
#
# Examples
# ./wiki.sh walrus - shows a sentence and sections of a page about Walruses.
# ./wiki.sh walrus anatomy - shows a sentence from that section and its subsections.
# ./wiki.sh walrus 3.1 - shows the subsection for "Tusks and dentition." It could also
# be accessed like ./wiki.sh walrus 'tusks and dentition'.
# Terminal styling sequences, looked up once through tput.
#
# _term_seq forwards its arguments to tput but discards tput's diagnostics and
# exit status. When $TERM is unset or unknown, or the terminal lacks a
# capability (italics in particular), the variable is simply left empty
# instead of an error being printed for every lookup.
_term_seq() {
  tput "$@" 2>/dev/null || true
}
bold=$(_term_seq bold)
blue=$(_term_seq setaf 4)
underline=$(_term_seq smul)
italic=$(_term_seq sitm)
reset=$(_term_seq sgr0)
# The search API returns a list of matching articles. This script only uses it to get the official title
# and an ID for the page used in other API queries.
# More on the search API: https://www.mediawiki.org/wiki/API:Search
# Ask the MediaWiki search API for the single best match for a search term.
# Arguments: $1 - free-text search term; may contain spaces or characters that
#                 are special in URLs (&, #, +, %, ?)
# Outputs:   the raw JSON response on stdout
remote_search() {
  # -G turns the data pairs into a query string appended to the URL, and
  # --data-urlencode percent-encodes the whole term. Escaping only spaces let
  # a term such as "AT&T" or "C#" truncate or corrupt the query.
  curl -s -G "https://en.wikipedia.org/w/api.php" \
    -d "action=query" \
    -d "format=json" \
    -d "list=search" \
    -d "srlimit=1" \
    --data-urlencode "srsearch=$1"
}
# The parse API returns a list of sections on the page. The fields this script cares about are the 'line,' which is
# the human readable name for the section, and the 'number' which can look like '5' or '5.2.1'
# More on the parsing API: https://www.mediawiki.org/wiki/API:Parsing_wikitext#
# Request the section listing of a page from the MediaWiki parse API.
# Arguments: $1 - page ID
# Outputs:   the raw JSON response on stdout
remote_sections() {
  local endpoint='https://en.wikipedia.org/w/api.php'
  local query="action=parse&format=json&pageid=$1&prop=sections"
  curl -s "${endpoint}?${query}"
}
# The extract extension can return the content of a page in mostly plain text. The only piece of markup is that section
# headers are preceded by a number of equal signs equivalent to their level in the hierarchy.
# More on the extract extension: https://www.mediawiki.org/wiki/Extension:TextExtracts#API
# Request the plain-text extract of a page through the TextExtracts API.
# Arguments: $1 - page ID
# Outputs:   the raw JSON response on stdout
remote_content() {
  local endpoint='https://en.wikipedia.org/w/api.php'
  local query="format=json&action=query&prop=extracts&explaintext&redirects=1&pageids=$1"
  curl -s "${endpoint}?${query}"
}
# Print the plain-text extract of a page.
# Arguments: $1 - page ID; it is used both for the request and as the key
#                 under .query.pages in the response
# Outputs:   the extract text on stdout
get_extracted_text_for_page() {
  local page_id=$1
  jq -r --arg page_id "$page_id" '.query.pages[$page_id].extract' \
    < <(remote_content "$page_id")
}
# List the top-level sections of a page as "<number>: <title>" lines.
# Arguments: $1 - JSON response of the parse API (section listing)
# Outputs:   the subsections header followed by one line per section whose
#            level is "2"; nothing at all when there is no such section
get_top_level_sections() {
  local listing
  listing=$(jq -r '.parse.sections[]
    | select(.level == "2")
    | "\(.number): \(.line)"' <<< "$1")
  [[ -z "$listing" ]] && return 0
  print_subsections_header
  printf '%s\n' "$listing"
}
# Print the "<number>: <title>" label of one section, wrapped in the italic
# and reset styling sequences.
# Arguments: $1 - JSON response of the parse API (section listing)
#            $2 - section number to look up, e.g. "3" or "3.1"
# Outputs:   the styled label on stdout ("null: null" when nothing matches)
get_subsection_name() {
  local label
  label=$(jq -r --arg section_id "$2" '
    .parse.sections
    | map(select(.number == $section_id))
    | first
    | "\(.number): \(.line)"' <<< "$1")
  # %b expands backslash escapes in the argument, as echo -e would.
  printf '%b\n' "${italic}${label}${reset}"
}
# Resolve a user-supplied section reference to its number and title.
# Arguments: $1 - JSON response of the parse API (section listing)
#            $2 - section title (compared case-insensitively) or section
#                 number such as "3.1"
# Outputs:   a JSON object {number, name} for the first matching section;
#            both fields are null when nothing matches
get_section_name_and_number() {
  jq -r --arg section_name "$2" '
    ($section_name | ascii_downcase) as $wanted
    | .parse.sections
    | map(select((.line | ascii_downcase) == $wanted or .number == $section_name))
    | first
    | { number: .number, name: .line }' <<< "$1"
}
# List every section nested below a given section as "<number>: <title>".
# Arguments: $1 - JSON response of the parse API (section listing)
#            $2 - number of the parent section, e.g. "3" or "3.1"
# Outputs:   the subsections header followed by one line per descendant;
#            nothing at all when the section has no descendants
get_subsections() {
  local listing
  # A descendant's number is the parent's number followed by a dot and more
  # components. Matching on that dotted prefix works at any depth: comparing
  # only the first component never matched when the parent itself was nested
  # (e.g. "3.1"). The trailing dot keeps "3" from matching "30.1".
  listing=$(jq -r --arg section_num "$2" '.parse.sections[]
    | select(.number | startswith($section_num + "."))
    | "\(.number): \(.line)"' <<< "$1")
  if [[ -n "$listing" ]]; then
    print_subsections_header
    printf '%s\n' "$listing"
  fi
}
# Print the zero-based offset of the first occurrence of a substring.
# Arguments: $1 - string to search in
#            $2 - substring to look for (taken literally, not as a pattern)
# Outputs:   the offset on stdout, or -1 when the substring does not occur
strindex() {
  local haystack=$1 needle=$2
  # Everything before the first occurrence; equals the whole string if absent.
  local before=${haystack%%"$needle"*}
  if [[ "$before" == "$haystack" ]]; then
    echo -1
  else
    echo "${#before}"
  fi
}
# Print the first sentence of a text, including its closing period.
# A sentence ends at the first period that is followed by a space or a line
# break. A text with no such period that ends in a period is treated as one
# whole sentence. Looking only for ". " let a sentence that ended its line run
# on into the next line, and printed nothing for a lone sentence.
# Arguments: $1 - text to cut
# Outputs:   the sentence on stdout; nothing when no sentence end is found
first_sentence() {
  local text=$1
  local at_space at_newline end=-1
  at_space=$(strindex "$text" ". ")
  at_newline=$(strindex "$text" $'.\n')

  # Take whichever terminator comes first.
  if (( at_space >= 0 )); then
    end=$at_space
  fi
  if (( at_newline >= 0 && (end < 0 || at_newline < end) )); then
    end=$at_newline
  fi
  # No terminator inside the text: accept a period that closes the text.
  if (( end < 0 )) && [[ "$text" == *. ]]; then
    end=$(( ${#text} - 1 ))
  fi

  (( end >= 0 )) || return 0
  printf '%s\n' "${text:0:end + 1}"
}
# Print the first sentence of a named section of a page extract.
# Arguments: $1 - plain-text extract in which section headers look like
#                 "== Title ==" (with more equal signs for deeper levels)
#            $2 - section title, matched case-insensitively
# Outputs:   the first sentence found in the two lines following the header
get_section_content() {
  local section_content
  # -F        treat the title literally, so "C++" or "[draft]" cannot be
  #           misread as a regular expression
  # "== T ==" match the complete header, so "History" does not also select
  #           "== History of use ==" (deeper "=== T ===" headers still
  #           contain this string)
  # -m 1      stop at the first header so text from several sections is never
  #           concatenated
  # tail then drops the header line itself.
  section_content=$(
    grep -F -i -m 1 -A 2 -- "== $2 ==" <<< "$1" | tail -n +2
  )
  first_sentence "$section_content"
}
# Print a page title in bold blue, surrounded by blank lines.
# Globals:   bold, blue, reset (read)
# Arguments: $1 - title text
# Outputs:   blank line, styled title, blank line on stdout
print_title() {
  # The styling sequences are passed as arguments rather than spliced into
  # the format string, so any '%' or backslash they contain prints literally.
  printf '\n%s%s%s%s\n\n' "$bold" "$blue" "$1" "$reset"
}
# Print the underlined "Subsections" heading with a blank line above and below.
# Globals:   underline, reset (read)
print_subsections_header() {
  # %b expands the backslash escapes in the argument, as echo -e would.
  printf '%b\n' "\n${underline}Subsections${reset}\n"
}
# Entry point: look up a Wikipedia page and print either its opening sentence
# with its top-level sections, or one section's opening sentence with the
# sections nested below it.
# Arguments: $1 - search term (required)
#            $2 - section title or section number (optional)
# Outputs:   the page summary on stdout; diagnostics on stderr
# Exits:     1 when no search term is given, the search has no hits, or the
#            requested section is not found on the page
main() {
  local search_result total_hits page_id page_title sections content
  local name_and_number name number

  if [[ $# -eq 0 ]]; then
    echo 'please provide a search term' >&2
    exit 1
  fi

  search_result=$(remote_search "$1")
  total_hits=$(jq -r '.query.searchinfo.totalhits' <<< "$search_result")
  if [[ $total_hits -eq 0 ]]; then
    echo "Your search '$1' did not match any wikipedia pages." >&2
    exit 1
  fi

  page_id=$(jq -r '.query.search[0].pageid' <<< "$search_result")
  page_title=$(jq -r '.query.search[0].title' <<< "$search_result")
  sections=$(remote_sections "$page_id")

  if [[ $# -gt 1 ]]; then
    name_and_number=$(get_section_name_and_number "$sections" "$2")
    name=$(jq -r '.name' <<< "$name_and_number")
    number=$(jq -r '.number' <<< "$name_and_number")
    if [[ $number == 'null' ]]; then
      echo "The section '$2' does not exist on the page '$1'." >&2
      exit 1
    fi
  fi

  # The extract is downloaded only once the request is known to be valid, so
  # an unknown section no longer costs a full-page fetch.
  content=$(get_extracted_text_for_page "$page_id")

  print_title "$page_title"
  if [[ $# -gt 1 ]]; then
    get_subsection_name "$sections" "$number"
    get_section_content "$content" "$name"
    get_subsections "$sections" "$number"
  else
    first_sentence "$content"
    get_top_level_sections "$sections"
  fi
}
# Run the script, forwarding every command-line argument to main unchanged.
main "$@"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment