Created
July 17, 2023 01:55
-
-
Save rtravitz/f6ed62bf9cb7d185ddf04f2aaf61a77a to your computer and use it in GitHub Desktop.
Query wikipedia pages
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash | |
# Usage | |
# wiki.sh <page to search> [subsection] | |
# | |
# Examples | |
# ./wiki.sh walrus - shows a sentence and sections of a page about Walruses. | |
# ./wiki.sh walrus anatomy - shows a sentence from that section and its subsections. | |
# ./wiki.sh walrus 3.1 - shows the subsection for "Tusks and dentition." It could also
# be accessed like ./wiki.sh walrus 'tusks and dentition'.
# Terminal styling sequences read by print_title, print_subsections_header and
# get_subsection_name.
#
# Styling is only enabled when stdout is a terminal, so piping or redirecting
# the output does not embed escape sequences in it. Every tput lookup is allowed
# to fail quietly (unset or unknown TERM, or a terminal lacking the capability,
# e.g. italics via `sitm`); the variable then stays empty and the text is simply
# printed unstyled.
bold=''
blue=''
underline=''
italic=''
reset=''
if [[ -t 1 ]]; then
  bold=$(tput bold 2>/dev/null) || bold=''
  blue=$(tput setaf 4 2>/dev/null) || blue=''
  underline=$(tput smul 2>/dev/null) || underline=''
  italic=$(tput sitm 2>/dev/null) || italic=''
  reset=$(tput sgr0 2>/dev/null) || reset=''
fi
# The search API returns a list of matching articles. This script only uses it to get the official title
# and an ID for the page used in other API queries.
# More on the search API: https://www.mediawiki.org/wiki/API:Search
#
# Arguments: $1 - search phrase, taken verbatim (may contain spaces, '&', '#', ...)
# Outputs:   whatever curl prints for the request (the API response body) on stdout
remote_search() {
  # --get turns the --data* values into the URL query string, and
  # --data-urlencode percent-encodes the whole phrase. Escaping only spaces
  # would let characters such as '&', '#', '+' or '%' corrupt the query.
  curl -s --get \
    --data 'action=query&format=json&list=search&srlimit=1' \
    --data-urlencode "srsearch=$1" \
    'https://en.wikipedia.org/w/api.php'
}
# Asks the parse API for the list of sections on a page. Of each section this
# script only uses 'line' (the human readable section name) and 'number'
# (its position in the outline, e.g. '5' or '5.2.1').
# More on the parsing API: https://www.mediawiki.org/wiki/API:Parsing_wikitext#
#
# Arguments: $1 - page ID
# Outputs:   whatever curl prints for the request on stdout
remote_sections() {
  local endpoint='https://en.wikipedia.org/w/api.php'
  local query="action=parse&format=json&pageid=$1&prop=sections"
  curl -s "${endpoint}?${query}"
}
# The extract extension can return the content of a page in mostly plain text.
# The only markup left is that section headers are preceded by a number of equal
# signs equivalent to their level in the hierarchy.
# More on the extract extension: https://www.mediawiki.org/wiki/Extension:TextExtracts#API
#
# Arguments: $1 - page ID
# Outputs:   whatever curl prints for the request on stdout
remote_content() {
  local url
  printf -v url \
    'https://en.wikipedia.org/w/api.php?format=json&action=query&prop=extracts&explaintext&redirects=1&pageids=%s' \
    "$1"
  curl -s "$url"
}
# Prints the plain-text extract of a page.
#
# Arguments: $1 - page ID; also used as the key into .query.pages in the response
# Outputs:   the value of the page's 'extract' field, as raw text, on stdout
get_extracted_text_for_page() {
  local id=$1
  # $page_id below is a jq variable (bound via --arg), not a shell expansion.
  local extract_filter='.query.pages[$page_id].extract'
  remote_content "$id" | jq -r --arg page_id "$id" "$extract_filter"
}
# Prints the top-level sections of a page as "<number>: <name>" lines, preceded
# by the "Subsections" header. Prints nothing when there are none.
#
# Arguments: $1 - JSON response of the parse API (see remote_sections)
# Outputs:   header plus one line per section whose level is "2"
get_top_level_sections() {
  # Kept local so the listing does not leak into (or clobber) global state.
  local listing
  listing=$(jq -r '.parse.sections[]
    | select(.level == "2")
    | "\(.number): \(.line)"' <<<"$1")

  if [[ -n "$listing" ]]; then
    print_subsections_header
    printf '%s\n' "$listing"
  fi
}
# Prints "<number>: <name>" in italics for the section with the given number.
#
# Globals:   italic, reset (read)
# Arguments: $1 - JSON response of the parse API (see remote_sections)
#            $2 - section number to look up, e.g. '3' or '3.1'
# Outputs:   the styled label on stdout
get_subsection_name() {
  local label
  label=$(jq -r --arg section_id "$2" '.parse.sections
    | map(select(.number == $section_id))
    | "\(.[0].number): \(.[0].line)"' <<<"$1")

  # printf with %s prints the title verbatim; `echo -e` would expand backslash
  # sequences (\n, \t, ...) that happen to appear in a section title.
  printf '%s%s%s\n' "$italic" "$label" "$reset"
}
# Resolves a user-supplied section reference to the section's number and name.
# The reference matches either a section name (case-insensitively, ASCII only)
# or a section number exactly.
#
# Arguments: $1 - JSON response of the parse API (see remote_sections)
#            $2 - section name or number as typed by the user
# Outputs:   a JSON object { number, name } for the first match; both fields
#            are null when nothing matches
get_section_name_and_number() {
  local wanted=$2
  jq -r --arg section_name "$wanted" '
    [ .parse.sections[]
      | select(
          ((.line | ascii_downcase) == ($section_name | ascii_downcase))
          or (.number == $section_name)
        )
    ]
    | .[0]
    | { number: .number, name: .line }' <<<"$1"
}
# Prints every section nested below the given section number as
# "<number>: <name>" lines, preceded by the "Subsections" header. Prints nothing
# when the section has no descendants.
#
# Arguments: $1 - JSON response of the parse API (see remote_sections)
#            $2 - number of the parent section, e.g. '3' or '3.1'
# Outputs:   header plus one line per descendant section
get_subsections() {
  local listing
  # A descendant's number starts with "<parent>.". Comparing only the first
  # dotted component would work for top-level parents ('3') but can never match
  # a nested parent ('3.1'), so its children ('3.1.1') would be hidden. The
  # trailing dot keeps '3' from matching '31.x'.
  listing=$(jq -r --arg section_num "$2" '.parse.sections[]
    | select(.number | startswith($section_num + "."))
    | "\(.number): \(.line)"' <<<"$1")

  if [[ -n "$listing" ]]; then
    print_subsections_header
    printf '%s\n' "$listing"
  fi
}
# Prints the zero-based index of the first occurrence of a substring, or -1 when
# the substring does not occur.
#
# Arguments: $1 - string to search in
#            $2 - substring to look for (matched literally)
# Outputs:   the index, or -1, on stdout
strindex() {
  local haystack=$1 needle=$2 before
  # Drop the first occurrence of the needle and everything after it; what is
  # left is the text in front of the match. The inner quotes keep the needle
  # from being treated as a glob pattern.
  before=${haystack%%"$needle"*}

  if [[ "$before" == "$haystack" ]]; then
    # Nothing was removed, so the needle is not present.
    printf '%s\n' -1
  else
    printf '%s\n' "${#before}"
  fi
}
# Prints the text up to and including the first period that is followed by a
# space. Prints nothing when the text contains no ". " at all.
#
# Arguments: $1 - text to take the first sentence from
# Outputs:   the first sentence on stdout
first_sentence() {
  local end
  end=$(strindex "$1" ". ")

  if [[ "$end" -ge 0 ]]; then
    # strindex points at the period itself; +1 keeps it in the output.
    printf '%s\n' "${1:0:end+1}"
  fi
}
# Prints the first sentence of a section of the plain-text page extract.
#
# Arguments: $1 - page extract, with section headers written as
#                 "== Name ==", "=== Name ===", ...
#            $2 - section name (matched case-insensitively)
# Outputs:   the section's first sentence on stdout (see first_sentence)
get_section_content() {
  local body
  # -F matches the title literally, so names containing regex metacharacters
  # (e.g. "A* search") are found. The closing " ==" anchors the end of the
  # title, so "History" does not also hit "History of ...". Because deeper
  # headers only add '=' signs on both sides, the pattern matches at any level.
  # -A 2 keeps the two lines after the header; tail drops the header itself.
  body=$(grep -i -F -A 2 -- "== $2 ==" <<<"$1" | tail -n +2)
  first_sentence "$body"
}
# Prints a page title in bold blue, with a blank line before and after it.
#
# Globals:   bold, blue, reset (read)
# Arguments: $1 - title text
# Outputs:   the styled title on stdout
print_title() {
  # The style variables are passed as arguments rather than spliced into the
  # format string, so a '%' or backslash in them can never be misread as a
  # printf directive.
  printf '\n%s%s%s%s\n\n' "$bold" "$blue" "$1" "$reset"
}
# Prints the underlined "Subsections" heading, with a blank line before and
# after it.
#
# Globals:   underline, reset (read)
# Outputs:   the styled heading on stdout
print_subsections_header() {
  # printf instead of `echo -e`: identical output, but the style variables are
  # printed verbatim instead of being re-scanned for backslash escapes.
  printf '\n%sSubsections%s\n\n' "$underline" "$reset"
}
# Entry point: looks up a Wikipedia page and prints a short summary of it.
#
# With one argument, prints the page title, the first sentence of the page and
# its top-level sections. With two arguments, prints the title, the requested
# section's label, that section's first sentence and its subsections.
#
# Arguments: $1 - search term (required)
#            $2 - section name or number (optional)
# Outputs:   the summary on stdout; diagnostics on stderr
# Exits:     1 when no search term is given, the search has no hits, or the
#            requested section does not exist
main() {
  local search_result total_hits page_id page_title sections content
  local name_and_number name number

  if [[ $# -eq 0 ]]; then
    echo 'please provide a search term' >&2
    exit 1
  fi

  search_result=$(remote_search "$1")
  total_hits=$(jq -r '.query.searchinfo.totalhits' <<<"$search_result")
  # Anything that is not a positive count (0, 'null', or empty output after a
  # failed request) is treated as "no hits".
  if [[ ! "$total_hits" =~ ^[0-9]+$ || "$total_hits" == 0 ]]; then
    echo "Your search '$1' did not match any wikipedia pages." >&2
    exit 1
  fi

  page_id=$(jq -r '.query.search[0].pageid' <<<"$search_result")
  page_title=$(jq -r '.query.search[0].title' <<<"$search_result")
  sections=$(remote_sections "$page_id")
  content=$(get_extracted_text_for_page "$page_id")

  if [[ $# -gt 1 ]]; then
    name_and_number=$(get_section_name_and_number "$sections" "$2")
    name=$(jq -r '.name' <<<"$name_and_number")
    number=$(jq -r '.number' <<<"$name_and_number")
    # jq prints the JSON null of an unmatched section as the string 'null'.
    if [[ "$number" == 'null' ]]; then
      echo "The section '$2' does not exist on the page '$1'." >&2
      exit 1
    fi

    print_title "$page_title"
    get_subsection_name "$sections" "$number"
    get_section_content "$content" "$name"
    get_subsections "$sections" "$number"
  else
    print_title "$page_title"
    first_sentence "$content"
    get_top_level_sections "$sections"
  fi
}
# Run the script, forwarding all command-line arguments unchanged.
main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment