Skip to content

Instantly share code, notes, and snippets.

@jindrichmynarz
Last active July 7, 2019 18:42
Show Gist options
  • Save jindrichmynarz/a91196f3cfa0b60b05ecceff1b0c95b5 to your computer and use it in GitHub Desktop.
Save jindrichmynarz/a91196f3cfa0b60b05ecceff1b0c95b5 to your computer and use it in GitHub Desktop.
Displays a graph of the most visited entities in Wikipedia, in Bash and Python versions
#!/usr/bin/env python3
from argparse import ArgumentParser, ArgumentTypeError
from datetime import date, datetime, timedelta
import json
import re
from string import Template
import urllib.parse
import urllib.request
import webbrowser
# URL of the Wikidata SPARQL endpoint that the queries in this script are sent to.
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
# HTTP request headers asking the endpoint for a JSON response.
ACCEPT_JSON = dict(Accept="application/json")
def positive_int(s):
    """Parse a command-line argument as a strictly positive integer.

    Intended for use as an argparse ``type=`` callable.

    Args:
        s: The raw argument string.

    Returns:
        The parsed integer, which is always greater than zero.

    Raises:
        ArgumentTypeError: If ``s`` is not an integer literal or is not
            greater than zero.
    """
    message = "{} must be a positive number!".format(s)
    try:
        i = int(s)
    except ValueError:
        # Non-numeric input gets the same explanatory message as a
        # non-positive number instead of a bare ValueError.
        raise ArgumentTypeError(message) from None
    if i <= 0:
        raise ArgumentTypeError(message)
    return i
def wiki_lang_exists(lang):
    """Check whether a Wikipedia exists for the given language code.

    Sends a SPARQL ASK query to SPARQL_ENDPOINT and returns the value of
    the "boolean" field of the JSON response.
    """
    ask_template = Template("""
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
ASK {
"$LANG" ^wdt:P424/wdt:P31 wd:Q10876391 .
}
""")
    sparql = ask_template.substitute(LANG=lang)
    # Passing a request body makes urllib issue a POST.
    payload = urllib.parse.urlencode({"query": sparql}).encode("utf-8")
    ask = urllib.request.Request(SPARQL_ENDPOINT, data=payload, headers=ACCEPT_JSON)
    with urllib.request.urlopen(ask) as reply:
        answer = json.loads(reply.read())
    return answer["boolean"]
def is_wiki_lang(s):
    """Validate a Wikipedia language code given on the command line.

    Intended for use as an argparse ``type=`` callable.

    Args:
        s: The raw argument string.

    Returns:
        ``s`` unchanged when it consists of two or more lowercase ASCII
        letters and ``wiki_lang_exists`` confirms it.

    Raises:
        ArgumentTypeError: If ``s`` is malformed or no such Wikipedia exists.
    """
    # fullmatch, not match with "$": "$" also matches just before a trailing
    # newline, which would let "en\n" through into the SPARQL query and URL.
    # The cheap syntactic check runs first so that malformed input never
    # triggers a network request.
    if re.fullmatch("[a-z]{2,}", s) and wiki_lang_exists(s):
        return s
    raise ArgumentTypeError("{} is not a valid language code!".format(s))
def is_year_month(s):
    """Validate a month given on the command line as YYYY/MM.

    Intended for use as an argparse ``type=`` callable.

    Args:
        s: The raw argument string.

    Returns:
        ``s`` unchanged when it is a real calendar month formatted as
        YYYY/MM and not earlier than 2015/10.

    Raises:
        ArgumentTypeError: If ``s`` is malformed, names a non-existent month
            (e.g. 2016/13) or is earlier than 2015/10.
    """
    min_date = datetime(2015, 10, 1)
    message = "Year and month {} must be formatted as YYYY/MM, no sooner than 2015/10!".format(s)
    # fullmatch rejects a trailing newline, which "$" with re.match accepts.
    if not re.fullmatch("[0-9]{4}/[0-9]{2}", s):
        raise ArgumentTypeError(message)
    try:
        parsed = datetime.strptime(s, "%Y/%m")
    except ValueError:
        # Well-formed digits but not a real month (00, 13, ...): report it
        # with the same message instead of leaking a bare ValueError.
        raise ArgumentTypeError(message) from None
    if parsed < min_date:
        raise ArgumentTypeError(message)
    return s
def top_pages(wiki_lang, year_month, number):
    """Fetch the most visited Wikipedia pages for the given month.

    Requests the page-view "top" report for ``wiki_lang``.wikipedia and
    ``year_month`` and keeps the first ``number`` articles of its first item.

    Returns:
        A dict mapping each article's Wikipedia URL to its "views" value.
    """
    endpoint = "https://wikimedia.org/api/rest_v1/metrics/pageviews/top"
    url = "{}/{}.wikipedia/all-access/{}/all-days".format(endpoint, wiki_lang, year_month)
    with urllib.request.urlopen(url) as response:
        report = json.loads(response.read())
    ranked = report["items"][0]["articles"][:number]

    views_by_url = {}
    for entry in ranked:
        # Percent-encode the title, leaving parentheses as they are.
        title = urllib.parse.quote(entry["article"], safe="()")
        page_url = "https://{}.wikipedia.org/wiki/{}".format(wiki_lang, title)
        views_by_url[page_url] = entry["views"]
    return views_by_url
def query_entities(pages):
    """Match Wikipedia pages to Wikidata entities.

    Args:
        pages: Mapping from Wikipedia page URL to a value to carry over
            (the caller in this file passes page-view counts).

    Returns:
        A list of ``(wikidata_entity_uri, value)`` tuples, one per query
        result whose page URL is a key of ``pages``.
    """
    page_iris = "\n".join("<{}>".format(page_url) for page_url in pages)
    sparql = Template("""
PREFIX : <http://schema.org/>
SELECT ?page ?wikidata
WHERE {
VALUES ?page {
$PAGES
}
?page :about ?wikidata .
}
""").substitute(PAGES=page_iris)
    payload = urllib.parse.urlencode({"query": sparql}).encode("utf-8")
    select = urllib.request.Request(SPARQL_ENDPOINT, payload, ACCEPT_JSON)
    with urllib.request.urlopen(select) as reply:
        bindings = json.loads(reply.read())["results"]["bindings"]

    matched = []
    for binding in bindings:
        page_url = binding["page"]["value"]
        # Skip results whose page URL does not match one of the keys sent.
        if page_url in pages:
            matched.append((binding["wikidata"]["value"], pages[page_url]))
    return matched
def wikidata_graph(wiki_lang, entities):
    """Build a Wikidata Query Service URL graphing relations between entities.

    Args:
        wiki_lang: Language code appended after ``[AUTO_LANGUAGE]`` in the
            label service configuration.
        entities: Iterable of ``(entity_uri, views)`` pairs.

    Returns:
        The embed URL whose fragment is the percent-encoded SPARQL query.
    """
    # One "(<uri> views)" row per entity; the same rows feed both VALUES blocks.
    rows = ["(<{}> {})".format(uri, views) for uri, views in entities]
    sparql = Template("""
PREFIX bd: <http://www.bigdata.com/rdf#>
PREFIX wikibase: <http://wikiba.se/ontology#>
#defaultView:Graph
SELECT ?a ?aLabel ?aViews ?b ?bLabel ?bViews
WHERE {
VALUES (?a ?aViews) {
$ENTITIES
}
VALUES (?b ?bViews) {
$ENTITIES
}
?a ?p ?b .
FILTER (!sameTerm(?a, ?b))
SERVICE wikibase:label {
bd:serviceParam wikibase:language "[AUTO_LANGUAGE],$WIKILANG" .
}
}
""").substitute(ENTITIES="\n".join(rows), WIKILANG=wiki_lang)
    encoded = urllib.parse.quote(sparql)
    return "https://query.wikidata.org/embed.html#{}".format(encoded)
def wikitrends(wiki_lang, year_month, number):
    """Open a browser tab graphing the most visited entities of a Wikipedia.

    Fetches the top pages, matches them to Wikidata entities, builds the
    query URL and hands it to the default web browser.
    """
    page_views = top_pages(wiki_lang=wiki_lang, year_month=year_month, number=number)
    entity_views = query_entities(page_views)
    graph_url = wikidata_graph(wiki_lang, entity_views)
    webbrowser.open_new_tab(graph_url)
def previous_month(today):
    """Return the calendar month preceding ``today``, formatted as YYYY/MM.

    Args:
        today: A ``datetime.date`` (or ``datetime.datetime``).

    Returns:
        The previous month as a ``YYYY/MM`` string.
    """
    # Stepping one day back from the first of the month always lands in the
    # previous month. Subtracting a fixed 30 days does not: on the 31st it
    # stays in the current month, and on 1-2 March it reaches January.
    last_day_of_previous_month = today.replace(day=1) - timedelta(days=1)
    return last_day_of_previous_month.strftime("%Y/%m")


if __name__ == "__main__":
    # Command-line entry point: parse and validate the options, then open
    # the graph for the requested Wikipedia, month and number of entities.
    parser = ArgumentParser(description="Displays a graph of the most visited entities in Wikipedia")
    parser.add_argument("-l", "--wiki-lang",
                        type=is_wiki_lang,
                        default="cs",
                        help="Language version of Wikipedia (e.g., en, cs, sk).")
    parser.add_argument("-m", "--year-month",
                        type=is_year_month,
                        default=previous_month(date.today()),
                        help="Month of interest, formatted as YYYY/MM.")
    parser.add_argument("-n", "--number",
                        type=positive_int,
                        default=50,
                        help="Number of entities.")
    # The option destinations (wiki_lang, year_month, number) match the
    # parameter names of wikitrends, so the namespace can be splatted.
    args = vars(parser.parse_args())
    wikitrends(**args)
#!/usr/bin/env bash
#
# Displays a graph of the most visited entities in Wikipedia
#
## Usage
# ./wikitrends.sh -h
# Abort the script as soon as a command fails
set -o errexit
# Turn on extended pattern matching operators
shopt -s extglob
# Print the given message to standard error and exit with status 1.
function die {
  echo "$@" 1>&2
  exit 1
}
# Base URL of the page-view "top" report and URL of the Wikidata SPARQL endpoint
PAGEVIEWS_ENDPOINT="https://wikimedia.org/api/rest_v1/metrics/pageviews/top"
SPARQL_ENDPOINT="https://query.wikidata.org/sparql"

# The script relies on jq; stop early with a hint when it is not installed
if ! command -v jq >/dev/null 2>&1
then
  die 'Please install jq (https://stedolan.github.io/jq/)!'
fi
# Print the help message to standard output.
usage () {
  cat <<EOF
Displays a graph of the most visited entities in Wikipedia.
Usage: $(basename "$0") [-h] -l language -m year_month -n number
[language] = Language version of Wikipedia (e.g., en, cs, sk). Default = cs
[year_month] = Month of interest, formatted as YYYY/MM. Default = previous month
[number] = Number of entities. Default = 50
-h = Print this help message
EOF
}
# Argument default values
WIKILANG=cs
# Previous month as YYYY/MM. `date -v-1m` is BSD/macOS syntax that GNU date
# rejects, so fall back to GNU's `-d`. The GNU form is anchored on the 15th
# because "-1 month" counted from the 29th-31st can overflow back into the
# current month.
YEAR_MONTH=$(date -v-1m +%Y/%m 2>/dev/null ||
  date -d "$(date +%Y-%m-15) -1 month" +%Y/%m)
NUMBER=50

# The leading ":" puts getopts in silent mode, so missing arguments (":")
# and unknown options ("?") must be reported here or they are ignored.
while getopts :l:m:n:h OPT
do
  case $OPT in
    l) WIKILANG="$OPTARG" ;;
    m) YEAR_MONTH="$OPTARG" ;;
    n) NUMBER="$OPTARG" ;;
    h) usage; exit ;;
    :) usage >&2; die "Option -${OPTARG} requires an argument!" ;;
    \?) usage >&2; die "Unknown option: -${OPTARG}" ;;
  esac
done

# Apply the same format checks as the Python version of this script. The
# values are interpolated into URLs and SPARQL queries further down, so
# anything unexpected is rejected up front.
WIKILANG_PATTERN='^[a-z]{2,}$'
YEAR_MONTH_PATTERN='^[0-9]{4}/(0[1-9]|1[0-2])$'
NUMBER_PATTERN='^0*[1-9][0-9]*$'
if [[ ! $WIKILANG =~ $WIKILANG_PATTERN ]]
then
  die "${WIKILANG} is not a valid language code!"
fi
# With the fixed-width YYYY/MM format a string comparison orders months correctly
if [[ ! $YEAR_MONTH =~ $YEAR_MONTH_PATTERN || $YEAR_MONTH < "2015/10" ]]
then
  die "Year and month ${YEAR_MONTH} must be formatted as YYYY/MM, no sooner than 2015/10!"
fi
if [[ ! $NUMBER =~ $NUMBER_PATTERN ]]
then
  die "${NUMBER} must be a positive number!"
fi
# Temporary files holding the two SPARQL queries. The trap removes them
# however the script ends, so they no longer pile up in the temp directory.
QUERY_ENTITIES=$(mktemp)
QUERY_GRAPH=$(mktemp)
trap 'rm -f "${QUERY_ENTITIES}" "${QUERY_GRAPH}"' EXIT

# Get the most visited Wikipedia pages for the specified month.
# Each output line has the form ("<percent-encoded title>" <views>).
PAGES=$(curl \
  --silent \
  "${PAGEVIEWS_ENDPOINT}/${WIKILANG}.wikipedia/all-access/${YEAR_MONTH}/all-days" |
  jq \
    --raw-output \
    '.items[].articles[] | "(\"\(.article | @uri)\" \(.views))"' |
  head -n "${NUMBER}")

# Only the exit status of the last pipeline stage is checked, so a failed
# download or an error response shows up here as an empty page list.
if [ -z "${PAGES}" ]
then
  die "No page views found for ${WIKILANG}.wikipedia in ${YEAR_MONTH}!"
fi

# Match Wikipedia pages to Wikidata entities
cat <<EOF > "${QUERY_ENTITIES}"
PREFIX : <http://schema.org/>
SELECT ?wikidata ?views
WHERE {
VALUES (?page ?views) {
$PAGES
}
BIND (iri(concat("https://${WIKILANG}.wikipedia.org/wiki/", ?page)) AS ?wikipedia)
?wikipedia :about ?wikidata .
}
EOF

# Drop the first line of the tab-separated result and wrap every remaining
# row in parentheses so it can be used as a row of a SPARQL VALUES block.
ENTITIES=$(curl \
  --silent \
  -H "Accept: text/tab-separated-values" \
  --data-urlencode "query@${QUERY_ENTITIES}" \
  "${SPARQL_ENDPOINT}" |
  tail -n+2 |
  sed -e 's/^/(/g' -e 's/$/)/g')

# Generate a query for relations between the entities
cat <<EOF > "${QUERY_GRAPH}"
PREFIX bd: <http://www.bigdata.com/rdf#>
PREFIX wikibase: <http://wikiba.se/ontology#>
#defaultView:Graph
SELECT ?a ?aLabel ?aViews ?b ?bLabel ?bViews
WHERE {
VALUES (?a ?aViews) {
$ENTITIES
}
VALUES (?b ?bViews) {
$ENTITIES
}
?a ?p ?b .
FILTER (!sameTerm(?a, ?b))
SERVICE wikibase:label {
bd:serviceParam wikibase:language "[AUTO_LANGUAGE],${WIKILANG}" .
}
}
EOF

# Percent-encode the whole query so it can travel in the URL fragment
ENCODED_QUERY=$(jq \
  --raw-input \
  --raw-output \
  --slurp \
  "@uri" < "${QUERY_GRAPH}")

# Open the visualization in the browser. `open` is the macOS launcher;
# fall back to xdg-open, or print the URL when neither is available.
GRAPH_URL="https://query.wikidata.org/embed.html#${ENCODED_QUERY}"
if command -v open >/dev/null 2>&1
then
  open "${GRAPH_URL}"
elif command -v xdg-open >/dev/null 2>&1
then
  xdg-open "${GRAPH_URL}"
else
  echo "${GRAPH_URL}"
fi
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment