Displays a graph of the most visited entities in Wikipedia, in Bash and Python versions
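Both versions accept the same options; a typical run (assuming the files are saved as wikitrends.py and wikitrends.sh) might look like:

    ./wikitrends.py --wiki-lang en --year-month 2019/06 --number 30
    ./wikitrends.sh -l en -m 2019/06 -n 30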
#!/usr/bin/env python3

from argparse import ArgumentParser, ArgumentTypeError
from datetime import date, datetime, timedelta
import json
import re
from string import Template
import urllib.parse
import urllib.request
import webbrowser

SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
ACCEPT_JSON = {"Accept": "application/json"}

def positive_int(s):
    i = int(s)
    if i <= 0:
        raise ArgumentTypeError("{} must be a positive number!".format(s))
    else:
        return i

def wiki_lang_exists(lang):
    """Validate if a Wikipedia for the given language code exists."""
    query = Template("""
    PREFIX wd: <http://www.wikidata.org/entity/>
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>
    ASK {
      "$LANG" ^wdt:P424/wdt:P31 wd:Q10876391 .
    }
    """).substitute({"LANG": lang})
    data = urllib.parse.urlencode({"query": query}).encode("utf-8")
    request = urllib.request.Request(SPARQL_ENDPOINT, data, ACCEPT_JSON)
    with urllib.request.urlopen(request) as response:
        return json.loads(response.read())["boolean"]
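# For illustration: wiki_lang_exists("en") asks whether some item with the
# Wikimedia language code (wdt:P424) "en" is an instance of (wdt:P31) a
# Wikipedia language edition (wd:Q10876391); the endpoint answers with a JSON
# document like {"head": {}, "boolean": true}.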
def is_wiki_lang(s):
    if re.match("^[a-z]{2,}$", s) and wiki_lang_exists(s):
        return s
    else:
        raise ArgumentTypeError("{} is not a valid language code!".format(s))

def is_year_month(s):
    min_date = datetime(2015, 10, 1)
    if re.match("^[0-9]{4}/[0-9]{2}$", s) and datetime.strptime(s, "%Y/%m") >= min_date:
        return s
    else:
        raise ArgumentTypeError("""
        Year and month {} must be formatted as YYYY/MM, no sooner than 2015/10!
        """.strip().format(s))
def top_pages(wiki_lang, year_month, number):
    """Get the most visited Wikipedia pages for the specified month."""
    def to_wikipedia_url(page):
        return "https://{}.wikipedia.org/wiki/{}".format(
            wiki_lang,
            urllib.parse.quote(page, safe = "()")
        )
    endpoint = "https://wikimedia.org/api/rest_v1/metrics/pageviews/top"
    url = "{}/{}.wikipedia/all-access/{}/all-days".format(
        endpoint,
        wiki_lang,
        year_month)
    with urllib.request.urlopen(url) as response:
        pages = json.loads(response.read())["items"][0]["articles"][:number]
        return {to_wikipedia_url(page["article"]): page["views"] for page in pages}
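# Shape of the returned mapping (the view counts here are made up):
# {"https://en.wikipedia.org/wiki/Main_Page": 12345678,
#  "https://en.wikipedia.org/wiki/Special:Search": 2345678, ...}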
def query_entities(pages):
    """Match Wikipedia pages to Wikidata entities."""
    template = Template("""
    PREFIX : <http://schema.org/>
    SELECT ?page ?wikidata
    WHERE {
      VALUES ?page {
        $PAGES
      }
      ?page :about ?wikidata .
    }
    """)
    query = template.substitute({"PAGES": "\n".join("<{}>".format(p) for p in pages.keys())})
    data = urllib.parse.urlencode({"query": query}).encode("utf-8")
    request = urllib.request.Request(SPARQL_ENDPOINT, data, ACCEPT_JSON)
    with urllib.request.urlopen(request) as response:
        entities = json.loads(response.read())["results"]["bindings"]
        return [(entity["wikidata"]["value"], pages[entity["page"]["value"]])
                for entity in entities
                if entity["page"]["value"] in pages]
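# Each result pairs a Wikidata entity IRI with its page's view count, e.g.
# ("http://www.wikidata.org/entity/Q42", 98765) with an invented count; pages
# without a matching Wikidata entity simply drop out of the list.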
def wikidata_graph(wiki_lang, entities):
    """Generate a query for relations between the entities."""
    template = Template("""
    PREFIX bd: <http://www.bigdata.com/rdf#>
    PREFIX wikibase: <http://wikiba.se/ontology#>
    #defaultView:Graph
    SELECT ?a ?aLabel ?aViews ?b ?bLabel ?bViews
    WHERE {
      VALUES (?a ?aViews) {
        $ENTITIES
      }
      VALUES (?b ?bViews) {
        $ENTITIES
      }
      ?a ?p ?b .
      FILTER (!sameTerm(?a, ?b))
      SERVICE wikibase:label {
        bd:serviceParam wikibase:language "[AUTO_LANGUAGE],$WIKILANG" .
      }
    }
    """)
    query = template.substitute({
        "ENTITIES": "\n".join("(<{}> {})".format(entity, rank)
                              for (entity, rank) in entities),
        "WIKILANG": wiki_lang
    })
    return "https://query.wikidata.org/embed.html#{}".format(urllib.parse.quote(query))
def wikitrends(wiki_lang, year_month, number):
    """Open a graph of relations between the month's most visited entities."""
    pages = top_pages(wiki_lang = wiki_lang, year_month = year_month, number = number)
    url = wikidata_graph(wiki_lang, query_entities(pages))
    webbrowser.open_new_tab(url)

if __name__ == "__main__":
    parser = ArgumentParser(description = "Displays a graph of the most visited entities in Wikipedia")
    parser.add_argument("-l", "--wiki-lang",
                        type = is_wiki_lang,
                        default = "cs",
                        help = "Language version of Wikipedia (e.g., en, cs, sk).")
    parser.add_argument("-m", "--year-month",
                        type = is_year_month,
                        default = (date.today() - timedelta(days = 30)).strftime("%Y/%m"),
                        help = "Month of interest, formatted as YYYY/MM.")
    parser.add_argument("-n", "--number",
                        type = positive_int,
                        default = 50,
                        help = "Number of entities.")
    args = vars(parser.parse_args())
    wikitrends(**args)
#!/usr/bin/env bash
#
# Displays a graph of the most visited entities in Wikipedia
#
## Usage
# ./wikitrends.sh -h

set -e
shopt -s extglob

die () {
  echo >&2 "$@"
  exit 1
}

PAGEVIEWS_ENDPOINT=https://wikimedia.org/api/rest_v1/metrics/pageviews/top
SPARQL_ENDPOINT=https://query.wikidata.org/sparql

command -v jq >/dev/null 2>&1 ||
  die 'Please install jq (https://stedolan.github.io/jq/)!'

usage () {
  echo "Displays a graph of the most visited entities in Wikipedia.

Usage: $(basename "$0") [-h] -l language -m year_month -n number

  [language]   = Language version of Wikipedia (e.g., en, cs, sk). Default = cs
  [year_month] = Month of interest, formatted as YYYY/MM. Default = previous month
  [number]     = Number of entities. Default = 50
  -h           = Print this help message"
}

# Argument default values
WIKILANG=cs
YEAR_MONTH=$(date -v-1m +%Y/%m)
NUMBER=50
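# Note: "date -v-1m" is BSD/macOS syntax; on GNU coreutils the previous month
# would be, e.g., YEAR_MONTH=$(date -d "1 month ago" +%Y/%m).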
while getopts :l:m:n:h OPT
do
  case $OPT in
    l) WIKILANG="$OPTARG" ;;
    m) YEAR_MONTH="$OPTARG" ;;
    n) NUMBER="$OPTARG" ;;
    h) usage; exit ;;
    *) usage; exit 1 ;; # unknown option or missing option argument
  esac
done

# Get the most visited Wikipedia pages for the specified month
PAGES=$(curl \
  --silent \
  ${PAGEVIEWS_ENDPOINT}/${WIKILANG}.wikipedia/all-access/${YEAR_MONTH}/all-days |
  jq \
    --raw-output \
    '.items[].articles[] | "(\"\(.article | @uri)\" \(.views))"' |
  head -n ${NUMBER})
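# Each line of $PAGES is now one SPARQL VALUES row pairing a percent-encoded
# title with its view count, e.g. (the count is invented):
# ("Main_Page" 12345678)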
# Match Wikipedia pages to Wikidata entities
QUERY_ENTITIES=$(mktemp)
cat <<EOF > ${QUERY_ENTITIES}
PREFIX : <http://schema.org/>
SELECT ?wikidata ?views
WHERE {
  VALUES (?page ?views) {
    $PAGES
  }
  BIND (iri(concat("https://${WIKILANG}.wikipedia.org/wiki/", ?page)) AS ?wikipedia)
  ?wikipedia :about ?wikidata .
}
EOF
ENTITIES=$(curl \
  --silent \
  -H "Accept: text/tab-separated-values" \
  --data-urlencode query@${QUERY_ENTITIES} \
  ${SPARQL_ENDPOINT} |
  tail -n+2 |
  sed -e 's/^/(/' -e 's/$/)/')
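# tail -n+2 strips the TSV header and sed wraps each "<entity> views" row in
# parentheses, yielding VALUES rows such as (the count is invented):
# (<http://www.wikidata.org/entity/Q42> 98765)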
# Generate a query for relations between the entities
QUERY_GRAPH=$(mktemp)
cat <<EOF > ${QUERY_GRAPH}
PREFIX bd: <http://www.bigdata.com/rdf#>
PREFIX wikibase: <http://wikiba.se/ontology#>
#defaultView:Graph
SELECT ?a ?aLabel ?aViews ?b ?bLabel ?bViews
WHERE {
  VALUES (?a ?aViews) {
    $ENTITIES
  }
  VALUES (?b ?bViews) {
    $ENTITIES
  }
  ?a ?p ?b .
  FILTER (!sameTerm(?a, ?b))
  SERVICE wikibase:label {
    bd:serviceParam wikibase:language "[AUTO_LANGUAGE],${WIKILANG}" .
  }
}
EOF
ENCODED_QUERY=$(jq \
  --raw-input \
  --raw-output \
  --slurp \
  "@uri" < ${QUERY_GRAPH})
# Open the visualization in the browser
open https://query.wikidata.org/embed.html#$ENCODED_QUERY
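# Note: "open" is macOS-specific; on most Linux desktops the equivalent would
# be xdg-open.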