A small Python script to get Wikipedia page content in plain text
# https://en.wikipedia.org/wiki/Python_(programming_language)
# https://www.mediawiki.org/wiki/API:Main_page
# 1. Get a plain text representation of either the entire page or the page "extract" straight from the API with the extracts prop
# Note that this approach only works on MediaWiki sites with the TextExtracts extension. This notably includes Wikipedia, but not some smaller MediaWiki sites like, say, http://www.wikia.com/
# You want to hit a URL like
# https://en.wikipedia.org/w/api.php?action=query&format=json&titles=Bla_Bla_Bla&prop=extracts&exintro&explaintext
# Breaking that down, we've got the following parameters in there (documented at https://www.mediawiki.org/wiki/Extension:TextExtracts#query+extracts):
# action=query, format=json, and titles=Bla_Bla_Bla are all standard MediaWiki API parameters
# prop=extracts makes us use the TextExtracts extension
# exintro limits the response to content before the first section heading
# explaintext makes the extract in the response be plain text instead of HTML
# Then parse the JSON response and pull out the extract:
import requests
response = requests.get(
    'https://en.wikipedia.org/w/api.php',
    params={
        'action': 'query',
        'format': 'json',
        'titles': 'Python_(programming_language)',
        'prop': 'extracts',
        'exintro': True,
        'explaintext': True,
    }).json()
page = next(iter(response['query']['pages'].values()))
print(page['extract'])
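# If you would rather not iterate over the pages dict with next(iter(...)), a sketch of the
# same request using the API's formatversion=2 parameter, which returns query.pages as a
# list of page objects instead of a dict keyed by page id:
response = requests.get(
    'https://en.wikipedia.org/w/api.php',
    params={
        'action': 'query',
        'format': 'json',
        'formatversion': 2,
        'titles': 'Python_(programming_language)',
        'prop': 'extracts',
        'exintro': True,
        'explaintext': True,
    }).json()
# With formatversion=2 the first (and here only) element of the list is our page
print(response['query']['pages'][0]['extract'])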
# 2. Get the full HTML of the page using the parse endpoint, parse it, and extract the first paragraph
# MediaWiki has a parse endpoint that you can hit with a URL like https://en.wikipedia.org/w/api.php?action=parse&page=Bla_Bla_Bla to get the HTML of a page. You can then parse it with an HTML parser like lxml (install it first with pip install lxml) to extract the first paragraph.
# For example:
import requests
from lxml import html
response = requests.get(
    'https://en.wikipedia.org/w/api.php',
    params={
        'action': 'parse',
        'page': 'Python_(programming_language)',
        'format': 'json',
    }).json()
raw_html = response['parse']['text']['*']
document = html.document_fromstring(raw_html)
first_p = document.xpath('//p')[0]
intro_text = first_p.text_content()
print(intro_text)
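# Depending on the page, the first <p> element in the parsed HTML may be empty (e.g. a
# placeholder paragraph), so a slightly more defensive sketch is to take the first
# paragraph that actually contains text:
paragraphs = document.xpath('//p')
intro_text = next(p.text_content() for p in paragraphs if p.text_content().strip())
print(intro_text)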
# 3. Parse wikitext yourself
# You can use the query API to get the page's wikitext, parse it using mwparserfromhell (install it first using pip install mwparserfromhell), then reduce it down to human-readable text using strip_code. strip_code doesn't work perfectly at the time of writing (as shown clearly in the example below) but will hopefully improve.
import requests
import mwparserfromhell
response = requests.get(
    'https://en.wikipedia.org/w/api.php',
    params={
        'action': 'query',
        'format': 'json',
        'titles': 'Python_(programming_language)',
        'prop': 'revisions',
        'rvprop': 'content',
    }).json()
page = next(iter(response['query']['pages'].values()))
wikicode = page['revisions'][0]['*']
parsed_wikicode = mwparserfromhell.parse(wikicode)
print(parsed_wikicode.strip_code())
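# strip_code renders the whole page; if you only want the lead, mwparserfromhell can split
# the wikitext into sections first. A rough sketch, assuming get_sections(include_lead=True)
# returns the lead section as the first element:
lead_section = parsed_wikicode.get_sections(include_lead=True)[0]
print(lead_section.strip_code())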