todays-mitsui/020.py

## 020.py
# -*- coding: utf-8 -*-

"""
20. JSONデータの読み込み
Wikipedia記事のJSONファイルを読み込み，「イギリス」に関する記事本文を表示せよ．問題21-29では，ここで抽出した記事本文に対して実行せよ．
"""

import codecs
import json


# Stream the dump (one JSON document per line) and print the body of the
# article whose title is "イギリス".  The with-block closes the file handle,
# which the bare codecs.open() loop left open.
with codecs.open("./src/jawiki-country.json", "r", "utf-8") as f:
    for row in f:
        article = json.loads(row)
        if u"イギリス" == article["title"]:
            print(article["text"])

## 021.py
# -*- coding: utf-8 -*-

"""
21. カテゴリ名を含む行を抽出
記事中でカテゴリ名を宣言している行を抽出せよ．
"""

import codecs
import json
import re


def extract_text(title):
    """Return the body text of the article whose title equals *title*.

    Reads ./src/jawiki-country.json one JSON document per line and returns
    the "text" field of the first match, or None when nothing matches.
    """
    # The with-block closes the file even on the early return below.
    with codecs.open("./src/jawiki-country.json", "r", "utf-8") as f:
        for row in f:
            article = json.loads(row)
            if title == article["title"]:
                return article["text"]
    return None


text = extract_text(u"イギリス")
# Echo only the article lines that contain a category declaration.
for line in text.split("\n"):
    if not re.search(r"Category:", line):
        continue
    print(line)
# =>
# [[Category:イギリス|*]]
# [[Category:英連邦王国|*]]
# [[Category:G8加盟国]]
# [[Category:欧州連合加盟国]]
# [[Category:海洋国家]]
# [[Category:君主国]]
# [[Category:島国|くれいとふりてん]]
# [[Category:1801年に設立された州・地域]]

## 022.py
# -*- coding: utf-8 -*-

"""
22. カテゴリ名の抽出
記事のカテゴリ名を（行単位ではなく名前で）抽出せよ．
"""

import codecs
import json
import re


def extract_text(title):
    """Return the body text of the article whose title equals *title*.

    Reads ./src/jawiki-country.json one JSON document per line and returns
    the "text" field of the first match, or None when nothing matches.
    """
    # The with-block closes the file even on the early return below.
    with codecs.open("./src/jawiki-country.json", "r", "utf-8") as f:
        for row in f:
            article = json.loads(row)
            if title == article["title"]:
                return article["text"]
    return None


text = extract_text(u"イギリス")
for line in text.split("\n"):
    # The name runs up to the first "|" (a sort key follows) or "]".
    m = re.search(r"Category:(?P<category>.+?)(\||])", line)
    if m is None:
        continue
    print(m.group("category"))
# =>
# イギリス
# 英連邦王国
# G8加盟国
# 欧州連合加盟国
# 海洋国家
# 君主国
# 島国
# 1801年に設立された州・地域

## 023.py
# -*- coding: utf-8 -*-

"""
23. セクション構造
記事中に含まれるセクション名とそのレベル（例えば"== セクション名 =="なら1）を表示せよ．
"""

import codecs
import json
import re


def extract_text(title):
    """Return the body text of the article whose title equals *title*.

    Reads ./src/jawiki-country.json one JSON document per line and returns
    the "text" field of the first match, or None when nothing matches.
    """
    # The with-block closes the file even on the early return below.
    with codecs.open("./src/jawiki-country.json", "r", "utf-8") as f:
        for row in f:
            article = json.loads(row)
            if title == article["title"]:
                return article["text"]
    return None


text = extract_text(u"イギリス")
for line in text.split("\n"):
    # A heading line is the same run of "=" on both sides of the title.
    m = re.search(r"^(?P<level>=+)(?P<header>.+)\1$", line)
    if m is None:
        continue
    # "==" is level 1, "===" is level 2: one less than the number of "=".
    level = len(m.group("level")) - 1
    header = m.group("header")
    print("{0}: {1}".format(level, header))
# =>
# 1: 国名
# 1: 歴史
# 1: 地理
# 2: 気候
# ...
# 1: 外部リンク

## 024.py
# -*- coding: utf-8 -*-

"""
24. ファイル参照の抽出
記事から参照されているメディアファイルをすべて抜き出せ．
"""

import codecs
import json
import re


def extract_text(title):
    """Return the body text of the article whose title equals *title*.

    Reads ./src/jawiki-country.json one JSON document per line and returns
    the "text" field of the first match, or None when nothing matches.
    """
    # The with-block closes the file even on the early return below.
    with codecs.open("./src/jawiki-country.json", "r", "utf-8") as f:
        for row in f:
            article = json.loads(row)
            if title == article["title"]:
                return article["text"]
    return None


text = extract_text(u"イギリス")
for line in text.split("\n"):
    # Raw string: "\|" in a plain literal is an invalid escape sequence.
    # finditer reports every file reference on the line, not just the first.
    for m in re.finditer(r"ファイル:(?P<filename>[^|]+)\|", line):
        print(m.group("filename"))
# =>
# Royal Coat of Arms of the United Kingdom.svg
# CHANDOS3.jpg
# The Fabs.JPG
# PalaceOfWestminsterAtNight.jpg
# ...
# Wembley Stadium, illuminated.jpg

## 025.py
# -*- coding: utf-8 -*-

"""
25. テンプレートの抽出
記事中に含まれる「基礎情報」テンプレートのフィールド名と値を抽出し，辞書オブジェクトとして格納せよ．
"""

import codecs
import json
import re
from pprint import pprint


def extract_text(title):
    """Return the body text of the article whose title equals *title*.

    Reads ./src/jawiki-country.json one JSON document per line and returns
    the "text" field of the first match, or None when nothing matches.
    """
    # The with-block closes the file even on the early return below.
    with codecs.open("./src/jawiki-country.json", "r", "utf-8") as f:
        for row in f:
            article = json.loads(row)
            if title == article["title"]:
                return article["text"]
    return None

def extract_base_info(text):
    """Parse the 基礎情報 (basic information) template in *text* into a dict.

    Returns a dict mapping each field name to its raw value string, or an
    empty dict when *text* contains no such template.  Raises ValueError
    for a field that has no "=" separator.
    """
    # Raw string: "\|" in a plain literal is an invalid escape sequence.
    m = re.search(r"{{基礎情報[^|]+\|(?P<info_body>.+?)\n}}", text, re.DOTALL)
    if not m:
        return {}

    info_dict = {}
    # Each field starts with "|" at the beginning of a line.
    for item in m.group("info_body").split("\n|"):
        # Split on the first "=" only; a value may itself contain "=".
        key, word = re.split(r"\s*=\s*", item, maxsplit=1)
        info_dict[key] = word

    return info_dict

# Fetch the "イギリス" article body and parse its 基礎情報 template into a dict.
text = extract_text(u"イギリス")
base_info = extract_base_info(text)

# Show the field-name -> raw-value mapping.
pprint(base_info, indent=4)
# =>
# {
#     '公式国名': '{{lang|en|United Kingdom of Great Britain and Northern Ireland}}<ref>英語以外での...コットランド語）</ref>',
#     '国旗画像': 'Flag of the United Kingdom.svg',
#     '日本語国名': 'グレートブリテン及び北アイルランド連合王国',
#     '国章リンク': '（[[イギリスの国章|国章]]）',
#     ...
#     '首都': '[[ロンドン]]'
# }

## 026.py
# -*- coding: utf-8 -*-

"""
26. 強調マークアップの除去
25の処理時に，テンプレートの値からMediaWikiの強調マークアップ（弱い強調，強調，強い強調のすべて）を除去してテキストに変換せよ（参考: マークアップ早見表）．
"""

import codecs
import json
import re
from pprint import pprint


def extract_text(title):
    """Return the body text of the article whose title equals *title*.

    Reads ./src/jawiki-country.json one JSON document per line and returns
    the "text" field of the first match, or None when nothing matches.
    """
    # The with-block closes the file even on the early return below.
    with codecs.open("./src/jawiki-country.json", "r", "utf-8") as f:
        for row in f:
            article = json.loads(row)
            if title == article["title"]:
                return article["text"]
    return None

def extract_base_info(text):
    """Parse the 基礎情報 (basic information) template in *text* into a dict.

    Returns a dict mapping each field name to its raw value string, or an
    empty dict when *text* contains no such template.  Raises ValueError
    for a field that has no "=" separator.
    """
    # Raw string: "\|" in a plain literal is an invalid escape sequence.
    m = re.search(r"{{基礎情報[^|]+\|(?P<info_body>.+?)\n}}", text, re.DOTALL)
    if not m:
        return {}

    info_dict = {}
    # Each field starts with "|" at the beginning of a line.
    for item in m.group("info_body").split("\n|"):
        # Split on the first "=" only; a value may itself contain "=".
        key, word = re.split(r"\s*=\s*", item, maxsplit=1)
        info_dict[key] = word

    return info_dict

def remove_emphasis(text):
    """Strip MediaWiki emphasis markup from *text*.

    Every run of two or more apostrophes is deleted; a single apostrophe
    is left alone.
    """
    emphasis_marker = re.compile(r"'{2,}")
    return emphasis_marker.sub("", text)

text = extract_text(u"イギリス")
base_info = extract_base_info(text)

# Same fields as base_info, with emphasis markup removed from every value.
sanitized_base_info = {k: remove_emphasis(v) for k, v in base_info.items()}

pprint(sanitized_base_info, indent=4)

## 027.py
# -*- coding: utf-8 -*-

"""
27. 内部リンクの除去
26の処理に加えて，テンプレートの値からMediaWikiの内部リンクマークアップを除去し，テキストに変換せよ（参考: マークアップ早見表）．
"""

import codecs
import json
import re
from pprint import pprint


def extract_text(title):
    """Return the body text of the article whose title equals *title*.

    Reads ./src/jawiki-country.json one JSON document per line and returns
    the "text" field of the first match, or None when nothing matches.
    """
    # The with-block closes the file even on the early return below.
    with codecs.open("./src/jawiki-country.json", "r", "utf-8") as f:
        for row in f:
            article = json.loads(row)
            if title == article["title"]:
                return article["text"]
    return None

def extract_base_info(text):
    """Parse the 基礎情報 (basic information) template in *text* into a dict.

    Returns a dict mapping each field name to its raw value string, or an
    empty dict when *text* contains no such template.  Raises ValueError
    for a field that has no whitespace-surrounded "=" separator.
    """
    # Raw string: "\|" in a plain literal is an invalid escape sequence.
    m = re.search(r"{{基礎情報[^|]+\|(?P<info_body>.+?)\n}}", text, re.DOTALL)
    if not m:
        return {}

    info_dict = {}
    # Each field starts with "|" at the beginning of a line.
    for item in m.group("info_body").split("\n|"):
        # Split on the first " = " only; a value may itself contain "=".
        key, word = re.split(r"\s+=\s+", item, maxsplit=1)
        # The original line was missing the closing "]" (a syntax error).
        info_dict[key] = word

    return info_dict

def remove_emphasis(text):
    """Strip MediaWiki emphasis markup from *text*.

    Every run of two or more apostrophes is deleted; a single apostrophe
    is left alone.
    """
    emphasis_marker = re.compile(r"'{2,}")
    return emphasis_marker.sub("", text)

def remove_internal_links(text):
    """Replace each [[...]] internal link in *text* with its display text.

    For a piped link the part after the last "|" is kept; otherwise the
    whole link content is kept.
    """
    def display_text(match):
        return match.group(1).split("|")[-1]

    return re.sub(r"\[\[([^]]+)\]\]", display_text, text)


text = extract_text(u"イギリス")
base_info = extract_base_info(text)

# Emphasis markup is stripped first, then internal links are reduced to
# their display text.
sanitized_base_info = {
    k: remove_internal_links(remove_emphasis(v))
    for k, v in base_info.items()
}

pprint(sanitized_base_info, indent=4)

## 028.py
# -*- coding: utf-8 -*-

"""
28. MediaWikiマークアップの除去
27の処理に加えて，テンプレートの値からMediaWikiマークアップを可能な限り除去し，国の基本情報を整形せよ．
"""

import codecs
import json
import re
from pprint import pprint


def extract_text(title):
    """Return the body text of the article whose title equals *title*.

    Reads ./src/jawiki-country.json one JSON document per line and returns
    the "text" field of the first match, or None when nothing matches.
    """
    # The with-block closes the file even on the early return below.
    with codecs.open("./src/jawiki-country.json", "r", "utf-8") as f:
        for row in f:
            article = json.loads(row)
            if title == article["title"]:
                return article["text"]
    return None

def extract_base_info(text):
    """Parse the 基礎情報 template in *text* into a dict of cleaned values.

    Each value is passed through the remove_* helpers before being stored.
    Returns an empty dict when *text* contains no such template.  Raises
    ValueError for a field that has no whitespace-surrounded "=".
    """
    # Raw string: "\|" in a plain literal is an invalid escape sequence.
    m = re.search(r"{{基礎情報[^|]+\|(?P<info_body>.+?)\n}}", text, re.DOTALL)
    if not m:
        return {}

    info_body = m.group("info_body")

    info_dict = {}

    # Each field starts with "|" at the beginning of a line.
    for item in info_body.split("\n|"):
        key, word = re.split(r"\s+=\s+", item, maxsplit=1)

        # NOTE(review): remove_ordered_list is defined in this file but is
        # not applied here - confirm whether that is intentional.
        word = remove_section_header(word)
        word = remove_emphasis(word)
        word = remove_category_links(word)
        word = remove_internal_links(word)
        word = remove_external_links(word)
        word = remove_template(word)
        word = remove_unordered_list(word)
        word = remove_define_list(word)
        # NOTE(review): "[[...]]" has already been unwrapped above, so the
        # redirect pattern cannot match at this point.
        word = remove_redirect(word)
        word = remove_comment(word)

        info_dict[key] = word

    return info_dict

def remove_section_header(text):
    """Drop the matching "=" runs around a heading, keeping the text between."""
    heading = re.compile(r"(=+)(.+?)\1")
    return heading.sub(lambda found: found.group(2), text)

def remove_emphasis(text):
    """Strip MediaWiki emphasis markup from *text*.

    Every run of two or more apostrophes is deleted; a single apostrophe
    is left alone.
    """
    emphasis_marker = re.compile(r"'{2,}")
    return emphasis_marker.sub("", text)

def remove_category_links(text):
    """Replace each [[Category:...]] link with the bare category name.

    Anything from the first "|" onward (the sort key) is discarded.
    """
    def category_name(match):
        return match.group(1).split("|")[0]

    return re.sub(r"\[\[Category:(.+?)\]\]", category_name, text)

def remove_internal_links(text):
    """Replace each [[...]] internal link in *text* with its display text.

    For a piped link the part after the last "|" is kept; otherwise the
    whole link content is kept.
    """
    def display_text(match):
        return match.group(1).split("|")[-1]

    return re.sub(r"\[\[([^]]+)\]\]", display_text, text)

def remove_external_links(text):
    """Replace each [url label] external link in *text* with its label.

    A link without a label is replaced by the URL itself.
    """
    def label(match):
        # Split once: everything after the first space is the label, so a
        # multi-word label is kept whole instead of only its last word.
        return match.group(1).split(" ", 1)[-1]

    return re.sub(r"\[([^]]+)\]", label, text)

def remove_template(text):
    """Replace each {{...}} template call with its last "|"-separated part."""
    def last_argument(match):
        return match.group(1).split("|")[-1]

    return re.sub(r"\{\{(.+?)\}\}", last_argument, text)

def remove_unordered_list(text):
    """Strip leading "*" bullet markers (and following whitespace) per line."""
    bullet = re.compile(r"^\*+\s*", flags=re.MULTILINE)
    return bullet.sub("", text)

def remove_ordered_list(text):
    """Strip leading "#" numbering markers (and following whitespace) per line."""
    numbering = re.compile(r"^#+\s*", flags=re.MULTILINE)
    return numbering.sub("", text)

def remove_define_list(text):
    """Strip one leading ":" or ";" (and following whitespace) per line."""
    definition_marker = re.compile(r"^(:|;)\s*", flags=re.MULTILINE)
    return definition_marker.sub("", text)

def remove_redirect(text):
    """Replace each "#REDIRECT [[target]]" with the bare target."""
    redirect = re.compile(r"#REDIRECT \[\[(.+?)\]\]")
    return redirect.sub(lambda found: found.group(1), text)

def remove_comment(text):
    """Delete every <!-- ... --> comment from *text*."""
    # DOTALL lets "." cross newlines so multi-line comments are removed too.
    return re.sub(r"<!--.*?-->", "", text, flags=re.DOTALL)


# Fetch the "イギリス" article body and parse its 基礎情報 template; the values
# are cleaned of markup inside extract_base_info.
text = extract_text(u"イギリス")
base_info = extract_base_info(text)

# Show the field-name -> cleaned-value mapping.
pprint(base_info, indent=4)

## 029.py
# -*- coding: utf-8 -*-

"""
29. 国旗画像のURLを取得する
テンプレートの内容を利用し，国旗画像のURLを取得せよ．（ヒント: MediaWiki APIのimageinfoを呼び出して，ファイル参照をURLに変換すればよい）
"""

import codecs
import json
import re
from pprint import pprint


def extract_text(title):
    """Return the body text of the article whose title equals *title*.

    Reads ./src/jawiki-country.json one JSON document per line and returns
    the "text" field of the first match, or None when nothing matches.
    """
    # The with-block closes the file even on the early return below.
    with codecs.open("./src/jawiki-country.json", "r", "utf-8") as f:
        for row in f:
            article = json.loads(row)
            if title == article["title"]:
                return article["text"]
    return None

def extract_base_info(text):
    """Parse the 基礎情報 template in *text* into a dict of cleaned values.

    Each value is passed through the remove_* helpers before being stored.
    Returns an empty dict when *text* contains no such template.  Raises
    ValueError for a field that has no whitespace-surrounded "=".
    """
    # Raw string: "\|" in a plain literal is an invalid escape sequence.
    m = re.search(r"{{基礎情報[^|]+\|(?P<info_body>.+?)\n}}", text, re.DOTALL)
    if not m:
        return {}

    info_body = m.group("info_body")

    info_dict = {}

    # Each field starts with "|" at the beginning of a line.
    for item in info_body.split("\n|"):
        key, word = re.split(r"\s+=\s+", item, maxsplit=1)

        # NOTE(review): remove_ordered_list is defined in this file but is
        # not applied here - confirm whether that is intentional.
        word = remove_section_header(word)
        word = remove_emphasis(word)
        word = remove_category_links(word)
        word = remove_internal_links(word)
        word = remove_external_links(word)
        word = remove_template(word)
        word = remove_unordered_list(word)
        word = remove_define_list(word)
        # NOTE(review): "[[...]]" has already been unwrapped above, so the
        # redirect pattern cannot match at this point.
        word = remove_redirect(word)
        word = remove_comment(word)

        info_dict[key] = word

    return info_dict

def remove_section_header(text):
    """Drop the matching "=" runs around a heading, keeping the text between."""
    heading = re.compile(r"(=+)(.+?)\1")
    return heading.sub(lambda found: found.group(2), text)

def remove_emphasis(text):
    """Strip MediaWiki emphasis markup from *text*.

    Every run of two or more apostrophes is deleted; a single apostrophe
    is left alone.
    """
    emphasis_marker = re.compile(r"'{2,}")
    return emphasis_marker.sub("", text)

def remove_category_links(text):
    """Replace each [[Category:...]] link with the bare category name.

    Anything from the first "|" onward (the sort key) is discarded.
    """
    def category_name(match):
        return match.group(1).split("|")[0]

    return re.sub(r"\[\[Category:(.+?)\]\]", category_name, text)

def remove_internal_links(text):
    """Replace each [[...]] internal link in *text* with its display text.

    For a piped link the part after the last "|" is kept; otherwise the
    whole link content is kept.
    """
    def display_text(match):
        return match.group(1).split("|")[-1]

    return re.sub(r"\[\[([^]]+)\]\]", display_text, text)

def remove_external_links(text):
    """Replace each [url label] external link in *text* with its label.

    A link without a label is replaced by the URL itself.
    """
    def label(match):
        # Split once: everything after the first space is the label, so a
        # multi-word label is kept whole instead of only its last word.
        return match.group(1).split(" ", 1)[-1]

    return re.sub(r"\[([^]]+)\]", label, text)

def remove_template(text):
    """Replace each {{...}} template call with its last "|"-separated part."""
    def last_argument(match):
        return match.group(1).split("|")[-1]

    return re.sub(r"\{\{(.+?)\}\}", last_argument, text)

def remove_unordered_list(text):
    """Strip leading "*" bullet markers (and following whitespace) per line."""
    bullet = re.compile(r"^\*+\s*", flags=re.MULTILINE)
    return bullet.sub("", text)

def remove_ordered_list(text):
    """Strip leading "#" numbering markers (and following whitespace) per line."""
    numbering = re.compile(r"^#+\s*", flags=re.MULTILINE)
    return numbering.sub("", text)

def remove_define_list(text):
    """Strip one leading ":" or ";" (and following whitespace) per line."""
    definition_marker = re.compile(r"^(:|;)\s*", flags=re.MULTILINE)
    return definition_marker.sub("", text)

def remove_redirect(text):
    """Replace each "#REDIRECT [[target]]" with the bare target."""
    redirect = re.compile(r"#REDIRECT \[\[(.+?)\]\]")
    return redirect.sub(lambda found: found.group(1), text)

def remove_comment(text):
    """Delete every <!-- ... --> comment from *text*."""
    # DOTALL lets "." cross newlines so multi-line comments are removed too.
    return re.sub(r"<!--.*?-->", "", text, flags=re.DOTALL)


text = extract_text(u"イギリス")
base_info = extract_base_info(text)


from urllib.parse import urlencode
from urllib import request

# Build a MediaWiki API query asking for the URL of the file named in the
# template's flag-image field.
flag_image_name = base_info["国旗画像"]
query = urlencode({
    "action": "query",
    "titles": "File:{0}".format(flag_image_name),
    "prop": "imageinfo",
    "iiprop": "url",
    "format": "json",
})
url = "https://commons.wikimedia.org/w/api.php?{0}".format(query)

# The timeout keeps the script from blocking forever on a stalled connection.
with request.urlopen(url, timeout=30) as response:
    body = response.read()

data = json.loads(body.decode("utf-8"))

pprint(data, indent=4)
# =>
# {
#     'continue': {'continue': '||', 'iistart': '2007-09-03T09:51:34Z'},
#     'query': {
#         'pages': {
#             '347935': {
#                 'imageinfo': [{
#                     'descriptionshorturl': 'https://commons.wikimedia.org/w/index.php?curid=347935',
#                     'descriptionurl': 'https://commons.wikimedia.org/wiki/File:Flag_of_the_United_Kingdom.svg',
#                     'url': 'https://upload.wikimedia.org/wikipedia/commons/a/ae/Flag_of_the_United_Kingdom.svg'
#                 }],
#                 'imagerepository': 'local',
#                 'ns': 6,
#                 'pageid': 347935,
#                 'title': 'File:Flag of the United Kingdom.svg'
#             }
#         }
#     }
# }

# "pages" is keyed by a page id that is not known in advance, so take the
# first entry.
page = list(data["query"]["pages"].values())[0]
if "imageinfo" not in page:
    # Fail with a message that names the file instead of a bare KeyError.
    raise KeyError("no imageinfo returned for File:{0}".format(flag_image_name))
flag_image_url = page["imageinfo"][0]["url"]

print(flag_image_url)
    # => https://upload.wikimedia.org/wikipedia/commons/a/ae/Flag_of_the_United_Kingdom.svg
	# -*- coding: utf-8 -*-

	"""
	20. JSONデータの読み込み
	Wikipedia記事のJSONファイルを読み込み，「イギリス」に関する記事本文を表示せよ．問題21-29では，ここで抽出した記事本文に対して実行せよ．
	"""

	import codecs
	import json


	# Stream the dump (one JSON document per line) and print the body of the
	# article whose title is "イギリス".  Nesting was lost in this copy; the
	# with-block also closes the file handle.
	with codecs.open("./src/jawiki-country.json", "r", "utf-8") as f:
	    for row in f:
	        article = json.loads(row)
	        if u"イギリス" == article["title"]:
	            print(article["text"])
	# -*- coding: utf-8 -*-

	"""
	21. カテゴリ名を含む行を抽出
	記事中でカテゴリ名を宣言している行を抽出せよ．
	"""

	import codecs
	import json
	import re


	def extract_text(title):
	    """Return the body text of the article whose title equals *title*.

	    Reads ./src/jawiki-country.json one JSON document per line and returns
	    the "text" field of the first match, or None when nothing matches.
	    """
	    # The with-block closes the file even on the early return below.
	    with codecs.open("./src/jawiki-country.json", "r", "utf-8") as f:
	        for row in f:
	            article = json.loads(row)
	            if title == article["title"]:
	                return article["text"]
	    return None


	text = extract_text(u"イギリス")
	# Echo only the article lines that contain a category declaration.
	for line in text.split("\n"):
	    if re.search(r"Category:", line):
	        print(line)
	# =>
	# [[Category:イギリス|*]]
	# [[Category:英連邦王国|*]]
	# [[Category:G8加盟国]]
	# [[Category:欧州連合加盟国]]
	# [[Category:海洋国家]]
	# [[Category:君主国]]
	# [[Category:島国|くれいとふりてん]]
	# [[Category:1801年に設立された州・地域]]
	# -*- coding: utf-8 -*-

	"""
	22. カテゴリ名の抽出
	記事のカテゴリ名を（行単位ではなく名前で）抽出せよ．
	"""

	import codecs
	import json
	import re


	def extract_text(title):
	    """Return the body text of the article whose title equals *title*.

	    Reads ./src/jawiki-country.json one JSON document per line and returns
	    the "text" field of the first match, or None when nothing matches.
	    """
	    # The with-block closes the file even on the early return below.
	    with codecs.open("./src/jawiki-country.json", "r", "utf-8") as f:
	        for row in f:
	            article = json.loads(row)
	            if title == article["title"]:
	                return article["text"]
	    return None


	text = extract_text(u"イギリス")
	for line in text.split("\n"):
	    # The name runs up to the first "|" (a sort key follows) or "]".
	    # The escaped form "(\\|\|])" in this copy could not match "]" alone.
	    m = re.search(r"Category:(?P<category>.+?)(\||])", line)
	    if m:
	        print(m.group("category"))
	# =>
	# イギリス
	# 英連邦王国
	# G8加盟国
	# 欧州連合加盟国
	# 海洋国家
	# 君主国
	# 島国
	# 1801年に設立された州・地域
	# -*- coding: utf-8 -*-

	"""
	23. セクション構造
	記事中に含まれるセクション名とそのレベル（例えば"== セクション名 =="なら1）を表示せよ．
	"""

	import codecs
	import json
	import re


	def extract_text(title):
	    """Return the body text of the article whose title equals *title*.

	    Reads ./src/jawiki-country.json one JSON document per line and returns
	    the "text" field of the first match, or None when nothing matches.
	    """
	    # The with-block closes the file even on the early return below.
	    with codecs.open("./src/jawiki-country.json", "r", "utf-8") as f:
	        for row in f:
	            article = json.loads(row)
	            if title == article["title"]:
	                return article["text"]
	    return None


	text = extract_text(u"イギリス")
	for line in text.split("\n"):
	    # A heading line is the same run of "=" on both sides of the title.
	    m = re.search(r"^(?P<level>=+)(?P<header>.+)\1$", line)
	    if m:
	        header = m.group("header")
	        # "==" is level 1, "===" is level 2: one less than the "=" count.
	        level = m.group("level").count("=") - 1

	        print("{0}: {1}".format(level, header))
	# =>
	# 1: 国名
	# 1: 歴史
	# 1: 地理
	# 2: 気候
	# ...
	# 1: 外部リンク
	# -*- coding: utf-8 -*-

	"""
	24. ファイル参照の抽出
	記事から参照されているメディアファイルをすべて抜き出せ．
	"""

	import codecs
	import json
	import re


	def extract_text(title):
	    """Return the body text of the article whose title equals *title*.

	    Reads ./src/jawiki-country.json one JSON document per line and returns
	    the "text" field of the first match, or None when nothing matches.
	    """
	    # The with-block closes the file even on the early return below.
	    with codecs.open("./src/jawiki-country.json", "r", "utf-8") as f:
	        for row in f:
	            article = json.loads(row)
	            if title == article["title"]:
	                return article["text"]
	    return None


	text = extract_text(u"イギリス")
	for line in text.split("\n"):
	    # Raw string without the stray escapes this copy picked up.
	    # finditer reports every file reference on the line, not just the first.
	    for m in re.finditer(r"ファイル:(?P<filename>[^|]+)\|", line):
	        print(m.group("filename"))
	# =>
	# Royal Coat of Arms of the United Kingdom.svg
	# CHANDOS3.jpg
	# The Fabs.JPG
	# PalaceOfWestminsterAtNight.jpg
	# ...
	# Wembley Stadium, illuminated.jpg
	# -*- coding: utf-8 -*-

	"""
	25. テンプレートの抽出
	記事中に含まれる「基礎情報」テンプレートのフィールド名と値を抽出し，辞書オブジェクトとして格納せよ．
	"""

	import codecs
	import json
	import re
	from pprint import pprint


	def extract_text(title):
	    """Return the body text of the article whose title equals *title*.

	    Reads ./src/jawiki-country.json one JSON document per line and returns
	    the "text" field of the first match, or None when nothing matches.
	    """
	    # The with-block closes the file even on the early return below.
	    with codecs.open("./src/jawiki-country.json", "r", "utf-8") as f:
	        for row in f:
	            article = json.loads(row)
	            if title == article["title"]:
	                return article["text"]
	    return None

	def extract_base_info(text):
	    """Parse the 基礎情報 (basic information) template in *text* into a dict.

	    Returns a dict mapping each field name to its raw value string, or an
	    empty dict when *text* contains no such template.  Raises ValueError
	    for a field that has no "=" separator.
	    """
	    # Raw string without the stray escapes this copy picked up.
	    m = re.search(r"{{基礎情報[^|]+\|(?P<info_body>.+?)\n}}", text, re.DOTALL)
	    if not m:
	        return {}

	    info_dict = {}
	    # Each field starts with "|" at the beginning of a line; splitting on
	    # "\n\|" (newline, backslash, pipe) would never separate them.
	    for item in m.group("info_body").split("\n|"):
	        # Optional whitespace around "=", so "key=value" also splits.
	        key, word = re.split(r"\s*=\s*", item, maxsplit=1)
	        info_dict[key] = word

	    return info_dict

	# Fetch the "イギリス" article body and parse its 基礎情報 template into a dict.
	text = extract_text(u"イギリス")
	base_info = extract_base_info(text)

	# Show the field-name -> raw-value mapping.
	pprint(base_info, indent=4)
	# =>
	# {
	# '公式国名': '{{lang|en|United Kingdom of Great Britain and Northern Ireland}}<ref>英語以外での...コットランド語）</ref>',
	# '国旗画像': 'Flag of the United Kingdom.svg',
	# '日本語国名': 'グレートブリテン及び北アイルランド連合王国',
	# '国章リンク': '（[[イギリスの国章|国章]]）',
	# ...
	# '首都': '[[ロンドン]]'
	# }
	# -*- coding: utf-8 -*-

	"""
	26. 強調マークアップの除去
	25の処理時に，テンプレートの値からMediaWikiの強調マークアップ（弱い強調，強調，強い強調のすべて）を除去してテキストに変換せよ（参考: マークアップ早見表）．
	"""

	import codecs
	import json
	import re
	from pprint import pprint


	def extract_text(title):
	    """Return the body text of the article whose title equals *title*.

	    Reads ./src/jawiki-country.json one JSON document per line and returns
	    the "text" field of the first match, or None when nothing matches.
	    """
	    # The with-block closes the file even on the early return below.
	    with codecs.open("./src/jawiki-country.json", "r", "utf-8") as f:
	        for row in f:
	            article = json.loads(row)
	            if title == article["title"]:
	                return article["text"]
	    return None

	def extract_base_info(text):
	    """Parse the 基礎情報 (basic information) template in *text* into a dict.

	    Returns a dict mapping each field name to its raw value string, or an
	    empty dict when *text* contains no such template.  Raises ValueError
	    for a field that has no "=" separator.
	    """
	    # Raw string without the stray escapes this copy picked up.
	    m = re.search(r"{{基礎情報[^|]+\|(?P<info_body>.+?)\n}}", text, re.DOTALL)
	    if not m:
	        return {}

	    info_dict = {}
	    # Each field starts with "|" at the beginning of a line; splitting on
	    # "\n\|" (newline, backslash, pipe) would never separate them.
	    for item in m.group("info_body").split("\n|"):
	        # Optional whitespace around "=", so "key=value" also splits.
	        key, word = re.split(r"\s*=\s*", item, maxsplit=1)
	        info_dict[key] = word

	    return info_dict

	def remove_emphasis(text):
	    """Strip MediaWiki emphasis markup from *text*.

	    Every run of two or more apostrophes is deleted; a single apostrophe
	    is left alone.
	    """
	    return re.sub(r"'{2,}", "", text)

	text = extract_text(u"イギリス")
	base_info = extract_base_info(text)


	# Same fields as base_info, with emphasis markup removed from every value.
	sanitized_base_info = {}
	for k, v in base_info.items():
	    v = remove_emphasis(v)
	    sanitized_base_info[k] = v


	pprint(sanitized_base_info, indent=4)
	# -*- coding: utf-8 -*-

	"""
	27. 内部リンクの除去
	26の処理に加えて，テンプレートの値からMediaWikiの内部リンクマークアップを除去し，テキストに変換せよ（参考: マークアップ早見表）．
	"""

	import codecs
	import json
	import re
	from pprint import pprint


	def extract_text(title):
	    """Return the body text of the article whose title equals *title*.

	    Reads ./src/jawiki-country.json one JSON document per line and returns
	    the "text" field of the first match, or None when nothing matches.
	    """
	    # The with-block closes the file even on the early return below.
	    with codecs.open("./src/jawiki-country.json", "r", "utf-8") as f:
	        for row in f:
	            article = json.loads(row)
	            if title == article["title"]:
	                return article["text"]
	    return None

	def extract_base_info(text):
	    """Parse the 基礎情報 (basic information) template in *text* into a dict.

	    Returns a dict mapping each field name to its raw value string, or an
	    empty dict when *text* contains no such template.  Raises ValueError
	    for a field that has no whitespace-surrounded "=" separator.
	    """
	    # Raw string without the stray escapes this copy picked up.
	    m = re.search(r"{{基礎情報[^|]+\|(?P<info_body>.+?)\n}}", text, re.DOTALL)
	    if not m:
	        return {}

	    info_dict = {}
	    # Each field starts with "|" at the beginning of a line.
	    for item in m.group("info_body").split("\n|"):
	        key, word = re.split(r"\s+=\s+", item, maxsplit=1)
	        # The original line was missing the closing "]" (a syntax error).
	        info_dict[key] = word

	    return info_dict

	def remove_emphasis(text):
	    """Strip MediaWiki emphasis markup from *text*.

	    Every run of two or more apostrophes is deleted; a single apostrophe
	    is left alone.
	    """
	    return re.sub(r"'{2,}", "", text)

	def remove_internal_links(text):
	    """Replace each [[...]] internal link in *text* with its display text.

	    For a piped link the part after the last "|" is kept; otherwise the
	    whole link content is kept.
	    """
	    # Split on a bare "|"; "\|" (backslash, pipe) never occurs in links.
	    return re.sub(r"\[\[([^]]+)\]\]", lambda m: m.group(1).split("|")[-1], text)


	text = extract_text(u"イギリス")
	base_info = extract_base_info(text)

	# Emphasis markup is stripped first, then internal links are reduced to
	# their display text.
	sanitized_base_info = {}
	for k, v in base_info.items():
	    v = remove_emphasis(v)
	    v = remove_internal_links(v)
	    sanitized_base_info[k] = v


	pprint(sanitized_base_info, indent=4)
	# -*- coding: utf-8 -*-

	"""
	28. MediaWikiマークアップの除去
	27の処理に加えて，テンプレートの値からMediaWikiマークアップを可能な限り除去し，国の基本情報を整形せよ．
	"""

	import codecs
	import json
	import re
	from pprint import pprint


	def extract_text(title):
	    """Return the body text of the article whose title equals *title*.

	    Reads ./src/jawiki-country.json one JSON document per line and returns
	    the "text" field of the first match, or None when nothing matches.
	    """
	    # The with-block closes the file even on the early return below.
	    with codecs.open("./src/jawiki-country.json", "r", "utf-8") as f:
	        for row in f:
	            article = json.loads(row)
	            if title == article["title"]:
	                return article["text"]
	    return None

	def extract_base_info(text):
	    """Parse the 基礎情報 template in *text* into a dict of cleaned values.

	    Each value is passed through the remove_* helpers before being stored.
	    Returns an empty dict when *text* contains no such template.  Raises
	    ValueError for a field that has no whitespace-surrounded "=".
	    """
	    # Raw string without the stray escapes this copy picked up.
	    m = re.search(r"{{基礎情報[^|]+\|(?P<info_body>.+?)\n}}", text, re.DOTALL)
	    if not m:
	        return {}

	    info_body = m.group("info_body")

	    info_dict = {}

	    # Each field starts with "|" at the beginning of a line; splitting on
	    # "\n\|" (newline, backslash, pipe) would never separate them.
	    for item in info_body.split("\n|"):
	        key, word = re.split(r"\s+=\s+", item, maxsplit=1)

	        word = remove_section_header(word)
	        word = remove_emphasis(word)
	        word = remove_category_links(word)
	        word = remove_internal_links(word)
	        word = remove_external_links(word)
	        word = remove_template(word)
	        word = remove_unordered_list(word)
	        word = remove_define_list(word)
	        word = remove_redirect(word)
	        word = remove_comment(word)

	        info_dict[key] = word

	    return info_dict

	def remove_section_header(text):
	    """Drop the matching "=" runs around a heading, keeping the text between."""
	    return re.sub(r"(=+)(.+?)\1", lambda m: m.group(2), text)

	def remove_emphasis(text):
	    """Strip MediaWiki emphasis markup from *text*.

	    Every run of two or more apostrophes is deleted; a single apostrophe
	    is left alone.
	    """
	    return re.sub(r"'{2,}", "", text)

	def remove_category_links(text):
	    """Replace each [[Category:...]] link with the bare category name.

	    Anything from the first "|" onward (the sort key) is discarded.
	    """
	    # Split on a bare "|"; "\|" (backslash, pipe) never occurs in links.
	    return re.sub(r"\[\[Category:(.+?)\]\]", lambda m: m.group(1).split("|")[0], text)

	def remove_internal_links(text):
	    """Replace each [[...]] internal link in *text* with its display text.

	    For a piped link the part after the last "|" is kept; otherwise the
	    whole link content is kept.
	    """
	    # Split on a bare "|"; "\|" (backslash, pipe) never occurs in links.
	    return re.sub(r"\[\[([^]]+)\]\]", lambda m: m.group(1).split("|")[-1], text)

	def remove_external_links(text):
	    """Replace each [url label] external link in *text* with its label.

	    A link without a label is replaced by the URL itself.
	    """
	    # Split once: everything after the first space is the label, so a
	    # multi-word label is kept whole instead of only its last word.
	    return re.sub(r"\[([^]]+)\]", lambda m: m.group(1).split(" ", 1)[-1], text)

	def remove_template(text):
	    """Replace each {{...}} template call with its last "|"-separated part."""
	    # Split on a bare "|"; "\|" (backslash, pipe) never occurs in templates.
	    return re.sub(r"\{\{(.+?)\}\}", lambda m: m.group(1).split("|")[-1], text)

	def remove_unordered_list(text):
	    """Strip leading "*" bullet markers (and following whitespace) per line."""
	    # The pattern in this copy had lost its "*" characters and matched a
	    # literal "+" instead of a run of bullets.
	    return re.sub(r"^\*+\s*", "", text, flags=re.MULTILINE)

	def remove_ordered_list(text):
	    """Strip leading "#" numbering markers (and following whitespace) per line."""
	    return re.sub(r"^#+\s*", "", text, flags=re.MULTILINE)

	def remove_define_list(text):
	    """Strip one leading ":" or ";" (and following whitespace) per line."""
	    # "(:\|;)" in this copy matched the literal text ":|;" instead of
	    # either marker.
	    return re.sub(r"^(:|;)\s*", "", text, flags=re.MULTILINE)

	def remove_redirect(text):
	    """Replace each "#REDIRECT [[target]]" with the bare target."""
	    return re.sub(r"#REDIRECT \[\[(.+?)\]\]", lambda m: m.group(1), text)

	def remove_comment(text):
	    """Delete every <!-- ... --> comment from *text*."""
	    # DOTALL lets "." cross newlines so multi-line comments are removed too.
	    return re.sub(r"<!--.*?-->", "", text, flags=re.DOTALL)


	# Fetch the "イギリス" article body and parse its 基礎情報 template; the
	# values are cleaned of markup inside extract_base_info.
	text = extract_text(u"イギリス")
	base_info = extract_base_info(text)

	# Show the field-name -> cleaned-value mapping.
	pprint(base_info, indent=4)
	# -*- coding: utf-8 -*-

	"""
	29. 国旗画像のURLを取得する
	テンプレートの内容を利用し，国旗画像のURLを取得せよ．（ヒント: MediaWiki APIのimageinfoを呼び出して，ファイル参照をURLに変換すればよい）
	"""

	import codecs
	import json
	import re
	from pprint import pprint


	def extract_text(title):
	    """Return the body text of the article whose title equals *title*.

	    Reads ./src/jawiki-country.json one JSON document per line and returns
	    the "text" field of the first match, or None when nothing matches.
	    """
	    # The with-block closes the file even on the early return below.
	    with codecs.open("./src/jawiki-country.json", "r", "utf-8") as f:
	        for row in f:
	            article = json.loads(row)
	            if title == article["title"]:
	                return article["text"]
	    return None

	def extract_base_info(text):
	    """Parse the 基礎情報 template in *text* into a dict of cleaned values.

	    Each value is passed through the remove_* helpers before being stored.
	    Returns an empty dict when *text* contains no such template.  Raises
	    ValueError for a field that has no whitespace-surrounded "=".
	    """
	    # Raw string without the stray escapes this copy picked up.
	    m = re.search(r"{{基礎情報[^|]+\|(?P<info_body>.+?)\n}}", text, re.DOTALL)
	    if not m:
	        return {}

	    info_body = m.group("info_body")

	    info_dict = {}

	    # Each field starts with "|" at the beginning of a line; splitting on
	    # "\n\|" (newline, backslash, pipe) would never separate them.
	    for item in info_body.split("\n|"):
	        key, word = re.split(r"\s+=\s+", item, maxsplit=1)

	        word = remove_section_header(word)
	        word = remove_emphasis(word)
	        word = remove_category_links(word)
	        word = remove_internal_links(word)
	        word = remove_external_links(word)
	        word = remove_template(word)
	        word = remove_unordered_list(word)
	        word = remove_define_list(word)
	        word = remove_redirect(word)
	        word = remove_comment(word)

	        info_dict[key] = word

	    return info_dict

	def remove_section_header(text):
	    """Drop the matching "=" runs around a heading, keeping the text between."""
	    return re.sub(r"(=+)(.+?)\1", lambda m: m.group(2), text)

	def remove_emphasis(text):
	    """Strip MediaWiki emphasis markup from *text*.

	    Every run of two or more apostrophes is deleted; a single apostrophe
	    is left alone.
	    """
	    return re.sub(r"'{2,}", "", text)

	def remove_category_links(text):
	    """Replace each [[Category:...]] link with the bare category name.

	    Anything from the first "|" onward (the sort key) is discarded.
	    """
	    # Split on a bare "|"; "\|" (backslash, pipe) never occurs in links.
	    return re.sub(r"\[\[Category:(.+?)\]\]", lambda m: m.group(1).split("|")[0], text)

	def remove_internal_links(text):
	    """Replace each [[...]] internal link in *text* with its display text.

	    For a piped link the part after the last "|" is kept; otherwise the
	    whole link content is kept.
	    """
	    # Split on a bare "|"; "\|" (backslash, pipe) never occurs in links.
	    return re.sub(r"\[\[([^]]+)\]\]", lambda m: m.group(1).split("|")[-1], text)

	def remove_external_links(text):
	    """Replace each [url label] external link in *text* with its label.

	    A link without a label is replaced by the URL itself.
	    """
	    # Split once: everything after the first space is the label, so a
	    # multi-word label is kept whole instead of only its last word.
	    return re.sub(r"\[([^]]+)\]", lambda m: m.group(1).split(" ", 1)[-1], text)

	def remove_template(text):
	    """Replace each {{...}} template call with its last "|"-separated part."""
	    # Split on a bare "|"; "\|" (backslash, pipe) never occurs in templates.
	    return re.sub(r"\{\{(.+?)\}\}", lambda m: m.group(1).split("|")[-1], text)

	def remove_unordered_list(text):
	    """Strip leading "*" bullet markers (and following whitespace) per line."""
	    # The pattern in this copy had lost its "*" characters and matched a
	    # literal "+" instead of a run of bullets.
	    return re.sub(r"^\*+\s*", "", text, flags=re.MULTILINE)

	def remove_ordered_list(text):
	    """Strip leading "#" numbering markers (and following whitespace) per line."""
	    return re.sub(r"^#+\s*", "", text, flags=re.MULTILINE)

	def remove_define_list(text):
	    """Strip one leading ":" or ";" (and following whitespace) per line."""
	    # "(:\|;)" in this copy matched the literal text ":|;" instead of
	    # either marker.
	    return re.sub(r"^(:|;)\s*", "", text, flags=re.MULTILINE)

	def remove_redirect(text):
	    """Replace each "#REDIRECT [[target]]" with the bare target."""
	    return re.sub(r"#REDIRECT \[\[(.+?)\]\]", lambda m: m.group(1), text)

	def remove_comment(text):
	    """Delete every <!-- ... --> comment from *text*."""
	    # DOTALL lets "." cross newlines so multi-line comments are removed too.
	    return re.sub(r"<!--.*?-->", "", text, flags=re.DOTALL)


	text = extract_text(u"イギリス")
	base_info = extract_base_info(text)


	from urllib.parse import urlencode
	from urllib import request

	# Build a MediaWiki API query asking for the URL of the file named in the
	# template's flag-image field.
	flag_image_name = base_info["国旗画像"]
	query = urlencode({
	    "action": "query",
	    "titles": "File:{0}".format(flag_image_name),
	    "prop": "imageinfo",
	    "iiprop": "url",
	    "format": "json",
	})
	url = "https://commons.wikimedia.org/w/api.php?{0}".format(query)

	# The timeout keeps the script from blocking forever on a stalled connection.
	with request.urlopen(url, timeout=30) as response:
	    body = response.read()

	data = json.loads(body.decode("utf-8"))

	pprint(data, indent=4)
	# =>
	# {
	#     'continue': {'continue': '||', 'iistart': '2007-09-03T09:51:34Z'},
	#     'query': {
	#         'pages': {
	#             '347935': {
	#                 'imageinfo': [{
	#                     'descriptionshorturl': 'https://commons.wikimedia.org/w/index.php?curid=347935',
	#                     'descriptionurl': 'https://commons.wikimedia.org/wiki/File:Flag_of_the_United_Kingdom.svg',
	#                     'url': 'https://upload.wikimedia.org/wikipedia/commons/a/ae/Flag_of_the_United_Kingdom.svg'
	#                 }],
	#                 'imagerepository': 'local',
	#                 'ns': 6,
	#                 'pageid': 347935,
	#                 'title': 'File:Flag of the United Kingdom.svg'
	#             }
	#         }
	#     }
	# }

	# "pages" is keyed by a page id that is not known in advance, so take the
	# first entry.
	page = list(data["query"]["pages"].values())[0]
	if "imageinfo" not in page:
	    # Fail with a message that names the file instead of a bare KeyError.
	    raise KeyError("no imageinfo returned for File:{0}".format(flag_image_name))
	flag_image_url = page["imageinfo"][0]["url"]

	print(flag_image_url)
	# => https://upload.wikimedia.org/wikipedia/commons/a/ae/Flag_of_the_United_Kingdom.svg