nutszebra/100_questions_NLP_027

## 100_questions_NLP_027
#!/usr/bin/env python
# -*- coding: utf-8 -*-

#Link: http://www.cl.ecei.tohoku.ac.jp/nlp100/

"""
Wikipediaの記事を以下のフォーマットで書き出したファイルjawiki-country.json.gzがある．
link: http://www.cl.ecei.tohoku.ac.jp/nlp100/data/jawiki-country.json.gz
1行に1記事の情報がJSON形式で格納される
各行には記事名が"title"キーに，記事本文が"text"キーの辞書オブジェクトに格納され，そのオブジェクトがJSON形式で書き出される
ファイル全体はgzipで圧縮される
以下の処理を行うプログラムを作成せよ．
27. 内部リンクの除去
26の処理に加えて，テンプレートの値からMediaWikiの内部リンクマークアップを除去し，テキストに変換せよ（参考: マークアップ早見表）．
"""

import subprocess
import requests
import os
import re
import json

def download(url, dir, params=None):
  """Fetch *url* with an HTTP GET and save the response body to the file *dir*.

  Args:
    url: Address to request.
    dir: Path of the output file (a file path, despite the name). It is
      opened in binary write mode, so an existing file is overwritten.
    params: Optional query parameters passed through to ``requests.get``.

  Raises:
    requests.HTTPError: If the server answers with a 4xx/5xx status, so an
      error page is never written to disk in place of the real payload.
  """
  dl = requests.get(url, params=params)
  dl.raise_for_status()
  # ``open`` instead of the Python-2-only ``file`` builtin: works on 2 and 3.
  with open(dir, "wb") as f:
    f.write(dl.content)

def gunzip(path):
  """Decompress the gzip file at *path* by running the external ``gunzip`` tool.

  The exit status of the command is not checked and nothing is returned.
  """
  # An argument list with no shell: the path reaches gunzip as one argument,
  # so spaces or shell metacharacters in it are never interpreted.
  subprocess.call(["gunzip", path])

#download http://www.cl.ecei.tohoku.ac.jp/nlp100/data/jawiki-country.json.gz
# Fetch and unpack the corpus only when the decompressed file is not there yet.
if not os.path.exists("wikipedia.json"):
  download("http://www.cl.ecei.tohoku.ac.jp/nlp100/data/jawiki-country.json.gz", "./wikipedia.json.gz")
  gunzip("./wikipedia.json.gz")

# Every line of the file is one JSON object; key each by its 0-based line number.
with open('wikipedia.json', 'r') as f:
  content = {i: json.loads(line) for i, line in enumerate(f)}

# Search terms; an article is kept when its body contains at least one of them.
keyword = [u"イギリス", u"英国", u"British"]

# NOTE(review): the original condition tested the "text" field twice, joined
# by `or`. The second operand may have been meant to be "title" -- confirm.
# Only "text" is checked here, which is exactly what the original did.
result = {}
for article in content:
  text = content[article]["text"]
  if any(word in text for word in keyword):
    result[article] = content[article]
"""
************************************************************************************
The code up to this point was written for question 20 and is reused here.
Link: https://gist.github.com/nutszebra/69ae8e21d576c03581f1
************************************************************************************
"""

# Matches one internal link: [[target]] or [[target|label]] (the target may
# include a #section). The optional non-capturing group consumes "target|",
# so group 1 is always the text a reader sees. Markup with more than one "|"
# (e.g. [[File:x.jpg|thumb|caption]]) does not match and is left untouched.
internal_link_pattern = re.compile(r'\[\[(?:[^|\]\n]+\|)?([^|\]\n]+)\]\]')

# Article text with internal-link markup replaced by its visible text,
# keyed the same way as `result`.
answer = {}
for key in result:
  answer[key] = internal_link_pattern.sub(r'\1', result[key]['text'])
	#!/usr/bin/env python
	# -- coding: utf-8 --

	#Link: http://www.cl.ecei.tohoku.ac.jp/nlp100/

	"""
	Wikipediaの記事を以下のフォーマットで書き出したファイルjawiki-country.json.gzがある．
	link: http://www.cl.ecei.tohoku.ac.jp/nlp100/data/jawiki-country.json.gz
	1行に1記事の情報がJSON形式で格納される
	各行には記事名が"title"キーに，記事本文が"text"キーの辞書オブジェクトに格納され，そのオブジェクトがJSON形式で書き出される
	ファイル全体はgzipで圧縮される
	以下の処理を行うプログラムを作成せよ．
	27. 内部リンクの除去
	26の処理に加えて，テンプレートの値からMediaWikiの内部リンクマークアップを除去し，テキストに変換せよ（参考: マークアップ早見表）．
	"""

	import subprocess
	import requests
	import os
	import re
	import json

	def download(url, dir, params=None):
		"""Fetch *url* with an HTTP GET and save the response body to the file *dir*.

		Args:
			url: Address to request.
			dir: Path of the output file (a file path, despite the name). It is
				opened in binary write mode, so an existing file is overwritten.
			params: Optional query parameters passed through to ``requests.get``.

		Raises:
			requests.HTTPError: If the server answers with a 4xx/5xx status, so an
				error page is never written to disk in place of the real payload.
		"""
		dl = requests.get(url, params=params)
		dl.raise_for_status()
		# ``open`` instead of the Python-2-only ``file`` builtin: works on 2 and 3.
		with open(dir, "wb") as f:
			f.write(dl.content)

	def gunzip(path):
		"""Decompress the gzip file at *path* by running the external ``gunzip`` tool.

		The exit status of the command is not checked and nothing is returned.
		"""
		# An argument list with no shell: the path reaches gunzip as one argument,
		# so spaces or shell metacharacters in it are never interpreted.
		subprocess.call(["gunzip", path])

	#download http://www.cl.ecei.tohoku.ac.jp/nlp100/data/jawiki-country.json.gz
	# Fetch and unpack the corpus only when the decompressed file is not there yet.
	if not os.path.exists("wikipedia.json"):
		download("http://www.cl.ecei.tohoku.ac.jp/nlp100/data/jawiki-country.json.gz", "./wikipedia.json.gz")
		gunzip("./wikipedia.json.gz")

	# Every line of the file is one JSON object; key each by its 0-based line number.
	with open('wikipedia.json', 'r') as f:
		content = {i: json.loads(line) for i, line in enumerate(f)}

	# Search terms; an article is kept when its body contains at least one of them.
	keyword = [u"イギリス", u"英国", u"British"]

	# NOTE(review): the original condition tested the "text" field twice, joined
	# by `or`. The second operand may have been meant to be "title" -- confirm.
	# Only "text" is checked here, which is exactly what the original did.
	result = {}
	for article in content:
		text = content[article]["text"]
		if any(word in text for word in keyword):
			result[article] = content[article]
	"""
	************************************************************************************
	The code up to this point was written for question 20 and is reused here.
	Link: https://gist.github.com/nutszebra/69ae8e21d576c03581f1
	************************************************************************************
	"""

	# Matches one internal link: [[target]] or [[target|label]] (the target may
	# include a #section). The optional non-capturing group consumes "target|",
	# so group 1 is always the text a reader sees. Markup with more than one "|"
	# (e.g. [[File:x.jpg|thumb|caption]]) does not match and is left untouched.
	internal_link_pattern = re.compile(r'\[\[(?:[^|\]\n]+\|)?([^|\]\n]+)\]\]')

	# Article text with internal-link markup replaced by its visible text,
	# keyed the same way as `result`.
	answer = {}
	for key in result:
		answer[key] = internal_link_pattern.sub(r'\1', result[key]['text'])