# cuevasclemente/parse_wiki_files.py

## parse_wiki_files.py
import argparse
from os import path


def parse_wiki_file(output_location, filename):
    """Write each article found in ``filename`` to its own file.

    The whole input is read into memory and cut on the literal ``</doc>``
    closing tag.  Every resulting chunk is split into lines; one line is
    used as the article title (and therefore as the output file name) and
    the lines from index 3 onward, re-joined with newlines, form the body.

    Args:
        output_location: Directory the per-article files are written into.
            It must already exist; this function does not create it.
        filename: Path of the parsed Wikipedia file to split up.

    Chunks that are too short to contain a title line, or whose title line
    is empty, are skipped because they cannot name an output file.  An
    existing file with the same title is overwritten.
    """
    with open(filename) as f:
        txt = f.read()
    split_by_line = [doc.split("\n") for doc in txt.split("</doc>")]

    for position, lines in enumerate(split_by_line):
        # The first chunk starts directly with the opening tag line, so its
        # title is on line 1.  Every later chunk begins with the remainder
        # of the line that held the previous "</doc>", which pushes the
        # title down to line 2.
        # NOTE(review): assumes the opening tag sits on its own line right
        # before the title line -- confirm against the extractor's output.
        title_index = 1 if position == 0 else 2
        if len(lines) <= title_index or not lines[title_index]:
            continue
        destination = path.join(output_location, lines[title_index])
        with open(destination, "w") as f:
            f.write("\n".join(lines[3:]))

if __name__ == '__main__':
    # Command-line entry point: every positional argument is a parsed
    # Wikipedia file, and each one is split into per-article files inside
    # the directory given by --output_location (current directory if
    # the option is omitted).
    cli = argparse.ArgumentParser(
        description=(
            "Write the articles in a wikipedia "
            "parsed file where all the text is in a "
            "file named after the article title"
        ),
    )
    cli.add_argument(
        "files",
        nargs="+",
        help="The files to run the parsing procedure on",
    )
    cli.add_argument(
        "--output_location",
        default="./",
        help="The directory to output articles to",
    )
    options = cli.parse_args()
    for wiki_file in options.files:
        parse_wiki_file(options.output_location, wiki_file)
	import argparse
	from os import path


	def parse_wiki_file(output_location, filename):
	    """Write each article found in ``filename`` to its own file.

	    The whole input is read into memory and cut on the literal
	    ``</doc>`` closing tag.  Every resulting chunk is split into lines;
	    one line is used as the article title (and therefore as the output
	    file name) and the lines from index 3 onward, re-joined with
	    newlines, form the body.

	    Args:
	        output_location: Directory the per-article files are written
	            into.  It must already exist; it is not created here.
	        filename: Path of the parsed Wikipedia file to split up.

	    Chunks that are too short to contain a title line, or whose title
	    line is empty, are skipped because they cannot name an output file.
	    An existing file with the same title is overwritten.
	    """
	    with open(filename) as f:
	        txt = f.read()
	    split_by_line = [doc.split("\n") for doc in txt.split("</doc>")]

	    for position, lines in enumerate(split_by_line):
	        # The first chunk starts directly with the opening tag line, so
	        # its title is on line 1.  Every later chunk begins with the
	        # remainder of the line that held the previous "</doc>", which
	        # pushes the title down to line 2.
	        # NOTE(review): assumes the opening tag sits on its own line
	        # right before the title line -- confirm against real input.
	        title_index = 1 if position == 0 else 2
	        if len(lines) <= title_index or not lines[title_index]:
	            continue
	        destination = path.join(output_location, lines[title_index])
	        with open(destination, "w") as f:
	            f.write("\n".join(lines[3:]))

	if __name__ == '__main__':
	    # Command-line entry point: every positional argument is a parsed
	    # Wikipedia file; each one is split into per-article files inside
	    # the directory given by --output_location (default "./").
	    parser = argparse.ArgumentParser(
	        description="Write the articles in a wikipedia "
	                    "parsed file where all the text is in a "
	                    "file named after the article title")
	    parser.add_argument("files",
	                        help="The files to run the parsing procedure on",
	                        nargs="+")
	    parser.add_argument("--output_location",
	                        help="The directory to output articles to",
	                        default="./")
	    args = parser.parse_args()
	    for f in args.files:
	        parse_wiki_file(args.output_location, f)