Skip to content

Instantly share code, notes, and snippets.

@bstriner
Created March 8, 2017 03:44
Show Gist options
  • Save bstriner/7062dbefd54bd66955a4aa67f8f0cdc4 to your computer and use it in GitHub Desktop.
Save bstriner/7062dbefd54bd66955a4aa67f8f0cdc4 to your computer and use it in GitHub Desktop.
Read output from wikiextractor
import glob
import io
import json
import os
class WikiDoc(object):
    """A single document record read from one line of extractor output.

    Attributes:
        url: value of the record's "url" field.
        text: value of the record's "text" field.
        id: value of the record's "id" field.
        title: value of the record's "title" field.
    """

    def __init__(self, url, text, id, title):
        """Store the four record fields unchanged on the instance."""
        self.url = url
        self.text = text
        # The parameter name shadows the builtin ``id``; it is kept because
        # callers may pass it by keyword.
        self.id = id
        self.title = title

    def __repr__(self):
        """Return a short debugging representation (the text is omitted)."""
        return "{}(id={!r}, title={!r}, url={!r})".format(
            type(self).__name__, self.id, self.title, self.url
        )
class WikiModel(object):
    """Iterate over documents stored as JSON lines under a data directory.

    The directory is expected to contain subdirectories holding files named
    ``wiki_*``, each with one JSON object per line.
    """

    def __init__(self, data_dir):
        """Remember ``data_dir``; no files are touched until iteration."""
        self.data_dir = data_dir

    def files(self):
        """Return the sorted list of ``wiki_*`` paths one level below data_dir."""
        # glob.glob is called without recursive=True, so "**" matches exactly
        # one directory level here, the same as "*".
        pattern = os.path.join(self.data_dir, "**", "wiki_*")
        # glob returns matches in arbitrary order; sort so that iteration
        # order is reproducible between runs and machines.
        return sorted(glob.glob(pattern))

    def file_docs(self, path):
        """Yield one WikiDoc per non-blank line of the file at ``path``.

        Raises:
            ValueError: a non-blank line is not valid JSON (from json.loads).
            KeyError: a record lacks "url", "text", "id" or "title".
        """
        # Decode explicitly as UTF-8 instead of relying on the platform
        # default encoding; io.open accepts ``encoding`` on Python 2 and 3.
        with io.open(path, encoding="utf-8") as f:
            for line in f:
                # Lines read from a file keep their trailing newline, so a
                # bare truthiness test never filters blank lines; strip first.
                line = line.strip()
                if not line:
                    continue
                doc = json.loads(line)
                yield WikiDoc(doc["url"], doc["text"], doc["id"], doc["title"])

    def docs(self):
        """Yield every document from every file returned by ``files()``."""
        for path in self.files():
            for doc in self.file_docs(path):
                yield doc
@bstriner
Copy link
Author

bstriner commented Mar 8, 2017

Run WikiExtractor in JSON mode (the `--json` option). It creates several directories of files, and each file contains one JSON object per line.

WikiExtractor

The `WikiModel` class iterates through the extracted documents via its `docs()` method.

# Example: print the title and text of every extracted document.
path = "Y:\\wikipedia\\json"
model = WikiModel(path)
for doc in model.docs():
    # Single-argument print(...) behaves the same on Python 2 and Python 3,
    # unlike the Python 2-only print statement.
    print(doc.title)
    print(doc.text)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment