Skip to content

Instantly share code, notes, and snippets.

@luizpvas
Created August 22, 2017 22:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save luizpvas/5e861eab508ee3baf972b09e10fd896e to your computer and use it in GitHub Desktop.
Save luizpvas/5e861eab508ee3baf972b09e10fd896e to your computer and use it in GitHub Desktop.
Consolidate the output from WikiExtractor.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ===========================================
#
# WikiConsolidator reads a directory with .bz2 files generated by WikiExtractor.
# The files are compiled into one .txt file, where each line is a pre-processed
# sentence. The following transformations and filters are applied in the text:
#
# * Commas, dots, parentheses, colons, semicolons and free-standing dashes
# (" - ") are removed; quotes are currently left untouched. Question and
# exclamation marks are kept, but with spaces between them and words. For
# example: "how are you?" becomes "how are you ?"
#
# * Text is converted to lower case. Not sure if this is a good thing, though.
# Let's A/B test in the future!
import os
import bz2
# Input: directory holding the output generated by WikiExtractor.
extract_dir = "/home/luiz/Documents/extracted"
# Output: text file that receives the consolidated sentences.
consolidated_file = "./ptbrwiki_consolidated.txt"

# Entries of the input directory, listed once when the script starts.
bz2_dirs = os.listdir(extract_dir)

# Mutable module-level state: the number of documents processed so far and
# the text waiting to be appended to the output file.
total_docs = 0
consolidated_doc = ""
def consolidate_bz2_dir(dir):
    """Consolidate every file found directly inside ``dir``.

    Each entry is opened as a bz2-compressed file and scanned line by line.
    A line starting with ``<doc`` begins a new document, a line containing
    ``</doc>`` hands the text accumulated so far to ``consolidate_document``,
    and every other line is appended to the current document.  After each
    file the pending output is flushed with ``write_to_consolidated_file``
    and a progress line is printed.

    Args:
        dir: Path of the directory whose entries should be consolidated.
    """
    print("Consolidating directory: [{}]".format(dir))
    # os.listdir() returns entries in arbitrary order; sorting them makes the
    # consolidated output reproducible between runs.
    filenames = sorted(os.listdir(dir))
    print("There are {} files in the directory".format(len(filenames)))
    for filename in filenames:
        print("Consolidating file {}".format(filename))
        doc_lines = []
        # The context manager guarantees the compressed file gets closed, and
        # iterating over it streams the lines instead of loading them all.
        with bz2.BZ2File(os.path.join(dir, filename)) as bz_file:
            for line in bz_file:
                # BZ2File yields byte strings, so the markers are bytes too.
                if line.startswith(b'<doc'):
                    doc_lines = []
                elif b'</doc>' in line:
                    # The document is passed on undecoded, exactly as read.
                    consolidate_document(b"".join(doc_lines))
                else:
                    doc_lines.append(line)
        write_to_consolidated_file()
        print("---> {} consolidated documents so far".format(total_docs))
def consolidate_document(doc):
    """Pre-process one document and queue its sentences for writing.

    The document is split on newlines and every resulting line is treated as
    one sentence.  Each sentence is lower-cased, stripped of unwanted
    punctuation and, if it contains at least 10 whitespace-separated tokens,
    appended (followed by a newline) to the module-level ``consolidated_doc``
    buffer.  ``total_docs`` is incremented once per call.

    Args:
        doc: The document text, either as UTF-8 encoded bytes or as
            already-decoded text.

    Raises:
        UnicodeDecodeError: If ``doc`` is bytes that are not valid UTF-8.
    """
    global consolidated_doc
    global total_docs
    total_docs += 1

    # Decode once at the boundary so the rest of the function only deals with
    # text; already-decoded input is accepted unchanged.
    if isinstance(doc, bytes):
        doc = doc.decode('utf-8')

    # Characters that are deleted outright, removed in a single pass.
    removed = dict.fromkeys(map(ord, ",.():;"))

    kept = []
    for sentence in doc.split('\n'):
        # The first step is to convert the content to lower case and drop the
        # unwanted characters.
        sentence = sentence.lower().translate(removed)
        # A free-standing dash sits between two words: replace it with one
        # space, otherwise "a - b" would collapse into the single token "ab".
        sentence = sentence.replace(" - ", " ")
        # Then we put a space between punctuation to separate it from the
        # words. We don't want "there?" and "there" to be two separated
        # entities.
        sentence = sentence.replace("?", " ? ").replace("!", " ! ")
        # Finally, only sentences with at least 10 tokens are kept.
        if len(sentence.split()) < 10:
            continue
        kept.append(sentence + "\n")

    consolidated_doc += "".join(kept)
def write_to_consolidated_file():
    """Append the pending ``consolidated_doc`` buffer to the output file.

    The buffer is encoded as UTF-8, appended to the file named by
    ``consolidated_file`` and then reset to an empty string.
    """
    global consolidated_doc
    # Binary append mode: the buffer is encoded explicitly below, and a
    # text-mode file object rejects bytes on Python 3.
    with open(consolidated_file, "ab") as f:
        f.write(consolidated_doc.encode('utf-8'))
    consolidated_doc = ""
# Process the entries of the extraction directory in sorted order so the
# output file is reproducible between runs. Entries that are not directories
# are skipped, because consolidate_bz2_dir() lists the contents of whatever
# path it receives and would fail on a plain file.
for bz2_dir in sorted(bz2_dirs):
    bz2_path = os.path.join(extract_dir, bz2_dir)
    if os.path.isdir(bz2_path):
        consolidate_bz2_dir(bz2_path)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment