luizpvas/WikiConsolidator.py

## WikiConsolidator.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ===========================================
#
# WikiConsolidator reads a directory with .bz2 files generated by WikiExtractor.
# The files are compiled into one .txt file, where each line is a pre-processed
# sentence. The following transformations and filters are applied in the text:
#
# * Commas, dots, colons, semicolons and parentheses are removed, as are
#   dashes surrounded by spaces (quotes are currently kept). Question and
#   exclamation marks are kept, but with spaces between them and words.
#   For example: "how are you?" becomes "how are you ?"
#
# * Text is converted to lower case. Not sure if this is a good thing, though.
#   Let's A/B test in the future!

import os
import bz2

# Module-level configuration and state shared by the functions below.

# Root directory whose entries are the directories to consolidate.
extract_dir = "/home/luiz/Documents/extracted"
# Names of the entries inside extract_dir. NOTE: listed when the module is
# loaded, so extract_dir must exist at that point.
bz2_dirs = os.listdir(extract_dir)
# Output file; processed sentences are appended to it.
consolidated_file = "./ptbrwiki_consolidated.txt"
# Buffer of processed sentences that have not been written to disk yet.
consolidated_doc = ""
# Number of documents handed to consolidate_document() so far.
total_docs = 0

# Consolidates every file found directly inside 'dir'.
#
# Each file is opened as a bz2 archive. Inside it, a document starts on a
# line beginning with '<doc' and ends on a line containing '</doc>'; the
# lines in between are handed to consolidate_document() as one byte string.
# After each file the buffered sentences are flushed to disk with
# write_to_consolidated_file() and a progress line is printed.
#
# dir: path of the directory whose files are read.
def consolidate_bz2_dir(dir):
  print("Consolidating directory: [{}]".format(dir))
  filenames = os.listdir(dir)
  print("There are {} files in the directory".format(len(filenames)))
  for filename in filenames:
    print("Consolidating file {}".format(filename))
    doc_lines = []
    # The 'with' block closes the archive even if processing fails, and
    # iterating the file streams it instead of loading it whole.
    with bz2.BZ2File(os.path.join(dir, filename)) as bz_file:
      for line in bz_file:
        # BZ2File yields byte strings, hence the b'' markers.
        if line.startswith(b'<doc'):
          doc_lines = []
        elif b'</doc>' in line:
          consolidate_document(b''.join(doc_lines))
        else:
          doc_lines.append(line)
    write_to_consolidated_file()
    # Function-call form so the line parses on both Python 2 and Python 3.
    print("---> {} consolidated documents so far".format(total_docs))

# Pre-processes one extracted document and appends its usable sentences to
# the 'consolidated_doc' buffer. Also counts the document in 'total_docs'.
#
# doc: the document body, one candidate sentence per line. May be UTF-8
#      encoded bytes (as read from the bz2 archives) or already-decoded text.
#
# The text is lower-cased, stripped of , . ( ) : ; characters, has " - "
# collapsed to a single space, and gets ? and ! separated from neighbouring
# words. Lines with fewer than 10 whitespace-separated tokens are dropped;
# the remaining ones are emitted with single spaces between tokens.
def consolidate_document(doc):
  global consolidated_doc
  global total_docs

  total_docs += 1

  # Decode once at the boundary; text that is already decoded is used as is.
  text = doc.decode('utf-8') if isinstance(doc, bytes) else doc
  text = text.lower()

  # None of the substitutions below touch newlines, so they can be applied
  # to the whole document before it is split into lines.
  for unwanted in u",.():;":
    text = text.replace(unwanted, u"")
  # A dash between words becomes a space so the two words are not glued
  # together ("foo - bar" -> "foo bar", not "foobar").
  text = text.replace(u" - ", u" ")

  # Put a space between punctuation and words: we don't want "there?" and
  # "there" to be two separate entities.
  text = text.replace(u"?", u" ? ").replace(u"!", u" ! ")

  kept = []
  for sentence in text.split(u"\n"):
    words = sentence.split()
    # Only sentences with at least 10 tokens are kept.
    if len(words) < 10:
      continue
    # Re-joining the tokens drops the doubled/trailing spaces introduced by
    # the substitutions above.
    kept.append(u" ".join(words) + u"\n")

  consolidated_doc += u"".join(kept)

# Appends the contents of the 'consolidated_doc' buffer to the consolidated
# file, encoded as UTF-8, and then empties the buffer.
def write_to_consolidated_file():
  global consolidated_doc
  # The payload is already-encoded bytes, so the file is opened in binary
  # append mode: a text-mode handle rejects bytes on Python 3 and would
  # rewrite newlines on platforms that translate them.
  with open(consolidated_file, "ab") as f:
    f.write(consolidated_doc.encode('utf-8'))
  consolidated_doc = ""

# Script entry point: consolidate each entry of extract_dir in turn. This
# runs as soon as the module is loaded.
for bz2_dir in bz2_dirs:
  consolidate_bz2_dir(os.path.join(extract_dir, bz2_dir))
	#!/usr/bin/env python
	# -- coding: utf-8 --
	# ===========================================
	#
	# WikiConsolidator reads a directory with .bz2 files generated by WikiExtractor.
	# The files are compiled into one .txt file, where each line is a pre-processed
	# sentence. The following transformations and filters are applied in the text:
	#
	# * Commas, dots, colons, semicolons and parentheses are removed, as are
	# dashes surrounded by spaces (quotes are currently kept). Question and
	# exclamation marks are kept, but with spaces between them and words.
	# For example: "how are you?" becomes "how are you ?"
	#
	# * Text is converted to lower case. Not sure if this is a good thing, though.
	# Let's A/B test in the future!

	import os
	import bz2

	# Module-level configuration and state shared by the functions below.

	# Root directory whose entries are the directories to consolidate.
	extract_dir = "/home/luiz/Documents/extracted"
	# Names of the entries inside extract_dir. NOTE: listed when the module is
	# loaded, so extract_dir must exist at that point.
	bz2_dirs = os.listdir(extract_dir)
	# Output file; processed sentences are appended to it.
	consolidated_file = "./ptbrwiki_consolidated.txt"
	# Buffer of processed sentences that have not been written to disk yet.
	consolidated_doc = ""
	# Number of documents handed to consolidate_document() so far.
	total_docs = 0

	# Consolidates every file found directly inside 'dir'.
	#
	# Each file is opened as a bz2 archive. Inside it, a document starts on a
	# line beginning with '<doc' and ends on a line containing '</doc>'; the
	# lines in between are handed to consolidate_document() as one byte string.
	# After each file the buffered sentences are flushed to disk with
	# write_to_consolidated_file() and a progress line is printed.
	#
	# dir: path of the directory whose files are read.
	def consolidate_bz2_dir(dir):
	  print("Consolidating directory: [{}]".format(dir))
	  filenames = os.listdir(dir)
	  print("There are {} files in the directory".format(len(filenames)))
	  for filename in filenames:
	    print("Consolidating file {}".format(filename))
	    doc_lines = []
	    # The 'with' block closes the archive even if processing fails, and
	    # iterating the file streams it instead of loading it whole.
	    with bz2.BZ2File(os.path.join(dir, filename)) as bz_file:
	      for line in bz_file:
	        # BZ2File yields byte strings, hence the b'' markers.
	        if line.startswith(b'<doc'):
	          doc_lines = []
	        elif b'</doc>' in line:
	          consolidate_document(b''.join(doc_lines))
	        else:
	          doc_lines.append(line)
	    write_to_consolidated_file()
	    # Function-call form so the line parses on both Python 2 and Python 3.
	    print("---> {} consolidated documents so far".format(total_docs))

	# Pre-processes one extracted document and appends its usable sentences to
	# the 'consolidated_doc' buffer. Also counts the document in 'total_docs'.
	#
	# doc: the document body, one candidate sentence per line. May be UTF-8
	#      encoded bytes (as read from the bz2 archives) or already-decoded text.
	#
	# The text is lower-cased, stripped of , . ( ) : ; characters, has " - "
	# collapsed to a single space, and gets ? and ! separated from neighbouring
	# words. Lines with fewer than 10 whitespace-separated tokens are dropped;
	# the remaining ones are emitted with single spaces between tokens.
	def consolidate_document(doc):
	  global consolidated_doc
	  global total_docs

	  total_docs += 1

	  # Decode once at the boundary; text that is already decoded is used as is.
	  text = doc.decode('utf-8') if isinstance(doc, bytes) else doc
	  text = text.lower()

	  # None of the substitutions below touch newlines, so they can be applied
	  # to the whole document before it is split into lines.
	  for unwanted in u",.():;":
	    text = text.replace(unwanted, u"")
	  # A dash between words becomes a space so the two words are not glued
	  # together ("foo - bar" -> "foo bar", not "foobar").
	  text = text.replace(u" - ", u" ")

	  # Put a space between punctuation and words: we don't want "there?" and
	  # "there" to be two separate entities.
	  text = text.replace(u"?", u" ? ").replace(u"!", u" ! ")

	  kept = []
	  for sentence in text.split(u"\n"):
	    words = sentence.split()
	    # Only sentences with at least 10 tokens are kept.
	    if len(words) < 10:
	      continue
	    # Re-joining the tokens drops the doubled/trailing spaces introduced by
	    # the substitutions above.
	    kept.append(u" ".join(words) + u"\n")

	  consolidated_doc += u"".join(kept)

	# Appends the contents of the 'consolidated_doc' buffer to the consolidated
	# file, encoded as UTF-8, and then empties the buffer.
	def write_to_consolidated_file():
	  global consolidated_doc
	  # The payload is already-encoded bytes, so the file is opened in binary
	  # append mode: a text-mode handle rejects bytes on Python 3 and would
	  # rewrite newlines on platforms that translate them.
	  with open(consolidated_file, "ab") as f:
	    f.write(consolidated_doc.encode('utf-8'))
	  consolidated_doc = ""

	# Script entry point: consolidate each entry of extract_dir in turn. The
	# call is indented so that it forms the body of the loop.
	for bz2_dir in bz2_dirs:
	  consolidate_bz2_dir(os.path.join(extract_dir, bz2_dir))