# -*- coding: utf-8 -*-
import codecs
import cPickle as pickle
import os
import re
import string

# Punctuation to strip: ASCII punctuation plus curly quotes and ellipsis.
PUNCTUATIONS = string.punctuation + u'“' + u'”' + u'…'

PATTERN_D_REPLACE = re.compile(r'Đ', flags=re.UNICODE)
PATTERN_NUMBERS = re.compile(r'\d+', flags=re.UNICODE)
PATTERN_PUNCTUATIONS = re.compile('[%s]' % re.escape(PUNCTUATIONS),
                                  flags=re.UNICODE)
PATTERN_SPLIT_SENTENCES = re.compile(r'[.!?]', flags=re.UNICODE)

"""
Preprocessing pipeline for text:
- Lowercase
- Split paragraph -> sentences
- Strip punctuation from each sentence
- Split sentence -> tokens
- Drop sentences with no tokens left
"""


def strip_numbers(text):
    """Remove numbers from text. This might not be necessary."""
    return PATTERN_NUMBERS.sub('', text)


def split_into_sentences(text):
    """Split a paragraph into sentences on '.', '!' and '?'."""
    return PATTERN_SPLIT_SENTENCES.split(text)


def strip_punctuations(text):
    """Replace every punctuation character in a string with a space."""
    return PATTERN_PUNCTUATIONS.sub(' ', text)


def produce_tokenized_sentences(text):
    """Turn a block of text into a list of token lists, one per sentence."""
    # Convert text to lowercase
    text = text.lower()
    # Remove numbers
    text = strip_numbers(text)
    # Convert any remaining 'Đ' into 'đ'
    text = PATTERN_D_REPLACE.sub(u'đ', text)
    sentences = split_into_sentences(text)
    print 'num_sentences: %d' % len(sentences)
    sentence_vectors = []
    for s in sentences:
        vector = strip_punctuations(s).split()
        if len(vector) > 0:
            sentence_vectors.append(vector)
    # stats(sentences)
    return sentence_vectors


def produce_tokenized_sentences_from_file(filepath):
    """Read a UTF-8 text file and tokenize its contents."""
    with codecs.open(filepath, 'r', encoding='utf8') as f:
        text = f.read()
    return produce_tokenized_sentences(text)


def stats(word2vec_sentences):
    """Print the statistics of a corpus in word2vec format."""
    print 'num_sentences: %12d' % len(word2vec_sentences)
    print 'num_words:     %12d' % sum(len(s) for s in word2vec_sentences)


if __name__ == '__main__':
    CORPUS_PATH = os.path.join('corpus', 'isach', 'orig')
    WORKDIR_PATH = os.path.join('corpus', 'isach', 'workdir')

    direntries = os.listdir(CORPUS_PATH)
    txt_filenames = [f for f in direntries
                     if f.endswith('.txt')
                     and os.path.isfile(os.path.join(CORPUS_PATH, f))]

    for txt_filename in txt_filenames:
        filepath = os.path.join(CORPUS_PATH, txt_filename)
        sentences = produce_tokenized_sentences_from_file(filepath)
        # Pickle each book's sentences under its source name: foo.txt -> foo.pkl
        outputpath = os.path.join(WORKDIR_PATH, txt_filename[:-4] + '.pkl')
        # Protocol 0 (the Python 2 default) is ASCII, so text mode is fine.
        with open(outputpath, 'w') as pkl_file:
            pickle.dump(sentences, pkl_file)
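
The pickled token lists are one step short of training; the gist itself stops at pickling. Below is a minimal sketch of how the workdir output might be consumed, assuming gensim 3.x under Python 2. The gist never names a trainer, so gensim, the hyperparameter values, and the output filename 'isach.w2v' are all assumptions here:

# -*- coding: utf-8 -*-
# Sketch only: gensim is an assumption, the gist never names a trainer.
# The `size` keyword is the gensim 3.x spelling (gensim 4.x renamed it
# to `vector_size`); 'isach.w2v' is a hypothetical output name.
import cPickle as pickle
import os

from gensim.models import Word2Vec

WORKDIR_PATH = os.path.join('corpus', 'isach', 'workdir')

# Gather every pickled book back into one corpus of token lists.
sentences = []
for filename in os.listdir(WORKDIR_PATH):
    if filename.endswith('.pkl'):
        with open(os.path.join(WORKDIR_PATH, filename)) as pkl_file:
            sentences.extend(pickle.load(pkl_file))

print 'num_sentences: %12d' % len(sentences)
print 'num_words:     %12d' % sum(len(s) for s in sentences)

# Word2Vec accepts exactly this shape: a list of lists of unicode tokens.
model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4)
model.save(os.path.join(WORKDIR_PATH, 'isach.w2v'))

gensim also accepts any iterable of sentences, so for a corpus too large for memory the pickles could be streamed one file at a time instead of concatenated as above.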