# -*- coding: utf-8 -*-
import codecs
import cPickle as pickle
import os
import re
import string

# Punctuation to strip: ASCII punctuation plus curly quotes and ellipsis.
PUNCTUATIONS = string.punctuation + u'“' + u'”' + u'…'

PATTERN_D_REPLACE = re.compile(r'Đ', flags=re.UNICODE)
PATTERN_NUMBERS = re.compile(r'\d+', flags=re.UNICODE)
PATTERN_PUNCTUATIONS = re.compile('[%s]' % re.escape(PUNCTUATIONS),
                                  flags=re.UNICODE)
PATTERN_SPLIT_SENTENCES = re.compile(r'[.!?]', flags=re.UNICODE)

"""
Preprocessing pipeline for text:
- Lowercase
- Split paragraph -> sentences
- Strip punctuation from each sentence
- Split sentence -> tokens
- Drop sentences with no tokens left
"""


def strip_numbers(text):
    """Remove numbers from text. This might not be necessary."""
    return PATTERN_NUMBERS.sub('', text)


def split_into_sentences(text):
    """Split a paragraph into sentences on '.', '!' and '?'."""
    return PATTERN_SPLIT_SENTENCES.split(text)


def strip_punctuations(text):
    """Replace every punctuation character in a string with a space."""
    return PATTERN_PUNCTUATIONS.sub(' ', text)


def produce_tokenized_sentences(text):
    """Turn a block of text into a list of token lists, one per sentence."""
    # Convert text to lowercase
    text = text.lower()
    # Remove numbers
    text = strip_numbers(text)
    # Convert any remaining 'Đ' into 'đ'
    text = PATTERN_D_REPLACE.sub(u'đ', text)
    sentences = split_into_sentences(text)
    print 'num_sentences: %d' % len(sentences)
    sentence_vectors = []
    for s in sentences:
        vector = strip_punctuations(s).split()
        if len(vector) > 0:
            sentence_vectors.append(vector)
    # stats(sentences)
    return sentence_vectors


def produce_tokenized_sentences_from_file(filepath):
    """Read a UTF-8 text file and tokenize its contents."""
    with codecs.open(filepath, 'r', encoding='utf8') as f:
        text = f.read()
    return produce_tokenized_sentences(text)


def stats(word2vec_sentences):
    """Print the statistics of a corpus in word2vec format."""
    print 'num_sentences: %12d' % len(word2vec_sentences)
    print 'num_words:     %12d' % sum(len(s) for s in word2vec_sentences)


if __name__ == '__main__':
    CORPUS_PATH = os.path.join('corpus', 'isach', 'orig')
    WORKDIR_PATH = os.path.join('corpus', 'isach', 'workdir')

    direntries = os.listdir(CORPUS_PATH)
    txt_filenames = [f for f in direntries
                     if f.endswith('.txt')
                     and os.path.isfile(os.path.join(CORPUS_PATH, f))]

    for txt_filename in txt_filenames:
        filepath = os.path.join(CORPUS_PATH, txt_filename)
        sentences = produce_tokenized_sentences_from_file(filepath)
        # Pickle each book's sentences under its source name: foo.txt -> foo.pkl
        outputpath = os.path.join(WORKDIR_PATH, txt_filename[:-4] + '.pkl')
        # Protocol 0 (the Python 2 default) is ASCII, so text mode is fine.
        with open(outputpath, 'w') as pkl_file:
            pickle.dump(sentences, pkl_file)
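
The pickled token lists are one step short of training; the gist itself stops at pickling. Below is a minimal sketch of how the workdir output might be consumed, assuming gensim 3.x under Python 2. The gist never names a trainer, so gensim, the hyperparameter values, and the output filename 'isach.w2v' are all assumptions here:

# -*- coding: utf-8 -*-
# Sketch only: gensim is an assumption, the gist never names a trainer.
# The `size` keyword is the gensim 3.x spelling (gensim 4.x renamed it
# to `vector_size`); 'isach.w2v' is a hypothetical output name.
import cPickle as pickle
import os

from gensim.models import Word2Vec

WORKDIR_PATH = os.path.join('corpus', 'isach', 'workdir')

# Gather every pickled book back into one corpus of token lists.
sentences = []
for filename in os.listdir(WORKDIR_PATH):
    if filename.endswith('.pkl'):
        with open(os.path.join(WORKDIR_PATH, filename)) as pkl_file:
            sentences.extend(pickle.load(pkl_file))

print 'num_sentences: %12d' % len(sentences)
print 'num_words:     %12d' % sum(len(s) for s in sentences)

# Word2Vec accepts exactly this shape: a list of lists of unicode tokens.
model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4)
model.save(os.path.join(WORKDIR_PATH, 'isach.w2v'))

gensim also accepts any iterable of sentences, so for a corpus too large for memory the pickles could be streamed one file at a time instead of concatenated as above.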