@phamtm
Created October 19, 2015
# -*- coding: utf-8 -*-
import codecs
import cPickle as pickle
import os
import re
import string
# Standard punctuation plus the curly quotes and ellipsis common in the corpus.
PUNCTUATIONS = string.punctuation + u'“' + u'”' + u'…'
# Note: the pattern must be a unicode literal; a byte-string r'Đ' compiles to
# the UTF-8 byte pair and would never match the decoded text.
PATTERN_D_REPLACE = re.compile(u'Đ', flags=re.UNICODE)
PATTERN_NUMBERS = re.compile(r'\d+', flags=re.UNICODE)
PATTERN_PUNCTUATIONS = re.compile(u'[%s]' % re.escape(PUNCTUATIONS),
                                  flags=re.UNICODE)
PATTERN_SPLIT_SENTENCES = re.compile(r'[\.!?]', flags=re.UNICODE)
"""
Preprocessing pipeline for text
- Lowercase
- Split paragraph -> sentences
- Strip punctuation from sentence
- Split sentence -> tokens
- Remove invalid tokens
"""
def strip_numbers(text):
    """Remove digit runs from text.

    This step might not be necessary for every corpus.
    """
    return PATTERN_NUMBERS.sub('', text)
def split_into_sentences(text):
    """Split paragraph into sentences"""
    return PATTERN_SPLIT_SENTENCES.split(text)
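# Note (illustrative; not in the original gist): splitting on [.!?] keeps
# whatever follows the last delimiter, so a trailing delimiter yields an
# empty string, e.g.
#   split_into_sentences(u'Xin chào. Tạm biệt!')
#   -> [u'Xin chào', u' Tạm biệt', u'']
# The empty pieces are filtered out later in produce_tokenized_sentences.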
def strip_punctuations(text):
    """Replace every punctuation character in a string with a space."""
    return PATTERN_PUNCTUATIONS.sub(' ', text)
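# Example (illustrative): punctuation turns into spaces rather than being
# deleted, so adjacent words stay separated and str.split() later collapses
# the extra whitespace:
#   strip_punctuations(u'“xin chào”, anh nói.') -> u' xin chào   anh nói '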
def produce_tokenized_sentences(text):
    """Lowercase, clean and tokenize text into per-sentence token lists."""
    # Convert text to lowercase
    text = text.lower()
    # Remove numbers
    text = strip_numbers(text)
    # Convert 'Đ' into 'đ'
    text = PATTERN_D_REPLACE.sub(u'đ', text)
    sentences = split_into_sentences(text)
    print 'num_sentences: %d' % len(sentences)
    sentence_vectors = []
    for s in sentences:
        vector = strip_punctuations(s).split()
        if len(vector) > 0:
            sentence_vectors.append(vector)
    # stats(sentence_vectors)
    return sentence_vectors
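# Example (illustrative; not in the original gist) of the pipeline output for
# a small Vietnamese snippet -- digits are dropped, punctuation becomes token
# boundaries, and the trailing empty split is filtered out:
#   produce_tokenized_sentences(u'Anh có 2 con mèo. Chúng rất dễ thương!')
#   -> [[u'anh', u'có', u'con', u'mèo'], [u'chúng', u'rất', u'dễ', u'thương']]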
def produce_tokenized_sentences_from_file(filepath):
    """Read a UTF-8 text file and tokenize its contents."""
    with codecs.open(filepath, 'r', encoding='utf8') as f:
        text = f.read()
    return produce_tokenized_sentences(text)
def stats(word2vec_sentences):
    """Print the statistics of a corpus in word2vec format."""
    print 'num_sentences: %12d' % len(word2vec_sentences)
    print 'num_words:     %12d' % sum(len(s) for s in word2vec_sentences)
if __name__ == '__main__':
    CORPUS_PATH = os.path.join('corpus', 'isach', 'orig')
    WORKDIR_PATH = os.path.join('corpus', 'isach', 'workdir')
    if not os.path.exists(WORKDIR_PATH):
        os.makedirs(WORKDIR_PATH)
    direntries = os.listdir(CORPUS_PATH)
    txt_filenames = [f for f in direntries
                     if f.endswith('.txt')
                     and os.path.isfile(os.path.join(CORPUS_PATH, f))]
    for txt_filename in txt_filenames:
        filepath = os.path.join(CORPUS_PATH, txt_filename)
        sentences = produce_tokenized_sentences_from_file(filepath)
        outputpath = os.path.join(WORKDIR_PATH, txt_filename[:-4] + '.pkl')
        # Pickle needs a binary-mode file handle.
        with open(outputpath, 'wb') as pkl_file:
            pickle.dump(sentences, pkl_file)
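    # Sketch (added; not part of the original gist): read every pickle back
    # and print combined corpus statistics via stats() above. Each .pkl file
    # holds a list of token lists, one list per sentence.
    all_sentences = []
    for pkl_filename in os.listdir(WORKDIR_PATH):
        if pkl_filename.endswith('.pkl'):
            with open(os.path.join(WORKDIR_PATH, pkl_filename), 'rb') as f:
                all_sentences.extend(pickle.load(f))
    stats(all_sentences)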