Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
Preprocess movie review polarity dataset v2.0
"""Preprocess movie review polarity dataset v2.0."""
import os
import re
import glob
import random
from syntok.tokenizer import Tokenizer
def process(path, pattern, out):
    """Write one labelled, tokenized line per matching review file.

    Every file matched by the glob expression ``path`` whose base name
    matches ``pattern`` is read in full, lower-cased, tokenized with
    syntok, and written to ``out`` as::

        __label__<parent directory name>\t<tokens joined by spaces>\n

    Args:
        path: Glob expression selecting candidate files.
        pattern: Compiled regular expression; it is applied with
            ``match`` (anchored at the start) to each file's base name.
        out: Writable text stream that receives the output lines.
    """
    files = glob.glob(path)
    # One tokenizer instance is reused for every file.
    tok = Tokenizer(replace_not_contraction=False)
    for filename in files:
        if not pattern.match(os.path.basename(filename)):
            continue
        with open(filename, 'r') as inp:
            text = inp.read()
        # The label is the name of the directory that contains the file.
        label = os.path.basename(os.path.dirname(filename))
        tokens = ' '.join(a.value for a in tok.tokenize(text.lower()))
        out.write('__label__%s\t%s\n' % (label, tokens))
# Split the corpus by file name: files whose base name starts with "cv0" go
# to the test file, files whose name starts with "cv" followed by any other
# character go to the training file.
with open('moviestest.txt', 'w') as out:
    process('txt_sentoken/*/*', re.compile('^cv0'), out)
with open('moviestrain.txt', 'w') as out:
    process('txt_sentoken/*/*', re.compile('^cv[^0]'), out)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment