Skip to content

Instantly share code, notes, and snippets.

@andreasvc
Created February 10, 2019 15:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save andreasvc/53947164edf73c6d70eaec26471d53ab to your computer and use it in GitHub Desktop.
Save andreasvc/53947164edf73c6d70eaec26471d53ab to your computer and use it in GitHub Desktop.
Preprocess movie review polarity dataset v2.0
"""Preprocess movie review polarity dataset v2.0.
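
Dataset source: http://www.cs.cornell.edu/people/pabo/movie-review-data/

Reads the files matched by ``txt_sentoken/*/*`` and writes one line per
review, consisting of ``__label__`` plus the name of the review's parent
directory, a tab, and the lowercased, space-joined tokens of the review.
Files whose names start with ``cv0`` go to ``moviestest.txt``; files whose
names start with ``cv`` followed by any other character go to
``moviestrain.txt``.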
"""
import os
import re
import glob
import random
from syntok.tokenizer import Tokenizer
def process(path, pattern, out):
    """Write one labelled, tokenized line per matching file to `out`.

    Every path matched by the glob expression `path` is collected and the
    list is shuffled, so the output order is random. Only files whose base
    name matches the compiled regex `pattern` (via ``pattern.match``) are
    used. For each such file the text is lowercased and tokenized, and a
    line of the form ``__label__<label><TAB><space-joined tokens>`` is
    written to the open file object `out`, where ``<label>`` is the name of
    the directory that contains the file.
    """
    candidates = glob.glob(path)
    random.shuffle(candidates)
    tokenizer = Tokenizer(replace_not_contraction=False)
    for candidate in candidates:
        # Skip anything whose file name does not belong to this split.
        if not pattern.match(os.path.basename(candidate)):
            continue
        with open(candidate, 'r') as handle:
            contents = handle.read()
        # The parent directory name serves as the class label.
        label = os.path.basename(os.path.dirname(candidate))
        tokens = [token.value for token in tokenizer.tokenize(contents.lower())]
        out.write('__label__%s\t%s\n' % (label, ' '.join(tokens)))
# Split by file name: names starting with "cv0" go to the test file, names
# starting with "cv" followed by any other character go to the training file.
for target, regex in (
        ('moviestest.txt', '^cv0'),
        ('moviestrain.txt', '^cv[^0]')):
    with open(target, 'w') as out:
        process('txt_sentoken/*/*', re.compile(regex), out)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment