@henry16lin
Last active February 13, 2020 15:20
imdb preprocess
import os
import re
from itertools import chain

TAG_RE = re.compile(r'<[^>]+>')

def preprocess_text(sen):
    # Remove HTML tags
    sentence = TAG_RE.sub('', sen)
    # Remove punctuation and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)
    # Remove stray single characters
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)
    # Collapse multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)
    return sentence
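# Illustration (not part of the original gist) of what preprocess_text
# does to a raw review string: HTML tags are stripped, non-letters become
# spaces, stray single letters are dropped, and repeated spaces collapse.
#   preprocess_text('<br />This is a GREAT movie!! 10/10')
#   # returns 'This is GREAT movie '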
def readIMDB(path, seg):
    classes = ['pos', 'neg']
    data = []
    for label in classes:
        files = os.listdir(os.path.join(path, seg, label))
        for file in files:
            with open(os.path.join(path, seg, label, file), 'r', encoding='utf8') as rf:
                review = rf.read().replace('\n', '')
            if label == 'pos':
                data.append([preprocess_text(review), 1])
            elif label == 'neg':
                data.append([preprocess_text(review), 0])
    return data
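# readIMDB assumes the standard aclImdb directory layout
# (Large Movie Review Dataset), i.e.:
#   aclImdb/train/pos/*.txt, aclImdb/train/neg/*.txt
#   aclImdb/test/pos/*.txt,  aclImdb/test/neg/*.txt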
def tokenizer(text):
    return [tok.lower() for tok in text.split(' ')]  # simply split on spaces to tokenize
train_data = readIMDB('aclImdb', 'train')
test_data = readIMDB('aclImdb', 'test')

train_tokenized = []
test_tokenized = []
for review, score in train_data:
    train_tokenized.append(tokenizer(review))
for review, score in test_data:
    test_tokenized.append(tokenizer(review))

vocab = set(chain(*train_tokenized))  # collect every distinct token from the tokenized training reviews
vocab_size = len(vocab)
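# A possible next step (a sketch, not part of the original gist): map each
# vocabulary word to an integer index so the tokenized reviews can be fed to
# an embedding layer. Names like word_to_idx / encode are illustrative only.
word_to_idx = {word: i + 1 for i, word in enumerate(vocab)}  # reserve 0 for padding/unknown
word_to_idx['<unk>'] = 0

def encode(tokens):
    # Look up each token; fall back to 0 for out-of-vocabulary words
    return [word_to_idx.get(tok, 0) for tok in tokens]

train_features = [encode(tokens) for tokens in train_tokenized]
test_features = [encode(tokens) for tokens in test_tokenized]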