Skip to content

Instantly share code, notes, and snippets.

@amankharwal
Created August 21, 2020 05:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save amankharwal/b32b33fe5485273d13207aec197c284c to your computer and use it in GitHub Desktop.
Save amankharwal/b32b33fe5485273d13207aec197c284c to your computer and use it in GitHub Desktop.
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
import numpy as np
import re
import io
label2emotion = {0: "others", 1: "happy", 2: "sad", 3: "angry"}
emotion2label = {"others": 0, "happy": 1, "sad": 2, "angry": 3}
emoticons_additional = {
'(^・^)': '<happy>', ':‑c': '<sad>', '=‑d': '<happy>', ":'‑)": '<happy>', ':‑d': '<laugh>',
':‑(': '<sad>', ';‑)': '<happy>', ':‑)': '<happy>', ':\\/': '<sad>', 'd=<': '<annoyed>',
':‑/': '<annoyed>', ';‑]': '<happy>', '(^�^)': '<happy>', 'angru': 'angry', "d‑':":
'<annoyed>', ":'‑(": '<sad>', ":‑[": '<annoyed>', '(�?�)': '<happy>', 'x‑d': '<laugh>',
}
text_processor = TextPreProcessor(
# terms that will be normalized
normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
'time', 'url', 'date', 'number'],
# terms that will be annotated
annotate={"hashtag", "allcaps", "elongated", "repeated",
'emphasis', 'censored'},
fix_html=True, # fix HTML tokens
# corpus from which the word statistics are going to be used
# for word segmentation
segmenter="twitter",
# corpus from which the word statistics are going to be used
# for spell correction
corrector="twitter",
unpack_hashtags=True, # perform word segmentation on hashtags
unpack_contractions=True, # Unpack contractions (can't -> can not)
spell_correct_elong=True, # spell correction for elongated words
# select a tokenizer. You can use SocialTokenizer, or pass your own
# the tokenizer, should take as input a string and return a list of tokens
tokenizer=SocialTokenizer(lowercase=True).tokenize,
# list of dictionaries, for replacing tokens extracted from the text,
# with other expressions. You can pass more than one dictionaries.
dicts=[emoticons, emoticons_additional]
)
def tokenize(text):
text = " ".join(text_processor.pre_process_doc(text))
return text
def preprocessData(dataFilePath, mode):
conversations = []
labels = []
with io.open(dataFilePath, encoding="utf8") as finput:
finput.readline()
for line in finput:
line = line.strip().split('\t')
for i in range(1, 4):
line[i] = tokenize(line[i])
if mode == "train":
labels.append(emotion2label[line[4]])
conv = line[1:4]
conversations.append(conv)
if mode == "train":
return np.array(conversations), np.array(labels)
else:
return np.array(conversations)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment