Tokenization for the toxic comment dataset
""" Tested with Python 3.6 """
import re
import pandas as pd
import spacy
import joblib
from tqdm import tqdm
nlp = spacy.load('en')
def tokenize(texts):
    """Clean each comment with regexes, then tokenize it with spaCy."""
    lists_of_tokens = []
    for text in tqdm(texts, mininterval=2):
        # Replace noisy punctuation and formatting characters with spaces
        # (note that this also strips "!" and ";", so "!" needs no
        # collapsing below)
        text = re.sub(
            r"[\*\"“”\n\\…\+\-\/\=\(\)‘•:\[\]\|’\!;]", " ", str(text))
        # Collapse runs of spaces and of repeated "," / "?"
        text = re.sub(r" +", " ", text)
        text = re.sub(r",+", ",", text)
        text = re.sub(r"\?+", "?", text)
        if len(text) > 20000:
            # Log and truncate extremely long comments
            print(text)
            print(len(text))
            text = text[:20000]
        # Run only the tokenizer (no tagger/parser) for speed
        lists_of_tokens.append(nlp.tokenizer(text))
    return lists_of_tokens
def main():
    # Tokenize the training comments and cache the result with joblib
    train = pd.read_csv('data/train.csv')
    train_tokenized = tokenize(train["comment_text"].tolist())
    joblib.dump(train_tokenized, "cache/train_tokenized.pkl")
    # Free memory before processing the test set
    del train_tokenized, train
    test = pd.read_csv('data/test.csv')
    test_tokenized = tokenize(test["comment_text"].tolist())
    joblib.dump(test_tokenized, "cache/test_tokenized.pkl")


if __name__ == "__main__":
    main()
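
To reuse the cached tokens in a downstream script, the dumps can be reloaded with joblib. A minimal sketch, assuming the script above has already been run and that the pickled spaCy Doc objects round-trip in your spaCy version:

import joblib

# Reload the list of spaCy Doc objects produced by the script above
train_tokenized = joblib.load("cache/train_tokenized.pkl")

# Each entry is a Doc; token.text yields the surface strings
first_tokens = [token.text for token in train_tokenized[0]]
print(len(train_tokenized), "comments cached")
print("first comment starts with:", first_tokens[:10])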