Skip to content

Instantly share code, notes, and snippets.

@VXU1230
Last active April 18, 2019 20:04
Show Gist options
  • Save VXU1230/eed1ec30ebcc1963af30f7eff4f474e2 to your computer and use it in GitHub Desktop.
Save VXU1230/eed1ec30ebcc1963af30f7eff4f474e2 to your computer and use it in GitHub Desktop.
def clean_sentence(dic):
    """Turn one raw record into a list of lemmatized, lower-cased words.

    The record's ``"text"`` value is decoded as UTF-8 unless it is already a
    ``str``, then lower-cased, passed through ``str.translate(EXCLUDE_TRANS)``
    and split on whitespace. Tokens that are not purely alphabetic are
    dropped; every remaining token is tagged on its own with ``pos_tag`` and
    lemmatized using the part of speech that ``POS_DIC`` maps that tag to.

    Args:
        dic: Mapping with a ``"text"`` entry holding the sentence
            (UTF-8 encoded bytes, or an already-decoded ``str``).

    Returns:
        list: The lemmatized words, in their original order.

    Raises:
        KeyError: If ``dic`` has no ``"text"`` entry. The ``POS_DIC`` lookup
            may also raise for an unmapped tag, depending on how ``POS_DIC``
            is defined elsewhere in the file.
    """
    text = dic["text"]
    # Anything that is not already text is decoded exactly as before; a str
    # is accepted as-is instead of failing on a missing ``.decode``.
    if not isinstance(text, str):
        text = text.decode("utf-8")
    # NOTE(review): EXCLUDE_TRANS is defined outside this view - presumably a
    # translation table that strips punctuation; confirm.
    text = text.lower().translate(EXCLUDE_TRANS)

    words = []
    for word in text.split():
        if not word.isalpha():
            continue
        # Each token is tagged in isolation (a one-element list), so the tag
        # is the second item of the first (and only) result pair.
        tag = pos_tag([word])[0][1]
        words.append(LEMMA.lemmatize(word, POS_DIC[tag]))
    return words
# Default number of worker processes used by clean_batch.
NUM_WORKERS = 20


def clean_batch(batch_rows, num_workers=None, chunksize=200):
    """Run ``clean_sentence`` over every row using a pool of worker processes.

    A fresh ``Pool`` is created for each call and released when the call
    finishes. ``Pool.map`` blocks until all rows are processed and returns
    the results in the same order as ``batch_rows``.

    Args:
        batch_rows: Iterable of records, each one a valid argument for
            ``clean_sentence``.
        num_workers: Number of worker processes. ``None`` (the default)
            uses the module-level ``NUM_WORKERS`` as it is at call time.
        chunksize: Approximate number of rows handed to a worker per task.

    Returns:
        list: One ``clean_sentence`` result per input row, in input order.
    """
    if num_workers is None:
        # Resolved at call time so later changes to NUM_WORKERS are honoured.
        num_workers = NUM_WORKERS
    with Pool(processes=num_workers) as pool:
        return pool.map(clean_sentence, batch_rows, chunksize=chunksize)
# Clean the training and test inputs; each clean_batch call runs its own
# worker pool and returns one cleaned word list per input row.
# NOTE(review): np_train_data / np_test_data are defined outside this view -
# presumably sequences of dicts with a UTF-8 bytes "text" entry; confirm.
# NOTE(review): these calls run at module top level. With the "spawn" start
# method, multiprocessing expects pool creation to sit behind an
# `if __name__ == "__main__":` guard - confirm how this file is executed.
train_text = clean_batch(np_train_data)
test_text = clean_batch(np_test_data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment