Created
October 25, 2019 10:08
-
-
Save oneryalcin/2d5651bb4716cd70d7c55362117093e5 to your computer and use it in GitHub Desktop.
Tokenizing channel
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources tokenize() depends on:
#   'punkt'     -> word_tokenize
#   'stopwords' -> English stopword list
#   'wordnet'   -> WordNetLemmatizer
nltk.download(['punkt', 'stopwords', 'wordnet'])
def tokenize(text):
    """
    Tokenize a text into lowercase, lemmatized word tokens.

    Args:
        text (str): Text to tokenize.

    Returns:
        list: Lowercased, lemmatized tokens with English stopwords removed.
    """
    # Remove punctuation: anything that is not alphanumeric becomes a space.
    text = re.sub(r'[^a-zA-Z0-9]', " ", text)

    # Tokenize into words.
    tokens = word_tokenize(text)

    # Build the stopword set once. The original looked up
    # stopwords.words('english') inside the comprehension, re-reading and
    # linearly scanning the whole list for every token.
    stop_words = set(stopwords.words('english'))

    # Lemmatize to get the root of each word. Lowercase BEFORE the stopword
    # check — NLTK stopwords are lowercase, so filtering the raw token let
    # capitalized stopwords like "The" slip through. Lowercasing before
    # lemmatizing also helps WordNet, which is case-sensitive.
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(tok)
            for tok in (t.lower() for t in tokens)
            if tok not in stop_words]
Example: | |
>>> tokenize('ep+17+horrid+henry+and+the+dinner+guests') | |
>>> ['ep', 'horrid', 'henry', 'dinner', 'guest'] | |
>>> tokenize('brit+cops%3a+war+on+crime') | |
>>> ['brit', 'cop', 'war', 'crime'] | |
>>> df.head()['content_name'].apply(lambda x: tokenize(x) if x else x)
0 [coronation, street] | |
1 [ep, mirror] | |
2 None | |
3 [go] | |
4 [forrest, gump] | |
Name: content_name, dtype: object
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment