Created
May 7, 2018 02:24
-
-
Save limitpointinf0/07dec19467eac9c2168120c44819d080 to your computer and use it in GitHub Desktop.
Text Cleaning and EDA
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Text cleaning and EDA helpers.

This script contains two functions: the first creates a word cloud from a
string, and the second cleans the text found in a dataframe column.
"""
import numpy as np # linear algebra | |
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) | |
import os | |
from wordcloud import WordCloud, STOPWORDS | |
import string | |
import matplotlib.pyplot as plt | |
from nltk.corpus import stopwords | |
from nltk.stem.porter import PorterStemmer | |
from nltk.tokenize import word_tokenize, sent_tokenize | |
from nltk.sentiment.vader import SentimentIntensityAnalyzer | |
def plot_wordcloud(text, title=None, max = 1000, size=(10,5), title_size=16):
    """Draw a word cloud of ``text`` on a new matplotlib figure.

    Args:
        text: String the word cloud is generated from.
        title: Title placed above the image.
        max: Passed to ``WordCloud`` as ``max_words``.
        size: Figure size handed to ``plt.figure`` as ``figsize``.
        title_size: Font size of the title.

    The figure is only prepared here; ``plt.show()`` is not called and
    nothing is returned.
    """
    # Build the cloud first so a failure here leaves no empty figure behind.
    generator = WordCloud(max_words=max)
    cloud = generator.generate(text)

    plt.figure(figsize=size)
    plt.title(title, size=title_size)
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
def clean_ColText(df, col, stem=True):
    """Clean the strings in ``df[col]`` and replace them with lists of word tokens.

    Each cell is processed in this order: ASCII punctuation
    (``string.punctuation``) is removed, the text is lowercased and tokenized
    with ``word_tokenize``, English stop words are dropped, tokens that are
    exactly a smart quote or an em dash are dropped and, if requested, the
    remaining tokens are Porter-stemmed.

    Args:
        df: Dataframe holding the text column. It is modified in place.
        col: Name of the column to clean. Every cell must be a string, since
            ``translate`` and ``lower`` are called on it.
        stem: When true (the default), stem every remaining token.

    Returns:
        The same ``df`` object, with ``df[col]`` holding lists of tokens.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    stop_words = set(stopwords.words('english'))
    # Stand-alone typographic tokens that string.punctuation does not cover.
    # Built once as a set instead of a list re-created for every row.
    stray_symbols = {"’", "”", "“", "‘", "—"}
    porter = PorterStemmer() if stem else None

    def _clean(text):
        """Turn one raw string into its list of cleaned tokens."""
        tokens = word_tokenize(text.translate(punctuation_table).lower())
        tokens = [
            tok for tok in tokens
            if tok not in stop_words and tok not in stray_symbols
        ]
        if porter is not None:
            tokens = [porter.stem(tok) for tok in tokens]
        return tokens

    # One pass over the column instead of a separate pass per cleaning step.
    df[col] = df[col].map(_clean)
    return df
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment