Skip to content

Instantly share code, notes, and snippets.

Created October 23, 2018 15:41
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save AdityaSoni19031997/4f4fbb12d59987fe739e0cc7f49efdab to your computer and use it in GitHub Desktop.
Save AdityaSoni19031997/4f4fbb12d59987fe739e0cc7f49efdab to your computer and use it in GitHub Desktop.
#from my repo
def preprocess_word(word):
    """Normalise a single token.

    Steps, in order:
      1. strip leading/trailing punctuation (quotes, ``?!,.():;``);
      2. collapse any run of a repeated character to exactly two
         occurrences (``funnnnny`` -> ``funny``);
      3. drop every hyphen and apostrophe (``don't`` -> ``dont``).

    Args:
        word: the token to clean.

    Returns:
        The cleaned token; may be the empty string.
    """
    # Imported locally because this module has no top-level ``import re``.
    import re

    word = word.strip('\'"?!,.():;')
    # A run of 2+ identical characters becomes exactly two of them.
    word = re.sub(r'(.)\1+', r'\1\1', word)
    # Remove - & '
    return word.replace('-', '').replace("'", '')
def is_valid_word(word):
    """Tell whether ``word`` looks like a usable token.

    A word is valid when it begins with an ASCII letter and every following
    character is an ASCII letter, a digit, a dot or an underscore.

    Args:
        word: the token to check.

    Returns:
        True if the token matches, False otherwise (including for ``''``).
    """
    # Imported locally because this module has no top-level ``import re``.
    import re

    # Check if word begins with an alphabet
    return re.search(r'^[a-zA-Z][a-z0-9A-Z\._]*$', word) is not None
def handle_emojis(tweet):
    """Replace ASCII emoticons in ``tweet`` with sentiment placeholders.

    Positive emoticons become ``' EMO_POS '`` and negative ones
    ``' EMO_NEG '`` (both padded with a space on each side). The rules are
    applied in the order listed below.

    Matching is case-sensitive: the alternatives written with an upper-case
    ``D`` (``:D``, ``xD``, ``;D``) do not match ``:d``, ``xd`` or ``;d``.

    Args:
        tweet: the text to scan.

    Returns:
        The text with every recognised emoticon replaced.
    """
    # Imported locally because this module has no top-level ``import re``.
    import re

    rules = (
        # Smile -- :), : ), :-), (:, ( :, (-:, :')
        (r'(:\s?\)|:-\)|\(\s?:|\(-:|:\'\))', ' EMO_POS '),
        # Laugh -- :D, : D, :-D, xD, x-D, XD, X-D
        (r'(:\s?D|:-D|x-?D|X-?D)', ' EMO_POS '),
        # Love -- <3, :*
        (r'(<3|:\*)', ' EMO_POS '),
        # Wink -- ;-), ;), ;-D, ;D, (;, (-;
        (r'(;-?\)|;-?D|\(-?;)', ' EMO_POS '),
        # Sad -- :-(, : (, :(, ):, )-:
        (r'(:\s?\(|:-\(|\)\s?:|\)-:)', ' EMO_NEG '),
        # Cry -- :,(, :'(, :"(
        (r'(:,\(|:\'\(|:"\()', ' EMO_NEG '),
    )
    for pattern, placeholder in rules:
        tweet = re.sub(pattern, placeholder, tweet)
    return tweet
def preprocess_tweet(tweet, use_stemmer=None):
    """Clean a raw tweet and return its valid words joined by single spaces.

    The tweet is lower-cased, URLs become ``URL``, ``@handles`` become
    ``USER_MENTION``, ``#`` is dropped from hashtags, the retweet marker
    ``rt`` and runs of dots are removed, and emoticons are replaced through
    ``handle_emojis``. Each remaining token is then cleaned with
    ``preprocess_word`` and kept only if ``is_valid_word`` accepts it.

    NOTE(review): lower-casing happens before ``handle_emojis`` runs, so that
    function's upper-case ``D`` alternatives (``:D``, ``xD``, ``;D``) cannot
    match here.

    Args:
        tweet: raw tweet text.
        use_stemmer: stem every kept word when true. When left as ``None``
            a module-level ``use_stemmer`` flag is honoured if one exists,
            otherwise no stemming is done.

    Returns:
        The cleaned tweet as a single space-separated string.
    """
    # Imported locally because this module has no top-level ``import re``.
    import re

    if use_stemmer is None:
        # The flag used to be read from a module global; keep honouring it.
        use_stemmer = globals().get('use_stemmer', False)
    stemmer = None
    if use_stemmer:
        stemmer = globals().get('porter_stemmer')
        if stemmer is None:
            # Only pay for the nltk import on the stemming path.
            from nltk.stem.porter import PorterStemmer
            stemmer = PorterStemmer()

    # Convert to lower case
    tweet = tweet.lower()
    # Replaces URLs with the word URL
    tweet = re.sub(r'((www\.[\S]+)|(https?://[\S]+))', ' URL ', tweet)
    # Replace @handle with the word USER_MENTION
    tweet = re.sub(r'@[\S]+', 'USER_MENTION', tweet)
    # Replaces #hashtag with hashtag
    tweet = re.sub(r'#(\S+)', r' \1 ', tweet)
    # Remove RT (retweet)
    tweet = re.sub(r'\brt\b', '', tweet)
    # Replace 2+ dots with space
    tweet = re.sub(r'\.{2,}', ' ', tweet)
    # Strip space, " and ' from tweet
    tweet = tweet.strip(' "\'')
    # Replace emojis with either EMO_POS or EMO_NEG
    tweet = handle_emojis(tweet)

    processed_tweet = []
    # split() already treats any run of whitespace as one separator.
    for word in tweet.split():
        word = preprocess_word(word)
        if not is_valid_word(word):
            continue
        if stemmer is not None:
            word = str(stemmer.stem(word))
        processed_tweet.append(word)
    return ' '.join(processed_tweet)
def preprocessing_text(s):
    """Normalise a question or sentence into a canonical plain-text form.

    Processing order (each stage depends on the ones before it):
      1. every character other than ASCII letters, digits, ``'`` and the
         operators ``/ + - * =`` becomes a space;
      2. a ``k`` suffix on a number is expanded (``50k`` -> ``50000``);
      3. ``e-mail`` / ``e - mail`` become ``email``;
      4. arithmetic operators are spelled out (``2/3`` -> ``2 divide 3``,
         ``+`` -> ``plus``, ``-`` -> ``minus``, ``*`` -> ``multiply``,
         ``=`` -> ``equal``);
      5. English contractions are expanded and leftover apostrophes removed;
      6. a fixed list of question phrasings and country names is rewritten;
      7. whitespace is collapsed and the result is stripped.

    Args:
        s: the text to normalise.

    Returns:
        The normalised text.
    """
    import re

    # Apostrophes must survive this filter, otherwise none of the
    # contraction rules below could ever match.
    s = re.sub(r"[^A-Za-z0-9'/+\-*=]", " ", s)
    # expand 'k' to '000' eg. 50k to 50000; \b keeps "5km" intact
    s = re.sub(r"(\d+)k\b", r"\g<1>000", s)
    # Must run before "-" is turned into " minus ".
    s = re.sub(r"\be(?:-| - )mail\b", "email", s)
    # change number/number to number divide number (eg. 2/3 to 2 divide 3)
    s = re.sub(r"(\d+)/", r"\g<1> divide ", s)
    # replace the rest of / with white space
    s = s.replace("/", " ")
    s = s.replace("+", " plus ")
    s = s.replace("-", " minus ")
    s = s.replace("*", " multiply ")
    s = s.replace("=", " equal ")

    # Suffix rules require a preceding letter so that an opening quote
    # ('dog') is not mistaken for a contraction.
    contractions = (
        (r"\bWhat's\b", "What is "),
        (r"\bwhat's\b", "what is "),
        (r"\bWho's\b", "Who is "),
        (r"\bwho's\b", "who is "),
        (r"(?<=[A-Za-z])'s\b", " "),
        (r"(?<=[A-Za-z])'ve\b", " have "),
        (r"\bcan't\b", "cannot "),
        (r"n't\b", " not "),
        (r"(?<=[A-Za-z])'re\b", " are "),
        (r"(?<=[A-Za-z])'d\b", " would "),
        (r"(?<=[A-Za-z])'ll\b", " will "),
        (r"(?<=[A-Za-z])'m\b", " am "),
        # Whatever apostrophes remain are plain quotes.
        (r"'", " "),
    )
    # Order matters: later rules see the output of earlier ones.
    phrase_rewrites = (
        (r"\bor not\b", " "),
        (r"What should I do to", "How can I"),
        (r"How do I", "How can I"),
        (r"How can you make", "What can make"),
        (r"How do we", "How do I"),
        (r"How do you", "How do I"),
        (r"Is it possible", "Can we"),
        (r"Why is", "Why"),
        (r"Which are", "What are"),
        (r"What are the reasons", "Why"),
        (r"What are some tips", "tips"),
        (r"What is the best way", "best way"),
        # Whole words only: an unanchored "us" would also rewrite
        # "because", "use" and the "plus"/"minus" produced above.
        # NOTE(review): the pronoun "us" is still mapped to "America",
        # as the original rule list asked for -- confirm that is wanted.
        (r"\b(?:USA|US|usa|us)\b", "America"),
        (r"Chinese", "China"),
        (r"india", "India"),
    )
    for pattern, replacement in contractions + phrase_rewrites:
        s = re.sub(pattern, replacement, s)

    # remove extra white space
    s = re.sub(r"\s{2,}", " ", s)
    return s.strip()
def remove_stopwords(string, stop_words=None):
    """Lower-case ``string`` and drop its stop words.

    The text is split on whitespace, every token is lower-cased, and tokens
    found in the stop-word collection are discarded.

    Args:
        string: the text to filter.
        stop_words: optional iterable of lower-case words to remove. When
            omitted, NLTK's English stop-word list is used (this requires
            the NLTK ``stopwords`` corpus to be available).

    Returns:
        The remaining lower-cased tokens joined by single spaces.
    """
    if stop_words is None:
        from nltk.corpus import stopwords
        stop_words = stopwords.words("english")
    # A set gives constant-time membership tests.
    blocked = set(stop_words)
    lowered = (word.lower() for word in string.split())
    # Build a new list rather than deleting from one while iterating it,
    # which would skip the token after each removal.
    return ' '.join(word for word in lowered if word not in blocked)
def get_char_length_ratio(row):
    """Return the length of ``row['tweet']`` relative to its stop-word-free form.

    Args:
        row: a mapping providing ``'tweet'`` and ``'tweet_without_stopwords'``
            entries that support ``len()``.

    Returns:
        ``len(row['tweet'])`` divided by ``len(row['tweet_without_stopwords'])``,
        with the divisor raised to 1 when it would otherwise be 0.
    """
    full_length = len(row['tweet'])
    reduced_length = len(row['tweet_without_stopwords'])
    # Clamp the divisor so an empty filtered tweet cannot divide by zero.
    divisor = max(1, reduced_length)
    return full_length / divisor
def get_synonyms(word):
    """Collect the distinct WordNet lemma names for ``word``.

    Every synset WordNet returns for ``word`` is visited and the name of
    each of its lemmas is collected.

    Args:
        word: the word to look up.

    Returns:
        A list of unique lemma names in arbitrary order; empty when WordNet
        has no synset for ``word``.
    """
    from nltk.corpus import wordnet as wn

    # One lookup is enough: with no synsets the comprehension is simply empty.
    names = {
        lemma.name()
        for synset in wn.synsets(word)
        for lemma in synset.lemmas()
    }
    return list(names)
def get_row_syn_set(row):
    """Return the tokens of ``row`` together with all of their synonyms.

    Args:
        row: the text to tokenize with ``nltk.word_tokenize``.

    Returns:
        A set holding every token of ``row`` plus every name that
        ``get_synonyms`` yields for those tokens.
    """
    import nltk

    # Tokenize once and reuse the result for both the seed set and the loop.
    tokens = nltk.word_tokenize(row)
    syn_set = set(tokens)
    for token in tokens:
        # Tokens without synonyms contribute an empty list, a no-op here.
        syn_set.update(get_synonyms(token))
    return syn_set
def get_Levenshtein(string1, string2):
    """Return the edit distance between the two arguments.

    Thin wrapper around ``editdistance.eval``.

    Args:
        string1: first sequence.
        string2: second sequence.

    Returns:
        Whatever ``editdistance.eval(string1, string2)`` returns.
    """
    from editdistance import eval as edit_distance

    distance = edit_distance(string1, string2)
    return distance
def num_pos(sent):
    """Count the positive words in ``sent`` that are not directly negated.

    The sentence is tokenized with ``nltk.word_tokenize`` and lower-cased.
    A token counts when it appears in the module-level ``positive_words``
    collection and the token immediately before it is neither ``'not'``
    nor ``'no'``.

    Args:
        sent: the sentence to score.

    Returns:
        The number of un-negated positive words.
    """
    negators = ('not', 'no')
    tokens = [token.lower() for token in nltk.word_tokenize(sent)]
    count = 0
    for index, token in enumerate(tokens):
        if token not in positive_words:
            continue
        # The first token has no predecessor; indexing with -1 would wrap
        # around and read the last token of the sentence instead.
        if index > 0 and tokens[index - 1] in negators:
            continue
        count += 1
    return count
def num_neg(sent):
    """Count the negative words in ``sent`` that are not directly negated.

    The sentence is tokenized with ``nltk.word_tokenize`` and lower-cased.
    A token counts when it appears in the module-level ``negative_words``
    collection and the token immediately before it is neither ``'not'``
    nor ``'no'``.

    Args:
        sent: the sentence to score.

    Returns:
        The number of un-negated negative words.
    """
    negators = ('not', 'no')
    tokens = [token.lower() for token in nltk.word_tokenize(sent)]
    count = 0
    for index, token in enumerate(tokens):
        if token not in negative_words:
            continue
        # The first token has no predecessor; indexing with -1 would wrap
        # around and read the last token of the sentence instead.
        if index > 0 and tokens[index - 1] in negators:
            continue
        count += 1
    return count
import nltk
import requests

# Addresses of the positive and negative word lists.
# NOTE(review): both are empty here, so they need real addresses before the
# downloads below can succeed.
p_url = ''
n_url = ''

# Each list is downloaded, decoded as latin-1, tokenized, and then has its
# leading tokens dropped.
# NOTE(review): the offsets 413 and 418 presumably skip a header preamble in
# the downloaded files -- confirm against the actual sources.
positive_words = nltk.word_tokenize(
    requests.get(p_url).content.decode('latin-1')
)[413:]
negative_words = nltk.word_tokenize(
    requests.get(n_url).content.decode('latin-1')
)[418:]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment