Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
import re
import nltk
import time
import pandas as pd
import numpy as np
import pickle
from nltk import word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from pattern.en import singularize
from nltk.stem.snowball import SnowballStemmer

# Shared text-normalization resources: the English stop words from the NLTK
# corpus and an English Snowball stemmer.
stop = stopwords.words('english')
stemmer = SnowballStemmer('english')
# Load the input tables; the relative paths are resolved against the
# current working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
# attributes = pd.read_csv('attributes.csv')  # currently not loaded
description = pd.read_csv('product_descriptions.csv')
def clean_text(sentence):
    """Normalize one product-description string.

    The steps run in order, each on the output of the previous one:

    1. decode byte strings as UTF-8 (undecodable bytes are replaced);
    2. stem every whitespace-separated token with the module-level
       ``stemmer``;
    3. drop tokens that appear in the module-level ``stop`` list;
    4. replace every non-word character with a space;
    5. singularize every token with ``singularize``;
    6. drop the tokens 'x' and 'in'.

    Args:
        sentence: Text to clean, either a byte string or an
            already-decoded text string. Other types are not handled.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    # `unicode()` exists only on Python 2 and raises on already-decoded
    # text. Decoding byte strings explicitly gives the same result for
    # bytes input and lets text pass through untouched.
    if isinstance(sentence, bytes):
        sentence = sentence.decode('utf-8', 'replace')

    # Set membership is O(1); scanning the stop list for every token is not.
    stop_words = set(stop)

    tokens = [stemmer.stem(w) for w in sentence.split()]
    tokens = [w for w in tokens if w not in stop_words]
    cleaned = re.sub(r'\W', ' ', ' '.join(tokens))
    # Singularize the text produced by the steps above. Re-splitting the raw
    # input here would silently discard the stemming, stop-word removal and
    # punctuation stripping.
    tokens = [singularize(w) for w in cleaned.split()]
    return ' '.join(w for w in tokens if w != 'x' and w != 'in')
# NOTE(review): `Bullets` and `cleaning_text` are not defined in the visible
# code; the next line looks like a stale experiment -- confirm before reviving.
#Bullets_cleaned=Bullets['Bullets'].apply(lambda x: cleaning_text(x))

# Clean every product description with clean_text.
Description_cleaned = description['product_description'].apply(clean_text)

# Cache the cleaned column on disk. The `with` block guarantees the file
# handle is flushed and closed even if pickling raises.
with open("Description_cleaned.p", "wb") as cache_file:
    pickle.dump(Description_cleaned, cache_file)

# To reuse the cache instead of recomputing (only unpickle files you created
# yourself -- pickle can execute arbitrary code on load):
#Description_cleaned=pickle.load(open( "Description_cleaned.p", "rb" ))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.