Skip to content

Instantly share code, notes, and snippets.

@Jian-Qiao
Last active August 23, 2017 16:55
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Jian-Qiao/329357e98155ade7222150da1691da64 to your computer and use it in GitHub Desktop.
Save Jian-Qiao/329357e98155ade7222150da1691da64 to your computer and use it in GitHub Desktop.
import re
import nltk
import time
import pandas as pd
import numpy as np
import pickle
from nltk import word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from pattern.en import singularize
stop = stopwords.words('english')
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('english')
# Load the input tables; all paths are relative to the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
# attributes = pd.read_csv('attributes.csv')
description = pd.read_csv('product_descriptions.csv')
def clean_text(sentence):
    """Normalize one piece of text into a space-separated token string.

    The steps run in order, each one feeding the next:
      1. Decode byte strings as UTF-8, replacing undecodable bytes.
         Text that is already decoded is used as-is.
      2. Stem every whitespace-separated token with the module-level
         ``stemmer``.
      3. Drop tokens that appear in the module-level ``stop`` list.
      4. Replace every non-word character with a space.
      5. Singularize every token with ``singularize``.
      6. Drop the tokens ``'x'`` and ``'in'``.

    Args:
        sentence: Raw text, either a byte string or decoded text.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # Only byte strings need decoding; calling unicode() on text that is
    # already decoded raises TypeError on Python 2, and the name does not
    # exist at all on Python 3.
    if isinstance(sentence, bytes):
        sentence = sentence.decode('utf-8', 'replace')
    cleaned = ' '.join(stemmer.stem(w) for w in sentence.split())
    cleaned = ' '.join(w for w in cleaned.split() if w not in stop)
    cleaned = re.sub(r'\W', ' ', cleaned)
    # Singularize the text produced by the steps above. The previous code
    # re-read the raw `sentence` here, silently discarding the stemming,
    # stop-word removal and punctuation stripping.
    cleaned = ' '.join(singularize(w) for w in cleaned.split())
    # NOTE(review): 'x' and 'in' are presumably dimension leftovers such as
    # "2 in x 4 in" -- confirm with the data. 'in' can reappear here after
    # the punctuation split even if it was filtered as a stop word earlier.
    cleaned = ' '.join(w for w in cleaned.split() if w != 'x' and w != 'in')
    return cleaned
#Bullets_cleaned=Bullets['Bullets'].apply(lambda x: cleaning_text(x))
# Clean every product description, then cache the result on disk.
Description_cleaned = description['product_description'].apply(clean_text)
# Write through a context manager so the pickle file is flushed and closed
# deterministically instead of leaking the handle returned by open().
with open("Description_cleaned.p", "wb") as cache_file:
    pickle.dump(Description_cleaned, cache_file)
#Description_cleaned=pickle.load(open( "Description_cleaned.p", "rb" ))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment