Last active
August 29, 2015 13:57
-
-
Save Azure-rong/9732931 to your computer and use it in GitHub Desktop.
Feature extraction:Review's entropy and perplexity by building ngram language model
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python2.7 | |
#coding=utf-8 | |
""" | |
Compute review's entropy and perplexity. | |
This module aims to build an n-gram language model from reviews and then compute each review's entropy and perplexity as features.
""" | |
import itertools | |
import nltk | |
from nltk.model.ngram import NgramModel | |
# Read review data from an Excel file, filter out stopwords, and return a flat list of review words
def word_by_word_review(filepath, sheetnum, colnum, stopword_path='D:/code/stopword.txt'):
    """Return all non-stopword tokens of every review as one flat list.

    Each cell read from the Excel column is segmented with
    ``tp.segmentation(cell, 'list')``; tokens that appear in the stopword
    file, and single-space tokens, are dropped. The per-review token lists
    are then concatenated in order.

    NOTE(review): ``tp`` is not imported anywhere in the visible source; it
    is presumably the project's text-processing helper module -- confirm and
    add the import at the top of the file.

    Args:
        filepath: Path of the Excel workbook, passed through to ``tp``.
        sheetnum: Sheet selector, passed through to ``tp``.
        colnum: Column selector, passed through to ``tp``.
        stopword_path: UTF-8 text file with one stopword per line.

    Returns:
        A one-dimensional list of the remaining tokens of all reviews.
    """
    import io  # local import: io.open decodes UTF-8 on both Python 2 and 3

    # Read the review cells and segment every review into a token list.
    cells = tp.get_excel_data(filepath, sheetnum, colnum, 'data')
    row_count = tp.get_excel_data(filepath, sheetnum, colnum, 'rownum')
    review_data = [tp.segmentation(cell, 'list') for cell in cells[0:row_count]]

    # Load the stopwords once into a set so each membership test is O(1).
    # The context manager closes the file even if reading/decoding fails.
    with io.open(stopword_path, 'r', encoding='utf8') as stopfile:
        stopwords = set(stopfile.read().split('\n'))

    # Filter stopwords and bare spaces out of every review.
    seg_fil_result = [
        [word for word in review if word not in stopwords and word != ' ']
        for review in review_data
    ]

    # Flatten the per-review lists into a single one-dimensional list.
    return list(itertools.chain.from_iterable(seg_fil_result))
# Compute the entropy and perplexity of each review under an n-gram language model
def entropy_perplexity(model, dataset):
    """Score every item of *dataset* with *model*.

    Args:
        model: Object exposing ``entropy(item)`` and ``perplexity(item)``.
        dataset: Iterable of items to score.

    Returns:
        A list of ``(entropy, perplexity)`` tuples, one per item, in the
        iteration order of *dataset*.
    """
    # Tuple elements evaluate left to right, so entropy is computed before
    # perplexity for each item.
    return [
        (model.entropy(review), model.perplexity(review))
        for review in dataset
    ]
# Store features | |
def store_name_brand_attribute_features(review_data, filepath, sheetnum, colnum, storepath):
    """Write the entropy and perplexity of each review to a text file.

    A unigram ``NgramModel`` is built from *review_data*, then every review
    returned by ``tp.seg_fil_excel(filepath, sheetnum, colnum)`` is scored
    against it. One line per review is written to *storepath* in the form
    ``<entropy>\\t<perplexity>\\n``.

    NOTE(review): ``tp`` is not imported anywhere in the visible source;
    confirm which module it refers to.

    Args:
        review_data: Training tokens for the language model.
        filepath: Path of the Excel workbook, passed through to ``tp``.
        sheetnum: Sheet selector, passed through to ``tp``.
        colnum: Column selector, passed through to ``tp``.
        storepath: Path of the output file; it is opened in ``'w'`` mode.
    """
    # Build a unigram language model for this product category's reviews.
    # The original author flagged that the estimator still needs to be set.
    lm = NgramModel(1, review_data, estimator=None)

    # Read the full review dataset and score every review.
    data = tp.seg_fil_excel(filepath, sheetnum, colnum)
    ep = entropy_perplexity(lm, data)

    # The context manager closes the file even if a write fails.
    with open(storepath, 'w') as out:
        for ent, per in ep:
            out.write(str(ent) + '\t' + str(per) + '\n')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment