Skip to content

Instantly share code, notes, and snippets.

@Azure-rong
Last active August 29, 2015 13:57
Show Gist options
  • Save Azure-rong/9732821 to your computer and use it in GitHub Desktop.
Save Azure-rong/9732821 to your computer and use it in GitHub Desktop.
Feature extraction:Review word, sentence and review length features
#! /usr/bin/env python2.7
#coding=utf-8
"""
Count each review's word number, sentence number and review length.
This module aims to extract word-number, sentence-number and review-length features from reviews.
"""
import textprocessing as tp
# Function counting review word number, sentence number and review length
def word_sent_count(dataset):
    """Compute word count, sentence count and review length for each review.

    Every review is split with ``tp.cut_sentence`` and segmented with
    ``tp.segmentation(review, 'list')``; the lengths of those two results
    are taken as the sentence number and the word number.

    Args:
        dataset: Iterable of reviews; each item is handed unchanged to the
            ``tp`` helpers.

    Returns:
        A list holding one ``[word_num, sent_num, sent_word]`` entry per
        review, where ``sent_word`` (the "review length") is
        ``word_num / sent_num`` as a float, or ``0.0`` when the review
        produced no sentences.
    """
    features = []
    for review in dataset:
        sent_num = len(tp.cut_sentence(review))
        word_num = len(tp.segmentation(review, 'list'))
        if sent_num:
            # float() keeps true division under Python 2.
            sent_word = float(word_num) / sent_num
        else:
            # A review with no sentences would otherwise raise
            # ZeroDivisionError and abort the whole extraction run.
            sent_word = 0.0
        features.append([word_num, sent_num, sent_word])
    return features
# Store features
def store_word_sent_num_features(filepath, sheetnum, colnum, data, storepath):
    """Extract the word/sentence/length features and write them to a text file.

    The reviews are loaded with ``tp.seg_fil_excel(filepath, sheetnum,
    colnum)``, passed through ``word_sent_count`` and written to
    ``storepath`` one review per line as
    ``"<word_num> <sent_num> <sent_word>"``.

    Args:
        filepath: Forwarded to ``tp.seg_fil_excel`` (presumably the path of
            the Excel workbook -- confirm against ``textprocessing``).
        sheetnum: Forwarded to ``tp.seg_fil_excel``.
        colnum: Forwarded to ``tp.seg_fil_excel``.
        data: Ignored. The value passed in was always replaced by the
            result of ``tp.seg_fil_excel``; the parameter is kept only so
            existing callers keep working.
        storepath: Path of the output file; it is opened in ``'w'`` mode,
            so an existing file is overwritten.
    """
    reviews = tp.seg_fil_excel(filepath, sheetnum, colnum)
    rows = word_sent_count(reviews)
    # The context manager closes the file even if a write fails part-way.
    with open(storepath, 'w') as f:
        for word_num, sent_num, sent_word in rows:
            f.write(str(word_num) + ' ' + str(sent_num) + ' ' + str(sent_word) + '\n')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment