Skip to content

Instantly share code, notes, and snippets.

@Azure-rong
Last active August 29, 2015 13:57
Show Gist options
  • Save Azure-rong/9732853 to your computer and use it in GitHub Desktop.
Save Azure-rong/9732853 to your computer and use it in GitHub Desktop.
Feature extraction:Review product name, brand and attribute appearing times feature
#! /usr/bin/env python2.7
#coding=utf-8
"""
Count how many times product names, product brands and product attributes appear in each review.
This module aims to extract the product name, brand and attribute features.
"""
import textprocessing as tp
# Read the product name, product brand and product attribute word lists from
# text files. This runs at import time, so the files must exist when the module
# is imported. The "lines" mode presumably yields one entry per line -- confirm
# against textprocessing.get_txt_data.
name = tp.get_txt_data("D:/code/product_name.txt", "lines")
brand = tp.get_txt_data("D:/code/product_brand.txt", "lines")
# NOTE(review): unlike the two paths above, this one has no ".txt" extension --
# confirm that this is intentional.
attribute = tp.get_txt_data("D:/code/product_attribute", "lines")
# Count how many times product names, brands and attributes appear per review
def name_brand_attribute(dataset, names=None, brands=None, attributes=None):
    """Count product name, brand and attribute words in each review.

    Each word of a review is checked against the three word collections in
    order (names, then brands, then attributes) and is counted for the first
    collection that contains it only, so a word listed in several collections
    is never counted twice.

    Args:
        dataset: iterable of reviews, each review being an iterable of words.
        names: collection of product names (anything supporting ``in``).
            Defaults to the module-level ``name`` list.
        brands: collection of product brands. Defaults to the module-level
            ``brand`` list.
        attributes: collection of product attributes. Defaults to the
            module-level ``attribute`` list.

    Returns:
        A list with one ``(name_count, brand_count, attribute_count)`` tuple
        per review, in the same order as ``dataset``.
    """
    # Fall back to the word lists loaded at import time so that existing
    # single-argument callers keep working unchanged.
    if names is None:
        names = name
    if brands is None:
        brands = brand
    if attributes is None:
        attributes = attribute

    counts = []
    for review in dataset:
        # Fresh counters for every review.
        n, b, a = 0, 0, 0
        for word in review:
            if word in names:
                n += 1
            elif word in brands:
                b += 1
            elif word in attributes:
                a += 1
        counts.append((n, b, a))
    return counts
# Store features
def store_name_brand_attribute_features(filepath, sheetnum, colnum, data, storepath):
    """Extract the name/brand/attribute counts from an Excel column and save them.

    The reviews are read with ``tp.seg_fil_excel(filepath, sheetnum, colnum)``,
    counted with ``name_brand_attribute`` and written to ``storepath`` as one
    line per review in the form ``"<names> <brands> <attributes>"``.

    Args:
        filepath: passed through to ``tp.seg_fil_excel`` (presumably the path
            of the Excel workbook -- confirm against textprocessing).
        sheetnum: passed through to ``tp.seg_fil_excel``.
        colnum: passed through to ``tp.seg_fil_excel``.
        data: unused. The reviews are always re-read from ``filepath``; the
            parameter is kept only so existing callers do not break.
        storepath: path of the text file to write; it is overwritten.
    """
    reviews = tp.seg_fil_excel(filepath, sheetnum, colnum)
    # Relies on the module-level word lists having been loaded at import time.
    n_b_a = name_brand_attribute(reviews)

    # The context manager closes the file even if a write fails part-way.
    with open(storepath, 'w') as f:
        for row in n_b_a:
            f.write(str(row[0]) + ' ' + str(row[1]) + ' ' + str(row[2]) + '\n')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment