Skip to content

Instantly share code, notes, and snippets.

@ronaldgreeff
Last active January 5, 2019 01:34
Show Gist options
  • Save ronaldgreeff/07879a2d93b3db4119efca660e0e1c2c to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
import os, sys, re
# Make the sibling '../lib' directory importable, resolved relative to this file.
lib_path = os.path.realpath(os.path.join(os.path.abspath(os.path.dirname(__file__)), '..', 'lib'))
if lib_path not in sys.path:
    # Prepend (not append) so project-local modules shadow installed ones.
    sys.path[0:0] = [lib_path]
# Main
import json
import random
import numpy as np
import pandas as pd
from difflib import SequenceMatcher
#from sklearn.feature_extraction import DictVectorizer
#from sklearn import svm, preprocessing, cluster#, cross_validation
#from sklearn.metrics import precision_recall_curve, auc, classification_report, precision_recall_fscore_support
#from sklearn.preprocessing import StandardScaler
from nltk.corpus import stopwords
#from textblob import Word, TextBlob
# DBSCAN
#####################################################
################# Temporary Helpers #################
# Per-extract text-node indices for the fields of interest, keyed by file number.
# Files 0-2, 3-6 and 7-8 share the same layout, hence the grouped construction.
DATA_LOOKUP = dict(
    [(n, {'breadcrumbs': 18, 'title': 19, 'price': 20}) for n in (0, 1, 2)] +
    [(n, {'breadcrumbs': 20, 'title': 21, 'price': 22}) for n in (3, 4, 5, 6)] +
    [(n, {'breadcrumbs': 11, 'title': 13, 'price': 17}) for n in (7, 8)]
)
#####################################################
##################### Constants #####################
# English stop-word list from NLTK (requires the 'stopwords' corpus to be downloaded).
STOP = stopwords.words('english')
#####################################################
###################### Helpers ######################
def load_data(file):
    """Parse the JSON document at path *file* and return the resulting object."""
    with open(file) as handle:
        return json.load(handle)
def longest_substring(string_list):
    """Return the longest substring common to every non-None string in *string_list*.

    Strings are stripped of surrounding whitespace first. Returns "" for an
    empty input, and falls back to the (stripped) first string when the
    entries share no common substring — preserving the original fallback.

    Bug fixed: the original overwrote its result every iteration, so only the
    comparison between the first and last strings survived; the match is now
    folded across all entries.
    """
    # 's is not None' instead of 'not s == None'; a single strip() suffices.
    candidates = [s.strip() for s in string_list if s is not None]
    if not candidates:
        return ""
    common = candidates[0]
    for other in candidates[1:]:
        match = SequenceMatcher(None, common, other).find_longest_match(
            0, len(common), 0, len(other))
        common = common[match.a: match.a + match.size]
        if not common:
            break  # nothing in common with some entry; stop early
    return common if common else candidates[0]
def trim_ends(string):
    """Remove everything from the first whitespace-preceded '|' onward.

    Used to strip ' | Site Name' style suffixes from page titles. The pattern
    is now a raw string: '\\s' in a plain literal is an invalid escape
    sequence (a DeprecationWarning, and an error in future Python versions).
    """
    return re.sub(r'\s\|.*', "", string)
def char_count(string):
    """Return [alpha, digit, whitespace, special] character counts for *string*.

    Newlines are skipped entirely. Bug fixed: the original did
    char.strip("\n"), turning "\n" into "" — and an empty string is neither
    alpha, digit nor space, so every newline was miscounted as a special
    character. The docstring also claimed three counts while four were
    returned; the contract is the 4-element list above.
    """
    counts = {'a': 0, 'd': 0, 'w': 0, 's': 0}
    for char in string:
        if char == '\n':
            continue
        # The categories are mutually exclusive for a single character,
        # so an elif chain replaces the original independent ifs.
        if char.isalpha():
            counts['a'] += 1
        elif char.isdigit():
            counts['d'] += 1
        elif char.isspace():
            counts['w'] += 1
        else:
            counts['s'] += 1
    return [counts['a'], counts['d'], counts['w'], counts['s']]
def split_stopwords(split_string):
    """Partition *split_string* into (non_stopwords, stop_words).

    Words are lowercased first so membership tests against the lowercase
    STOP list work. Bug fixed: the original appended to an undefined name
    'non_stop' (NameError on the first non-stopword); it also abused a list
    comprehension for side effects — replaced with a plain loop.
    """
    stop_words = []
    non_stopws = []
    for word in (s.lower() for s in split_string):
        if word in STOP:
            stop_words.append(word)
        else:
            non_stopws.append(word)
    return (non_stopws, stop_words)
def dense_matrix(string):
    """Encode *string* as a list of (position, ordinal) pairs."""
    pairs = []
    for position, character in enumerate(string):
        pairs.append((position, ord(character)))
    return pairs
def string(dense_matrix):
    """Decode a list of (position, ordinal) pairs back into a str."""
    return ''.join(chr(code) for _, code in dense_matrix)
#####################################################
class Page_Object:
    """Wraps one extracted page record: derives a cleaned title, page
    dimensions, and normalisation factors, then builds Text_Objects for
    the page's text nodes (currently only the first one -- see below)."""
    def __init__(self, i):
        # Common substring across the page's candidate titles, with any
        # ' | Site' style suffix removed.
        self.title = trim_ends(longest_substring(i['titles']))
        self.page_width = float(i['body']['bound']['width'])
        self.page_height = float(i['body']['bound']['height'])
        # Reciprocals used as normalisation factors by Text_Object.
        self.norm_len_texts = 1/float(len(i['texts']))
        # NOTE(review): [:2] keeps only the first two characters of the computed
        # font-size string -- assumes a two-digit pixel value like '16px';
        # breaks for single-digit or three-digit sizes. Confirm upstream format.
        self.norm_def_font_size = 1/float(int(i['body']['computed']['font-size'][:2]))
        #print "\n", self.title
        #[Text_Object((c*self.norm_len_texts), to, self.page_width, self.page_height, self.norm_def_font_size) for c, to in enumerate(i['texts'])]
        for c, to in enumerate(i['texts']):
            # NOTE(review): 'c < 1' restricts processing to the first text object
            # only -- looks like a debugging leftover; the commented-out
            # comprehension above processed all of them.
            if c < 1:
                Text_Object((c*self.norm_len_texts), to, self.page_width, self.page_height, self.norm_def_font_size)
class Text_Object:
    """Feature holder for a single extracted text node.

    NOTE(review): extract_features is unfinished -- it computes per-string
    intermediates but returns None, so self.features is always None."""
    def __init__(self, index, text_object, pw, ph, pndfs):
        # index: the node's position already scaled by the page's
        # norm_len_texts factor (see Page_Object).
        self.index = index
        # Position normalised by page dimensions: (top/page_height, left/page_width).
        self.norm_coords = (text_object['bound']['top']/ph, text_object['bound']['left']/pw)
        # Node area as a percentage of the total page area.
        self.perc_area = ((text_object['bound']['height'] * text_object['bound']['width'])/(pw * ph))*100
        # NOTE(review): currently always None -- see extract_features.
        self.features = self.extract_features(text_object['text'])
    def extract_features(self, text_list):
        # NOTE(review): 'tl' is never populated or returned; the locals below
        # are computed and discarded -- work in progress.
        tl = tuple()
        for _, string in enumerate(text_list):
            string = string.strip()
            split_string = string.split()
            chars_breakdown = char_count(string)
            number_of_words = len(split_string)
            # NOTE(review): ZeroDivisionError when the string is empty/whitespace;
            # integer division under Python 2.
            average_word_length = (sum(len(s) for s in split_string)/len(split_string))
            # 'stopwords' here shadows the imported nltk.corpus.stopwords module.
            words, stopwords = split_stopwords(split_string)
            # NOTE(review): len_words counts the full split, not just the
            # non-stopwords returned above -- confirm which was intended.
            len_words, len_stopwords = len(split_string), len(stopwords)
            word_dm = [dense_matrix(word) for word in words]
def main():
    """Load every extract listed in DATA and build one Page_Object per page."""
    pages = [load_data(path) for path in DATA]
    for position, page in enumerate(pages):
        page_object = Page_Object(page)
if __name__ == '__main__':
    # Resolve ../data/_extracts/json relative to this script.
    DATA_ROOT = os.path.realpath(os.path.join(os.path.abspath(os.path.dirname(__file__)), '..', 'data/_extracts/json'))
    # List the directory once (the original called os.listdir twice, which is
    # wasteful and can see two different snapshots), then keep the same
    # selection: the first 3 extracts plus extracts 7-14.
    _files = [os.path.join(DATA_ROOT, name) for name in os.listdir(DATA_ROOT)]
    DATA = _files[:3] + _files[7:15]
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment