Skip to content

Instantly share code, notes, and snippets.

@ronaldgreeff
Last active January 5, 2019 01:34
Show Gist options
  • Save ronaldgreeff/07879a2d93b3db4119efca660e0e1c2c to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
import os, sys, re
# Make the sibling '../lib' directory importable, resolved relative to this file.
lib_path = os.path.realpath(os.path.join(os.path.abspath(os.path.dirname(__file__)), '..', 'lib'))
if lib_path not in sys.path:
    # Prepend (not append) so project-local modules shadow installed ones.
    sys.path[0:0] = [lib_path]
# Main
import json
import random
import numpy as np
import pandas as pd
from difflib import SequenceMatcher
#from sklearn.feature_extraction import DictVectorizer
#from sklearn import svm, preprocessing, cluster#, cross_validation
#from sklearn.metrics import precision_recall_curve, auc, classification_report, precision_recall_fscore_support
#from sklearn.preprocessing import StandardScaler
from nltk.corpus import stopwords
#from textblob import Word, TextBlob
# DBSCAN
#####################################################
################# Temporary Helpers #################
# Per-extract text-node indices for the fields of interest, keyed by file number.
# Files 0-2, 3-6 and 7-8 share the same layout, hence the grouped construction.
DATA_LOOKUP = dict(
    [(n, {'breadcrumbs': 18, 'title': 19, 'price': 20}) for n in (0, 1, 2)] +
    [(n, {'breadcrumbs': 20, 'title': 21, 'price': 22}) for n in (3, 4, 5, 6)] +
    [(n, {'breadcrumbs': 11, 'title': 13, 'price': 17}) for n in (7, 8)]
)
#####################################################
##################### Constants #####################
# English stop-word list from NLTK (requires the 'stopwords' corpus to be downloaded).
STOP = stopwords.words('english')
#####################################################
###################### Helpers ######################
def load_data(file):
    """Parse the JSON document at path *file* and return the resulting object."""
    with open(file) as handle:
        return json.load(handle)
def longest_substring(string_list):
    """Return the longest substring common to every non-None string in *string_list*.

    Strings are stripped of surrounding whitespace first. Returns "" for an
    empty input, and falls back to the (stripped) first string when the
    entries share no common substring — preserving the original fallback.

    Bug fixed: the original overwrote its result every iteration, so only the
    comparison between the first and last strings survived; the match is now
    folded across all entries.
    """
    # 's is not None' instead of 'not s == None'; a single strip() suffices.
    candidates = [s.strip() for s in string_list if s is not None]
    if not candidates:
        return ""
    common = candidates[0]
    for other in candidates[1:]:
        match = SequenceMatcher(None, common, other).find_longest_match(
            0, len(common), 0, len(other))
        common = common[match.a: match.a + match.size]
        if not common:
            break  # nothing in common with some entry; stop early
    return common if common else candidates[0]
def trim_ends(string):
    """Remove everything from the first whitespace-preceded '|' onward.

    Used to strip ' | Site Name' style suffixes from page titles. The pattern
    is now a raw string: '\\s' in a plain literal is an invalid escape
    sequence (a DeprecationWarning, and an error in future Python versions).
    """
    return re.sub(r'\s\|.*', "", string)
def char_count(string):
    """Return [alpha, digit, whitespace, special] character counts for *string*.

    Newlines are skipped entirely. Bug fixed: the original did
    char.strip("\n"), turning "\n" into "" — and an empty string is neither
    alpha, digit nor space, so every newline was miscounted as a special
    character. The docstring also claimed three counts while four were
    returned; the contract is the 4-element list above.
    """
    counts = {'a': 0, 'd': 0, 'w': 0, 's': 0}
    for char in string:
        if char == '\n':
            continue
        # The categories are mutually exclusive for a single character,
        # so an elif chain replaces the original independent ifs.
        if char.isalpha():
            counts['a'] += 1
        elif char.isdigit():
            counts['d'] += 1
        elif char.isspace():
            counts['w'] += 1
        else:
            counts['s'] += 1
    return [counts['a'], counts['d'], counts['w'], counts['s']]
def split_stopwords(split_string):
    """Partition *split_string* into (non_stopwords, stop_words).

    Words are lowercased first so membership tests against the lowercase
    STOP list work. Bug fixed: the original appended to an undefined name
    'non_stop' (NameError on the first non-stopword); it also abused a list
    comprehension for side effects — replaced with a plain loop.
    """
    stop_words = []
    non_stopws = []
    for word in (s.lower() for s in split_string):
        if word in STOP:
            stop_words.append(word)
        else:
            non_stopws.append(word)
    return (non_stopws, stop_words)
def dense_matrix(string):
    """Encode *string* as a list of (position, ordinal) pairs."""
    pairs = []
    for position, character in enumerate(string):
        pairs.append((position, ord(character)))
    return pairs
def string(dense_matrix):
    """Decode a list of (position, ordinal) pairs back into a str."""
    return ''.join(chr(code) for _, code in dense_matrix)
#####################################################
class Page_Object:
    """Wraps one extracted page record: derives a cleaned title, page
    dimensions, and normalisation factors, then builds Text_Objects for
    the page's text nodes (currently only the first one -- see below)."""
    def __init__(self, i):
        # Common substring across the page's candidate titles, with any
        # ' | Site' style suffix removed.
        self.title = trim_ends(longest_substring(i['titles']))
        self.page_width = float(i['body']['bound']['width'])
        self.page_height = float(i['body']['bound']['height'])
        # Reciprocals used as normalisation factors by Text_Object.
        self.norm_len_texts = 1/float(len(i['texts']))
        # NOTE(review): [:2] keeps only the first two characters of the computed
        # font-size string -- assumes a two-digit pixel value like '16px';
        # breaks for single-digit or three-digit sizes. Confirm upstream format.
        self.norm_def_font_size = 1/float(int(i['body']['computed']['font-size'][:2]))
        #print "\n", self.title
        #[Text_Object((c*self.norm_len_texts), to, self.page_width, self.page_height, self.norm_def_font_size) for c, to in enumerate(i['texts'])]
        for c, to in enumerate(i['texts']):
            # NOTE(review): 'c < 1' restricts processing to the first text object
            # only -- looks like a debugging leftover; the commented-out
            # comprehension above processed all of them.
            if c < 1:
                Text_Object((c*self.norm_len_texts), to, self.page_width, self.page_height, self.norm_def_font_size)
class Text_Object:
    """Feature holder for a single extracted text node.

    NOTE(review): extract_features is unfinished -- it computes per-string
    intermediates but returns None, so self.features is always None."""
    def __init__(self, index, text_object, pw, ph, pndfs):
        # index: the node's position already scaled by the page's
        # norm_len_texts factor (see Page_Object).
        self.index = index
        # Position normalised by page dimensions: (top/page_height, left/page_width).
        self.norm_coords = (text_object['bound']['top']/ph, text_object['bound']['left']/pw)
        # Node area as a percentage of the total page area.
        self.perc_area = ((text_object['bound']['height'] * text_object['bound']['width'])/(pw * ph))*100
        # NOTE(review): currently always None -- see extract_features.
        self.features = self.extract_features(text_object['text'])
    def extract_features(self, text_list):
        # NOTE(review): 'tl' is never populated or returned; the locals below
        # are computed and discarded -- work in progress.
        tl = tuple()
        for _, string in enumerate(text_list):
            string = string.strip()
            split_string = string.split()
            chars_breakdown = char_count(string)
            number_of_words = len(split_string)
            # NOTE(review): ZeroDivisionError when the string is empty/whitespace;
            # integer division under Python 2.
            average_word_length = (sum(len(s) for s in split_string)/len(split_string))
            # 'stopwords' here shadows the imported nltk.corpus.stopwords module.
            words, stopwords = split_stopwords(split_string)
            # NOTE(review): len_words counts the full split, not just the
            # non-stopwords returned above -- confirm which was intended.
            len_words, len_stopwords = len(split_string), len(stopwords)
            word_dm = [dense_matrix(word) for word in words]
def main():
    """Load every extract listed in DATA and build one Page_Object per page."""
    pages = [load_data(path) for path in DATA]
    for position, page in enumerate(pages):
        page_object = Page_Object(page)
if __name__ == '__main__':
    # Resolve ../data/_extracts/json relative to this script.
    DATA_ROOT = os.path.realpath(os.path.join(os.path.abspath(os.path.dirname(__file__)), '..', 'data/_extracts/json'))
    # List the directory once (the original called os.listdir twice, which is
    # wasteful and can see two different snapshots), then keep the same
    # selection: the first 3 extracts plus extracts 7-14.
    _files = [os.path.join(DATA_ROOT, name) for name in os.listdir(DATA_ROOT)]
    DATA = _files[:3] + _files[7:15]
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment