Hacker News titles, word embedding
""" | |
association_model() class-> module for word clusterings | |
""" | |
import re
import pickle as pk
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import tensorflow as tf
from scipy.sparse import coo_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.cluster import KMeans
from time import time
# association_model class
class association_model:
    """
    association_model()
    construct and train a word association model
    usage:
        AM = association_model()
        AM.build_run()
        AM.predict()
        AM.cluster()
    """
    def __init__(self):
        self.TransformedWords = None

    # getk, helper fct
    def getk(self, k):
        """ getk(k), helper to produce the k-th one-hot unit vector in R^500 """
        return (np.arange(500) == k).astype(np.float64)
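        # e.g. getk(3) -> [0., 0., 0., 1., 0., ..., 0.]  (length 500)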
    # create_ATA, generate statistical correlations
    def create_ATA(self, from_file=True):
        """ create_ATA, generate the cooccurrence matrix """
        # minor cleaning: drop nan's before lowercasing the titles
        _X = pd.read_csv("./workspace/archive/HN_posts_year_to_Sep_26_2016.csv")
        _X = _X.dropna()
        _X.title = _X.title.apply(lambda s: s.lower())
        all_times = list(_X['created_at'])
        # sample and sort the categories: label posts by time of day;
        # created_at looks like "M/D/YYYY HH:MM", so convert HH:MM to minutes
        Times = np.array(
            [sum(np.array([60, 1]) *
                 np.array(x.split(' ')[1].split(':')).astype(np.float64))
             for x in all_times])
        binme = KBinsDiscretizer(10, encode="ordinal", strategy="uniform")
        binme.fit(Times.reshape(-1, 1))
        Times_ = binme.transform(Times.reshape(-1, 1))
        _X['timebin'] = Times_.ravel()
        _X['log_comments'] = np.log1p(_X['num_comments'])
        # scan the titles 1000 entries at a time
        bigun = set()   # unique alphabetic tokens (the raw vocabulary)
        listem = []     # every token occurrence, kept for counting
        for t in range(0, _X.shape[0], 1000):
            s = min(t + 1000, _X.shape[0])
            Q = _X.title.loc[_X.index[t:s]].str.lower().str.split()
            l = []
            # filter alphanumeric
            for q in Q:
                for b in q:
                    x = re.sub("[^a-zA-Z]", ' ', b)
                    l.extend(x.split())
            # extend a list, then merge into a set to keep only unique entries
            listem.extend(l)
            bigun.update(l)
        # get the counts
        wordcounts = dict.fromkeys(bigun, 0)
        for w in listem:
            wordcounts[w] += 1
        wordcounts = \
            {k: v for k, v in sorted(wordcounts.items(),
                                     key=lambda item: item[1], reverse=True)}
        # top 500 words, kept with their counts in self.Itup
        I500 = list(wordcounts.keys())[:500]
        self.Itup = [(i, wordcounts[i]) for i in I500]
        # word -> column index mapping for the vectorizer
        Imap = {v: i for i, v in enumerate(I500)}
        # sklearn ---> counts per word
        self.countme = CountVectorizer(vocabulary=Imap)
        self.word_legend = self.countme.get_feature_names_out()
        TransformedWords = self.countme.fit_transform(_X.title)
        self.TransformedWords = TransformedWords.toarray().copy()
        if not from_file:
            # generate the ATA matrix: with A the titles-by-words count
            # matrix, (A^T A)[i, j] counts co-occurrences of words i and j
            TW = TransformedWords.tocoo()
            TW_0 = tf.sparse.SparseTensor(
                indices=np.vstack([TW.row, TW.col]).T
                    .reshape(-1, 2).astype(np.int64),
                values=tf.constant(TW.data, dtype=tf.float32),
                dense_shape=TW.shape)
            TW_0T = tf.sparse.transpose(TW_0)
            ATA = tf.sparse.sparse_dense_matmul(
                TW_0T, tf.sparse.to_dense(TW_0))
            self.ATA = np.log1p(ATA.numpy()).astype(np.float32)
            with open("./workspace/clustering/500x500Association.pk", 'wb') as file:
                pk.dump(self.ATA, file)
        else:
            with open("./workspace/clustering/500x500Association.pk", 'rb') as file:
                self.ATA = pk.load(file)
        # generate sparse representation
        self.coo = coo_matrix(self.ATA).astype(np.float32)
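    # At this point self.ATA is the 500x500 matrix log(1 + (A^T A)[i, j]),
    # whose entries measure how often words i and j co-occur in a title;
    # self.coo holds its sparse COO form, whose (row, col, weight) triples
    # become the training pairs in build_run().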
    def define_model(self):
        """
        define_model()
        define the neural network
        """
        # dense-layer neural network encoder; D: dropout rate, R: l2 penalty
        D = 10**-10
        R = 10**-5
        inputs = tf.keras.layers.Input(shape=(1001,))
        # slice off the last feature (the association weight); the remaining
        # 1000 features are the two one-hot word vectors
        x = inputs[:, :-1]
        x1 = tf.keras.layers.Reshape((500,))(x[:, :500])
        x2 = tf.keras.layers.Reshape((500,))(x[:, 500:])
        D1 = tf.keras.layers.Dense(
            250, kernel_regularizer=tf.keras.regularizers.l2(R),
            activity_regularizer=tf.keras.regularizers.l2(R))
        x1 = D1(x1)
        x2 = D1(x2)
        D1d = tf.keras.layers.Dropout(D)
        x1 = D1d(x1)
        x2 = D1d(x2)
        D2 = tf.keras.layers.Dense(
            120, kernel_regularizer=tf.keras.regularizers.l2(R),
            activity_regularizer=tf.keras.regularizers.l2(R))
        D2d = tf.keras.layers.Dropout(D)
        x1 = D2(x1); x1 = D2d(x1)
        x2 = D2(x2); x2 = D2d(x2)
        D3 = tf.keras.layers.Dense(
            60, kernel_regularizer=tf.keras.regularizers.l2(R),
            activity_regularizer=tf.keras.regularizers.l2(R))
        D3d = tf.keras.layers.Dropout(D)
        x1 = D3(x1); x1 = D3d(x1)
        x2 = D3(x2); x2 = D3d(x2)
        D4 = tf.keras.layers.Dense(
            2, kernel_regularizer=tf.keras.regularizers.l2(R),
            activity_regularizer=tf.keras.regularizers.l2(R))
        x1 = D4(x1)
        x2 = D4(x2)
        R1 = tf.keras.layers.Reshape((2, 1))
        x1 = R1(x1); x2 = R1(x2)
        y = tf.keras.layers.Concatenate(axis=-1)([x1, x2])
        Model = tf.keras.models.Model(inputs=inputs, outputs=y)
        return Model
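    # The encoder is Siamese: the same Dense 500 -> 250 -> 120 -> 60 -> 2
    # stack (D1..D4) embeds both one-hot word vectors, and the two 2-D
    # embeddings are concatenated into a (2, 2) output so the loss can
    # compare them.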
    def build_run(self, verbose=False):
        """
        build_run(verbose: bool)
        build and run the neural network to produce embeddings (and graphics)
        """
        def embed_loss_plain_association(y_true, y_pred):
            # weighted squared distance between the two word embeddings:
            # the association weight y_true pulls related words together
            return tf.keras.backend.sum(
                tf.keras.backend.pow(
                    (y_pred[:, :, 0] - y_pred[:, :, 1]) * y_true, 2))
        self.create_ATA()
        self.Model = self.define_model()
        if verbose:
            self.Model.summary()
        self.Model.compile(loss=embed_loss_plain_association,
                           optimizer=tf.keras.optimizers.Adam(
                               tf.keras.optimizers.schedules.ExponentialDecay(
                                   1., 50, .5, staircase=False)))
        # generate training data: rows [one-hot word i | one-hot word j | weight]
        Z = list(zip(self.coo.row, self.coo.col, self.coo.data))
        train_x = np.array(
            [np.hstack([self.getk(z[0]), self.getk(z[1]), z[2]])
             for z in Z])
        self.Model.fit(train_x, self.coo.data.reshape(-1, 1),
                       epochs=30, batch_size=4096, verbose=verbose)
    def predict(self):
        """
        predict()
        generate predictions
        """
        # each query pairs one vocabulary word with the zero vector and weight 0
        retrieve_x = np.array(
            [np.hstack([self.getk(z), np.zeros(501)]) for z in range(500)])
        self.YY = self.Model.predict(retrieve_x)
        return self.YY
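    # self.YY has shape (500, 2, 2): for each vocabulary word, the 2-D
    # embeddings of the (word, zero-vector) input pair; cluster() uses
    # YY[:, :, 0], the embedding of the word itself.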
    def cluster(self, show=True):
        """
        cluster(show: bool)
        produce clusterings
        """
        # visualize test predictions
        KM = KMeans(n_clusters=3)
        YY = self.YY
        Itup = self.Itup
        labels = KM.fit_predict(YY[:, :, 0])
        score = KM.score(YY[:, :, 0])
        STAMP = str(time())
        print(STAMP)
        if show:
            # point opacity scales with normalized log word frequency
            alpha_w = np.log(np.array(Itup)[:, 1].astype(np.float32))
            alpha_w = alpha_w / np.max(alpha_w); alpha_w = alpha_w**1.5
            plt.figure(figsize=(16, 10))
            plt.scatter(YY[:, 0, 0], YY[:, 1, 0], c=labels, alpha=alpha_w**1.3)
            plt.colorbar(ticks=np.arange(3))
            plt.show()
        labeled500Words = pd.DataFrame(np.vstack([self.word_legend, labels]).T,
                                       columns=['word', 'label'])
        labeled500Words = labeled500Words.groupby('label').apply(np.array)
        appendix_of_words = \
            [labeled500Words.iloc[i][:, 0] for i in range(len(labeled500Words))]
        with open("appendix/appendix." + STAMP + ".pk", 'wb') as file:
            pk.dump(appendix_of_words, file)
        log_word_count_in_corpus = \
            np.log(np.array(Itup)[:, 1].astype(np.float32))
        DF = pd.DataFrame(np.vstack([YY[:, 0, 0], YY[:, 1, 0], self.word_legend,
                                     labels, log_word_count_in_corpus]).T,
                          columns=['x', 'y', 'word', 'cluster', 'log_count'])
        with open('appendix/predicted_DF.' + STAMP + '.pk', 'wb') as file:
            pk.dump(DF, file)
        if show:
            print(appendix_of_words)
        counts = [len(k) for k in appendix_of_words]
        print(counts)
        return counts, score, appendix_of_words
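
# Minimal usage sketch (assumes the HN posts CSV and, for the default
# from_file=True path, the pickled association matrix exist at the paths
# referenced above):
if __name__ == "__main__":
    AM = association_model()
    AM.build_run(verbose=True)   # build co-occurrence data, train the encoder
    AM.predict()                 # embed each of the 500 vocabulary words
    counts, score, words = AM.cluster(show=True)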