alexander-wei/association_model.py Secret

## association_model.py
"""
association_model() class-> module for word clusterings
"""

import re
import pickle as pk
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import tensorflow as tf
from scipy.sparse import coo_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.cluster import KMeans
from time import time

# association_model class
class association_model():
    """
    association_model()
    construct and train a word association model
    usage:
    AM = association_model()
    AM.build_run()
    AM.predict()
    AM.cluster()
    """
    def __init__(self):
        self.TransformedWords = None

    # getk, helper fct
    def getk(self,k):
        """ getk(k), helper function to produce a k-index unit vector in R1 """
        return (np.arange(500) == k).astype(np.float64)

    # getk, generate statistical correlations
    def create_ATA(self, from_file=True):
        """ create_ATA, generate the cooccurrence matrix """
        # do some minor cleaning, ie. nan's
        _X = pd.read_csv("./workspace/archive/HN_posts_year_to_Sep_26_2016.csv")
        _X.title = _X.title.apply(lambda s: s.lower())
        _X = _X.dropna()
        all_times = list(_X['created_at'])
        # **Sample and sort the categories**

        # label by times
        Times = \
            [sum(np.array([60,1]) * np.array(
                [_.split('/') for _ in x.split(' ')][1][0].split(':'))\
                 .astype(np.float64))
             for x in all_times]
        Times=np.array(Times)
        binme = KBinsDiscretizer(10,encode="ordinal",strategy="uniform")
        binme.fit(Times.reshape(-1,1))
        Times_ = binme.transform(Times.reshape(-1,1))
        _X['timebin'] = Times_
        _X['log_comments'] = np.log1p(_X['num_comments'])

        # search by 1000 entries at a time
        t,s = 0,1000; i=0
        bigun = {''}
        listem = []
        returns = False
        while not returns:
            t+= 1000; s+=1000
            if s > _X.shape[0]:
                s = _X.shape[0]
                returns = True

            Q = _X.title.loc[_X.index[t:s]].str.lower().str.split()
            V = {''}
            l = []
            # filter alphanumeric
            for q in Q:
                for b in q:
                    x = re.sub("[^a-zA-Z]",' ',b)
                    l.extend(x.split())
            # extend a list, then concat into a set to get only unique entries
            listem.extend(l)
            for q in l:
                V.update({q})
            bigun.update(V)
        bigun.remove('')

        # get the counts
        wordcounts = {}.fromkeys(bigun,0)
        for l in listem:
            wordcounts[l]+=1
        wordcounts = \
            {k: v for k, v in sorted(wordcounts.items(),
                                     key=lambda item: item[1],reverse=True)}
        # top 500 words
        I100 = list(wordcounts.keys())[:500]
        self.Itup = [(i,wordcounts[i]) for i in I100]

        Imap = {''}
        i = 0
        for v in I100:
            Imap.update({v: i})
            i+=1
        Imap.remove('')
        # sklearn ---> counts per word
        self.countme = CountVectorizer(vocabulary=Imap)
        self.word_legend = self.countme.get_feature_names_out()
        self.ATA = 0

        global TransformedWords
        TransformedWords = self.countme.fit_transform(_X.title)
        self.TransformedWords=TransformedWords.toarray().copy()
        print (1232)
        if not from_file:
            """ enable this segment for generation of ATA matrix """
            # TransformedWords = TransformedWords.tocoo()
            # TransformedWords = self.countme.fit_transform(_X.title)
            # TransformedWords_0 = \
            #     tf.sparse.SparseTensor(indices=np.vstack(
            #         [TransformedWords.row,TransformedWords.col]).T\
            #             .reshape(-1,2),
            #             values=tf.constant(TransformedWords.data),
            #             dense_shape=[279256,500])
            # TransformedWords_0T = tf.sparse.transpose(TransformedWords_0)
            # ATA = tf.sparse.sparse_dense_matmul(
            #     TransformedWords_0T, tf.sparse.to_dense(TransformedWords_0))
            # ATA = np.log1p(ATA)
            # ATA = ATA.astype(np.float32)
            """ pk.dump(ATA, open('500x500Association.pk', 'wb'))"""
            assert True
            with open("./workspace/clustering/500x500Association.pk", 'rb') as file:
                self.ATA = pk.load(file)
        else:
            with open("./workspace/clustering/500x500Association.pk", 'rb') as file:
                self.ATA = pk.load(file)
        # generate sparse representation
        self.coo = coo_matrix(self.ATA).astype(np.float32)

    def define_model(self):
        """
        define_model()
        define the neural network
        """
        # Dense layer Neural Network encoder
        D = 10**-10
        R = 10**-5
        inputs = tf.keras.layers.Input(shape=(1001,))
        x = inputs[:,:-1]
        x1 = tf.keras.layers.Reshape((500,))(x[:,:500])
        x2 = tf.keras.layers.Reshape((500,))(x[:,500:])

        D1 = tf.keras.layers.Dense(
            250, kernel_regularizer=tf.keras.regularizers.l2(R),
            activity_regularizer=tf.keras.regularizers.l2(R))
        x1 = D1(x1)
        x2 = D1(x2)

        D1d = tf.keras.layers.Dropout(D)
        x1 = D1d(x1)
        x2 = D1d(x2)

        D2 = tf.keras.layers.Dense(
            120, kernel_regularizer=tf.keras.regularizers.l2(R),
            activity_regularizer=tf.keras.regularizers.l2(R))
        D2d = tf.keras.layers.Dropout(D)
        x1 = D2(x1); x1 = D2d(x1)
        x2 = D2(x2); x2 = D2d(x2)

        D3 =  tf.keras.layers.Dense(
            60, kernel_regularizer=tf.keras.regularizers.l2(R),
            activity_regularizer=tf.keras.regularizers.l2(R))
        D3d = tf.keras.layers.Dropout(D)
        x1 = D3(x1); x1 = D3d(x1)
        x2 = D3(x2); x2 = D3d(x2)

        D4 = tf.keras.layers.Dense(
            2, kernel_regularizer=tf.keras.regularizers.l2(R),
            activity_regularizer=tf.keras.regularizers.l2(R))
        x1 = D4(x1)
        x2 = D4(x2)

        R1 = tf.keras.layers.Reshape((2,1))
        x1 = R1(x1); x2 = R1(x2)
        y = tf.keras.layers.Concatenate(axis=-1)([x1, x2])

        Model = tf.keras.models.Model(inputs = inputs, outputs= y)
        return Model

    def build_run(self, verbose=False):
        """
        build_run( verbose bool)
        build and run the neural network to produce embeddings (and graphics)
        """
        def embed_loss_plain_association(x,y):
            return tf.keras.backend.sum(
                tf.keras.backend.pow((x[0]-x[1]) * y,2))

        self.create_ATA()
        self.Model = self.define_model()
        self.Model.build(input_shape=(1001,))
        if verbose:
            self.Model.summary()

        self.Model.compile(loss=embed_loss_plain_association,
                           optimizer=tf.keras.optimizers.Adam(
                            tf.keras.optimizers.schedules.ExponentialDecay(
                                1.,50,.5,staircase=False)))

        # generate training data
        Z = list(zip(self.coo.row, self.coo.col, self.coo.data))
        train_x = np.array(
            [np.hstack([self.getk(z[0]), self.getk(z[1]), z[2]])\
            for z in Z])

        self.Model.fit(train_x.reshape(-1,1001,1), self.coo.data.reshape(-1,1),
                       epochs=30, batch_size=4096, verbose=verbose)

    def predict(self):
        """
        predict()
        generate predictions
        """
        retrieve_x = np.array(
            [ np.hstack([self.getk(z), np.zeros(501)])    for z in range(500)]
        )
        self.YY = self.Model.predict(retrieve_x.reshape(-1,1001,1))
        return self.YY

    def cluster(self,show=True):
        """
        cluster( bool show)
        produce clusterings
        """
        # visualize_test predictions
        KM = KMeans(n_clusters=3)
        YY = self.YY
        Itup = self.Itup
        labels = KM.fit_predict(YY[:,:,0])

        score = KM.score(YY[:,:,0])
        STAMP  = str(time())
        print(STAMP)
        if show:
            _ = np.log(np.array(Itup[:])[:,1].astype(np.float32))
            _ = _/np.max(_); _ = _**1.5
            plt.figure(figsize=(16,10))
            plt.scatter(YY[:,0,0], YY[:,1,0],c=labels,alpha=_**1.3)
            plt.colorbar(ticks=np.arange(4))
            plt.show()
        labeled500Words = pd.DataFrame(np.squeeze([self.word_legend,labels]).T,
                                       columns=['word', 'label'])

        labeled500Words = labeled500Words.groupby('label').apply(np.array)

        appendix_of_words =\
            [labeled500Words[i][:,0] for i in range(len(labeled500Words))]

        with open("appendix/appendix." + STAMP + ".pk",'wb') as file:
            pk.dump(appendix_of_words,file)

        log_word_count_in_corpus = \
            np.log(np.array(Itup[:])[:,1].astype(np.float32))

        DF = pd.DataFrame(np.vstack([YY[:,0,0],YY[:,1,0], self.word_legend,
            labels, log_word_count_in_corpus]).T,
            columns=['x','y','word', 'cluster', 'log_count'])

        with open('appendix/predicted_DF.' + STAMP + '.pk', 'wb') as file:
            pk.dump(DF, file)

        if show:
            print(appendix_of_words)

        counts = [len(k) for k in appendix_of_words]
        print(counts)

        return counts, score, appendix_of_words

## example_clusters.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 95,
   "id": "4c136840-ce71-4df0-a70d-eebcc58bd866",
   "metadata": {},
   "outputs": [],
   "source": [
    "YY = pk.load(open('appendix/appendix.1661655443.8803668.pk', 'rb'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "id": "dbfa5865-24c5-4e91-a392-995bf44657cc",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "|     | 0           | 1            | 2            |\n",
      "|----:|:------------|:-------------|:-------------|\n",
      "|   0 | a           | ad           | access       |\n",
      "|   1 | about       | america      | ads          |\n",
      "|   2 | again       | and          | after        |\n",
      "|   3 | ai          | applications | against      |\n",
      "|   4 | analysis    | aws          | age          |\n",
      "|   5 | android     | become       | algorithm    |\n",
      "|   6 | any         | beta         | all          |\n",
      "|   7 | api         | bill         | amazon       |\n",
      "|   8 | app         | black        | american     |\n",
      "|   9 | apple       | bot          | an           |\n",
      "|  10 | apps        | built        | analytics    |\n",
      "|  11 | are         | business     | angular      |\n",
      "|  12 | b           | but          | apache       |\n",
      "|  13 | based       | can          | application  |\n",
      "|  14 | be          | cars         | art          |\n",
      "|  15 | been        | chinese      | artificial   |\n",
      "|  16 | behind      | cloud        | as           |\n",
      "|  17 | between     | code         | at           |\n",
      "|  18 | blockchain  | computer     | attack       |\n",
      "|  19 | build       | creating     | available    |\n",
      "|  20 | by          | day          | back         |\n",
      "|  21 | c           | did          | bad          |\n",
      "|  22 | car         | does         | before       |\n",
      "|  23 | case        | driving      | being        |\n",
      "|  24 | china       | email        | best         |\n",
      "|  25 | client      | engineering  | better       |\n",
      "|  26 | coding      | ever         | big          |\n",
      "|  27 | company     | every        | bitcoin      |\n",
      "|  28 | content     | everything   | book         |\n",
      "|  29 | control     | fbi          | books        |\n",
      "|  30 | cost        | first        | brain        |\n",
      "|  31 | could       | framework    | browser      |\n",
      "|  32 | court       | generation   | building     |\n",
      "|  33 | create      | get          | ceo          |\n",
      "|  34 | css         | getting      | change       |\n",
      "|  35 | d           | go           | chrome       |\n",
      "|  36 | data        | good         | city         |\n",
      "|  37 | deep        | great        | com          |\n",
      "|  38 | design      | growth       | coming       |\n",
      "|  39 | developer   | hackers      | command      |\n",
      "|  40 | developers  | his          | community    |\n",
      "|  41 | development | home         | companies    |\n",
      "|  42 | digital     | how          | computing    |\n",
      "|  43 | distributed | html         | core         |\n",
      "|  44 | do          | human        | database     |\n",
      "|  45 | docker      | image        | dead         |\n",
      "|  46 | don         | internet     | deal         |\n",
      "|  47 | dont        | interview    | death        |\n",
      "|  48 | e           | into         | devices      |\n",
      "|  49 | earth       | ios          | down         |\n",
      "|  50 | easy        | is           | drone        |\n",
      "|  51 | encryption  | it           | economy      |\n",
      "|  52 | end         | language     | energy       |\n",
      "|  53 | f           | learn        | engine       |\n",
      "|  54 | facebook    | less         | experience   |\n",
      "|  55 | files       | lessons      | fast         |\n",
      "|  56 | for         | life         | faster       |\n",
      "|  57 | founder     | like         | file         |\n",
      "|  58 | free        | line         | find         |\n",
      "|  59 | from        | linux        | found        |\n",
      "|  60 | guide       | live         | full         |\n",
      "|  61 | hack        | look         | future       |\n",
      "|  62 | hacker      | love         | game         |\n",
      "|  63 | have        | mac          | games        |\n",
      "|  64 | here        | machine      | gets         |\n",
      "|  65 | history     | making       | git          |\n",
      "|  66 | i           | management   | github       |\n",
      "|  67 | if          | map          | global       |\n",
      "|  68 | in          | may          | going        |\n",
      "|  69 | industry    | memory       | google       |\n",
      "|  70 | introducing | microsoft    | got          |\n",
      "|  71 | io          | most         | government   |\n",
      "|  72 | java        | native       | hacking      |\n",
      "|  73 | javascript  | need         | hard         |\n",
      "|  74 | jobs        | network      | has          |\n",
      "|  75 | just        | neural       | health       |\n",
      "|  76 | k           | off          | help         |\n",
      "|  77 | last        | on           | high         |\n",
      "|  78 | launch      | or           | hn           |\n",
      "|  79 | launches    | other        | http         |\n",
      "|  80 | learned     | our          | images       |\n",
      "|  81 | let         | pdf          | india        |\n",
      "|  82 | lets        | plan         | inside       |\n",
      "|  83 | library     | power        | intel        |\n",
      "|  84 | m           | product      | intelligence |\n",
      "|  85 | made        | program      | interactive  |\n",
      "|  86 | man         | project      | introduction |\n",
      "|  87 | manager     | projects     | iot          |\n",
      "|  88 | marketing   | re           | iphone       |\n",
      "|  89 | money       | react        | its          |\n",
      "|  90 | net         | read         | job          |\n",
      "|  91 | next        | real         | js           |\n",
      "|  92 | no          | release      | keep         |\n",
      "|  93 | node        | robots       | key          |\n",
      "|  94 | of          | san          | know         |\n",
      "|  95 | office      | say          | law          |\n",
      "|  96 | open        | scale        | learning     |\n",
      "|  97 | os          | science      | light        |\n",
      "|  98 | out         | scientists   | list         |\n",
      "|  99 | over        | secure       | long         |\n",
      "| 100 | part        | see          | make         |\n",
      "| 101 | pay         | services     | makes        |\n",
      "| 102 | performance | sharing      | many         |\n",
      "| 103 | phone       | should       | market       |\n",
      "| 104 | pi          | shows        | me           |\n",
      "| 105 | play        | side         | media        |\n",
      "| 106 | private     | site         | meet         |\n",
      "| 107 | problem     | slack        | mobile       |\n",
      "| 108 | programming | so           | model        |\n",
      "| 109 | quantum     | software     | modern       |\n",
      "| 110 | r           | source       | more         |\n",
      "| 111 | released    | space        | much         |\n",
      "| 112 | remote      | star         | music        |\n",
      "| 113 | report      | start        | my           |\n",
      "| 114 | right       | still        | nasa         |\n",
      "| 115 | run         | storage      | networks     |\n",
      "| 116 | running     | system       | never        |\n",
      "| 117 | rust        | systems      | new          |\n",
      "| 118 | s           | take         | news         |\n",
      "| 119 | series      | tech         | not          |\n",
      "| 120 | server      | test         | now          |\n",
      "| 121 | set         | testing      | old          |\n",
      "| 122 | silicon     | they         | one          |\n",
      "| 123 | simple      | think        | online       |\n",
      "| 124 | small       | through      | only         |\n",
      "| 125 | social      | tips         | own          |\n",
      "| 126 | solar       | to           | page         |\n",
      "| 127 | some        | tool         | people       |\n",
      "| 128 | stack       | twitter      | php          |\n",
      "| 129 | startup     | ui           | platform     |\n",
      "| 130 | state       | up           | police       |\n",
      "| 131 | store       | us           | post         |\n",
      "| 132 | support     | users        | privacy      |\n",
      "| 133 | swift       | valley       | public       |\n",
      "| 134 | t           | video        | python       |\n",
      "| 135 | tesla       | vs           | rails        |\n",
      "| 136 | than        | was          | raises       |\n",
      "| 137 | them        | way          | reality      |\n",
      "| 138 | there       | who          | really       |\n",
      "| 139 | this        | women        | research     |\n",
      "| 140 | three       | working      | review       |\n",
      "| 141 | too         | write        | rise         |\n",
      "| 142 | two         | wrong        | robot        |\n",
      "| 143 | u           |              | ruby         |\n",
      "| 144 | update      |              | rules        |\n",
      "| 145 | used        |              | save         |\n",
      "| 146 | v           |              | says         |\n",
      "| 147 | version     |              | school       |\n",
      "| 148 | war         |              | search       |\n",
      "| 149 | web         |              | secret       |\n",
      "| 150 | website     |              | security     |\n",
      "| 151 | week        |              | self         |\n",
      "| 152 | were        |              | service      |\n",
      "| 153 | what        |              | show         |\n",
      "| 154 | where       |              | smart        |\n",
      "| 155 | will        |              | speed        |\n",
      "| 156 | with        |              | startups     |\n",
      "| 157 | world       |              | stop         |\n",
      "| 158 | would       |              | story        |\n",
      "| 159 | writing     |              | study        |\n",
      "| 160 | x           |              | team         |\n",
      "| 161 | years       |              | technology   |\n",
      "| 162 | your        |              | text         |\n",
      "| 163 |             |              | that         |\n",
      "| 164 |             |              | the          |\n",
      "| 165 |             |              | their        |\n",
      "| 166 |             |              | things       |\n",
      "| 167 |             |              | time         |\n",
      "| 168 |             |              | today        |\n",
      "| 169 |             |              | tools        |\n",
      "| 170 |             |              | top          |\n",
      "| 171 |             |              | tv           |\n",
      "| 172 |             |              | uber         |\n",
      "| 173 |             |              | uk           |\n",
      "| 174 |             |              | under        |\n",
      "| 175 |             |              | use          |\n",
      "| 176 |             |              | user         |\n",
      "| 177 |             |              | using        |\n",
      "| 178 |             |              | via          |\n",
      "| 179 |             |              | virtual      |\n",
      "| 180 |             |              | visual       |\n",
      "| 181 |             |              | vr           |\n",
      "| 182 |             |              | want         |\n",
      "| 183 |             |              | wants        |\n",
      "| 184 |             |              | watch        |\n",
      "| 185 |             |              | ways         |\n",
      "| 186 |             |              | we           |\n",
      "| 187 |             |              | when         |\n",
      "| 188 |             |              | why          |\n",
      "| 189 |             |              | windows      |\n",
      "| 190 |             |              | without      |\n",
      "| 191 |             |              | work         |\n",
      "| 192 |             |              | year         |\n",
      "| 193 |             |              | you          |\n"
     ]
    }
   ],
   "source": [
    "print(pd.DataFrame(YY).T.to_markdown())"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}

## new_clusters.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              new_clusters.ipynb
            
          
        Loading

      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## scoring_clusters.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              scoring_clusters.ipynb
            
          
        Loading

      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
	"""
	association_model() class-> module for word clusterings
	"""

	import re
	import pickle as pk
	import numpy as np
	import pandas as pd
	from matplotlib import pyplot as plt
	import tensorflow as tf
	from scipy.sparse import coo_matrix
	from sklearn.feature_extraction.text import CountVectorizer
	from sklearn.preprocessing import KBinsDiscretizer
	from sklearn.cluster import KMeans
	from time import time

	# association_model class
	class association_model():
	"""
	association_model()
	construct and train a word association model
	usage:
	AM = association_model()
	AM.build_run()
	AM.predict()
	AM.cluster()
	"""
	def __init__(self):
	self.TransformedWords = None

	# getk, helper fct
	def getk(self,k):
	""" getk(k), helper function to produce a k-index unit vector in R1 """
	return (np.arange(500) == k).astype(np.float64)

	# getk, generate statistical correlations
	def create_ATA(self, from_file=True):
	""" create_ATA, generate the cooccurrence matrix """
	# do some minor cleaning, ie. nan's
	_X = pd.read_csv("./workspace/archive/HN_posts_year_to_Sep_26_2016.csv")
	_X.title = _X.title.apply(lambda s: s.lower())
	_X = _X.dropna()
	all_times = list(_X['created_at'])
	# Sample and sort the categories

	# label by times
	Times = \
	[sum(np.array([60,1]) * np.array(
	[_.split('/') for _ in x.split(' ')][1][0].split(':'))\
	.astype(np.float64))
	for x in all_times]
	Times=np.array(Times)
	binme = KBinsDiscretizer(10,encode="ordinal",strategy="uniform")
	binme.fit(Times.reshape(-1,1))
	Times_ = binme.transform(Times.reshape(-1,1))
	_X['timebin'] = Times_
	_X['log_comments'] = np.log1p(_X['num_comments'])

	# search by 1000 entries at a time
	t,s = 0,1000; i=0
	bigun = {''}
	listem = []
	returns = False
	while not returns:
	t+= 1000; s+=1000
	if s > _X.shape[0]:
	s = _X.shape[0]
	returns = True

	Q = _X.title.loc[_X.index[t:s]].str.lower().str.split()
	V = {''}
	l = []
	# filter alphanumeric
	for q in Q:
	for b in q:
	x = re.sub("[^a-zA-Z]",' ',b)
	l.extend(x.split())
	# extend a list, then concat into a set to get only unique entries
	listem.extend(l)
	for q in l:
	V.update({q})
	bigun.update(V)
	bigun.remove('')

	# get the counts
	wordcounts = {}.fromkeys(bigun,0)
	for l in listem:
	wordcounts[l]+=1
	wordcounts = \
	{k: v for k, v in sorted(wordcounts.items(),
	key=lambda item: item[1],reverse=True)}
	# top 500 words
	I100 = list(wordcounts.keys())[:500]
	self.Itup = [(i,wordcounts[i]) for i in I100]

	Imap = {''}
	i = 0
	for v in I100:
	Imap.update({v: i})
	i+=1
	Imap.remove('')
	# sklearn ---> counts per word
	self.countme = CountVectorizer(vocabulary=Imap)
	self.word_legend = self.countme.get_feature_names_out()
	self.ATA = 0

	global TransformedWords
	TransformedWords = self.countme.fit_transform(_X.title)
	self.TransformedWords=TransformedWords.toarray().copy()
	print (1232)
	if not from_file:
	""" enable this segment for generation of ATA matrix """
	# TransformedWords = TransformedWords.tocoo()
	# TransformedWords = self.countme.fit_transform(_X.title)
	# TransformedWords_0 = \
	# tf.sparse.SparseTensor(indices=np.vstack(
	# [TransformedWords.row,TransformedWords.col]).T\
	# .reshape(-1,2),
	# values=tf.constant(TransformedWords.data),
	# dense_shape=[279256,500])
	# TransformedWords_0T = tf.sparse.transpose(TransformedWords_0)
	# ATA = tf.sparse.sparse_dense_matmul(
	# TransformedWords_0T, tf.sparse.to_dense(TransformedWords_0))
	# ATA = np.log1p(ATA)
	# ATA = ATA.astype(np.float32)
	""" pk.dump(ATA, open('500x500Association.pk', 'wb'))"""
	assert True
	with open("./workspace/clustering/500x500Association.pk", 'rb') as file:
	self.ATA = pk.load(file)
	else:
	with open("./workspace/clustering/500x500Association.pk", 'rb') as file:
	self.ATA = pk.load(file)
	# generate sparse representation
	self.coo = coo_matrix(self.ATA).astype(np.float32)

	def define_model(self):
	"""
	define_model()
	define the neural network
	"""
	# Dense layer Neural Network encoder
	D = 10**-10
	R = 10**-5
	inputs = tf.keras.layers.Input(shape=(1001,))
	x = inputs[:,:-1]
	x1 = tf.keras.layers.Reshape((500,))(x[:,:500])
	x2 = tf.keras.layers.Reshape((500,))(x[:,500:])

	D1 = tf.keras.layers.Dense(
	250, kernel_regularizer=tf.keras.regularizers.l2(R),
	activity_regularizer=tf.keras.regularizers.l2(R))
	x1 = D1(x1)
	x2 = D1(x2)

	D1d = tf.keras.layers.Dropout(D)
	x1 = D1d(x1)
	x2 = D1d(x2)

	D2 = tf.keras.layers.Dense(
	120, kernel_regularizer=tf.keras.regularizers.l2(R),
	activity_regularizer=tf.keras.regularizers.l2(R))
	D2d = tf.keras.layers.Dropout(D)
	x1 = D2(x1); x1 = D2d(x1)
	x2 = D2(x2); x2 = D2d(x2)

	D3 = tf.keras.layers.Dense(
	60, kernel_regularizer=tf.keras.regularizers.l2(R),
	activity_regularizer=tf.keras.regularizers.l2(R))
	D3d = tf.keras.layers.Dropout(D)
	x1 = D3(x1); x1 = D3d(x1)
	x2 = D3(x2); x2 = D3d(x2)

	D4 = tf.keras.layers.Dense(
	2, kernel_regularizer=tf.keras.regularizers.l2(R),
	activity_regularizer=tf.keras.regularizers.l2(R))
	x1 = D4(x1)
	x2 = D4(x2)

	R1 = tf.keras.layers.Reshape((2,1))
	x1 = R1(x1); x2 = R1(x2)
	y = tf.keras.layers.Concatenate(axis=-1)([x1, x2])

	Model = tf.keras.models.Model(inputs = inputs, outputs= y)
	return Model

	def build_run(self, verbose=False):
	"""
	build_run( verbose bool)
	build and run the neural network to produce embeddings (and graphics)
	"""
	def embed_loss_plain_association(x,y):
	return tf.keras.backend.sum(
	tf.keras.backend.pow((x[0]-x[1]) * y,2))

	self.create_ATA()
	self.Model = self.define_model()
	self.Model.build(input_shape=(1001,))
	if verbose:
	self.Model.summary()

	self.Model.compile(loss=embed_loss_plain_association,
	optimizer=tf.keras.optimizers.Adam(
	tf.keras.optimizers.schedules.ExponentialDecay(
	1.,50,.5,staircase=False)))

	# generate training data
	Z = list(zip(self.coo.row, self.coo.col, self.coo.data))
	train_x = np.array(
	[np.hstack([self.getk(z[0]), self.getk(z[1]), z[2]])\
	for z in Z])

	self.Model.fit(train_x.reshape(-1,1001,1), self.coo.data.reshape(-1,1),
	epochs=30, batch_size=4096, verbose=verbose)

	def predict(self):
	"""
	predict()
	generate predictions
	"""
	retrieve_x = np.array(
	[ np.hstack([self.getk(z), np.zeros(501)]) for z in range(500)]
	)
	self.YY = self.Model.predict(retrieve_x.reshape(-1,1001,1))
	return self.YY

	def cluster(self,show=True):
	"""
	cluster( bool show)
	produce clusterings
	"""
	# visualize_test predictions
	KM = KMeans(n_clusters=3)
	YY = self.YY
	Itup = self.Itup
	labels = KM.fit_predict(YY[:,:,0])

	score = KM.score(YY[:,:,0])
	STAMP = str(time())
	print(STAMP)
	if show:
	_ = np.log(np.array(Itup[:])[:,1].astype(np.float32))
	_ = _/np.max(_); _ = _**1.5
	plt.figure(figsize=(16,10))
	plt.scatter(YY[:,0,0], YY[:,1,0],c=labels,alpha=_**1.3)
	plt.colorbar(ticks=np.arange(4))
	plt.show()
	labeled500Words = pd.DataFrame(np.squeeze([self.word_legend,labels]).T,
	columns=['word', 'label'])

	labeled500Words = labeled500Words.groupby('label').apply(np.array)

	appendix_of_words =\
	[labeled500Words[i][:,0] for i in range(len(labeled500Words))]

	with open("appendix/appendix." + STAMP + ".pk",'wb') as file:
	pk.dump(appendix_of_words,file)

	log_word_count_in_corpus = \
	np.log(np.array(Itup[:])[:,1].astype(np.float32))

	DF = pd.DataFrame(np.vstack([YY[:,0,0],YY[:,1,0], self.word_legend,
	labels, log_word_count_in_corpus]).T,
	columns=['x','y','word', 'cluster', 'log_count'])

	with open('appendix/predicted_DF.' + STAMP + '.pk', 'wb') as file:
	pk.dump(DF, file)

	if show:
	print(appendix_of_words)

	counts = [len(k) for k in appendix_of_words]
	print(counts)

	return counts, score, appendix_of_words
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 95,
	"id": "4c136840-ce71-4df0-a70d-eebcc58bd866",
	"metadata": {},
	"outputs": [],
	"source": [
	"YY = pk.load(open('appendix/appendix.1661655443.8803668.pk', 'rb'))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 105,
	"id": "dbfa5865-24c5-4e91-a392-995bf44657cc",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"\| \| 0 \| 1 \| 2 \|\n",
	"\|----:\|:------------\|:-------------\|:-------------\|\n",
	"\| 0 \| a \| ad \| access \|\n",
	"\| 1 \| about \| america \| ads \|\n",
	"\| 2 \| again \| and \| after \|\n",
	"\| 3 \| ai \| applications \| against \|\n",
	"\| 4 \| analysis \| aws \| age \|\n",
	"\| 5 \| android \| become \| algorithm \|\n",
	"\| 6 \| any \| beta \| all \|\n",
	"\| 7 \| api \| bill \| amazon \|\n",
	"\| 8 \| app \| black \| american \|\n",
	"\| 9 \| apple \| bot \| an \|\n",
	"\| 10 \| apps \| built \| analytics \|\n",
	"\| 11 \| are \| business \| angular \|\n",
	"\| 12 \| b \| but \| apache \|\n",
	"\| 13 \| based \| can \| application \|\n",
	"\| 14 \| be \| cars \| art \|\n",
	"\| 15 \| been \| chinese \| artificial \|\n",
	"\| 16 \| behind \| cloud \| as \|\n",
	"\| 17 \| between \| code \| at \|\n",
	"\| 18 \| blockchain \| computer \| attack \|\n",
	"\| 19 \| build \| creating \| available \|\n",
	"\| 20 \| by \| day \| back \|\n",
	"\| 21 \| c \| did \| bad \|\n",
	"\| 22 \| car \| does \| before \|\n",
	"\| 23 \| case \| driving \| being \|\n",
	"\| 24 \| china \| email \| best \|\n",
	"\| 25 \| client \| engineering \| better \|\n",
	"\| 26 \| coding \| ever \| big \|\n",
	"\| 27 \| company \| every \| bitcoin \|\n",
	"\| 28 \| content \| everything \| book \|\n",
	"\| 29 \| control \| fbi \| books \|\n",
	"\| 30 \| cost \| first \| brain \|\n",
	"\| 31 \| could \| framework \| browser \|\n",
	"\| 32 \| court \| generation \| building \|\n",
	"\| 33 \| create \| get \| ceo \|\n",
	"\| 34 \| css \| getting \| change \|\n",
	"\| 35 \| d \| go \| chrome \|\n",
	"\| 36 \| data \| good \| city \|\n",
	"\| 37 \| deep \| great \| com \|\n",
	"\| 38 \| design \| growth \| coming \|\n",
	"\| 39 \| developer \| hackers \| command \|\n",
	"\| 40 \| developers \| his \| community \|\n",
	"\| 41 \| development \| home \| companies \|\n",
	"\| 42 \| digital \| how \| computing \|\n",
	"\| 43 \| distributed \| html \| core \|\n",
	"\| 44 \| do \| human \| database \|\n",
	"\| 45 \| docker \| image \| dead \|\n",
	"\| 46 \| don \| internet \| deal \|\n",
	"\| 47 \| dont \| interview \| death \|\n",
	"\| 48 \| e \| into \| devices \|\n",
	"\| 49 \| earth \| ios \| down \|\n",
	"\| 50 \| easy \| is \| drone \|\n",
	"\| 51 \| encryption \| it \| economy \|\n",
	"\| 52 \| end \| language \| energy \|\n",
	"\| 53 \| f \| learn \| engine \|\n",
	"\| 54 \| facebook \| less \| experience \|\n",
	"\| 55 \| files \| lessons \| fast \|\n",
	"\| 56 \| for \| life \| faster \|\n",
	"\| 57 \| founder \| like \| file \|\n",
	"\| 58 \| free \| line \| find \|\n",
	"\| 59 \| from \| linux \| found \|\n",
	"\| 60 \| guide \| live \| full \|\n",
	"\| 61 \| hack \| look \| future \|\n",
	"\| 62 \| hacker \| love \| game \|\n",
	"\| 63 \| have \| mac \| games \|\n",
	"\| 64 \| here \| machine \| gets \|\n",
	"\| 65 \| history \| making \| git \|\n",
	"\| 66 \| i \| management \| github \|\n",
	"\| 67 \| if \| map \| global \|\n",
	"\| 68 \| in \| may \| going \|\n",
	"\| 69 \| industry \| memory \| google \|\n",
	"\| 70 \| introducing \| microsoft \| got \|\n",
	"\| 71 \| io \| most \| government \|\n",
	"\| 72 \| java \| native \| hacking \|\n",
	"\| 73 \| javascript \| need \| hard \|\n",
	"\| 74 \| jobs \| network \| has \|\n",
	"\| 75 \| just \| neural \| health \|\n",
	"\| 76 \| k \| off \| help \|\n",
	"\| 77 \| last \| on \| high \|\n",
	"\| 78 \| launch \| or \| hn \|\n",
	"\| 79 \| launches \| other \| http \|\n",
	"\| 80 \| learned \| our \| images \|\n",
	"\| 81 \| let \| pdf \| india \|\n",
	"\| 82 \| lets \| plan \| inside \|\n",
	"\| 83 \| library \| power \| intel \|\n",
	"\| 84 \| m \| product \| intelligence \|\n",
	"\| 85 \| made \| program \| interactive \|\n",
	"\| 86 \| man \| project \| introduction \|\n",
	"\| 87 \| manager \| projects \| iot \|\n",
	"\| 88 \| marketing \| re \| iphone \|\n",
	"\| 89 \| money \| react \| its \|\n",
	"\| 90 \| net \| read \| job \|\n",
	"\| 91 \| next \| real \| js \|\n",
	"\| 92 \| no \| release \| keep \|\n",
	"\| 93 \| node \| robots \| key \|\n",
	"\| 94 \| of \| san \| know \|\n",
	"\| 95 \| office \| say \| law \|\n",
	"\| 96 \| open \| scale \| learning \|\n",
	"\| 97 \| os \| science \| light \|\n",
	"\| 98 \| out \| scientists \| list \|\n",
	"\| 99 \| over \| secure \| long \|\n",
	"\| 100 \| part \| see \| make \|\n",
	"\| 101 \| pay \| services \| makes \|\n",
	"\| 102 \| performance \| sharing \| many \|\n",
	"\| 103 \| phone \| should \| market \|\n",
	"\| 104 \| pi \| shows \| me \|\n",
	"\| 105 \| play \| side \| media \|\n",
	"\| 106 \| private \| site \| meet \|\n",
	"\| 107 \| problem \| slack \| mobile \|\n",
	"\| 108 \| programming \| so \| model \|\n",
	"\| 109 \| quantum \| software \| modern \|\n",
	"\| 110 \| r \| source \| more \|\n",
	"\| 111 \| released \| space \| much \|\n",
	"\| 112 \| remote \| star \| music \|\n",
	"\| 113 \| report \| start \| my \|\n",
	"\| 114 \| right \| still \| nasa \|\n",
	"\| 115 \| run \| storage \| networks \|\n",
	"\| 116 \| running \| system \| never \|\n",
	"\| 117 \| rust \| systems \| new \|\n",
	"\| 118 \| s \| take \| news \|\n",
	"\| 119 \| series \| tech \| not \|\n",
	"\| 120 \| server \| test \| now \|\n",
	"\| 121 \| set \| testing \| old \|\n",
	"\| 122 \| silicon \| they \| one \|\n",
	"\| 123 \| simple \| think \| online \|\n",
	"\| 124 \| small \| through \| only \|\n",
	"\| 125 \| social \| tips \| own \|\n",
	"\| 126 \| solar \| to \| page \|\n",
	"\| 127 \| some \| tool \| people \|\n",
	"\| 128 \| stack \| twitter \| php \|\n",
	"\| 129 \| startup \| ui \| platform \|\n",
	"\| 130 \| state \| up \| police \|\n",
	"\| 131 \| store \| us \| post \|\n",
	"\| 132 \| support \| users \| privacy \|\n",
	"\| 133 \| swift \| valley \| public \|\n",
	"\| 134 \| t \| video \| python \|\n",
	"\| 135 \| tesla \| vs \| rails \|\n",
	"\| 136 \| than \| was \| raises \|\n",
	"\| 137 \| them \| way \| reality \|\n",
	"\| 138 \| there \| who \| really \|\n",
	"\| 139 \| this \| women \| research \|\n",
	"\| 140 \| three \| working \| review \|\n",
	"\| 141 \| too \| write \| rise \|\n",
	"\| 142 \| two \| wrong \| robot \|\n",
	"\| 143 \| u \| \| ruby \|\n",
	"\| 144 \| update \| \| rules \|\n",
	"\| 145 \| used \| \| save \|\n",
	"\| 146 \| v \| \| says \|\n",
	"\| 147 \| version \| \| school \|\n",
	"\| 148 \| war \| \| search \|\n",
	"\| 149 \| web \| \| secret \|\n",
	"\| 150 \| website \| \| security \|\n",
	"\| 151 \| week \| \| self \|\n",
	"\| 152 \| were \| \| service \|\n",
	"\| 153 \| what \| \| show \|\n",
	"\| 154 \| where \| \| smart \|\n",
	"\| 155 \| will \| \| speed \|\n",
	"\| 156 \| with \| \| startups \|\n",
	"\| 157 \| world \| \| stop \|\n",
	"\| 158 \| would \| \| story \|\n",
	"\| 159 \| writing \| \| study \|\n",
	"\| 160 \| x \| \| team \|\n",
	"\| 161 \| years \| \| technology \|\n",
	"\| 162 \| your \| \| text \|\n",
	"\| 163 \| \| \| that \|\n",
	"\| 164 \| \| \| the \|\n",
	"\| 165 \| \| \| their \|\n",
	"\| 166 \| \| \| things \|\n",
	"\| 167 \| \| \| time \|\n",
	"\| 168 \| \| \| today \|\n",
	"\| 169 \| \| \| tools \|\n",
	"\| 170 \| \| \| top \|\n",
	"\| 171 \| \| \| tv \|\n",
	"\| 172 \| \| \| uber \|\n",
	"\| 173 \| \| \| uk \|\n",
	"\| 174 \| \| \| under \|\n",
	"\| 175 \| \| \| use \|\n",
	"\| 176 \| \| \| user \|\n",
	"\| 177 \| \| \| using \|\n",
	"\| 178 \| \| \| via \|\n",
	"\| 179 \| \| \| virtual \|\n",
	"\| 180 \| \| \| visual \|\n",
	"\| 181 \| \| \| vr \|\n",
	"\| 182 \| \| \| want \|\n",
	"\| 183 \| \| \| wants \|\n",
	"\| 184 \| \| \| watch \|\n",
	"\| 185 \| \| \| ways \|\n",
	"\| 186 \| \| \| we \|\n",
	"\| 187 \| \| \| when \|\n",
	"\| 188 \| \| \| why \|\n",
	"\| 189 \| \| \| windows \|\n",
	"\| 190 \| \| \| without \|\n",
	"\| 191 \| \| \| work \|\n",
	"\| 192 \| \| \| year \|\n",
	"\| 193 \| \| \| you \|\n"
	]
	}
	],
	"source": [
	"print(pd.DataFrame(YY).T.to_markdown())"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3 (ipykernel)",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.10.5"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}