Hacker News titles, word embedding
"""
association_model() class: a module for word clusterings of Hacker News titles
"""
import re
import pickle as pk
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import tensorflow as tf
from scipy.sparse import coo_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.cluster import KMeans
from time import time
# association_model class
class association_model():
"""
association_model()
construct and train a word association model
usage:
AM = association_model()
AM.build_run()
AM.predict()
AM.cluster()
"""
def __init__(self):
self.TransformedWords = None
# getk, helper fct
def getk(self,k):
""" getk(k), helper function to produce a k-index unit vector in R1 """
return (np.arange(500) == k).astype(np.float64)
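    # Illustrative example of getk (hypothetical call, not in the original gist):
    #   getk(2)[:5] -> array([0., 0., 1., 0., 0.]); only index 2 of the 500
    #   entries is 1., all others are 0.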
    # create_ATA, generate statistical correlations (word co-occurrence matrix)
def create_ATA(self, from_file=True):
""" create_ATA, generate the cooccurrence matrix """
        # do some minor cleaning, i.e. drop NaN rows, then lowercase the titles
        _X = pd.read_csv("./workspace/archive/HN_posts_year_to_Sep_26_2016.csv")
        _X = _X.dropna()
        _X.title = _X.title.apply(lambda s: s.lower())
all_times = list(_X['created_at'])
        # **Sample and sort the categories**
        # label each post by time of day: parse the "M/D/YYYY HH:MM" timestamp
        # and convert the clock time to minutes since midnight
        Times = \
            [sum(np.array([60, 1]) * np.array(
                [_.split('/') for _ in x.split(' ')][1][0].split(':'))
                .astype(np.float64))
             for x in all_times]
Times=np.array(Times)
binme = KBinsDiscretizer(10,encode="ordinal",strategy="uniform")
binme.fit(Times.reshape(-1,1))
Times_ = binme.transform(Times.reshape(-1,1))
_X['timebin'] = Times_
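        # illustrative example (assuming the dataset's "M/D/YYYY HH:MM" timestamps):
        #   "9/26/2016 3:26" -> 3*60 + 26 = 206 minutes after midnight, which the
        #   KBinsDiscretizer above then maps into one of 10 uniform time-of-day bins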
_X['log_comments'] = np.log1p(_X['num_comments'])
        # build the vocabulary by scanning 1000 titles at a time
        # (t, s start one chunk "behind" because the loop increments them first;
        #  starting at 0, 1000 would skip the first 1000 rows)
        t, s = -1000, 0
bigun = {''}
listem = []
returns = False
while not returns:
t+= 1000; s+=1000
if s > _X.shape[0]:
s = _X.shape[0]
returns = True
Q = _X.title.loc[_X.index[t:s]].str.lower().str.split()
V = {''}
l = []
# filter alphanumeric
for q in Q:
for b in q:
x = re.sub("[^a-zA-Z]",' ',b)
l.extend(x.split())
# extend a list, then concat into a set to get only unique entries
listem.extend(l)
for q in l:
V.update({q})
bigun.update(V)
bigun.remove('')
# get the counts
wordcounts = {}.fromkeys(bigun,0)
for l in listem:
wordcounts[l]+=1
wordcounts = \
{k: v for k, v in sorted(wordcounts.items(),
key=lambda item: item[1],reverse=True)}
        # keep the 500 most frequent words
        I500 = list(wordcounts.keys())[:500]
        self.Itup = [(i, wordcounts[i]) for i in I500]
        # map each kept word to a column index for CountVectorizer
        # (a dict, not a set: CountVectorizer expects word -> index)
        Imap = {}
        i = 0
        for v in I500:
            Imap[v] = i
            i += 1
        # sklearn ---> counts per word (document-term matrix over the 500-word vocabulary)
        self.countme = CountVectorizer(vocabulary=Imap)
        TransformedWords = self.countme.fit_transform(_X.title)
        self.word_legend = self.countme.get_feature_names_out()
        self.TransformedWords = TransformedWords.toarray().copy()
        if not from_file:
            # regenerate the ATA association matrix from the document-term
            # counts and cache it to disk
            tw = TransformedWords.tocoo()
            tw_sp = tf.sparse.reorder(tf.sparse.SparseTensor(
                indices=np.vstack([tw.row, tw.col]).T.astype(np.int64),
                values=tw.data.astype(np.float32),
                dense_shape=tw.shape))
            ATA = tf.sparse.sparse_dense_matmul(
                tf.sparse.transpose(tw_sp), tf.sparse.to_dense(tw_sp))
            self.ATA = np.log1p(ATA.numpy()).astype(np.float32)
            with open("./workspace/clustering/500x500Association.pk", 'wb') as file:
                pk.dump(self.ATA, file)
        else:
            with open("./workspace/clustering/500x500Association.pk", 'rb') as file:
                self.ATA = pk.load(file)
# generate sparse representation
self.coo = coo_matrix(self.ATA).astype(np.float32)
def define_model(self):
"""
define_model()
define the neural network
"""
# Dense layer Neural Network encoder
D = 10**-10
R = 10**-5
inputs = tf.keras.layers.Input(shape=(1001,))
x = inputs[:,:-1]
x1 = tf.keras.layers.Reshape((500,))(x[:,:500])
x2 = tf.keras.layers.Reshape((500,))(x[:,500:])
D1 = tf.keras.layers.Dense(
250, kernel_regularizer=tf.keras.regularizers.l2(R),
activity_regularizer=tf.keras.regularizers.l2(R))
x1 = D1(x1)
x2 = D1(x2)
D1d = tf.keras.layers.Dropout(D)
x1 = D1d(x1)
x2 = D1d(x2)
D2 = tf.keras.layers.Dense(
120, kernel_regularizer=tf.keras.regularizers.l2(R),
activity_regularizer=tf.keras.regularizers.l2(R))
D2d = tf.keras.layers.Dropout(D)
x1 = D2(x1); x1 = D2d(x1)
x2 = D2(x2); x2 = D2d(x2)
D3 = tf.keras.layers.Dense(
60, kernel_regularizer=tf.keras.regularizers.l2(R),
activity_regularizer=tf.keras.regularizers.l2(R))
D3d = tf.keras.layers.Dropout(D)
x1 = D3(x1); x1 = D3d(x1)
x2 = D3(x2); x2 = D3d(x2)
D4 = tf.keras.layers.Dense(
2, kernel_regularizer=tf.keras.regularizers.l2(R),
activity_regularizer=tf.keras.regularizers.l2(R))
x1 = D4(x1)
x2 = D4(x2)
R1 = tf.keras.layers.Reshape((2,1))
x1 = R1(x1); x2 = R1(x2)
y = tf.keras.layers.Concatenate(axis=-1)([x1, x2])
Model = tf.keras.models.Model(inputs = inputs, outputs= y)
return Model
def build_run(self, verbose=False):
"""
        build_run(verbose: bool)
build and run the neural network to produce embeddings (and graphics)
"""
        def embed_loss_plain_association(y_true, y_pred):
            # squared distance between the two predicted 2-d embeddings, scaled
            # by the association weight, so strongly co-occurring word pairs are
            # pulled close together (Keras passes (y_true, y_pred) in this order)
            return tf.keras.backend.sum(
                tf.keras.backend.pow((y_pred[..., 0] - y_pred[..., 1]) * y_true, 2))
self.create_ATA()
self.Model = self.define_model()
self.Model.build(input_shape=(1001,))
if verbose:
self.Model.summary()
self.Model.compile(loss=embed_loss_plain_association,
optimizer=tf.keras.optimizers.Adam(
tf.keras.optimizers.schedules.ExponentialDecay(
1.,50,.5,staircase=False)))
# generate training data
Z = list(zip(self.coo.row, self.coo.col, self.coo.data))
train_x = np.array(
[np.hstack([self.getk(z[0]), self.getk(z[1]), z[2]])\
for z in Z])
self.Model.fit(train_x.reshape(-1,1001,1), self.coo.data.reshape(-1,1),
epochs=30, batch_size=4096, verbose=verbose)
def predict(self):
"""
predict()
generate predictions
"""
retrieve_x = np.array(
[ np.hstack([self.getk(z), np.zeros(501)]) for z in range(500)]
)
self.YY = self.Model.predict(retrieve_x.reshape(-1,1001,1))
return self.YY
def cluster(self,show=True):
"""
        cluster(show: bool)
produce clusterings
"""
        # visualize test predictions
KM = KMeans(n_clusters=3)
YY = self.YY
Itup = self.Itup
labels = KM.fit_predict(YY[:,:,0])
score = KM.score(YY[:,:,0])
STAMP = str(time())
print(STAMP)
if show:
_ = np.log(np.array(Itup[:])[:,1].astype(np.float32))
_ = _/np.max(_); _ = _**1.5
plt.figure(figsize=(16,10))
plt.scatter(YY[:,0,0], YY[:,1,0],c=labels,alpha=_**1.3)
plt.colorbar(ticks=np.arange(4))
plt.show()
labeled500Words = pd.DataFrame(np.squeeze([self.word_legend,labels]).T,
columns=['word', 'label'])
labeled500Words = labeled500Words.groupby('label').apply(np.array)
appendix_of_words =\
[labeled500Words[i][:,0] for i in range(len(labeled500Words))]
with open("appendix/appendix." + STAMP + ".pk",'wb') as file:
pk.dump(appendix_of_words,file)
log_word_count_in_corpus = \
np.log(np.array(Itup[:])[:,1].astype(np.float32))
DF = pd.DataFrame(np.vstack([YY[:,0,0],YY[:,1,0], self.word_legend,
labels, log_word_count_in_corpus]).T,
columns=['x','y','word', 'cluster', 'log_count'])
with open('appendix/predicted_DF.' + STAMP + '.pk', 'wb') as file:
pk.dump(DF, file)
if show:
print(appendix_of_words)
counts = [len(k) for k in appendix_of_words]
print(counts)
return counts, score, appendix_of_words
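

# Minimal usage sketch, following the workflow in the class docstring; it assumes
# the Kaggle CSV and the cached association pickle exist under ./workspace/ at the
# paths hard-coded above.
if __name__ == "__main__":
    AM = association_model()
    AM.build_run(verbose=True)    # build the co-occurrence data and train the encoder
    AM.predict()                  # 2-d embedding for each of the 500 words
    counts, score, words = AM.cluster(show=False)
    print("cluster sizes:", counts, "k-means score:", score)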
{
"cells": [
{
"cell_type": "code",
"execution_count": 95,
"id": "4c136840-ce71-4df0-a70d-eebcc58bd866",
"metadata": {},
"outputs": [],
"source": [
"YY = pk.load(open('appendix/appendix.1661655443.8803668.pk', 'rb'))"
]
},
{
"cell_type": "code",
"execution_count": 105,
"id": "dbfa5865-24c5-4e91-a392-995bf44657cc",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"| | 0 | 1 | 2 |\n",
"|----:|:------------|:-------------|:-------------|\n",
"| 0 | a | ad | access |\n",
"| 1 | about | america | ads |\n",
"| 2 | again | and | after |\n",
"| 3 | ai | applications | against |\n",
"| 4 | analysis | aws | age |\n",
"| 5 | android | become | algorithm |\n",
"| 6 | any | beta | all |\n",
"| 7 | api | bill | amazon |\n",
"| 8 | app | black | american |\n",
"| 9 | apple | bot | an |\n",
"| 10 | apps | built | analytics |\n",
"| 11 | are | business | angular |\n",
"| 12 | b | but | apache |\n",
"| 13 | based | can | application |\n",
"| 14 | be | cars | art |\n",
"| 15 | been | chinese | artificial |\n",
"| 16 | behind | cloud | as |\n",
"| 17 | between | code | at |\n",
"| 18 | blockchain | computer | attack |\n",
"| 19 | build | creating | available |\n",
"| 20 | by | day | back |\n",
"| 21 | c | did | bad |\n",
"| 22 | car | does | before |\n",
"| 23 | case | driving | being |\n",
"| 24 | china | email | best |\n",
"| 25 | client | engineering | better |\n",
"| 26 | coding | ever | big |\n",
"| 27 | company | every | bitcoin |\n",
"| 28 | content | everything | book |\n",
"| 29 | control | fbi | books |\n",
"| 30 | cost | first | brain |\n",
"| 31 | could | framework | browser |\n",
"| 32 | court | generation | building |\n",
"| 33 | create | get | ceo |\n",
"| 34 | css | getting | change |\n",
"| 35 | d | go | chrome |\n",
"| 36 | data | good | city |\n",
"| 37 | deep | great | com |\n",
"| 38 | design | growth | coming |\n",
"| 39 | developer | hackers | command |\n",
"| 40 | developers | his | community |\n",
"| 41 | development | home | companies |\n",
"| 42 | digital | how | computing |\n",
"| 43 | distributed | html | core |\n",
"| 44 | do | human | database |\n",
"| 45 | docker | image | dead |\n",
"| 46 | don | internet | deal |\n",
"| 47 | dont | interview | death |\n",
"| 48 | e | into | devices |\n",
"| 49 | earth | ios | down |\n",
"| 50 | easy | is | drone |\n",
"| 51 | encryption | it | economy |\n",
"| 52 | end | language | energy |\n",
"| 53 | f | learn | engine |\n",
"| 54 | facebook | less | experience |\n",
"| 55 | files | lessons | fast |\n",
"| 56 | for | life | faster |\n",
"| 57 | founder | like | file |\n",
"| 58 | free | line | find |\n",
"| 59 | from | linux | found |\n",
"| 60 | guide | live | full |\n",
"| 61 | hack | look | future |\n",
"| 62 | hacker | love | game |\n",
"| 63 | have | mac | games |\n",
"| 64 | here | machine | gets |\n",
"| 65 | history | making | git |\n",
"| 66 | i | management | github |\n",
"| 67 | if | map | global |\n",
"| 68 | in | may | going |\n",
"| 69 | industry | memory | google |\n",
"| 70 | introducing | microsoft | got |\n",
"| 71 | io | most | government |\n",
"| 72 | java | native | hacking |\n",
"| 73 | javascript | need | hard |\n",
"| 74 | jobs | network | has |\n",
"| 75 | just | neural | health |\n",
"| 76 | k | off | help |\n",
"| 77 | last | on | high |\n",
"| 78 | launch | or | hn |\n",
"| 79 | launches | other | http |\n",
"| 80 | learned | our | images |\n",
"| 81 | let | pdf | india |\n",
"| 82 | lets | plan | inside |\n",
"| 83 | library | power | intel |\n",
"| 84 | m | product | intelligence |\n",
"| 85 | made | program | interactive |\n",
"| 86 | man | project | introduction |\n",
"| 87 | manager | projects | iot |\n",
"| 88 | marketing | re | iphone |\n",
"| 89 | money | react | its |\n",
"| 90 | net | read | job |\n",
"| 91 | next | real | js |\n",
"| 92 | no | release | keep |\n",
"| 93 | node | robots | key |\n",
"| 94 | of | san | know |\n",
"| 95 | office | say | law |\n",
"| 96 | open | scale | learning |\n",
"| 97 | os | science | light |\n",
"| 98 | out | scientists | list |\n",
"| 99 | over | secure | long |\n",
"| 100 | part | see | make |\n",
"| 101 | pay | services | makes |\n",
"| 102 | performance | sharing | many |\n",
"| 103 | phone | should | market |\n",
"| 104 | pi | shows | me |\n",
"| 105 | play | side | media |\n",
"| 106 | private | site | meet |\n",
"| 107 | problem | slack | mobile |\n",
"| 108 | programming | so | model |\n",
"| 109 | quantum | software | modern |\n",
"| 110 | r | source | more |\n",
"| 111 | released | space | much |\n",
"| 112 | remote | star | music |\n",
"| 113 | report | start | my |\n",
"| 114 | right | still | nasa |\n",
"| 115 | run | storage | networks |\n",
"| 116 | running | system | never |\n",
"| 117 | rust | systems | new |\n",
"| 118 | s | take | news |\n",
"| 119 | series | tech | not |\n",
"| 120 | server | test | now |\n",
"| 121 | set | testing | old |\n",
"| 122 | silicon | they | one |\n",
"| 123 | simple | think | online |\n",
"| 124 | small | through | only |\n",
"| 125 | social | tips | own |\n",
"| 126 | solar | to | page |\n",
"| 127 | some | tool | people |\n",
"| 128 | stack | twitter | php |\n",
"| 129 | startup | ui | platform |\n",
"| 130 | state | up | police |\n",
"| 131 | store | us | post |\n",
"| 132 | support | users | privacy |\n",
"| 133 | swift | valley | public |\n",
"| 134 | t | video | python |\n",
"| 135 | tesla | vs | rails |\n",
"| 136 | than | was | raises |\n",
"| 137 | them | way | reality |\n",
"| 138 | there | who | really |\n",
"| 139 | this | women | research |\n",
"| 140 | three | working | review |\n",
"| 141 | too | write | rise |\n",
"| 142 | two | wrong | robot |\n",
"| 143 | u | | ruby |\n",
"| 144 | update | | rules |\n",
"| 145 | used | | save |\n",
"| 146 | v | | says |\n",
"| 147 | version | | school |\n",
"| 148 | war | | search |\n",
"| 149 | web | | secret |\n",
"| 150 | website | | security |\n",
"| 151 | week | | self |\n",
"| 152 | were | | service |\n",
"| 153 | what | | show |\n",
"| 154 | where | | smart |\n",
"| 155 | will | | speed |\n",
"| 156 | with | | startups |\n",
"| 157 | world | | stop |\n",
"| 158 | would | | story |\n",
"| 159 | writing | | study |\n",
"| 160 | x | | team |\n",
"| 161 | years | | technology |\n",
"| 162 | your | | text |\n",
"| 163 | | | that |\n",
"| 164 | | | the |\n",
"| 165 | | | their |\n",
"| 166 | | | things |\n",
"| 167 | | | time |\n",
"| 168 | | | today |\n",
"| 169 | | | tools |\n",
"| 170 | | | top |\n",
"| 171 | | | tv |\n",
"| 172 | | | uber |\n",
"| 173 | | | uk |\n",
"| 174 | | | under |\n",
"| 175 | | | use |\n",
"| 176 | | | user |\n",
"| 177 | | | using |\n",
"| 178 | | | via |\n",
"| 179 | | | virtual |\n",
"| 180 | | | visual |\n",
"| 181 | | | vr |\n",
"| 182 | | | want |\n",
"| 183 | | | wants |\n",
"| 184 | | | watch |\n",
"| 185 | | | ways |\n",
"| 186 | | | we |\n",
"| 187 | | | when |\n",
"| 188 | | | why |\n",
"| 189 | | | windows |\n",
"| 190 | | | without |\n",
"| 191 | | | work |\n",
"| 192 | | | year |\n",
"| 193 | | | you |\n"
]
}
],
"source": [
"print(pd.DataFrame(YY).T.to_markdown())"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}