Skip to content

Instantly share code, notes, and snippets.

@samclane
Created October 5, 2018 00:23
Show Gist options
  • Save samclane/4b8cbaf952308915017866360aa5e922 to your computer and use it in GitHub Desktop.
Save samclane/4b8cbaf952308915017866360aa5e922 to your computer and use it in GitHub Desktop.
Test ML processing for social data acquired from Discord. This is just a backup snapshot; I will probably create a new repository for it.
import pandas
import time
import random
import networkx as nx
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.preprocessing import MultiLabelBinarizer
# --- Spoofed presence log ---------------------------------------------------
# Each row records one "join" event: `member` is the user who joined and
# `present` is the list of other users who were in the room at that moment.
# Rows are indexed by a strictly increasing integer timestamp (seconds).
user_ids = ["user{}".format(n) for n in range(10)]

NUM_EVENTS = 500
# Probability of forcing a friend into the room when the random sample missed
# them; this is what makes the friend pairs stand out in the data.
FRIEND_BIAS = .99
# Index pairs (into user_ids) that are biased to appear together.
FRIEND_PAIRS = ((0, 1), (5, 9))

# Symmetric lookup: user id -> the friend that should usually be present.
friend_of = {}
for first, second in FRIEND_PAIRS:
    friend_of[user_ids[first]] = user_ids[second]
    friend_of[user_ids[second]] = user_ids[first]

starttime = int(time.time())
timestamps = []
rows = []
for _ in range(NUM_EVENTS):
    member = random.choice(user_ids)
    others = [uid for uid in user_ids if uid != member]
    present = random.sample(others, random.randint(1, len(others)))
    # Only draw the bias roll when the member has a friend who was not
    # already sampled, matching the order of random calls in the old code.
    friend = friend_of.get(member)
    if friend is not None and friend not in present and random.random() < FRIEND_BIAS:
        present.append(friend)
    rows.append({"member": member, "present": present})
    timestamps.append(starttime)
    starttime += random.randint(1, 1000)

# Build the frame once: DataFrame.append was removed in pandas 2.0 and copied
# the whole frame on every call, making the loop quadratic.
df = pandas.DataFrame(
    rows,
    columns=["member", "present"],
    index=pandas.Index(timestamps, name="timestamp"),
)
# Just to validate formatting
# df.to_csv(r"C:\Users\SawyerPC\Pictures\Saved Pictures\Data.csv")

# Multi-hot encode every "present" list with a MultiLabelBinarizer (one
# column per user seen in the lists) and print the table as a sanity check.
enc = MultiLabelBinarizer()
presence_matrix = enc.fit_transform(df["present"])
presence_table = pandas.DataFrame(
    presence_matrix,
    columns=enc.classes_,
    index=df.index,
)
print(presence_table)

# The triple-quoted string below holds a disabled classifier comparison. It is
# a bare string expression, so none of it runs.
# NOTE(review): the snippet never calls fit() on the classifiers before
# predict()/predict_proba(); it would need that added before being re-enabled.
"""
# Try different classifiers
classifiers = [GaussianNB(), MultinomialNB(), BernoulliNB()]
for clf in classifiers:
print(type(clf).__name__+"\n--------")
# Given that only user0 member is in the room, who is most likely to join? (user1)
print(clf.predict([[1,0,0,0,0,0,0,0,0,0]]))
prob_map = [(enc.classes_[n], clf.predict_proba([[1,0,0,0,0,0,0,0,0,0]])[0][n]) for n in range(len(enc.classes_))]
print(sorted(prob_map, key=lambda i: i[1], reverse=True))
print("")
"""
# GaussianNB is probably best
# Train on the multi-hot "present" vectors to predict which member joins.
clf = GaussianNB()
clf.fit(enc.fit_transform(df["present"]), list(df["member"]))

# For every user o, ask the model: "if o is the only one in the room, how
# likely is each member to be the one joining?"  The input depends only on o,
# so predict_proba runs once per o instead of once per (u, o, class).
# predict_proba columns follow clf.classes_ (the member labels), so that is
# the array used to label them; enc.classes_ labels the input features and
# only lines up with it when both label sets happen to be identical.
join_probs = {}
for lone_user in user_ids:
    vec = enc.transform([[lone_user]])
    join_probs[lone_user] = dict(zip(clf.classes_, clf.predict_proba(vec)[0]))

# Directed edge u -> o weighted by P(member == u | only o present).
social_graph = nx.DiGraph()
social_graph.add_nodes_from(user_ids)
for u in user_ids:
    for o in user_ids:
        if o == u:
            continue
        # A user never seen as a member has no class in the model; use 0.
        social_graph.add_edge(u, o, weight=float(join_probs[o].get(u, 0.0)))
# Draw the social graph in the left cell of a 1x2 subplot grid and show it.
plt.subplot(1, 2, 1)
# pos = nx.spring_layout(social_graph)
draw_options = {"with_labels": True, "arrows": False, "font_weight": 'bold'}
nx.draw(social_graph, **draw_options)
# nx.draw_networkx_edge_labels(social_graph, pos)
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment