Skip to content

Instantly share code, notes, and snippets.

@Jong-Sig
Created June 16, 2024 20:55
Show Gist options
  • Select an option

  • Save Jong-Sig/f5546cf195715429d36db5800a167484 to your computer and use it in GitHub Desktop.

Select an option

Save Jong-Sig/f5546cf195715429d36db5800a167484 to your computer and use it in GitHub Desktop.
Calculate Degree Between Nodes
import pandas as pd
import polars as pl
import numpy as np
import os
import re
import csv
import json
from tqdm import tqdm
from ast import literal_eval
from CreateSQL import *
from sklearn import preprocessing
"""
------------------------------------------------------------------------------
** Process documentation
------------------------------------------------------------------------------
1) Create a dictionary of {'Poster' : ['Tag']}
For PostID,
{'Poster A' : ['Tag A', 'Tag 'B'],
'Poster B': ['Tag A', 'Tag C']}
2) Create a dictionary of {'Poster' : ['Commentor']}
For PostID,
{'Poster A' : ['Commentor A', 'Commentor B'],
'Poster B' : ['Commentor A', 'Commentor C']}
3) Create a graph
- Option: undirected, weighted graph
- Time window: 3 months (t-2 ~ t), 4 months (t-3 ~ t), 5months (t-4 ~ t)
4) Create measures (e.g., use modified Dijkstra's Algorithm)
1. # of direct ties between two nodes
2. Shortest path between two nodes & # of different shortest paths (2 if A-B-D & A-C-D)
references:
https://www.geeksforgeeks.org/number-of-shortest-paths-in-an-undirected-weighted-graph/
https://notebook.community/stevenbergner/stevenbergner.github.io/Teaching/cmpt767/graph-tool-tutorial-todo
------------------------------------------------------------------------------
------------------------------------------------------------------------------
**Important note
------------------------------------------------------------------------------
Dict format:
{Post Year-Month : {PosterID : [List of tags + commenters]}}
# of edges between two nodes:
(A, B) : 2
(B, A) : 5
A - B : 2 + 5
B - A : 2 + 5
------------------------------------------------------------------------------
"""
########################################
# Preprocess: Create node list and count
########################################
# Get the list of total nodes & the number of total nodes per Month-Year
def get_nodes(dictionary):
    """Collect the unique node IDs appearing in each Month-Year bucket.

    Parameters
    ----------
    dictionary : dict
        {date: {poster_id: [list of neighbor ids]}}.

    Returns
    -------
    tuple(dict, dict)
        dict_nodes_list: {date: [unique node ids]},
        dict_nodes_N:    {date: number of unique nodes}.
    """
    dict_nodes_list = {}  # {date: list of unique nodes}
    dict_nodes_N = {}     # {date: count of unique nodes}
    for date in tqdm(dictionary.keys()):
        unique_nodes = set()
        for poster, neighbors in dictionary[date].items():
            # The key is a single node id; the value is a list of node ids.
            unique_nodes.add(poster)
            unique_nodes.update(neighbors)
        dict_nodes_list[date] = list(unique_nodes)
        # Keep the count under its own name instead of rebinding the set
        # to its own length (the original shadowed `total_nodes`).
        dict_nodes_N[date] = len(unique_nodes)
    return dict_nodes_list, dict_nodes_N
###################################
# Preprocess: Create string labeler
###################################
# Need to convert the string ID of node into integer ID to speed up the processing
def node_labeler(dict_original, dict_nodes_list):
    """Encode string node IDs as integers for each Month-Year, in place.

    Parameters
    ----------
    dict_original : dict
        {date: {poster_id: [neighbor ids]}} with string IDs; mutated in place
        so that every ID becomes its integer label.
    dict_nodes_list : dict
        {date: [node ids]}; mutated in place to hold integer labels.

    Returns
    -------
    tuple(dict, dict, dict)
        The two (mutated) inputs plus dict_nodes_id:
        {date: {string id: integer label}}.
    """
    dict_nodes_id = {}  # {date: {string id: integer label}}
    for date in tqdm(dict_original.keys()):
        # Fit a fresh encoder on this Month-Year's node list.
        # (LabelEncoder.fit returns self, so no separate alias is needed.)
        le = preprocessing.LabelEncoder()
        le.fit(dict_nodes_list[date])
        # String -> integer mapping for this Month-Year.
        # NOTE(review): check upstream that no id is the empty string ''.
        ids = dict(zip(le.classes_, le.transform(le.classes_)))
        dict_nodes_id[date] = ids
        # Encode the node list itself
        dict_nodes_list[date] = list(le.transform(dict_nodes_list[date]))
        # Encode the edge dict via plain dict lookups: far faster than
        # calling le.transform per key/value on large data.
        encoded = {}
        for key, val in dict_original[date].items():
            encoded[ids[key]] = [ids[ele] for ele in val]
        dict_original[date] = encoded
    return dict_original, dict_nodes_list, dict_nodes_id
########################
# Analysis: Create Graph
########################
# Node class
class Node:
    """Adjacency-list entry: a neighbor node id plus the edge weight."""

    def __init__(self, node, weight):
        self.node = node      # neighbor node id
        self.weight = weight  # accumulated weight of the edge

    def __lt__(self, other):
        """Order entries by edge weight (custom comparator)."""
        return self.weight < other.weight
# Function to insert a node in adjacency list
def addEdge(adj, node1, node2, weight):
    """Add `weight` to the undirected edge node1-node2 in adjacency list `adj`.

    The graph is undirected: the weight accumulates on both the
    node1 -> node2 and node2 -> node1 entries, so e.g. 2 directed ties on
    (A, B) plus 3 on (B, A) yield a total weight of 5 in both directions.
    """
    # Replaces the original's NameError-driven control flow (and the
    # duplicated node1/node2 halves) with an explicit search in a helper.
    _accumulate_edge(adj, node1, node2, weight)
    _accumulate_edge(adj, node2, node1, weight)

def _accumulate_edge(adj, src, dst, weight):
    # Bump the weight on an existing src->dst entry, or append a new one.
    for entry in adj[src]:
        if entry.node == dst:
            entry.weight += weight
            return
    adj[src].append(Node(dst, weight))
####################################
# Analysis: Create Degree calculator
####################################
# # Find the number of direct paths between two nodes
# def degree(adj, src, n):
# # Store IDs of all nodes
# ids = [i for i in range(n)]
# # Initializing degree of every vertex as zero
# edges = [0] * (n)
# # Setting degree of source vertext to 1
# edges[src] = 1
# for i in range(len(adj[src])): # i will iterate through nodes matched to a given node
# # get the alter node
# to = adj[src][i].node
# # assign weight to edge from a focal node to the alter node
# edges[to] = adj[src][i].weight
# return ids, edges
# Find the number of direct paths between two nodes
def degree(adj, src):
    """Return [[src, neighbor, weight], ...] for every neighbor of `src`.

    `adj` is an adjacency list whose entries expose `.node` and `.weight`.
    """
    # Flatten each (neighbor, weight) entry of the source node into an
    # edge triple; the order follows the adjacency list.
    return [[src, entry.node, entry.weight] for entry in adj[src]]
############################
# Analysis: Calculate degree
############################
def get_degree(dict_original, dict_nodes_N, dict_nodes_id):
    """Build the undirected weighted graph per Month-Year and tabulate ties.

    Parameters
    ----------
    dict_original : dict
        {date: {src int id: [dst int ids]}} (integer-encoded edges).
    dict_nodes_N : dict
        {date: number of nodes}, sizes the adjacency list.
    dict_nodes_id : dict
        {date: {string id: int id}}; the int ids enumerate source nodes.

    Returns
    -------
    dict
        {date: polars DataFrame['From', 'To', 'Degree']} of edge weights.
    """
    dict_degree = {}
    for date in tqdm(dict_original.keys()):
        degree_results = []  # [src, dst, weight] triples for this date
        # Empty adjacency list, one slot per node in this Month-Year
        adj = [[] for _ in range(dict_nodes_N[date])]
        # Accumulate edges: weight 1 per appearance of a pair; addEdge
        # adds +1 each time the pair shows up again.
        for key, val in dict_original[date].items():
            for node in val:
                addEdge(adj, key, node, 1)
        # Collect the direct ties of every source node
        for src in dict_nodes_id[date].values():
            degree_results += degree(adj, src)
        # orient='row' makes the row-wise layout explicit rather than
        # relying on polars' orientation inference for lists of lists.
        df = pl.DataFrame(degree_results,
                          schema = ['From', 'To', 'Degree'],
                          orient = 'row')
        dict_degree[date] = df
    return dict_degree
#######################
# Create string decoder
#######################
# Need to convert the string ID of node into integer ID to speed up the processing
def node_decoder(dict_degree, dict_nodes_id):
    """Decode integer node IDs in each degree DataFrame back to strings.

    dict_degree maps date -> polars DataFrame['From', 'To', 'Degree'] with
    integer IDs; dict_nodes_id maps date -> {string id: integer id}.
    Returns a new {date: DataFrame} dict with string 'From'/'To' columns
    and rows containing null IDs dropped.
    """
    # Decode dataframes in the dictionary
    # Create a dictionary to store all results of all Year-Month pairs
    dict_degree_decode = {}
    # Get the Month-Year pair
    for date in tqdm(dict_degree.keys()):
        # Prepare the dictionary of ids to decode in each Month-Year
        ids = dict_nodes_id[date]
        # Reverse mapping: integer id -> original string id
        ids_rev = {v: k for k, v in ids.items()}
        # Convert dataframe to dictionary to speed up decoding
        # (keys are column names, values are the column contents)
        for key, val in dict_degree[date].to_dict().items():
            # For source node, decode using decoder
            if key == 'From':
                From = [ids_rev[ele] for ele in val]
            # For destination node, decode using decoder
            elif key == 'To':
                To = [ids_rev[ele] for ele in val]
            # For degree value, don't need to decode
            elif key == 'Degree':
                Degree = val
        # create a nested list to convert list to dataframe again
        # NOTE(review): From/To/Degree are unbound (NameError) if any of the
        # three columns is missing -- this assumes the exact schema that
        # get_degree produces.
        result = [From, To, Degree]
        df = pl.DataFrame(result,
                          schema = {'From' : pl.Utf8,
                                    'To' : pl.Utf8,
                                    'Degree' : pl.Int64})
        # Some nodes are poorly encoded, filter such nodes
        # NOTE(review): pl.any over a column selector is deprecated in newer
        # polars (pl.any_horizontal is the modern spelling) -- verify against
        # the pinned polars version.
        df = df.filter(~pl.any(pl.col(pl.Utf8).is_null()))
        dict_degree_decode[date] = df
    return dict_degree_decode
#######################
# Create auto-processer
#######################
def main(dict_original):
    """Run the full pipeline on {date: {poster: [neighbors]}} edge data.

    Steps: collect nodes, integer-encode them, compute degree tables,
    then decode back to string IDs. Returns {date: polars DataFrame}.
    """
    tqdm.write('Creating nodes')
    node_lists, node_counts = get_nodes(dict_original)

    # Integer IDs make the graph construction much faster than strings
    tqdm.write('Encoding nodes')
    dict_original, node_lists, node_ids = node_labeler(dict_original, node_lists)

    tqdm.write('Calculating degree between nodes')
    degrees = get_degree(dict_original, node_counts, node_ids)

    # Restore string IDs so the result can be exported
    tqdm.write('Decoding nodes')
    return node_decoder(degrees, node_ids)
if __name__ == '__main__':
    ################
    # Data cleaning
    ################
    # Connect to MySQL - DB: Instagram
    connection = create_engine(db = 'organic')
    # Get all the table names in the DB
    tables = """
SHOW tables;
"""
    table_names = pd.read_sql(tables, connection)
    # # Set date limit to speed up the process and reduce memory usage
    # date = '2018-01-01'
    #############
    # Post Sample
    #############
    # Load and stack the two organic-post tables into df_org
    for i, table in enumerate(('organicpost1_update', 'organicpost2_update')):
        query = f"""
SELECT PostID, PosterID, PostYear, PostMonth, PostMention
FROM {table};
"""
        if i == 0:
            df_org = pd.read_sql(query,
                                 connection)
        if i == 1:
            df_org = pd.concat([df_org,
                                pd.read_sql(query,
                                            connection)])
    ################
    # Comment Sample
    ################
    # Load and stack the four comment tables (two per post table) into df_cmts
    for i, table in enumerate(('organicpost1_comments', 'organicpost2_comments')):
        # Filter posts: Keep posts where CommenterID exists -- remove posts w.o. comments
        query1 = f"""
SELECT FileID, PostYear, PostMonth, CommenterID
FROM {table}_2018_04
WHERE CommenterID IS NOT NULL;
"""
        query2 = f"""
SELECT FileID, PostYear, PostMonth, CommenterID
FROM {table}_rest
WHERE CommenterID IS NOT NULL;
"""
        if i == 0:
            # create dataframe with comments from 2018-4 from organicpost1 table
            df_cmts = pd.read_sql(query1,
                                  connection)
            # then, append rest of the comments from organicpost1 table
            df_cmts = pd.concat([df_cmts,
                                 pd.read_sql(query2,
                                             connection)])
        if i == 1:
            # append comments from 2018-4 from organicpost2 table
            df_cmts = pd.concat([df_cmts,
                                 pd.read_sql(query1,
                                             connection)])
            # then, append rest of the comments from organicpost2 table
            df_cmts = pd.concat([df_cmts,
                                 pd.read_sql(query2,
                                             connection)])
    connection.dispose()
    # -- Assume commenters have social capital only when at least one of their comments are positive -- #
    ## Commenters that had non-positive comments are less likely to have social capital
    ## Confine to comments that have at least one positive comments as having social capital
    ## Specify the path and file that contains the sentiment data of comments
    path = r'F:\Projects\Brokerage\Insta_Analysis_Ver2\data\dataframe\comments_emotion\influencer_organic_posts'
    file = 'organicpost_comments_emotion.parquet.gzip'
    df_cmts_sc = pd.read_parquet(path + '\\' + file)
    ## Keep commenter's ID with at least one positive comments
    df_cmts_sc = df_cmts_sc.loc[df_cmts_sc['EmotionPositive'] == 1, 'CommenterID'].drop_duplicates()
    df_cmts_sc = df_cmts_sc.to_list()
    ## Keep if commenter's ID appears in the list of commenters with positive comments
    df_cmts = df_cmts.loc[df_cmts['CommenterID'].isin(df_cmts_sc)].reset_index(drop = True)
    ###################
    # Influencer Sample
    ###################
    # Get influencer ID and their first organic post posting date
    path = r'F:\Projects\Brokerage\Insta_Analysis_Ver2\data\dataframe\influencers'
    # NOTE(review): '\i' inside this non-raw f-string is an invalid escape
    # sequence (DeprecationWarning; currently kept literally) -- prefer a
    # raw string or os.path.join.
    df_inf = pd.read_csv(f'{path}\influencer_first_post_date_all.csv')
    #####################################################
    # Post data: create a list of nodes per node by edges
    #####################################################
    # Change the format of PosterID and lowercase it
    df_org['PosterID'] = df_org['PosterID'].apply(lambda x: '@' + x.lower())
    # Keep necessary columns only
    df_post = df_org.loc[:, ['PosterID', 'PostYear', 'PostMonth', 'PostMention']].copy()
    # Convert string to a list
    df_post['PostMention'] = df_post['PostMention'].apply(lambda x: literal_eval(str(x)))
    # Remove duplicated IDs
    df_post['PostMention'] = df_post['PostMention'].apply(lambda x: list(set([ele.lower() for ele in x])))
    # Edges between the same node does not make sense. Remove it, if exists
    df_post['PostMention'] = df_post.\
        apply(lambda x: [ele for ele in x['PostMention'] if ele != x['PosterID']],\
              axis = 1)
    # Remove posts without mentions
    df_post = df_post.loc[df_post["PostMention"].str.len() != 0, :].copy().reset_index(drop = True)
    # Create a nested dictionary by Year-Month
    ## Dictionary to store the result
    dict_post = {}
    ## For each Year-Month pair
    for year in df_post['PostYear'].unique():
        for month in df_post['PostMonth'].unique():
            ## Get observations per each pair
            df_sub = df_post.loc[(df_post['PostYear'] == year)
                                 & (df_post['PostMonth'] == month), :].copy()
            ## Create dictionary if observations exist
            if df_sub.empty:
                pass
            else:
                dict_sub = df_sub.groupby('PosterID')['PostMention'].apply(sum).to_dict()
                ## Remove empty list from values
                # NOTE(review): ([] or ['']) evaluates to [''], so this only
                # filters [''] -- empty lists pass through. Likely intended:
                # val not in ([], ['']).
                dict_sub = {key:val for key, val in dict_sub.items() if val != ([] or [''])}
                dict_post[f'{year}-{month}'] = dict_sub
    del df_sub, dict_sub
    ########################################################
    # Comment data: create a list of nodes per node by edges
    ########################################################
    # Change the format of FileID, and create PostID
    df_cmts['FileID'] = df_cmts['FileID'].apply(lambda x: x.split('-')[1])
    # Keep necessary columns only
    df_cmts = pd.merge(df_cmts.loc[:, ['FileID', 'PostYear', 'PostMonth', 'CommenterID']],
                       df_org.loc[:, ['PostID', 'PosterID']],
                       left_on = 'FileID',
                       right_on = 'PostID',
                       how = 'left')
    ## --- Added Filter here --- ##
    # Filter PosterID : Poster that appeared in the influencer dataframe
    ## Change the format of PosterID and lowercase it
    df_inf['PosterID'] = df_inf['PosterID'].apply(lambda x: '@' + x.lower())
    ## Keep samples that are in Influencer list
    df_cmts = df_cmts.loc[df_cmts['PosterID'].isin(df_inf['PosterID']),
                          :].reset_index(drop = True)
    # change the format of CommenterID and lowercase it
    df_cmts['CommenterID'] = df_cmts['CommenterID'].apply(lambda x: '@' + x.lower())
    # Create a list of Commenters per node
    df_cmts = df_cmts.\
        groupby(by = ['PosterID', 'PostYear', 'PostMonth'])\
        ['CommenterID'].apply(list).reset_index(name='CommenterID')
    # Edges between the same node does not make sense. Remove it, if exists
    df_cmts['CommenterID'] = df_cmts.\
        apply(lambda x: [ele for ele in x['CommenterID'] if ele != x['PosterID']], axis = 1)
    # Create a nested dictionary by Year-Month
    ## Dictionary to store the result
    dict_comment = {}
    ## For each Year-Month pair
    for year in df_cmts['PostYear'].unique():
        for month in df_cmts['PostMonth'].unique():
            ## Get observations per each pair
            df_sub = df_cmts.loc[(df_cmts['PostYear'] == year)
                                 & (df_cmts['PostMonth'] == month), :].copy()
            ## Create dictionary if observations exist
            if df_sub.empty:
                pass
            else:
                dict_sub = dict(zip(df_sub['PosterID'], df_sub['CommenterID']))
                ## Remove empty list from values
                # NOTE(review): ([] or ['']) evaluates to [''] here too, so
                # only [''] is filtered; empty lists pass through.
                dict_sub = {key : val for key, val in dict_sub.items() if val != ([] or [''])}
                dict_comment[f'{year}-{month}'] = dict_sub
    del df_org, df_cmts, df_sub, dict_sub
    #######################################
    # Merge two dicts of node relationships
    #######################################
    # Dictionary to store the result
    dict_all = {}
    # For each Year-Month pair
    for year in df_post['PostYear'].unique():
        for month in df_post['PostMonth'].unique():
            # first create a key to iterate
            date = f'{year}-{month}'
            # some Year-Month pairs not exist in posts: check if exsits
            if date in dict_post:
                dict1 = dict_post[date]
                # Even if posts exsit in a given Year-Month pair, comments may not exist
                # pass merging dictionaries in that case
                if date in dict_comment:
                    dict2 = dict_comment[date]
                    dict_all[date] = {key : dict1.get(key,[]) + dict2.get(key,[]) \
                                      for key in set(list(dict1.keys()) + list(dict2.keys()))}
                # When only mentions exist, append values of dict_post
                else:
                    dict_all[date] = dict1
                ## Remove empty list from values
                # NOTE(review): same ([] or ['']) pitfall -- evaluates to [''].
                dict_all[date] = {key : val for key, val in dict_all[date].items() if val != ([] or [''])}
            else:
                pass
    # NOTE(review): dict1/dict2 raise NameError here if no date matched the
    # branches above (e.g. dict_post empty, or no date ever had comments).
    del dict1, dict2
    #############
    # Run Process
    #############
    dict_degree = main(dict_all)
    ################
    # Export to SQL
    ################
    # Write one MySQL table per Month-Year of degree results
    connection = create_engine(db = 'influencer_social_capital')
    for date in tqdm(dict_degree.keys()):
        df = dict_degree[date].to_pandas()
        # replace date format to avoid error in MySQL
        # e.g., 2018-1 > 2018_1
        date = date.replace('-', '_')
        df.to_sql(con = connection,
                  name = f'social_capital_{date}',
                  if_exists = 'replace', #append the result of each chunk to existing one
                  chunksize = 50000,
                  index = False)
    connection.dispose()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment