Created
June 16, 2024 20:55
-
-
Save Jong-Sig/f5546cf195715429d36db5800a167484 to your computer and use it in GitHub Desktop.
Calculate Degree Between Nodes
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import pandas as pd | |
| import polars as pl | |
| import numpy as np | |
| import os | |
| import re | |
| import csv | |
| import json | |
| from tqdm import tqdm | |
| from ast import literal_eval | |
| from CreateSQL import * | |
| from sklearn import preprocessing | |
| """ | |
| ------------------------------------------------------------------------------ | |
| ** Process documentation | |
| ------------------------------------------------------------------------------ | |
| 1) Create a dictionary of {'Poster' : ['Tag']} | |
| For PostID, | |
| {'Poster A' : ['Tag A', 'Tag B'], | |
| 'Poster B': ['Tag A', 'Tag C']} | |
| 2) Create a dictionary of {'Poster' : ['Commentor']} | |
| For PostID, | |
| {'Poster A' : ['Commentor A', 'Commentor B'], | |
| 'Poster B' : ['Commentor A', 'Commentor C']} | |
| 3) Create a graph | |
| - Option: undirected, weighted graph | |
| - Time window: 3 months (t-2 ~ t), 4 months (t-3 ~ t), 5 months (t-4 ~ t) | |
| 4) Create measures (e.g., use modified Dijkstra's Algorithm) | |
| 1. # of direct ties between two nodes | |
| 2. Shortest path between two nodes & # of different shortest paths (2 if A-B-D & A-C-D) | |
| references: | |
| https://www.geeksforgeeks.org/number-of-shortest-paths-in-an-undirected-weighted-graph/ | |
| https://notebook.community/stevenbergner/stevenbergner.github.io/Teaching/cmpt767/graph-tool-tutorial-todo | |
| ------------------------------------------------------------------------------ | |
| ------------------------------------------------------------------------------ | |
| **Important note | |
| ------------------------------------------------------------------------------ | |
| Dict format: | |
| {Post Year-Month : {PosterID : [List of tags + commenters]}} | |
| # of edges between two nodes: | |
| (A, B) : 2 | |
| (B, A) : 5 | |
| A - B : 2 + 5 | |
| B - A : 2 + 5 | |
| ------------------------------------------------------------------------------ | |
| """ | |
| ######################################## | |
| # Preprocess: Create node list and count | |
| ######################################## | |
| # Get the list of total nodes & the number of total nodes per Month-Year | |
def get_nodes(dictionary):
    """Collect the unique node ids that appear in each Month-Year bucket.

    Parameters
    ----------
    dictionary : dict
        {Year-Month : {PosterID : [tagged/commenting ids]}} — both the keys
        (posters) and every id inside the value lists count as nodes.

    Returns
    -------
    tuple(dict, dict)
        ({Year-Month : [unique node ids]}, {Year-Month : node count}).
    """
    dict_nodes_list = {}  # Year-Month -> list of unique node ids
    dict_nodes_N = {}     # Year-Month -> number of unique node ids
    for date in tqdm(dictionary.keys()):
        nodes = set()
        for poster, partners in dictionary[date].items():
            # The poster itself is a node ...
            nodes.add(poster)
            # ... and so is every account it is connected to.
            nodes.update(partners)
        dict_nodes_list[date] = list(nodes)
        dict_nodes_N[date] = len(nodes)
    return dict_nodes_list, dict_nodes_N
| ################################### | |
| # Preprocess: Create string labeler | |
| ################################### | |
| # Need to convert the string ID of node into integer ID to speed up the processing | |
def node_labeler(dict_original, dict_nodes_list):
    """Replace string node ids with dense integer ids, per Month-Year.

    Integer ids are assigned in sorted order of the string ids — the same
    ordering sklearn's ``LabelEncoder`` produces — so downstream code can use
    them directly as indices into an adjacency list. A plain dict mapping is
    used instead of ``LabelEncoder``: it yields identical ids, drops the
    third-party dependency, and avoids the slow per-call ``transform``
    overhead (the original comments note the array/dict approach takes ~0s
    versus >1 min for ``lem.transform``).

    Parameters
    ----------
    dict_original : dict
        {Year-Month : {PosterID : [connected ids]}} — re-keyed in place.
    dict_nodes_list : dict
        {Year-Month : [unique node ids]} — encoded in place.

    Returns
    -------
    tuple(dict, dict, dict)
        (encoded dict_original, encoded dict_nodes_list,
         {Year-Month : {string id : integer id}} for decoding later).
    """
    dict_nodes_id = {}  # Year-Month -> {string id : integer id}
    for date in tqdm(dict_original.keys()):
        # Dense ids in sorted-string order (LabelEncoder-compatible).
        # set() guards against duplicates so ids stay gap-free.
        ids = {node: idx
               for idx, node in enumerate(sorted(set(dict_nodes_list[date])))}
        # NOTE(review): check upstream that '' never appears as a node id.
        dict_nodes_id[date] = ids
        # Encode the node list itself.
        dict_nodes_list[date] = [ids[node] for node in dict_nodes_list[date]]
        # Encode the edge dictionary: both keys (posters) and the lists of
        # connected accounts in the values.
        dict_original[date] = {ids[key]: [ids[ele] for ele in val]
                               for key, val in dict_original[date].items()}
    return dict_original, dict_nodes_list, dict_nodes_id
| ######################## | |
| # Analysis: Create Graph | |
| ######################## | |
| # Node class | |
# Node class
class Node:
    """Adjacency-list entry: a neighbour's node id plus the edge weight."""
    def __init__(self, node, weight):
        self.node = node      # neighbour's integer node id
        self.weight = weight  # accumulated weight of the edge to that neighbour
    # Custom comparator: order entries by weight (e.g. for a priority queue)
    def __lt__(self, other):
        return self.weight < other.weight

# Function to insert a node in adjacency list
def addEdge(adj, node1, node2, weight):
    """Add an undirected weighted edge between node1 and node2.

    The edge weight accumulates: e.g. if (A, B) was added twice and (B, A)
    three times, both A's and B's adjacency entries end up with weight 5.
    The original implementation detected a missing entry by catching the
    NameError raised by an undefined index variable; this version searches
    explicitly and stops at the first match instead.

    Parameters
    ----------
    adj : list[list[Node]]
        Adjacency list indexed by integer node id; mutated in place.
    node1, node2 : int
        Endpoint node ids.
    weight : int
        Weight to add to the (node1, node2) edge.
    """
    def _accumulate(src, dst):
        # Find dst in src's adjacency list and bump the weight; append a
        # fresh entry when the edge does not exist yet.
        for entry in adj[src]:
            if entry.node == dst:
                entry.weight += weight
                return
        adj[src].append(Node(dst, weight))
    # Record the edge symmetrically so the graph is undirected.
    _accumulate(node1, node2)
    _accumulate(node2, node1)
| #################################### | |
| # Analysis: Create Degree calculator | |
| #################################### | |
| # # Find the number of direct paths between two nodes | |
| # def degree(adj, src, n): | |
| # # Store IDs of all nodes | |
| # ids = [i for i in range(n)] | |
| # # Initializing degree of every vertex as zero | |
| # edges = [0] * (n) | |
| # # Setting degree of source vertext to 1 | |
| # edges[src] = 1 | |
| # for i in range(len(adj[src])): # i will iterate through nodes matched to a given node | |
| # # get the alter node | |
| # to = adj[src][i].node | |
| # # assign weight to edge from a focal node to the alter node | |
| # edges[to] = adj[src][i].weight | |
| # return ids, edges | |
| # Find the number of direct paths between two nodes | |
def degree(adj, src):
    """Return the direct ties of *src* as [[src, neighbour, weight], ...].

    Parameters
    ----------
    adj : list[list[Node]]
        Adjacency list indexed by integer node id.
    src : int
        Focal node id.

    Returns
    -------
    list[list]
        One [src, alter, weight] row per edge incident to src; empty list
        when src has no neighbours.
    """
    return [[src, entry.node, entry.weight] for entry in adj[src]]
| ############################ | |
| # Analysis: Calculate degree | |
| ############################ | |
def get_degree(dict_original, dict_nodes_N, dict_nodes_id):
    """Build the undirected weighted graph per Month-Year and tabulate degrees.

    Parameters
    ----------
    dict_original : dict
        {Year-Month : {poster id : [connected ids]}}, integer-encoded.
    dict_nodes_N : dict
        {Year-Month : number of nodes} — sizes the adjacency list.
    dict_nodes_id : dict
        {Year-Month : {string id : integer id}} — its values are the source
        nodes to iterate.

    Returns
    -------
    dict
        {Year-Month : polars DataFrame with columns From, To, Degree}.
    """
    dict_degree = {}
    for date in tqdm(dict_original.keys()):
        # One empty adjacency bucket per integer node id.
        adj = [[] for _ in range(dict_nodes_N[date])]
        # Each (poster, partner) co-occurrence contributes weight 1;
        # repeated pairs accumulate inside addEdge.
        for poster, partners in dict_original[date].items():
            for partner in partners:
                addEdge(adj, poster, partner, 1)
        # Collect [src, dst, weight] rows for every node's direct ties.
        rows = []
        for src in dict_nodes_id[date].values():
            rows += degree(adj, src)
        dict_degree[date] = pl.DataFrame(rows, schema = ['From', 'To', 'Degree'])
    return dict_degree
| ####################### | |
| # Create string decoder | |
| ####################### | |
| # Need to convert the string ID of node into integer ID to speed up the processing | |
def node_decoder(dict_degree, dict_nodes_id):
    """Map integer node ids in the degree tables back to string ids.

    Parameters
    ----------
    dict_degree : dict
        {Year-Month : polars DataFrame(From, To, Degree)} with integer ids.
    dict_nodes_id : dict
        {Year-Month : {string id : integer id}} produced by node_labeler.

    Returns
    -------
    dict
        {Year-Month : polars DataFrame} with From/To decoded to strings and
        rows containing null ids dropped.
    """
    dict_degree_decode = {}
    for date in tqdm(dict_degree.keys()):
        # Invert the encoder: integer id -> original string id.
        decoder = {code: name for name, code in dict_nodes_id[date].items()}
        # Work on plain columns — faster than decoding inside the frame.
        for column, values in dict_degree[date].to_dict().items():
            if column == 'From':
                From = [decoder[v] for v in values]
            elif column == 'To':
                To = [decoder[v] for v in values]
            elif column == 'Degree':
                # Degree is a plain count; no decoding needed.
                Degree = values
        # Reassemble the frame with explicit dtypes.
        df = pl.DataFrame([From, To, Degree],
                          schema = {'From' : pl.Utf8,
                                    'To' : pl.Utf8,
                                    'Degree' : pl.Int64})
        # Some nodes are poorly encoded; drop rows with null string ids.
        df = df.filter(~pl.any(pl.col(pl.Utf8).is_null()))
        dict_degree_decode[date] = df
    return dict_degree_decode
| ####################### | |
| # Create auto-processer | |
| ####################### | |
def main(dict_original):
    """Run the full degree pipeline on one nested edge dictionary.

    Pipeline: collect nodes -> encode string ids to integers -> build the
    graph and compute degrees -> decode integer ids back to strings.

    Parameters
    ----------
    dict_original : dict
        {Year-Month : {PosterID : [tagged/commenting ids]}}; note it is
        re-keyed in place by the encoding step.

    Returns
    -------
    dict
        {Year-Month : polars DataFrame with columns From, To, Degree}.
    """
    # Create node list and its count by Year-Month
    tqdm.write('Creating nodes')
    dict_nodes_list, dict_nodes_N = get_nodes(dict_original)
    # Convert String ID to Integer ID to speed up the process
    tqdm.write('Encoding nodes')
    dict_original, dict_nodes_list, dict_nodes_id = node_labeler(dict_original, dict_nodes_list)
    # Calculate degree between nodes
    tqdm.write('Calculating degree between nodes')
    dict_degree = get_degree(dict_original, dict_nodes_N, dict_nodes_id)
    # Decode Integer ID to string ID to save file
    tqdm.write('Decoding nodes')
    dict_degree = node_decoder(dict_degree, dict_nodes_id)
    return dict_degree
if __name__ == '__main__':
    ################
    # Data cleaning
    ################
    # Connect to MySQL - DB: Instagram
    # NOTE(review): create_engine comes from the CreateSQL star import --
    # presumably a project wrapper around sqlalchemy's create_engine; confirm.
    connection = create_engine(db = 'organic')
    # Get all the table names in the DB
    tables = """
    SHOW tables;
    """
    table_names = pd.read_sql(tables, connection)
    # # Set date limit to speed up the process and reduce memory usage
    # date = '2018-01-01'
    #############
    # Post Sample
    #############
    # Posts are split across two tables: read the first, append the second.
    for i, table in enumerate(('organicpost1_update', 'organicpost2_update')):
        query = f"""
        SELECT PostID, PosterID, PostYear, PostMonth, PostMention
        FROM {table};
        """
        if i == 0:
            df_org = pd.read_sql(query,
                                 connection)
        if i == 1:
            df_org = pd.concat([df_org,
                                pd.read_sql(query,
                                            connection)])
    ################
    # Comment Sample
    ################
    # Comments for each post table are split again into a 2018-04 table
    # and a "rest" table; read and append all four.
    for i, table in enumerate(('organicpost1_comments', 'organicpost2_comments')):
        # Filter posts: Keep posts where CommenterID exists -- remove posts w.o. comments
        query1 = f"""
        SELECT FileID, PostYear, PostMonth, CommenterID
        FROM {table}_2018_04
        WHERE CommenterID IS NOT NULL;
        """
        query2 = f"""
        SELECT FileID, PostYear, PostMonth, CommenterID
        FROM {table}_rest
        WHERE CommenterID IS NOT NULL;
        """
        if i == 0:
            # create dataframe with comments from 2018-4 from organicpost1 table
            df_cmts = pd.read_sql(query1,
                                  connection)
            # then, append rest of the comments from organicpost1 table
            df_cmts = pd.concat([df_cmts,
                                 pd.read_sql(query2,
                                             connection)])
        if i == 1:
            # append comments from 2018-4 from organicpost2 table
            df_cmts = pd.concat([df_cmts,
                                 pd.read_sql(query1,
                                             connection)])
            # then, append rest of the comments from organicpost2 table
            df_cmts = pd.concat([df_cmts,
                                 pd.read_sql(query2,
                                             connection)])
    connection.dispose()
    # -- Assume commenters have social capital only when at least one of their comments are positive -- #
    ## Commenters that had non-positive comments are less likely to have social capital
    ## Confine to comments that have at least one positive comments as having social capital
    ## Specify the path and file that contains the sentiment data of comments
    path = r'F:\Projects\Brokerage\Insta_Analysis_Ver2\data\dataframe\comments_emotion\influencer_organic_posts'
    file = 'organicpost_comments_emotion.parquet.gzip'
    df_cmts_sc = pd.read_parquet(path + '\\' + file)
    ## Keep commenter's ID with at least one positive comments
    df_cmts_sc = df_cmts_sc.loc[df_cmts_sc['EmotionPositive'] == 1, 'CommenterID'].drop_duplicates()
    df_cmts_sc = df_cmts_sc.to_list()
    ## Keep if commenter's ID appears in the list of commenters with positive comments
    df_cmts = df_cmts.loc[df_cmts['CommenterID'].isin(df_cmts_sc)].reset_index(drop = True)
    ###################
    # Influencer Sample
    ###################
    # Get influencer ID and their first organic post posting date
    path = r'F:\Projects\Brokerage\Insta_Analysis_Ver2\data\dataframe\influencers'
    # NOTE(review): '\i' in this non-raw f-string is not an escape sequence
    # today, but backslash paths are fragile -- consider a raw string or
    # os.path.join; confirm before changing.
    df_inf = pd.read_csv(f'{path}\influencer_first_post_date_all.csv')
    #####################################################
    # Post data: create a list of nodes per node by edges
    #####################################################
    # Change the format of PosterID and lowercase it
    df_org['PosterID'] = df_org['PosterID'].apply(lambda x: '@' + x.lower())
    # Keep necessary columns only
    df_post = df_org.loc[:, ['PosterID', 'PostYear', 'PostMonth', 'PostMention']].copy()
    # Convert string to a list
    df_post['PostMention'] = df_post['PostMention'].apply(lambda x: literal_eval(str(x)))
    # Remove duplicated IDs
    df_post['PostMention'] = df_post['PostMention'].apply(lambda x: list(set([ele.lower() for ele in x])))
    # Edges between the same node does not make sense. Remove it, if exists
    df_post['PostMention'] = df_post.\
        apply(lambda x: [ele for ele in x['PostMention'] if ele != x['PosterID']],\
              axis = 1)
    # Remove posts without mentions
    df_post = df_post.loc[df_post["PostMention"].str.len() != 0, :].copy().reset_index(drop = True)
    # Create a nested dictionary by Year-Month
    ## Dictionary to store the result
    dict_post = {}
    ## For each Year-Month pair
    for year in df_post['PostYear'].unique():
        for month in df_post['PostMonth'].unique():
            ## Get observations per each pair
            df_sub = df_post.loc[(df_post['PostYear'] == year)
                                 & (df_post['PostMonth'] == month), :].copy()
            ## Create dictionary if observations exist
            if df_sub.empty:
                pass
            else:
                # NOTE(review): apply(sum) on a column of lists relies on
                # pandas mapping the builtin sum to its own concatenating
                # reduction -- verify on the pandas version in use.
                dict_sub = df_sub.groupby('PosterID')['PostMention'].apply(sum).to_dict()
                ## Remove empty list from values
                # NOTE(review): ([] or ['']) evaluates to [''], so this only
                # drops [''] -- empty lists [] slip through. Likely intended:
                # val not in ([], ['']).
                dict_sub = {key:val for key, val in dict_sub.items() if val != ([] or [''])}
                dict_post[f'{year}-{month}'] = dict_sub
    del df_sub, dict_sub
    ########################################################
    # Comment data: create a list of nodes per node by edges
    ########################################################
    # Change the format of FileID, and create PostID
    df_cmts['FileID'] = df_cmts['FileID'].apply(lambda x: x.split('-')[1])
    # Keep necessary columns only
    df_cmts = pd.merge(df_cmts.loc[:, ['FileID', 'PostYear', 'PostMonth', 'CommenterID']],
                       df_org.loc[:, ['PostID', 'PosterID']],
                       left_on = 'FileID',
                       right_on = 'PostID',
                       how = 'left')
    ## --- Added Filter here --- ##
    # Filter PosterID : Poster that appeared in the influencer dataframe
    ## Change the format of PosterID and lowercase it
    df_inf['PosterID'] = df_inf['PosterID'].apply(lambda x: '@' + x.lower())
    ## Keep samples that are in Influencer list
    df_cmts = df_cmts.loc[df_cmts['PosterID'].isin(df_inf['PosterID']),
                          :].reset_index(drop = True)
    # change the format of CommenterID and lowercase it
    df_cmts['CommenterID'] = df_cmts['CommenterID'].apply(lambda x: '@' + x.lower())
    # Create a list of Commenters per node
    df_cmts = df_cmts.\
        groupby(by = ['PosterID', 'PostYear', 'PostMonth'])\
        ['CommenterID'].apply(list).reset_index(name='CommenterID')
    # Edges between the same node does not make sense. Remove it, if exists
    df_cmts['CommenterID'] = df_cmts.\
        apply(lambda x: [ele for ele in x['CommenterID'] if ele != x['PosterID']], axis = 1)
    # Create a nested dictionary by Year-Month
    ## Dictionary to store the result
    dict_comment = {}
    ## For each Year-Month pair
    for year in df_cmts['PostYear'].unique():
        for month in df_cmts['PostMonth'].unique():
            ## Get observations per each pair
            df_sub = df_cmts.loc[(df_cmts['PostYear'] == year)
                                 & (df_cmts['PostMonth'] == month), :].copy()
            ## Create dictionary if observations exist
            if df_sub.empty:
                pass
            else:
                dict_sub = dict(zip(df_sub['PosterID'], df_sub['CommenterID']))
                ## Remove empty list from values
                # NOTE(review): same ([] or ['']) pitfall as above -- only
                # [''] is filtered, not [].
                dict_sub = {key : val for key, val in dict_sub.items() if val != ([] or [''])}
                dict_comment[f'{year}-{month}'] = dict_sub
    del df_org, df_cmts, df_sub, dict_sub
    #######################################
    # Merge two dicts of node relationships
    #######################################
    # Dictionary to store the result
    dict_all = {}
    # For each Year-Month pair
    for year in df_post['PostYear'].unique():
        for month in df_post['PostMonth'].unique():
            # first create a key to iterate
            date = f'{year}-{month}'
            # some Year-Month pairs not exist in posts: check if exists
            if date in dict_post:
                dict1 = dict_post[date]
                # Even if posts exist in a given Year-Month pair, comments may not exist
                # pass merging dictionaries in that case
                if date in dict_comment:
                    dict2 = dict_comment[date]
                    dict_all[date] = {key : dict1.get(key,[]) + dict2.get(key,[]) \
                                      for key in set(list(dict1.keys()) + list(dict2.keys()))}
                # When only mentions exist, append values of dict_post
                else:
                    dict_all[date] = dict1
                ## Remove empty list from values
                # NOTE(review): same ([] or ['']) pitfall -- only [''] is filtered.
                dict_all[date] = {key : val for key, val in dict_all[date].items() if val != ([] or [''])}
            else:
                pass
    # NOTE(review): raises NameError if no date matched dict_post/dict_comment
    # above, since dict1/dict2 would be unbound -- confirm this can't happen.
    del dict1, dict2
    #############
    # Run Process
    #############
    dict_degree = main(dict_all)
    ################
    # Export to SQL
    ################
    # One MySQL table per Year-Month in the social-capital database.
    connection = create_engine(db = 'influencer_social_capital')
    for date in tqdm(dict_degree.keys()):
        df = dict_degree[date].to_pandas()
        # replace date format to avoid error in MySQL
        # e.g., 2018-1 > 2018_1
        date = date.replace('-', '_')
        df.to_sql(con = connection,
                  name = f'social_capital_{date}',
                  # NOTE(review): 'replace' drops and recreates the table; the
                  # original comment said "append" -- confirm which is intended
                  # ('append' would be if_exists = 'append').
                  if_exists = 'replace',
                  chunksize = 50000,
                  index = False)
    connection.dispose()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment