# Draw Network and Measure Centrality
import pandas as pd
import numpy as np
import os
import networkx as nx
from itertools import combinations, chain
"""
Process
1. Matrix drawing
1) Import venture-investor pair data
2) Draw a matrix
- venture as a group / investors as nodes of each group
Source | Target | Type | Weight | Time Interval (e.g., <[2000, 2002]>)
2. Investor identification
1) Import CVC investor dummy data
2) Save investor-dummy columns to export to Gephi
"""
df_pair = pd.read_csv('D:/venturexpert/raw_data/vc/SDC/round_entity_matcher/vx_round_matcher.csv')
df_cvc = pd.read_csv('D:/venturexpert/t-test/t-test.csv')
"""
1. Matrix drawing
"""
## Keep only necessary variables
df_pair = df_pair.loc[:, ['venture_name', 'investor_name', 'round_date']].copy()
## Keep only disclosed ventures
df_pair = df_pair.loc[df_pair['venture_name'] != 'Undisclosed Company']
## Keep only disclosed investors
df_pair = df_pair.loc[(df_pair['investor_name'] != 'Undisclosed Firm') &
                      (df_pair['investor_name'] != 'Undisclosed European Investors') &
                      (df_pair['investor_name'] != 'Undisclosed Corporate Investor Jamaica')]
## Convert round_date to year
df_pair['round_date'] = pd.to_datetime(df_pair['round_date']).dt.year
## Change column names
df_pair = df_pair.rename(columns = {'venture_name': 'group', 'investor_name': 'node'})
## Convert to source node-target node matrix
df_list = df_pair.groupby(by = ['group', 'round_date'])['node'].apply(list).reset_index()
## All pairwise combinations of co-investors within each (venture, year) group
df_list['node'] = df_list['node'].apply(lambda x: list(combinations(x, 2)))
df_list = df_list.explode('node')
## Drop single-investor groups (their empty pair lists explode to NaN)
df_list = df_list.dropna().reset_index(drop = True)
## Split the (Source, Target) tuples into separate columns
df_st = pd.DataFrame(data = list(df_list['node']), columns = ['Source', 'Target'])
df_list = pd.merge(df_list, df_st, left_index = True, right_index = True)
df_list = df_list.drop(columns = ['group', 'node'])
## Convert round_date to a time-interval format <[2000, 2002]>
df_list['time start'] = df_list['round_date']
df_list['time end'] = df_list['time start'] + 2
df_list = df_list.drop(columns = ['round_date'])
## Type: Undirected
df_list['Type'] = 'Undirected'
## Weight: number of duplicate edges within the same window
df_list['Weight'] = df_list.groupby(by = ['Source', 'Target', 'time start', 'time end'])['Type'].transform('count')
## Remove duplicates
df_list = df_list.drop_duplicates().reset_index(drop = True)
df_list.to_csv('matrix.csv', index = False)
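# Illustrative check of the weighting step above (hypothetical rows): two
# identical Source-Target rows in the same window collapse into one edge
# carrying Weight 2 once drop_duplicates() runs.
_demo_edges_df = pd.DataFrame({'Source': ['VC_A', 'VC_A'], 'Target': ['VC_B', 'VC_B'],
                               'time start': [2000, 2000], 'time end': [2002, 2002],
                               'Type': ['Undirected', 'Undirected']})
_demo_edges_df['Weight'] = _demo_edges_df.groupby(['Source', 'Target', 'time start', 'time end'])['Type'].transform('count')
# _demo_edges_df['Weight'] -> [2, 2]; drop_duplicates() then keeps a single row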
"""
2. Centrality
For three year span (t ~ t+2)
Latest: 2018-2020
"""
# Set the range of t from 1980 to 2018 so that t+2 runs through 2020
for i in range(1980, 2019):
    # Get the edges whose window falls within t ~ t+2
    df_t = df_list.loc[(df_list['time start'] >= i) & (df_list['time start'] <= i+2), :]
    # Build the network from the windowed edge list
    G = nx.from_pandas_edgelist(df_t, 'Source', 'Target', ['Weight'])
    # Get centralities (note: eigenvector centrality can raise
    # PowerIterationFailedConvergence on sparse early-year networks)
    dict_deg = nx.degree_centrality(G)
    dict_btw = nx.betweenness_centrality(G)
    dict_egn = nx.eigenvector_centrality(G)
    dict_clo = nx.closeness_centrality(G)
    # Convert centralities to a dataframe and reset index
    df_t = pd.DataFrame({'degree': pd.Series(dict_deg), 'between': pd.Series(dict_btw),
                         'eigen': pd.Series(dict_egn), 'closeness': pd.Series(dict_clo)})
    df_t = df_t.reset_index()
    # Rename columns to {centrality}_{year}
    df_t.columns = [x + f'_{i}' if x != 'index' else x for x in df_t.columns]
    # Merge into the running database
    if i == 1980:
        df_all = df_t.copy()
    else:
        df_all = pd.merge(df_all, df_t, on = 'index', how = 'outer')

df_all.to_csv('centrality.csv', index = False)
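# A minimal sanity check (illustrative only) of the centrality calls above,
# run on a hypothetical 4-node path graph: each measure returns a
# {node: score} dict, which is why pd.Series() turns them into columns.
_demo_G = nx.path_graph(4)
# nx.degree_centrality(_demo_G)      -> {0: 0.333.., 1: 0.666.., 2: 0.666.., 3: 0.333..}
# nx.betweenness_centrality(_demo_G) -> {0: 0.0, 1: 0.666.., 2: 0.666.., 3: 0.0}
_demo_scores = pd.DataFrame({'degree': pd.Series(nx.degree_centrality(_demo_G)),
                             'between': pd.Series(nx.betweenness_centrality(_demo_G))})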
"""
3. Match centrality with original dataset
"""
df_org = df_cvc.loc[:, ['investor_name', 'first_investment_date']].copy()
# iterate through all investors, collecting each one's centrality rows
sub_frames = []
for i in df_org.index:
    # get each investor's row
    df_unit = df_org.loc[i, :]
    # extract centrality variables for the investor identified above
    df_sub = df_all.loc[df_all['index'] == df_unit['investor_name']].copy()
    # df_all covers a different time frame than df_org and has more observations,
    # so skip investors with no centrality record
    if df_sub.empty:
        continue
    # rename columns to {centrality}_{t}, where t is the year relative to the
    # investor's first investment (e.g., degree_2001 -> degree_1 if the first year is 2000)
    df_sub.columns = [x.split('_')[0] + '_' + str(int(x.split('_')[1]) - df_unit['first_investment_date'])
                      if x != 'index' else x for x in df_sub.columns]
    sub_frames.append(df_sub)
# concat the per-investor dataframes (safer than branching on i == 0,
# which breaks if the first investor has no centrality record)
df_con = pd.concat(sub_frames)
# drop relative-year columns with negative t (e.g., degree_-5), i.e., years
# before the investor's first investment
df_con = df_con.loc[:, df_con.columns.drop(list(df_con.filter(regex = '-')))]
# remove duplicates and reset index
df_con = df_con.drop_duplicates().reset_index(drop = True)
# Change column name: index to investor name
df_con = df_con.rename(columns = {'index': 'investor_name'})
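# A minimal sketch of the relative-year renaming above (hypothetical values):
# for an investor whose first_investment_date is 2000, degree_2000 maps to
# degree_0, degree_2001 to degree_1, and degree_1995 to degree_-5, which the
# regex filter above then drops.
_demo_cols = ['index', 'degree_1995', 'degree_2000', 'degree_2001']
_demo_first = 2000
_demo_renamed = [c.split('_')[0] + '_' + str(int(c.split('_')[1]) - _demo_first)
                 if c != 'index' else c for c in _demo_cols]
# _demo_renamed == ['index', 'degree_-5', 'degree_0', 'degree_1']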
"""
merge dataframe
"""
df_cvc = pd.merge(df_cvc, df_con, on = 'investor_name', how = 'left')
# write under a distinct name so this does not overwrite the raw centrality
# file saved above
df_cvc.to_csv('centrality_matched.csv', index = False)