# Created June 16, 2024 21:18
# Draw Network and Measure Centrality
import pandas as pd
import numpy as np
import os
import networkx as nx
from itertools import combinations, chain
# 1. Matrix drawing
#    1) Import venture-investor pair data
#    2) Draw a matrix
#       - ventures as groups / investors as the nodes of each group
#       - columns: Source | Target | Type | Weight | Time Interval (e.g., <[2000, 2002]>)
# 2. Investor identification
#    1) Import CVC investor dummy data
#    2) Save investor-dummy columns to export to Gephi
# CVC investor dummy data; the centrality measures are merged onto it at the end.
df_cvc = pd.read_csv(
    'D:/venturexpert/t-test/t-test.csv',
)
# Venture-investor pair data; the network edges are derived from these rows.
df_pair = pd.read_csv(
    'D:/venturexpert/raw_data/vc/SDC/round_entity_matcher/vx_round_matcher.csv',
)
# 1. Matrix drawing
## Keep only necessary variables
df_pair = df_pair.loc[:, ['venture_name', 'investor_name', 'round_date']]
## Drop rows with a missing venture, investor or date: they cannot be placed
## in a venture-year group, and a missing investor would otherwise become an
## edge with an empty endpoint
df_pair = df_pair.dropna(subset=['venture_name', 'investor_name', 'round_date'])
## Keep only disclosed ventures and disclosed investors
undisclosed_investors = [
    'Undisclosed Firm',
    'Undisclosed European Investors',
    'Undisclosed Corporate Investor Jamaica',
]
is_disclosed = (
    (df_pair['venture_name'] != 'Undisclosed Company')
    & ~df_pair['investor_name'].isin(undisclosed_investors)
)
## .copy() so the column assignment below writes to an independent frame
df_pair = df_pair.loc[is_disclosed].copy()
## Convert round_date to year
df_pair['round_date'] = pd.to_datetime(df_pair['round_date']).dt.year
## Change column names
df_pair = df_pair.rename(columns={'venture_name': 'group', 'investor_name': 'node'})

## Convert to source node-target node matrix
## One row per venture-year holding its investors. The investors are
## de-duplicated and sorted so that (a) an investor appearing in several rounds
## of the same venture-year does not create a self-loop or inflate the pair
## count and (b) an undirected pair is always written in one orientation,
## never as both (A, B) and (B, A).
df_list = (
    df_pair.groupby(by=['group', 'round_date'])['node']
    .apply(lambda names: sorted(set(names)))
    .reset_index()
)
## Every unordered pair of co-investors; a venture-year with a single investor
## yields an empty list, which explode() turns into a missing value
df_list['node'] = df_list['node'].apply(lambda names: list(combinations(names, 2)))
df_list = df_list.explode('node').dropna(subset=['node']).reset_index(drop=True)
## Split each (source, target) tuple into its own columns
df_list['Source'] = [pair[0] for pair in df_list['node']]
df_list['Target'] = [pair[1] for pair in df_list['node']]
df_list = df_list.drop(columns=['group', 'node'])

## Convert round_date to the time-interval format <[2000, 2002]>
df_list['time start'] = df_list['round_date']
df_list['time end'] = df_list['time start'] + 2
df_list = df_list.drop(columns=['round_date'])
## Type: Undirected
df_list['Type'] = 'Undirected'
## Weight: number of duplicates, i.e. how many venture-years produced the same
## edge. Count one explicit column so the result is a Series regardless of
## which other columns the frame happens to carry.
edge_keys = ['Source', 'Target', 'time start', 'time end']
df_list['Weight'] = df_list.groupby(by=edge_keys)['Type'].transform('count')
## Remove duplicates
df_list = df_list.drop_duplicates().reset_index(drop=True)
df_list.to_csv('matrix.csv', index=False)
# 2. Centrality
# For each three-year span (t ~ t+2)
# Latest window: 2018-2020
# Set the range of t from 1980 to 2018 so that t+2 could be until 2020
# df_all accumulates one block of centrality columns per window, keyed by the
# node name held in the 'index' column
df_all = None
for i in range(1980, 2019):
    # Get the edges whose start year falls in the window t ~ t+2
    df_t = df_list.loc[(df_list['time start'] >= i) & (df_list['time start'] <= i + 2), :]
    # A window without edges has no network to measure, and eigenvector
    # centrality raises on an empty graph, so skip the window instead of
    # aborting the whole run
    if df_t.empty:
        continue
    # Build the undirected network for this window
    G = nx.from_pandas_edgelist(df_t, 'Source', 'Target', ['Weight'])
    # Get centralities (each call returns a dict keyed by node)
    dict_deg = nx.degree_centrality(G)
    dict_btw = nx.betweenness_centrality(G)
    dict_egn = nx.eigenvector_centrality(G)
    dict_clo = nx.closeness_centrality(G)
    # Convert centralities to a dataframe; reset_index moves the node names
    # into a column called 'index'
    df_t = pd.DataFrame({
        'degree': pd.Series(dict_deg),
        'between': pd.Series(dict_btw),
        'eigen': pd.Series(dict_egn),
        'closeness': pd.Series(dict_clo),
    }).reset_index()
    # Convert column names to {centrality}_{year}
    df_t.columns = [x + f'_{i}' if x != 'index' else x for x in df_t.columns]
    # Merge database: the first non-empty window seeds df_all, later windows
    # are outer-joined so nodes absent from a window keep missing values.
    # Merging the seed with itself would duplicate its columns with _x/_y
    # suffixes, hence the explicit else.
    if df_all is None:
        df_all = df_t
    else:
        df_all = pd.merge(df_all, df_t, on='index', how='outer')
# No window had any edge: keep the key column so later steps can still select on it
if df_all is None:
    df_all = pd.DataFrame(columns=['index'])
df_all.to_csv('centrality.csv', index=False)
# 3. Match centrality with original dataset
df_org = df_cvc.loc[:, ['investor_name', 'first_investment_date']].copy()
# One re-labelled centrality frame per matched investor; concatenated once
# after the loop so the result does not depend on which investor comes first
matched_frames = []
# iterate through all investors
for df_unit in df_org.itertuples(index=False):
    # extract the centrality row(s) for this investor
    df_sub = df_all.loc[df_all['index'] == df_unit.investor_name].copy()
    # df_all and df_org cover different investors/time frames,
    # so skip investors that never appear in the network
    if df_sub.empty:
        continue
    # without a first investment year there is no reference point for t
    if pd.isna(df_unit.first_investment_date):
        continue
    # NOTE(review): assumes first_investment_date is a numeric calendar year
    # (the original code subtracts it from the year in the column name) - confirm
    first_year = int(df_unit.first_investment_date)
    # convert column name {centrality}_{year} to {centrality}_{t}, where t is
    # the number of years since the first investment; columns with negative t
    # (e.g., degree_-5) are dropped
    relative_names = {}
    for col in df_sub.columns:
        if col == 'index':
            continue
        measure, year = col.split('_')[:2]
        offset = int(year) - first_year
        if offset >= 0:
            relative_names[col] = measure + '_' + str(offset)
    df_sub = df_sub.loc[:, ['index', *relative_names]].rename(columns=relative_names)
    matched_frames.append(df_sub)
# concat dataframes
if matched_frames:
    df_con = pd.concat(matched_frames)
else:
    # no investor matched: keep the key column so the later merge still works
    df_con = pd.DataFrame(columns=['index'])
# drop repeated rows and reset index
df_con = df_con.drop_duplicates().reset_index(drop=True)
# Change column name: index to investor name
df_con = df_con.rename(columns={'index': 'investor_name'})
# Merge dataframe
# Left join: every row of df_cvc is kept, and investors without a match in
# df_con get missing values in the centrality columns
df_cvc = df_cvc.merge(df_con, how='left', on='investor_name')
# Write the combined dataset without the row index
df_cvc.to_csv(path_or_buf='centrality.csv', index=False)
