Draw Network and Measure Centrality
import pandas as pd
import networkx as nx
from itertools import combinations
""" | |
Process | |
1. Matrix drawing | |
1) Import venture-investor pair data | |
2) Draw a matrix | |
- venture as a group / investors as nodes of each group | |
Source | Target | Type | Weight | Time Interval (e.g., <[2000, 2002]>) | |
2. Investor identification | |
1) Import CVC investor dummy data | |
2) Save investor-dummy columns to export to Gephi | |
""" | |
df_pair = pd.read_csv('D:/venturexpert/raw_data/vc/SDC/round_entity_matcher/vx_round_matcher.csv')
df_cvc = pd.read_csv('D:/venturexpert/t-test/t-test.csv')

"""
1. Matrix drawing
"""
## Keep only necessary variables
df_pair = df_pair.loc[:, ['venture_name', 'investor_name', 'round_date']].copy()
## Keep only disclosed ventures
df_pair = df_pair.loc[df_pair['venture_name'] != 'Undisclosed Company']
## Keep only disclosed investors
df_pair = df_pair.loc[(df_pair['investor_name'] != 'Undisclosed Firm') &
                      (df_pair['investor_name'] != 'Undisclosed European Investors') &
                      (df_pair['investor_name'] != 'Undisclosed Corporate Investor Jamaica')]
## Convert round_date to year
df_pair['round_date'] = pd.DatetimeIndex(df_pair['round_date']).year
## Change column names
df_pair = df_pair.rename(columns = {'venture_name': 'group', 'investor_name': 'node'})
## Convert to a source-target edge list
df_list = df_pair.groupby(by = ['group', 'round_date'])['node'].apply(list).reset_index()
## Pair every two investors that backed the same venture in the same year
df_list['node'] = df_list['node'].apply(lambda x: list(combinations(x, 2)))
df_list = df_list.explode('node')
df_list = df_list.dropna().reset_index(drop = True)
## Split the (Source, Target) tuples into two columns and merge back on the row index
df_st = pd.DataFrame(data = list(df_list['node']), columns = ['Source', 'Target'])
df_list = pd.merge(df_list, df_st, left_index = True, right_index = True)
df_list = df_list.drop(columns = ['group', 'node'])
## Convert round_date to Gephi time interval format (e.g., <[2000, 2002]>)
df_list['time start'] = df_list['round_date']
df_list['time end'] = df_list['time start'] + 2
df_list = df_list.drop(columns = ['round_date'])
## Type: Undirected
df_list['Type'] = 'Undirected'
## Weight: number of duplicate edges within the same time interval
df_list['Weight'] = df_list.groupby(by = ['Source', 'Target', 'time start', 'time end'])['Type'].transform('count')
## Remove duplicates
df_list = df_list.drop_duplicates().reset_index(drop = True)
df_list.to_csv('matrix.csv', index = False)
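## For reference, one matrix.csv row looks like this (values made up):
##   Source    | Target    | time start | time end | Type       | Weight
##   Investor1 | Investor2 | 2000       | 2002     | Undirected | 3
## In Gephi, 'time start' and 'time end' can be merged into a time interval
## column of the form <[2000, 2002]> via Data Laboratory > Merge columns.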
""" | |
2. Centrality | |
For three year span (t ~ t+2) | |
Latest: 2018-2020 | |
""" | |
# Set the range of t from 1980 to 2018 so that t+2 runs through 2020
for i in range(1980, 2019):
    # Get the edges that fall within the t ~ t+2 window
    df_t = df_list.loc[(df_list['time start'] >= i) & (df_list['time start'] <= i+2), :]
    # Build the network
    G = nx.from_pandas_edgelist(df_t, 'Source', 'Target', ['Weight'])
    # Get centralities
    dict_deg = nx.degree_centrality(G)
    dict_btw = nx.betweenness_centrality(G)
    # Raise max_iter so the power iteration converges on sparse yearly networks
    dict_egn = nx.eigenvector_centrality(G, max_iter = 1000)
    dict_clo = nx.closeness_centrality(G)
    # Convert centralities to a dataframe and reset the index
    df_t = pd.DataFrame({'degree': pd.Series(dict_deg), 'between': pd.Series(dict_btw), 'eigen': pd.Series(dict_egn), 'closeness': pd.Series(dict_clo)})
    df_t = df_t.reset_index()
    # Convert column names to {centrality}_{year}
    df_t.columns = [x + f'_{i}' if x != 'index' else x for x in df_t.columns]
    # Merge databases across years
    if i == 1980:
        df_all = df_t.copy()
    else:
        df_all = pd.merge(df_all, df_t, on = 'index', how = 'outer')
df_all.to_csv('centrality.csv', index = False)
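## centrality.csv is wide: one row per investor ('index') and one column per
## centrality-year pair, e.g. (values made up):
##   index     | degree_1980 | between_1980 | eigen_1980 | closeness_1980 | degree_1981 | ...
##   Investor1 | 0.25        | 0.10         | 0.43       | 0.55           | NaN         | ...
## NaN marks windows in which the investor does not appear in the network.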
""" | |
3. Match centrality with original dataset | |
""" | |
df_org = df_cvc.loc[:, ['investor_name', 'first_investment_date']].copy()
# iterate through all investors and collect each investor's centrality row
df_sub_list = []
for i in df_org.index:
    # get each investor's row
    df_unit = df_org.loc[i, :]
    # extract the centrality variables for the investor identified above
    df_sub = df_all.loc[df_all['index'] == df_unit['investor_name']].copy()
    # df_all covers a different time frame and has more observations than
    # df_org, so some investors have no centrality row; skip those
    if df_sub.empty:
        continue
    # convert column names to {centrality}_{t}, where t is the year relative
    # to the investor's first investment (first_investment_date is a year)
    df_sub.columns = [x.split('_')[0] + '_' + str(int(x.split('_')[1]) - df_unit['first_investment_date'])
                      if x != 'index' else x for x in df_sub.columns]
    df_sub_list.append(df_sub)
# concat the per-investor dataframes in one pass
df_con = pd.concat(df_sub_list)
# filter and drop column names containing - (e.g., degree_-5)
df_con = df_con.loc[:, df_con.columns.drop(list(df_con.filter(regex = '-')))]
# remove duplicates and reset the index
df_con = df_con.drop_duplicates().reset_index(drop = True)
# Change column name: index to investor_name
df_con = df_con.rename(columns = {'index': 'investor_name'})
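## A made-up example of the relative-year renaming above: for an investor with
## first_investment_date 1995, 'degree_1995' becomes 'degree_0' and
## 'degree_1998' becomes 'degree_3', while 'degree_1990' (renamed 'degree_-5')
## is dropped by the regex filter.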
""" | |
merge dataframe | |
""" | |
df_cvc = pd.merge(df_cvc, df_con, on = 'investor_name', how = 'left') | |
df_cvc.to_csv('centrality.csv', index = False) |