Skip to content

Instantly share code, notes, and snippets.

@cjue25
Last active February 8, 2018 08:47
Show Gist options
  • Save cjue25/da4aa5c5ccfd1f92838440d898598506 to your computer and use it in GitHub Desktop.
Using Python for Research_Case Study 6
# Basics of NetworkX
import networkx as nx
G=nx.Graph()
# Add a single node
G.add_node(1)
# Add several nodes at once
G.add_nodes_from([2,3,'u','v'])
#print (G.nodes())
# Add an edge
G.add_edge(1,2)
G.add_edge('u','v')
# Add several edges at once
G.add_edges_from([(1,3),(1,4),(1,5),(1,6)])
G.add_edge("u","w") # endpoints not already in the graph are created automatically
#print (G.edges())
# Removal of nodes and edges
G.remove_node(2)
G.remove_nodes_from([4,5])
G.remove_edge(1,3)
G.remove_edges_from([(1,2),('u','v')])
# Count nodes and edges
G.number_of_nodes()
G.number_of_edges()
## Draw a graph
import matplotlib.pyplot as plt
G=nx.karate_club_graph()
nx.draw(G, with_labels=True, node_color="lightblue", edge_color="gray")
plt.savefig('karate_graph.png')
# G.degree() is a DegreeView, a dict-like view: key = node, value = degree
G.degree()
# Two equivalent ways to look up one node's degree
G.degree(33)
G.degree()[33]
from scipy.stats import bernoulli
from itertools import combinations
## Erdos-Renyi random graph
# p is a probability between 0 and 1; bernoulli.rvs(p=p) returns 1 with
# probability p and 0 otherwise, so a larger p produces more edges.
def er_graph(N, p):
    """Return an Erdos-Renyi G(N, p) random graph.

    N -- number of nodes, labelled 0 .. N-1
    p -- independent probability that any given pair of nodes is joined
    """
    G = nx.Graph()
    G.add_nodes_from(range(N))
    # combinations() visits each unordered pair exactly once (same order as
    # the original node1 < node2 double loop), so no edge is drawn twice.
    for node1, node2 in combinations(G.nodes(), 2):
        if bernoulli.rvs(p=p):  # 1 is truthy: add the edge with probability p
            G.add_edge(node1, node2)
    return G
# Draw one sample ER graph and save it to disk
nx.draw(er_graph(50,0.08),node_size=40, node_color="gray")
plt.savefig('er_graph1.png')
def plot_degree_distribution(G):
    """Overlay a step-style histogram of G's degree distribution on the current axes."""
    degrees = [deg for _, deg in G.degree()]
    plt.hist(degrees, histtype="step")
    plt.xlabel("Degree $k$")
    plt.ylabel("$P(k)$")
    plt.title("Degree distribution")
# Overlay the degree distributions of three independent ER graphs
G1=er_graph(500,0.08)
plot_degree_distribution(G1)
G2=er_graph(500,0.08)
plot_degree_distribution(G2)
G3=er_graph(500,0.08)
plot_degree_distribution(G3)
plt.savefig('hist_3.png')
import numpy as np
# Load the village social-network adjacency matrices and convert them to graphs
A1=np.loadtxt("adj_allVillageRelationships_vilno_1.csv", delimiter=",")
A2=np.loadtxt("adj_allVillageRelationships_vilno_2.csv", delimiter=",")
G1=nx.to_networkx_graph(A1)
G2=nx.to_networkx_graph(A2)
def basic_net_stats(G):
    """Print the node count, edge count, and mean degree of graph G."""
    print("Number of nodes: %d" % G.number_of_nodes())
    print("Number of edges: %d" % G.number_of_edges())
    mean_degree = np.mean([deg for _, deg in G.degree()])
    print("Average degree: %.2f" % mean_degree)
basic_net_stats(G1)
basic_net_stats(G2)
plot_degree_distribution(G1)
plot_degree_distribution(G2)
plt.savefig("village_hist.png")
## nx.connected_components(G) yields the node set of each component;
## max(..., key=len) selects the largest, and G.subgraph(...) builds its
## induced subgraph. (The original nx.connected_component_subgraphs was
## deprecated in NetworkX 2.1 and removed in 2.4.)
G1_LCC = G1.subgraph(max(nx.connected_components(G1), key=len))
G2_LCC = G2.subgraph(max(nx.connected_components(G2), key=len))
# Fraction of each village that lies in its largest connected component
fraction_G1 = G1_LCC.number_of_nodes() / G1.number_of_nodes()
fraction_G2 = G2_LCC.number_of_nodes() / G2.number_of_nodes()
plt.figure()
nx.draw(G1_LCC, node_color='red', edge_color='gray', node_size=20)
plt.savefig('village1.png')
plt.figure()
nx.draw(G2_LCC, node_color='green', edge_color='gray', node_size=20)
plt.savefig('village2.png')
"""
data_filepath ='https://s3.amazonaws.com/assets.datacamp.com/production/course_974/datasets/'
"""
from collections import Counter

def frequency(chars):
    """Return a Counter mapping each value in the dict chars to its count.

    Bug fix: the original returned Counter(favorite_colors.values()),
    counting a global dict instead of the chars argument, so any call with
    a different dict (sex1, caste1, ...) gave wrong results.
    """
    return Counter(chars.values())

def chance_homophily(chars):
    """Return the chance homophily of the characteristic dict chars.

    This is the probability that two randomly (independently) chosen
    individuals share the same characteristic: sum of (count/total)**2
    over the distinct characteristic values.
    """
    counts = frequency(chars)
    total = sum(counts.values())  # hoisted: was recomputed inside the loop
    return sum((c / total) ** 2 for c in counts.values())
# Sanity check: three people — one "red", two "blue" —
# so chance homophily should be (1/3)**2 + (2/3)**2 = 5/9.
favorite_colors = {
    "ankit": "red",
    "xiaoyu": "blue",
    "mary": "blue",
}
color_homophily = chance_homophily(favorite_colors)
print(color_homophily)
import pandas as pd
# Individual characteristics for all villages; keep villages 1 and 2
df = pd.read_stata(data_filepath + "individual_characteristics.dta")
df1 = df[df.village==1]
df2 = df[df.village==2]
df1.head()
# Map each person id (pid) to sex, caste, and religion
sex1 = dict(zip(df1.pid.values, df1.resp_gend))
caste1 = dict(zip(df1.pid.values, df1.caste))
religion1 = dict(zip(df1.pid.values, df1.religion))
# Continue for df2 as well.
sex2 = dict(zip(df2.pid.values, df2.resp_gend))
caste2 = dict(zip(df2.pid.values, df2.caste))
religion2 = dict(zip(df2.pid.values, df2.religion))
### Alternative approach: build the same dicts via set_index/to_dict ###
sex1 = df1.set_index("pid")["resp_gend"].to_dict()
caste1 = df1.set_index("pid")["caste"].to_dict()
religion1 = df1.set_index("pid")["religion"].to_dict()
sex2 = df2.set_index("pid")["resp_gend"].to_dict()
caste2 = df2.set_index("pid")["caste"].to_dict()
religion2 = df2.set_index("pid")["religion"].to_dict()
# Baseline (chance) homophily for each characteristic in each village
print("Village 1 chance of same sex:", chance_homophily(sex1))
print("Village 1 chance of same caste:", chance_homophily(caste1))
print("Village 1 chance of same religion:", chance_homophily(religion1))
print("Village 2 chance of same sex:", chance_homophily(sex2))
print("Village 2 chance of same caste:", chance_homophily(caste2))
print("Village 2 chance of same religion:", chance_homophily(religion2))
def homophily(G, chars, IDs):
    """
    Given a network G, a dict of characteristics chars for node IDs,
    and dict of node IDs for each node in the network,
    find the homophily of the network.

    Returns the fraction of edges whose two endpoints share the same
    characteristic, counting only edges where both endpoints have a
    known characteristic. Raises ZeroDivisionError if no edge has both
    endpoints in chars.
    """
    num_same_ties = 0
    num_ties = 0
    for n1, n2 in G.edges():
        # Skip edges where either endpoint's characteristic is unknown.
        if IDs[n1] in chars and IDs[n2] in chars:
            # (n1, n2) came from G.edges(), so the original
            # G.has_edge(n1, n2) guard was always true and is dropped.
            num_ties += 1
            if chars[IDs[n1]] == chars[IDs[n2]]:
                num_same_ties += 1
    return num_same_ties / num_ties
# The raw key files have no header row, so read them with header=None
pid1=pd.read_csv(data_filepath +"key_vilno_1.csv",header=None)
pid2=pd.read_csv(data_filepath +"key_vilno_2.csv",header=None)
# Observed same-characteristic edge fractions, to compare against the
# chance-homophily baselines printed above
print("Village 1 observed proportion of same sex:", homophily(G1, sex1, pid1))
print("Village 1 observed proportion of same caste:", homophily(G1, caste1, pid1))
print("Village 1 observed proportion of same religion:", homophily(G1, religion1, pid1))
print("Village 2 observed proportion of same sex:", homophily(G2, sex2, pid2))
print("Village 2 observed proportion of same caste:", homophily(G2, caste2, pid2))
print("Village 2 observed proportion of same religion:", homophily(G2, religion2, pid2))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment