Skip to content

Instantly share code, notes, and snippets.

@cjue25
Last active February 8, 2018 08:47
Show Gist options
  • Save cjue25/da4aa5c5ccfd1f92838440d898598506 to your computer and use it in GitHub Desktop.
Using Python for Research_Case Study 6
# Basics of NetworkX
import networkx as nx
G=nx.Graph()
# Add a single node
G.add_node(1)
# Add several nodes at once
G.add_nodes_from([2,3,'u','v'])
#print (G.nodes())
# Add an edge
G.add_edge(1,2)
G.add_edge('u','v')
# Add several edges at once
G.add_edges_from([(1,3),(1,4),(1,5),(1,6)])
G.add_edge("u","w") # endpoints not already in the graph are created automatically
#print (G.edges())
# Removal of nodes and edges
G.remove_node(2)
G.remove_nodes_from([4,5])
G.remove_edge(1,3)
G.remove_edges_from([(1,2),('u','v')])
# Count nodes and edges
G.number_of_nodes()
G.number_of_edges()
## Draw a graph
import matplotlib.pyplot as plt
G=nx.karate_club_graph()
nx.draw(G, with_labels=True, node_color="lightblue", edge_color="gray")
plt.savefig('karate_graph.png')
# G.degree() is a DegreeView, a dict-like view: key = node, value = degree
G.degree()
# Two equivalent ways to look up one node's degree
G.degree(33)
G.degree()[33]
from scipy.stats import bernoulli
from itertools import combinations
## Erdos-Renyi random graph
# p is a probability between 0 and 1; bernoulli.rvs(p=p) returns 1 with
# probability p and 0 otherwise, so a larger p produces more edges.
def er_graph(N, p):
    """Return an Erdos-Renyi G(N, p) random graph.

    N -- number of nodes, labelled 0 .. N-1
    p -- independent probability that any given pair of nodes is joined
    """
    G = nx.Graph()
    G.add_nodes_from(range(N))
    # combinations() visits each unordered pair exactly once (same order as
    # the original node1 < node2 double loop), so no edge is drawn twice.
    for node1, node2 in combinations(G.nodes(), 2):
        if bernoulli.rvs(p=p):  # 1 is truthy: add the edge with probability p
            G.add_edge(node1, node2)
    return G
# Draw one sample ER graph and save it to disk
nx.draw(er_graph(50,0.08),node_size=40, node_color="gray")
plt.savefig('er_graph1.png')
def plot_degree_distribution(G):
    """Overlay a step-style histogram of G's degree distribution on the current axes."""
    degrees = [deg for _, deg in G.degree()]
    plt.hist(degrees, histtype="step")
    plt.xlabel("Degree $k$")
    plt.ylabel("$P(k)$")
    plt.title("Degree distribution")
# Overlay the degree distributions of three independent ER graphs
G1=er_graph(500,0.08)
plot_degree_distribution(G1)
G2=er_graph(500,0.08)
plot_degree_distribution(G2)
G3=er_graph(500,0.08)
plot_degree_distribution(G3)
plt.savefig('hist_3.png')
import numpy as np
# Load the village social-network adjacency matrices and convert them to graphs
A1=np.loadtxt("adj_allVillageRelationships_vilno_1.csv", delimiter=",")
A2=np.loadtxt("adj_allVillageRelationships_vilno_2.csv", delimiter=",")
G1=nx.to_networkx_graph(A1)
G2=nx.to_networkx_graph(A2)
def basic_net_stats(G):
    """Print the node count, edge count, and mean degree of graph G."""
    print("Number of nodes: %d" % G.number_of_nodes())
    print("Number of edges: %d" % G.number_of_edges())
    mean_degree = np.mean([deg for _, deg in G.degree()])
    print("Average degree: %.2f" % mean_degree)
basic_net_stats(G1)
basic_net_stats(G2)
plot_degree_distribution(G1)
plot_degree_distribution(G2)
plt.savefig("village_hist.png")
## nx.connected_components(G) yields the node set of each component;
## max(..., key=len) selects the largest, and G.subgraph(...) builds its
## induced subgraph. (The original nx.connected_component_subgraphs was
## deprecated in NetworkX 2.1 and removed in 2.4.)
G1_LCC = G1.subgraph(max(nx.connected_components(G1), key=len))
G2_LCC = G2.subgraph(max(nx.connected_components(G2), key=len))
# Fraction of each village that lies in its largest connected component
fraction_G1 = G1_LCC.number_of_nodes() / G1.number_of_nodes()
fraction_G2 = G2_LCC.number_of_nodes() / G2.number_of_nodes()
plt.figure()
nx.draw(G1_LCC, node_color='red', edge_color='gray', node_size=20)
plt.savefig('village1.png')
plt.figure()
nx.draw(G2_LCC, node_color='green', edge_color='gray', node_size=20)
plt.savefig('village2.png')
"""
data_filepath ='https://s3.amazonaws.com/assets.datacamp.com/production/course_974/datasets/'
"""
from collections import Counter

def frequency(chars):
    """Return a Counter mapping each value in the dict chars to its count.

    Bug fix: the original returned Counter(favorite_colors.values()),
    counting a global dict instead of the chars argument, so any call with
    a different dict (sex1, caste1, ...) gave wrong results.
    """
    return Counter(chars.values())

def chance_homophily(chars):
    """Return the chance homophily of the characteristic dict chars.

    This is the probability that two randomly (independently) chosen
    individuals share the same characteristic: sum of (count/total)**2
    over the distinct characteristic values.
    """
    counts = frequency(chars)
    total = sum(counts.values())  # hoisted: was recomputed inside the loop
    return sum((c / total) ** 2 for c in counts.values())
# Sanity check: three people — one "red", two "blue" —
# so chance homophily should be (1/3)**2 + (2/3)**2 = 5/9.
favorite_colors = {
    "ankit": "red",
    "xiaoyu": "blue",
    "mary": "blue",
}
color_homophily = chance_homophily(favorite_colors)
print(color_homophily)
import pandas as pd
# Individual characteristics for all villages; keep villages 1 and 2
df = pd.read_stata(data_filepath + "individual_characteristics.dta")
df1 = df[df.village==1]
df2 = df[df.village==2]
df1.head()
# Map each person id (pid) to sex, caste, and religion
sex1 = dict(zip(df1.pid.values, df1.resp_gend))
caste1 = dict(zip(df1.pid.values, df1.caste))
religion1 = dict(zip(df1.pid.values, df1.religion))
# Continue for df2 as well.
sex2 = dict(zip(df2.pid.values, df2.resp_gend))
caste2 = dict(zip(df2.pid.values, df2.caste))
religion2 = dict(zip(df2.pid.values, df2.religion))
### Alternative approach: build the same dicts via set_index/to_dict ###
sex1 = df1.set_index("pid")["resp_gend"].to_dict()
caste1 = df1.set_index("pid")["caste"].to_dict()
religion1 = df1.set_index("pid")["religion"].to_dict()
sex2 = df2.set_index("pid")["resp_gend"].to_dict()
caste2 = df2.set_index("pid")["caste"].to_dict()
religion2 = df2.set_index("pid")["religion"].to_dict()
# Baseline (chance) homophily for each characteristic in each village
print("Village 1 chance of same sex:", chance_homophily(sex1))
print("Village 1 chance of same caste:", chance_homophily(caste1))
print("Village 1 chance of same religion:", chance_homophily(religion1))
print("Village 2 chance of same sex:", chance_homophily(sex2))
print("Village 2 chance of same caste:", chance_homophily(caste2))
print("Village 2 chance of same religion:", chance_homophily(religion2))
def homophily(G, chars, IDs):
    """
    Given a network G, a dict of characteristics chars for node IDs,
    and dict of node IDs for each node in the network,
    find the homophily of the network.

    Returns the fraction of edges whose two endpoints share the same
    characteristic, counting only edges where both endpoints have a
    known characteristic. Raises ZeroDivisionError if no edge has both
    endpoints in chars.
    """
    num_same_ties = 0
    num_ties = 0
    for n1, n2 in G.edges():
        # Skip edges where either endpoint's characteristic is unknown.
        if IDs[n1] in chars and IDs[n2] in chars:
            # (n1, n2) came from G.edges(), so the original
            # G.has_edge(n1, n2) guard was always true and is dropped.
            num_ties += 1
            if chars[IDs[n1]] == chars[IDs[n2]]:
                num_same_ties += 1
    return num_same_ties / num_ties
# The raw key files have no header row, so read them with header=None
pid1=pd.read_csv(data_filepath +"key_vilno_1.csv",header=None)
pid2=pd.read_csv(data_filepath +"key_vilno_2.csv",header=None)
# Observed same-characteristic edge fractions, to compare against the
# chance-homophily baselines printed above
print("Village 1 observed proportion of same sex:", homophily(G1, sex1, pid1))
print("Village 1 observed proportion of same caste:", homophily(G1, caste1, pid1))
print("Village 1 observed proportion of same religion:", homophily(G1, religion1, pid1))
print("Village 2 observed proportion of same sex:", homophily(G2, sex2, pid2))
print("Village 2 observed proportion of same caste:", homophily(G2, caste2, pid2))
print("Village 2 observed proportion of same religion:", homophily(G2, religion2, pid2))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment