Skip to content

Instantly share code, notes, and snippets.

@chengjun
Created August 16, 2014 08:23
Show Gist options
  • Save chengjun/2d88a65f3c6ccafface6 to your computer and use it in GitHub Desktop.
Save chengjun/2d88a65f3c6ccafface6 to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 13 21:28:10 2014
@author: v_chjwang
"""
from os import listdir
import glob
from collections import defaultdict
import numpy as np
import networkx as nx
from numpy import log, mean
import statsmodels.api as sm
import matplotlib.pyplot as plt
'''
# Network balancing
'''
def flowBalancing(G):
RG = G.reverse()
H = G.copy()
def nodeBalancing(node):
outw=0
for i in G.edges(node):
outw+=G[i[0]][i[1]].values()[0]
inw=0
for i in RG.edges(node):
inw+=RG[i[0]][i[1]].values()[0]
deltaflow=inw-outw
if deltaflow > 0:
H.add_edge(node, "sink",weight=deltaflow)
elif deltaflow < 0:
H.add_edge("source", node, weight=abs(deltaflow))
else:
pass
for i in G.nodes():
nodeBalancing(i)
if ("source", "source") in H.edges(): H.remove_edge("source", "source")
if ("sink", "sink") in H.edges(): H.remove_edge("sink", "sink")
return H
'''
# network data preprocessing
'''
def constructFlowNetwork (ad):
with open(ad) as f:
E = defaultdict(int)
for line in f:
From, To = line.strip('\n').split('\t')[1:]
if To == 's':
To = 'sink'
E[From,To] += 1
G=nx.DiGraph()
for i,j in E.items():
x,y=i
G.add_edge(x,y,weight=j)
return G
'''
# get flow distance
'''
def toSink(G, i):
try:
di = G[i]['sink'].values()[0]
except:
di = 0
return di
def flowDistanceDT(G): #input a balanced nx graph
R = G.reverse()
mapping = {'source':'sink','sink':'source'}
H = nx.relabel_nodes(R,mapping)
#---------initialize flow distance dict------
L = dict((i,1) for i in G.nodes()) #FlowDistance
#---------prepare weighted out-degree dict------
D = {i: toSink(G, i) for i in G.nodes()} #Di
T = G.out_degree(weight='weight') #Ti
#---------iterate until converge------------
ls = np.array(L.values())
delta = len(L)*0.01 + 1
while delta > len(L)*0.01:
for i in L:
l=1
for m,n in H.edges(i):
l+=L[n]*H[m][n].values()[0]/float(T[m])
L[i]=l
delta = sum(np.abs(np.array(L.values()) - ls))
ls = np.array(L.values())
#---------clean the result-------
del L['sink']
for i in L:
L[i]-=1
L['sink'] = L.pop('source')
T['sink'] = T.pop('source')
D['sink'] = D.pop('source')
return L.values(), D.values(), T.values()
def prepocessingData(G):
# G = h
fd, di, ti = flowDistanceDT(G)
# sort data.frame
mt = np.array([fd, di, ti]).T
mts = sorted(mt.tolist(), key = lambda x: x[0])
fd, di, ti = np.array(mts).T
# get sum values
Drmax = sum(di)
Dmax = sum(di)
Tmax = sum(ti)
r_index = [index for index, v in enumerate(fd)]
di_cum = [ sum(di[:i+1]) for i in r_index]
# replace zero with a small number
for k, i in enumerate(di_cum):
if i==0:
di_cum[k] = 0.000001
else:
pass
# regression
ti_cum = [ sum(ti[:i+1]) for i in r_index]
di_cum_log = log(di_cum)
ti_cum_log = log(ti_cum)
Drmax_log = [log(Drmax) for i in range(len(r_index))]
return ti_cum_log, di_cum_log, Drmax_log, Dmax, Tmax
def sizeAdjustedFit(ti_cum_log, di_cum_log, Drmax_log):
x = sm.add_constant(di_cum_log)
X = np.column_stack((x, Drmax_log))
model = sm.OLS(ti_cum_log, X)
res = model.fit()
b, a = res.params[1:]
return a, b
'''
'''
'''
# allometric growth
# get uv and pv
'''
def alloRegressFit(xdata,ydata):
x=np.log(xdata+1);y=np.log(ydata+1);
xx = sm.add_constant(x, prepend=True)
res = sm.OLS(y,xx).fit()
beta=res.params[1]; r2=res.rsquared
return beta, r2
def saveAlloBeta(file_name, siteBetaR2):
with open(file_name, 'a') as g:
g.write(siteBetaR2+"\n")
def calAlloBeta(sites):
num = 0
theta = []
for i in sites:
sitePath = path+ i +"/post_days_network_with_sink/*"
ads = glob.glob(sitePath)
E = defaultdict(lambda:[])
for dat in ads:
with open(dat) as f:
users = []
PV = 0
for line in f:
person, From, To = line.strip('\n').split('\t')
if To != 's':
PV += 1
users.append(person)
UV = len(set(users))
E[dat[-10:]].append([PV, UV])
pv, uv=np.array(E.values()).T
beta, r2 = alloRegressFit(uv[0], pv[0]) #注意:我在这里犯了一次错误!!!已经捕捉这个bug#
siteBetaR2 = i+'\t'+str(beta)+'\t'+str(r2)
saveAlloBeta(file_name, siteBetaR2)
num += 1
print siteBetaR2, num
theta.extend([beta])
return theta
# --------Run: get all site names---------
path='F:/attention flow/stackexchange/unzip/'
sites = [ f for f in listdir(path) if f[-1]=='m'] # .com
# file for saving the sitename, beta, R square
file_name = path + 'allo_site_beta_rsquare'
# Run
theta = calAlloBeta(sites)
abdt = []
for i in sites:
sitePath = path+ i +"/post_days_network_with_sink/*"
ads = glob.glob(sitePath)
# prepare data
ti_cum_log = []
di_cum_log = []
Drmax_log = []
Dmax = []
Tmax = []
for i in ads:
g = constructFlowNetwork(i)
h=flowBalancing(g)
try:
tcl, dcl, Dl, Dm, Tm = prepocessingData(h)
ti_cum_log.extend(tcl)
di_cum_log.extend(dcl)
Drmax_log.extend(Dl)
Dmax.extend([Dm])
Tmax.extend([Tm])
except:
print 'error:', i
pass
#fit model
a, b = sizeAdjustedFit(ti_cum_log, di_cum_log, Drmax_log)
Dmax_mean = mean(Dmax)
Tmax_mean = mean(Tmax)
abdt.append( [a, b, Dmax_mean, Tmax_mean])
a, b, dmax, tmax = np.array(abdt).T.tolist()
a_plus_b = (np.matrix(a) +np.matrix(b)).tolist()[0]
theta_minus_a = (np.matrix(theta) - np.matrix(a)).tolist()[0]
def scatterPlot(xdata,ydata,col,mark,xlab,ylab):
xx = sm.add_constant(xdata, prepend=True)
res = sm.OLS(ydata,xx).fit()
constant=res.params[0];beta=res.params[1]#; r2=res.rsquared
plt.plot(xdata,ydata,mark,color=col)
plt.xlabel(xlab);plt.ylabel(ylab)
minx,maxx=plt.xlim(); miny,maxy=plt.ylim()
plt.text((maxx-minx)/2,(maxy-miny)/3,np.round(beta,2))
xs = np.linspace(min(xdata),max(xdata),100)
plt.plot(xs, constant + xs*beta,color='r',linestyle='-')
fig = plt.figure(figsize=(10, 15),facecolor='white')
plt.subplot(3, 2, 1)
scatterPlot(a_plus_b, theta, 'b', 'o', 'a + b', r'$\theta$')
plt.subplot(3, 2, 2)
scatterPlot(a, b, 'b', 'o', 'a', 'b')
plt.subplot(3, 2, 3)
scatterPlot(log(dmax), a, 'b', 'o', 'ln(mean Dmax)', 'a')
plt.subplot(3, 2, 4)
scatterPlot(log(dmax), b, 'b', 'o', 'ln(mean Dmax)', 'b')
plt.subplot(3, 2, 5)
scatterPlot(dmax, tmax, 'b', 'o', 'Dmax', 'Tmax')
plt.subplot(3, 2, 6)
scatterPlot(b, theta_minus_a, 'b', 'o', 'b', r'$theta$ - a')
plt.tight_layout()
fig = plt.gcf()
plt.show()
fig.savefig('abfit.png', dpi = 300)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment