Created
August 16, 2014 08:23
-
-
Save chengjun/2d88a65f3c6ccafface6 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
""" | |
Created on Wed Aug 13 21:28:10 2014 | |
@author: v_chjwang | |
""" | |
from os import listdir | |
import glob | |
from collections import defaultdict | |
import numpy as np | |
import networkx as nx | |
from numpy import log, mean | |
import statsmodels.api as sm | |
import matplotlib.pyplot as plt | |
''' | |
# Network balancing | |
''' | |
def flowBalancing(G): | |
RG = G.reverse() | |
H = G.copy() | |
def nodeBalancing(node): | |
outw=0 | |
for i in G.edges(node): | |
outw+=G[i[0]][i[1]].values()[0] | |
inw=0 | |
for i in RG.edges(node): | |
inw+=RG[i[0]][i[1]].values()[0] | |
deltaflow=inw-outw | |
if deltaflow > 0: | |
H.add_edge(node, "sink",weight=deltaflow) | |
elif deltaflow < 0: | |
H.add_edge("source", node, weight=abs(deltaflow)) | |
else: | |
pass | |
for i in G.nodes(): | |
nodeBalancing(i) | |
if ("source", "source") in H.edges(): H.remove_edge("source", "source") | |
if ("sink", "sink") in H.edges(): H.remove_edge("sink", "sink") | |
return H | |
''' | |
# network data preprocessing | |
''' | |
def constructFlowNetwork (ad): | |
with open(ad) as f: | |
E = defaultdict(int) | |
for line in f: | |
From, To = line.strip('\n').split('\t')[1:] | |
if To == 's': | |
To = 'sink' | |
E[From,To] += 1 | |
G=nx.DiGraph() | |
for i,j in E.items(): | |
x,y=i | |
G.add_edge(x,y,weight=j) | |
return G | |
''' | |
# get flow distance | |
''' | |
def toSink(G, i): | |
try: | |
di = G[i]['sink'].values()[0] | |
except: | |
di = 0 | |
return di | |
def flowDistanceDT(G): #input a balanced nx graph | |
R = G.reverse() | |
mapping = {'source':'sink','sink':'source'} | |
H = nx.relabel_nodes(R,mapping) | |
#---------initialize flow distance dict------ | |
L = dict((i,1) for i in G.nodes()) #FlowDistance | |
#---------prepare weighted out-degree dict------ | |
D = {i: toSink(G, i) for i in G.nodes()} #Di | |
T = G.out_degree(weight='weight') #Ti | |
#---------iterate until converge------------ | |
ls = np.array(L.values()) | |
delta = len(L)*0.01 + 1 | |
while delta > len(L)*0.01: | |
for i in L: | |
l=1 | |
for m,n in H.edges(i): | |
l+=L[n]*H[m][n].values()[0]/float(T[m]) | |
L[i]=l | |
delta = sum(np.abs(np.array(L.values()) - ls)) | |
ls = np.array(L.values()) | |
#---------clean the result------- | |
del L['sink'] | |
for i in L: | |
L[i]-=1 | |
L['sink'] = L.pop('source') | |
T['sink'] = T.pop('source') | |
D['sink'] = D.pop('source') | |
return L.values(), D.values(), T.values() | |
def prepocessingData(G): | |
# G = h | |
fd, di, ti = flowDistanceDT(G) | |
# sort data.frame | |
mt = np.array([fd, di, ti]).T | |
mts = sorted(mt.tolist(), key = lambda x: x[0]) | |
fd, di, ti = np.array(mts).T | |
# get sum values | |
Drmax = sum(di) | |
Dmax = sum(di) | |
Tmax = sum(ti) | |
r_index = [index for index, v in enumerate(fd)] | |
di_cum = [ sum(di[:i+1]) for i in r_index] | |
# replace zero with a small number | |
for k, i in enumerate(di_cum): | |
if i==0: | |
di_cum[k] = 0.000001 | |
else: | |
pass | |
# regression | |
ti_cum = [ sum(ti[:i+1]) for i in r_index] | |
di_cum_log = log(di_cum) | |
ti_cum_log = log(ti_cum) | |
Drmax_log = [log(Drmax) for i in range(len(r_index))] | |
return ti_cum_log, di_cum_log, Drmax_log, Dmax, Tmax | |
def sizeAdjustedFit(ti_cum_log, di_cum_log, Drmax_log): | |
x = sm.add_constant(di_cum_log) | |
X = np.column_stack((x, Drmax_log)) | |
model = sm.OLS(ti_cum_log, X) | |
res = model.fit() | |
b, a = res.params[1:] | |
return a, b | |
''' | |
''' | |
''' | |
# allometric growth | |
# get uv and pv | |
''' | |
def alloRegressFit(xdata,ydata): | |
x=np.log(xdata+1);y=np.log(ydata+1); | |
xx = sm.add_constant(x, prepend=True) | |
res = sm.OLS(y,xx).fit() | |
beta=res.params[1]; r2=res.rsquared | |
return beta, r2 | |
def saveAlloBeta(file_name, siteBetaR2): | |
with open(file_name, 'a') as g: | |
g.write(siteBetaR2+"\n") | |
def calAlloBeta(sites): | |
num = 0 | |
theta = [] | |
for i in sites: | |
sitePath = path+ i +"/post_days_network_with_sink/*" | |
ads = glob.glob(sitePath) | |
E = defaultdict(lambda:[]) | |
for dat in ads: | |
with open(dat) as f: | |
users = [] | |
PV = 0 | |
for line in f: | |
person, From, To = line.strip('\n').split('\t') | |
if To != 's': | |
PV += 1 | |
users.append(person) | |
UV = len(set(users)) | |
E[dat[-10:]].append([PV, UV]) | |
pv, uv=np.array(E.values()).T | |
beta, r2 = alloRegressFit(uv[0], pv[0]) #注意:我在这里犯了一次错误!!!已经捕捉这个bug# | |
siteBetaR2 = i+'\t'+str(beta)+'\t'+str(r2) | |
saveAlloBeta(file_name, siteBetaR2) | |
num += 1 | |
print siteBetaR2, num | |
theta.extend([beta]) | |
return theta | |
# --------Run: get all site names--------- | |
path='F:/attention flow/stackexchange/unzip/' | |
sites = [ f for f in listdir(path) if f[-1]=='m'] # .com | |
# file for saving the sitename, beta, R square | |
file_name = path + 'allo_site_beta_rsquare' | |
# Run | |
theta = calAlloBeta(sites) | |
abdt = [] | |
for i in sites: | |
sitePath = path+ i +"/post_days_network_with_sink/*" | |
ads = glob.glob(sitePath) | |
# prepare data | |
ti_cum_log = [] | |
di_cum_log = [] | |
Drmax_log = [] | |
Dmax = [] | |
Tmax = [] | |
for i in ads: | |
g = constructFlowNetwork(i) | |
h=flowBalancing(g) | |
try: | |
tcl, dcl, Dl, Dm, Tm = prepocessingData(h) | |
ti_cum_log.extend(tcl) | |
di_cum_log.extend(dcl) | |
Drmax_log.extend(Dl) | |
Dmax.extend([Dm]) | |
Tmax.extend([Tm]) | |
except: | |
print 'error:', i | |
pass | |
#fit model | |
a, b = sizeAdjustedFit(ti_cum_log, di_cum_log, Drmax_log) | |
Dmax_mean = mean(Dmax) | |
Tmax_mean = mean(Tmax) | |
abdt.append( [a, b, Dmax_mean, Tmax_mean]) | |
a, b, dmax, tmax = np.array(abdt).T.tolist() | |
a_plus_b = (np.matrix(a) +np.matrix(b)).tolist()[0] | |
theta_minus_a = (np.matrix(theta) - np.matrix(a)).tolist()[0] | |
def scatterPlot(xdata,ydata,col,mark,xlab,ylab): | |
xx = sm.add_constant(xdata, prepend=True) | |
res = sm.OLS(ydata,xx).fit() | |
constant=res.params[0];beta=res.params[1]#; r2=res.rsquared | |
plt.plot(xdata,ydata,mark,color=col) | |
plt.xlabel(xlab);plt.ylabel(ylab) | |
minx,maxx=plt.xlim(); miny,maxy=plt.ylim() | |
plt.text((maxx-minx)/2,(maxy-miny)/3,np.round(beta,2)) | |
xs = np.linspace(min(xdata),max(xdata),100) | |
plt.plot(xs, constant + xs*beta,color='r',linestyle='-') | |
fig = plt.figure(figsize=(10, 15),facecolor='white') | |
plt.subplot(3, 2, 1) | |
scatterPlot(a_plus_b, theta, 'b', 'o', 'a + b', r'$\theta$') | |
plt.subplot(3, 2, 2) | |
scatterPlot(a, b, 'b', 'o', 'a', 'b') | |
plt.subplot(3, 2, 3) | |
scatterPlot(log(dmax), a, 'b', 'o', 'ln(mean Dmax)', 'a') | |
plt.subplot(3, 2, 4) | |
scatterPlot(log(dmax), b, 'b', 'o', 'ln(mean Dmax)', 'b') | |
plt.subplot(3, 2, 5) | |
scatterPlot(dmax, tmax, 'b', 'o', 'Dmax', 'Tmax') | |
plt.subplot(3, 2, 6) | |
scatterPlot(b, theta_minus_a, 'b', 'o', 'b', r'$theta$ - a') | |
plt.tight_layout() | |
fig = plt.gcf() | |
plt.show() | |
fig.savefig('abfit.png', dpi = 300) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment