chengjun/stackexchange_days_size_adjusted_fit.py

## stackexchange_days_size_adjusted_fit.py
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 13 21:28:10 2014

@author: v_chjwang
"""

from os import listdir
import glob
from collections import defaultdict
import numpy as np
import networkx as nx
from numpy import log, mean
import statsmodels.api as sm
import matplotlib.pyplot as plt

'''
# Network balancing
'''

def flowBalancing(G):
    RG = G.reverse()
    H = G.copy()
    def nodeBalancing(node):
        outw=0
        for i in G.edges(node):
            outw+=G[i[0]][i[1]].values()[0]
        inw=0
        for i in RG.edges(node):
            inw+=RG[i[0]][i[1]].values()[0]
        deltaflow=inw-outw
        if deltaflow > 0:
            H.add_edge(node, "sink",weight=deltaflow)
        elif deltaflow < 0:
            H.add_edge("source", node, weight=abs(deltaflow))
        else:
            pass
    for i in G.nodes():
        nodeBalancing(i)
    if ("source", "source") in H.edges():  H.remove_edge("source", "source")
    if ("sink", "sink") in H.edges(): H.remove_edge("sink", "sink")
    return H

'''
# network data preprocessing
'''

def constructFlowNetwork (ad):
    with open(ad) as f:
        E = defaultdict(int)
        for line in f:
            From, To = line.strip('\n').split('\t')[1:]
            if To == 's':
                To = 'sink'
            E[From,To] += 1
    G=nx.DiGraph()
    for i,j in E.items():
        x,y=i
        G.add_edge(x,y,weight=j)
    return G


'''
# get flow distance
'''
def toSink(G, i):
        try:
            di = G[i]['sink'].values()[0]
        except:
            di = 0
        return di

def flowDistanceDT(G): #input a balanced nx graph
    R = G.reverse()
    mapping = {'source':'sink','sink':'source'}
    H = nx.relabel_nodes(R,mapping)
    #---------initialize flow distance dict------
    L = dict((i,1) for i in G.nodes())  #FlowDistance
    #---------prepare weighted out-degree dict------
    D = {i: toSink(G, i) for i in G.nodes()} #Di
    T = G.out_degree(weight='weight')        #Ti
    #---------iterate until converge------------
    ls = np.array(L.values())
    delta = len(L)*0.01 + 1
    while delta > len(L)*0.01:
        for i in L:
            l=1
            for m,n in H.edges(i):
                l+=L[n]*H[m][n].values()[0]/float(T[m])
            L[i]=l
        delta = sum(np.abs(np.array(L.values()) - ls))
        ls = np.array(L.values())
    #---------clean the result-------
    del L['sink']
    for i in L:
        L[i]-=1
    L['sink'] = L.pop('source')
    T['sink'] = T.pop('source')
    D['sink'] = D.pop('source')
    return L.values(), D.values(), T.values()

def prepocessingData(G):
    # G = h
    fd, di, ti = flowDistanceDT(G)
    # sort data.frame
    mt = np.array([fd, di, ti]).T
    mts = sorted(mt.tolist(), key = lambda x: x[0])
    fd, di, ti = np.array(mts).T
    # get sum values
    Drmax = sum(di)
    Dmax = sum(di)
    Tmax = sum(ti)
    r_index = [index for index, v in enumerate(fd)]
    di_cum = [ sum(di[:i+1]) for i in r_index]
    # replace zero with a small number
    for k, i in enumerate(di_cum):
        if i==0:
            di_cum[k] = 0.000001
        else:
            pass
    # regression
    ti_cum = [ sum(ti[:i+1]) for i in r_index]
    di_cum_log = log(di_cum)
    ti_cum_log = log(ti_cum)
    Drmax_log = [log(Drmax) for i in range(len(r_index))]
    return ti_cum_log, di_cum_log, Drmax_log, Dmax, Tmax

def sizeAdjustedFit(ti_cum_log, di_cum_log, Drmax_log):
    x = sm.add_constant(di_cum_log)
    X = np.column_stack((x, Drmax_log))
    model = sm.OLS(ti_cum_log, X)
    res = model.fit()
    b, a = res.params[1:]
    return a, b

'''
'''

'''
# allometric growth
# get uv and pv
'''
def alloRegressFit(xdata,ydata):
    x=np.log(xdata+1);y=np.log(ydata+1);
    xx = sm.add_constant(x, prepend=True)
    res = sm.OLS(y,xx).fit()
    beta=res.params[1]; r2=res.rsquared
    return beta, r2


def saveAlloBeta(file_name, siteBetaR2):
    with open(file_name, 'a') as g:
        g.write(siteBetaR2+"\n")

def calAlloBeta(sites):
    num = 0
    theta = []
    for i in sites:
        sitePath = path+ i +"/post_days_network_with_sink/*"
        ads = glob.glob(sitePath)
        E = defaultdict(lambda:[])
        for dat in ads:
            with open(dat) as f:
                users = []
                PV = 0
                for line in f:
                    person, From, To = line.strip('\n').split('\t')
                    if To != 's':
                        PV += 1
                        users.append(person)
                UV = len(set(users))
                E[dat[-10:]].append([PV, UV])
        pv, uv=np.array(E.values()).T
        beta, r2 = alloRegressFit(uv[0], pv[0]) #注意：我在这里犯了一次错误！！！已经捕捉这个bug#
        siteBetaR2 = i+'\t'+str(beta)+'\t'+str(r2)
        saveAlloBeta(file_name, siteBetaR2)
        num += 1
        print siteBetaR2, num
        theta.extend([beta])
    return theta

# --------Run: get all site names---------
path='F:/attention flow/stackexchange/unzip/'
sites = [ f for f in listdir(path) if f[-1]=='m'] # .com
# file for saving the sitename, beta, R square
file_name = path + 'allo_site_beta_rsquare'
# Run
theta = calAlloBeta(sites)

abdt = []
for i in sites:
    sitePath = path+ i +"/post_days_network_with_sink/*"
    ads = glob.glob(sitePath)
    # prepare data
    ti_cum_log = []
    di_cum_log = []
    Drmax_log = []
    Dmax = []
    Tmax = []
    for i in ads:
        g = constructFlowNetwork(i)
        h=flowBalancing(g)
        try:
            tcl, dcl, Dl, Dm, Tm = prepocessingData(h)
            ti_cum_log.extend(tcl)
            di_cum_log.extend(dcl)
            Drmax_log.extend(Dl)
            Dmax.extend([Dm])
            Tmax.extend([Tm])
        except:
            print 'error:', i
            pass

    #fit model
    a, b = sizeAdjustedFit(ti_cum_log, di_cum_log, Drmax_log)
    Dmax_mean = mean(Dmax)
    Tmax_mean = mean(Tmax)
    abdt.append( [a, b, Dmax_mean, Tmax_mean])

a, b, dmax, tmax = np.array(abdt).T.tolist()
a_plus_b = (np.matrix(a) +np.matrix(b)).tolist()[0]
theta_minus_a = (np.matrix(theta) - np.matrix(a)).tolist()[0]
def scatterPlot(xdata,ydata,col,mark,xlab,ylab):
    xx = sm.add_constant(xdata, prepend=True)
    res = sm.OLS(ydata,xx).fit()
    constant=res.params[0];beta=res.params[1]#; r2=res.rsquared
    plt.plot(xdata,ydata,mark,color=col)
    plt.xlabel(xlab);plt.ylabel(ylab)
    minx,maxx=plt.xlim(); miny,maxy=plt.ylim()
    plt.text((maxx-minx)/2,(maxy-miny)/3,np.round(beta,2))
    xs = np.linspace(min(xdata),max(xdata),100)
    plt.plot(xs, constant + xs*beta,color='r',linestyle='-')

fig = plt.figure(figsize=(10, 15),facecolor='white')
plt.subplot(3, 2, 1)
scatterPlot(a_plus_b, theta, 'b', 'o', 'a + b', r'$\theta$')
plt.subplot(3, 2, 2)
scatterPlot(a, b, 'b', 'o', 'a', 'b')
plt.subplot(3, 2, 3)
scatterPlot(log(dmax), a, 'b', 'o', 'ln(mean Dmax)', 'a')
plt.subplot(3, 2, 4)
scatterPlot(log(dmax), b, 'b', 'o', 'ln(mean Dmax)', 'b')
plt.subplot(3, 2, 5)
scatterPlot(dmax, tmax, 'b', 'o', 'Dmax', 'Tmax')
plt.subplot(3, 2, 6)
scatterPlot(b, theta_minus_a, 'b', 'o', 'b', r'$theta$ - a')
plt.tight_layout()
fig = plt.gcf()
plt.show()

fig.savefig('abfit.png', dpi = 300)
	# -- coding: utf-8 --
	"""
	Created on Wed Aug 13 21:28:10 2014

	@author: v_chjwang
	"""

	from os import listdir
	import glob
	from collections import defaultdict
	import numpy as np
	import networkx as nx
	from numpy import log, mean
	import statsmodels.api as sm
	import matplotlib.pyplot as plt

	'''
	# Network balancing
	'''

	def flowBalancing(G):
	RG = G.reverse()
	H = G.copy()
	def nodeBalancing(node):
	outw=0
	for i in G.edges(node):
	outw+=G[i[0]][i[1]].values()[0]
	inw=0
	for i in RG.edges(node):
	inw+=RG[i[0]][i[1]].values()[0]
	deltaflow=inw-outw
	if deltaflow > 0:
	H.add_edge(node, "sink",weight=deltaflow)
	elif deltaflow < 0:
	H.add_edge("source", node, weight=abs(deltaflow))
	else:
	pass
	for i in G.nodes():
	nodeBalancing(i)
	if ("source", "source") in H.edges(): H.remove_edge("source", "source")
	if ("sink", "sink") in H.edges(): H.remove_edge("sink", "sink")
	return H

	'''
	# network data preprocessing
	'''

	def constructFlowNetwork (ad):
	with open(ad) as f:
	E = defaultdict(int)
	for line in f:
	From, To = line.strip('\n').split('\t')[1:]
	if To == 's':
	To = 'sink'
	E[From,To] += 1
	G=nx.DiGraph()
	for i,j in E.items():
	x,y=i
	G.add_edge(x,y,weight=j)
	return G


	'''
	# get flow distance
	'''
	def toSink(G, i):
	try:
	di = G[i]['sink'].values()[0]
	except:
	di = 0
	return di

	def flowDistanceDT(G): #input a balanced nx graph
	R = G.reverse()
	mapping = {'source':'sink','sink':'source'}
	H = nx.relabel_nodes(R,mapping)
	#---------initialize flow distance dict------
	L = dict((i,1) for i in G.nodes()) #FlowDistance
	#---------prepare weighted out-degree dict------
	D = {i: toSink(G, i) for i in G.nodes()} #Di
	T = G.out_degree(weight='weight') #Ti
	#---------iterate until converge------------
	ls = np.array(L.values())
	delta = len(L)*0.01 + 1
	while delta > len(L)*0.01:
	for i in L:
	l=1
	for m,n in H.edges(i):
	l+=L[n]*H[m][n].values()[0]/float(T[m])
	L[i]=l
	delta = sum(np.abs(np.array(L.values()) - ls))
	ls = np.array(L.values())
	#---------clean the result-------
	del L['sink']
	for i in L:
	L[i]-=1
	L['sink'] = L.pop('source')
	T['sink'] = T.pop('source')
	D['sink'] = D.pop('source')
	return L.values(), D.values(), T.values()

	def prepocessingData(G):
	# G = h
	fd, di, ti = flowDistanceDT(G)
	# sort data.frame
	mt = np.array([fd, di, ti]).T
	mts = sorted(mt.tolist(), key = lambda x: x[0])
	fd, di, ti = np.array(mts).T
	# get sum values
	Drmax = sum(di)
	Dmax = sum(di)
	Tmax = sum(ti)
	r_index = [index for index, v in enumerate(fd)]
	di_cum = [ sum(di[:i+1]) for i in r_index]
	# replace zero with a small number
	for k, i in enumerate(di_cum):
	if i==0:
	di_cum[k] = 0.000001
	else:
	pass
	# regression
	ti_cum = [ sum(ti[:i+1]) for i in r_index]
	di_cum_log = log(di_cum)
	ti_cum_log = log(ti_cum)
	Drmax_log = [log(Drmax) for i in range(len(r_index))]
	return ti_cum_log, di_cum_log, Drmax_log, Dmax, Tmax

	def sizeAdjustedFit(ti_cum_log, di_cum_log, Drmax_log):
	x = sm.add_constant(di_cum_log)
	X = np.column_stack((x, Drmax_log))
	model = sm.OLS(ti_cum_log, X)
	res = model.fit()
	b, a = res.params[1:]
	return a, b

	'''
	'''

	'''
	# allometric growth
	# get uv and pv
	'''
	def alloRegressFit(xdata,ydata):
	x=np.log(xdata+1);y=np.log(ydata+1);
	xx = sm.add_constant(x, prepend=True)
	res = sm.OLS(y,xx).fit()
	beta=res.params[1]; r2=res.rsquared
	return beta, r2


	def saveAlloBeta(file_name, siteBetaR2):
	with open(file_name, 'a') as g:
	g.write(siteBetaR2+"\n")

	def calAlloBeta(sites):
	num = 0
	theta = []
	for i in sites:
	sitePath = path+ i +"/post_days_network_with_sink/*"
	ads = glob.glob(sitePath)
	E = defaultdict(lambda:[])
	for dat in ads:
	with open(dat) as f:
	users = []
	PV = 0
	for line in f:
	person, From, To = line.strip('\n').split('\t')
	if To != 's':
	PV += 1
	users.append(person)
	UV = len(set(users))
	E[dat[-10:]].append([PV, UV])
	pv, uv=np.array(E.values()).T
	beta, r2 = alloRegressFit(uv[0], pv[0]) #注意：我在这里犯了一次错误！！！已经捕捉这个bug#
	siteBetaR2 = i+'\t'+str(beta)+'\t'+str(r2)
	saveAlloBeta(file_name, siteBetaR2)
	num += 1
	print siteBetaR2, num
	theta.extend([beta])
	return theta

	# --------Run: get all site names---------
	path='F:/attention flow/stackexchange/unzip/'
	sites = [ f for f in listdir(path) if f[-1]=='m'] # .com
	# file for saving the sitename, beta, R square
	file_name = path + 'allo_site_beta_rsquare'
	# Run
	theta = calAlloBeta(sites)

	abdt = []
	for i in sites:
	sitePath = path+ i +"/post_days_network_with_sink/*"
	ads = glob.glob(sitePath)
	# prepare data
	ti_cum_log = []
	di_cum_log = []
	Drmax_log = []
	Dmax = []
	Tmax = []
	for i in ads:
	g = constructFlowNetwork(i)
	h=flowBalancing(g)
	try:
	tcl, dcl, Dl, Dm, Tm = prepocessingData(h)
	ti_cum_log.extend(tcl)
	di_cum_log.extend(dcl)
	Drmax_log.extend(Dl)
	Dmax.extend([Dm])
	Tmax.extend([Tm])
	except:
	print 'error:', i
	pass

	#fit model
	a, b = sizeAdjustedFit(ti_cum_log, di_cum_log, Drmax_log)
	Dmax_mean = mean(Dmax)
	Tmax_mean = mean(Tmax)
	abdt.append( [a, b, Dmax_mean, Tmax_mean])

	a, b, dmax, tmax = np.array(abdt).T.tolist()
	a_plus_b = (np.matrix(a) +np.matrix(b)).tolist()[0]
	theta_minus_a = (np.matrix(theta) - np.matrix(a)).tolist()[0]
	def scatterPlot(xdata,ydata,col,mark,xlab,ylab):
	xx = sm.add_constant(xdata, prepend=True)
	res = sm.OLS(ydata,xx).fit()
	constant=res.params[0];beta=res.params[1]#; r2=res.rsquared
	plt.plot(xdata,ydata,mark,color=col)
	plt.xlabel(xlab);plt.ylabel(ylab)
	minx,maxx=plt.xlim(); miny,maxy=plt.ylim()
	plt.text((maxx-minx)/2,(maxy-miny)/3,np.round(beta,2))
	xs = np.linspace(min(xdata),max(xdata),100)
	plt.plot(xs, constant + xs*beta,color='r',linestyle='-')

	fig = plt.figure(figsize=(10, 15),facecolor='white')
	plt.subplot(3, 2, 1)
	scatterPlot(a_plus_b, theta, 'b', 'o', 'a + b', r'$\theta$')
	plt.subplot(3, 2, 2)
	scatterPlot(a, b, 'b', 'o', 'a', 'b')
	plt.subplot(3, 2, 3)
	scatterPlot(log(dmax), a, 'b', 'o', 'ln(mean Dmax)', 'a')
	plt.subplot(3, 2, 4)
	scatterPlot(log(dmax), b, 'b', 'o', 'ln(mean Dmax)', 'b')
	plt.subplot(3, 2, 5)
	scatterPlot(dmax, tmax, 'b', 'o', 'Dmax', 'Tmax')
	plt.subplot(3, 2, 6)
	scatterPlot(b, theta_minus_a, 'b', 'o', 'b', r'$theta$ - a')
	plt.tight_layout()
	fig = plt.gcf()
	plt.show()

	fig.savefig('abfit.png', dpi = 300)