Created
November 1, 2014 04:05
-
-
Save richardkwo/78a0de320b836960e255 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
""" | |
Created on Mon Jan 14 19:54:29 2013 | |
This script measures the function preferential attachment | |
from evlutionary networks data. | |
= input file format | |
node1 <tab> node2 <tab> YYYY-MM-DD HH:MM:SS | |
Three columns, seperated by tab. The last column is written in ISO time format. | |
(The ISO time format can be sorted with lexicographical order without conversion) | |
= algorithm | |
The method is from EPL 61 (4) ``Measuring perferential attachement in evolving | |
networks'' by H. Jeong et al. | |
The program seeks to estimate the function of PI(k, T0, T1) from network data. | |
== parameters | |
T0: time point for ``old'' network | |
[T1, T1+dT]: time slice during which newly added links are recorded, treated as | |
the time "point" for ``new'' network | |
NOTE: we use the time resolution of "1 day". | |
== method | |
The algorithm first scans the network once, during which it profiles: | |
(1) the links established during [T1, T1+dT] | |
(2) degrees of all nodes up to time T0 | |
Then, for every link (i, j) added between T1 and T1+dT, ki and kj are retrieved. | |
Two types are devided and written to file separately: | |
(1) ki=0 or kj=0, external link, which refers to a link that brings a new | |
node added to the network. Say i is the old node (then kj=0), then ki | |
at time T0 is taken down (the count for k1 is incremented), finally forming | |
a histogram which gives the function PI(k, T0, T1) | |
NOTE: it is possible that the ``old'' node is introduced in [T0, T1], | |
making both ki=0 and kj=0. This should be discarded. | |
(2) ki!=0 and kj!=0, internal link, which refers to a link established | |
between two friends who have not associated each other yet. We take down | |
(ki, kj) for estimating the functiom PI(ki*kj, T0, T1) | |
@author: richard | |
""" | |
import dataMISC | |
import datetime | |
import sys | |
minNode = 1 | |
maxNode = 12000000 | |
def dateStringAdd(T0, days): | |
'''Return the date after "days" days of T0 | |
Input arguments | |
T0 is an ISO date in the format of "YYYY-MM-DD" | |
days in the number of days (int) added to T0 | |
Output | |
Return T1 = T0 + days in "YYYY-MM-DD" format | |
''' | |
T0_dt = datetime.datetime.strptime(T0, "%Y-%m-%d") | |
T1_dt = T0_dt + datetime.timedelta(days=days) | |
T1 = T1_dt.strftime("%Y-%m-%d") | |
return T1 | |
def measureDegreeIncrement(inputFileName, outputFileName, T0, dT): | |
'''Measure the degree added during the time window [T0, T0+dT] as | |
a function of the degrees already formed just before the time T0. | |
Input arguments | |
T0 is the time starting the time window. By scanning the network | |
formed just before T0, we profile the vector for "old" degrees. | |
dT is the length of time window | |
inputFileName is the file of network edgelist SORTED IN | |
INCREASING ORDER OF DATE TIME | |
Output | |
CSV file with every row in the format of: | |
k, k.increment | |
A companion file with name outputFileName + ".date" records | |
the time setting for computation, in the row format of: | |
time.start, time.end, dT | |
Methods | |
1. Scan the network and record | |
(1) just before the time window, we take down k_ as the | |
list of degree for every node | |
(2) just as T0 arrives, we make a copy of k_ in the name of | |
k_new. | |
(3) scanning through the time window and record the newly | |
created links fallen into the window. For every edge, we | |
record the nodes it involved and add to the SET nodeSetInNewLinks, | |
and increment the degrees of those nodes in the vector k_new | |
(4) iterate i over nodeSetInNewLinks, and write | |
(k_(i), k_new(i)-k_(i)) to output file | |
''' | |
# k_[i] is the degree of node i | |
k_ = [0] * maxNode | |
k_new = k_[:] | |
# T1 = T0 + dT | |
# time window [T0, T1] | |
T1 = dateStringAdd(T0, dT) | |
# dict used as a set for keeping nodes involved in links | |
# created during the time window | |
nodeSetInNewLinks = {} | |
totalLine = 409135012 | |
currentLine = 0 | |
print "Reading file %s, %d lines in total..." % (inputFileName, totalLine) | |
print "NOTE: The input file must be SORTED in ascending order of date and time\n" | |
fr = open(inputFileName, 'r') | |
for line in fr: | |
currentLine += 1 | |
dataMISC.reportProgress(currentLine, totalLine, 200, "profiling ") | |
line = line.rstrip() | |
lineItems = line.split('\t') | |
# nodes in integer numbering | |
node1 = int(lineItems[0]) | |
node2 = int(lineItems[1]) | |
linkTime = lineItems[2].split(' ') | |
linkTime = linkTime[0] # YYYY-MM-DD | |
# profile degrees k_ | |
# in the snapshot just before T0 | |
if linkTime < T0: | |
# before T0 | |
k_[node1] += 1 | |
k_[node2] += 1 | |
k_new[node1] += 1 | |
k_new[node2] += 1 | |
elif linkTime <= T1: | |
# in the time window [T0, T0+T1] | |
# add nodes involved to nodeSetInNewLinks | |
nodeSetInNewLinks[node1] = 0 | |
nodeSetInNewLinks[node2] = 0 | |
# increment the degree of node1 and node2 in k_new | |
k_new[node1] += 1 | |
k_new[node2] += 1 | |
else: | |
# now we jump out of the time window | |
# no need for further scanning | |
break | |
# file read over | |
fr.close() | |
# iterate over nodes involved in newly added links in the time window | |
fw = open(outputFileName, "w") | |
print >>fw, "k, k.increment" | |
for i in nodeSetInNewLinks.keys(): | |
print >>fw, "%d, %d" % (k_[i], k_new[i]-k_[i]) | |
fw.close() | |
outputFileName_date = outputFileName + ".date" | |
fw_date = open(outputFileName_date, "w") | |
print >>fw_date, "time.start, time.end, dT" | |
print >>fw_date, "%s, %s, %d" % (T0, T1, dT) | |
fw_date.close() | |
print "Done." | |
print "Outputfile written to", outputFileName | |
print "Date settings written to", outputFileName_date | |
def measurePA(inputFileName, outputFileNamePrefix, T0, T1, dT): | |
'''Measure the preferential attachement by evolutionary network data. | |
inputFileName speicifies the file of network data, in the row format of | |
node1_int <tab> node2_int <tab> YYYY-MM-DD HH:MM:SS | |
Input arguments | |
T0 is the time for ``old'' network, in ``YYYY-MM-DD'' format | |
T1 is the time for observing newly-added links, | |
in ``YYYY-MM-DD'' format. T0 <= T1. | |
dT is the interval, making [T1, T1+dT] the actual time window for | |
observing added links | |
Output files | |
outputFileNamePrefix + "_external.txt" records the degree of the | |
"attached" node in the snapshot of network just before T0, with row | |
format: | |
k | |
outputFileNamePrefix + "_internal.txt" records the degree of the | |
nodes who were included in networks just before T0 but make the | |
first link in the time window, wiht row format: | |
k0, k1 | |
outputFileNamePrefix + "_degree.txt" records the number of nodes | |
with a degree. The row k gives the number of nodes with degree k | |
in the snapshot just before T0. Row format: | |
num.of.nodes.with.degree | |
outputFileNamePrefix + "_date.txt" records T0, T1, dT of this set | |
of output. Row format: | |
T0, T1, dT, T2 | |
''' | |
# k_[i] is the degree of node i | |
k_ = [0] * maxNode | |
# T2 = T1 + dT | |
T1_dt = datetime.datetime.strptime(T1, "%Y-%m-%d") | |
T2_dt = T1_dt + datetime.timedelta(days=dT) | |
T2 = T2_dt.strftime("%Y-%m-%d") # [T1, T2] would be the window | |
# list for maintaining new links fallen into the time window | |
newlinks = [] | |
# maintain the counting of number of nodes of a specific degree | |
degrees = [] | |
totalLine = 409135012 | |
currentLine = 0 | |
print "Reading file %s, %d lines in total..." % (inputFileName, totalLine) | |
fr = open(inputFileName, 'r') | |
for line in fr: | |
currentLine += 1 | |
dataMISC.reportProgress(currentLine, totalLine, 200, "profiling ") | |
line = line.rstrip() | |
lineItems = line.split('\t') | |
# nodes in integer numbering | |
node1 = int(lineItems[0]) | |
node2 = int(lineItems[1]) | |
linkTime = lineItems[2].split(' ') | |
linkTime = linkTime[0] # YYYY-MM-DD | |
# profile degrees k_ | |
# in the snapshot just before T0 | |
if linkTime < T0: | |
k_[node1] += 1 | |
k_[node2] += 1 | |
# record new links in the time window | |
if T1<=linkTime and linkTime<=T2: | |
newlinks.append((node1, node2)) | |
# file read over | |
fr.close() | |
# profiling the degree distribution by | |
# counting the number of nodes of a specific degree | |
print "Profiling degree distribution..." | |
degrees = [0] * (max(k_)+1) | |
for kofi in k_: | |
degrees[kofi] += 1 | |
# then degrees[k] == number of nodes of degree k | |
# write it to file | |
# this file records the number of nodes of degree k | |
# row l simply gives the number of nodes with degree l | |
outputFileName_degree = outputFileNamePrefix + "_degree.txt" | |
fw_deg = open(outputFileName_degree, "w") | |
print >>fw_deg, "num.of.nodes.with.degree" | |
for dgn in degrees: | |
print >>fw_deg, "%d" % (dgn) | |
fw_deg.close() | |
print "Iterating over %d new links added in the time window..." % (len(newlinks)) | |
# k_ profiled | |
# iterate over links observed in [T1, T1+dT] | |
# and write k-dependencies to file | |
outputFileName_external = outputFileNamePrefix + "_external.txt" | |
outputFileName_internal = outputFileNamePrefix + "_internal.txt" | |
fw_ext = open(outputFileName_external, 'w') | |
print >>fw_ext, "k" | |
fw_int = open(outputFileName_internal, 'w') | |
print >>fw_int, "k0, k1" | |
for node1, node2 in newlinks: | |
k1 = k_[node1] | |
k2 = k_[node2] | |
if k1==0 and k2==0: | |
# 1 node after T1, 1 node added in [T0, T1] | |
continue | |
elif k1>0 and k2>0: | |
# internal link | |
print >>fw_int, "%d, %d" % (k1, k2) | |
else: | |
# external link | |
k0 = max(k1, k2) # the degree of ``old'' node | |
print >>fw_ext, "%d" % (k0) | |
# newlinks iterated | |
fw_int.close() | |
fw_ext.close() | |
# write T0, T1 and dT of this computation | |
# dT is in the unit of days | |
outputFileName_date = outputFileNamePrefix + "_date.txt" | |
fw_date = open(outputFileName_date, 'w') | |
print >>fw_date, "T0, T1, dT, T2" | |
print >>fw_date, "%s, %s, %d, %s" % (T0, T1, dT, T2) | |
fw_date.close() | |
print "Done" | |
print "External links dependencies saved to", outputFileName_external | |
print "Internal links dependencies saved to", outputFileName_internal | |
print "Degree counting saved to", outputFileName_degree | |
print "Time and time windows saved to", outputFileName_date | |
if __name__=="__main__": | |
T_vec = ["2006-01-01", | |
"2006-04-01", | |
"2006-07-01", | |
"2006-10-01", | |
"2007-01-01", | |
"2007-04-01", | |
"2007-07-01", | |
"2007-10-01"] | |
# T_vec = ["2006-07-01"] | |
InputFileName = "../RenRenData/friends_all.txt" | |
for T0 in T_vec: | |
# T1 = dateStringAdd(T0, 30) | |
T1 = T0 | |
dT = 30 #(days) | |
OutputFileNamePrefix = "../RenRenData/results/measure_preferential_attachment/Stat_T0_%s_T1_%s" % (T0, T1) | |
print "Now:" | |
print "T0=", T0, "T1=", T1, "dT=", dT | |
print "Input:\n", InputFileName | |
print "Output:\n", OutputFileNamePrefix | |
print "" | |
measurePA(InputFileName, OutputFileNamePrefix, T0, T1, dT) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment