Skip to content

Instantly share code, notes, and snippets.

@richardkwo
Created November 1, 2014 04:05
Show Gist options
  • Save richardkwo/78a0de320b836960e255 to your computer and use it in GitHub Desktop.
Save richardkwo/78a0de320b836960e255 to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
"""
Created on Mon Jan 14 19:54:29 2013
This script measures the function preferential attachment
from evlutionary networks data.
= input file format
node1 <tab> node2 <tab> YYYY-MM-DD HH:MM:SS
Three columns, seperated by tab. The last column is written in ISO time format.
(The ISO time format can be sorted with lexicographical order without conversion)
= algorithm
The method is from EPL 61 (4) ``Measuring perferential attachement in evolving
networks'' by H. Jeong et al.
The program seeks to estimate the function of PI(k, T0, T1) from network data.
== parameters
T0: time point for ``old'' network
[T1, T1+dT]: time slice during which newly added links are recorded, treated as
the time "point" for ``new'' network
NOTE: we use the time resolution of "1 day".
== method
The algorithm first scans the network once, during which it profiles:
(1) the links established during [T1, T1+dT]
(2) degrees of all nodes up to time T0
Then, for every link (i, j) added between T1 and T1+dT, ki and kj are retrieved.
Two types are devided and written to file separately:
(1) ki=0 or kj=0, external link, which refers to a link that brings a new
node added to the network. Say i is the old node (then kj=0), then ki
at time T0 is taken down (the count for k1 is incremented), finally forming
a histogram which gives the function PI(k, T0, T1)
NOTE: it is possible that the ``old'' node is introduced in [T0, T1],
making both ki=0 and kj=0. This should be discarded.
(2) ki!=0 and kj!=0, internal link, which refers to a link established
between two friends who have not associated each other yet. We take down
(ki, kj) for estimating the functiom PI(ki*kj, T0, T1)
@author: richard
"""
import dataMISC
import datetime
import sys
minNode = 1
maxNode = 12000000
def dateStringAdd(T0, days):
'''Return the date after "days" days of T0
Input arguments
T0 is an ISO date in the format of "YYYY-MM-DD"
days in the number of days (int) added to T0
Output
Return T1 = T0 + days in "YYYY-MM-DD" format
'''
T0_dt = datetime.datetime.strptime(T0, "%Y-%m-%d")
T1_dt = T0_dt + datetime.timedelta(days=days)
T1 = T1_dt.strftime("%Y-%m-%d")
return T1
def measureDegreeIncrement(inputFileName, outputFileName, T0, dT):
'''Measure the degree added during the time window [T0, T0+dT] as
a function of the degrees already formed just before the time T0.
Input arguments
T0 is the time starting the time window. By scanning the network
formed just before T0, we profile the vector for "old" degrees.
dT is the length of time window
inputFileName is the file of network edgelist SORTED IN
INCREASING ORDER OF DATE TIME
Output
CSV file with every row in the format of:
k, k.increment
A companion file with name outputFileName + ".date" records
the time setting for computation, in the row format of:
time.start, time.end, dT
Methods
1. Scan the network and record
(1) just before the time window, we take down k_ as the
list of degree for every node
(2) just as T0 arrives, we make a copy of k_ in the name of
k_new.
(3) scanning through the time window and record the newly
created links fallen into the window. For every edge, we
record the nodes it involved and add to the SET nodeSetInNewLinks,
and increment the degrees of those nodes in the vector k_new
(4) iterate i over nodeSetInNewLinks, and write
(k_(i), k_new(i)-k_(i)) to output file
'''
# k_[i] is the degree of node i
k_ = [0] * maxNode
k_new = k_[:]
# T1 = T0 + dT
# time window [T0, T1]
T1 = dateStringAdd(T0, dT)
# dict used as a set for keeping nodes involved in links
# created during the time window
nodeSetInNewLinks = {}
totalLine = 409135012
currentLine = 0
print "Reading file %s, %d lines in total..." % (inputFileName, totalLine)
print "NOTE: The input file must be SORTED in ascending order of date and time\n"
fr = open(inputFileName, 'r')
for line in fr:
currentLine += 1
dataMISC.reportProgress(currentLine, totalLine, 200, "profiling ")
line = line.rstrip()
lineItems = line.split('\t')
# nodes in integer numbering
node1 = int(lineItems[0])
node2 = int(lineItems[1])
linkTime = lineItems[2].split(' ')
linkTime = linkTime[0] # YYYY-MM-DD
# profile degrees k_
# in the snapshot just before T0
if linkTime < T0:
# before T0
k_[node1] += 1
k_[node2] += 1
k_new[node1] += 1
k_new[node2] += 1
elif linkTime <= T1:
# in the time window [T0, T0+T1]
# add nodes involved to nodeSetInNewLinks
nodeSetInNewLinks[node1] = 0
nodeSetInNewLinks[node2] = 0
# increment the degree of node1 and node2 in k_new
k_new[node1] += 1
k_new[node2] += 1
else:
# now we jump out of the time window
# no need for further scanning
break
# file read over
fr.close()
# iterate over nodes involved in newly added links in the time window
fw = open(outputFileName, "w")
print >>fw, "k, k.increment"
for i in nodeSetInNewLinks.keys():
print >>fw, "%d, %d" % (k_[i], k_new[i]-k_[i])
fw.close()
outputFileName_date = outputFileName + ".date"
fw_date = open(outputFileName_date, "w")
print >>fw_date, "time.start, time.end, dT"
print >>fw_date, "%s, %s, %d" % (T0, T1, dT)
fw_date.close()
print "Done."
print "Outputfile written to", outputFileName
print "Date settings written to", outputFileName_date
def measurePA(inputFileName, outputFileNamePrefix, T0, T1, dT):
'''Measure the preferential attachement by evolutionary network data.
inputFileName speicifies the file of network data, in the row format of
node1_int <tab> node2_int <tab> YYYY-MM-DD HH:MM:SS
Input arguments
T0 is the time for ``old'' network, in ``YYYY-MM-DD'' format
T1 is the time for observing newly-added links,
in ``YYYY-MM-DD'' format. T0 <= T1.
dT is the interval, making [T1, T1+dT] the actual time window for
observing added links
Output files
outputFileNamePrefix + "_external.txt" records the degree of the
"attached" node in the snapshot of network just before T0, with row
format:
k
outputFileNamePrefix + "_internal.txt" records the degree of the
nodes who were included in networks just before T0 but make the
first link in the time window, wiht row format:
k0, k1
outputFileNamePrefix + "_degree.txt" records the number of nodes
with a degree. The row k gives the number of nodes with degree k
in the snapshot just before T0. Row format:
num.of.nodes.with.degree
outputFileNamePrefix + "_date.txt" records T0, T1, dT of this set
of output. Row format:
T0, T1, dT, T2
'''
# k_[i] is the degree of node i
k_ = [0] * maxNode
# T2 = T1 + dT
T1_dt = datetime.datetime.strptime(T1, "%Y-%m-%d")
T2_dt = T1_dt + datetime.timedelta(days=dT)
T2 = T2_dt.strftime("%Y-%m-%d") # [T1, T2] would be the window
# list for maintaining new links fallen into the time window
newlinks = []
# maintain the counting of number of nodes of a specific degree
degrees = []
totalLine = 409135012
currentLine = 0
print "Reading file %s, %d lines in total..." % (inputFileName, totalLine)
fr = open(inputFileName, 'r')
for line in fr:
currentLine += 1
dataMISC.reportProgress(currentLine, totalLine, 200, "profiling ")
line = line.rstrip()
lineItems = line.split('\t')
# nodes in integer numbering
node1 = int(lineItems[0])
node2 = int(lineItems[1])
linkTime = lineItems[2].split(' ')
linkTime = linkTime[0] # YYYY-MM-DD
# profile degrees k_
# in the snapshot just before T0
if linkTime < T0:
k_[node1] += 1
k_[node2] += 1
# record new links in the time window
if T1<=linkTime and linkTime<=T2:
newlinks.append((node1, node2))
# file read over
fr.close()
# profiling the degree distribution by
# counting the number of nodes of a specific degree
print "Profiling degree distribution..."
degrees = [0] * (max(k_)+1)
for kofi in k_:
degrees[kofi] += 1
# then degrees[k] == number of nodes of degree k
# write it to file
# this file records the number of nodes of degree k
# row l simply gives the number of nodes with degree l
outputFileName_degree = outputFileNamePrefix + "_degree.txt"
fw_deg = open(outputFileName_degree, "w")
print >>fw_deg, "num.of.nodes.with.degree"
for dgn in degrees:
print >>fw_deg, "%d" % (dgn)
fw_deg.close()
print "Iterating over %d new links added in the time window..." % (len(newlinks))
# k_ profiled
# iterate over links observed in [T1, T1+dT]
# and write k-dependencies to file
outputFileName_external = outputFileNamePrefix + "_external.txt"
outputFileName_internal = outputFileNamePrefix + "_internal.txt"
fw_ext = open(outputFileName_external, 'w')
print >>fw_ext, "k"
fw_int = open(outputFileName_internal, 'w')
print >>fw_int, "k0, k1"
for node1, node2 in newlinks:
k1 = k_[node1]
k2 = k_[node2]
if k1==0 and k2==0:
# 1 node after T1, 1 node added in [T0, T1]
continue
elif k1>0 and k2>0:
# internal link
print >>fw_int, "%d, %d" % (k1, k2)
else:
# external link
k0 = max(k1, k2) # the degree of ``old'' node
print >>fw_ext, "%d" % (k0)
# newlinks iterated
fw_int.close()
fw_ext.close()
# write T0, T1 and dT of this computation
# dT is in the unit of days
outputFileName_date = outputFileNamePrefix + "_date.txt"
fw_date = open(outputFileName_date, 'w')
print >>fw_date, "T0, T1, dT, T2"
print >>fw_date, "%s, %s, %d, %s" % (T0, T1, dT, T2)
fw_date.close()
print "Done"
print "External links dependencies saved to", outputFileName_external
print "Internal links dependencies saved to", outputFileName_internal
print "Degree counting saved to", outputFileName_degree
print "Time and time windows saved to", outputFileName_date
if __name__=="__main__":
T_vec = ["2006-01-01",
"2006-04-01",
"2006-07-01",
"2006-10-01",
"2007-01-01",
"2007-04-01",
"2007-07-01",
"2007-10-01"]
# T_vec = ["2006-07-01"]
InputFileName = "../RenRenData/friends_all.txt"
for T0 in T_vec:
# T1 = dateStringAdd(T0, 30)
T1 = T0
dT = 30 #(days)
OutputFileNamePrefix = "../RenRenData/results/measure_preferential_attachment/Stat_T0_%s_T1_%s" % (T0, T1)
print "Now:"
print "T0=", T0, "T1=", T1, "dT=", dT
print "Input:\n", InputFileName
print "Output:\n", OutputFileNamePrefix
print ""
measurePA(InputFileName, OutputFileNamePrefix, T0, T1, dT)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment