richardkwo/gist:78a0de320b836960e255

## gistfile1.py
# -*- coding: utf-8 -*-
"""
Created on Mon Jan 14 19:54:29 2013

This script measures the function preferential attachment
from evlutionary networks data.

= input file format
node1 <tab> node2 <tab> YYYY-MM-DD HH:MM:SS

Three columns, seperated by tab. The last column is written in ISO time format.
(The ISO time format can be sorted with lexicographical order without conversion)

= algorithm

The method is from EPL 61 (4) ``Measuring perferential attachement in evolving
networks'' by H. Jeong et al.

The program seeks to estimate the function of PI(k, T0, T1) from network data.

== parameters
T0: time point for ``old'' network

[T1, T1+dT]: time slice during which newly added links are recorded, treated as
the time "point" for ``new'' network

NOTE: we use the time resolution of "1 day".

== method
The algorithm first scans the network once, during which it profiles:
    (1) the links established during [T1, T1+dT]
    (2) degrees of all nodes up to time T0

Then, for every link (i, j) added between T1 and T1+dT, ki and kj are retrieved.
Two types are devided and written to file separately:

    (1) ki=0 or kj=0, external link, which refers to a link that brings a new
    node added to the network. Say i is the old node (then kj=0), then ki
    at time T0 is taken down (the count for k1 is incremented), finally forming
    a histogram which gives the function PI(k, T0, T1)

    NOTE: it is possible that the ``old'' node is introduced in [T0, T1],
    making both ki=0 and kj=0. This should be discarded.

    (2) ki!=0 and kj!=0, internal link, which refers to a link established
    between two friends who have not associated each other yet. We take down
    (ki, kj) for estimating the functiom PI(ki*kj, T0, T1)

@author: richard
"""

import dataMISC
import datetime
import sys

minNode = 1
maxNode = 12000000

def dateStringAdd(T0, days):
    '''Return the date after "days" days of T0

    Input arguments

        T0 is an ISO date in the format of "YYYY-MM-DD"

        days in the number of days (int) added to T0

    Output

        Return T1 = T0 + days in "YYYY-MM-DD" format
    '''

    T0_dt = datetime.datetime.strptime(T0, "%Y-%m-%d")
    T1_dt = T0_dt + datetime.timedelta(days=days)
    T1 = T1_dt.strftime("%Y-%m-%d")
    return T1

def measureDegreeIncrement(inputFileName, outputFileName, T0, dT):
    '''Measure the degree added during the time window [T0, T0+dT] as
    a function of the degrees already formed just before the time T0.

    Input arguments

        T0 is the time starting the time window. By scanning the network
        formed just before T0, we profile the vector for "old" degrees.

        dT is the length of time window

        inputFileName is the file of network edgelist SORTED IN
        INCREASING ORDER OF DATE TIME

    Output

         CSV file with every row in the format of:
             k, k.increment

         A companion file with name outputFileName + ".date" records
         the time setting for computation, in the row format of:
             time.start, time.end, dT

    Methods

        1. Scan the network and record
            (1) just before the time window, we take down k_ as the
            list of degree for every node

            (2) just as T0 arrives, we make a copy of k_ in the name of
            k_new.

            (3) scanning through the time window and record the newly
            created links fallen into the window. For every edge, we
            record the nodes it involved and add to the SET nodeSetInNewLinks,
            and increment the degrees of those nodes in the vector k_new

            (4) iterate i over nodeSetInNewLinks, and write
            (k_(i), k_new(i)-k_(i)) to output file
    '''
    # k_[i] is the degree of node i
    k_ = [0] * maxNode
    k_new = k_[:]

    # T1 = T0 + dT
    # time window [T0, T1]
    T1 = dateStringAdd(T0, dT)

    # dict used as a set for keeping nodes involved in links
    # created during the time window
    nodeSetInNewLinks = {}

    totalLine = 409135012
    currentLine = 0

    print "Reading file %s, %d lines in total..." % (inputFileName, totalLine)
    print "NOTE: The input file must be SORTED in ascending order of date and time\n"

    fr = open(inputFileName, 'r')
    for line in fr:
        currentLine += 1
        dataMISC.reportProgress(currentLine, totalLine, 200, "profiling ")
        line = line.rstrip()
        lineItems = line.split('\t')
        # nodes in integer numbering
        node1 = int(lineItems[0])
        node2 = int(lineItems[1])
        linkTime = lineItems[2].split(' ')
        linkTime = linkTime[0]  # YYYY-MM-DD


        # profile degrees k_
        # in the snapshot just before T0
        if linkTime < T0:
            # before T0
            k_[node1] += 1
            k_[node2] += 1
            k_new[node1] += 1
            k_new[node2] += 1
        elif linkTime <= T1:
            # in the time window [T0, T0+T1]
            # add nodes involved to nodeSetInNewLinks
            nodeSetInNewLinks[node1] = 0
            nodeSetInNewLinks[node2] = 0
            # increment the degree of node1 and node2 in k_new
            k_new[node1] += 1
            k_new[node2] += 1
        else:
            # now we jump out of the time window
            # no need for further scanning
            break
    # file read over
    fr.close()

    # iterate over nodes involved in newly added links in the time window
    fw = open(outputFileName, "w")
    print >>fw, "k, k.increment"
    for i in nodeSetInNewLinks.keys():
        print >>fw, "%d, %d" % (k_[i], k_new[i]-k_[i])
    fw.close()
    outputFileName_date = outputFileName + ".date"
    fw_date = open(outputFileName_date, "w")
    print >>fw_date, "time.start, time.end, dT"
    print >>fw_date, "%s, %s, %d" % (T0, T1, dT)
    fw_date.close()

    print "Done."
    print "Outputfile written to", outputFileName
    print "Date settings written to", outputFileName_date


def measurePA(inputFileName, outputFileNamePrefix, T0, T1, dT):
    '''Measure the preferential attachement by evolutionary network data.

    inputFileName speicifies the file of network data, in the row format of
    node1_int <tab> node2_int <tab> YYYY-MM-DD HH:MM:SS

    Input arguments

        T0 is the time for ``old'' network, in ``YYYY-MM-DD'' format

        T1 is the time for observing newly-added links,
        in ``YYYY-MM-DD'' format. T0 <= T1.

        dT is the interval, making [T1, T1+dT] the actual time window for
        observing added links

    Output files

        outputFileNamePrefix + "_external.txt" records the degree of the
        "attached" node in the snapshot of network just before T0, with row
        format:
            k

        outputFileNamePrefix + "_internal.txt" records the degree of the
        nodes who were included in networks just before T0 but make the
        first link in the time window, wiht row format:
            k0, k1

        outputFileNamePrefix + "_degree.txt" records the number of nodes
        with a degree. The row k gives the number of nodes with degree k
        in the snapshot just before T0. Row format:
            num.of.nodes.with.degree

        outputFileNamePrefix + "_date.txt" records T0, T1, dT of this set
        of output. Row format:
            T0, T1, dT, T2

    '''

    # k_[i] is the degree of node i
    k_ = [0] * maxNode

    # T2 = T1 + dT
    T1_dt = datetime.datetime.strptime(T1, "%Y-%m-%d")
    T2_dt = T1_dt + datetime.timedelta(days=dT)
    T2 = T2_dt.strftime("%Y-%m-%d") # [T1, T2] would be the window

    # list for maintaining new links fallen into the time window
    newlinks = []

    # maintain the counting of number of nodes of a specific degree
    degrees = []

    totalLine = 409135012
    currentLine = 0
    print "Reading file %s, %d lines in total..." % (inputFileName, totalLine)

    fr = open(inputFileName, 'r')
    for line in fr:
        currentLine += 1
        dataMISC.reportProgress(currentLine, totalLine, 200, "profiling ")
        line = line.rstrip()
        lineItems = line.split('\t')
        # nodes in integer numbering
        node1 = int(lineItems[0])
        node2 = int(lineItems[1])
        linkTime = lineItems[2].split(' ')
        linkTime = linkTime[0]  # YYYY-MM-DD


        # profile degrees k_
        # in the snapshot just before T0
        if linkTime < T0:
            k_[node1] += 1
            k_[node2] += 1

        # record new links in the time window
        if T1<=linkTime and linkTime<=T2:
            newlinks.append((node1, node2))

    # file read over
    fr.close()

    # profiling the degree distribution by
    # counting the number of nodes of a specific degree
    print "Profiling degree distribution..."
    degrees = [0] * (max(k_)+1)
    for kofi in k_:
        degrees[kofi] += 1
        # then degrees[k] == number of nodes of degree k
    # write it to file
    # this file records the number of nodes of degree k
    # row l simply gives the number of nodes with degree l
    outputFileName_degree = outputFileNamePrefix + "_degree.txt"
    fw_deg = open(outputFileName_degree, "w")
    print >>fw_deg, "num.of.nodes.with.degree"
    for dgn in degrees:
        print >>fw_deg, "%d" % (dgn)
    fw_deg.close()

    print "Iterating over %d new links added in the time window..." % (len(newlinks))
    # k_ profiled
    # iterate over links observed in [T1, T1+dT]
    # and write k-dependencies to file
    outputFileName_external = outputFileNamePrefix + "_external.txt"
    outputFileName_internal = outputFileNamePrefix + "_internal.txt"
    fw_ext = open(outputFileName_external, 'w')
    print >>fw_ext, "k"
    fw_int = open(outputFileName_internal, 'w')
    print >>fw_int, "k0, k1"
    for node1, node2 in newlinks:
        k1 = k_[node1]
        k2 = k_[node2]
        if k1==0 and k2==0:
            # 1 node after T1, 1 node added in [T0, T1]
            continue
        elif k1>0 and k2>0:
            # internal link
            print >>fw_int, "%d, %d" % (k1, k2)
        else:
            # external link
            k0 = max(k1, k2)    # the degree of ``old'' node
            print >>fw_ext, "%d" % (k0)
    # newlinks iterated
    fw_int.close()
    fw_ext.close()
    # write T0, T1 and dT of this computation
    # dT is in the unit of days
    outputFileName_date = outputFileNamePrefix + "_date.txt"
    fw_date = open(outputFileName_date, 'w')
    print >>fw_date, "T0, T1, dT, T2"
    print >>fw_date, "%s, %s, %d, %s" % (T0, T1, dT, T2)
    fw_date.close()

    print "Done"
    print "External links dependencies saved to", outputFileName_external
    print "Internal links dependencies saved to", outputFileName_internal
    print "Degree counting saved to", outputFileName_degree
    print "Time and time windows saved to", outputFileName_date


if __name__=="__main__":

    T_vec = ["2006-01-01",
             "2006-04-01",
             "2006-07-01",
             "2006-10-01",
             "2007-01-01",
             "2007-04-01",
             "2007-07-01",
             "2007-10-01"]


#    T_vec = ["2006-07-01"]

    InputFileName = "../RenRenData/friends_all.txt"

    for T0 in T_vec:
#        T1 = dateStringAdd(T0, 30)
        T1 = T0
        dT = 30 #(days)
        OutputFileNamePrefix = "../RenRenData/results/measure_preferential_attachment/Stat_T0_%s_T1_%s" % (T0, T1)
        print "Now:"
        print "T0=", T0, "T1=", T1, "dT=", dT
        print "Input:\n", InputFileName
        print "Output:\n", OutputFileNamePrefix
        print ""
        measurePA(InputFileName, OutputFileNamePrefix, T0, T1, dT)
	# -- coding: utf-8 --
	"""
	Created on Mon Jan 14 19:54:29 2013

	This script measures the function preferential attachment
	from evlutionary networks data.

	= input file format
	node1 <tab> node2 <tab> YYYY-MM-DD HH:MM:SS

	Three columns, seperated by tab. The last column is written in ISO time format.
	(The ISO time format can be sorted with lexicographical order without conversion)

	= algorithm

	The method is from EPL 61 (4) ``Measuring perferential attachement in evolving
	networks'' by H. Jeong et al.

	The program seeks to estimate the function of PI(k, T0, T1) from network data.

	== parameters
	T0: time point for ``old'' network

	[T1, T1+dT]: time slice during which newly added links are recorded, treated as
	the time "point" for ``new'' network

	NOTE: we use the time resolution of "1 day".

	== method
	The algorithm first scans the network once, during which it profiles:
	(1) the links established during [T1, T1+dT]
	(2) degrees of all nodes up to time T0

	Then, for every link (i, j) added between T1 and T1+dT, ki and kj are retrieved.
	Two types are devided and written to file separately:

	(1) ki=0 or kj=0, external link, which refers to a link that brings a new
	node added to the network. Say i is the old node (then kj=0), then ki
	at time T0 is taken down (the count for k1 is incremented), finally forming
	a histogram which gives the function PI(k, T0, T1)

	NOTE: it is possible that the ``old'' node is introduced in [T0, T1],
	making both ki=0 and kj=0. This should be discarded.

	(2) ki!=0 and kj!=0, internal link, which refers to a link established
	between two friends who have not associated each other yet. We take down
	(ki, kj) for estimating the functiom PI(ki*kj, T0, T1)

	@author: richard
	"""

	import dataMISC
	import datetime
	import sys

	minNode = 1
	maxNode = 12000000

	def dateStringAdd(T0, days):
	'''Return the date after "days" days of T0

	Input arguments

	T0 is an ISO date in the format of "YYYY-MM-DD"

	days in the number of days (int) added to T0

	Output

	Return T1 = T0 + days in "YYYY-MM-DD" format
	'''

	T0_dt = datetime.datetime.strptime(T0, "%Y-%m-%d")
	T1_dt = T0_dt + datetime.timedelta(days=days)
	T1 = T1_dt.strftime("%Y-%m-%d")
	return T1

	def measureDegreeIncrement(inputFileName, outputFileName, T0, dT):
	'''Measure the degree added during the time window [T0, T0+dT] as
	a function of the degrees already formed just before the time T0.

	Input arguments

	T0 is the time starting the time window. By scanning the network
	formed just before T0, we profile the vector for "old" degrees.

	dT is the length of time window

	inputFileName is the file of network edgelist SORTED IN
	INCREASING ORDER OF DATE TIME

	Output

	CSV file with every row in the format of:
	k, k.increment

	A companion file with name outputFileName + ".date" records
	the time setting for computation, in the row format of:
	time.start, time.end, dT

	Methods

	1. Scan the network and record
	(1) just before the time window, we take down k_ as the
	list of degree for every node

	(2) just as T0 arrives, we make a copy of k_ in the name of
	k_new.

	(3) scanning through the time window and record the newly
	created links fallen into the window. For every edge, we
	record the nodes it involved and add to the SET nodeSetInNewLinks,
	and increment the degrees of those nodes in the vector k_new

	(4) iterate i over nodeSetInNewLinks, and write
	(k_(i), k_new(i)-k_(i)) to output file
	'''
	# k_[i] is the degree of node i
	k_ = [0] * maxNode
	k_new = k_[:]

	# T1 = T0 + dT
	# time window [T0, T1]
	T1 = dateStringAdd(T0, dT)

	# dict used as a set for keeping nodes involved in links
	# created during the time window
	nodeSetInNewLinks = {}

	totalLine = 409135012
	currentLine = 0

	print "Reading file %s, %d lines in total..." % (inputFileName, totalLine)
	print "NOTE: The input file must be SORTED in ascending order of date and time\n"

	fr = open(inputFileName, 'r')
	for line in fr:
	currentLine += 1
	dataMISC.reportProgress(currentLine, totalLine, 200, "profiling ")
	line = line.rstrip()
	lineItems = line.split('\t')
	# nodes in integer numbering
	node1 = int(lineItems[0])
	node2 = int(lineItems[1])
	linkTime = lineItems[2].split(' ')
	linkTime = linkTime[0] # YYYY-MM-DD


	# profile degrees k_
	# in the snapshot just before T0
	if linkTime < T0:
	# before T0
	k_[node1] += 1
	k_[node2] += 1
	k_new[node1] += 1
	k_new[node2] += 1
	elif linkTime <= T1:
	# in the time window [T0, T0+T1]
	# add nodes involved to nodeSetInNewLinks
	nodeSetInNewLinks[node1] = 0
	nodeSetInNewLinks[node2] = 0
	# increment the degree of node1 and node2 in k_new
	k_new[node1] += 1
	k_new[node2] += 1
	else:
	# now we jump out of the time window
	# no need for further scanning
	break
	# file read over
	fr.close()

	# iterate over nodes involved in newly added links in the time window
	fw = open(outputFileName, "w")
	print >>fw, "k, k.increment"
	for i in nodeSetInNewLinks.keys():
	print >>fw, "%d, %d" % (k_[i], k_new[i]-k_[i])
	fw.close()
	outputFileName_date = outputFileName + ".date"
	fw_date = open(outputFileName_date, "w")
	print >>fw_date, "time.start, time.end, dT"
	print >>fw_date, "%s, %s, %d" % (T0, T1, dT)
	fw_date.close()

	print "Done."
	print "Outputfile written to", outputFileName
	print "Date settings written to", outputFileName_date


	def measurePA(inputFileName, outputFileNamePrefix, T0, T1, dT):
	'''Measure the preferential attachement by evolutionary network data.

	inputFileName speicifies the file of network data, in the row format of
	node1_int <tab> node2_int <tab> YYYY-MM-DD HH:MM:SS

	Input arguments

	T0 is the time for ``old'' network, in ``YYYY-MM-DD'' format

	T1 is the time for observing newly-added links,
	in ``YYYY-MM-DD'' format. T0 <= T1.

	dT is the interval, making [T1, T1+dT] the actual time window for
	observing added links

	Output files

	outputFileNamePrefix + "_external.txt" records the degree of the
	"attached" node in the snapshot of network just before T0, with row
	format:
	k

	outputFileNamePrefix + "_internal.txt" records the degree of the
	nodes who were included in networks just before T0 but make the
	first link in the time window, wiht row format:
	k0, k1

	outputFileNamePrefix + "_degree.txt" records the number of nodes
	with a degree. The row k gives the number of nodes with degree k
	in the snapshot just before T0. Row format:
	num.of.nodes.with.degree

	outputFileNamePrefix + "_date.txt" records T0, T1, dT of this set
	of output. Row format:
	T0, T1, dT, T2

	'''

	# k_[i] is the degree of node i
	k_ = [0] * maxNode

	# T2 = T1 + dT
	T1_dt = datetime.datetime.strptime(T1, "%Y-%m-%d")
	T2_dt = T1_dt + datetime.timedelta(days=dT)
	T2 = T2_dt.strftime("%Y-%m-%d") # [T1, T2] would be the window

	# list for maintaining new links fallen into the time window
	newlinks = []

	# maintain the counting of number of nodes of a specific degree
	degrees = []

	totalLine = 409135012
	currentLine = 0
	print "Reading file %s, %d lines in total..." % (inputFileName, totalLine)

	fr = open(inputFileName, 'r')
	for line in fr:
	currentLine += 1
	dataMISC.reportProgress(currentLine, totalLine, 200, "profiling ")
	line = line.rstrip()
	lineItems = line.split('\t')
	# nodes in integer numbering
	node1 = int(lineItems[0])
	node2 = int(lineItems[1])
	linkTime = lineItems[2].split(' ')
	linkTime = linkTime[0] # YYYY-MM-DD


	# profile degrees k_
	# in the snapshot just before T0
	if linkTime < T0:
	k_[node1] += 1
	k_[node2] += 1

	# record new links in the time window
	if T1<=linkTime and linkTime<=T2:
	newlinks.append((node1, node2))

	# file read over
	fr.close()

	# profiling the degree distribution by
	# counting the number of nodes of a specific degree
	print "Profiling degree distribution..."
	degrees = [0] * (max(k_)+1)
	for kofi in k_:
	degrees[kofi] += 1
	# then degrees[k] == number of nodes of degree k
	# write it to file
	# this file records the number of nodes of degree k
	# row l simply gives the number of nodes with degree l
	outputFileName_degree = outputFileNamePrefix + "_degree.txt"
	fw_deg = open(outputFileName_degree, "w")
	print >>fw_deg, "num.of.nodes.with.degree"
	for dgn in degrees:
	print >>fw_deg, "%d" % (dgn)
	fw_deg.close()

	print "Iterating over %d new links added in the time window..." % (len(newlinks))
	# k_ profiled
	# iterate over links observed in [T1, T1+dT]
	# and write k-dependencies to file
	outputFileName_external = outputFileNamePrefix + "_external.txt"
	outputFileName_internal = outputFileNamePrefix + "_internal.txt"
	fw_ext = open(outputFileName_external, 'w')
	print >>fw_ext, "k"
	fw_int = open(outputFileName_internal, 'w')
	print >>fw_int, "k0, k1"
	for node1, node2 in newlinks:
	k1 = k_[node1]
	k2 = k_[node2]
	if k1==0 and k2==0:
	# 1 node after T1, 1 node added in [T0, T1]
	continue
	elif k1>0 and k2>0:
	# internal link
	print >>fw_int, "%d, %d" % (k1, k2)
	else:
	# external link
	k0 = max(k1, k2) # the degree of ``old'' node
	print >>fw_ext, "%d" % (k0)
	# newlinks iterated
	fw_int.close()
	fw_ext.close()
	# write T0, T1 and dT of this computation
	# dT is in the unit of days
	outputFileName_date = outputFileNamePrefix + "_date.txt"
	fw_date = open(outputFileName_date, 'w')
	print >>fw_date, "T0, T1, dT, T2"
	print >>fw_date, "%s, %s, %d, %s" % (T0, T1, dT, T2)
	fw_date.close()

	print "Done"
	print "External links dependencies saved to", outputFileName_external
	print "Internal links dependencies saved to", outputFileName_internal
	print "Degree counting saved to", outputFileName_degree
	print "Time and time windows saved to", outputFileName_date


	if __name__=="__main__":

	T_vec = ["2006-01-01",
	"2006-04-01",
	"2006-07-01",
	"2006-10-01",
	"2007-01-01",
	"2007-04-01",
	"2007-07-01",
	"2007-10-01"]


	# T_vec = ["2006-07-01"]

	InputFileName = "../RenRenData/friends_all.txt"

	for T0 in T_vec:
	# T1 = dateStringAdd(T0, 30)
	T1 = T0
	dT = 30 #(days)
	OutputFileNamePrefix = "../RenRenData/results/measure_preferential_attachment/Stat_T0_%s_T1_%s" % (T0, T1)
	print "Now:"
	print "T0=", T0, "T1=", T1, "dT=", dT
	print "Input:\n", InputFileName
	print "Output:\n", OutputFileNamePrefix
	print ""
	measurePA(InputFileName, OutputFileNamePrefix, T0, T1, dT)