@psychemedia / gplusESPnet.py (created June 20, 2012)
ESP for Google+
# ABOUT:
# A script that grabs a list of the friends or followers of a user on Google+,
# grabs a sample of their friends, and generates the resulting social graph
# USAGE:
# Requirements: networkx (see DEPENDENCIES)
# Configuration: see CONFIGURATION
# Output: files will be saved to the reports directory
# To run the script:
# 1) Download this file to a new directory somewhere as eg gplusESPnet.py
# 2) cd to the directory
# 3) *The first time*, create two new subdirectories (reports and cache); for example, run the following from the command line: mkdir reports; mkdir cache
# 4) Call the script by running the following from the command line:
# python gplusESPnet.py
# DEPENDENCIES
# The script makes use of the networkx library; you should only need to install it once.
# To install networkx, from the command line type: easy_install networkx
# If that doesn't work, follow the instructions on http://networkx.lanl.gov/install.html
# In short: a) download and unzip http://networkx.lanl.gov/download/networkx/networkx-1.5.zip
# b) cd to the networkx-1.5 directory, c) type: python setup.py install
# END DEPENDENCIES
import networkx as nx
#--- the following should already be available
import urllib2,re
import md5,urllib,os,tempfile,time
import random
import datetime
import StringIO
#USER SETTINGS
#rootID is the Google+ ID of the person whose ESP net you want to map
rootID='100095426689697101649'
#You also need to provide the name of this user
name='Tony Hirst'
#----
# Do some checks...
def checkDir(dirpath):
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)
checkDir('reports')
checkDir('cache')
#---
oidRootNamePairs={rootID:name}
defCache=360000
typ='fo'
typ2='fr'
DG=nx.DiGraph()
reobj = re.compile(r'.*([0-9]{21}).*')
reobj2 = re.compile(r',\["([^"]*)".*')
reobj3=re.compile(r'.*[0-9]{21}"\]\n,\[\]\n,\["[^"]*')
#oids = reobj3.findall(data)
#for oid in oids:
#,[[,,"112696985248193005986"]\n,[]\n,["Dawn Wicks-Sutton
reobj4=re.compile(r',\[+,,"([0-9]{21})"]\n,\[\]\n,\["(.*)$')
#ascii(reobj4.match(oid).group(2)) is the name (check it's not ''; if it is, fall back to 'U N Owen'); reobj4.match(oid).group(1) is the ID
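# Illustrative check (not part of the script) of the ID-matching regex, using the sample
# record from the comment above: reobj.match(',[[,,"112696985248193005986"]').group(1)
# returns '112696985248193005986'.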
def ascii(s): return "".join(i for i in s if ord(i)<128)
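# For reference: ascii() simply drops non-ASCII bytes from scraped names, so a
# hypothetical ascii('Ren\xc3\xa9') returns 'Ren'.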
#Parse the next user record (21-digit ID plus display name) out of the response stream i,
#accumulating IDs in currIDs and id->name mappings in oidNames; the final return value is
#a flag that is -1 when the end of the list is reached, 1 otherwise.
def getoidName(i,currIDs,oidNames):
    l=i.next()
    #print l
    oid = reobj.match(l)
    #if we don't get an ID, then return oidNames, i, -1
    if oid==None:
        print 'at the end???'
        return i,currIDs,oidNames,-1
    else: oid=oid.group(1)
    if oid not in currIDs:
        #print 'toploop'
        i.next()
        n=i.next()
        n=ascii(reobj2.match(n).group(1))
        if oid not in oidNames:
            oidNames[oid]=n
        currIDs.append(oid)
        #print oid,n
        next=''
        while next!=',[]\n':
            next=i.next()
        #print '...'+next+',,,,'
        next=''
        while next!=']\n':
            next=i.next()
    else:
        print 'bottomloop'
        next=''
        while next!=']\n':
            next=i.next()
    return i,currIDs,oidNames,1
#Fetch the friends ('fr') or followers ('fo') list for a given Google+ ID and return the
#updated id->name dict plus the list of IDs found on this page of results.
def getoidNames(oidNames,oid='',typ='fr'):
    #oidNames = {}
    if oid=='': return oidNames,[]
    currIDs=[]
    #???I suspect this only does one page of up to 1000(?) users? Need to check?
    if typ=='fr':
        url='https://plus.google.com/u/0/_/socialgraph/lookup/visible/?o=%5Bnull%2Cnull%2C%22'+oid+'%22%5D&rt=j'
    elif typ=='fo':
        url='https://plus.google.com/u/0/_/socialgraph/lookup/incoming/?o=%5Bnull%2Cnull%2C%22'+oid+'%22%5D&n=1000&rt=j'
    else:
        exit(-1)
    print url
    #data = urllib.urlopen(url).read()
    data=getGenericCachedData(url)
    i=StringIO.StringIO(data)
    #skip the first three lines of the response before parsing user records
    i.next()
    i.next()
    i.next()
    #if flag returns <0, we're done
    flag=1
    while flag>0:
        i,currIDs,oidNames,flag=getoidName(i,currIDs,oidNames)
    #print currIDs,oidNames
    return oidNames,currIDs
#friends
#https://plus.google.com/u/0/_/socialgraph/lookup/visible/?o=%5Bnull%2Cnull%2C%22GOOGLEPLUSUSERID%22%5D&rt=j
#followers
#https://plus.google.com/u/0/_/socialgraph/lookup/incoming/?o=%5Bnull%2Cnull%2C%22GOOGLEPLUSUSERID%22%5D&n=1000&rt=j
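# (These are the same endpoints that getoidNames() constructs above, with the user's
# 21-digit numeric ID substituted for GOOGLEPLUSUSERID.)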
#----------------------------------------------------------------
#Yield successive n-sized chunks from l
def chunks(l, n):
    for i in xrange(0, len(l), n):
        yield l[i:i+n]
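# For reference (chunks() is defined here but not called elsewhere in this script):
# list(chunks([1,2,3,4,5], 2)) returns [[1, 2], [3, 4], [5]].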
def report(m, verbose=False):
    if verbose is True:
        print m
class DiskCacheFetcherfname:
    def __init__(self, cache_dir=None):
        # If no cache directory specified, use system temp directory
        if cache_dir is None:
            cache_dir = tempfile.gettempdir()
        self.cache_dir = cache_dir
    def fetch(self, url, max_age=0):
        # Use MD5 hash of the URL as the filename
        filename = md5.new(url).hexdigest()
        filepath = os.path.join(self.cache_dir, filename)
        if os.path.exists(filepath):
            if int(time.time()) - os.path.getmtime(filepath) < max_age:
                #return open(filepath).read()
                report("using "+filename+", cached copy of fetched url: "+url)
                return filepath
        report("fetching fresh copy of fetched url: "+url)
        # Retrieve over HTTP and cache, using rename to avoid collisions
        data = urllib.urlopen(url).read()
        fd, temppath = tempfile.mkstemp()
        fp = os.fdopen(fd, 'w')
        fp.write(data)
        fp.close()
        os.rename(temppath, filepath)
        return filepath
def getGenericCachedData(url, cachetime=defCache):
    fetcher=DiskCacheFetcherfname('cache')
    fn=fetcher.fetch(url, cachetime)
    f=open(fn)
    data=f.read()
    f.close()
    return data
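# Usage note: getGenericCachedData(url) returns the response body as a string, only
# re-fetching over HTTP if the copy cached in ./cache is older than cachetime seconds
# (defCache, i.e. 100 hours, by default).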
def addDirectedEdges(DG,fromNode,toSet,flip=False):
    for toNode in toSet:
        if flip==True:
            DG.add_edge(toNode,fromNode)
        else:
            DG.add_edge(fromNode,toNode)
    #print nx.info(DG)
    return DG
def labelNodes(G,names):
    for nodeID in G.node:
        G.node[nodeID]['label']=names[nodeID]
    return G
oidNamePairs={}
for id in oidRootNamePairs:
    oidNamePairs,currIDs=getoidNames(oidNamePairs,id,typ)
    print currIDs
    flip=(typ=='fr')
    DG=addDirectedEdges(DG, id, currIDs,flip=flip)
    n=len(currIDs)
    print str(n)
    c=1
    for cid in currIDs:
        print '\tSub-level run: getting ',typ2,str(c),'of',str(n),typ,cid
        oidNamePairs,ccurrIDs=getoidNames(oidNamePairs,cid,typ2)
        DG=addDirectedEdges(DG, cid, ccurrIDs)
        c=c+1
for id in oidRootNamePairs:
    if id not in oidNamePairs:
        oidNamePairs[id]=oidRootNamePairs[id]
DG=labelNodes(DG,oidNamePairs)
print nx.info(DG)
now = datetime.datetime.now()
ts = now.strftime("_%Y-%m-%d-%H-%M-%S")
fname=name.replace(' ','_')
nx.write_graphml(DG, '/'.join(['reports',fname+'_google'+typ+'Friends_'+ts+".graphml"]))
nx.write_edgelist(DG, '/'.join(['reports',fname+'_google'+typ+'Friends_'+ts+".txt"]),data=False)
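# With the settings above, the full graph is written to files named along the lines of
# reports/Tony_Hirst_googlefoFriends__2012-06-20-10-23-00.graphml (timestamp illustrative).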
def filterNet(DG,mindegree,indegree,outdegree,outdegreemax,typ,typ2,addUserFriendships,user,indegreemax):
    #need to tweak this to allow filtering by in and out degree?
    if addUserFriendships==1:
        DG=addFocus(DG,user,typ)
    #handle min,in,out degree
    filter=[]
    #filter=[n for n in DG if DG.degree(n)>=mindegree]
    for n in DG:
        if outdegreemax==None or DG.out_degree(n)<=outdegreemax:
            if mindegree!=None:
                if DG.degree(n)>=mindegree:
                    filter.append(n)
            else:
                if indegree!=None:
                    if DG.in_degree(n)>=indegree:
                        filter.append(n)
                if outdegree!=None:
                    if DG.out_degree(n)>=outdegree:
                        filter.append(n)
    #the filter represents the intersect of the *degreesets
    #indegree and outdegree values are ignored if mindegree is set
    filter=set(filter)
    H=DG.subgraph(filter)
    #Superstitiously, perhaps, make sure we only grab nodes that project edges...
    filter= [n for n in H if H.degree(n)>0]
    L=H.subgraph(filter)
    #print "Filter set:",filter
    print L.order(),L.size()
    #L=labelGraph(L,filter)
    if mindegree==None: tm='X'
    else: tm=str(mindegree)
    if indegree==None: ti='X'
    else: ti=str(indegree)
    if outdegree==None: to='X'
    else: to=str(outdegree)
    if outdegreemax==None: tom='X'
    else: tom=str(outdegreemax)
    st='/'.join([projname,name+'_google'+typ+typ2+'degree_'+tm+'_'+ti+'_'+to+'_'+tom+"_esp"])
    print nx.info(L)
    nx.write_graphml(L, st+".graphml")
    nx.write_edgelist(L, st+".txt",data=False)
mindegree=None
indegree=20
outdegree=25
outdegreemax=None
addUserFriendships=0
user=''
indegreemax=None
projname='reports/'
filterNet(DG,mindegree,indegree,outdegree,outdegreemax,typ,typ2,addUserFriendships,user,indegreemax)
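# With the settings above, filterNet() keeps nodes with in-degree >= 20 or out-degree >= 25
# (mindegree is unset), drops any resulting isolates, and writes the filtered graph to the
# reports directory as .graphml and .txt edgelist files.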