privong/twitter_analyze.py

## twitter_analyze.py
#!/usr/bin/python
#
# Usage:
# twitter_analyze.py file1.xml [file2.xml]
#
# first file required, second file optional (will overplot and give legends)
# Output files are file1-min.png, file1-sec.png, file1-HR_hist.png

from xml.dom import minidom
from datetime import datetime
import sys
import numpy
import math
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab

# Set to True further down when a second XML file is named on the command line.
second=False

# Month abbreviation -> month number, used to turn the month field of a
# 'created_at' timestamp into an argument for datetime().
months={'Jan':1, 'Feb':2, 'Mar':3, 'Apr':4, 'May':5, 'Jun':6, 'Jul':7, 'Aug':8, 'Sep':9, 'Oct':10, 'Nov':11, 'Dec':12}

# The first file is mandatory: stop with a usage message instead of an
# IndexError traceback when it is missing.
if (len(sys.argv)<2):
  sys.exit("Usage: %s file1.xml [file2.xml]" % (sys.argv[0]))

print("Loading and parsing tweets from %s." % (sys.argv[1]))
twixml=minidom.parse(sys.argv[1])
twitimes=twixml.getElementsByTagName('created_at')
# Only every other 'created_at' element is analyzed later on, so the tweet
# count reported here is half the number of elements found.
print("Found %i tweets" % (len(twitimes)//2))
# name[0] (the text before the first '.') labels the plots and prefixes the
# names of the output files.
name=sys.argv[1].split('.')

# Optional second file: loaded the same way and overplotted later.
if (len(sys.argv)>2):
  second=True
  print("Loading and parsing tweets from %s." % (sys.argv[2]))
  twixml2=minidom.parse(sys.argv[2])
  twitimes2=twixml2.getElementsByTagName('created_at')
  print("Found %i tweets" % (len(twitimes2)//2))
  name2=sys.argv[2].split('.')


# Walk the timestamps of the first file and collect the time between
# consecutive tweets plus the hour of day at which each tweet was posted.
prevt=0   # previously accepted timestamp; stays 0 until the first one is seen
count=0   # running index over the 'created_at' elements
tint=[]   # every non-zero interval, in minutes
tint2=[]  # non-zero intervals below 1500 minutes
tint3=[]  # non-zero intervals below 300 minutes, stored in seconds
hlist=[]  # fractional hour of day of each tweet after the first
# NOTE(review): earlier comments described the 1500/300 limits as seconds,
# but the comparison is made on the interval expressed in minutes -- confirm
# which unit was intended.
for stamp in twitimes:
  wanted=((count % 2)==0)
  count+=1
  # Only the even-numbered elements are analyzed (presumably the odd ones
  # are a second timestamp belonging to the same tweet -- verify).
  if not wanted:
    continue
  # Space-separated fields of the timestamp text: 1 = month abbreviation,
  # 2 = day, 3 = hh:mm:ss, 4 = UTC offset, 5 = year.
  fields=(stamp.firstChild.data).split(' ')
  clock=fields[3].split(':')
  # make sure we're still on UT time
  if (fields[4]!='+0000'):
    print("Not UT, ignoring...")
    continue
  if (prevt==0):
    # nothing to difference against yet: just remember this timestamp
    print("First timestamp")
    prevt=datetime(int(fields[5]),months[fields[1]],int(fields[2]),int(clock[0]),int(clock[1]),int(clock[2]))
    continue
  nowt=datetime(int(fields[5]),months[fields[1]],int(fields[2]),int(clock[0]),int(clock[1]),int(clock[2]))
  # previous minus current: positive when the elements run newest-first
  gap=prevt-nowt
  minutes=gap.total_seconds()/60.
  hlist.append(nowt.hour+nowt.minute/60.)
  # zero-length intervals are left out of all three interval lists
  if (minutes!=0):
    tint.append(minutes)
    if (minutes < 1500.):
      tint2.append(minutes)
    if (minutes < 300.):
      tint3.append(minutes*60)
  prevt=nowt

# If a second file was given, analyze it the same way as the first one,
# filling its own lists.
if (second):
  tint21=[]  # every non-zero interval, in minutes
  tint22=[]  # non-zero intervals below 1500 minutes
  tint23=[]  # non-zero intervals below 300 minutes, stored in seconds
  hlist2=[]  # fractional hour of day of each tweet after the first
  # Start from a clean state.  Without this reset the element parity and the
  # last timestamp left over from the first file leak into this loop: the
  # wrong 'created_at' elements can be selected and a spurious interval
  # spanning the two files is recorded.
  prevt=0
  count=0
  for stamp in twitimes2:
    wanted=((count % 2)==0)
    count+=1
    # only the even-numbered elements are analyzed
    if not wanted:
      continue
    # Space-separated fields of the timestamp text: 1 = month abbreviation,
    # 2 = day, 3 = hh:mm:ss, 4 = UTC offset, 5 = year.
    fields=(stamp.firstChild.data).split(' ')
    clock=fields[3].split(':')
    # make sure we're still on UT time
    if (fields[4]!='+0000'):
      print("Not UT, ignoring...")
      continue
    if (prevt==0):
      # nothing to difference against yet: just remember this timestamp
      print("First timestamp")
      prevt=datetime(int(fields[5]),months[fields[1]],int(fields[2]),int(clock[0]),int(clock[1]),int(clock[2]))
      continue
    nowt=datetime(int(fields[5]),months[fields[1]],int(fields[2]),int(clock[0]),int(clock[1]),int(clock[2]))
    # previous minus current: positive when the elements run newest-first
    gap=prevt-nowt
    minutes=gap.total_seconds()/60.
    hlist2.append(nowt.hour+nowt.minute/60.)
    # zero-length intervals are left out of all three interval lists
    if (minutes!=0):
      tint21.append(minutes)
      if (minutes < 1500.):
        tint22.append(minutes)
      if (minutes < 300.):
        tint23.append(minutes*60)
    prevt=nowt


# Figure 1: two stacked histograms of the time between tweets, in minutes.
# The upper panel shows all intervals (tint), the lower panel the shorter
# ones collected in tint2.  Saved as <name>-min.png.
fig=plt.figure()
hist_opts={'bins':20,'log':True,'normed':True}

# upper panel: every interval
ax=fig.add_subplot(211)
full_range=(0,max(tint))
n,bins,patches=ax.hist(tint,range=full_range,label=name[0],**hist_opts)
title=name[0]
if (second):
  # the second file is binned over the range of the first one
  n21,bins21,patches21=ax.hist(tint21,range=full_range,label=name2[0],**hist_opts)
  title=title+', '+name2[0]
fig.suptitle(title+' (Minutes)')
ax.set_xlabel('Time Between Tweets (minutes)')
ax.set_ylabel('Fraction of Tweets')
ax.legend()

# lower panel: the tint2 subset, no legend
ax=fig.add_subplot(212)
short_range=(0,max(tint2))
n2,bins2,patches2=ax.hist(tint2,range=short_range,**hist_opts)
if (second):
  n22,bins22,patches22=ax.hist(tint22,range=short_range,**hist_opts)
ax.set_xlabel('Time Between Tweets (minutes)')
ax.set_ylabel('Fraction of Tweets')
plt.savefig(name[0]+'-min.png',format='png')

# Figure 2: histogram of the short intervals in tint3 (seconds) with a
# power-law fit drawn on top.  Saved as <name>-sec.png.
fig=plt.figure()
ax=fig.add_subplot(111)
sec_range=(0,max(tint3))
n3,bins3,patches3=ax.hist(tint3,bins=20,range=sec_range,log=True,normed=True,label=name[0])

# centre of every bin, from consecutive pairs of bin edges
centres=[lo+(hi-lo)/2. for lo,hi in zip(bins3[:-1],bins3[1:])]

# Fit a straight line to (log10 centre, log10 height).  Empty bins are
# skipped because log10(0) is undefined.
filled=[(math.log10(c),math.log10(h)) for c,h in zip(centres,n3) if h!=0]
midpt=[pt[0] for pt in filled]
nn=[pt[1] for pt in filled]
p=numpy.polyfit(midpt,nn,1)

# Evaluate the fit in linear space at every bin centre:
# y = 10**intercept * x**slope
midpt=centres
y=[(10**p[1])*(c**p[0]) for c in midpt]
plt.plot(midpt,y,label=r"Power law fit, $\gamma$"+str(p[0]))

title=name[0]
if (second):
  # the second file is binned over the range of the first one
  n23,bins23,patches23=ax.hist(tint23,bins=20,range=sec_range,log=True,normed=True,label=name2[0])
  title=title+', '+name2[0]
plt.suptitle(title+' (Seconds)')
ax.set_xlabel('Time Between Tweets (seconds)')
plt.legend()
ax.set_ylabel('Fraction of Tweets')
plt.savefig(name[0]+'-sec.png',format='png')

# Figure 3: normalized histogram of the hour of day at which tweets were
# posted, in 24 one-hour bins.  Saved as <name>-HR_hist.png.
fig=plt.figure()
ax=fig.add_subplot(111)
hour_opts={'bins':24,'range':(0,24),'log':False,'normed':True}
nh,binsh,patchesh=ax.hist(hlist,label=name[0],**hour_opts)
if (second):
  nh2,binsh2,patchesh2=ax.hist(hlist2,label=name2[0],**hour_opts)
ax.set_xlabel('Hour of the Day (UTC)')
ax.set_ylabel('Fraction of Tweets')
ax.legend()
plt.savefig(name[0]+'-HR_hist.png',format='png')
	#!/usr/bin/python
	#
	# Usage:
	# twitter_analyze.py file1.xml [file2.xml]
	#
	# first file required, second file optional (will overplot and give legends)
	# Output files are file1-min.png, file1-sec.png, file1-HR_hist.png

	from xml.dom import minidom
	from datetime import datetime
	import sys
	import numpy
	import math
	import matplotlib.pyplot as plt
	import matplotlib.mlab as mlab

	# Tweet-interval analysis script.  Parses the 'created_at' timestamps of
	# one or two XML files and writes three figures: interval histograms in
	# minutes, a short-interval histogram in seconds with a power-law fit, and
	# a histogram of posting hour.
	# Set to True below when a second XML file is named on the command line.
	second=False

	# Month abbreviation -> month number for building datetime objects.
	months={'Jan':1, 'Feb':2, 'Mar':3, 'Apr':4, 'May':5, 'Jun':6, 'Jul':7, 'Aug':8, 'Sep':9, 'Oct':10, 'Nov':11, 'Dec':12}

	print("Loading and parsing tweets from %s." % (sys.argv[1]))
	twixml=minidom.parse(sys.argv[1])
	twitimes=twixml.getElementsByTagName('created_at')
	# only every other 'created_at' element is analyzed, hence the halving
	print("Found %i tweets" % (len(twitimes)//2))
	# name[0] labels the plots and prefixes the output file names
	name=sys.argv[1].split('.')

	if (len(sys.argv)>2):
	  second=True
	  print("Loading and parsing tweets from %s." % (sys.argv[2]))
	  twixml2=minidom.parse(sys.argv[2])
	  twitimes2=twixml2.getElementsByTagName('created_at')
	  print("Found %i tweets" % (len(twitimes2)//2))
	  name2=sys.argv[2].split('.')


	prevt=0   # previously accepted timestamp; 0 until the first one is seen
	count=0   # running index over the 'created_at' elements
	tint=[]   # every non-zero interval, in minutes
	tint2=[]  # non-zero intervals below 1500 minutes
	tint3=[]  # non-zero intervals below 300 minutes, stored in seconds
	hlist=[]  # fractional hour of day of each tweet after the first
	# NOTE(review): the 1500/300 limits were described as seconds, but they
	# are compared against the interval in minutes -- confirm the intent.
	for i in twitimes:
	  # only the even-numbered elements are analyzed
	  if ((count % 2)==0):
	    # fields: 1 = month abbreviation, 2 = day, 3 = hh:mm:ss,
	    # 4 = UTC offset, 5 = year
	    j=(i.firstChild.data).split(' ')
	    l=j[3].split(':')
	    # make sure we're still on UT time
	    if (j[4]!='+0000'):
	      print("Not UT, ignoring...")
	    else:
	      if (prevt!=0):
	        nowt=datetime(int(j[5]),int(months[j[1]]),int(j[2]),int(l[0]),int(l[1]),int(l[2]))
	        dt=prevt-nowt
	        # total length of dt in seconds, converted to minutes
	        dtint=((dt.microseconds + (dt.seconds + dt.days * 24. * 3600) * 10**6) / 10**6)/60.
	        hlist.append(nowt.hour+nowt.minute/60.)
	        # zero-length intervals are left out of all three lists
	        if (dtint!=0):
	          tint.append(dtint)
	          if (dtint < 1500.):
	            tint2.append(dtint)
	          if (dtint < 300.):
	            tint3.append(dtint*60)
	        prevt=nowt
	      else:
	        print("First timestamp")
	        prevt=datetime(int(j[5]),int(months[j[1]]),int(j[2]),int(l[0]),int(l[1]),int(l[2]))
	  count+=1

	# see if a second file has been provided, analyze in the same way as above
	if (second):
	  tint21=[]
	  tint22=[]
	  tint23=[]
	  hlist2=[]
	  # Start from a clean state so that the element parity and the last
	  # timestamp of the first file do not leak into this loop.
	  prevt=0
	  count=0
	  for i in twitimes2:
	    if ((count % 2)==0):
	      j=(i.firstChild.data).split(' ')
	      l=j[3].split(':')
	      # make sure we're still on UT time
	      if (j[4]!='+0000'):
	        print("Not UT, ignoring...")
	      else:
	        if (prevt!=0):
	          nowt=datetime(int(j[5]),int(months[j[1]]),int(j[2]),int(l[0]),int(l[1]),int(l[2]))
	          dt=prevt-nowt
	          dtint=((dt.microseconds + (dt.seconds + dt.days * 24. * 3600) * 10**6) / 10**6)/60.
	          hlist2.append(nowt.hour+nowt.minute/60.)
	          if (dtint!=0):
	            tint21.append(dtint)
	            if (dtint < 1500.):
	              tint22.append(dtint)
	            if (dtint < 300.):
	              tint23.append(dtint*60)
	          prevt=nowt
	        else:
	          print("First timestamp")
	          prevt=datetime(int(j[5]),int(months[j[1]]),int(j[2]),int(l[0]),int(l[1]),int(l[2]))
	    count+=1


	# 2 panel plot with all intervals on top and the tint2 subset on the bottom
	# histogram of the intervals
	fig=plt.figure()
	ax=fig.add_subplot(211)
	n,bins,patches=ax.hist(tint,bins=20,range=(0,max(tint)),log=True,normed=True,label=name[0])
	if (second):
	  # the second file is binned over the range of the first one
	  n21,bins21,patches21=ax.hist(tint21,bins=20,range=(0,max(tint)),log=True,normed=True,label=name2[0])
	  plt.suptitle(name[0]+', '+name2[0]+' (Minutes)')
	else:
	  plt.suptitle(name[0]+' (Minutes)')
	ax.set_xlabel('Time Between Tweets (minutes)')
	ax.set_ylabel('Fraction of Tweets')
	plt.legend()
	ax=fig.add_subplot(212)
	n2,bins2,patches2=ax.hist(tint2,bins=20,range=(0,max(tint2)),log=True,normed=True)
	if (second):
	  n22,bins22,patches22=ax.hist(tint22,bins=20,range=(0,max(tint2)),log=True,normed=True)
	ax.set_xlabel('Time Between Tweets (minutes)')
	ax.set_ylabel('Fraction of Tweets')
	plt.savefig(name[0]+'-min.png',format='png')

	# make a second plot for the short intervals in tint3 (seconds)
	fig=plt.figure()
	ax=fig.add_subplot(111)
	n3,bins3,patches3=ax.hist(tint3,bins=20,range=(0,max(tint3)),log=True,normed=True,label=name[0])
	midpt=[]
	nn=[]
	y=[]
	# fit the powerlaw: straight line through (log10 bin centre, log10 height),
	# skipping empty bins because log10(0) is undefined
	for i in range(len(bins3)-1):
	  if n3[i]!=0:
	    midpt.append(math.log10(bins3[i]+(bins3[i+1]-bins3[i])/2.))
	    nn.append(math.log10(n3[i]))
	p=numpy.polyfit(midpt,nn,1)
	# evaluate the fit at every bin centre: y = 10**intercept * x**slope
	midpt=[]
	for i in range(len(bins3)-1):
	  midpt.append(bins3[i]+(bins3[i+1]-bins3[i])/2.)
	  y.append((10**p[1])*(midpt[i]**p[0]))
	plt.plot(midpt,y,label=r"Power law fit, $\gamma$"+str(p[0]))
	if (second):
	  n23,bins23,patches23=ax.hist(tint23,bins=20,range=(0,max(tint3)),log=True,normed=True,label=name2[0])
	  plt.suptitle(name[0]+', '+name2[0]+' (Seconds)')
	else:
	  plt.suptitle(name[0]+' (Seconds)')
	ax.set_xlabel('Time Between Tweets (seconds)')
	plt.legend()
	ax.set_ylabel('Fraction of Tweets')
	plt.savefig(name[0]+'-sec.png',format='png')

	# plot the histogram of UT times posted
	fig=plt.figure()
	ax=fig.add_subplot(111)
	nh,binsh,patchesh=ax.hist(hlist,bins=24,range=(0,24),log=False,normed=True,label=name[0])
	if (second):
	  nh2,binsh2,patchesh2=ax.hist(hlist2,bins=24,range=(0,24),log=False,normed=True,label=name2[0])
	ax.set_xlabel('Hour of the Day (UTC)')
	ax.set_ylabel('Fraction of Tweets')
	plt.legend()
	plt.savefig(name[0]+'-HR_hist.png',format='png')