Skip to content

Instantly share code, notes, and snippets.

@privong
Created February 21, 2013 07:05
Show Gist options
  • Save privong/5002852 to your computer and use it in GitHub Desktop.
Save privong/5002852 to your computer and use it in GitHub Desktop.
Simple analysis of the interval between tweets and of the number of tweets as a function of time of day. Uses the XML format produced by the (now non-functioning) twitterbackup (http://johannburkard.de/blog/programming/java/backup-twitter-tweets-with-twitterbackup.html). Fits a power law to the time between tweets.
#!/usr/bin/python
#
# Usage:
# twitter_t_histogram.py file1.xml [file2.xml]
#
# first file required, second file optional (will overplot and give legends)
# Output files are file1-min.png, file1-sec.png and file1-HR_hist.png
from xml.dom import minidom
from datetime import datetime
import sys
import numpy
import math
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
# --- Command-line handling and XML loading ---------------------------------
# Stop with the usage line instead of an IndexError when no file is given.
if len(sys.argv) < 2:
    sys.exit("Usage: twitter_t_histogram.py file1.xml [file2.xml]")

# True when an optional second file was supplied; the code further down checks
# this flag before touching any of the second-file variables.
second = len(sys.argv) > 2

# Month abbreviations used in the created_at strings -> month numbers.
months = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
          'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}

# Single-argument print(...) emits the same text whether it is run as a
# Python 2 print statement or as the Python 3 function.
print("Loading and parsing tweets from %s." % (sys.argv[1]))
twixml = minidom.parse(sys.argv[1])
twitimes = twixml.getElementsByTagName('created_at')
# Only every other created_at element is analyzed later on, hence the halving.
print("Found %i tweets" % (len(twitimes) // 2))
# name[0] becomes the legend label and the prefix of every output file.
# Split on the LAST dot only, so that paths such as "./tweets.xml" or
# "../data/tweets.xml" do not yield an empty prefix.
name = sys.argv[1].rsplit('.', 1)

if second:
    print("Loading and parsing tweets from %s." % (sys.argv[2]))
    twixml2 = minidom.parse(sys.argv[2])
    twitimes2 = twixml2.getElementsByTagName('created_at')
    print("Found %i tweets" % (len(twitimes2) // 2))
    name2 = sys.argv[2].rsplit('.', 1)
def _collect_intervals(nodes):
    """Turn a list of created_at DOM nodes into interval and hour lists.

    Only the even-indexed nodes are used (the odd ones are skipped, exactly as
    the original counter-based loop did).  Each used node's text is split on
    single spaces and read as: [1] month abbreviation, [2] day, [3] HH:MM:SS,
    [4] UTC offset, [5] year.  Entries whose offset is not '+0000' are
    reported and ignored.

    All state is local, so every file is processed independently: the first
    interval of a file is never measured against a timestamp from another
    file, and the even/odd sampling always starts at the file's first node.

    Returns a tuple (all_minutes, capped_minutes, short_seconds, hours):
      all_minutes    -- every non-zero interval between consecutive accepted
                        timestamps, in minutes
      capped_minutes -- the subset of those below 1500 (minutes)
      short_seconds  -- the subset below 300 (minutes), converted to seconds
      hours          -- fractional hour of day (hour + minute/60) of every
                        accepted timestamp, including the first one
    """
    all_minutes = []
    capped_minutes = []
    short_seconds = []
    hours = []
    previous = None
    for index, node in enumerate(nodes):
        if index % 2:
            continue
        fields = node.firstChild.data.split(' ')
        # make sure we're still on UT time
        if fields[4] != '+0000':
            print("Not UT, ignoring...")
            continue
        clock = fields[3].split(':')
        current = datetime(int(fields[5]), months[fields[1]], int(fields[2]),
                           int(clock[0]), int(clock[1]), int(clock[2]))
        hours.append(current.hour + current.minute / 60.)
        if previous is None:
            print("First timestamp")
        else:
            # previous - current: positive when the file lists newer tweets
            # first (presumably the backup's ordering -- confirm).
            minutes = (previous - current).total_seconds() / 60.
            # Identical consecutive timestamps contribute no interval.
            if minutes != 0:
                all_minutes.append(minutes)
                # NOTE(review): both cut-offs are compared against MINUTES,
                # although the original comments called them 1500 s and 300 s.
                # Behavior is kept as-is; confirm the intended units.
                if minutes < 1500.:
                    capped_minutes.append(minutes)
                if minutes < 300.:
                    short_seconds.append(minutes * 60)
        previous = current
    return all_minutes, capped_minutes, short_seconds, hours


# tint:  all intervals (minutes); tint2: intervals below the 1500 cut-off;
# tint3: intervals below the 300 cut-off, stored in seconds;
# hlist: hour of day of each tweet, for the hour histogram.
tint, tint2, tint3, hlist = _collect_intervals(twitimes)

# see if a second file has been provided, analyze in the same way as above
if second:
    tint21, tint22, tint23, hlist2 = _collect_intervals(twitimes2)
# Figure 1: two stacked panels of the time between tweets, in minutes.
# Top panel shows every interval (tint); bottom panel shows only the ones
# that passed the 1500 cut-off (tint2).  Saved as <name>-min.png.
fig = plt.figure()

# --- top panel: all intervals ----------------------------------------------
ax = fig.add_subplot(211)
# Both data sets share the range of the FIRST file so their bins line up.
full_span = (0, max(tint))
n, bins, patches = ax.hist(tint, label=name[0], range=full_span,
                           bins=20, log=True, normed=True)
title_names = [name[0]]
if second:
    n21, bins21, patches21 = ax.hist(tint21, label=name2[0], range=full_span,
                                     bins=20, log=True, normed=True)
    title_names.append(name2[0])
plt.suptitle(', '.join(title_names) + ' (Minutes)')
ax.set_ylabel('Fraction of Tweets')
ax.set_xlabel('Time Between Tweets (minutes)')
# Must run before the next subplot is added: plt.legend() acts on the
# current axes, which is still the top panel here.
plt.legend()

# --- bottom panel: capped intervals ----------------------------------------
ax = fig.add_subplot(212)
capped_span = (0, max(tint2))
n2, bins2, patches2 = ax.hist(tint2, range=capped_span,
                              bins=20, log=True, normed=True)
if second:
    n22, bins22, patches22 = ax.hist(tint22, range=capped_span,
                                     bins=20, log=True, normed=True)
ax.set_ylabel('Fraction of Tweets')
ax.set_xlabel('Time Between Tweets (minutes)')

plt.savefig(name[0] + '-min.png', format='png')
# Figure 2: histogram of the short intervals (tint3, stored in seconds) with
# a power-law fit overlaid.  Saved as <name>-sec.png.
fig = plt.figure()
ax = fig.add_subplot(111)
n3, bins3, patches3 = ax.hist(tint3, bins=20, range=(0, max(tint3)),
                              log=True, normed=True, label=name[0])

# fit the powerlaw
# Bin centres, paired with the bin heights returned by hist().
midpt = [left + (right - left) / 2.
         for left, right in zip(bins3[:-1], bins3[1:])]
# Fit a straight line in log10-log10 space; empty bins are left out because
# log10(0) is undefined.
filled = [(centre, height) for centre, height in zip(midpt, n3) if height != 0]
if len(filled) >= 2:
    log_centres = [math.log10(centre) for centre, _ in filled]
    nn = [math.log10(height) for _, height in filled]
    # p[0] is the slope (the power-law exponent), p[1] the log10 amplitude.
    p = numpy.polyfit(log_centres, nn, 1)
    # Evaluate the fitted law at every bin centre, filled or not.
    y = [(10 ** p[1]) * (centre ** p[0]) for centre in midpt]
    # Raw string: "\g" is not a valid escape sequence in a normal literal.
    plt.plot(midpt, y, label=r"Power law fit, $\gamma$=%.2f" % p[0])
else:
    # A line cannot be determined from fewer than two points.
    print("Too few non-empty bins to fit a power law; skipping the fit.")

if second:
    # Same range as the first file so the two histograms share bins.
    n23, bins23, patches23 = ax.hist(tint23, bins=20, range=(0, max(tint3)),
                                     log=True, normed=True, label=name2[0])
    plt.suptitle(name[0] + ', ' + name2[0] + ' (Seconds)')
else:
    plt.suptitle(name[0] + ' (Seconds)')
ax.set_xlabel('Time Between Tweets (seconds)')
plt.legend()
ax.set_ylabel('Fraction of Tweets')
plt.savefig(name[0] + '-sec.png', format='png')
# Figure 3: distribution of posting times over the day, one bin per hour
# (24 bins over 0-24, linear y axis).  Saved as <name>-HR_hist.png.
fig = plt.figure()
ax = fig.add_subplot(111)
# Histogram settings shared by both files.
hourly = dict(bins=24, range=(0, 24), log=False, normed=True)
nh, binsh, patchesh = ax.hist(hlist, label=name[0], **hourly)
if second:
    nh2, binsh2, patchesh2 = ax.hist(hlist2, label=name2[0], **hourly)
ax.set_ylabel('Fraction of Tweets')
ax.set_xlabel('Hour of the Day (UTC)')
plt.legend()
plt.savefig('%s-HR_hist.png' % name[0], format='png')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment