@psychemedia
Created December 12, 2011 00:19
Export tweets from a twapperkeeper archive
import feedparser, csv, os
import argparse, urllib, StringIO
from lxml import etree
parser = argparse.ArgumentParser(description='Rescue Twapperkeeper archive')
group = parser.add_mutually_exclusive_group()
group.add_argument('-user',help='Name of a user whose archives you want to rescue.')
group.add_argument('-tag',help='Name of a hashtag archive you want to rescue.')
group.add_argument('-tags',nargs='*',help='A space separated list of hashtag archives you want to rescue.')
group.add_argument('-userarchives',help='Show the archives associated with a particular user')
parser.add_argument('-l',default=500,type=int,metavar='N',help='The number of tweets grabbed on each page call. 10000 seems to be a sensible maximum.')
parser.add_argument('-bigmax',default=1500,type=int,metavar='N',help='The maximum number of tweets to download from the archive. Default is deliberately small.')
parser.add_argument('-reportdir',default='reports',help='The directory path you want to store reports in.')
parser.add_argument('-excl',nargs='*', help="A space separated list of archives (without the #) that you don't want saved when rescuing a given user's archives. Note that this is currently case sensitive.")
#-excl should really require presence of -user to work?
args=parser.parse_args()
#You will probably need to install feedparser first.
#If you have easy_install, this should do the job:
##From the command line: easy_install feedparser
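#The lxml library is also imported below; if it isn't already installed,
##easy_install lxml (or pip install lxml) should do the job for that too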
#USAGE: from the command line:
## python twapperkeeperRescue.py -h
#call the script from the commandline with:
# python twapperkeeperRescue.py -tag MYTAG
#to grab a single hashtag archive
#call the script from the commandline with:
# python twapperkeeperRescue.py -user USERNAME
#to grab all the hashtag archives associated with a particular individual
#you can also set the maximum number of tweets to download (-bigmax) and the number of tweets per download page (-l):
# python twapperkeeperRescue.py -tag MYTAG -l 1500 -bigmax 1000000
#would download up to a million tweets from an archive 1500 tweets at a time
#Only set up for hashtag archive exports at the moment
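#Output: each rescued archive is saved as twArchive_TAG.txt in the report directory
#(-reportdir, default 'reports'), a one column CSV file with one row per tweet
#(the title of the corresponding Twapperkeeper RSS item)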
tag=args.tag
tags=args.tags
#We can also grab all the hashtag archives from a given user
user=args.user
excl=args.excl
if excl==None: excl=[]
#l is the number of tweets we try to export at a time; I find 50000 is too big
l=args.l
#bigmax is a fudge; I really should look to see how many tweets are in an archive and pull that many..
#As it is, I grab another page if the current page is size l;
#bigmax stops the programme running forever, and kills things when bigmax number of tweets have been pulled
bigmax=args.bigmax
#Specify a directory for the saved files
userdir=args.reportdir
#if the specified directory doesn't exist, create it
def checkDir(dirpath):
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)
checkDir(userdir)
def writeNextPage(archive,writer,tag,l,ey='',em='',ed='',ehh='',emm=''):
    print tag,l,ey,em,ed,ehh,emm
    #if len(archive)>=int(bigmax):
    #    print 'bailing out...'
    #    return archive
    #We'd need to tweak this URL if we want to export archives other than hashtag archives
    url='http://twapperkeeper.com/rss.php?type=hashtag&name='+tag+'&l='+str(l)
    url=url+'&em='+str(em)+'&ed='+str(ed)+'&ey='+str(ey)+'&ehh='+str(ehh)+'&emm='+str(emm)
    d = feedparser.parse(url)
    de = d['entries']
    for i in de:
        writer.writerow([i['title'].encode('utf-8')])
    archive.extend(de)
    if len(de)==l and len(archive)<bigmax:
        ey=de[-1]['updated_parsed'][0]
        em=de[-1]['updated_parsed'][1]
        ed=de[-1]['updated_parsed'][2]
        ehh=de[-1]['updated_parsed'][3]
        emm=de[-1]['updated_parsed'][4]
        writeNextPage(archive,writer,tag,l,ey,em,ed,ehh,emm)
    return archive
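#Pagination sketch: the first call to writeNextPage leaves the end-date arguments empty,
#and each recursive call passes the timestamp of the last tweet on the previous page.
#With hypothetical values, the URLs fetched look something like:
# http://twapperkeeper.com/rss.php?type=hashtag&name=MYTAG&l=500&em=&ed=&ey=&ehh=&emm=
# http://twapperkeeper.com/rss.php?type=hashtag&name=MYTAG&l=500&em=12&ed=11&ey=2011&ehh=0&emm=19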
def saveArchive(tag,l):
    #we're going to save the output to a file in the specified directory using a standard filename
    fn='/'.join([userdir,'twArchive_'+tag+'.txt'])
    f=open(fn,'wb+')
    writer = csv.writer(f)
    archive=writeNextPage([],writer,tag,l)
    f.close()
    return archive
def getUserArchiveNames(user,l):
    #scraper cribs from http://www.techchorus.net/web-scraping-lxml
    url='http://twapperkeeper.com/allnotebooks.php?type=hashtag&name=&description=&tag=&created_by='+user
    print url
    result = urllib.urlopen(url)
    html = result.read()
    parser = etree.HTMLParser()
    tree = etree.parse(StringIO.StringIO(html), parser)
    userarchives=tree.xpath('//table[1]/tr[position()>1]/td[position()=2]/a/child::text()')
    #NEED TO WORK THIS IN - ARCHIVE SIZES: td[position()=5]
    print userarchives
    return userarchives
def saveUserArchives(userarchives,l):
    for userarchive in userarchives:
        tag=userarchive.lstrip('#')
        if tag not in excl:
            archive=saveArchive(tag,l)
if args.user!=None:
    userarchives=getUserArchiveNames(user,l)
    saveUserArchives(userarchives,l)
elif args.tag!=None:
    archive=saveArchive(tag,l)
elif args.tags!=None:
    for tag in tags:
        archive=saveArchive(tag,l)
elif args.userarchives!=None:
    #Really need to also grab the size of each archive?
    userarchives=getUserArchiveNames(args.userarchives,l)
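#A minimal sketch of how a saved archive file could be read back in; loadArchive is a
#hypothetical helper added for illustration and is not called anywhere in the script
def loadArchive(fn):
    #each row of the saved file holds a single tweet title, so return them as a flat list
    f=open(fn,'rb')
    reader=csv.reader(f)
    tweets=[row[0] for row in reader if row]
    f.close()
    return tweets
#Example usage: tweets=loadArchive('reports/twArchive_MYTAG.txt')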