Export tweets from a Twapperkeeper archive
import feedparser, csv, os
import argparse, urllib, StringIO
from lxml import etree
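#NOTE: this is a Python 2 script (print statements, StringIO, urllib.urlopen);
#it would need porting to run under Python 3.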
parser = argparse.ArgumentParser(description='Rescue Twapperkeeper archive')
group = parser.add_mutually_exclusive_group()
group.add_argument('-user',help='Name of a user whose archives you want to rescue.')
group.add_argument('-tag',help='Name of a hashtag archive you want to rescue.')
group.add_argument('-tags',nargs='*',help='A space-separated list of hashtag archives you want to rescue.')
group.add_argument('-userarchives',help='Show the archives associated with a particular user.')
parser.add_argument('-l',default=500,type=int,metavar='N',help='The number of tweets grabbed on each page call; 10000 seems to be a sensible maximum.')
parser.add_argument('-bigmax',default=1500,type=int,metavar='N',help='The maximum number of tweets to download from the archive. The default is deliberately small.')
parser.add_argument('-reportdir',default='reports',help='The directory path you want to store reports in.')
parser.add_argument('-excl',nargs='*', help="A space-separated list of archives (without the #) you don't want saved from a given user. Note that this is currently case sensitive.")
#TO DO: -excl should really require the presence of -user to work?
args=parser.parse_args()
#You will probably need to install feedparser first.
#If you have easy_install, this should do the job:
##From the command line: easy_install feedparser
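#(pip should also work: pip install feedparser)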
#USAGE: for a summary of the options, run:
## python twapperkeeperRescue.py -h
#To grab a single hashtag archive, call the script from the command line with:
## python twapperkeeperRescue.py -tag MYTAG
#To grab all the hashtag archives associated with a particular individual:
## python twapperkeeperRescue.py -user USERNAME
#To pull a large archive, you also need to raise the maximum number of tweets to
#download (-bigmax), and can tune the number of tweets per download page (-l):
## python twapperkeeperRescue.py -tag MYTAG -l 1500 -bigmax 1000000
#would download up to a million tweets from an archive, 1500 tweets at a time.
#Only set up for hashtag exports atm.
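#Examples for the remaining options (inferred from the argument definitions above):
## python twapperkeeperRescue.py -tags TAG1 TAG2 TAG3
#grabs several hashtag archives in one go;
## python twapperkeeperRescue.py -user USERNAME -excl TAG1 TAG2
#grabs a user's archives, skipping the listed tags;
## python twapperkeeperRescue.py -userarchives USERNAME
#just lists the archives associated with a user.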
tag=args.tag
tags=args.tags
#We can also grab all the hashtag archives from a given user
user=args.user
excl=args.excl
if excl is None: excl=[]
#l is the number of tweets we try to export at a time; I find 50000 is too big
l=args.l
#bigmax is a fudge; I really should look to see how many tweets are in an archive and pull that many..
#As it is, I grab another page if the current page is of size l;
#bigmax stops the program running forever, and kills things once bigmax tweets have been pulled
bigmax=args.bigmax
#Specify a directory for the saved files
userdir=args.reportdir
#If the specified directory doesn't exist, create it
def checkDir(dirpath):
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)
checkDir(userdir)
def writeNextPage(archive,writer,tag,l,ey='',em='',ed='',ehh='',emm=''):
    print tag,l,ey,em,ed,ehh,emm
    #We'd need to tweak this URL if we want to export archives other than hashtag archives
    url='http://twapperkeeper.com/rss.php?type=hashtag&name='+tag+'&l='+str(l)
    #ey/em/ed/ehh/emm carry the timestamp of the last tweet already fetched,
    #so each call picks up where the previous page left off
    url=url+'&em='+str(em)+'&ed='+str(ed)+'&ey='+str(ey)+'&ehh='+str(ehh)+'&emm='+str(emm)
    d = feedparser.parse(url)
    de = d['entries']
    for i in de:
        writer.writerow([i['title'].encode('utf-8')])
    archive.extend(de)
    #A full page suggests there are more tweets to fetch: recurse with the
    #timestamp of the last entry. Note that paging is recursive, so a run of
    #more than ~1000 pages would hit Python's default recursion limit.
    if len(de)==l and len(archive)<bigmax:
        ey=de[-1]['updated_parsed'][0]
        em=de[-1]['updated_parsed'][1]
        ed=de[-1]['updated_parsed'][2]
        ehh=de[-1]['updated_parsed'][3]
        emm=de[-1]['updated_parsed'][4]
        writeNextPage(archive,writer,tag,l,ey,em,ed,ehh,emm)
    return archive
def saveArchive(tag,l):
    #We're going to save the output to a file in the specified directory using a standard filename
    fn='/'.join([userdir,'twArchive_'+tag+'.txt'])
    f=open(fn,'wb+')
    writer = csv.writer(f)
    archive=writeNextPage([],writer,tag,l)
    f.close()
    return archive
def getUserArchiveNames(user,l):
    #Scraper cribs from http://www.techchorus.net/web-scraping-lxml
    url='http://twapperkeeper.com/allnotebooks.php?type=hashtag&name=&description=&tag=&created_by='+user
    print url
    result = urllib.urlopen(url)
    html = result.read()
    parser = etree.HTMLParser()
    tree = etree.parse(StringIO.StringIO(html), parser)
    userarchives=tree.xpath('//table[1]/tr[position()>1]/td[position()=2]/a/child::text()')
    #TO DO: work in the archive sizes, which sit in td[position()=5];
    #see the sketch after this function
    print userarchives
    return userarchives
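#The following is an untested sketch, not part of the original flow: it has a go
#at the archive-sizes to-do above by scraping the fifth table column alongside
#the archive name. That td[position()=5] holds the size is an assumption taken
#from the note above.
def getUserArchiveSizes(user):
    url='http://twapperkeeper.com/allnotebooks.php?type=hashtag&name=&description=&tag=&created_by='+user
    html = urllib.urlopen(url).read()
    tree = etree.parse(StringIO.StringIO(html), etree.HTMLParser())
    names=tree.xpath('//table[1]/tr[position()>1]/td[position()=2]/a/child::text()')
    sizes=tree.xpath('//table[1]/tr[position()>1]/td[position()=5]/child::text()')
    #Pair each archive name with its (assumed) size column
    return zip(names,sizes)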
def saveUserArchives(userarchives,l):
    for userarchive in userarchives:
        #Archive names are scraped with a leading #, which the RSS export URL doesn't want
        tag=userarchive.lstrip('#')
        if tag not in excl:
            archive=saveArchive(tag,l)
if args.user is not None:
    userarchives=getUserArchiveNames(user,l)
    saveUserArchives(userarchives,l)
elif args.tag is not None:
    archive=saveArchive(tag,l)
elif args.tags is not None:
    for tag in tags:
        archive=saveArchive(tag,l)
elif args.userarchives is not None:
    #Really need to also grab the size of each archive? (cf. the getUserArchiveSizes sketch above)
    userarchives=getUserArchiveNames(args.userarchives,l)