Crawler for Reddit posts, written in Python 2 (uses praw, urllib2, and cPickle)
import praw
import urllib
import urllib2
from urlparse import urlparse
from os.path import splitext, basename
import cPickle
import os
import crawler  # local helper module defining the Crawler record class (assumed to accompany this gist)


def main():
    limit = 1000  # limit the number of posts crawled (1000 is the Reddit API maximum)
    execution('aww', limit, controversial=False)  # run the crawler on the 'aww' hot listing
    execution('aww', limit, controversial=True)   # run the crawler on the 'aww' controversial listing


def execution(category, limit, controversial):
    r = praw.Reddit(user_agent='Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) '
                               'Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')
    listcrawler = []

    # Pick the output directory and the listing to crawl.
    if controversial:
        target_dir = category + '_controversial'
        submissions = r.get_subreddit(category).get_controversial(limit=limit)
    else:
        target_dir = category
        submissions = r.get_subreddit(category).get_hot(limit=limit)
    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)  # make sure the output directory exists

    for x in submissions:
        disassembled = urlparse(x.url)
        filename, file_ext = splitext(basename(disassembled.path))

        if file_ext == '':
            # No extension: assume an imgur page and fetch the direct .jpg instead.
            filename_new = filename + '.jpg'
            url_new = 'http://i.imgur.com/' + filename_new
            print(url_new)
            try:
                print(urllib2.urlopen(url_new).geturl())
                img_new = urllib.urlopen(url_new)
                local_file = open(os.path.join(target_dir, filename_new), 'wb')
                local_file.write(img_new.read())
                local_file.close()
                item = crawler.Crawler(url_new, filename_new, len(x.comments),
                                       x.ups, x.downs, x.score, x.title, x.created, x.selftext)
                listcrawler.append(item)
            except Exception, e:
                print(e)
        elif file_ext != '.gif':
            # Direct link with a usable extension (animated .gif files are skipped).
            filename_new = filename + file_ext
            try:
                print(urllib2.urlopen(x.url).geturl())
                img_new = urllib.urlopen(x.url)
                local_file = open(os.path.join(target_dir, filename_new), 'wb')
                local_file.write(img_new.read())
                local_file.close()
                item = crawler.Crawler(x.url, filename_new, len(x.comments),
                                       x.ups, x.downs, x.score, x.title, x.created, x.selftext)
                listcrawler.append(item)
            except Exception, e:
                print(e)

    # Save the post attributes (url, file name, number of comments, ups, downs, etc.) in a .pkl file.
    if controversial:
        pkl_name = category + '_controversial_pkl.pkl'
    else:
        pkl_name = category + '_pkl.pkl'
    print('Saving pkl file')
    filehandler = open(os.path.join(target_dir, pkl_name), 'wb')
    cPickle.dump(listcrawler, filehandler)
    filehandler.close()


if __name__ == '__main__':
    main()
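
The script wraps each downloaded post in a crawler.Crawler object before pickling the list, but that module is not included in the gist. Below is a minimal sketch of what it could look like, assuming Crawler is just a plain container whose constructor takes the nine arguments in the order they are passed above; the file name crawler.py and the attribute names are assumptions, not part of the original code.

# crawler.py -- hypothetical companion module (not part of the original gist)
class Crawler(object):
    """Plain container for the attributes of one crawled Reddit post."""

    def __init__(self, url, filename, num_comments, ups, downs, score, title, created, selftext):
        self.url = url                    # direct image URL that was downloaded
        self.filename = filename          # local file name the image was saved under
        self.num_comments = num_comments  # number of comments on the post
        self.ups = ups                    # upvotes
        self.downs = downs                # downvotes
        self.score = score                # net score reported by Reddit
        self.title = title                # post title
        self.created = created            # creation timestamp (epoch seconds)
        self.selftext = selftext          # self-text body, empty for link posts

Because cPickle stores instances by reference to their class, the same crawler module must be importable again when the saved .pkl file is loaded back.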