@SimoneErcoli
Created October 2, 2014 10:09
Crawler for Reddit posts, written in Python. The script walks the hot and the controversial listings of a subreddit through PRAW, downloads each linked image (falling back to i.imgur.com for extensionless URLs and skipping .gif files), and pickles the post metadata to disk.
import praw
import urllib
import urllib2
from urlparse import urlparse
from os.path import splitext, basename
import cPickle
import os
import crawler  # local module providing the Crawler record class (not included in this gist)

def main():
    limit = 1000  # cap on the number of posts crawled per listing (1000 is the API maximum)
    execution('aww', limit, 'False')  # run the crawler on the 'aww' hot listing
    execution('aww', limit, 'True')   # run the crawler on the 'aww' controversial listing

def execution(category, limit, controversial):
    r = praw.Reddit(user_agent='Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1; Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')
    listclawler = []
    # pick the listing and the output directory, and make sure the directory exists
    if controversial == 'True':
        out_dir = category + '_controversial'
        submissions = r.get_subreddit(category).get_controversial(limit=limit)
    else:
        out_dir = category
        submissions = r.get_subreddit(category).get_hot(limit=limit)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    for x in submissions:
        disassembled = urlparse(x.url)
        filename, file_ext = splitext(basename(disassembled.path))
        if file_ext == '':
            # no extension: assume an imgur page and fetch the image file directly
            file_ext = '.jpg'
            filename_new = filename + file_ext
            url_new = 'http://i.imgur.com/' + filename_new
            print(url_new)
            try:
                print(urllib2.urlopen(url_new).geturl())
                img_new = urllib.urlopen(url_new)
                localFile_new = open(os.path.join(out_dir, filename_new), 'wb')
                localFile_new.write(img_new.read())
                localFile_new.close()
                Item = crawler.Crawler(url_new, filename_new, len(x.comments),
                                       x.ups, x.downs, x.score,
                                       x.title, x.created, x.selftext)
                listclawler.append(Item)
            except Exception, e:  # Python 2 except syntax
                print(e)
        elif file_ext != '.gif':  # any other extension is kept; animated gifs are skipped
            filename_new = filename + file_ext
            try:
                print(urllib2.urlopen(x.url).geturl())
                img_new = urllib.urlopen(x.url)
                localFile = open(os.path.join(out_dir, filename_new), 'wb')
                localFile.write(img_new.read())
                localFile.close()
                Item = crawler.Crawler(x.url, filename_new, len(x.comments),
                                       x.ups, x.downs, x.score,
                                       x.title, x.created, x.selftext)
                listclawler.append(Item)
            except Exception, e:
                print(e)
    # save the post attributes (url, file name, number of comments, ups, downs, etc.) in a .pkl file
    print('Saving pkl file')
    filehandler = open(os.path.join(out_dir, out_dir + '_pkl.pkl'), 'wb')
    cPickle.dump(listclawler, filehandler)
    filehandler.close()

if __name__ == '__main__':
    main()
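
The script imports a local crawler module that is not part of this gist. Below is a minimal sketch of what that module could look like: the class name and the constructor argument order are taken from the calls above, while the attribute names and everything else are assumptions.

# crawler.py -- hypothetical reconstruction; only the Crawler class name and
# the constructor argument order come from the gist, the rest is assumed
class Crawler(object):
    """Plain record holding the attributes of one crawled post."""
    def __init__(self, url, filename, num_comments, ups, downs,
                 score, title, created, selftext):
        self.url = url                    # direct link to the downloaded image
        self.filename = filename          # name of the file saved on disk
        self.num_comments = num_comments  # number of comments on the post
        self.ups = ups                    # upvotes
        self.downs = downs                # downvotes
        self.score = score                # net score of the post
        self.title = title                # post title
        self.created = created            # creation time (epoch timestamp)
        self.selftext = selftext          # body text for self posts, '' otherwise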
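
After a run, the pickled list can be loaded back to inspect the collected metadata. A minimal sketch, assuming the crawler.py reconstruction above; note that cPickle needs the crawler module importable to rebuild the objects, and aww/aww_pkl.pkl is the path the script writes for the hot listing.

import cPickle
import crawler  # must be importable so cPickle can reconstruct the Crawler objects

f = open('aww/aww_pkl.pkl', 'rb')
posts = cPickle.load(f)
f.close()

for p in posts:
    # attribute names follow the hypothetical crawler.py sketch above
    print('%s  score=%s  comments=%s' % (p.filename, p.score, p.num_comments))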