Created October 2, 2014 10:09
Crawler for reddit posts, written in Python
import praw
import time
from pprint import pprint
import urllib2
import urllib
from urlparse import urlparse
from os.path import splitext, basename
import cPickle
import os
import crawler  # local module defining the Crawler record class used below (not included in this gist)

def main():
    limit = 1000  # we need to limit the number of posts crawled (1000 is the max)
    execution('aww', limit, 'False')  # run the crawler on the 'aww' hot category
    execution('aww', limit, 'True')   # run the crawler on the 'aww' controversial category

def execution(category, limit, controversial):
    r = praw.Reddit(user_agent='Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1; Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')
    already_done = []
    listcrawler = []
    iter_number = 0
    if controversial == 'False':
        submissions = r.get_subreddit(category).get_hot(limit=limit)
    elif controversial == 'True':
        submissions = r.get_subreddit(category).get_controversial(limit=limit)
    for x in submissions:
        disassembled = urlparse(x.url)
        filename, file_ext = splitext(basename(disassembled.path))
        # Save the file
        if file_ext == '':
            # No extension in the URL: assume an imgur id and fetch the .jpg directly
            file_ext = '.jpg'
            filename_new = filename + file_ext
            url_new = 'http://i.imgur.com/' + filename_new
            print(url_new)
            try:
                print(urllib2.urlopen(url_new).geturl())
                img_new = urllib.urlopen(url_new)
                if controversial == 'True':
                    localFile_new = open(os.path.join(category + '_controversial/', filename_new), 'wb')
                if controversial == 'False':
                    localFile_new = open(os.path.join(category + '/', filename_new), 'wb')
                localFile_new.write(img_new.read())
                localFile_new.close()
                item = crawler.Crawler(url_new, filename_new, len(x.comments), x.ups, x.downs, x.score, x.title, x.created, x.selftext)
                listcrawler.append(item)
            except Exception as e:
                print(e)
        else:
            if file_ext != '.gif':  # skip animated gifs
                filename_new = filename + file_ext
                try:
                    print(urllib2.urlopen(x.url).geturl())
                    img_new = urllib.urlopen(x.url)
                    if controversial == 'True':
                        localFile = open(os.path.join(category + '_controversial/', filename_new), 'wb')
                    if controversial == 'False':
                        localFile = open(os.path.join(category + '/', filename_new), 'wb')
                    localFile.write(img_new.read())
                    localFile.close()
                    item = crawler.Crawler(x.url, filename_new, len(x.comments), x.ups, x.downs, x.score, x.title, x.created, x.selftext)
                    listcrawler.append(item)
                except Exception as e:
                    print(e)
    # Save post attributes (url, number of comments, file name, ups, downs, etc.) in a .pkl file
    if controversial == 'True':
        filehandler = open(os.path.join(category + '_controversial/', category + '_controversial_pkl.pkl'), 'wb')
    if controversial == 'False':
        filehandler = open(os.path.join(category + '/', category + '_pkl.pkl'), 'wb')
    print('Saving pkl file')
    cPickle.dump(listcrawler, filehandler)
    filehandler.close()

if __name__ == '__main__':
    main()
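A minimal sketch of reading the saved archive back, assuming a run of execution('aww', limit, 'False') from the current directory produced aww/aww_pkl.pkl, and that the same crawler module (with its Crawler class) is importable so cPickle can reconstruct the stored objects; written for Python 2, matching the script above.

import cPickle
import crawler  # the module defining Crawler must be importable for unpickling to succeed

# Path assumes the hot-category run for 'aww' was executed from this directory
with open('aww/aww_pkl.pkl', 'rb') as f:
    posts = cPickle.load(f)

print('%d posts loaded' % len(posts))
for p in posts:
    # Attribute names depend on how crawler.Crawler stores its constructor arguments,
    # so print each record as-is rather than guessing field names
    print(p)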