Created October 2, 2014 10:09
Crawler for reddit posts, written in Python
import praw
import time
from pprint import pprint
import urllib2
import urllib
from urlparse import urlparse
from os.path import splitext, basename
import cPickle
import os
import crawler  # local module defining the Crawler record class used below (not included in this gist)

def main():
    limit = 1000  # we need to limit the number of posts crawled (1000 is the max)
    execution('aww', limit, 'False')  # run the crawler on the 'aww' hot category
    execution('aww', limit, 'True')   # run the crawler on the 'aww' controversial category

def execution(category, limit, controversial):
    r = praw.Reddit(user_agent='Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1; Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')
    already_done = []
    listcrawler = []
    iter_number = 0
    if controversial == 'False':
        submissions = r.get_subreddit(category).get_hot(limit=limit)
    elif controversial == 'True':
        submissions = r.get_subreddit(category).get_controversial(limit=limit)
    for x in submissions:
        disassembled = urlparse(x.url)
        filename, file_ext = splitext(basename(disassembled.path))
        # Save the file
        if file_ext == '':
            # No extension in the URL: assume an imgur id and fetch the .jpg directly
            file_ext = '.jpg'
            filename_new = filename + file_ext
            url_new = 'http://i.imgur.com/' + filename_new
            print(url_new)
            try:
                print(urllib2.urlopen(url_new).geturl())
                img_new = urllib.urlopen(url_new)
                if controversial == 'True':
                    localFile_new = open(os.path.join(category + '_controversial/', filename_new), 'wb')
                if controversial == 'False':
                    localFile_new = open(os.path.join(category + '/', filename_new), 'wb')
                localFile_new.write(img_new.read())
                localFile_new.close()
                item = crawler.Crawler(url_new, filename_new, len(x.comments), x.ups, x.downs, x.score, x.title, x.created, x.selftext)
                listcrawler.append(item)
            except Exception as e:
                print(e)
        else:
            if file_ext != '.gif':  # skip animated gifs
                filename_new = filename + file_ext
                try:
                    print(urllib2.urlopen(x.url).geturl())
                    img_new = urllib.urlopen(x.url)
                    if controversial == 'True':
                        localFile = open(os.path.join(category + '_controversial/', filename_new), 'wb')
                    if controversial == 'False':
                        localFile = open(os.path.join(category + '/', filename_new), 'wb')
                    localFile.write(img_new.read())
                    localFile.close()
                    item = crawler.Crawler(x.url, filename_new, len(x.comments), x.ups, x.downs, x.score, x.title, x.created, x.selftext)
                    listcrawler.append(item)
                except Exception as e:
                    print(e)
    # Save post attributes (url, number of comments, file name, ups, downs, etc.) in a .pkl file
    if controversial == 'True':
        filehandler = open(os.path.join(category + '_controversial/', category + '_controversial_pkl.pkl'), 'wb')
    if controversial == 'False':
        filehandler = open(os.path.join(category + '/', category + '_pkl.pkl'), 'wb')
    print('Saving pkl file')
    cPickle.dump(listcrawler, filehandler)
    filehandler.close()

if __name__ == '__main__':
    main()
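A minimal sketch of reading the saved archive back, assuming a run of execution('aww', limit, 'False') from the current directory produced aww/aww_pkl.pkl, and that the same crawler module (with its Crawler class) is importable so cPickle can reconstruct the stored objects; written for Python 2, matching the script above.

import cPickle
import crawler  # the module defining Crawler must be importable for unpickling to succeed

# Path assumes the hot-category run for 'aww' was executed from this directory
with open('aww/aww_pkl.pkl', 'rb') as f:
    posts = cPickle.load(f)

print('%d posts loaded' % len(posts))
for p in posts:
    # Attribute names depend on how crawler.Crawler stores its constructor arguments,
    # so print each record as-is rather than guessing field names
    print(p)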