Created
July 25, 2015 22:19
-
-
Save Healdb/af26f7bef505f9faf60c to your computer and use it in GitHub Desktop.
saverD.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import time | |
import datetime | |
import praw | |
import os | |
import traceback | |
import requests | |
# ASCII-art startup banner and project credits (Python 2 print statements).
print " __ __ _ _ _ _ "
print " the / / / /| | (_) | | (_) "
print " / / __ / /_| | ___ __ _ ___ ___ ___ _ _ __ __ _ _ __ ___| |__ ___ _____ _ __ "
print " / / '__/ / _` |/ \ / _` |/ _ \/ __/ _ \| | '_ \ / _` | '__/ __| '_ \| \ \ / / _ \ '__|"
print " / /| | / / (_| | (_) | (_| | __/ (_| (_) | | | | | | (_| | | | (__| | | | |\ V / __/ | "
print " /_/ |_|/_/ \__,_|\___/ \__, |\___|\___\___/|_|_| |_| \__,_|_| \___|_| |_|_| \_/ \___|_| project"
print " __/ | "
print " |___/ code-name: chugger "
print "Code written by: /u/peoplma and /u/healdb"
print "Sillines added by: /u/joshtheimpaler"
print "Wow by: /r/dogecoin\n\n"
# Pieces of the cloudsearch range query: b + start + d + end
# yields "timestamp:<start>..<end>" (see main()'s r.search call).
b = "timestamp:"
d = ".."
#Config Details-
# Load archive parameters (start/end unix timestamps, search window size,
# subreddit name) from config.txt when present; otherwise prompt the user
# and persist the answers so the next run resumes with the same settings.
r = praw.Reddit('searchandarchive by ')
folderName = "subArchive"
if os.path.exists('config.txt'):
    # open() instead of the deprecated Py2 file(); close the handle promptly.
    with open('config.txt') as cfg:
        line = cfg.read()
    startStamp, endStamp, step, subName = line.split(',')
    startStamp, endStamp, step = int(startStamp), int(endStamp), int(step)
    # Guard against a stray trailing newline in a hand-edited config.
    subName = subName.strip()
else:
    subName = raw_input('Input the subreddit to archive: ')
    s = raw_input('Input start date in the format dd/mm/yyyy: ')
    startStamp = int(time.mktime(datetime.datetime.strptime(s, "%d/%m/%Y").timetuple()))
    s = raw_input('Input end date in the format dd/mm/yyyy: ')
    endStamp = int(time.mktime(datetime.datetime.strptime(s, "%d/%m/%Y").timetuple()))
    # int(raw_input()) instead of input(): Py2 input() eval()s whatever the
    # user types, which is both fragile and a code-execution hole.
    step = int(raw_input('Input seconds between each search, 30 recommended: '))
    # Original left this handle open with the write unflushed; with-block
    # guarantees the config actually reaches disk.
    with open('config.txt', 'w') as cfg:
        cfg.write(str(startStamp)+','+str(endStamp)+','+str(step)+','+str(subName))
#Seconds
if not os.path.exists(folderName):
    os.makedirs(folderName)
def getNew(subName,folderName): | |
subreddit_comment = r.get_comments(subName, limit=1000) | |
subreddit_posts = r.get_submissions(subName, limit=1000) | |
for comment in subreddit_comment: | |
print comment | |
url= comment.permalink | |
data= {'user-agent':'archive by /u/healdb'} | |
#manually grabbing this file is much faster than loading the individual json files of every single comment, as this json provides all of it | |
response = requests.get(url+'.json',headers=data) | |
#Create a folder called dogecoinArchive before running the script | |
filename=folderName+"/"+comment.name | |
obj=open(filename, 'w') | |
obj.write(response.text) | |
obj.close() | |
#print post_json | |
for post in subreddit_posts: | |
print post | |
url1= post.permalink | |
#pprint(vars(post)) | |
data= {'user-agent':'archive by /u/healdb'} | |
#manually grabbing this file is much faster than loading the individual json files of every single comment, as this json provides all of it | |
if submission.id not in already_done: | |
response = requests.get(url1+'.json',headers=data) | |
#Create a folder called dogecoinArchive before running the script | |
filename=folderName+"/"+post.name | |
obj=open(filename, 'w') | |
obj.write(response.text) | |
obj.close() | |
#print post_json | |
already_done.add(submission.id) | |
else: | |
continue | |
def main(startStamp, endStamp, step, folderName, subName, progress):
    """Walk the timestamp range and archive every matching post as JSON.

    Searches *subName* with reddit's cloudsearch syntax in windows of *step*
    seconds from *startStamp* to *endStamp* (unix timestamps), saving each
    result's permalink ``.json`` page under *folderName*.  *progress* is the
    total span in seconds, used only to scale the progress bar.
    """
    count = step
    # Resume from where a previous run stopped, if it recorded a timestamp.
    # Narrowed from a bare except: only a missing/unreadable file (IOError)
    # or garbage contents (ValueError) mean "start from the argument".
    try:
        with open(folderName + "/lastTimestamp.txt") as stamp_file:
            startStamp = stamp_file.read()
        print("Got last timestamp from file: " + startStamp)
        startStamp = int(startStamp)
    except (IOError, ValueError):
        pass
    for currentStamp in range(startStamp, endStamp, step):
        f = str(currentStamp)
        g = str(currentStamp + step)
        # b + f + d + g == "timestamp:<start>..<end>" (cloudsearch range).
        search_results = r.search(b+f+d+g, subreddit=subName, syntax='cloudsearch')
        # Crude screen clear (1000 newlines) plus a 20-slot progress bar.
        print(('\n'*1000)+'['+'*'*int((float(count)/float(progress)*20.0))+'_'*(20-int(float(count)/float(progress)*20.0))+']')
        count += step
        for post in search_results:
            url = (post.permalink).replace('?ref=search_posts', '')
            data = {'user-agent': 'archive by /u/healdb'}
            # Grabbing the permalink's .json directly is much faster than
            # loading the individual json files of every single comment.
            response = requests.get(url + '.json', headers=data)
            filename = folderName + "/" + post.name + '.json'
            # with-blocks replace open/close pairs that leaked on write errors.
            with open(filename, 'w') as obj:
                obj.write(response.text)
        # Record progress after each window so an interrupted run can resume.
        with open(folderName + "/lastTimestamp.txt", 'w') as obj:
            obj.write(str(currentStamp))
    print('Welp, all done here! Stopped at timestamp ' + str(currentStamp))
# Total span in seconds; main() uses it to scale the progress bar.
progress = endStamp - startStamp
# Outer supervisor loop: run the historical archive, then poll for new
# content forever; on any error, report it and retry after a short pause.
while True:
    try:
        main(startStamp, endStamp, step, folderName, subName, progress)
        print("Succesfully got all posts within parameters. Now archiving all new posts and comments.")
        while True:
            getNew(subName, folderName)
    except KeyboardInterrupt:
        exit()
    except Exception:
        # `except Exception:` instead of a bare except so SystemExit and
        # interpreter shutdown are not swallowed.  The original slept for
        # 5s *before* showing the error and slept three times (15s total);
        # report immediately, then pause once before retrying.
        print("Error in the program! The error was as follows: ")
        error = traceback.format_exc()
        print(error)
        print("Resuming in 5 seconds...")
        time.sleep(5)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment