@Healdb
Created July 25, 2015 22:19
saverD.py
# -*- coding: utf-8 -*-
import time
import datetime
import praw
import os
import traceback
import requests
print " __ __ _ _ _ _ "
print " the / / / /| | (_) | | (_) "
print " / / __ / /_| | ___ __ _ ___ ___ ___ _ _ __ __ _ _ __ ___| |__ ___ _____ _ __ "
print " / / '__/ / _` |/ \ / _` |/ _ \/ __/ _ \| | '_ \ / _` | '__/ __| '_ \| \ \ / / _ \ '__|"
print " / /| | / / (_| | (_) | (_| | __/ (_| (_) | | | | | | (_| | | | (__| | | | |\ V / __/ | "
print " /_/ |_|/_/ \__,_|\___/ \__, |\___|\___\___/|_|_| |_| \__,_|_| \___|_| |_|_| \_/ \___|_| project"
print " __/ | "
print " |___/ code-name: chugger "
print "Code written by: /u/peoplma and /u/healdb"
print "Sillines added by: /u/joshtheimpaler"
print "Wow by: /r/dogecoin\n\n"
b = "timestamp:"
d = ".."
#Config Details-
r = praw.Reddit('searchandarchive by ')
folderName="subArchive"
if os.path.exists('config.txt'):
    # Reuse the parameters saved by a previous run.
    line = open('config.txt').read()
    startStamp, endStamp, step, subName = line.split(',')
    startStamp, endStamp, step = int(startStamp), int(endStamp), int(step)
else:
    subName = raw_input('Input the subreddit to archive: ')
    s = raw_input('Input start date in the format dd/mm/yyyy: ')
    startStamp = int(time.mktime(datetime.datetime.strptime(s, "%d/%m/%Y").timetuple()))
    s = raw_input('Input end date in the format dd/mm/yyyy: ')
    endStamp = int(time.mktime(datetime.datetime.strptime(s, "%d/%m/%Y").timetuple()))
    step = input('Input seconds between each search, 30 recommended: ')  # seconds per search window
    obj = open('config.txt', 'w')
    obj.write(str(startStamp) + ',' + str(endStamp) + ',' + str(step) + ',' + str(subName))
    obj.close()
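# config.txt then holds a single comma-separated line, e.g. (illustrative values):
#   1437868800,1438041600,30,dogecoin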
if not os.path.exists(folderName):
    os.makedirs(folderName)
already_done = set()  # ids of submissions already saved by getNew(); the original used this set without defining it

def getNew(subName, folderName):
    # Archive the newest comments and submissions as they come in.
    subreddit_comment = r.get_comments(subName, limit=1000)
    # The original called r.get_submissions(subName, ...), which expects fullnames in praw 3;
    # fetching the subreddit's new listing matches the intent here.
    subreddit_posts = r.get_subreddit(subName).get_new(limit=1000)
    for comment in subreddit_comment:
        print comment
        url = comment.permalink
        data = {'user-agent': 'archive by /u/healdb'}
        # Grabbing the permalink's .json directly is much faster than loading the
        # individual json of every single comment, as this json provides all of it.
        response = requests.get(url + '.json', headers=data)
        filename = folderName + "/" + comment.name
        obj = open(filename, 'w')
        obj.write(response.text)
        obj.close()
    for post in subreddit_posts:
        print post
        url1 = post.permalink
        data = {'user-agent': 'archive by /u/healdb'}
        if post.id not in already_done:  # the original tested an undefined name, submission.id
            response = requests.get(url1 + '.json', headers=data)
            filename = folderName + "/" + post.name
            obj = open(filename, 'w')
            obj.write(response.text)
            obj.close()
            already_done.add(post.id)
        else:
            continue
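# Note: get_comments()/get_new() only expose roughly the newest 1000 items each, which is
# why the caller at the bottom runs getNew() in a tight loop once the historical sweep is done.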
def main(startStamp, endStamp, step, folderName, subName, progress):
    # Sweep the requested window in step-second slices using cloudsearch timestamp queries.
    count = step
    try:
        # Resume from wherever the last run stopped.
        startStamp = open(folderName + "/lastTimestamp.txt").read()
        print("Got last timestamp from file: " + startStamp)
        startStamp = int(startStamp)
    except:
        pass
    for currentStamp in range(startStamp, endStamp, step):
        f = str(currentStamp)
        g = str(currentStamp + step)
        search_results = r.search(b + f + d + g, subreddit=subName, syntax='cloudsearch')
        # Crude progress bar: push old output off screen and show how far through the window we are.
        print(('\n' * 1000) + '[' + '*' * int(float(count) / float(progress) * 20.0) +
              '_' * (20 - int(float(count) / float(progress) * 20.0)) + ']')
        count += step
        for post in search_results:
            url = post.permalink.replace('?ref=search_posts', '')
            data = {'user-agent': 'archive by /u/healdb'}
            # Grabbing the permalink's .json directly is much faster than loading the
            # individual json of every single comment, as this json provides all of it.
            response = requests.get(url + '.json', headers=data)
            filename = folderName + "/" + post.name + '.json'
            obj = open(filename, 'w')
            obj.write(response.text)
            obj.close()
        # Record the last slice that finished so an interrupted run can resume.
        obj = open(folderName + "/lastTimestamp.txt", 'w')
        obj.write(str(currentStamp))
        obj.close()
    print('Welp, all done here! Stopped at timestamp ' + str(currentStamp))
progress = endStamp - startStamp
while True:
    try:
        main(startStamp, endStamp, step, folderName, subName, progress)
        print("Successfully got all posts within parameters. Now archiving all new posts and comments.")
        while True:
            getNew(subName, folderName)
    except KeyboardInterrupt:
        exit()
    except:
        print("Error in the program! The error was as follows: ")
        error = traceback.format_exc()
        time.sleep(5)
        print(error)
        time.sleep(5)
        print("Resuming in 5 seconds...")
        time.sleep(5)
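# Rough usage sketch (assumes Python 2 with a praw 3.x release and requests installed;
# the exact install command is an assumption):
#   $ pip install "praw<4" requests
#   $ python saverD.py
# Answer the prompts (subreddit, start/end dates, step); every matching post is saved as
# raw .json under subArchive/, and subArchive/lastTimestamp.txt lets an interrupted run resume.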