Skip to content

Instantly share code, notes, and snippets.

@nikolak
Created November 5, 2013 01:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save nikolak/7312370 to your computer and use it in GitHub Desktop.
Save nikolak/7312370 to your computer and use it in GitHub Desktop.
import sqlite3
import time
import json
import urllib2
def get_submissions():
url="http://www.reddit.com/r/all/new/.json" #URL of the page that we want to fetch
headers = { 'User-Agent' : 'fetching new submissions script' } # our script "identifier"
req = urllib2.Request(url, None, headers) # We create new request here to open the url with above set headers
data = urllib2.urlopen(req).read() # Open url and make data variable equal whatever we get
json_data=json.loads(data) # Load json from variable defined above as "data", it should be string/text that we get from reddit
return json_data["data"]["children"] # The JSON contains useless fields, the data we need is in data/children keys
if __name__=="__main__":
conn = sqlite3.connect('database.db') # Either create new database or connect to exiting one, sqlite module
# handles this for us so we don't have to worry about whether it exists or not
c = conn.cursor() # This is database cursor, basically what we will use to enter data into database
c.execute('''CREATE TABLE IF NOT EXISTS submissions
(id text, title text, author text, subreddit text, domain text, date real)''')
#Self explanatory but here we create new table named submissions, think of it like a sheet in excel
#but only if it doesn't already exist, also we create fields id, title etc which contain text. Only date is real i.e. number
user_choice=raw_input("1 - Gather data\n2 - Example stats\n>>>") # We ask user to provide input and make that input
# assigned to variable named user_choice
if user_choice=="1": # if user_choice is equal number one then we do this
while True: # this causes infinite loop, because True can never be false.
print "*"*50 #print * fifty times in a row
print "Getting new submissions..."
all_submissions=get_submissions() # we call above defined function which opens reddit and returns json list of submissions
for submission in all_submissions: # we go over that list and on each run the item we're going over is assigned to variable named submission
submission=submission["data"] # quick fix, we can either type submission["data"]["id"] etc or just make submission equal to key "data"
s=(submission["id"],) # we make a "variable" s with submission id
c.execute('SELECT * FROM submissions WHERE id=?', s) # we tell our database cursor to get us ANY item from submission table with id equal to variable s
if c.fetchone() is None: # then we fetch just one result from above execution, if there are no entries this item does not exist in our databse and we should write it
item=(submission["id"],submission["title"],submission["author"],submission["subreddit"],
submission["domain"],submission["created_utc"]) # this just creates a variable item with subission properties all in one place, we need those for our database
print "Submission ID {}, not in database - adding now.".format(submission["id"]) # not necessary, but so we know what we're doing
c.execute('INSERT INTO submissions VALUES (?,?,?,?,?,?)',item) # pretty much self explained, we enter 6 values into submission table, those values are taken from item variable
else: # we got some results from c.execute so our item already exist in database, skip it and print to user that it's skipped
print "Skipping submission ID {}, already in database".format(submission["id"])
conn.commit() # commit changes to database file on hard drive
print "Changes saved sucessfuly, waiting 10 seconds..."
time.sleep(10) #sleep, time-out for 10 seconds before repating action again. Because as I said True is always true, and whatever is inside will just loop forever
elif user_choice=="2": # we want to get some stats
while True: #To enable user to enter new words all the time without having to restart script
user_input=raw_input("Enter a word to display all submissions that have it in title\n") #same as above, get user input - assign it to useR_input variable
for row in c.execute('SELECT * FROM submissions WHERE TITLE LIKE ("%" || ? || "%") ORDER BY date',(user_input,)):
# here we get all submissions from our database which have word user entered anywhere in their title, and we sort those by date
print row
else:
# the user input is neither 1 nor 2 so it's invalid.
print "Invalid input"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment