@jdmar3
Forked from claymcleod/parseReddit.py
Created November 4, 2017 16:28
#!/usr/bin/python
# Title: Reddit Data Mining Script
# Authors: Clay McLeod
# Description: This script mines JSON data
# from the Reddit front page and stores it
# as a CSV file for analysis.
# Section: Python
# Subsection: Data Science
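# The front-page listing endpoint (http://www.reddit.com/.json) returns a JSON
# document shaped roughly like:
#   {"data": {"children": [{"data": {"domain": ..., "subreddit": ..., "author": ...,
#                                    "score": ..., "num_comments": ..., "name": "t3_..."}}, ...]}}
# Each post's "data" object is flattened into one CSV row using the fields in
# `want` below; "name" (the post's fullname) is used for pagination.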
want = ["domain", "subreddit", "subreddit_id", "id", "author", "score", "over_18", "downs", "created_utc", "ups", "num_comments"]
# Don't touch below this point
import csv
import os
import time
import requests
from optparse import OptionParser
parser = OptionParser()
parser.add_option("-f", "--file",
action="store", type="string", dest="fileName", default="results.csv",
help="specify the output csv file");
parser.add_option("-q", "--quiet",
action="store_false", dest="verbose", default=True,
help="don't print status messages to stdout")
parser.add_option("-o", "--once",
action="store_true", dest="once", default=False,
help="run program only once")
parser.add_option("-d", "--dup",
action="store_false", dest="detect_duplicates", default=True,
help="do not detect duplicates in csv")
parser.add_option("-s", "--score",
action="store", type="int", dest="score_floor", default=0,
help="set a minimum score (ratio of upvotes to downvotes) to add to the csv")
parser.add_option("-w", "--wait",
action="store", type="int", dest="wait_time", default=(60 * 15),
help="set the amount of time to repoll the data")
(options, args) = parser.parse_args()
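# Example invocation (illustrative; the output filename is arbitrary):
#   python parseReddit.py --file frontpage.csv --score 10 --wait 900
# polls the front page every 15 minutes and keeps posts with a score of at least 10.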
q = []
domains = []
subreddits = []
lastcomment = ''
duplicates = 0
wait_time = options.wait_time
fileName = options.fileName
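# Reddit asks API clients to send a descriptive User-Agent; requests with a
# generic or missing User-Agent are far more likely to be rate-limited.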
headers = {
    'User-Agent': 'Reddit Data Mining Script'
}
def log(my_string):
    if options.verbose:
        print "[+] %s" % my_string
    return
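# Cached CSV header row plus column positions for the fields used in duplicate
# detection. They are filled from the header row of an existing CSV if one is
# found, otherwise from the first batch of API results.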
csvheaders = []
csvheadersset = False
authoridx = -1
subredditidx = -1
domainidx = -1
utcidx = -1
scoreidx = -1
upsidx = -1
downsidx = -1
numcommentsidx = -1
if(os.path.exists(fileName)):
    with open(fileName, 'rb') as csvfile:
        spamreader = csv.reader(csvfile, delimiter=' ', quotechar='|')
        i = -1
        print "Detecting duplicates..."
        for row in spamreader:
            i = i + 1
            addrow = []
            array = row[0].split(',')
            if array[0] in want:
                # Header row: remember the column order so rows can be compared by field.
                csvheaders = array
                csvheadersset = True
                authoridx = array.index("author")
                subredditidx = array.index("subreddit")
                domainidx = array.index("domain")
                utcidx = array.index("created_utc")
                scoreidx = array.index("score")
                upsidx = array.index("ups")
                downsidx = array.index("downs")
                numcommentsidx = array.index("num_comments")
                continue
            for x in array:
                addrow.append(x.strip())
            if(options.detect_duplicates):
                for qrow in q:
                    if (addrow[authoridx] == qrow[authoridx] and addrow[subredditidx] == qrow[subredditidx] and addrow[domainidx] == qrow[domainidx] and addrow[utcidx] == qrow[utcidx]):
                        # Same post seen before: refresh its counters instead of adding a new row.
                        qrow[scoreidx] = addrow[scoreidx]
                        qrow[upsidx] = addrow[upsidx]
                        qrow[downsidx] = addrow[downsidx]
                        qrow[numcommentsidx] = addrow[numcommentsidx]
                        duplicates = duplicates + 1
                        break
                else:
                    # No duplicate found: keep the row if it meets the score floor.
                    if(options.score_floor <= int(addrow[scoreidx])):
                        q.append(addrow)
            else:
                if(options.score_floor <= int(addrow[scoreidx])):
                    q.append(addrow)
    log("Loaded {0} instances and updated {1} entries".format(str(len(q)), duplicates))
log("Outputting to %s" % fileName)
while(True):
    for i in range(1, 11):
        if(lastcomment == ''):
            r = requests.get(r'http://www.reddit.com/.json?limit=100', headers=headers)
        else:
            r = requests.get(r'http://www.reddit.com/.json?limit=100&after=' + lastcomment, headers=headers)
        data = r.json()
        for child in data['data']['children']:
            curr = []
            if len(csvheaders) > 0:
                csvheadersset = True
            # Collect the wanted fields for this post; build the header row on the
            # first pass if no existing CSV supplied one.
            for k in child['data']:
                for header in want:
                    if k == header:
                        curr.append(str(child['data'][k]))
                        if csvheadersset == False:
                            csvheaders.append(k)
            for qrow in q:
                if (curr[authoridx] == qrow[authoridx] and curr[subredditidx] == qrow[subredditidx] and curr[domainidx] == qrow[domainidx] and curr[utcidx] == qrow[utcidx]):
                    # Already collected: refresh the post's counters in place.
                    qrow[scoreidx] = curr[scoreidx]
                    qrow[upsidx] = curr[upsidx]
                    qrow[downsidx] = curr[downsidx]
                    qrow[numcommentsidx] = curr[numcommentsidx]
                    duplicates = duplicates + 1
                    break
            if(curr not in q):
                if(options.score_floor <= int(curr[csvheaders.index("score")])):
                    q.append(curr)
            lastcomment = child['data']['name']
    if not options.once:
        log("Sleeping {0} seconds. Data-Size: {1}".format(wait_time, str(len(q))))
    else:
        log("Finished execution. Data-Size: {0}".format(str(len(q))))
    # Rewrite the CSV from scratch with the current snapshot of collected posts.
    if os.path.exists(fileName):
        os.remove(fileName)
    csvfile = open(fileName, 'wb')
    spamwriter = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
    spamwriter.writerow(csvheaders)
    for row in q:
        spamwriter.writerow(row)
    csvfile.close()
    if options.once:
        exit()
    time.sleep(wait_time)
    lastcomment = ''