Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save natematias/d115d6aa4d8935d4b591b08ad4091863 to your computer and use it in GitHub Desktop.
Save natematias/d115d6aa4d8935d4b591b08ad4091863 to your computer and use it in GitHub Desktop.
Python code for processing pushshift data to output all comments associated with a specific subreddit
import sys,os,io
import simplejson as json
import zstandard as zstd
# Subreddit whose comments are copied to the output (compared case-insensitively).
subreddit = "futurology"

# Number of decompressed bytes requested per read (16 MiB).
CHUNK_SIZE = 2**24
# A progress dot is written to stdout each time this many records are parsed.
PROGRESS_INTERVAL = 1000


def read_lines(reader, chunk_size=CHUNK_SIZE):
    """Yield every non-blank line of a newline-delimited byte stream as ``str``.

    The stream is consumed in ``chunk_size``-byte reads.  Splitting is done on
    the raw bytes *before* decoding: the byte ``0x0A`` never occurs inside a
    multi-byte UTF-8 sequence, so a character that straddles two reads is
    reassembled instead of raising ``UnicodeDecodeError``.

    Args:
        reader: Object with a ``read(n)`` method returning ``bytes`` and an
            empty value at end of stream.
        chunk_size: Maximum number of bytes to request per ``read`` call.

    Yields:
        Each line decoded as UTF-8, without its trailing newline.  Blank
        lines are skipped.  A final line lacking a trailing newline is still
        yielded.

    Raises:
        UnicodeDecodeError: If a complete line is not valid UTF-8.
    """
    pending = b""  # bytes of the trailing, not-yet-terminated line
    while True:
        chunk = reader.read(chunk_size)
        if not chunk:
            break
        # Prepend the carried-over fragment so a read containing no newline
        # extends it rather than replacing it.
        *complete, pending = (pending + chunk).split(b"\n")
        for raw in complete:
            if raw and not raw.isspace():
                yield raw.decode("utf-8")
    # The stream may end without a final newline; do not drop that record.
    if pending and not pending.isspace():
        yield pending.decode("utf-8")


def main(argv=None):
    """Copy every comment belonging to ``subreddit`` from a dump to a file.

    Args:
        argv: Argument vector in ``sys.argv`` layout
            (``[prog, infile, outfile]``).  Defaults to ``sys.argv``.

    The input is read as a zstd-compressed stream of one JSON object per
    line; each object whose ``'subreddit'`` value matches ``subreddit``
    (ignoring case) is written unchanged, one per line, to the output file.
    """
    args = sys.argv if argv is None else argv
    if len(args) < 3:
        sys.exit("usage: {0} <infile.zst> <outfile>".format(args[0] if args else "script"))
    infile = args[1]
    outfile = args[2]
    print("infile: {0}".format(infile))

    target = subreddit.lower()
    counter = 0
    # Open the input first so a missing/unreadable dump does not truncate an
    # existing output file.  UTF-8 is explicit so non-ASCII comment text is
    # written correctly regardless of the platform default encoding.
    with open(infile, "rb") as fh, open(outfile, "w", encoding="utf-8") as of:
        # NOTE(review): the 2 GiB window limit is presumably required because
        # the dumps are compressed with a large window - confirm.
        dctx = zstd.ZstdDecompressor(max_window_size=2147483648)
        with dctx.stream_reader(fh) as reader:
            for line in read_lines(reader):
                comment = json.loads(line)
                if comment["subreddit"].lower() == target:
                    print(line, file=of)
                # NOTE(review): the original indentation was lost; the counter
                # is assumed to count every parsed record, not only matches.
                counter += 1
                if counter % PROGRESS_INTERVAL == 0:
                    sys.stdout.write(".")
                    sys.stdout.flush()


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment