Created
July 18, 2022 20:52
-
-
Save natematias/d115d6aa4d8935d4b591b08ad4091863 to your computer and use it in GitHub Desktop.
Python code for processing pushshift data to output all comments associated with a specific subreddit
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys,os,io
import simplejson as json
import zstandard as zstd
# Name of the subreddit whose comments are extracted; matching is
# case-insensitive.
subreddit = "futurology"


def iter_decoded_lines(reader, chunk_size=2**24):
    """Yield each newline-delimited line of a binary stream as ``str``.

    The stream is consumed ``chunk_size`` bytes at a time (16 MiB by
    default).  Splitting is done on the raw bytes and only complete lines
    are decoded, so a multi-byte UTF-8 character that straddles a chunk
    boundary never causes a ``UnicodeDecodeError``.

    Args:
        reader: Object with a ``read(n)`` method returning ``bytes``.
        chunk_size: Number of bytes requested per ``read`` call.

    Yields:
        Each line, decoded as UTF-8, without its trailing newline.  A final
        line that is not newline-terminated is yielded as well.
    """
    pending = b""  # bytes of the not-yet-terminated last line
    while True:
        chunk = reader.read(chunk_size)
        if not chunk:
            break
        # Prepending the carry-over keeps a line intact even when it spans
        # several chunks (including chunks that contain no newline at all).
        pieces = (pending + chunk).split(b"\n")
        pending = pieces.pop()
        for raw in pieces:
            yield raw.decode("utf-8")
    if pending:
        yield pending.decode("utf-8")


def filter_subreddit_comments(reader, out, target, chunk_size=2**24):
    """Copy the JSON lines whose ``subreddit`` equals ``target`` to ``out``.

    Each non-blank line read from ``reader`` is parsed as a JSON object and
    its ``'subreddit'`` value is compared case-insensitively with
    ``target``.  Matching lines are written to ``out`` unchanged, one per
    line.  A "." is written to stdout after every 1000 matches as a
    progress indicator.

    Args:
        reader: Binary stream of newline-delimited JSON objects.
        out: Text stream that receives the matching lines.
        target: Subreddit name to keep.
        chunk_size: Number of bytes read from ``reader`` at a time.

    Returns:
        The number of lines written to ``out``.

    Raises:
        KeyError: If a parsed object has no ``'subreddit'`` key.
        ValueError: If a non-blank line is not valid JSON.
    """
    wanted = target.lower()
    matched = 0
    for line in iter_decoded_lines(reader, chunk_size):
        if not line.strip():
            # Blank lines carry no record; skip them instead of letting the
            # JSON parser fail on an empty document.
            continue
        comment = json.loads(line)
        if comment['subreddit'].lower() == wanted:
            print(line, file=out)
            matched += 1
            if matched % 1000 == 0:
                sys.stdout.write(".")
                sys.stdout.flush()
    return matched


def main(argv=None):
    """Filter a zstd-compressed comment dump down to one subreddit.

    Args:
        argv: ``[infile, outfile]``; defaults to ``sys.argv[1:]``.  Any
            additional arguments are ignored.

    Returns:
        The number of comments written to ``outfile``.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 2:
        sys.exit("usage: <script> INFILE.zst OUTFILE")
    infile, outfile = args[0], args[1]
    print("infile: {0}".format(infile))

    # Explicit UTF-8 so the output does not depend on the platform's
    # default encoding.
    with open(outfile, "w", encoding="utf-8") as of:
        with open(infile, 'rb') as fh:
            # 2147483648 == 2**31 bytes.  NOTE(review): presumably needed
            # because the dumps were compressed with a large window -
            # confirm against the data source.
            dctx = zstd.ZstdDecompressor(max_window_size=2147483648)
            with dctx.stream_reader(fh) as reader:
                return filter_subreddit_comments(reader, of, subreddit)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment