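# natematias/all-subreddit-comments-from-pushshift-zst-file.py
#
# all-subreddit-comments-from-pushshift-zst-file.py
#
# Reads a zstd-compressed file of newline-delimited JSON comment records and
# copies every record whose "subreddit" field matches the configured subreddit
# to an output file, one JSON record per line.
#
# Usage: python all-subreddit-comments-from-pushshift-zst-file.py INFILE OUTFILE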
import sys,os,io
import simplejson as json
import zstandard as zstd

# Name of the subreddit to extract.  It must be written in lower case because
# each record's "subreddit" value is lower-cased before the comparison.
subreddit = "futurology"

# Fail with a usage hint instead of a bare IndexError when arguments are missing.
if len(sys.argv) < 3:
    sys.exit(
        "usage: all-subreddit-comments-from-pushshift-zst-file.py INFILE OUTFILE"
    )

infile = sys.argv[1]   # path of the zstd-compressed input file
outfile = sys.argv[2]  # path of the filtered output file (overwritten)

print("infile: {0}".format(infile))

# Number of records parsed so far; drives the progress dots below.
counter = 0

# The input is decoded as UTF-8, so the output is written as UTF-8 explicitly
# rather than with whatever the platform's default encoding happens to be.
with open(outfile, "w", encoding="utf-8") as of, open(infile, "rb") as fh:
    # Accept frames that need a decompression window of up to 2 GiB.
    # NOTE(review): presumably the dumps were compressed with a long window
    # that exceeds the library default -- confirm against the data source.
    dctx = zstd.ZstdDecompressor(max_window_size=2147483648)
    with dctx.stream_reader(fh) as reader:
        # Bytes of the last, still incomplete line seen so far.  Lines are
        # split while the data is still bytes and each complete line is
        # decoded on its own: a chunk boundary can fall in the middle of a
        # multi-byte UTF-8 character, so decoding whole chunks can raise
        # UnicodeDecodeError.  The byte 0x0A never occurs inside a multi-byte
        # UTF-8 sequence, so splitting on b"\n" is always safe.
        pending = b""
        at_eof = False
        while not at_eof:
            chunk = reader.read(2**24)  # 16 MiB of decompressed data per read
            if chunk:
                # Prepend the carried-over fragment before splitting so that
                # a chunk without any newline extends it instead of
                # replacing it.
                raw_lines = (pending + chunk).split(b"\n")
                pending = raw_lines.pop()
            else:
                # End of stream: a final record without a trailing newline
                # is still sitting in `pending` and must be processed too.
                at_eof = True
                raw_lines = [pending] if pending.strip() else []

            for raw_line in raw_lines:
                line = raw_line.decode("utf-8")
                comment = json.loads(line)
                if comment['subreddit'].lower() == subreddit:
                    print(line, file=of)
                counter += 1
                # One progress dot per 1000 parsed records.
                if counter % 1000 == 0:
                    sys.stdout.write(".")
                    sys.stdout.flush()
	import sys,os,io
	import simplejson as json
	import zstandard as zstd

	# Name of the subreddit to extract.  It must be written in lower case
	# because each record's "subreddit" value is lower-cased before comparing.
	subreddit = "futurology"

	# Report a usage hint instead of a bare IndexError on missing arguments.
	if len(sys.argv) < 3:
		sys.exit(
			"usage: all-subreddit-comments-from-pushshift-zst-file.py INFILE OUTFILE"
		)

	infile = sys.argv[1]   # path of the zstd-compressed input file
	outfile = sys.argv[2]  # path of the filtered output file (overwritten)

	print("infile: {0}".format(infile))

	# Number of records parsed so far; drives the progress dots below.
	counter = 0

	# Input is decoded as UTF-8, so the output is written as UTF-8 explicitly
	# instead of relying on the platform's default encoding.
	with open(outfile, "w", encoding="utf-8") as of, open(infile, "rb") as fh:
		# Accept frames that need a decompression window of up to 2 GiB.
		# NOTE(review): presumably the input was compressed with a long
		# window that exceeds the library default -- confirm.
		dctx = zstd.ZstdDecompressor(max_window_size=2147483648)
		with dctx.stream_reader(fh) as reader:
			# Carry-over bytes of the last incomplete line.  Splitting is done
			# on bytes and each full line is decoded individually, because a
			# chunk boundary may cut a multi-byte UTF-8 character in half and
			# decoding a whole chunk would then raise UnicodeDecodeError.
			pending = b""
			at_eof = False
			while not at_eof:
				chunk = reader.read(2**24)  # 16 MiB of decompressed data
				if chunk:
					# Join the carried-over fragment with the new data first,
					# so a chunk with no newline extends it rather than
					# overwriting it.
					raw_lines = (pending + chunk).split(b"\n")
					pending = raw_lines.pop()
				else:
					# End of stream: flush a last record that has no
					# trailing newline.
					at_eof = True
					raw_lines = [pending] if pending.strip() else []

				for raw_line in raw_lines:
					line = raw_line.decode("utf-8")
					comment = json.loads(line)
					if comment['subreddit'].lower() == subreddit:
						print(line, file=of)
					counter += 1
					# One progress dot per 1000 parsed records.
					if counter % 1000 == 0:
						sys.stdout.write(".")
						sys.stdout.flush()