Last active
October 30, 2019 00:40
-
-
Save egg82/33e1ac4e8353fce34c833e3736fc678b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import os | |
import bz2 | |
import lzma | |
import zstandard as zstd | |
import json | |
import sys | |
import html | |
import shutil | |
# --- Configuration -------------------------------------------------------
# Directory containing the compressed reddit comment dumps (.bz2/.xz/.zst).
BASE_DIR = "path/to/reddit_data"
# Output file accumulating the filtered, normalized comments (JSON lines).
OUT_FILE = "output.json"
# Records the path of the file currently being processed so an interrupted
# run can resume from the same file.
CHECKPOINT_FILE = "checkpoint.dat"

# If non-empty, ONLY comments from these subreddits are kept.
whitelist = []

# Comments from these subreddits are always discarded (sports, meta,
# trading, and roleplay communities that are low-signal for this dataset).
# Fix: the original list contained "rwbyRP" twice; the duplicate entry was
# removed (membership tests are unaffected).
blacklist = [
    "sports",
    "hockey",
    "nba",
    "NBA2k",
    "nfl",
    "NFTL",
    "soccer",
    "CFB",
    "FIFA",
    "FifaCareers",
    "BostonBruins",
    "DetroitRedWings",
    "rangers",
    "NHLHUT",
    "WWE",
    "MaddenUltimateTeam",
    "NYKnicks",
    "CollegeBasketball",
    "NOLAPelicans",
    "BGASL",
    "sixers",
    "announcements",
    "blog",
    "counting",
    "The_Donald",
    "IncelTears",
    "askanincel",
    "Incelselfies",
    "IncelsWithoutHate",
    "Incels",
    "pokemon",
    "CasualPokemonTrades",
    "Pokemongiveaway",
    "pokemontrades",
    "Pokemonexchange",
    "PokeMoonSun",
    "BankBallExchange",
    "pokemonduel",
    "SVExchange",
    "ClubNintendoTrade",
    "PokemonQRCodes",
    "ACTrade",
    "RocketLeagueExchange",
    "rocket_league_trading",
    "YamakuHighSchool",
    "XMenRP",
    "CampArcadia",
    "MonarchyOfEquestria",
    "rwbyRP",
    "TTPloreplaycentral",
    "TheDescendantsOfRome",
    "CampHalfBloodRP",
    "RWBY",
    "EroticRolePlay",
    "PercyJacksonRP",
    "PotterPlayRP",
    "HogwartsRP",
    "ALORP",
    "SupersRP",
    "dcrp",
    "BloodGulchRP",
    "IronThronePowers",
    "MassEffectPhoenix",
    "MigrantFleet",
    "TheInnBetween",
    "AntiHeroReborn",
    "AuraRP",
    "CrimsonShoresRP",
    "DarkPantheon",
    "darkestdungeonrp",
    "Devilrp",
    "Fairy_TailRP",
    "FTRP",
    "HeroesAcademyReborn",
    "HonorHillRP",
    "TheNarutoWorld",
    "randomsuperpowers",
    "RidersOfBerk",
    "SalvaticaRP",
    "SuperWorldRP",
    "TheKalenSeries",
    "GreekMythRP",
    "BullworthRP",
    "InfamousSecondRP",
    "vegasquadrantrp"
]

# A comment containing ANY of these substrings is discarded: links,
# markdown link syntax, reddit self-references, vote talk, and roleplay
# out-of-character markers.
substring_blacklist = [
    "[",
    "http://",
    "https://",
    " r/",
    " u/",
    "/r/",
    "/u/",
    "reddit",
    "Reddit",
    "upvot",
    "Upvot",
    "downvot",
    "Downvot",
    "OOC:"
]
def main():
    """Scan BASE_DIR for compressed reddit dumps, filter and normalize the
    comments, and append them to OUT_FILE.

    If CHECKPOINT_FILE exists, it names the file a previous (interrupted)
    run was processing; scanning resumes from that file onward.

    Raises:
        Exception: if BASE_DIR is missing or not a directory, or if the
            checkpoint/output paths exist but are not regular files.
    """
    if not os.path.exists(BASE_DIR):
        raise Exception("BASE_DIR does not exist")
    directory = os.path.abspath(BASE_DIR)
    if not os.path.isdir(directory):
        raise Exception("BASE_DIR must be a directory")

    checkpoint = None
    chk_f = os.path.abspath(CHECKPOINT_FILE)
    if os.path.exists(chk_f) and not os.path.isfile(chk_f):
        raise Exception("CHECKPOINT_FILE must be a file")
    if os.path.exists(chk_f):
        checkpoint = get_checkpoint(chk_f)

    out_f = os.path.abspath(OUT_FILE)
    if os.path.exists(out_f) and not os.path.isfile(out_f):
        raise Exception("OUT_FILE must be a file")
    # Only start a fresh output file when NOT resuming from a checkpoint.
    if checkpoint is None and os.path.exists(out_f):
        print("WARNING: overwriting existing file " + out_f)
        os.remove(out_f)

    print("Scanning base dir " + directory)
    files = get_files(directory, (".bz2", ".xz", ".zst"), True)
    files.sort()
    print("Found " + str(len(files)) + " files")
    if checkpoint is not None:
        print("Resuming from checkpoint")

    total_comments = 0
    for file in files:
        file = unzip(file)
        if checkpoint is None or checkpoint == file:
            print("Scanning " + file)
            total_comments += parse_comments(file, out_f, chk_f)
            # The checkpoint only marks the FIRST file to (re)process;
            # once reached, every subsequent file is scanned normally.
            checkpoint = None
        else:
            print("Skipping " + file)

    # Fix: guard the removal -- the original raised FileNotFoundError when
    # no files were processed and the checkpoint file was never created.
    if os.path.exists(chk_f):
        os.remove(chk_f)
    print("Total comments: " + str(total_comments))
def get_checkpoint(file):
    """Read a saved checkpoint.

    Returns the stripped first line of *file* (the path of the dump a
    previous run was processing), or None when that line is empty.
    """
    with open(file, "rt", encoding="utf8") as in_file:
        first_line = in_file.readline().strip()
    return first_line if first_line else None
def get_files(path, extensions, recursive):
    """Collect files under *path* whose names end with one of *extensions*.

    Descends into subdirectories when *recursive* is truthy; directories
    themselves are never returned.
    """
    suffixes = tuple(extensions)
    matches = []
    for entry in os.listdir(path):
        full_path = os.path.join(path, entry)
        if os.path.isdir(full_path):
            if recursive:
                matches.extend(get_files(full_path, extensions, recursive))
        elif os.path.isfile(full_path) and full_path.endswith(suffixes):
            matches.append(full_path)
    return matches
def ext_match(file, extensions):
    """Return True when *file* ends with any of the given *extensions*.

    Idiom fix: str.endswith accepts a tuple of suffixes, replacing the
    original manual loop in one C-level call.
    """
    return file.endswith(tuple(extensions))
def unzip(file):
    """Decompress *file* to a sibling ".raw" file and return the .raw path.

    Skips decompression when the .raw file already exists. Returns None
    when the extension is unrecognized and no .raw file exists.

    Bug fix: the original used re.sub(r"\\..*$", ".raw", file), which
    replaces everything from the FIRST dot anywhere in the path -- a
    directory or file name containing a dot (e.g. "data.v2/RC_2019-01.bz2")
    produced a bogus path. os.path.splitext strips only the final extension.
    """
    raw_file = os.path.splitext(file)[0] + ".raw"
    if os.path.exists(raw_file) and not os.path.isfile(raw_file):
        # A directory is squatting on the target name; clear it out.
        shutil.rmtree(raw_file)
    if not os.path.exists(raw_file):
        if file.endswith(".bz2"):
            unzip_bzip2(file, raw_file)
        elif file.endswith(".xz"):
            unzip_lzma(file, raw_file)
        elif file.endswith(".zst"):
            unzip_zstd(file, raw_file)
        else:
            return None
    return raw_file
def unzip_bzip2(input, output):
    """Stream-decompress the bzip2 archive *input* into the file *output*."""
    print("Decompressing " + input)
    chunk_size = 1024 * 1024 * 4  # 4 MiB reads keep memory use bounded
    with open(output, "wb") as dst, bz2.open(input, "rb") as src:
        while True:
            chunk = src.read(chunk_size)
            if not chunk:
                break
            dst.write(chunk)
def unzip_lzma(input, output):
    """Stream-decompress the xz/lzma archive *input* into the file *output*."""
    print("Decompressing " + input)
    chunk_size = 1024 * 1024 * 4  # 4 MiB reads keep memory use bounded
    with open(output, "wb") as dst, lzma.open(input, "rb") as src:
        while True:
            chunk = src.read(chunk_size)
            if not chunk:
                break
            dst.write(chunk)
def unzip_zstd(input, output):
    """Stream-decompress the zstd archive *input* into the file *output*.

    Relies on the third-party ``zstandard`` module imported at file level.
    """
    print("Decompressing " + input)
    with open(output, "wb") as dst, open(input, "rb") as src:
        dctx = zstd.ZstdDecompressor()
        # read_to_iter yields decompressed chunks, reading 4 MiB at a time.
        for chunk in dctx.read_to_iter(src, 1024 * 1024 * 4):
            dst.write(chunk)
def parse_comments(input, output, checkpoint):
    # Filter and normalize every comment in the decompressed dump `input`,
    # appending the survivors to `output` as JSON lines. Returns the number
    # of comments written.
    #
    # The `checkpoint` file does double duty:
    #   line 1   - the path of the file being processed (resume marker),
    #   lines 2+ - a staging buffer holding this file's accepted comments.
    # Results are only appended to `output` AFTER the whole file has been
    # processed, so a crash mid-file never leaves partial data in `output`.
    num_comments = 0
    with open(input, "rt", encoding="utf8") as in_file, open(checkpoint, "wt+", encoding="utf8") as checkpoint_file:
        # First line records which dump this checkpoint belongs to; flushed
        # immediately so a crash right after still allows resuming.
        checkpoint_file.write(input + "\n")
        checkpoint_file.flush()
        for i, line in enumerate(in_file):
            j = parse_json(line)
            # Skip non-JSON lines and comments rejected by the subreddit /
            # substring filters.
            if j == None or not is_included(j, whitelist, blacklist, substring_blacklist):
                continue
            j = normalize_comment(j)
            # Re-check length: normalization may have shrunk the body below
            # the minimum (e.g. by collapsing whitespace).
            if not check_length(j["body"]):
                continue
            #sys.stdout.buffer.write((j["body"] + "\n").encode("utf8")) # debug print
            checkpoint_file.write(json.dumps(j) + "\n") # json output
            # checkpoint_file.write(j["body"]) # single-line output
            num_comments += 1
            if num_comments % 1000 == 0:
                print(str(num_comments) + " comments so far..")
    print("Fetched " + str(num_comments) + " comments")
    if num_comments > 0:
        print("Flushing checkpoint data..")
        # Copy the staged comments (everything after the marker line) from
        # the checkpoint file into the real output file.
        with open(checkpoint, "rt", encoding="utf8") as checkpoint_file, open(output, "at+", encoding="utf8") as out_file:
            checkpoint_file.readline()  # skip the resume-marker line
            for i, line in enumerate(checkpoint_file):
                out_file.write(line)
    return num_comments
def parse_json(line):
    """Parse one dump line into a dict, or return None when the line is
    clearly not a JSON object or fails to parse.

    Fix: json.loads used to propagate on a malformed line, aborting the
    entire multi-gigabyte scan; a bad line now just yields None, matching
    this function's existing contract for non-JSON lines.
    """
    line = line.strip()
    if len(line) <= 1:
        return None
    # Cheap pre-check: a JSON object must end (or nearly end) with '}'.
    # The second-to-last character is also accepted, tolerating one stray
    # trailing character.
    if line[-1] != '}' and line[-2] != '}':
        return None
    try:
        return json.loads(line)
    except ValueError:  # json.JSONDecodeError subclasses ValueError
        return None
def is_included(json, whitelist, blacklist, substring_blacklist):
    """Decide whether the raw comment dict *json* should be kept.

    A comment passes when its body is 4..280 characters, its subreddit is
    allowed (whitelist empty = allow all, and not blacklisted), and the
    body contains none of the blacklisted substrings.
    """
    body = json["body"]
    # Length gate (inlined): reject missing bodies and anything outside
    # the 4..280 character window.
    if body is None or not (4 <= len(body) <= 280):
        return False
    subreddit = json["subreddit"]
    if whitelist and subreddit not in whitelist:
        return False
    if subreddit in blacklist:
        return False
    # Reject the comment if any forbidden fragment occurs in the body.
    return not any(fragment in body for fragment in substring_blacklist)
def check_length(body):
    """Return True when *body* is a non-None string of 4..280 characters
    (tweet-sized: filters out trivial and wall-of-text comments).

    Idiom fix: replaces `return False if ... else True` with a direct
    chained comparison.
    """
    return body is not None and 4 <= len(body) <= 280
def normalize_comment(json):
    """Return a minimal {"id", "parent_id", "body"} dict for a raw comment.

    - Ensures the id carries the "t1_" (comment) prefix.
    - Drops parent ids that are not themselves comments (no "t1_" prefix,
      e.g. "t3_" submissions) by setting parent_id to None.
    - Normalizes the body's whitespace and decodes HTML entities.

    Fix: the original mutated the caller's dict in place as a hidden side
    effect; this version is pure and leaves the input untouched.
    NOTE: the parameter shadows the imported ``json`` module; the module is
    not used here, and the name is kept for interface compatibility.
    """
    comment_id = json["id"]
    if not comment_id.startswith("t1_"):
        comment_id = "t1_" + comment_id
    parent_id = json["parent_id"]
    if parent_id is not None and not parent_id.startswith("t1_"):
        parent_id = None
    body = json["body"]
    body = re.sub(r'\r\n', '\n', body)      # We only want \n
    body = re.sub(r'[ \t]+\n', '\n', body)  # Strip whitespace before newline (keep whitespace after newline)
    body = re.sub(r'[ \t\n]+$', '', body)   # Strip trailing whitespace (keep leading whitespace)
    body = re.sub(r'\n+', '\n', body)       # Collapse blank lines (for json output)
    #body = re.sub(r'\n+', ' ', body)       # For single-line output
    body = html.unescape(body)              # Decode HTML entities (&amp; etc.)
    return {"id": comment_id, "parent_id": parent_id, "body": body}
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment