Skip to content

Instantly share code, notes, and snippets.

@egg82
Last active October 30, 2019 00:40
Show Gist options
  • Save egg82/33e1ac4e8353fce34c833e3736fc678b to your computer and use it in GitHub Desktop.
import re
import os
import bz2
import lzma
import zstandard as zstd
import json
import sys
import html
import shutil
# Root directory containing the compressed reddit dump files (.bz2/.xz/.zst).
BASE_DIR = "path/to/reddit_data"
# JSON-lines output file (one normalized comment object per line).
OUT_FILE = "output.json"
# Resume marker: first line holds the path of the file currently being scanned.
CHECKPOINT_FILE = "checkpoint.dat"

# If non-empty, ONLY comments from these subreddits are kept.
# Sets are used for the subreddit filters: membership tests are O(1) and
# duplicates (the original list contained "rwbyRP" twice) are impossible.
whitelist = set()

# Comments from these subreddits are always dropped (sports, RP subs, etc.).
blacklist = {
    "sports",
    "hockey",
    "nba",
    "NBA2k",
    "nfl",
    "NFTL",
    "soccer",
    "CFB",
    "FIFA",
    "FifaCareers",
    "BostonBruins",
    "DetroitRedWings",
    "rangers",
    "NHLHUT",
    "WWE",
    "MaddenUltimateTeam",
    "NYKnicks",
    "CollegeBasketball",
    "NOLAPelicans",
    "BGASL",
    "sixers",
    "announcements",
    "blog",
    "counting",
    "The_Donald",
    "IncelTears",
    "askanincel",
    "Incelselfies",
    "IncelsWithoutHate",
    "Incels",
    "pokemon",
    "CasualPokemonTrades",
    "Pokemongiveaway",
    "pokemontrades",
    "Pokemonexchange",
    "PokeMoonSun",
    "BankBallExchange",
    "pokemonduel",
    "SVExchange",
    "ClubNintendoTrade",
    "PokemonQRCodes",
    "ACTrade",
    "RocketLeagueExchange",
    "rocket_league_trading",
    "YamakuHighSchool",
    "XMenRP",
    "CampArcadia",
    "MonarchyOfEquestria",
    "rwbyRP",
    "TTPloreplaycentral",
    "TheDescendantsOfRome",
    "CampHalfBloodRP",
    "RWBY",
    "EroticRolePlay",
    "PercyJacksonRP",
    "PotterPlayRP",
    "HogwartsRP",
    "ALORP",
    "SupersRP",
    "dcrp",
    "BloodGulchRP",
    "IronThronePowers",
    "MassEffectPhoenix",
    "MigrantFleet",
    "TheInnBetween",
    "AntiHeroReborn",
    "AuraRP",
    "CrimsonShoresRP",
    "DarkPantheon",
    "darkestdungeonrp",
    "Devilrp",
    "Fairy_TailRP",
    "FTRP",
    "HeroesAcademyReborn",
    "HonorHillRP",
    "TheNarutoWorld",
    "randomsuperpowers",
    "RidersOfBerk",
    "SalvaticaRP",
    "SuperWorldRP",
    "TheKalenSeries",
    "GreekMythRP",
    "BullworthRP",
    "InfamousSecondRP",
    "vegasquadrantrp",
}

# Comments whose body contains ANY of these substrings are dropped
# (links, reddit-specific jargon, out-of-character RP markers, ...).
# Kept ordered (tuple) since it is scanned linearly, not membership-tested.
substring_blacklist = (
    "[",
    "http://",
    "https://",
    " r/",
    " u/",
    "/r/",
    "/u/",
    "reddit",
    "Reddit",
    "upvot",
    "Upvot",
    "downvot",
    "Downvot",
    "OOC:",
)
def main():
    """Scan BASE_DIR for compressed reddit dumps, decompress each one, and
    append the filtered/normalized comments to OUT_FILE.

    A checkpoint file records which decompressed file is currently being
    processed so that an interrupted run can be resumed; the checkpoint is
    removed once the whole scan completes.
    """
    if not os.path.exists(BASE_DIR):
        raise Exception("BASE_DIR does not exist")
    directory = os.path.abspath(BASE_DIR)
    if not os.path.isdir(directory):
        raise Exception("BASE_DIR must be a directory")
    checkpoint = None
    chk_f = os.path.abspath(CHECKPOINT_FILE)
    if os.path.exists(chk_f) and not os.path.isfile(chk_f):
        raise Exception("CHECKPOINT_FILE must be a file")
    if os.path.exists(chk_f):
        checkpoint = get_checkpoint(chk_f)
    out_f = os.path.abspath(OUT_FILE)
    if os.path.exists(out_f) and not os.path.isfile(out_f):
        raise Exception("OUT_FILE must be a file")
    # Only overwrite previous output when starting fresh (no resume point).
    if checkpoint is None and os.path.exists(out_f):
        print("WARNING: overwriting existing file " + out_f)
        os.remove(out_f)
    print("Scanning base dir " + directory)
    files = get_files(directory, (".bz2", ".xz", ".zst"), True)
    files.sort()
    print("Found " + str(len(files)) + " files")
    if checkpoint is not None:
        print("Resuming from checkpoint")
    total_comments = 0
    for file in files:
        file = unzip(file)
        if checkpoint is None or checkpoint == file:
            print("Scanning " + file)
            total_comments += parse_comments(file, out_f, chk_f)
            # Wipe checkpoint to force scanning of the next file (checkpoint
            # is only used to find the "first" file to scan).
            checkpoint = None
        else:
            print("Skipping " + file)
    # Guard the removal: if no files were found, no checkpoint was written
    # and the unconditional os.remove() would raise FileNotFoundError.
    if os.path.exists(chk_f):
        os.remove(chk_f)
    print("Total comments: " + str(total_comments))
def get_checkpoint(file):
    """Read the resume marker from *file*.

    Returns the first line (whitespace-stripped) when it is non-empty,
    otherwise None.
    """
    with open(file, "rt", encoding="utf8") as in_file:
        marker = in_file.readline().strip()
    return marker if marker else None
def get_files(path, extensions, recursive):
    """Collect paths under *path* whose names end with one of *extensions*.

    Descends into subdirectories when *recursive* is true.  Returns the
    matches in directory-listing order (callers sort the result).
    """
    ret_val = []
    for name in os.listdir(path):
        full = os.path.join(path, name)
        if recursive and os.path.isdir(full):
            ret_val.extend(get_files(full, extensions, recursive))
        # str.endswith accepts a tuple of suffixes, replacing the hand-rolled
        # ext_match loop; elif avoids a redundant second stat() on dirs.
        elif os.path.isfile(full) and full.endswith(tuple(extensions)):
            ret_val.append(full)
    return ret_val
def ext_match(file, extensions):
    """Return True when *file* ends with any suffix in *extensions*."""
    return any(file.endswith(extension) for extension in extensions)
def unzip(file):
    """Decompress *file* alongside itself as "<name>.raw" and return that path.

    Decompression is skipped when the .raw file already exists.  Returns
    None for unrecognized extensions.
    """
    # Substitute on the basename only: the old re.sub over the whole path
    # matched the FIRST dot anywhere, so a dot in a directory name (e.g.
    # "dumps.v2/RC.bz2") truncated the path to "dumps.raw".
    parent, base = os.path.split(file)
    raw_file = os.path.join(parent, re.sub(r"\..*$", ".raw", base))
    if os.path.exists(raw_file) and not os.path.isfile(raw_file):
        # A directory squatting on the target name; clear it out.
        shutil.rmtree(raw_file)
    if not os.path.exists(raw_file):
        if file.endswith(".bz2"):
            unzip_bzip2(file, raw_file)
        elif file.endswith(".xz"):
            unzip_lzma(file, raw_file)
        elif file.endswith(".zst"):
            unzip_zstd(file, raw_file)
        else:
            return None
    return raw_file
def unzip_bzip2(input, output):
    """Stream-decompress the bzip2 file *input* into *output* in 4 MiB chunks."""
    print("Decompressing " + input)
    chunk_size = 1024 * 1024 * 4
    with open(output, "wb") as out_file, bz2.open(input, "rb") as in_file:
        while chunk := in_file.read(chunk_size):
            out_file.write(chunk)
def unzip_lzma(input, output):
    """Stream-decompress the xz/lzma file *input* into *output* in 4 MiB chunks."""
    print("Decompressing " + input)
    chunk_size = 1024 * 1024 * 4
    with open(output, "wb") as out_file, lzma.open(input, "rb") as in_file:
        while chunk := in_file.read(chunk_size):
            out_file.write(chunk)
def unzip_zstd(input, output):
    """Stream-decompress the zstandard file *input* into *output*.

    Chunks of decompressed data are pulled via the zstandard library's
    read_to_iter and written out as they arrive.
    """
    print("Decompressing " + input)
    with open(output, "wb") as out_file, open(input, "rb") as in_file:
        dctx = zstd.ZstdDecompressor()
        for chunk in dctx.read_to_iter(in_file, 1024 * 1024 * 4):
            out_file.write(chunk)
def parse_comments(input, output, checkpoint):
    """Filter and normalize the comments in *input*, appending keepers to *output*.

    Accepted comments are first staged in the *checkpoint* file — whose first
    line records *input*'s path so an interrupted run can resume — and are
    only copied into *output* once the whole input file has been processed.
    Returns the number of comments kept.
    """
    num_comments = 0
    with open(input, "rt", encoding="utf8") as in_file, open(checkpoint, "wt+", encoding="utf8") as checkpoint_file:
        # First line of the checkpoint names the file in progress; flushed
        # immediately so a crash still leaves a usable resume marker.
        checkpoint_file.write(input + "\n")
        checkpoint_file.flush()
        for i, line in enumerate(in_file):
            j = parse_json(line)
            if j == None or not is_included(j, whitelist, blacklist, substring_blacklist):
                continue
            j = normalize_comment(j)
            # Re-check length: normalization may have shrunk the body below
            # the minimum (or it could still exceed the maximum).
            if not check_length(j["body"]):
                continue
            #sys.stdout.buffer.write((j["body"] + "\n").encode("utf8")) # debug print
            checkpoint_file.write(json.dumps(j) + "\n") # json output
            # checkpoint_file.write(j["body"]) # single-line output
            num_comments += 1
            if num_comments % 1000 == 0:
                print(str(num_comments) + " comments so far..")
    print("Fetched " + str(num_comments) + " comments")
    if num_comments > 0:
        print("Flushing checkpoint data..")
        # Copy the staged comments into the real output, skipping the
        # path-marker header line.
        with open(checkpoint, "rt", encoding="utf8") as checkpoint_file, open(output, "at+", encoding="utf8") as out_file:
            checkpoint_file.readline()
            for i, line in enumerate(checkpoint_file):
                out_file.write(line)
    return num_comments
def parse_json(line):
    """Parse one line of a reddit dump; return the dict or None when unusable.

    Lines that are too short or that do not end with '}' (truncated
    records) are skipped, as is anything the JSON decoder rejects.
    """
    line = line.strip()
    if len(line) <= 1:
        return None
    # A complete record ends with '}', possibly followed by one stray char.
    if line[-1] != '}' and line[-2] != '}':
        return None
    try:
        return json.loads(line)
    except json.JSONDecodeError:
        # One corrupt line must not abort the whole multi-gigabyte scan;
        # the caller already treats None as "skip this line".
        return None
def is_included(json, whitelist, blacklist, substring_blacklist):
    """Decide whether a raw comment dict should be kept.

    A comment passes when its body has an acceptable length, its subreddit
    survives the whitelist (if any) and blacklist, and its body contains
    none of the blacklisted substrings.
    """
    body = json["body"]
    if not check_length(body):
        return False
    subreddit = json["subreddit"]
    if whitelist and subreddit not in whitelist:
        return False
    if blacklist and subreddit in blacklist:
        return False
    # Reject bodies containing any blacklisted substring.
    return not any(needle in body for needle in substring_blacklist)
def check_length(body):
    """Return True when *body* is a usable comment length (4-280 characters)."""
    return body is not None and 4 <= len(body) <= 280
def normalize_comment(json):
    """Reduce a raw comment dict to {"id", "parent_id", "body"} with cleaned text.

    Ids are forced into the "t1_" fullname form, parents that are not
    comments (e.g. "t3_" link fullnames) are dropped to None, and the body's
    whitespace and HTML entities are normalized.  Unlike the original, the
    caller's dict is NOT mutated; a fresh dict is returned.
    """
    comment_id = json["id"]
    if not comment_id.startswith("t1_"):
        comment_id = "t1_" + comment_id
    parent_id = json["parent_id"]
    if parent_id is not None and not parent_id.startswith("t1_"):
        # Parent is a link/other fullname, not a comment -> treat as root.
        parent_id = None
    body = json["body"]
    body = body.replace("\r\n", "\n")         # we only want \n
    body = re.sub(r'[ \t]+\n', '\n', body)    # strip whitespace before newline
    body = re.sub(r'[ \t\n]+$', '', body)     # strip trailing whitespace
    body = re.sub(r'\n+', '\n', body)         # collapse blank lines (json output)
    #body = re.sub(r'\n+', ' ', body)         # for single-line output
    body = html.unescape(body)                # normalize HTML entities
    return {"id": comment_id, "parent_id": parent_id, "body": body}
# Guard the entry point so the module can be imported (e.g. for testing)
# without immediately kicking off a full scan.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment