Last active
October 30, 2019 00:40
-
-
Save egg82/33e1ac4e8353fce34c833e3736fc678b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import os | |
import bz2 | |
import lzma | |
import zstandard as zstd | |
import json | |
import sys | |
import html | |
import shutil | |
# --- Configuration -------------------------------------------------------
# Directory containing the compressed reddit comment dumps (.bz2/.xz/.zst).
BASE_DIR = "path/to/reddit_data"
# Output file accumulating the filtered, normalized comments (JSON lines).
OUT_FILE = "output.json"
# Records the path of the file currently being processed so an interrupted
# run can resume from the same file.
CHECKPOINT_FILE = "checkpoint.dat"

# If non-empty, ONLY comments from these subreddits are kept.
whitelist = []

# Comments from these subreddits are always discarded (sports, meta,
# trading, and roleplay communities that are low-signal for this dataset).
# Fix: the original list contained "rwbyRP" twice; the duplicate entry was
# removed (membership tests are unaffected).
blacklist = [
    "sports",
    "hockey",
    "nba",
    "NBA2k",
    "nfl",
    "NFTL",
    "soccer",
    "CFB",
    "FIFA",
    "FifaCareers",
    "BostonBruins",
    "DetroitRedWings",
    "rangers",
    "NHLHUT",
    "WWE",
    "MaddenUltimateTeam",
    "NYKnicks",
    "CollegeBasketball",
    "NOLAPelicans",
    "BGASL",
    "sixers",
    "announcements",
    "blog",
    "counting",
    "The_Donald",
    "IncelTears",
    "askanincel",
    "Incelselfies",
    "IncelsWithoutHate",
    "Incels",
    "pokemon",
    "CasualPokemonTrades",
    "Pokemongiveaway",
    "pokemontrades",
    "Pokemonexchange",
    "PokeMoonSun",
    "BankBallExchange",
    "pokemonduel",
    "SVExchange",
    "ClubNintendoTrade",
    "PokemonQRCodes",
    "ACTrade",
    "RocketLeagueExchange",
    "rocket_league_trading",
    "YamakuHighSchool",
    "XMenRP",
    "CampArcadia",
    "MonarchyOfEquestria",
    "rwbyRP",
    "TTPloreplaycentral",
    "TheDescendantsOfRome",
    "CampHalfBloodRP",
    "RWBY",
    "EroticRolePlay",
    "PercyJacksonRP",
    "PotterPlayRP",
    "HogwartsRP",
    "ALORP",
    "SupersRP",
    "dcrp",
    "BloodGulchRP",
    "IronThronePowers",
    "MassEffectPhoenix",
    "MigrantFleet",
    "TheInnBetween",
    "AntiHeroReborn",
    "AuraRP",
    "CrimsonShoresRP",
    "DarkPantheon",
    "darkestdungeonrp",
    "Devilrp",
    "Fairy_TailRP",
    "FTRP",
    "HeroesAcademyReborn",
    "HonorHillRP",
    "TheNarutoWorld",
    "randomsuperpowers",
    "RidersOfBerk",
    "SalvaticaRP",
    "SuperWorldRP",
    "TheKalenSeries",
    "GreekMythRP",
    "BullworthRP",
    "InfamousSecondRP",
    "vegasquadrantrp"
]

# A comment containing ANY of these substrings is discarded: links,
# markdown link syntax, reddit self-references, vote talk, and roleplay
# out-of-character markers.
substring_blacklist = [
    "[",
    "http://",
    "https://",
    " r/",
    " u/",
    "/r/",
    "/u/",
    "reddit",
    "Reddit",
    "upvot",
    "Upvot",
    "downvot",
    "Downvot",
    "OOC:"
]
def main():
    """Scan BASE_DIR for compressed reddit dumps, filter and normalize the
    comments, and append them to OUT_FILE.

    If CHECKPOINT_FILE exists, it names the file a previous (interrupted)
    run was processing; scanning resumes from that file onward.

    Raises:
        Exception: if BASE_DIR is missing or not a directory, or if the
            checkpoint/output paths exist but are not regular files.
    """
    if not os.path.exists(BASE_DIR):
        raise Exception("BASE_DIR does not exist")
    directory = os.path.abspath(BASE_DIR)
    if not os.path.isdir(directory):
        raise Exception("BASE_DIR must be a directory")

    checkpoint = None
    chk_f = os.path.abspath(CHECKPOINT_FILE)
    if os.path.exists(chk_f) and not os.path.isfile(chk_f):
        raise Exception("CHECKPOINT_FILE must be a file")
    if os.path.exists(chk_f):
        checkpoint = get_checkpoint(chk_f)

    out_f = os.path.abspath(OUT_FILE)
    if os.path.exists(out_f) and not os.path.isfile(out_f):
        raise Exception("OUT_FILE must be a file")
    # Only start a fresh output file when NOT resuming from a checkpoint.
    if checkpoint is None and os.path.exists(out_f):
        print("WARNING: overwriting existing file " + out_f)
        os.remove(out_f)

    print("Scanning base dir " + directory)
    files = get_files(directory, (".bz2", ".xz", ".zst"), True)
    files.sort()
    print("Found " + str(len(files)) + " files")
    if checkpoint is not None:
        print("Resuming from checkpoint")

    total_comments = 0
    for file in files:
        file = unzip(file)
        if checkpoint is None or checkpoint == file:
            print("Scanning " + file)
            total_comments += parse_comments(file, out_f, chk_f)
            # The checkpoint only marks the FIRST file to (re)process;
            # once reached, every subsequent file is scanned normally.
            checkpoint = None
        else:
            print("Skipping " + file)

    # Fix: guard the removal -- the original raised FileNotFoundError when
    # no files were processed and the checkpoint file was never created.
    if os.path.exists(chk_f):
        os.remove(chk_f)
    print("Total comments: " + str(total_comments))
def get_checkpoint(file):
    """Read a saved checkpoint.

    Returns the stripped first line of *file* (the path of the dump a
    previous run was processing), or None when that line is empty.
    """
    with open(file, "rt", encoding="utf8") as in_file:
        first_line = in_file.readline().strip()
    return first_line if first_line else None
def get_files(path, extensions, recursive):
    """Collect files under *path* whose names end with one of *extensions*.

    Descends into subdirectories when *recursive* is truthy; directories
    themselves are never returned.
    """
    suffixes = tuple(extensions)
    matches = []
    for entry in os.listdir(path):
        full_path = os.path.join(path, entry)
        if os.path.isdir(full_path):
            if recursive:
                matches.extend(get_files(full_path, extensions, recursive))
        elif os.path.isfile(full_path) and full_path.endswith(suffixes):
            matches.append(full_path)
    return matches
def ext_match(file, extensions):
    """Return True when *file* ends with any of the given *extensions*.

    Idiom fix: str.endswith accepts a tuple of suffixes, replacing the
    original manual loop in one C-level call.
    """
    return file.endswith(tuple(extensions))
def unzip(file):
    """Decompress *file* to a sibling ".raw" file and return the .raw path.

    Skips decompression when the .raw file already exists. Returns None
    when the extension is unrecognized and no .raw file exists.

    Bug fix: the original used re.sub(r"\\..*$", ".raw", file), which
    replaces everything from the FIRST dot anywhere in the path -- a
    directory or file name containing a dot (e.g. "data.v2/RC_2019-01.bz2")
    produced a bogus path. os.path.splitext strips only the final extension.
    """
    raw_file = os.path.splitext(file)[0] + ".raw"
    if os.path.exists(raw_file) and not os.path.isfile(raw_file):
        # A directory is squatting on the target name; clear it out.
        shutil.rmtree(raw_file)
    if not os.path.exists(raw_file):
        if file.endswith(".bz2"):
            unzip_bzip2(file, raw_file)
        elif file.endswith(".xz"):
            unzip_lzma(file, raw_file)
        elif file.endswith(".zst"):
            unzip_zstd(file, raw_file)
        else:
            return None
    return raw_file
def unzip_bzip2(input, output):
    """Stream-decompress the bzip2 archive *input* into the file *output*."""
    print("Decompressing " + input)
    chunk_size = 1024 * 1024 * 4  # 4 MiB reads keep memory use bounded
    with open(output, "wb") as dst, bz2.open(input, "rb") as src:
        while True:
            chunk = src.read(chunk_size)
            if not chunk:
                break
            dst.write(chunk)
def unzip_lzma(input, output):
    """Stream-decompress the xz/lzma archive *input* into the file *output*."""
    print("Decompressing " + input)
    chunk_size = 1024 * 1024 * 4  # 4 MiB reads keep memory use bounded
    with open(output, "wb") as dst, lzma.open(input, "rb") as src:
        while True:
            chunk = src.read(chunk_size)
            if not chunk:
                break
            dst.write(chunk)
def unzip_zstd(input, output):
    """Stream-decompress the zstd archive *input* into the file *output*.

    Relies on the third-party ``zstandard`` module imported at file level.
    """
    print("Decompressing " + input)
    with open(output, "wb") as dst, open(input, "rb") as src:
        dctx = zstd.ZstdDecompressor()
        # read_to_iter yields decompressed chunks, reading 4 MiB at a time.
        for chunk in dctx.read_to_iter(src, 1024 * 1024 * 4):
            dst.write(chunk)
def parse_comments(input, output, checkpoint):
    # Filter and normalize every comment in the decompressed dump `input`,
    # appending the survivors to `output` as JSON lines. Returns the number
    # of comments written.
    #
    # The `checkpoint` file does double duty:
    #   line 1   - the path of the file being processed (resume marker),
    #   lines 2+ - a staging buffer holding this file's accepted comments.
    # Results are only appended to `output` AFTER the whole file has been
    # processed, so a crash mid-file never leaves partial data in `output`.
    num_comments = 0
    with open(input, "rt", encoding="utf8") as in_file, open(checkpoint, "wt+", encoding="utf8") as checkpoint_file:
        # First line records which dump this checkpoint belongs to; flushed
        # immediately so a crash right after still allows resuming.
        checkpoint_file.write(input + "\n")
        checkpoint_file.flush()
        for i, line in enumerate(in_file):
            j = parse_json(line)
            # Skip non-JSON lines and comments rejected by the subreddit /
            # substring filters.
            if j == None or not is_included(j, whitelist, blacklist, substring_blacklist):
                continue
            j = normalize_comment(j)
            # Re-check length: normalization may have shrunk the body below
            # the minimum (e.g. by collapsing whitespace).
            if not check_length(j["body"]):
                continue
            #sys.stdout.buffer.write((j["body"] + "\n").encode("utf8")) # debug print
            checkpoint_file.write(json.dumps(j) + "\n") # json output
            # checkpoint_file.write(j["body"]) # single-line output
            num_comments += 1
            if num_comments % 1000 == 0:
                print(str(num_comments) + " comments so far..")
    print("Fetched " + str(num_comments) + " comments")
    if num_comments > 0:
        print("Flushing checkpoint data..")
        # Copy the staged comments (everything after the marker line) from
        # the checkpoint file into the real output file.
        with open(checkpoint, "rt", encoding="utf8") as checkpoint_file, open(output, "at+", encoding="utf8") as out_file:
            checkpoint_file.readline()  # skip the resume-marker line
            for i, line in enumerate(checkpoint_file):
                out_file.write(line)
    return num_comments
def parse_json(line):
    """Parse one dump line into a dict, or return None when the line is
    clearly not a JSON object or fails to parse.

    Fix: json.loads used to propagate on a malformed line, aborting the
    entire multi-gigabyte scan; a bad line now just yields None, matching
    this function's existing contract for non-JSON lines.
    """
    line = line.strip()
    if len(line) <= 1:
        return None
    # Cheap pre-check: a JSON object must end (or nearly end) with '}'.
    # The second-to-last character is also accepted, tolerating one stray
    # trailing character.
    if line[-1] != '}' and line[-2] != '}':
        return None
    try:
        return json.loads(line)
    except ValueError:  # json.JSONDecodeError subclasses ValueError
        return None
def is_included(json, whitelist, blacklist, substring_blacklist):
    """Decide whether the raw comment dict *json* should be kept.

    A comment passes when its body is 4..280 characters, its subreddit is
    allowed (whitelist empty = allow all, and not blacklisted), and the
    body contains none of the blacklisted substrings.
    """
    body = json["body"]
    # Length gate (inlined): reject missing bodies and anything outside
    # the 4..280 character window.
    if body is None or not (4 <= len(body) <= 280):
        return False
    subreddit = json["subreddit"]
    if whitelist and subreddit not in whitelist:
        return False
    if subreddit in blacklist:
        return False
    # Reject the comment if any forbidden fragment occurs in the body.
    return not any(fragment in body for fragment in substring_blacklist)
def check_length(body):
    """Return True when *body* is a non-None string of 4..280 characters
    (tweet-sized: filters out trivial and wall-of-text comments).

    Idiom fix: replaces `return False if ... else True` with a direct
    chained comparison.
    """
    return body is not None and 4 <= len(body) <= 280
def normalize_comment(json):
    """Return a minimal {"id", "parent_id", "body"} dict for a raw comment.

    - Ensures the id carries the "t1_" (comment) prefix.
    - Drops parent ids that are not themselves comments (no "t1_" prefix,
      e.g. "t3_" submissions) by setting parent_id to None.
    - Normalizes the body's whitespace and decodes HTML entities.

    Fix: the original mutated the caller's dict in place as a hidden side
    effect; this version is pure and leaves the input untouched.
    NOTE: the parameter shadows the imported ``json`` module; the module is
    not used here, and the name is kept for interface compatibility.
    """
    comment_id = json["id"]
    if not comment_id.startswith("t1_"):
        comment_id = "t1_" + comment_id
    parent_id = json["parent_id"]
    if parent_id is not None and not parent_id.startswith("t1_"):
        parent_id = None
    body = json["body"]
    body = re.sub(r'\r\n', '\n', body)      # We only want \n
    body = re.sub(r'[ \t]+\n', '\n', body)  # Strip whitespace before newline (keep whitespace after newline)
    body = re.sub(r'[ \t\n]+$', '', body)   # Strip trailing whitespace (keep leading whitespace)
    body = re.sub(r'\n+', '\n', body)       # Collapse blank lines (for json output)
    #body = re.sub(r'\n+', ' ', body)       # For single-line output
    body = html.unescape(body)              # Decode HTML entities (&amp; etc.)
    return {"id": comment_id, "parent_id": parent_id, "body": body}
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment