Skip to content

Instantly share code, notes, and snippets.

@MaLiN2223
Created July 18, 2021 05:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save MaLiN2223/c90017290d659340b3682d107fde77f0 to your computer and use it in GitHub Desktop.
Save MaLiN2223/c90017290d659340b3682d107fde77f0 to your computer and use it in GitHub Desktop.
import bz2
import lzma
from typing import Set

from src.reddit_input_processing.zreader import Zreader
def decode_bz2_posts(file_path: str, base_path: str, subreddits: Set[str]):
    """Feed a bz2-compressed submissions archive to ``read_posts``.

    The archive located at ``{base_path}/{file_path}`` is opened for binary
    reading and the decompressing file object is passed to ``read_posts``
    together with ``subreddits``. The archive is closed when ``read_posts``
    returns or raises.
    """
    archive_path = f"{base_path}/{file_path}"
    with bz2.BZ2File(archive_path, mode="rb") as posts_stream:
        read_posts(posts_stream, subreddits)
def decode_xz_posts(file_path: str, base_path: str, subreddits: Set[str]):
    """Feed an xz-compressed submissions archive to ``read_posts``.

    The raw file at ``{base_path}/{file_path}`` is opened in binary mode and
    wrapped in an ``lzma.LZMAFile``; that decompressing object is passed to
    ``read_posts`` together with ``subreddits``. Both handles are closed on
    exit, the LZMA wrapper first.
    """
    archive_path = f"{base_path}/{file_path}"
    with open(archive_path, "rb") as raw_file, lzma.LZMAFile(raw_file) as posts_stream:
        read_posts(posts_stream, subreddits)
def decode_zst_posts(file_path: str, base_path: str, subreddits: Set[str]):
    """Feed a zstd-compressed submissions archive to ``read_posts``.

    A ``Zreader`` is created for ``{base_path}/{file_path}`` with a chunk size
    of 8192, and whatever its ``readlines()`` call returns is passed to
    ``read_posts`` together with ``subreddits``.
    """
    archive_path = f"{base_path}/{file_path}"
    # Keep the reader bound to a local so it stays alive while lines are consumed.
    reader = Zreader(archive_path, chunk_size=8192)
    line_source = reader.readlines()
    read_posts(line_source, subreddits)
def decode_bz2_comments(file_path: str, base_path: str, subreddits: Set[str]):
    """Feed a bz2-compressed comments archive to ``read_comments``.

    The archive located at ``{base_path}/{file_path}`` is opened for binary
    reading and the decompressing file object is passed to ``read_comments``
    together with ``subreddits``. The archive is closed when ``read_comments``
    returns or raises.
    """
    archive_path = f"{base_path}/{file_path}"
    with bz2.BZ2File(archive_path, mode="rb") as comments_stream:
        read_comments(comments_stream, subreddits)
def decode_xz_comments(file_path: str, base_path: str, subreddits: Set[str]):
    """Feed an xz-compressed comments archive to ``read_comments``.

    The raw file at ``{base_path}/{file_path}`` is opened in binary mode and
    wrapped in an ``lzma.LZMAFile``; that decompressing object is passed to
    ``read_comments`` together with ``subreddits``. Both handles are closed on
    exit, the LZMA wrapper first.
    """
    archive_path = f"{base_path}/{file_path}"
    with open(archive_path, "rb") as raw_file, lzma.LZMAFile(raw_file) as comments_stream:
        read_comments(comments_stream, subreddits)
def decode_zst_comments(file_path: str, base_path: str, subreddits: Set[str]):
    """Feed a zstd-compressed comments archive to ``read_comments``.

    A ``Zreader`` is created for ``{base_path}/{file_path}`` with a chunk size
    of 8192, and whatever its ``readlines()`` call returns is passed to
    ``read_comments`` together with ``subreddits``.
    """
    archive_path = f"{base_path}/{file_path}"
    # Keep the reader bound to a local so it stays alive while lines are consumed.
    reader = Zreader(archive_path, chunk_size=8192)
    line_source = reader.readlines()
    read_comments(line_source, subreddits)
def load(file_name: str, base_path: str = "data_in", subreddits: Set[str] = set()):
    """Dispatch an archive to the decoder matching its name.

    The kind of dump is chosen by substring: ``"RS_"`` selects the submission
    decoders and, failing that, ``"RC_"`` selects the comment decoders. The
    compression format is then chosen by the first of ``".bz2"``, ``".xz"`` or
    ``".zst"`` found anywhere in ``file_name``. Names that match no kind or no
    format are reported with a printed message and otherwise ignored.

    Args:
        file_name: Archive name, resolved by the decoders relative to
            ``base_path``.
        base_path: Directory prefix handed to the decoder.
        subreddits: Set handed to the decoder. The default empty set always
            triggers the error below, so callers must supply a non-empty one.

    Raises:
        ValueError: If ``subreddits`` is empty.
    """
    if len(subreddits) == 0:
        raise ValueError("Subreddits are empty")

    is_submission = "RS_" in file_name
    is_comment = not is_submission and "RC_" in file_name
    if not (is_submission or is_comment):
        print("Unrecognized file name", file_name)
        return

    print("Submissions" if is_submission else "Comments", file_name)

    # Decoder names are only looked up inside the branch that needs them.
    if ".bz2" in file_name:
        decoder = decode_bz2_posts if is_submission else decode_bz2_comments
    elif ".xz" in file_name:
        decoder = decode_xz_posts if is_submission else decode_xz_comments
    elif ".zst" in file_name:
        decoder = decode_zst_posts if is_submission else decode_zst_comments
    else:
        print("Unrecognized file name", file_name)
        return

    decoder(file_name, base_path, subreddits)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment