Skip to content

Instantly share code, notes, and snippets.

@FeryET
Created August 17, 2021 07:56
Show Gist options
  • Save FeryET/80a19dbd691691f8398464665e5a23d9 to your computer and use it in GitHub Desktop.
Save FeryET/80a19dbd691691f8398464665e5a23d9 to your computer and use it in GitHub Desktop.
Download /r/CryptoCurrency Comments
import os
import shutil
import gzip
from pmaw import PushshiftAPI
import datetime as dt
from tqdm import tqdm
import pandas as pd
from pathlib import Path
# Create a fresh, empty data folder for the downloaded parts.
# WARNING: anything already stored under data_path is deleted first, so
# every run starts from a clean folder.
data_path = "INSERT/PATH/TO/DATAFOLDER"
if os.path.exists(data_path):
    shutil.rmtree(data_path)
# makedirs (unlike mkdir) also creates missing parent directories, which a
# nested path such as the placeholder above requires.
os.makedirs(data_path)
# Create the Pushshift client (PMAW wrapper around the Pushshift API).
api = PushshiftAPI(
    file_checkpoint=5,
    limit_type="backoff",
    jitter="full",
    batch_size=10,
)

# Retrieved comments are buffered in `cache`; `max_cache_size` is the number
# of buffered comments that triggers writing one output part.
cache = []
max_cache_size = 100_000

# Time window of the download, as integer epoch seconds.
# NOTE(review): these are naive datetimes, so .timestamp() interprets them in
# the local timezone of the machine running the script -- confirm whether UTC
# boundaries are wanted.
start_epoch = int(dt.datetime(2021, 7, 14).timestamp())
end_epoch = int(dt.datetime(2021, 8, 14).timestamp())

# The window must be passed explicitly: without after/before the query is not
# restricted to the dates above and only `limit` bounds the result.
posts = api.search_comments(
    q="",
    limit=1_000_000,
    subreddit="cryptocurrency",
    after=start_epoch,
    before=end_epoch,
    mem_safe=True,
    safe_exit=True,
)
print(f'{len(posts)} posts retrieved from Pushshift')
def _write_part(records, part_id):
    """Write one batch of comments to ``part_<part_id>.json.gz`` in data_path.

    ``records`` is the list of buffered comments; it is converted to a
    DataFrame and serialised as JSON with floats rounded to 5 decimals.
    The ``.gz`` suffix makes pandas gzip-compress the file (its default
    ``compression="infer"`` looks at the file extension).
    """
    frame = pd.DataFrame(records)
    frame.to_json(
        os.path.join(data_path, f"part_{part_id}.json.gz"),
        double_precision=5,
    )


# Stream the comments into the buffer and write a part every time the buffer
# reaches max_cache_size, so memory use stays bounded.
part_id = 0
for p in tqdm(posts):
    cache.append(p)
    if len(cache) >= max_cache_size:
        _write_part(cache, part_id)
        part_id += 1
        # Rebind to a new list so the written batch can be garbage-collected.
        cache = []

# Write whatever is left: the last batch is usually smaller than
# max_cache_size and would otherwise be silently dropped.
if cache:
    _write_part(cache, part_id)
    cache = []
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment