Skip to content

Instantly share code, notes, and snippets.

@fferegrino
Created May 4, 2022 08:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save fferegrino/dd3b7aa8d2010a42e2626079b93239c7 to your computer and use it in GitHub Desktop.
Save fferegrino/dd3b7aa8d2010a42e2626079b93239c7 to your computer and use it in GitHub Desktop.
Gists related to the "Downloading data from Reddit using Python" post.
# Build the authenticated Reddit client. The three secrets are read from
# environment variables; a missing variable raises KeyError right here.
reddit_credentials = {
    "client_id": os.environ["CLIENT_ID"],
    "client_secret": os.environ["CLIENT_SECRET"],
    "password": os.environ["PASSWORD"],
    "user_agent": "Live Thread Scraper by UkraineNewsBot",
    "username": "UkraineNewsBot",
}
reddit = praw.Reddit(**reddit_credentials)
# One row per collected submission: the author followed by every attribute
# named in `properties`, in that order.
live_threads = pd.DataFrame(submissions, columns=["author"] + properties)

# `created_at` is exported below but is not one of the scraped columns, so
# selecting it would raise KeyError. Derive it from the epoch-seconds
# `created_utc` value (recomputing is harmless if it already exists).
live_threads["created_at"] = pd.to_datetime(live_threads["created_utc"], unit="s")

# Columns written to disk, in output order.
thread_columns = [
    "id",
    "name",
    "author",
    "title",
    "created_utc",
    "created_at",
    "num_comments",
    "score",
    "upvote_ratio",
    "permalink",
]

# Oldest thread first; the index is dropped from the CSV.
live_threads[thread_columns].sort_values(
    "created_utc", ascending=True
).to_csv("data/threads.csv", index=False)

Gists related to the "Downloading data from Reddit using Python" post.

def hash_string(content: str) -> str:
    """Return the hexadecimal MD5 digest of *content*.

    The text is UTF-8 encoded before hashing. Elsewhere in this file the
    digest is stored in place of a comment author's username.

    NOTE(review): the digest is unsalted, so identical inputs always map to
    the same output -- confirm that is acceptable for the intended use.
    """
    return hashlib.md5(content.encode("utf-8")).hexdigest()
# Submission attributes copied, in this order, into every extracted row.
properties = [
    "id", "created_utc", "name", "num_comments",
    "permalink", "score", "title", "upvote_ratio",
]


def extract_submission_props(post):
    """Flatten a submission into a list of plain values.

    Args:
        post: Object exposing ``author`` (either ``None`` or an object with
            a ``name`` attribute) plus every attribute listed in
            ``properties``.

    Returns:
        A list whose first item is the author's name -- or ``None`` when the
        submission has no author -- followed by the ``properties`` values in
        declaration order.
    """
    # A submission without an author would otherwise fail on `.name`;
    # extract_comment guards the same case, so mirror it here.
    author_name = post.author.name if post.author is not None else None
    return [author_name, *(getattr(post, pr) for pr in properties)]
# Lower-cased title prefixes that identify a live thread.
live_thread_prefixes = (
    "/r/worldnews live thread",
    "r/worldnews live thread",
    "worldnews live thread",
)

# The time window does not change while filtering, so convert it once.
window_start = begin_point.timestamp()
window_end = today.timestamp()

# Keep only live threads created strictly inside the window, flattened to
# plain rows by extract_submission_props.
submissions = []
for post in subs:
    if not post.title.lower().startswith(live_thread_prefixes):
        continue
    if window_start < post.created_utc < window_end:
        submissions.append(extract_submission_props(post))
# Collect submissions from every account listed in `mods`, requesting at
# most 200 per account from that account's "new" listing.
subs = []
for username in mods:
    user = reddit.redditor(name=username)
    subs.extend(user.submissions.new(limit=200))
# Usernames whose submissions are scanned (presumably the subreddit's
# moderators -- the actual list is intentionally left out here).
mods = []
# Comment attributes copied, in this order, into every extracted row.
comment_props = [
    "id", "body", "edited",
    "created_utc", "link_id",
    "parent_id", "distinguished",
    "depth", "ups", "downs", "score",
    "total_awards_received", "gilded",
]


def extract_comment(comment, submission_id):
    """Flatten a comment into a list of plain values.

    Args:
        comment: Object exposing ``author`` (falsy, or an object with a
            ``name`` attribute), ``gildings`` and every attribute listed in
            ``comment_props``.
        submission_id: Identifier of the submission the comment belongs to;
            stored unchanged as the second item of the row.

    Returns:
        ``[author_hash, submission_id, *comment_props values, gildings]``
        where ``author_hash`` is ``hash_string`` of the author's name (or
        ``None`` when there is no author) and ``gildings`` is
        ``str(comment.gildings)`` (or ``None`` when that value is falsy).
    """
    # The username is never stored in clear; only its hash goes in the row.
    author_hash = hash_string(comment.author.name) if comment.author else None

    row = [author_hash, submission_id]
    row.extend(getattr(comment, prop) for prop in comment_props)
    # Stringified so the value fits in a single CSV cell.
    row.append(str(comment.gildings) if comment.gildings else None)
    return row
# Column layout of each per-thread CSV; matches the rows built by
# extract_comment.
comment_columns = ["author", "submission_id"] + comment_props + ["gildings"]

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs("data/comments", exist_ok=True)

for submission_id in live_threads["id"]:
    file_name = f"data/comments/comments__{submission_id}.csv"
    # A thread whose file already exists is skipped, so an interrupted run
    # can be resumed without downloading it again.
    if os.path.exists(file_name):
        continue

    submission = reddit.submission(id=submission_id)
    # NOTE(review): limit=1 bounds how many "more comments" placeholders are
    # expanded, so very large threads are likely only partially captured --
    # confirm this trade-off is intended.
    submission.comments.replace_more(limit=1)

    comments = [
        extract_comment(comment, submission_id)
        for comment in submission.comments.list()
    ]
    frame = pd.DataFrame(comments, columns=comment_columns)
    frame.to_csv(file_name, index=False)
from datetime import timezone

# Both bounds are later converted with `.timestamp()` and compared against
# Reddit's `created_utc` epoch values. A naive datetime is interpreted as
# *local* time by `.timestamp()`, which would shift the window by the
# machine's UTC offset, so both bounds are made explicitly UTC-aware.

# Start of the collection window: 2022-02-01 00:00 UTC.
begin_point = datetime(2022, 2, 1, tzinfo=timezone.utc)

# End of the collection window: midnight UTC of the current day minus twelve
# hours, i.e. 12:00 UTC of the previous day.
today = datetime.now(timezone.utc).replace(
    hour=0, minute=0, second=0, microsecond=0
) - timedelta(hours=12)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment