Gists related to the "Downloading data from Reddit using Python" post.
Created
May 4, 2022 08:02
-
-
Save fferegrino/dd3b7aa8d2010a42e2626079b93239c7 to your computer and use it in GitHub Desktop.
Gists related to the "Downloading data from Reddit using Python" post.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Reddit API client, authenticated with the bot account's username and
# password. The secrets come from environment variables so they never live in
# the source; a missing variable raises KeyError right here instead of
# surfacing later as an authentication failure.
reddit = praw.Reddit(
    client_id=os.environ["CLIENT_ID"],
    client_secret=os.environ["CLIENT_SECRET"],
    password=os.environ["PASSWORD"],
    user_agent="Live Thread Scraper by UkraineNewsBot",
    username="UkraineNewsBot",
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Tabulate the collected live threads: one row per submission, holding the
# author followed by the attributes listed in `properties`.
live_threads = pd.DataFrame(submissions, columns=["author"] + properties)

# `created_at` is exported below but is not among the columns built above, so
# selecting it would raise KeyError. Derive it from `created_utc` (seconds
# since the Unix epoch) unless it is already present.
if "created_at" not in live_threads.columns:
    live_threads["created_at"] = pd.to_datetime(
        live_threads["created_utc"], unit="s"
    )

export_columns = [
    "id", "name", "author", "title", "created_utc", "created_at",
    "num_comments", "score", "upvote_ratio", "permalink",
]
# Oldest thread first; the DataFrame index carries no information, so drop it.
live_threads[export_columns].sort_values(
    "created_utc", ascending=True
).to_csv("data/threads.csv", index=False)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def hash_string(content):
    """Return the hexadecimal MD5 digest of a string.

    Args:
        content: Text to hash; it is encoded as UTF-8 before hashing.

    Returns:
        A 32-character lowercase hexadecimal string. Equal inputs always map
        to equal digests, so the result can serve as a stable stand-in key.

    NOTE(review): an unsalted MD5 is a pseudonym, not strong anonymisation;
    confirm that this is sufficient for the data being published.
    """
    return hashlib.md5(content.encode()).hexdigest()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Submission attributes copied into every row, in column order.
properties = [
    "id", "created_utc", "name", "num_comments",
    "permalink", "score", "title", "upvote_ratio",
]


def extract_submission_props(post):
    """Flatten a submission into a list of plain values.

    Args:
        post: Object exposing an ``author`` attribute plus every attribute
            named in ``properties``.

    Returns:
        ``[author_name, *values]`` where ``values`` follows the order of
        ``properties``. ``author_name`` is ``None`` when ``post.author`` is
        falsy, instead of failing with ``AttributeError``.
    """
    author_name = post.author.name if post.author else None
    return [author_name, *(getattr(post, pr) for pr in properties)]
# Keep only the r/worldnews live threads created strictly inside the
# (begin_point, today) window.
# Title prefixes are matched case-insensitively; `str.startswith` accepts a
# tuple, so one call covers every spelling.
live_thread_prefixes = (
    "/r/worldnews live thread",
    "r/worldnews live thread",
    "worldnews live thread",
)
# The window bounds do not change while iterating, so convert them once.
window_start = begin_point.timestamp()
window_end = today.timestamp()

submissions = [
    extract_submission_props(post)
    for post in subs
    if post.title.lower().startswith(live_thread_prefixes)
    and window_start < post.created_utc < window_end
]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Collect the newest submissions (at most 200 each) of every account listed
# in `mods`; they are filtered down to live threads afterwards.
subs = []
for username in mods:
    user = reddit.redditor(name=username)
    subs.extend(user.submissions.new(limit=200))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Reddit usernames whose submissions are scanned for live threads.
# Placeholder: fill in the account names before running.
mods = [
    # list of mods
]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Comment attributes copied into every row, in column order.
comment_props = [
    "id", "body", "edited",
    "created_utc", "link_id",
    "parent_id", "distinguished",
    "depth", "ups", "downs", "score",
    "total_awards_received", "gilded",
]


def extract_comment(comment, submission_id):
    """Flatten a comment into a list of plain values.

    Args:
        comment: Object exposing ``author``, ``gildings`` and every attribute
            named in ``comment_props``.
        submission_id: Identifier of the submission the comment belongs to;
            stored as-is in the second position of the row.

    Returns:
        ``[author, submission_id, *values, gildings]`` where ``author`` is the
        hash of the author's name (``None`` when the comment has no author),
        ``values`` follows the order of ``comment_props``, and ``gildings`` is
        the ``str()`` of ``comment.gildings`` (``None`` when it is empty).
    """
    # Only the hash of the author's name is stored, never the name itself.
    author_hash = hash_string(comment.author.name) if comment.author else None

    row = [author_hash, submission_id]
    row.extend(getattr(comment, prop) for prop in comment_props)
    # Stringified so the value fits in a single CSV cell.
    row.append(str(comment.gildings) if comment.gildings else None)
    return row
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Download the comments of every live thread into one CSV file per thread.
comment_columns = ["author", "submission_id"] + comment_props + ["gildings"]

# `to_csv` does not create missing folders, so make sure the target exists.
os.makedirs("data/comments", exist_ok=True)

for submission_id in live_threads["id"]:
    file_name = f"data/comments/comments__{submission_id}.csv"
    # Already written by a previous run: skip it so the script can be resumed
    # without fetching the same thread again.
    if os.path.exists(file_name):
        continue

    submission = reddit.submission(id=submission_id)
    # Resolve at most one "more comments" placeholder before flattening the
    # comment tree. NOTE(review): very large threads may therefore be saved
    # only partially — confirm that this cap is intentional.
    submission.comments.replace_more(limit=1)

    comments = [
        extract_comment(comment, submission_id)
        for comment in submission.comments.list()
    ]
    frame = pd.DataFrame(comments, columns=comment_columns)
    frame.to_csv(file_name, index=False)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from datetime import timezone

# Bounds of the collection window. Both are timezone-aware UTC datetimes:
# `.timestamp()` treats a naive datetime as local time, which would shift the
# window on any machine whose clock is not set to UTC.
begin_point = datetime(2022, 2, 1, tzinfo=timezone.utc)

# Despite its name this is noon (UTC) of the previous day: today's midnight
# minus twelve hours.
today = datetime.now(timezone.utc).replace(
    hour=0, minute=0, second=0, microsecond=0
) - timedelta(hours=12)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment