fferegrino/client_creation.py

## client_creation.py
# Authenticated Reddit client for the bot account. The three secrets are read
# from the environment, so a missing variable raises KeyError at start-up.
reddit_settings = {
    "client_id": os.environ["CLIENT_ID"],
    "client_secret": os.environ["CLIENT_SECRET"],
    "password": os.environ["PASSWORD"],
    "user_agent": "Live Thread Scraper by UkraineNewsBot",
    "username": "UkraineNewsBot",
}
reddit = praw.Reddit(**reddit_settings)

## convert_into_dataframe.py
# Tabulate the collected submissions: one row per thread, with the author
# first, followed by the attributes named in `properties`.
live_threads = pd.DataFrame(submissions, columns=["author"] + properties)

# `created_at` is exported below but is not one of the collected attributes,
# so selecting it raised KeyError. Derive it from `created_utc`, which is
# compared against epoch timestamps elsewhere and is therefore epoch seconds.
live_threads["created_at"] = pd.to_datetime(live_threads["created_utc"], unit="s")

# Columns written to disk, in output order.
thread_columns = [
    "id",
    "name",
    "author",
    "title",
    "created_utc",
    "created_at",
    "num_comments",
    "score",
    "upvote_ratio",
    "permalink",
]

# Oldest thread first.
live_threads[thread_columns].sort_values("created_utc", ascending=True).to_csv(
    "data/threads.csv", index=False
)

## downloading_data_from_reddit_using_python.md

      
Gists related to the "Downloading data from Reddit using Python" post.

  
## hash_string.py
def hash_string(content):
    """Return the hexadecimal MD5 digest of the text ``content``.

    The text is converted to bytes with ``str.encode()`` (UTF-8 by default)
    before it is hashed.
    """
    digest = hashlib.md5()
    digest.update(content.encode())
    return digest.hexdigest()

## iterating_over_submissions.py
# Submission attributes copied verbatim into each row, in column order.
properties = [
    "id",
    "created_utc",
    "name",
    "num_comments",
    "permalink",
    "score",
    "title",
    "upvote_ratio",
]


def extract_submission_props(post):
    """Flatten a submission into a row: author name, then each of ``properties``.

    Args:
        post: Object exposing ``author`` and every attribute in ``properties``.

    Returns:
        A list whose first item is ``post.author.name`` and whose remaining
        items are the attribute values in ``properties`` order. The first
        item is ``None`` when ``post.author`` is ``None``.
    """
    # A missing author used to raise AttributeError on `.name`; mirror the
    # None handling that extract_comment applies to comments.
    author = post.author
    author_name = author.name if author is not None else None
    return [author_name, *(getattr(post, pr) for pr in properties)]
# Keep only the posts whose lower-cased title starts with one of the live
# thread prefixes and whose creation time lies strictly between begin_point
# and today.
live_thread_prefixes = (
    "/r/worldnews live thread",
    "r/worldnews live thread",
    "worldnews live thread",
)
submissions = []
for post in subs:
    title_low = post.title.lower()
    if not title_low.startswith(live_thread_prefixes):
        continue
    if begin_point.timestamp() < post.created_utc < today.timestamp():
        submissions.append(extract_submission_props(post))

## iterating_over_users.py
# Gather the newest submissions (at most 200 per account) of every username
# listed in `mods` into one flat list.
subs = []
for username in mods:
    newest_posts = reddit.redditor(name=username).submissions.new(limit=200)
    subs.extend(newest_posts)

## mods.py
# Usernames whose submissions are fetched; each entry is passed to
# `reddit.redditor(name=...)`. Left empty here as a placeholder.
mods = [
    # list of mods
]

## processing_comments.py
# Comment attributes copied verbatim into each row, in column order.
comment_props = [
    "id",
    "body",
    "edited",
    "created_utc",
    "link_id",
    "parent_id",
    "distinguished",
    "depth",
    "ups",
    "downs",
    "score",
    "total_awards_received",
    "gilded",
]


def extract_comment(comment, submission_id):
    """Flatten a comment into a single row.

    Args:
        comment: Object exposing ``author``, ``gildings`` and every attribute
            named in ``comment_props``.
        submission_id: Identifier stored in the row's second position.

    Returns:
        ``[hashed author name or None, submission_id, *comment_props values,
        str(gildings) or None]``. The author name is never stored in clear
        text; it goes through ``hash_string``.
    """
    author = comment.author
    hashed_author = hash_string(author.name) if author else None
    values = [getattr(comment, prop) for prop in comment_props]
    awards = comment.gildings
    gildings = str(awards) if awards else None
    return [hashed_author, submission_id, *values, gildings]

## saving_each_thread.py
# Write the comments of every thread to its own CSV file. Threads whose file
# already exists are skipped, so a rerun only downloads what is missing.
for submission_id in live_threads["id"]:
    file_name = f"data/comments/comments__{submission_id}.csv"
    if not os.path.exists(file_name):
        submission = reddit.submission(id=submission_id)
        # limit=1: presumably expands at most one "more comments" placeholder
        # before the tree is flattened (confirm against the PRAW docs).
        submission.comments.replace_more(limit=1)
        comments = [
            extract_comment(comment, submission_id)
            for comment in submission.comments.list()
        ]
        frame = pd.DataFrame(
            comments,
            columns=["author", "submission_id", *comment_props, "gildings"],
        )
        frame.to_csv(file_name, index=False)

## times.py
from datetime import timezone

# Bounds of the collection window, as timezone-aware UTC datetimes. They are
# converted with `.timestamp()` and compared with `created_utc`; on a naive
# datetime `.timestamp()` assumes local time, which shifted both bounds by
# the host's UTC offset on any machine not running in UTC.
begin_point = datetime(2022, 2, 1, tzinfo=timezone.utc)
# Upper bound: the most recent UTC midnight minus 12 hours.
today = datetime.now(timezone.utc).replace(
    hour=0, minute=0, second=0, microsecond=0
) - timedelta(hours=12)
	# Authenticated Reddit client for the bot account. The three secrets are read
	# from the environment, so a missing variable raises KeyError at start-up.
	reddit_settings = {
	    "client_id": os.environ["CLIENT_ID"],
	    "client_secret": os.environ["CLIENT_SECRET"],
	    "password": os.environ["PASSWORD"],
	    "user_agent": "Live Thread Scraper by UkraineNewsBot",
	    "username": "UkraineNewsBot",
	}
	reddit = praw.Reddit(**reddit_settings)
	# Tabulate the collected submissions: one row per thread, with the author
	# first, followed by the attributes named in `properties`.
	live_threads = pd.DataFrame(submissions, columns=["author"] + properties)

	# `created_at` is exported below but is not one of the collected attributes,
	# so selecting it raised KeyError. Derive it from `created_utc`, which is
	# compared against epoch timestamps elsewhere and is therefore epoch seconds.
	live_threads["created_at"] = pd.to_datetime(live_threads["created_utc"], unit="s")

	# Columns written to disk, in output order.
	thread_columns = [
	    "id",
	    "name",
	    "author",
	    "title",
	    "created_utc",
	    "created_at",
	    "num_comments",
	    "score",
	    "upvote_ratio",
	    "permalink",
	]

	# Oldest thread first.
	live_threads[thread_columns].sort_values("created_utc", ascending=True).to_csv(
	    "data/threads.csv", index=False
	)
	def hash_string(content):
	    """Return the hexadecimal MD5 digest of the text ``content``.

	    The text is converted to bytes with ``str.encode()`` (UTF-8 by default)
	    before it is hashed.
	    """
	    digest = hashlib.md5()
	    digest.update(content.encode())
	    return digest.hexdigest()
	# Submission attributes copied verbatim into each row, in column order.
	properties = [
	    "id",
	    "created_utc",
	    "name",
	    "num_comments",
	    "permalink",
	    "score",
	    "title",
	    "upvote_ratio",
	]

	def extract_submission_props(post):
	    """Flatten a submission into a row: author name, then each of ``properties``.

	    Args:
	        post: Object exposing ``author`` and every attribute in ``properties``.

	    Returns:
	        A list whose first item is ``post.author.name`` and whose remaining
	        items are the attribute values in ``properties`` order. The first
	        item is ``None`` when ``post.author`` is ``None``.
	    """
	    # A missing author used to raise AttributeError on `.name`; mirror the
	    # None handling that extract_comment applies to comments.
	    author = post.author
	    author_name = author.name if author is not None else None
	    return [author_name, *(getattr(post, pr) for pr in properties)]
	# Keep only the posts whose lower-cased title starts with one of the live
	# thread prefixes and whose creation time lies strictly between begin_point
	# and today.
	live_thread_prefixes = (
	    "/r/worldnews live thread",
	    "r/worldnews live thread",
	    "worldnews live thread",
	)
	submissions = []
	for post in subs:
	    title_low = post.title.lower()
	    if not title_low.startswith(live_thread_prefixes):
	        continue
	    if begin_point.timestamp() < post.created_utc < today.timestamp():
	        submissions.append(extract_submission_props(post))
	# Gather the newest submissions (at most 200 per account) of every username
	# listed in `mods` into one flat list.
	subs = []
	for username in mods:
	    newest_posts = reddit.redditor(name=username).submissions.new(limit=200)
	    subs.extend(newest_posts)
	# Comment attributes copied verbatim into each row, in column order.
	comment_props = [
	    "id",
	    "body",
	    "edited",
	    "created_utc",
	    "link_id",
	    "parent_id",
	    "distinguished",
	    "depth",
	    "ups",
	    "downs",
	    "score",
	    "total_awards_received",
	    "gilded",
	]

	def extract_comment(comment, submission_id):
	    """Flatten a comment into a single row.

	    Args:
	        comment: Object exposing ``author``, ``gildings`` and every attribute
	            named in ``comment_props``.
	        submission_id: Identifier stored in the row's second position.

	    Returns:
	        ``[hashed author name or None, submission_id, *comment_props values,
	        str(gildings) or None]``. The author name is never stored in clear
	        text; it goes through ``hash_string``.
	    """
	    author = comment.author
	    hashed_author = hash_string(author.name) if author else None
	    values = [getattr(comment, prop) for prop in comment_props]
	    awards = comment.gildings
	    gildings = str(awards) if awards else None
	    return [hashed_author, submission_id, *values, gildings]
	# Write the comments of every thread to its own CSV file. Threads whose file
	# already exists are skipped, so a rerun only downloads what is missing.
	for submission_id in live_threads["id"]:
	    file_name = f"data/comments/comments__{submission_id}.csv"
	    if not os.path.exists(file_name):
	        submission = reddit.submission(id=submission_id)
	        # limit=1: presumably expands at most one "more comments" placeholder
	        # before the tree is flattened (confirm against the PRAW docs).
	        submission.comments.replace_more(limit=1)
	        comments = [
	            extract_comment(comment, submission_id)
	            for comment in submission.comments.list()
	        ]
	        frame = pd.DataFrame(
	            comments,
	            columns=["author", "submission_id", *comment_props, "gildings"],
	        )
	        frame.to_csv(file_name, index=False)
	from datetime import timezone

	# Bounds of the collection window, as timezone-aware UTC datetimes. They are
	# converted with `.timestamp()` and compared with `created_utc`; on a naive
	# datetime `.timestamp()` assumes local time, which shifted both bounds by
	# the host's UTC offset on any machine not running in UTC.
	begin_point = datetime(2022, 2, 1, tzinfo=timezone.utc)
	# Upper bound: the most recent UTC midnight minus 12 hours.
	today = datetime.now(timezone.utc).replace(
	    hour=0, minute=0, second=0, microsecond=0
	) - timedelta(hours=12)