Skip to content

Instantly share code, notes, and snippets.

@theletterf
Created September 14, 2025 15:37
Show Gist options
  • Select an option

  • Save theletterf/56bc8e3465cdaf1f0a9e8f6a75db108b to your computer and use it in GitHub Desktop.

Select an option

Save theletterf/56bc8e3465cdaf1f0a9e8f6a75db108b to your computer and use it in GitHub Desktop.
Reddit posts analyzer — collects posts from r/technicalwriting via the Reddit API (PRAW), saves them to JSON/CSV, and prepares a trimmed dataset for semantic analysis.
import praw
import json
import csv
from datetime import datetime
import re
from collections import defaultdict
class RedditTechWritingAnalyzer:
    """Collect posts from a subreddit via the Reddit API (PRAW) and prepare
    them for downstream semantic analysis.

    Typical workflow:
        setup_reddit_connection() -> collect_posts() -> save_data()
        -> export_for_claude_analysis() and/or identify_anxious_patterns()
    """

    def __init__(self):
        """
        Initialize the Reddit analyzer.

        You'll need to set up Reddit API credentials at
        https://www.reddit.com/prefs/apps/
        """
        self.reddit = None      # praw.Reddit client; set by setup_reddit_connection()
        self.posts_data = []    # raw post dicts accumulated by collect_posts()

    def setup_reddit_connection(self, client_id, client_secret, user_agent):
        """
        Set up connection to the Reddit API.

        Returns:
            bool: True if the client was created, False otherwise (the error
            is printed rather than raised).
        """
        try:
            self.reddit = praw.Reddit(
                client_id=client_id,
                client_secret=client_secret,
                user_agent=user_agent,
            )
            print("βœ… Reddit connection established")
            return True
        except Exception as e:
            print(f"❌ Failed to connect to Reddit: {e}")
            return False

    def collect_posts(self, subreddit_name="technicalwriting", limit=300, time_filter="month"):
        """
        Collect posts from the given subreddit.

        Pulls roughly ``limit`` posts split evenly across the hot/new/top
        listings, then de-duplicates by post id (the listings overlap).

        Args:
            subreddit_name: Subreddit to read (default "technicalwriting").
            limit: Approximate total posts to fetch (limit // 3 per listing).
            time_filter: Time window passed to the "top" listing.

        Returns:
            bool: True on success, False if there is no connection or the
            API call failed (errors are printed, not raised).
        """
        if not self.reddit:
            print("❌ Reddit connection not established")
            return False
        try:
            subreddit = self.reddit.subreddit(subreddit_name)
            # Sample several listings to get a broader slice of the subreddit.
            post_sources = [
                ("hot", subreddit.hot(limit=limit // 3)),
                ("new", subreddit.new(limit=limit // 3)),
                ("top", subreddit.top(time_filter=time_filter, limit=limit // 3)),
            ]
            for source_name, posts in post_sources:
                print(f"Collecting {source_name} posts...")
                for post in posts:
                    post_data = {
                        'id': post.id,
                        'title': post.title,
                        'selftext': post.selftext,
                        'score': post.score,
                        'num_comments': post.num_comments,
                        'created_utc': post.created_utc,
                        'url': post.url,
                        'source': source_name,
                    }
                    self.posts_data.append(post_data)
            # Remove duplicates based on post ID (a post can appear in more
            # than one listing); first occurrence wins, order is preserved.
            seen_ids = set()
            unique_posts = []
            for post in self.posts_data:
                if post['id'] not in seen_ids:
                    unique_posts.append(post)
                    seen_ids.add(post['id'])
            self.posts_data = unique_posts
            print(f"βœ… Collected {len(self.posts_data)} unique posts")
            return True
        except Exception as e:
            print(f"❌ Error collecting posts: {e}")
            return False

    def save_data(self, filename_prefix="technicalwriting_posts"):
        """
        Save collected data to timestamped JSON and CSV files.

        Returns:
            tuple[str, str]: (json_filename, csv_filename).
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        # Save as JSON (full fidelity).
        json_filename = f"{filename_prefix}_{timestamp}.json"
        with open(json_filename, 'w', encoding='utf-8') as f:
            json.dump(self.posts_data, f, indent=2, ensure_ascii=False)
        # Save as CSV for easy viewing in a spreadsheet.
        csv_filename = f"{filename_prefix}_{timestamp}.csv"
        with open(csv_filename, 'w', newline='', encoding='utf-8') as f:
            if self.posts_data:
                writer = csv.DictWriter(f, fieldnames=self.posts_data[0].keys())
                writer.writeheader()
                writer.writerows(self.posts_data)
        print(f"βœ… Data saved to {json_filename} and {csv_filename}")
        return json_filename, csv_filename

    def prepare_for_analysis(self, include_content=True):
        """
        Prepare clean data for Claude to analyze without any preprocessing filters.

        Args:
            include_content: When True, posts with a non-blank body get a
                ``content`` key carrying the original ``selftext``.

        Returns:
            list[dict]: one slim dict per collected post (id, title, score,
            num_comments, created_utc, and optionally content).
        """
        analysis_data = []
        for post in self.posts_data:
            # Create clean data structure for analysis.
            clean_post = {
                'id': post['id'],
                'title': post['title'],
                'score': post['score'],
                'num_comments': post['num_comments'],
                'created_utc': post['created_utc'],
            }
            # Include the body only when requested and non-blank
            # (link posts have an empty selftext).
            if include_content and post['selftext'] and post['selftext'].strip():
                clean_post['content'] = post['selftext']
            analysis_data.append(clean_post)
        return analysis_data

    def export_for_claude_analysis(self, max_posts=200):
        """
        Export data in a format optimized for Claude analysis.

        When more than ``max_posts`` posts are available, keeps the ones with
        the highest engagement (score + comment count) to stay within context
        limits.

        Returns:
            dict: collection metadata plus the trimmed ``posts`` list.
        """
        analysis_data = self.prepare_for_analysis()
        if len(analysis_data) > max_posts:
            # Sort by engagement (score + comments), highest first.
            analysis_data.sort(key=lambda x: x['score'] + x['num_comments'], reverse=True)
            analysis_data = analysis_data[:max_posts]
        formatted_output = {
            'total_posts_collected': len(self.posts_data),
            'posts_for_analysis': len(analysis_data),
            'collection_date': datetime.now().isoformat(),
            'posts': analysis_data,
        }
        return formatted_output

    def identify_anxious_patterns(self):
        """
        Identify posts whose title/body suggests anxiety or uncertainty.

        Scans every collected post against themed regex groups (validation
        seeking, imposter syndrome, career anxiety, skill doubt, job
        insecurity) plus anxious-question markers in the title.  A post is
        recorded at most once per category (first matching pattern wins),
        but may appear in several categories.

        Returns:
            dict: category name -> list of {title, id, score, comments,
            matched_pattern} dicts.
        """
        # "'?" makes the contraction patterns tolerate both "dont" and
        # "don't" — the original literal "dont" silently missed the
        # apostrophized spelling that real posts actually use.
        anxiety_patterns = {
            'validation_seeking': [
                r'\bshould i\b', r'\bam i wrong\b', r'\bis it okay\b', r'\bis this normal\b',
                r'\bdoes anyone else\b', r'\bis it just me\b', r'\bvalidate\b'
            ],
            'imposter_syndrome': [
                r'\bimposter\b', r'\bnot qualified\b', r"\bdon'?t belong\b", r'\bfake it\b',
                r'\bnot good enough\b', r'\bunqualified\b', r'\bfraud\b'
            ],
            'career_anxiety': [
                r'\bcareer change\b', r'\bswitch\b', r'\btoo late\b', r'\bworth it\b',
                r'\bmistake\b', r'\bregret\b', r'\bscared\b', r'\bworried\b'
            ],
            'skill_doubt': [
                r'\bnot technical enough\b', r"\bdon'?t know\b", r'\blearning curve\b',
                r'\bstruggling with\b', r'\bdifficult\b', r'\boverwhelmed\b'
            ],
            'job_insecurity': [
                r'\bjob market\b', r'\bgetting hired\b', r'\bcompetitive\b', r'\brejection\b',
                r'\bunemployed\b', r'\blaid off\b', r'\binsecure\b'
            ],
        }
        # Question patterns that suggest anxiety.
        anxious_question_patterns = [
            r'\?{2,}',  # Multiple question marks
            r'\bhelp\?', r'\badvice\?', r'\bthoughts\?', r'\bopinions\?'
        ]
        # Compile everything once, outside the per-post loop.
        compiled = {
            category: [re.compile(p, re.IGNORECASE) for p in patterns]
            for category, patterns in anxiety_patterns.items()
        }
        compiled_questions = [re.compile(p) for p in anxious_question_patterns]

        results = defaultdict(list)
        for post in self.posts_data:
            full_text = f"{post['title']} {post['selftext']}".lower()
            # Check for anxiety patterns, one hit per category.
            for category, patterns in compiled.items():
                for pattern in patterns:
                    if pattern.search(full_text):
                        results[category].append({
                            'title': post['title'],
                            'id': post['id'],
                            'score': post['score'],
                            'comments': post['num_comments'],
                            'matched_pattern': pattern.pattern,
                        })
                        break  # Don't double-count same post
            # Check for anxious question patterns in the title.
            title_lower = post['title'].lower()
            for pattern in compiled_questions:
                if pattern.search(title_lower):
                    results['anxious_questions'].append({
                        'title': post['title'],
                        'id': post['id'],
                        'score': post['score'],
                        'comments': post['num_comments'],
                        'matched_pattern': pattern.pattern,
                    })
                    break
        return dict(results)
# Usage example: interactive credential entry, collection, and export.
if __name__ == "__main__":
    print("πŸš€ Reddit Technical Writing Analyzer")
    print("=" * 50)
    analyzer = RedditTechWritingAnalyzer()

    # Reddit API credentials (create a script app at
    # https://www.reddit.com/prefs/apps/ to obtain these).
    CLIENT_ID = input("Enter your Reddit CLIENT_ID: ").strip()
    CLIENT_SECRET = input("Enter your Reddit CLIENT_SECRET: ").strip()
    USER_AGENT = input("Enter your username for USER_AGENT (e.g., 'YourUsername'): ").strip()
    USER_AGENT = f"TechWritingAnalyzer/1.0 by {USER_AGENT}"

    # Validate BOTH credentials: the original only checked CLIENT_ID, so an
    # empty secret slipped through and failed later inside praw.  SystemExit(1)
    # also reports a nonzero status (exit() is a site-module convenience that
    # may be absent in non-interactive deployments).
    if not CLIENT_ID or not CLIENT_SECRET:
        print("❌ Reddit credentials not found")
        raise SystemExit(1)

    if analyzer.setup_reddit_connection(CLIENT_ID, CLIENT_SECRET, USER_AGENT):
        if analyzer.collect_posts():
            json_file, csv_file = analyzer.save_data()

            print("\nπŸ“€ Preparing data for Claude analysis...")
            claude_data = analyzer.export_for_claude_analysis()

            # Save the analysis-ready data.
            analysis_filename = f"claude_analysis_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
            with open(analysis_filename, 'w', encoding='utf-8') as f:
                json.dump(claude_data, f, indent=2, ensure_ascii=False)

            print(f"βœ… Analysis data ready: {analysis_filename}")
            print(f"πŸ“Š {claude_data['posts_for_analysis']} posts prepared for semantic analysis")
            print("\nπŸ€– Next: Share this file with Claude for unbiased semantic analysis!")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment