Skip to content

Instantly share code, notes, and snippets.

@HebeHH
Last active August 3, 2018 15:27
Show Gist options
  • Save HebeHH/0ddf4fac3c1466b784cc51b4e6162bfa to your computer and use it in GitHub Desktop.
Save HebeHH/0ddf4fac3c1466b784cc51b4e6162bfa to your computer and use it in GitHub Desktop.
Search through specific subreddits from the command line. Returns titles and vote scores only
import praw
import re
from tqdm import tqdm
import pandas as pd
# Let pandas print up to 70 characters per column so titles are less truncated.
pd.options.display.max_colwidth = 70

# Pick subreddits. Every run of letters/digits is taken as one subreddit name,
# so any delimiter other than a letter, number or new line is okay. The names
# are joined with "+" into a single string (the script will default to r/news
# if no valid input is given).
subreddit_names = re.findall(r'[A-Za-z0-9]+', raw_input("Enter desired subreddits on one line\n"))
subreddits = "+".join(subreddit_names)
print("Your subreddits are: " + subreddits)

# Connect to reddit - replace the placeholder values with your own credentials.
reddit = praw.Reddit(
    client_id='ur_cliend_id',
    client_secret='ur_client_secret',
    user_agent='Hi it me',
)

# Ask which sort order to use; get_submissions() interprets the answer.
search_type = raw_input("Pick your search type. Options: 'hot' (default), 'new', 'top'\n")
def get_submissions(search_type, subreddits):
    """Return the submission listing for `subreddits` in the chosen sort order.

    search_type -- 'new' or 'top' (compared case-insensitively); any other
                   value selects the 'hot' listing.
    subreddits  -- subreddit name string handed to reddit.subreddit().

    The listing is requested with limit=None from the module-level `reddit`
    client.
    """
    sort = search_type.lower()
    listing = reddit.subreddit(subreddits)
    if sort == 'new':
        fetch = listing.new
    elif sort == 'top':
        fetch = listing.top
    else:
        # Anything unrecognised falls back to the default sort.
        fetch = listing.hot
    return fetch(limit=None)
print("This may take a bit")


def _fetch_rows(sort, names):
    """Return a list of [title, score] rows for every submission in `names`.

    If you want more info than title and score, edit the row built here
    (and the DataFrame columns below).
    """
    # tqdm cannot show an accurate total because the submissions are lazily
    # evaluated, but it lets the user see that work is in progress.
    return [[s.title, s.score] for s in tqdm(get_submissions(sort, names))]


try:
    submissions = _fetch_rows(search_type, subreddits)
except Exception:
    # `except Exception` (not a bare `except`) so Ctrl-C / SystemExit still
    # stop the script instead of silently triggering the fallback.
    # NOTE: any fetch failure lands here, not only an invalid subreddit name.
    print("No valid subreddit was entered, defaulting to News.")
    # Build a fresh list: rows fetched before the failure are discarded so
    # they are not mixed with the News results.
    submissions = _fetch_rows(search_type, "News")
print("Fetched %d submissions" % len(submissions))
submissions = pd.DataFrame(submissions, columns=['titles', 'scores'])
# Get search terms. They can be user input or the most common proper nouns
# found in the fetched titles (excluding a few uninformative ones).
search_term_type = raw_input("Pick your search term type. Options: 'choose own' (default), 'common proper nouns' \n")
if search_term_type.lower() == 'common proper nouns':
    # Capitalised words too generic to be useful search terms. Should expand.
    stop_words = 'In|The|Man|New|What|My|This|Woman|Best|Why|How|You|Is|Part'
    # Remove the stop words as WHOLE words only. Without the \b anchors the
    # pattern also stripped prefixes of real names ("India" -> "dia",
    # "Manchester" -> "chester"), so those proper nouns were lost.
    stop_word_pattern = r'\b(?:' + stop_words + r')\b'
    all_titles = re.sub(stop_word_pattern, "", " ".join(submissions.titles))
    # Every capital letter followed by lowercase letters counts as a proper noun.
    ProperNouns = re.findall(r'[A-Z][a-z]+', all_titles)
    # Keep the ten most frequent ones, as a plain list like the other branch.
    search_terms = pd.Series(ProperNouns).value_counts().nlargest(10).index.tolist()
else:
    # User input search terms: a double-quoted phrase (starting with a letter)
    # is kept as one term, every other run of letters is its own term.
    matches = re.findall(r'((?<=")[A-Za-z].*?(?="))|([A-Za-z]+)', raw_input("Enter all search terms (on one line):\n"))
    # findall yields one (quoted, bare) tuple per match with exactly one
    # non-empty member; flatten and drop the empty halves.
    search_terms = [term for groups in matches for term in groups if term]
print("your search_terms are: " + ", ".join(search_terms))
# Get the minimum score a submission needs in order to be shown.
try:
    min_score = int(raw_input("What is the minimum score for a submission to be included? Defaults to 100.\n"))
except ValueError:
    # Blank or non-numeric answer: use the advertised default. Only
    # ValueError is caught so Ctrl-C still aborts the script instead of
    # being swallowed as "use the default".
    min_score = 100
# Subset the submissions whose score is at least the minimum.
relevant_submissions = submissions[submissions['scores'] >= min_score]
# How to display the results: grouped per search term (a title can appear
# under several terms) or as one table of all matching titles.
output_type = raw_input("Do you want to group by search term, including duplicates? (y/n)\n defaults to all unique relevant titles.\n")
if output_type.lower() == 'y':
    # Output grouped by search term.
    for term in search_terms:
        print("\n Titles about " + term + ":")
        # re.escape: terms are literal text, so characters such as "+" or "?"
        # in a quoted phrase must not be interpreted as regex syntax.
        # case=False replaces the inline "(?i)" flag.
        term_hits = relevant_submissions.titles.str.contains(re.escape(term), case=False)
        print(relevant_submissions[term_hits])
else:
    # Output all unique hits: one alternation over the escaped terms. The
    # old "(?i)a|(?i)b" form put global flags in the middle of the pattern,
    # which newer Python versions reject. With no search terms the pattern
    # is empty and matches every title, as before.
    pattern = "|".join(re.escape(term) for term in search_terms)
    any_hit = relevant_submissions.titles.str.contains(pattern, case=False)
    print(relevant_submissions[any_hit])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment