print('Loading Libraries')
# Standard Libraries
import pandas as pd
import numpy as np
from datetime import datetime
# URL Parser
from urllib.parse import urlparse
# Reddit API
import praw
# Sentiment and NLP TextBlob
from textblob import TextBlob
# Newspaper3k
from newspaper import Article
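# (note: the package is installed as `newspaper3k` on PyPI but imported as `newspaper`)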
# Subreddit Scraper Function
# from Scraper_Library import subreddit_title_scraper
print('Completed')
print('Loading Reddit Params')
reddit_params = {}
with open('Scraper_Params.dat', mode='r') as file_obj:   # closes the file when done
    for line in file_obj:
        key_value = line.strip().split('=')
        if len(key_value) == 2:
            reddit_params[key_value[0].strip()] = key_value[1].strip()
print('Complete')
print('Assigning Variables')
red = praw.Reddit(client_id=reddit_params['red_client_id'],
                  client_secret=reddit_params['red_client_secret'],
                  password=reddit_params['red_password'],
                  user_agent=reddit_params['red_user_agent'],
                  username=reddit_params['red_username'])
blu = praw.Reddit(client_id=reddit_params['blu_client_id'],
                  client_secret=reddit_params['blu_client_secret'],
                  password=reddit_params['blu_password'],
                  user_agent=reddit_params['blu_user_agent'],
                  username=reddit_params['blu_username'])
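# Two script-type Reddit apps are assumed here, presumably so each subreddit
# list is scraped under its own credentials (praw's password-grant auth).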
red_sub_list = reddit_params['red_list'].strip().split(', ')   # comma-separated subreddit names
print(red_sub_list)
blu_sub_list = reddit_params['blu_list'].strip().split(', ')
sub_limit = int(reddit_params['limit_per_sub'].strip().split(', ')[0])   # first value only
print('Complete')
def subreddit_title_scraper(sub_list, api, limit, df=True):
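    """Scrape non-stickied 'hot' posts from each subreddit in sub_list with the
    given praw instance, score titles with TextBlob, and pull article metadata
    with newspaper3k. Return a DataFrame if df is True, else the dict of lists."""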
    # (no `global` declaration needed: red_sub_list is only read, never rebound)
    posts_dict = {"post title": [],
                  "subreddit": [],
                  "is article": [],
                  "article title": [],
                  "title polarity": [],
                  "title objectivity": [],
                  "keywords": [],
                  "domain": [],
                  "link": [],
                  "author": [],
                  "date": [],
                  "target": [],
                  }
    article_count = 0
    invalid_links = 0
    for sub in sub_list:
        # Pull up to `limit` posts from the subreddit's 'hot' listing, skipping stickies
        submissions = (x for x in api.subreddit(sub).hot(limit=limit) if not x.stickied)
        for post in submissions:
            # Target label: True for red-list subreddits, False otherwise
            posts_dict['target'].append(sub_list == red_sub_list)
            posts_dict["post title"].append(post.title)    ## praw reddit scraping to dict ##
            posts_dict["link"].append(post.url)
            posts_dict["subreddit"].append(sub)
            posts_dict["date"].append(datetime.fromtimestamp(post.created_utc))
            parsed_url = urlparse(post.url)                ## parse URL for domain ##
            posts_dict['domain'].append(parsed_url.netloc)
            post_blob = TextBlob(post.title)               ## sentiment from the title ##
            posts_dict["title polarity"].append(post_blob.sentiment[0])
            posts_dict["title objectivity"].append(post_blob.sentiment[1])  # NB: TextBlob's second value is subjectivity
            posts_dict["keywords"].append(post_blob.noun_phrases)
            article = Article(post.url)                    ## instantiate newspaper3k ##
            if article.is_valid_url():                     ## is post a URL? (method call, not attribute) ##
                try:
                    article.download()
                    article.parse()
                except Exception:                          ## download or parse failed ##
                    posts_dict["is article"].append(False)
                    posts_dict["article title"].append(np.nan)
                    posts_dict["author"].append(np.nan)
                    continue
                if article.is_valid_body():                ## is post an article? ##
                    article_count += 1
                    posts_dict["is article"].append(True)
                    posts_dict["article title"].append(article.title)
                    if article.authors:
                        posts_dict["author"].append(article.authors)
                    else:
                        posts_dict["author"].append(np.nan)
                    if article_count % 5 == 0:
                        print(f"Added {article_count} articles")
                else:
                    invalid_links += 1
                    posts_dict["is article"].append(False)
                    posts_dict["article title"].append(np.nan)
                    posts_dict["author"].append(np.nan)
                    if invalid_links % 5 == 0:
                        print(f"{invalid_links} invalid links skipped")
            else:
                # Keep the column lists the same length even when the URL is invalid
                invalid_links += 1
                posts_dict["is article"].append(False)
                posts_dict["article title"].append(np.nan)
                posts_dict["author"].append(np.nan)
    if df:
        print(f"creating data frame from {article_count + invalid_links} links")
        posts_df = pd.DataFrame(posts_dict)                ## make it a dataframe ##
        posts_df = posts_df[["subreddit", "post title", "keywords",
                             "title polarity", "title objectivity",
                             "domain", "is article", "article title",
                             "link", "author", "date", "target"]]
        print(f"Done processing {article_count} articles and {invalid_links} non-articles as dataframe")
        return posts_df
    else:
        print(f"Done processing {article_count} articles and {invalid_links} non-articles as dictionary")
        return posts_dict
print(f"Pulling {sub_limit} posts from {str(blu_sub_list)} and {str(red_sub_list)}")
dfb = subreddit_title_scraper(red_sub_list, red, sub_limit, df = True)
dfr = subreddit_title_scraper(blu_sub_list, blu, sub_limit, df = True)
print('Complete')
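# A possible next step (not part of the original gist): stack the two labeled
# frames into a single modeling dataset and persist it.
# df_all = pd.concat([dfr, dfb], ignore_index=True)
# df_all.to_csv('scraped_posts.csv', index=False)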