Last active
April 12, 2018 19:25
-
-
Save DimaK415/70c72030784294577406146b4fd8ecef to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
print('Loading Libraries') | |
# Standard Libraries | |
import pandas as pd | |
import numpy as np | |
from datetime import datetime | |
# URL Parser | |
from urllib.parse import urlparse | |
# Reddit API | |
import praw | |
# Sentiment and NLP TextBlob | |
from textblob import TextBlob | |
# Newspaper3k | |
from newspaper import Article | |
# Subreddit Scraper Function | |
# from Scraper_Library import subreddit_title_scraper | |
print('Completed') | |
print('Loading Reddit Params') | |
fileObj = open('Scraper_Params.dat', mode='r') | |
reddit_params = {} | |
for line in fileObj: | |
line = line.strip() | |
key_value = line.split('=') | |
if len(key_value) == 2: | |
reddit_params[key_value[0].strip()] = key_value[1].strip() | |
print('Complete') | |
print('Assigning Variables') | |
red = praw.Reddit(client_id= reddit_params['red_client_id'], | |
client_secret= reddit_params['red_client_secret'], | |
password= reddit_params['red_password'], | |
user_agent= reddit_params['red_user_agent'], | |
username= reddit_params['red_username']) | |
blu = praw.Reddit(client_id= reddit_params['blu_client_id'], | |
client_secret= reddit_params['blu_client_secret'], | |
password= reddit_params['blu_password'], | |
user_agent= reddit_params['blu_user_agent'], | |
username= reddit_params['blu_username']) | |
red_sub_list = reddit_params['red_list'].strip().split(', ') | |
print(red_sub_list) | |
blu_sub_list = reddit_params['blu_list'].strip().split(', ') | |
sub_limit = int(reddit_params['limit_per_sub'].strip().split(', ')[0]) | |
print('Complete') | |
def subreddit_title_scraper(sub_list, api, limit, df = True): | |
global red_sub_list | |
posts_dict = {"post title" : [], | |
"subreddit" : [], | |
"is article" : [], | |
"article title" : [], | |
"title polarity" : [], | |
"title objectivity" : [], | |
"keywords" : [], | |
"domain" : [], | |
"link" : [], | |
"author" : [], | |
"date" : [], | |
"target" : [], | |
} | |
article_count = 0 | |
invalid_links = 0 | |
for sub in sub_list: | |
submissions = (x for x in api.subreddit(sub).hot(limit=limit) if not x.stickied) | |
for post in submissions: | |
if sub_list is red_sub_list: | |
posts_dict['target'].append(True) | |
if sub_list == blu_sub_list: | |
posts_dict['target'].append(False) | |
posts_dict["post title"].append(post.title) ## praw reddit scraping to dict## | |
posts_dict["link"].append(post.url) | |
posts_dict["subreddit"].append(sub) | |
posts_dict["date"].append(datetime.fromtimestamp(post.created_utc)) | |
parsed_url = urlparse(post.url) ## Parse URL for domain | |
posts_dict['domain'].append(parsed_url.netloc) | |
post_blob = TextBlob(post.title) | |
posts_dict["title polarity"].append(post_blob.sentiment[0]) | |
posts_dict["title objectivity"].append(post_blob.sentiment[1]) | |
posts_dict["keywords"].append(post_blob.noun_phrases) | |
article = Article(post.url) ## Instantiate newspaper3k library ## | |
if article.is_valid_url: ## Is post a URL? ## | |
try: | |
article.download() | |
article.parse() | |
except: | |
posts_dict["is article"].append(False) | |
posts_dict["article title"].append(np.nan) | |
posts_dict["author"].append(np.nan) | |
continue | |
if article.is_valid_body(): ## Is post an article? ## | |
article_count += 1 | |
posts_dict["is article"].append(True) | |
posts_dict["article title"].append(article.title) | |
if article.authors != []: | |
posts_dict["author"].append(article.authors) | |
else: | |
posts_dict["author"].append(np.nan) | |
if article_count % 5 == 0: | |
print(f"Added {article_count} articles") | |
else: | |
invalid_links += 1 | |
posts_dict["is article"].append(False) | |
posts_dict["article title"].append(np.nan) | |
posts_dict["author"].append(np.nan) | |
if invalid_links % 5 == 0: | |
print(f"{invalid_links} invalid links skipped") | |
if df: | |
print(f"creating data frame from {article_count + invalid_links} links") | |
posts_df = pd.DataFrame(posts_dict) ## Make it a dataframe ## | |
posts_df =posts_df[["subreddit", "post title", "keywords", | |
"title polarity", "title objectivity", | |
"domain", "is article", "article title" , | |
"link", "author", "date", "target"]] | |
print(f"Done processing {article_count} articles and {invalid_links} non-articles as dataframe") | |
return posts_df | |
else: | |
print(f"Done processing {article_count} articles and {invalid_links} non-articles as dictionary") | |
return posts_dict | |
print(f"Pulling {sub_limit} posts from {str(blu_sub_list)} and {str(red_sub_list)}") | |
dfb = subreddit_title_scraper(red_sub_list, red, sub_limit, df = True) | |
dfr = subreddit_title_scraper(blu_sub_list, blu, sub_limit, df = True) | |
print('Complete') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment