Skip to content

Instantly share code, notes, and snippets.

View odunola499's full-sized avatar

Jenrola Odunolaoluwa odunola499

View GitHub Profile
@odunola499
odunola499 / extract_reddit.py
Last active September 10, 2022 12:40
Medium article on extracting reddit data
#import modules
import os
import requests #to send our requests through the api
import pandas as pd #to create a pandas dataframe and save our data in csv format
#get client_id, secret_key, username and password from a text document in the current working directory
#key.txt is read line by line: line 1 is the client id and line 2 is the secret key
with open('key.txt') as f:
    data = f.readlines() #read while the file is open; the with block closes it afterwards
client_id = data[0].strip() #strip() drops the trailing newline kept by readlines()
secret_key = data[1].strip()
@odunola499
odunola499 / get_access_token.py
Last active September 10, 2022 13:06
further
#basic requests authentication object built from our client id and secret key
auth = requests.auth.HTTPBasicAuth(client_id, secret_key)
#dictionary with the type of authentication ('password' mode), our username and our password
data = dict(grant_type='password', username=username, password=password)
headers = {
#base url of the reddit api endpoints requested below
api = 'https://oauth.reddit.com'
#ask the 'new' listing of r/wallstreetbets for up to 100 entries and decode the json reply
res = requests.get(
    f'{api}/r/wallstreetbets/new',
    headers=headers,
    params=dict(limit='100'),
)
data = res.json()
#this would contain the 100 most recent messages from the subreddit in json format.
#JSON is a very popular format that resembles a dictionary.
#feel free to check it out or trace the json document tree for what you want
#for this tutorial we would be using a Pandas Dataframe
while True:
res = requests.get(f'{api}/r/wallstreetbets/new', headers = headers,
params = {'limit': '100', 'after':df['name'].iloc[len(df) - 1]})
if len(res.json()['data']['children']) == 0 :
break
for post in res.json()['data']['children']:
df = df.append({
'name':post['data']['name'],
'created_utc': post['data']['created_utc'],
'subreddit':post['data']['subreddit'],
import warnings
#the append method is deprecated, so we silence all warnings to keep the terminal
#looking clean and sharp. Be careful when using this in other scenarios though:
#knowing what warnings say might help in debugging.
warnings.filterwarnings(action="ignore")
#subreddits
subreddits = [
    'investing',
    'wallstreetbets',
    'StockMarket',
]
all_df = pd.DataFrame({ #the final df
'name':[],
'created_utc': [],
'subreddit':[],