Kevin McElwee kmcelwee

## download-tweet-media.py
import wget
from os.path import join as pjoin

# Output directory name; presumably the download destination, though it is
# not used in the visible part of this script -- TODO confirm.
OUTPUT_DIR = 'tweet-imgs'

# Keep only the tweets whose 'entities' mapping has a 'media' key.
media_tweets = [t for t in tweets if 'media' in t['entities']]

for tweet in media_tweets:
    attachments = tweet['entities']['media']
    for i, media in enumerate(attachments):
        url = media['media_url']
        # The extension is whatever follows the final dot in the URL.
        extension = url.rsplit('.', 1)[-1]
        # Sanity check: only 'jpg' and 'png' extensions are expected.
        assert extension in ('jpg', 'png')

## web-scraping-cheat-sheet.md

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              1 star
            
          
                kmcelwee
                / web-scraping-cheat-sheet.md
            
            
              Last active
              July 7, 2021 09:51
            
          
    Web Scraping & Data Analysis Cheat Sheet

Case Study


Fortune 100 & BLM
Coding notebook example (if it has trouble loading, try refreshing)

Definitions


## get_verb.py
def get_verb(s):
    """Return the distinct verb lemmas that head noun chunks in ``s``.

    The text is parsed with ``nlp`` (presumably a spaCy pipeline -- confirm
    against the rest of the file). For every noun chunk whose root's head
    token is tagged ``'VERB'``, the head's text is lemmatized as a verb, and
    lemmas listed in the ``remove`` set are discarded.

    Args:
        s: The text to analyse.

    Returns:
        A list of the unique remaining lemmas (order is not defined), or
        ``None`` when nothing remains.
    """
    # One lemmatizer serves every word; building it per item is wasted work.
    lemmatizer = WordNetLemmatizer()
    # Tokens to drop; they look like contraction fragments that were tagged
    # as verbs.
    remove = {'d', "’re", "’m", "’s"}

    lemmas = [
        lemmatizer.lemmatize(chunk.root.head.text, 'v')
        for chunk in nlp(s).noun_chunks
        if chunk.root.head.pos_ == 'VERB'
    ]
    filtered = [lemma for lemma in lemmas if lemma not in remove]
    return list(set(filtered)) if filtered else None

## pull_reddit.py
# Authenticated Reddit client built from the configured credentials.
reddit = praw.Reddit(client_id=CLIENT_ID, client_secret=CLIENT_SECRET, user_agent=USER_AGENT)

# Accumulates one dict per submission pulled from the listing.
j = []
latest_id = None
# The subreddit handle does not change between pages, so look it up once.
sub = reddit.subreddit('FloridaMan')
for page in range(10):
    # Iterate the subreddit's top listing. The original read `s.top(...)`,
    # but `s` is only bound by this loop, so that raised NameError.
    # NOTE(review): latest_id is never reassigned in the visible code, so
    # every page requests the same listing unless it is updated elsewhere
    # (e.g. from the last submission seen) -- confirm against the full script.
    for s in sub.top(params={'after': latest_id, 't': 'all'}):
        j.append({
            # all the data you want
        })

## td_data_prep.py
def data_transform(data, timesteps, var='x'):
  m = []
  s = data.to_numpy()
  for i in range(s.shape[0]-timesteps):
      m.append(s[i:i+timesteps].tolist())

  if var == 'x':
      t = np.zeros((len(m), len(m[0]), len(m[0][0])))
      for i, x in enumerate(m):
          for j, y in enumerate(x):

## daily_nn.py
# Size of the first axis of the declared input shape.
HOURS_AHEAD = 24
# Every Dense layer is as wide as the second dimension of all_X.
s = all_X.shape[1]

model = tf.keras.Sequential()
# The first Dense layer is the only one that declares the input shape.
model.add(layers.Dense(s, activation=tf.nn.relu, input_shape=(HOURS_AHEAD, s)))
# Four more identical ReLU Dense layers of the same width.
for _ in range(4):
    model.add(layers.Dense(s, activation=tf.nn.relu))
model.add(layers.Flatten())

## heat_equation.py
def heat(l, alpha, time_steps):
    '''apply the heat equation to list l, given constants alpha and time_steps'''
    return_l = []
    for t in range(time_steps):
        if len(return_l) != 0:
            l = return_l
            return_l = []
        for i, x in enumerate(l):
            if i == 0:
                diff = (0 - l[i]) - (l[i] - l[i+1])

## peak_likelihood.py
from scipy.stats import norm

def peak_likelihood(hist=None,
        tomorrow=None, tomorrow_std=None,
        two_day=None, two_day_std=None,
        three_day=None, three_day_std=None):
    '''
    Given the predictions and standard deviation of the three-day forecast, in
    addition to the highest load so far this month, what is the likelihood that
    a sample from tomorrow's distribution will be higher than the other three.
	import wget
	from os.path import join as pjoin

	OUTPUT_DIR = 'tweet-imgs'
	media_tweets = [tweet for tweet in tweets if 'media' in tweet['entities']]
	for tweet in media_tweets:
	for i, media in enumerate(tweet['entities']['media']):
	url = media['media_url']
	extension = url.split('.')[-1]
	assert extension in ['jpg', 'png']
	def get_verb(s):
	m = [x.root.head.text for x in nlp(s).noun_chunks if x.root.head.pos_ == 'VERB']
	standardized = [WordNetLemmatizer().lemmatize(x, 'v') for x in m]
	remove = set(['d', "’re", "’m", "’s"])
	filtered = [x for x in standardized if x not in remove]
	return None if len(filtered) == 0 else list(set(filtered))
	reddit = praw.Reddit(client_id=CLIENT_ID, client_secret=CLIENT_SECRET, user_agent=USER_AGENT)

	j = []
	latest_id = None
	for page in range(10):
	sub = reddit.subreddit('FloridaMan')
	for s in s.top(params={'after': latest_id, 't': 'all'}):
	j.append({
	# all the data you want
	})
	def data_transform(data, timesteps, var='x'):
	m = []
	s = data.to_numpy()
	for i in range(s.shape[0]-timesteps):
	m.append(s[i:i+timesteps].tolist())

	if var == 'x':
	t = np.zeros((len(m), len(m[0]), len(m[0][0])))
	for i, x in enumerate(m):
	for j, y in enumerate(x):
	HOURS_AHEAD = 24
	s = all_X.shape[1]

	model = tf.keras.Sequential()
	model.add(layers.Dense(s, activation=tf.nn.relu, input_shape=(HOURS_AHEAD, all_X.shape[1])))
	model.add(layers.Dense(s, activation=tf.nn.relu))
	model.add(layers.Dense(s, activation=tf.nn.relu))
	model.add(layers.Dense(s, activation=tf.nn.relu))
	model.add(layers.Dense(s, activation=tf.nn.relu))
	model.add(layers.Flatten())
	def heat(l, alpha, time_steps):
	'''apply the heat equation to list l, given constants alpha and time_steps'''
	return_l = []
	for t in range(time_steps):
	if len(return_l) != 0:
	l = return_l
	return_l = []
	for i, x in enumerate(l):
	if i == 0:
	diff = (0 - l[i]) - (l[i] - l[i+1])
	from scipy.stats import norm

	def peak_likelihood(hist=None,
	tomorrow=None, tomorrow_std=None,
	two_day=None, two_day_std=None,
	three_day=None, three_day_std=None):
	'''
	Given the predictions and standard deviation of the three-day forecast, in
	addition to the highest load so far this month, what is the likelihood that
	a sample from tomorrow's distribution will be higher than the other three.