Kevin McElwee (kmcelwee)
import numpy as np

# Build overlapping windows of length `timesteps` from the rows of a dataframe.
def data_transform(data, timesteps, var='x'):
    m = []
    s = data.to_numpy()
    for i in range(s.shape[0] - timesteps):
        m.append(s[i:i+timesteps].tolist())
    if var == 'x':
        t = np.zeros((len(m), len(m[0]), len(m[0][0])))
        for i, x in enumerate(m):
            for j, y in enumerate(x):
from scipy.stats import norm

def peak_likelihood(hist=None,
                    tomorrow=None, tomorrow_std=None,
                    two_day=None, two_day_std=None,
                    three_day=None, three_day_std=None):
    '''
    Given the predictions and standard deviations of the three-day forecast, in
    addition to the highest load so far this month, what is the likelihood that
    a sample from tomorrow's distribution will be higher than the other three?
    '''
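
A minimal sketch of how that likelihood could be estimated, assuming the forecasts are independent normal distributions and using Monte Carlo sampling; the function name, sampling approach, and example values below are assumptions, not the gist's actual body:

import numpy as np
from scipy.stats import norm

def peak_likelihood_sketch(hist, tomorrow, tomorrow_std,
                           two_day, two_day_std,
                           three_day, three_day_std, n_samples=100_000):
    # Sample each day's forecast distribution (independence assumed).
    t1 = norm.rvs(loc=tomorrow, scale=tomorrow_std, size=n_samples)
    t2 = norm.rvs(loc=two_day, scale=two_day_std, size=n_samples)
    t3 = norm.rvs(loc=three_day, scale=three_day_std, size=n_samples)
    # Fraction of samples where tomorrow beats both later days and this month's peak so far.
    return np.mean((t1 > t2) & (t1 > t3) & (t1 > hist))

# Hypothetical usage:
# peak_likelihood_sketch(hist=620, tomorrow=640, tomorrow_std=15,
#                        two_day=630, two_day_std=20,
#                        three_day=610, three_day_std=25)
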
def heat(l, alpha, time_steps):
    '''apply the heat equation to list l, given constants alpha and time_steps'''
    return_l = []
    for t in range(time_steps):
        if len(return_l) != 0:
            l = return_l
            return_l = []
        for i, x in enumerate(l):
            if i == 0:
                diff = (0 - l[i]) - (l[i] - l[i+1])
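
A minimal sketch of the discrete update the snippet appears to implement, assuming zero-valued cells beyond both boundaries and that each step adds alpha times the difference; the completion and return are assumptions, not the gist's full body:

def heat_sketch(l, alpha, time_steps):
    '''Diffuse list l for time_steps steps using the discrete heat equation,
    with implicit 0-valued cells beyond each end of the list.'''
    for _ in range(time_steps):
        new_l = []
        for i, x in enumerate(l):
            left = l[i-1] if i > 0 else 0
            right = l[i+1] if i < len(l) - 1 else 0
            # (left - x) - (x - right) is the discrete Laplacian at cell i.
            new_l.append(x + alpha * ((left - x) - (x - right)))
        l = new_l
    return l
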
import tensorflow as tf
from tensorflow.keras import layers

HOURS_AHEAD = 24
s = all_X.shape[1]

# Stack of ReLU dense layers applied across the HOURS_AHEAD timesteps, then flattened.
model = tf.keras.Sequential()
model.add(layers.Dense(s, activation=tf.nn.relu, input_shape=(HOURS_AHEAD, all_X.shape[1])))
model.add(layers.Dense(s, activation=tf.nn.relu))
model.add(layers.Dense(s, activation=tf.nn.relu))
model.add(layers.Dense(s, activation=tf.nn.relu))
model.add(layers.Dense(s, activation=tf.nn.relu))
model.add(layers.Flatten())
import praw

reddit = praw.Reddit(client_id=CLIENT_ID, client_secret=CLIENT_SECRET, user_agent=USER_AGENT)
sub = reddit.subreddit('FloridaMan')

j = []
latest_id = None
for page in range(10):
    for submission in sub.top(params={'after': latest_id, 't': 'all'}):
        j.append({
            # all the data you want
        })
        # Remember the last fullname so the next page starts after it.
        latest_id = submission.fullname
import spacy
from nltk.stem import WordNetLemmatizer

nlp = spacy.load('en_core_web_sm')  # any English spaCy model with a parser works here

def get_verb(s):
    # Verbs that govern a noun chunk in the sentence, lemmatized and deduplicated.
    m = [x.root.head.text for x in nlp(s).noun_chunks if x.root.head.pos_ == 'VERB']
    standardized = [WordNetLemmatizer().lemmatize(x, 'v') for x in m]
    remove = set(['d', "’re", "’m", "’s"])
    filtered = [x for x in standardized if x not in remove]
    return None if len(filtered) == 0 else list(set(filtered))
kmcelwee / download-tweet-media.py
Created February 20, 2021 00:09
Download images (JPG, PNG) from tweets
import wget
from os.path import join as pjoin

OUTPUT_DIR = 'tweet-imgs'

# Tweets whose entities include at least one media attachment.
media_tweets = [tweet for tweet in tweets if 'media' in tweet['entities']]
for tweet in media_tweets:
    for i, media in enumerate(tweet['entities']['media']):
        url = media['media_url']
        extension = url.split('.')[-1]
        assert extension in ['jpg', 'png']
        # Save as e.g. tweet-imgs/<tweet id>-<index>.<extension> (filename pattern assumed).
        wget.download(url, pjoin(OUTPUT_DIR, f"{tweet['id']}-{i}.{extension}"))
import pandas as pd

df = pd.read_csv('pgp.csv')
# Rows whose Type field holds more than one value (semicolon-separated).
df_multi_type = df[~pd.isna(df['Type']) & df['Type'].str.contains(';')]
df_multi_type['Type'].count()  # 148 multi-type PGPIDs
df_multi_type[df_multi_type['Library'] == 'CUL']['Type'].count()  # 75 multi-type PGPIDs from CUL

# List of the 148 PGPIDs:
31166
32188

PostgreSQL & other queries in Dataspace

To enter the Postgres command line, you need to be the dspace user (sudo su - dspace). The command is psql. Here is a link to the database diagram for DSpace 5.

It may sometimes be quicker to use the REST API than to write a complicated query. The JRuby DSpace wrapper (documentation) may be simpler as well.

Useful commands:

  • \dt: list all tables
  • \d {TABLE}: describe the given table
  • \copy ({query}) to '{filename}' with CSV HEADER: saves the query results to a CSV file with a header row (example below)
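
For example, to dump part of the item table (table and column names assumed from the DSpace 5 schema; the filename is illustrative):

  \copy (SELECT item_id, last_modified FROM item) to 'items.csv' with CSV HEADER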
kmcelwee / twitter-reply-exception.json
Created March 22, 2021 19:12
A tweet that is a reply but has a null `in_reply_to_status_id` because the original user deleted their tweet. (Twitter Dev conversation: https://twittercommunity.com/t/the-commonly-described-ways-of-determining-whether-a-tweet-is-a-reply-seem-wrong/151579)
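
One heuristic (not from the gist) for flagging such tweets as replies anyway: in extended tweets, leading reply @mentions are excluded from `display_text_range`, so a nonzero start index is a useful signal. A sketch, followed by the raw tweet:

def looks_like_reply(tweet):
    # Normal case: the reply metadata is present.
    if tweet.get('in_reply_to_status_id') is not None:
        return True
    # Fallback: a leading @mention hidden from display_text_range suggests a reply
    # whose parent tweet was deleted (heuristic, assumed rather than documented).
    start = tweet.get('display_text_range', [0, 0])[0]
    return tweet['full_text'].startswith('@') and start > 0
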
{
  "created_at": "Wed Nov 18 19:02:24 +0000 2020",
  "id": 1329137902199005184,
  "id_str": "1329137902199005184",
  "full_text": "@2legit2dunk https://t.co/4lx6Z4wqAp",
  "truncated": false,
  "display_text_range": [
    13,
    36
  ],