## Create a model to estimate the relationship between x and y
## Optimization model = OLS estimator
import numpy as np

def ols(x, y):
    y_ = y.mean()
    x_ = x.mean()
    b1 = np.sum((y - y_) * (x - x_)) / np.sum((x - x_) ** 2)  # slope
    b0 = y_ - b1 * x_                                          # intercept
    return b0, b1
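# A quick check of the estimator on synthetic data (illustrative only; the
# variable names and values below are not from the original gist):
x = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
y = np.array([2.1, 4.0, 6.2, 8.1, 9.9])   # roughly y = 2x
b0, b1 = ols(x, y)
print(b0, b1)                              # intercept close to 0, slope close to 2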
def ent(df, attribute):
    eps = np.finfo(float).eps              # small constant to avoid log(0) and division by zero
    target_variables = df.play.unique()    # all target classes: 'yes' and 'no'
    variables = df[attribute].unique()     # distinct values of the attribute (e.g. 'sunny')
    entropy_attribute = 0
    for variable in variables:
        entropy_each_feature = 0
        for target_variable in target_variables:
            num = len(df[attribute][df[attribute] == variable][df.play == target_variable])  # numerator
            den = len(df[attribute][df[attribute] == variable])                              # denominator
            fraction = num / (den + eps)
            entropy_each_feature += -fraction * np.log2(fraction + eps)
        entropy_attribute += (den / len(df)) * entropy_each_feature  # weight by frequency of this value
    return entropy_attribute
## 1. Calculate the entropy of the whole dataset
entropy_node = 0                           # initialise entropy
values = df.play.unique()                  # unique classes: 'yes', 'no'
for value in values:
    fraction = df.play.value_counts()[value] / len(df.play)
    entropy_node += -fraction * np.log2(fraction)
outlook = 'overcast,overcast,overcast,overcast,rainy,rainy,rainy,rainy,rainy,sunny,sunny,sunny,sunny,sunny'.split(',')
temp = 'hot,cool,mild,hot,mild,cool,cool,mild,mild,hot,hot,mild,cool,mild'.split(',')
humidity = 'high,normal,high,normal,high,normal,normal,normal,high,high,high,high,normal,normal'.split(',')
windy = 'FALSE,TRUE,TRUE,FALSE,FALSE,FALSE,TRUE,FALSE,TRUE,FALSE,TRUE,FALSE,FALSE,TRUE'.split(',')
play = 'yes,yes,yes,yes,yes,yes,no,yes,no,no,no,no,yes,yes'.split(',')
dataset = {'outlook': outlook, 'temp': temp, 'humidity': humidity, 'windy': windy, 'play': play}
df = pd.DataFrame(dataset, columns=['outlook', 'temp', 'humidity', 'windy', 'play'])
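# With the toy weather frame above, the two entropy snippets combine into the
# usual information-gain calculation (an illustrative sketch, not part of the
# original gist; `ent` and `entropy_node` are defined in the snippets above):
for attribute in ['outlook', 'temp', 'humidity', 'windy']:
    info_gain = entropy_node - ent(df, attribute)
    print(attribute, round(info_gain, 3))   # the attribute with the largest gain is split on first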
import tweepy
# Replace API_KEY and API_SECRET with your application's key and secret.
auth = tweepy.AppAuthHandler(API_KEY, API_SECRET)
# wait_on_rate_limit_notify is a tweepy 3.x argument; it was removed in tweepy 4.
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
import sys
import jsonpickle
import os
searchQuery = 'brexit' # this is what we're searching for
maxTweets = 1000 # Some arbitrary large number
tweetsPerQry = 100 # this is the max the API permits
fName = 'tweets.txt' # We'll store the tweets in a text file.
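# A minimal sketch of the download loop implied by the settings above, assuming
# tweepy 3.x (api.search) and standard-search access; not the original gist code:
with open(fName, 'w') as f:
    for tweet in tweepy.Cursor(api.search, q=searchQuery, count=tweetsPerQry).items(maxTweets):
        f.write(jsonpickle.encode(tweet._json, unpicklable=False) + '\n')
print('Saved up to {} tweets to {}'.format(maxTweets, fName))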
import pandas as pd
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
# Pass in column names for each CSV, as the column names are not given in the files;
# they can be checked in the MovieLens 100k README.
# Reading the users file:
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('ml-100k/u.user', sep='|', names=u_cols, encoding='latin-1')
# Reading the pre-split train/test rating files:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings_train = pd.read_csv('ml-100k/ua.base', sep='\t', names=r_cols, encoding='latin-1')
ratings_test = pd.read_csv('ml-100k/ua.test', sep='\t', names=r_cols, encoding='latin-1')
ratings_train.shape, ratings_test.shape
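# The predict() function below expects a dense user-item matrix and a precomputed
# similarity matrix. A sketch of how those could be built (assumed here, not taken
# from the original gist), using cosine similarity from scikit-learn:
from sklearn.metrics.pairwise import cosine_similarity
n_users = ratings_train.user_id.max()      # MovieLens ids run from 1 to n
n_items = ratings_train.movie_id.max()
data_matrix = np.zeros((n_users, n_items))
for row in ratings_train.itertuples():
    data_matrix[row.user_id - 1, row.movie_id - 1] = row.rating
user_similarity = cosine_similarity(data_matrix)      # (n_users, n_users)
item_similarity = cosine_similarity(data_matrix.T)    # (n_items, n_items)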
def predict(ratings, similarity, type='user'):
    if type == 'user':
        # Keep the per-user mean as a column vector so it broadcasts against ratings
        mean_user_rating = ratings.mean(axis=1).reshape(-1, 1)
        ratings_diff = ratings - mean_user_rating
        pred = mean_user_rating + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred
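# Example usage with the matrices sketched above (names assumed, not from the gist):
user_prediction = predict(data_matrix, user_similarity, type='user')
item_prediction = predict(data_matrix, item_similarity, type='item')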