## Create a model to estimate the relationship between x and y
## Optimization model = OLS estimator
import numpy as np

def ols(x, y):
    y_ = y.mean()
    x_ = x.mean()
    b1 = np.sum((y - y_) * (x - x_)) / np.sum((x - x_) ** 2)  # slope
    b0 = y_ - b1 * x_                                          # intercept
    return b0, b1
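# A quick check of the estimator on synthetic data (illustrative only; the
# variable names and values below are not from the original gist):
x = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
y = np.array([2.1, 4.0, 6.2, 8.1, 9.9])   # roughly y = 2x
b0, b1 = ols(x, y)
print(b0, b1)                              # intercept close to 0, slope close to 2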
def ent(df, attribute):
    eps = np.finfo(float).eps              # small constant to avoid log(0) and division by zero
    target_variables = df.play.unique()    # all target classes: 'yes' and 'no'
    variables = df[attribute].unique()     # distinct values of the attribute (e.g. 'sunny')
    entropy_attribute = 0
    for variable in variables:
        entropy_each_feature = 0
        for target_variable in target_variables:
            num = len(df[attribute][df[attribute] == variable][df.play == target_variable])  # numerator
            den = len(df[attribute][df[attribute] == variable])                              # denominator
            fraction = num / (den + eps)
            entropy_each_feature += -fraction * np.log2(fraction + eps)
        entropy_attribute += (den / len(df)) * entropy_each_feature  # weight by frequency of this value
    return entropy_attribute
## 1. Calculate the entropy of the whole dataset
entropy_node = 0                           # initialise entropy
values = df.play.unique()                  # unique classes: 'yes', 'no'
for value in values:
    fraction = df.play.value_counts()[value] / len(df.play)
    entropy_node += -fraction * np.log2(fraction)
outlook = 'overcast,overcast,overcast,overcast,rainy,rainy,rainy,rainy,rainy,sunny,sunny,sunny,sunny,sunny'.split(',')
temp = 'hot,cool,mild,hot,mild,cool,cool,mild,mild,hot,hot,mild,cool,mild'.split(',')
humidity = 'high,normal,high,normal,high,normal,normal,normal,high,high,high,high,normal,normal'.split(',')
windy = 'FALSE,TRUE,TRUE,FALSE,FALSE,FALSE,TRUE,FALSE,TRUE,FALSE,TRUE,FALSE,FALSE,TRUE'.split(',')
play = 'yes,yes,yes,yes,yes,yes,no,yes,no,no,no,no,yes,yes'.split(',')
dataset = {'outlook': outlook, 'temp': temp, 'humidity': humidity, 'windy': windy, 'play': play}
df = pd.DataFrame(dataset, columns=['outlook', 'temp', 'humidity', 'windy', 'play'])
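# With the toy weather frame above, the two entropy snippets combine into the
# usual information-gain calculation (an illustrative sketch, not part of the
# original gist; `ent` and `entropy_node` are defined in the snippets above):
for attribute in ['outlook', 'temp', 'humidity', 'windy']:
    info_gain = entropy_node - ent(df, attribute)
    print(attribute, round(info_gain, 3))   # the attribute with the largest gain is split on first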
import tweepy
# Replace API_KEY and API_SECRET with your application's key and secret.
auth = tweepy.AppAuthHandler(API_KEY, API_SECRET)
# wait_on_rate_limit_notify is a tweepy 3.x argument; it was removed in tweepy 4.
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
import sys
import jsonpickle
import os
searchQuery = 'brexit' # this is what we're searching for
maxTweets = 1000 # Some arbitrary large number
tweetsPerQry = 100 # this is the max the API permits
fName = 'tweets.txt' # We'll store the tweets in a text file.
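# A minimal sketch of the download loop implied by the settings above, assuming
# tweepy 3.x (api.search) and standard-search access; not the original gist code:
with open(fName, 'w') as f:
    for tweet in tweepy.Cursor(api.search, q=searchQuery, count=tweetsPerQry).items(maxTweets):
        f.write(jsonpickle.encode(tweet._json, unpicklable=False) + '\n')
print('Saved up to {} tweets to {}'.format(maxTweets, fName))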
import pandas as pd
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
# Pass in column names for each CSV, as the column names are not given in the files;
# they can be checked in the MovieLens 100k README.
# Reading the users file:
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('ml-100k/u.user', sep='|', names=u_cols, encoding='latin-1')
# Reading the pre-split train/test rating files:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings_train = pd.read_csv('ml-100k/ua.base', sep='\t', names=r_cols, encoding='latin-1')
ratings_test = pd.read_csv('ml-100k/ua.test', sep='\t', names=r_cols, encoding='latin-1')
ratings_train.shape, ratings_test.shape
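# The predict() function below expects a dense user-item matrix and a precomputed
# similarity matrix. A sketch of how those could be built (assumed here, not taken
# from the original gist), using cosine similarity from scikit-learn:
from sklearn.metrics.pairwise import cosine_similarity
n_users = ratings_train.user_id.max()      # MovieLens ids run from 1 to n
n_items = ratings_train.movie_id.max()
data_matrix = np.zeros((n_users, n_items))
for row in ratings_train.itertuples():
    data_matrix[row.user_id - 1, row.movie_id - 1] = row.rating
user_similarity = cosine_similarity(data_matrix)      # (n_users, n_users)
item_similarity = cosine_similarity(data_matrix.T)    # (n_items, n_items)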
def predict(ratings, similarity, type='user'):
    if type == 'user':
        # Keep the per-user mean as a column vector so it broadcasts against ratings
        mean_user_rating = ratings.mean(axis=1).reshape(-1, 1)
        ratings_diff = ratings - mean_user_rating
        pred = mean_user_rating + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred
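# Example usage with the matrices sketched above (names assumed, not from the gist):
user_prediction = predict(data_matrix, user_similarity, type='user')
item_prediction = predict(data_matrix, item_similarity, type='item')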