Kenneth Leung kennethleungty

@kennethleungty
kennethleungty / condo_rental_random_forest.py
Last active January 15, 2021 05:48
Random Forest Regressor Code for Condo Rental Prediction
from sklearn.ensemble import RandomForestRegressor

# Create the parameter grid for GridSearchCV
rf_param_grid = {
    'max_depth': [80, 90, 100],        # Maximum number of levels in each decision tree
    'max_features': [2, 3],            # Maximum number of features considered when splitting a node
    'min_samples_leaf': [1, 3, 4, 5],  # Minimum number of data points allowed in a leaf node
    'n_estimators': [100, 300, 600]    # Number of trees in the forest
}
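A grid like this is typically passed to scikit-learn's GridSearchCV together with the estimator. The sketch below shows that pattern on synthetic data with a deliberately reduced grid so it runs quickly; the data, grid values, and scoring choice are illustrative, not from the original gist:

```python
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Synthetic stand-in for the condo rental features and target
X, y = make_regression(n_samples=100, n_features=5, noise=10.0, random_state=42)

# Reduced grid for illustration; the full grid above is used the same way
param_grid = {
    'max_depth': [3, 5],
    'min_samples_leaf': [1, 3],
    'n_estimators': [50, 100],
}

grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    cv=3,                                     # 3-fold cross-validation
    scoring='neg_root_mean_squared_error',    # Higher (closer to 0) is better
    n_jobs=-1,
)
grid_search.fit(X, y)
print(grid_search.best_params_)
```

After fitting, `grid_search.best_estimator_` is the refitted model with the best parameter combination.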
@kennethleungty
kennethleungty / condo_rental_xgboost_regressor.py
Last active January 15, 2021 05:47
XGBoost Regressor Code for Condo Rental Prediction
import xgboost as xgb

# Set up the XGBoost hyperparameter grid
xgb_param_grid = {
    'learning_rate': [0.05, 0.1, 0.2],        # Step size shrinkage used in updates to prevent overfitting
    'max_depth': [6, 8, 9, 10],               # Maximum depth of a tree
    'min_child_weight': [1, 3, 5, 7],         # Minimum sum of instance weight needed in a child node
    'gamma': [0.0, 0.1, 0.2, 0.3],            # Minimum loss reduction required to make a further partition on a leaf node
    'colsample_bytree': [0.3, 0.4, 0.6, 0.8]  # Fraction of features (columns) sampled for each tree
}
@kennethleungty
kennethleungty / condo_rental_lightgbm_regressor.py
Created January 15, 2021 05:47
LightGBM Regressor Code for Condo Rental Prediction
import lightgbm as lgb

# Set up the LightGBM hyperparameter grid
gbm_param_grid = {
    'metric': ['rmse'],                       # Evaluation metric
    'max_depth': [9, 10, 11, 12, 13],         # Maximum depth of a tree
    'bagging_fraction': [0.8, 0.9, 1],        # Fraction of data sampled (bagged) per iteration
    'feature_fraction': [0.8, 0.9, 1],        # Fraction of features sampled per tree
    'min_data_in_leaf': [20, 50, 80],         # Minimum number of data points in a leaf
    'learning_rate': [0.01, 0.05, 0.1, 0.2]   # Step size shrinkage
}
@kennethleungty
kennethleungty / twitter_api_auth_tweepy.py
Created January 22, 2021 02:40
Twitter API Authentication with Tweepy
import tweepy

# Twitter API credentials (replace with your own)
api_key = 'your_api_key_here'
api_key_secret = 'your_api_key_secret_here'
access_token = 'your_access_token_here'
access_token_secret = 'your_access_token_secret_here'

# Authenticate with OAuth 1.0a and create the API client
auth = tweepy.OAuthHandler(api_key, api_key_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)
@kennethleungty
kennethleungty / tweet_preprocessor.py
Created January 22, 2021 14:59
Twitter Tweet Pre-Processor
# Import tweet-preprocessor package
import preprocessor as p
# Clean tweet text with tweet-preprocessor
tweets_df['text_cleaned'] = tweets_df['text'].apply(lambda x: p.clean(x))
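If the tweet-preprocessor package is not available, the core of what `p.clean` does for this use case (stripping URLs and @mentions) can be approximated with a small regex-based helper. The function name and patterns below are illustrative and not part of the library, which additionally handles hashtags, emojis, smileys, and reserved words:

```python
import re

# Minimal stand-in for tweet-preprocessor's p.clean:
# strips URLs and @mentions, then collapses leftover whitespace
def clean_tweet(text):
    text = re.sub(r'https?://\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)          # Remove @mentions
    return ' '.join(text.split())             # Normalize whitespace

print(clean_tweet('Great view from the condo! @agent https://example.com/listing'))
# → Great view from the condo!
```

Applied the same way as above: `tweets_df['text'].apply(clean_tweet)`.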
@kennethleungty
kennethleungty / twitter_nltk_vader.py
Last active January 24, 2021 05:14
NLTK VADER for Tweets
import nltk
nltk.download('vader_lexicon') # Download the VADER lexicon
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Initialize sentiment intensity analyzer
sia = SentimentIntensityAnalyzer()
# Obtaining NLTK scores
tweets_df['nltk_scores'] = tweets_df['text_cleaned'].apply(lambda x: sia.polarity_scores(x))
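`polarity_scores` returns a dict with `neg`, `neu`, `pos`, and a normalized `compound` score in [-1, 1]. A small helper can turn that dict into a sentiment label using the ±0.05 cut-offs commonly used with VADER; the function name, thresholds, and sample dict below are illustrative:

```python
# Map a VADER scores dict to a sentiment label via the compound score.
# The ±0.05 thresholds follow a common VADER convention, not a hard rule.
def vader_label(scores, neutral_thresh=0.05):
    compound = scores['compound']
    if compound >= neutral_thresh:
        return 'Positive'
    if compound <= -neutral_thresh:
        return 'Negative'
    return 'Neutral'

sample_scores = {'neg': 0.0, 'neu': 0.508, 'pos': 0.492, 'compound': 0.6369}
print(vader_label(sample_scores))  # → Positive
```

With the dataframe above, labels could then be derived via `tweets_df['nltk_scores'].apply(vader_label)`.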
@kennethleungty
kennethleungty / twitter_textblob.py
Last active January 24, 2021 05:13
Textblob for Tweets
from textblob import TextBlob
# Obtain polarity scores generated by TextBlob
tweets_df['textblob_score'] = tweets_df['text_cleaned'].apply(lambda x: TextBlob(x).sentiment.polarity)
# Set threshold to define neutral sentiment
neutral_thresh = 0.05

# Convert polarity score into sentiment categories
tweets_df['textblob_sentiment'] = tweets_df['textblob_score'].apply(
    lambda c: 'Positive' if c >= neutral_thresh
    else ('Negative' if c <= -neutral_thresh else 'Neutral')
)
@kennethleungty
kennethleungty / twitter_corenlp.py
Last active January 23, 2021 05:00
Stanford CoreNLP for Tweets
from pycorenlp import StanfordCoreNLP

# Requires a CoreNLP server running locally on port 9000
nlp = StanfordCoreNLP('http://localhost:9000')

# Function to obtain the sentiment score (value) of the first sentence of a text
def get_sentiment_score(text):
    output = nlp.annotate(text, properties={
        'annotators': 'sentiment',
        'outputFormat': 'json',
        'timeout': 100000
    })
    # sentimentValue is a string from '0' (very negative) to '4' (very positive)
    return output['sentences'][0]['sentimentValue']

corenlp_senti_scores = []
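CoreNLP's `sentimentValue` is a five-point scale returned as a string. A small lookup can convert it into a readable label before appending to `corenlp_senti_scores`; the mapping table and helper name below are illustrative, not part of pycorenlp:

```python
# CoreNLP sentimentValue is a string on a five-point scale.
# This mapping and helper are illustrative, not part of the library.
SENTIMENT_LABELS = {
    '0': 'Very Negative',
    '1': 'Negative',
    '2': 'Neutral',
    '3': 'Positive',
    '4': 'Very Positive',
}

def to_sentiment_label(sentiment_value):
    return SENTIMENT_LABELS[sentiment_value]

print(to_sentiment_label('3'))  # → Positive
```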
@kennethleungty
kennethleungty / twitter_stanza.py
Last active April 17, 2022 14:28
Stanza for Tweets
import stanza

stanza.download('en')
nlp = stanza.Pipeline(lang='en', processors='tokenize,sentiment')

# Obtain the (average) sentiment score generated by Stanza for each tweet;
# each sentence is scored 0 (negative), 1 (neutral) or 2 (positive)
def stanza_analyze(text):
    document = nlp(text)
    sentence_scores = [sentence.sentiment for sentence in document.sentences]
    return sum(sentence_scores) / len(sentence_scores)
import bar_chart_race as bcr

bcr.bar_chart_race(df=bcr_df,           # Input formatted (wide) dataframe
                   n_bars=10,           # Show 10 bars
                   sort='desc',         # Sort in descending order (highest revenue at top)
                   title='Top 10 Fortune 500 (Global) Companies (1995-2020)',
                   filename='Top 10 Fortune 500 (Global) Companies (1995-2020).mp4',
                   period_length=1600,  # Duration (ms) of the animation for each time period
                   bar_label_size=6, tick_label_size=6,
                   steps_per_period=70) # Adjust animation smoothness