Skip to content

Instantly share code, notes, and snippets.

View Shagun-25's full-sized avatar
🎯
Focusing

Shagun Kala Shagun-25

🎯
Focusing
View GitHub Profile
def create_dataset(dataset, look_back=1):
    """Slice a 2-D series array into supervised-learning (X, y) pairs.

    Parameters
    ----------
    dataset : np.ndarray
        2-D array whose column 0 holds the series values
        (shape (n, 1) in the surrounding pipeline — TODO confirm).
    look_back : int
        Number of past steps that form one input window.

    Returns
    -------
    tuple(np.ndarray, np.ndarray)
        X of shape (samples, look_back) and y of the next-step targets.
    """
    data_x, data_y = [], []
    # Fix: the original loop bound was len(dataset) - look_back - 1, an
    # off-by-one that silently dropped the final window/target pair.
    for i in range(len(dataset) - look_back):
        data_x.append(dataset[i:(i + look_back), 0])
        data_y.append(dataset[i + look_back, 0])
    return np.array(data_x), np.array(data_y)
# NOTE(review): this is a Jupyter-notebook cell, not plain Python — the "!"
# shell escape and "%" line magic below only run inside IPython.
!rm -rf ./logs/
keras.backend.clear_session()
%load_ext tensorboard
model = Sequential()
# Adding the input layer: one LSTM layer; the seeded glorot initializer makes
# the weight init reproducible. Assumes X_train has shape
# (samples, timesteps, 1) — defined elsewhere, TODO confirm.
model.add(LSTM(units=48, activation='tanh', kernel_initializer=tf.keras.initializers.glorot_uniform(seed=26), input_shape = (X_train.shape[1], 1)))
# Adding the output layer — NOTE(review): the Dense output layer itself is
# missing from this snippet.
from sklearn.metrics import mean_squared_error
from math import sqrt
# NOTE(review): mean_squared_error / sqrt are unused in this span —
# presumably an RMSE computation follows in the original notebook.
# Re-index the cumulative-sum ARIMA forecasts onto the test set's date index
# so they share an axis with the observed prices below.
predictions = pd.DataFrame(predictions_ARIMA_diff_cumsum.values)
predictions.set_index(test_data.index, inplace = True)
# Overlay train / test / forecast for a visual goodness-of-fit check.
plt.figure(figsize=(12,7))
plt.plot(train_data['price'], label = 'Train Data')
plt.plot(test_data['price'], label = 'Test Data')
plt.plot(predictions, label = 'Predicted Values')
#Adding some new words to Vader Dictionary to judge stock market news better.
# Hand-tuned valence scores (+9 bullish, -9 bearish) for finance terms VADER's
# stock lexicon scores poorly. NOTE(review): 'hiked'/'hikes' are scored -9 —
# confirm that negative polarity for rate hikes is intended.
new_words = {'falls': -9, 'drops': -9, 'rise': 9, 'increases': 9, 'gain': 9, 'hiked': -9, 'dips': -9, 'declines': -9,
'decline': -9, 'hikes': -9, 'jumps': 9, 'lose': -9, 'profit': 9, 'loss': -9, 'shreds': -9, 'sell': -9, 'buy': 9, 'recession': -9,
'rupee weakens': -9, 'record low': -9, 'record high': 9, 'sensex up': 9, 'nifty down': -9, 'sensex down': -9, 'nifty up': 9}
analyser = SentimentIntensityAnalyzer()
# Merge the custom scores into VADER's lexicon (overwrites existing entries).
analyser.lexicon.update(new_words)
# NOTE(review): the loop body is missing from this snippet — presumably it
# scores each tweet/news row with the updated analyser.
for i in tqdm(tweet_news.itertuples()):
#Cleaning the tweets
# Expand common English contractions so VADER scores the full words.
# NOTE(review): indentation and the trailing `return phrase` are missing from
# this scrape — the original function must return the rewritten phrase.
def decontracted(phrase):
# specific
phrase = re.sub(r"won't", "will not", phrase)
phrase = re.sub(r"can\'t", "can not", phrase)
# general
phrase = re.sub(r"n\'t", " not", phrase)
phrase = re.sub(r"\'re", " are", phrase)
#Converting date to proper format
# Map three-letter month abbreviations to zero-padded month numbers.
month_dict = {'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04', 'May': '05', 'Jun': '06', 'Jul': '07', 'Aug': '08',
'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12'}
# Parse the raw date column (i[1]); rows look like "Jan 01, 2020" —
# presumably, judging by the [:-1] comma strip below. TODO confirm.
for i in tqdm(nifty.itertuples()):
date_list = i[1].split()
month = month_dict[date_list[0]]  # "Jan" -> "01"
year = date_list[2]
date = date_list[1][:-1]  # strip the trailing comma from the day token
#Cleaning the tweets
# Second copy of the contraction expander (duplicated across gists).
# NOTE(review): indentation and the trailing `return phrase` are missing from
# this scrape — the original function must return the rewritten phrase.
def decontracted(phrase):
# specific
phrase = re.sub(r"won't", "will not", phrase)
phrase = re.sub(r"can\'t", "can not", phrase)
# general
phrase = re.sub(r"n\'t", " not", phrase)
phrase = re.sub(r"\'re", " are", phrase)
phrase = re.sub(r"\'s", " is", phrase)
#configuration
# Twint search settings: scrape @NDTVProfit's English-language tweets for
# 2015-01-01 .. 2020-01-01 and store them in NDTVProfit.csv.
config = twint.Config()
config.Username = "NDTVProfit"
config.Lang = "en"
# Fix: the start date was written with Unicode en-dashes ("2015–01–01"),
# which twint's date parsing rejects; it must use ASCII hyphens like Until.
config.Since = "2015-01-01"
config.Until = "2020-01-01"
config.Store_csv = True
config.Output = "NDTVProfit.csv"
#running search
#Data is scraped from this url: https://in.investing.com/indices/s-p-cnx-nifty-historical-data?end_date=1577817000&st_date=946665000
url = 'https://in.investing.com/indices/s-p-cnx-nifty-historical-data?end_date=1577817000&st_date=946665000'
# Spoofed browser User-Agent — presumably the site rejects urllib's default
# agent; verify if the request starts failing.
req = Request(url , headers={'User-Agent': 'Mozilla/5.0'})
webpage = urlopen(req).read()
page_soup = soup(webpage, "html.parser")
# Collect every date cell of the historical-data table; the trailing space in
# "col-rowDate " matches the class attribute exactly as rendered on the page.
date_raw = page_soup.find("div", {'class':'common-table-scroller js-table-scroller'}).find_all("td", {"class":"col-rowDate "})
date = [x.text.split("\n")[1] for x in date_raw]
price_raw = []
# NOTE(review): 4972 is a hard-coded row count for this date range — it will
# break if the table size changes; loop body is missing from this snippet.
for i in range(4972):