muhammadyaseen/vector.s

## vector.s
#	Linear Equations Solver using Guassian-Jordan elimination
#	Syed Usama Jamil	CS-037
#	Muhammad Yaseen		CS-050
#	Project ID		CA-03
#===========================================================================

.data

dimension	:	.byte 4

#matrix		:	.float	1.0, 1.0, 2.0, 9.0, 2.0, 4.0, -3.0, 1.0, 3.0, 6.0, -5.0, 0.0		#pg 7 x=1,y=2,z=3

matrix		:	.float	1.0, 2.0, 3.0, 4.0, 5.0, 2.0, 7.0, 8.0, 9.0, 10.0, 1.0, 12.0, 2.0, 14.0, 5.0, 5.0, 17.0, 18.0, 19.0, 2.0		#pg 7 x=1,y=2,z=3

#matrix		:	.float	2.0, 1.0, 3.0, 1.0, 2.0, 6.0, 8.0, 3.0, 6.0, 8.0, 18.0, 5.0		#pg 7 x=3/10,y=2/5,z=0

#matrix		:	.float	1.0, -2.0, 3.0, 7.0, 2.0, 1.0, 1.0, 4.0, -3.0, 2.0, -2.0, -10.0		#pg 7 x=2,y=-1,z=1

results		: 	.space	1000		#reserve 1000 bytes

result  	: 	.asciiz "Solution is : "

newline		:	.asciiz "\n"

zero		:	.float	0.0

.text

main:

#get the address of input matrix system
la $t0, matrix

#get the dimension ( NxN ) of matrix
lw $t1, dimension

addi $s6,$t1,1			#for offset calc

#Loop control variable, goes from 1 to n, named 'j' in c code
li $t3, 0

#loop for generation of triangular matrix

gen_tri_mat:

	addi $t3, $t3, 1					#increment control var

	bgt $t3, $t1, end_gen_tri_mat		#if current_dim > dim, then we end loop

	li $t2, 0 							#control var for inner_Loop1,, named 'i' in c code

	inner_Loop1:						#goes from 1 to n

		addi $t2, $t2, 1				#increment control var

		bgt $t2, $t1, end_inner_Loop1

		sgt	$t4, $t2, $t3				#Set $t5 if i > j i.e if $t3 > $t2

		beq $t4,$0, inner_Loop1			# we want $t4 to be 1, i.e. NOT equal to zero.. designated ' if (i>j)' in c code

		#if we reach here , it means that if condition was true i.e i>j is true

		# c  = A[i][j] / A[j][j] or A[t2][t3] / A[t3][t3]

		# load A[i][j], i = $t2; j = $t3

		#now we load the Matrix element A[row][col]

		#===============================================================================
		#generic formula (for NxN matrix) for offset value is : N*s*(row-1) + s*(col-1)
		# where
		# N is the matrix dimension
		# s is the size of ONE element of matrix e.g 4 bytes for int, 1 byte for char
		#===============================================================================

		# in code we have row = $t2, col = $t3

		#calc (row - 1)
		addi $t4, $t2, -1

		#calc (col - 1)
		addi $t5, $t3, -1

		# s = 4 bytes for IEEE 754 Single Precision Floating Point
		# calc s*(col-1)
		li $t6, 4

		mul $t5, $t5, $t6

		#calc s*(row-1)
		mul $t4, $t4,$t6

		#calc N*s*(row-1)

		mul $t4, $t4,$s6

		#add the two products
		add $t7, $t5,$t4

		#now $t7 has final offset value to be added to the array starting
		#Let add the value to start address

		add $t7,$t7,$t0

		#now t7 has correct mem address of element A[r][c], Let's laod it -- A[i][j] loaded
		l.s $f0, 0($t7)


		## reset all used intermediate registers so that the next address could be calculated.

		li $t4, 0
		li $t5, 0
		li $t7, 0

		#===========================================================================
		# load A[j][j]	; Here j = t3					#
		#===========================================================================

		#calc (row - 1)
		addi $t4, $t3, -1

		#calc (col - 1)
		addi $t5, $t3, -1

		# s = 4 bytes for IEEE 754 Single Precision Floating Point
		# calc s*(col-1)
		li $t6, 4

		mul $t5, $t5, $t6

		#calc s*(row-1)
		mul $t4, $t4,$t6

		#calc N*s*(row-1)
		mul $t4, $t4,$s6

		#add the two products
		add $t7, $t5,$t4

		#now $t7 has final offset value to be added to the array starting
		#Let add the value to start address

		add $t7,$t7,$t0

		#now t7 has correct mem address of element A[r][c], Let's laod it -- A[j][j] loaded
		l.s $f1, 0($t7)

		## reset all used intermediate registers so that the next address could be calculated.

		li $t4, 0
		li $t5, 0
		li $t7, 0

		#==============================================================
		# Now $f0 = A[i][j]   AND $f1 = A[j][j] , we need to divide now
		#==============================================================

		div.s $f3, $f0, $f1		#this is our 'C'  !!! Causing NaN

		# second inner loop , goes from k=1 to n+1

		addi $s0,$t1, 1		#set s0 to n+1

		li $s1, 0		#loop control var
		inner_Loop2:

			addi $s1, $s1, 1

			bgt $s1,$s0, end_inner_Loop2		#compare with n+1

			# A[i][k]=A[i][k]-c*A[j][k];
			#tast breakdown
			# 1. Load A[i][k] and A[j][k]
			# 2. calc 	C*A[j][k]
			# 3. calc 	x = A[i][k]-c*A[j][k]
			# 4. store x on A[i][k] ... for this we need Address of A[i][k]th element

			# task 1 : Load A[i][k]  ; i = t2 k = s1

			#calc (row - 1)
			addi $t4, $t2, -1

			#calc (col - 1)
			addi $t5, $s1, -1

			# s = 4 bytes for IEEE 754 Single Precision Floating Point
			# calc s*(col-1)
			li $t6, 4

			mul $t5, $t5, $t6

			#calc s*(row-1)
			mul $t4, $t4,$t6

			#calc N*s*(row-1)
			mul $t4, $t4,$s6

			#add the two products
			add $t7, $t5,$t4

			#now $t7 has final offset value to be added to the array starting
			#Let add the value to start address

			add $t7,$t7,$t0

			#now t7 has correct mem address of element A[r][c], Let's laod it
			l.s $f4, 0($t7)

			#save address of A[i][k], bcz we have to store it back
			move $s7, $t7

			## reset all used intermediate registers so that the next address could be calculated.

			li $t4, 0
			li $t5, 0
			li $t7, 0

		 	#==== loaded A[i][k]  ===

		 	# load A[j][k]

		 	#calc (row - 1)
			addi $t4, $t3, -1

			#calc (col - 1)
			addi $t5, $s1, -1

			# s = 4 bytes for IEEE 754 Single Precision Floating Point
			# calc s*(col-1)
			li $t6, 4

			mul $t5, $t5, $t6

			#calc s*(row-1)
			mul $t4, $t4,$t6

			#calc N*s*(row-1)
			mul $t4, $t4,$s6

			#add the two products
			add $t7, $t5,$t4

			#now $t7 has final offset value to be added to the array starting
			#Let add the value to start address

			add $t7,$t7,$t0

			#now t7 has correct mem address of element A[r][c], Let's laod it
			l.s $f5, 0($t7)

			## reset all used intermediate registers so that the next address could be calculated.

			li $t4, 0
			li $t5, 0
			li $t7, 0


			# now $f4 = A[i][k] and $f5 = A[j][k]

			# 2. calc 	C*A[j][k]

		 	#mul.s	$f6, $f4, $f5			#possible error : $f4 shoud be $f3
			mul.s	$f6, $f3, $f5			#attempt at rectifying error

		 	# 3. calc 	x = A[i][k]-c*A[j][k]

		 	sub.s  $f7, $f4,$f6

		 	# 4. store x on A[i][k] ... for this we need Address of A[i][k]th element. s7 has addr of A[i][k]

		 	s.s $f7, 0($s7)

			b inner_Loop2

		end_inner_Loop2:

		b inner_Loop1

	end_inner_Loop1:

	b gen_tri_mat

end_gen_tri_mat:

## reset all used intermediate registers so that the next address could be calculated.
## (added this as precaution, following 3 lines were NOT present in initial attempt
li $t4, 0
li $t5, 0
li $t7, 0

# Now we're done with generation of Upper triangular matrix, it's time to substitute back in!
# Back substitution code

# === START code for caclulating X[n] =====

# calc x[n]=A[n][n+1]/A[n][n], where n is the dimension

# load A[n][n]  ... n is in t1

#calc (row - 1)
addi $t4, $t1, -1

#calc (col - 1)
addi $t5, $t1, -1

# s = 4 bytes for IEEE 754 Single Precision Floating Point
# calc s*(col-1)
li $t6, 4

mul $t5, $t5, $t6

#calc s*(row-1)
mul $t4, $t4,$t6

#calc N*s*(row-1)
mul $t4, $t4,$s6

#add the two products
add $t7, $t5,$t4

#now $t7 has final offset value to be added to the array starting
#Let add the value to start address

add $t7,$t7,$t0

#now t7 has correct mem address of element A[r][c], Let's laod it
l.s $f9, 0($t7)

## reset all used intermediate registers so that the next address could be calculated.

li $t4, 0
li $t5, 0
li $t7, 0

#========================
# load A[n][n+1]
#========================

addi $s2, $t1, 1			#set s2 to n+1

#calc (row - 1)
addi $t4, $t1, -1

#calc (col - 1)
addi $t5, $s2, -1

# s = 4 bytes for IEEE 754 Single Precision Floating Point
# calc s*(col-1)
li $t6, 4

mul $t5, $t5, $t6

#calc s*(row-1)
mul $t4, $t4,$t6

#calc N*s*(row-1)
mul $t4, $t4,$s6

#add the two products
add $t7, $t5,$t4

#now $t7 has final offset value to be added to the array starting
#Let add the value to start address

add $t7,$t7,$t0

#now t7 has correct mem address of element A[r][c], Let's laod it
l.s $f10, 0($t7)

## reset all used intermediate registers so that the next address could be calculated.

li $t4, 0
li $t5, 0
li $t7, 0

#perform division (this ONLY performs the division op, but DOESNOT store result back in x[n]
div.s $f11, $f10, $f9

# Let $t9 be the starting addr of result vector
la $t9, results

#offset = s*(i-1)
#for this step i = n = dimension = $t1
add $t8, $0, $t1
addi $t8, $t8, -1
mul $t8, $t8, 4

#add offset to starting addr of results vector
add $t8, $t8, $t9

#store X[n]
s.s $f11,0($t8)

#reset t8
li $t8, 0


# === END code for caclulating X[n] =====

## Usama Work Start

# backward substitution


#reset two control vars , this clears values which COULD be present

li $t2, 0
li $t3, 0

#Loop control variable, goes from n-1 to 1, named 'i' in c code
addi $t2,$t1,0

#loop for backward substitution

back_sub:

	addi $t2, $t2, -1					#decrement control var

	blt $t2, 1, end_back_sub			#if loop control variable is less than 1, then we end loop

	# j = i + 1,   $t3 = $t2 + 1
	li $t3, 0

	addi $t3, $t2, 0 					#control var for inner_Loop1,, named 'j' in c code

	#li $f15,0							#will contain sum of sum <-- sum+A[i][j]*x[j]
	#add.s $f15,$0,$0
	#commented above two lines, assume $f15 has 0 already

	#sum = 0
	l.s $f15, zero

	inner_Loop_Us:							#goes from i+1 to n

		addi $t3, $t3, 1				#increment control var

		bgt $t3, $t1, end_inner_Loop_Us


		# sum=sum+A[i][j]*x[j] or A[t2][t3]*x[t3]

		# load A[i][j], i = $t2; j = $t3

		#now we load the Matrix element A[row][col]

		#===============================================================================
		#generic formula (for NxN matrix) for offset value is : N*s*(row-1) + s*(col-1)
		# where
		# N is the matrix dimension
		# s is the size of ONE element of matrix e.g 4 bytes for int, 1 byte for char
		#===============================================================================

		# in code we have row = $t2, col = $t3

		#calc (row - 1)
		addi $t4, $t2, -1

		#calc (col - 1)
		addi $t5, $t3, -1

		# s = 4 bytes for IEEE 754 Single Precision Floating Point
		# calc s*(col-1)
		li $t6, 4

		mul $t5, $t5, $t6

		#calc s*(row-1)
		mul $t4, $t4,$t6

		#calc N*s*(row-1)
		mul $t4, $t4,$s6

		#add the two products
		add $t7, $t5,$t4

		#now $t7 has final offset value to be added to the array starting
		#Let add the value to start address

		add $t7,$t7,$t0

		#now t7 has correct mem address of element A[r][c], Let's laod it -- A[i][j] loaded
		l.s $f20, 0($t7)


		## reset all used intermediate registers so that the next address could be calculated.

		li $t4, 0
		li $t5, 0
		li $t7, 0

		#===========================================================================
		# multiply A[i][j]*x[j]	; Here j = t3					#
		#===========================================================================

		#load x[j]

		# Let $t9 be the starting addr of result vector
		la $t9, results

		#4 bytes = 1 SP FP num
		li $t8, 4

		#offset = size*number
		mul $t8, $t8, $s6

		#add offset to starting addr of results vector
		add $t8, $t8, $t9

		l.s $f13,0($t8)					# load address of $t3 in $f13

		# ======================= TEST CODE FOR x[j] loading =========================

		# Let $t9 be the starting addr of result vector
		la $t9, results

		#offset = s*(j-1)
		#for this step j = $t3
		add $t8, $0, $t3
		addi $t8, $t8, -1
		mul $t8, $t8, 4

		#add offset to starting addr of results vector
		add $t8, $t8, $t9

		#load X[n]
		l.s $f13,0($t8)

		#reset t8
		li $t8, 0
		# ======================== end test code =====================================

		mul.s $f14,$f20,$f13				# $s2 stores the product of two

		# now performing 'sum=sum+A[i][j]*x[j]' where sum is represented by $s0

		add.s $f15,$f15,$f14

		#reset t8
		li $t8, 0

		b inner_Loop_Us

	end_inner_Loop_Us:				#inner_loop ends


	# TODO: x[i]=(A[i][n+1]-sum)/A[i][i],
	# where n is the dimension
	# i = t2
	# sum = $s2

	#x[i]=(A[i][n+1]-sum)/A[i][i];
	#tast breakdown
	# 1. Load A[i][i] and A[i][n+1]
	# 2. calc 	(A[i][n+1]-sum)
	# 3. calc 	x[i] = (A[i][n+1]-sum)/A[i][i]


	# task 1 : Load A[i][i] and A[i][n+1]  ; i = t2

	#========================
	# load A[i][i]
	#========================


	#calc (row - 1)
	addi $t4, $t2, -1

	#calc (col - 1)
	addi $t5, $t2, -1

	# s = 4 bytes for IEEE 754 Single Precision Floating Point
	# calc s*(col-1)
	li $t6, 4

	mul $t5, $t5, $t6

	#calc s*(row-1)
	mul $t4, $t4,$t6

	#calc N*s*(row-1)
	mul $t4, $t4,$s6

	#add the two products
	add $t7, $t5,$t4

	#now $t7 has final offset value to be added to the array starting
	#Let add the value to start address

	add $t7,$t7,$t0

	#now t7 has correct mem address of element A[r][c], Let's laod it
	l.s $f16, 0($t7)

	## reset all used intermediate registers so that the next address could be calculated.

	li $t4, 0
	li $t5, 0
	li $t7, 0

	#========================
	# load A[i][n+1]
	#========================

	addi $s2, $t1, 1			#set s2 to n+1

	#calc (row - 1)
	addi $t4, $t2, -1

	#calc (col - 1)
	addi $t5, $s2, -1

	# s = 4 bytes for IEEE 754 Single Precision Floating Point
	# calc s*(col-1)
	li $t6, 4

	mul $t5, $t5, $t6

	#calc s*(row-1)
	mul $t4, $t4,$t6

	#calc N*s*(row-1)
	mul $t4, $t4,$s6

	#add the two products
	add $t7, $t5,$t4

	#now $t7 has final offset value to be added to the array starting
	#Let add the value to start address

	add $t7,$t7,$t0

	#now t7 has correct mem address of element A[r][c], Let's laod it
	l.s $f17, 0($t7)

	## reset all used intermediate registers so that the next address could be calculated.

	li $t4, 0
	li $t5, 0
	li $t7, 0

	# task 2 : calc 	(A[i][n+1]-sum)

	# sum = f15

	sub.s $f18,$f17,$f15


	# task 3 : calc 	x[i] = (A[i][n+1]-sum)/A[i][i]

	#perform division, f19 has x[i]
	div.s $f19, $f18, $f16

	# ======================= TEST CODE FOR x[i] SAVING =========================

	# Let $t9 be the starting addr of result vector
	la $t9, results

	#offset = s*(i-1)
	#for this step i = $t2
	add $t8, $0, $t2
	addi $t8, $t8, -1
	mul $t8, $t8, 4

	#add offset to starting addr of results vector
	add $t8, $t8, $t9

	#store X[n]
	s.s $f19,0($t8)

	#reset t8
	li $t8, 0
	# ======================== end test code =====================================

	#prints out statement
	#li $v0, 4
	#la $a0, result
	#syscall

	##MY added lines
	#li $v0, 2

	# print
	#mov.s $f12,$f19
	#syscall

	##MY added lines end

	b back_sub

end_back_sub:

# Now finally printing the results

# output prompt

li $v0, 4
la $a0, result						#prints out statement
syscall

#print new line

li $v0, 4
la $a0, newline
syscall

#printing results loop

li $t2, 0 						#control var for inner_Loop1,, named 'i' in c code

Output_Loop:						#goes from 1 to n

	addi $t2, $t2, 1				#increment control var

	bgt $t2, $t1, End_Output_Loop

	# print result
	# ======================= TEST CODE FOR x[j] loading =========================

	# Let $t9 be the starting addr of result vector
	la $t9, results

	#offset = s*(j-1)
	#for this step j = $t2
	add $t8, $0, $t2
	addi $t8, $t8, -1
	mul $t8, $t8, 4

	#add offset to starting addr of results vector
	add $t8, $t8, $t9

	#load X[n]
	l.s $f22,0($t8)

	li $v0, 2
	mov.s $f12, $f22
	syscall

	#reset t8
	li $t8, 0
	# ======================== end test code =====================================

	#print new line

	li $v0, 4
	la $a0, newline
	syscall

	b Output_Loop

End_Output_Loop:

jr $ra
	# Linear Equations Solver using Guassian-Jordan elimination
	# Syed Usama Jamil CS-037
	# Muhammad Yaseen CS-050
	# Project ID CA-03
	#===========================================================================

	.data

	dimension : .byte 4

	#matrix : .float 1.0, 1.0, 2.0, 9.0, 2.0, 4.0, -3.0, 1.0, 3.0, 6.0, -5.0, 0.0 #pg 7 x=1,y=2,z=3

	matrix : .float 1.0, 2.0, 3.0, 4.0, 5.0, 2.0, 7.0, 8.0, 9.0, 10.0, 1.0, 12.0, 2.0, 14.0, 5.0, 5.0, 17.0, 18.0, 19.0, 2.0 #pg 7 x=1,y=2,z=3

	#matrix : .float 2.0, 1.0, 3.0, 1.0, 2.0, 6.0, 8.0, 3.0, 6.0, 8.0, 18.0, 5.0 #pg 7 x=3/10,y=2/5,z=0

	#matrix : .float 1.0, -2.0, 3.0, 7.0, 2.0, 1.0, 1.0, 4.0, -3.0, 2.0, -2.0, -10.0 #pg 7 x=2,y=-1,z=1

	results : .space 1000 #reserve 1000 bytes

	result : .asciiz "Solution is : "

	newline : .asciiz "\n"

	zero : .float 0.0

	.text

	main:

	#get the address of input matrix system
	la $t0, matrix

	#get the dimension ( NxN ) of matrix
	lw $t1, dimension

	addi $s6,$t1,1 #for offset calc

	#Loop control variable, goes from 1 to n, named 'j' in c code
	li $t3, 0

	#loop for generation of triangular matrix

	gen_tri_mat:

	addi $t3, $t3, 1 #increment control var

	bgt $t3, $t1, end_gen_tri_mat #if current_dim > dim, then we end loop

	li $t2, 0 #control var for inner_Loop1,, named 'i' in c code

	inner_Loop1: #goes from 1 to n

	addi $t2, $t2, 1 #increment control var

	bgt $t2, $t1, end_inner_Loop1

	sgt $t4, $t2, $t3 #Set $t5 if i > j i.e if $t3 > $t2

	beq $t4,$0, inner_Loop1 # we want $t4 to be 1, i.e. NOT equal to zero.. designated ' if (i>j)' in c code

	#if we reach here , it means that if condition was true i.e i>j is true

	# c = A[i][j] / A[j][j] or A[t2][t3] / A[t3][t3]

	# load A[i][j], i = $t2; j = $t3

	#now we load the Matrix element A[row][col]

	#===============================================================================
	#generic formula (for NxN matrix) for offset value is : Ns(row-1) + s*(col-1)
	# where
	# N is the matrix dimension
	# s is the size of ONE element of matrix e.g 4 bytes for int, 1 byte for char
	#===============================================================================

	# in code we have row = $t2, col = $t3

	#calc (row - 1)
	addi $t4, $t2, -1

	#calc (col - 1)
	addi $t5, $t3, -1

	# s = 4 bytes for IEEE 754 Single Precision Floating Point
	# calc s*(col-1)
	li $t6, 4

	mul $t5, $t5, $t6

	#calc s*(row-1)
	mul $t4, $t4,$t6

	#calc Ns(row-1)

	mul $t4, $t4,$s6

	#add the two products
	add $t7, $t5,$t4

	#now $t7 has final offset value to be added to the array starting
	#Let add the value to start address

	add $t7,$t7,$t0

	#now t7 has correct mem address of element A[r][c], Let's laod it -- A[i][j] loaded
	l.s $f0, 0($t7)


	## reset all used intermediate registers so that the next address could be calculated.

	li $t4, 0
	li $t5, 0
	li $t7, 0

	#===========================================================================
	# load A[j][j] ; Here j = t3 #
	#===========================================================================

	#calc (row - 1)
	addi $t4, $t3, -1

	#calc (col - 1)
	addi $t5, $t3, -1

	# s = 4 bytes for IEEE 754 Single Precision Floating Point
	# calc s*(col-1)
	li $t6, 4

	mul $t5, $t5, $t6

	#calc s*(row-1)
	mul $t4, $t4,$t6

	#calc Ns(row-1)
	mul $t4, $t4,$s6

	#add the two products
	add $t7, $t5,$t4

	#now $t7 has final offset value to be added to the array starting
	#Let add the value to start address

	add $t7,$t7,$t0

	#now t7 has correct mem address of element A[r][c], Let's laod it -- A[j][j] loaded
	l.s $f1, 0($t7)

	## reset all used intermediate registers so that the next address could be calculated.

	li $t4, 0
	li $t5, 0
	li $t7, 0

	#==============================================================
	# Now $f0 = A[i][j] AND $f1 = A[j][j] , we need to divide now
	#==============================================================

	div.s $f3, $f0, $f1 #this is our 'C' !!! Causing NaN

	# second inner loop , goes from k=1 to n+1

	addi $s0,$t1, 1 #set s0 to n+1

	li $s1, 0 #loop control var
	inner_Loop2:

	addi $s1, $s1, 1

	bgt $s1,$s0, end_inner_Loop2 #compare with n+1

	# A[i][k]=A[i][k]-c*A[j][k];
	#tast breakdown
	# 1. Load A[i][k] and A[j][k]
	# 2. calc C*A[j][k]
	# 3. calc x = A[i][k]-c*A[j][k]
	# 4. store x on A[i][k] ... for this we need Address of A[i][k]th element

	# task 1 : Load A[i][k] ; i = t2 k = s1

	#calc (row - 1)
	addi $t4, $t2, -1

	#calc (col - 1)
	addi $t5, $s1, -1

	# s = 4 bytes for IEEE 754 Single Precision Floating Point
	# calc s*(col-1)
	li $t6, 4

	mul $t5, $t5, $t6

	#calc s*(row-1)
	mul $t4, $t4,$t6

	#calc Ns(row-1)
	mul $t4, $t4,$s6

	#add the two products
	add $t7, $t5,$t4

	#now $t7 has final offset value to be added to the array starting
	#Let add the value to start address

	add $t7,$t7,$t0

	#now t7 has correct mem address of element A[r][c], Let's laod it
	l.s $f4, 0($t7)

	#save address of A[i][k], bcz we have to store it back
	move $s7, $t7

	## reset all used intermediate registers so that the next address could be calculated.

	li $t4, 0
	li $t5, 0
	li $t7, 0

	#==== loaded A[i][k] ===

	# load A[j][k]

	#calc (row - 1)
	addi $t4, $t3, -1

	#calc (col - 1)
	addi $t5, $s1, -1

	# s = 4 bytes for IEEE 754 Single Precision Floating Point
	# calc s*(col-1)
	li $t6, 4

	mul $t5, $t5, $t6

	#calc s*(row-1)
	mul $t4, $t4,$t6

	#calc Ns(row-1)
	mul $t4, $t4,$s6

	#add the two products
	add $t7, $t5,$t4

	#now $t7 has final offset value to be added to the array starting
	#Let add the value to start address

	add $t7,$t7,$t0

	#now t7 has correct mem address of element A[r][c], Let's laod it
	l.s $f5, 0($t7)

	## reset all used intermediate registers so that the next address could be calculated.

	li $t4, 0
	li $t5, 0
	li $t7, 0


	# now $f4 = A[i][k] and $f5 = A[j][k]

	# 2. calc C*A[j][k]

	#mul.s $f6, $f4, $f5 #possible error : $f4 shoud be $f3
	mul.s $f6, $f3, $f5 #attempt at rectifying error

	# 3. calc x = A[i][k]-c*A[j][k]

	sub.s $f7, $f4,$f6

	# 4. store x on A[i][k] ... for this we need Address of A[i][k]th element. s7 has addr of A[i][k]

	s.s $f7, 0($s7)

	b inner_Loop2

	end_inner_Loop2:

	b inner_Loop1

	end_inner_Loop1:

	b gen_tri_mat

	end_gen_tri_mat:

	## reset all used intermediate registers so that the next address could be calculated.
	## (added this as precaution, following 3 lines were NOT present in initial attempt
	li $t4, 0
	li $t5, 0
	li $t7, 0

	# Now we're done with generation of Upper triangular matrix, it's time to substitute back in!
	# Back substitution code

	# === START code for caclulating X[n] =====

	# calc x[n]=A[n][n+1]/A[n][n], where n is the dimension

	# load A[n][n] ... n is in t1

	#calc (row - 1)
	addi $t4, $t1, -1

	#calc (col - 1)
	addi $t5, $t1, -1

	# s = 4 bytes for IEEE 754 Single Precision Floating Point
	# calc s*(col-1)
	li $t6, 4

	mul $t5, $t5, $t6

	#calc s*(row-1)
	mul $t4, $t4,$t6

	#calc Ns(row-1)
	mul $t4, $t4,$s6

	#add the two products
	add $t7, $t5,$t4

	#now $t7 has final offset value to be added to the array starting
	#Let add the value to start address

	add $t7,$t7,$t0

	#now t7 has correct mem address of element A[r][c], Let's laod it
	l.s $f9, 0($t7)

	## reset all used intermediate registers so that the next address could be calculated.

	li $t4, 0
	li $t5, 0
	li $t7, 0

	#========================
	# load A[n][n+1]
	#========================

	addi $s2, $t1, 1 #set s2 to n+1

	#calc (row - 1)
	addi $t4, $t1, -1

	#calc (col - 1)
	addi $t5, $s2, -1

	# s = 4 bytes for IEEE 754 Single Precision Floating Point
	# calc s*(col-1)
	li $t6, 4

	mul $t5, $t5, $t6

	#calc s*(row-1)
	mul $t4, $t4,$t6

	#calc Ns(row-1)
	mul $t4, $t4,$s6

	#add the two products
	add $t7, $t5,$t4

	#now $t7 has final offset value to be added to the array starting
	#Let add the value to start address

	add $t7,$t7,$t0

	#now t7 has correct mem address of element A[r][c], Let's laod it
	l.s $f10, 0($t7)

	## reset all used intermediate registers so that the next address could be calculated.

	li $t4, 0
	li $t5, 0
	li $t7, 0

	#perform division (this ONLY performs the division op, but DOESNOT store result back in x[n]
	div.s $f11, $f10, $f9

	# Let $t9 be the starting addr of result vector
	la $t9, results

	#offset = s*(i-1)
	#for this step i = n = dimension = $t1
	add $t8, $0, $t1
	addi $t8, $t8, -1
	mul $t8, $t8, 4

	#add offset to starting addr of results vector
	add $t8, $t8, $t9

	#store X[n]
	s.s $f11,0($t8)

	#reset t8
	li $t8, 0


	# === END code for caclulating X[n] =====

	## Usama Work Start

	# backward substitution


	#reset two control vars , this clears values which COULD be present

	li $t2, 0
	li $t3, 0

	#Loop control variable, goes from n-1 to 1, named 'i' in c code
	addi $t2,$t1,0

	#loop for backward substitution

	back_sub:

	addi $t2, $t2, -1 #decrement control var

	blt $t2, 1, end_back_sub #if loop control variable is less than 1, then we end loop

	# j = i + 1, $t3 = $t2 + 1
	li $t3, 0

	addi $t3, $t2, 0 #control var for inner_Loop1,, named 'j' in c code

	#li $f15,0 #will contain sum of sum <-- sum+A[i][j]*x[j]
	#add.s $f15,$0,$0
	#commented above two lines, assume $f15 has 0 already

	#sum = 0
	l.s $f15, zero

	inner_Loop_Us: #goes from i+1 to n

	addi $t3, $t3, 1 #increment control var

	bgt $t3, $t1, end_inner_Loop_Us


	# sum=sum+A[i][j]x[j] or A[t2][t3]x[t3]

	# load A[i][j], i = $t2; j = $t3

	#now we load the Matrix element A[row][col]

	#===============================================================================
	#generic formula (for NxN matrix) for offset value is : Ns(row-1) + s*(col-1)
	# where
	# N is the matrix dimension
	# s is the size of ONE element of matrix e.g 4 bytes for int, 1 byte for char
	#===============================================================================

	# in code we have row = $t2, col = $t3

	#calc (row - 1)
	addi $t4, $t2, -1

	#calc (col - 1)
	addi $t5, $t3, -1

	# s = 4 bytes for IEEE 754 Single Precision Floating Point
	# calc s*(col-1)
	li $t6, 4

	mul $t5, $t5, $t6

	#calc s*(row-1)
	mul $t4, $t4,$t6

	#calc Ns(row-1)
	mul $t4, $t4,$s6

	#add the two products
	add $t7, $t5,$t4

	#now $t7 has final offset value to be added to the array starting
	#Let add the value to start address

	add $t7,$t7,$t0

	#now t7 has correct mem address of element A[r][c], Let's laod it -- A[i][j] loaded
	l.s $f20, 0($t7)


	## reset all used intermediate registers so that the next address could be calculated.

	li $t4, 0
	li $t5, 0
	li $t7, 0

	#===========================================================================
	# multiply A[i][j]*x[j] ; Here j = t3 #
	#===========================================================================

	#load x[j]

	# Let $t9 be the starting addr of result vector
	la $t9, results

	#4 bytes = 1 SP FP num
	li $t8, 4

	#offset = size*number
	mul $t8, $t8, $s6

	#add offset to starting addr of results vector
	add $t8, $t8, $t9

	l.s $f13,0($t8) # load address of $t3 in $f13

	# ======================= TEST CODE FOR x[j] loading =========================

	# Let $t9 be the starting addr of result vector
	la $t9, results

	#offset = s*(j-1)
	#for this step j = $t3
	add $t8, $0, $t3
	addi $t8, $t8, -1
	mul $t8, $t8, 4

	#add offset to starting addr of results vector
	add $t8, $t8, $t9

	#load X[n]
	l.s $f13,0($t8)

	#reset t8
	li $t8, 0
	# ======================== end test code =====================================

	mul.s $f14,$f20,$f13 # $s2 stores the product of two

	# now performing 'sum=sum+A[i][j]*x[j]' where sum is represented by $s0

	add.s $f15,$f15,$f14

	#reset t8
	li $t8, 0

	b inner_Loop_Us

	end_inner_Loop_Us: #inner_loop ends


	# TODO: x[i]=(A[i][n+1]-sum)/A[i][i],
	# where n is the dimension
	# i = t2
	# sum = $s2

	#x[i]=(A[i][n+1]-sum)/A[i][i];
	#tast breakdown
	# 1. Load A[i][i] and A[i][n+1]
	# 2. calc (A[i][n+1]-sum)
	# 3. calc x[i] = (A[i][n+1]-sum)/A[i][i]


	# task 1 : Load A[i][i] and A[i][n+1] ; i = t2

	#========================
	# load A[i][i]
	#========================


	#calc (row - 1)
	addi $t4, $t2, -1

	#calc (col - 1)
	addi $t5, $t2, -1

	# s = 4 bytes for IEEE 754 Single Precision Floating Point
	# calc s*(col-1)
	li $t6, 4

	mul $t5, $t5, $t6

	#calc s*(row-1)
	mul $t4, $t4,$t6

	#calc Ns(row-1)
	mul $t4, $t4,$s6

	#add the two products
	add $t7, $t5,$t4

	#now $t7 has final offset value to be added to the array starting
	#Let add the value to start address

	add $t7,$t7,$t0

	#now t7 has correct mem address of element A[r][c], Let's laod it
	l.s $f16, 0($t7)

	## reset all used intermediate registers so that the next address could be calculated.

	li $t4, 0
	li $t5, 0
	li $t7, 0

	#========================
	# load A[i][n+1]
	#========================

	addi $s2, $t1, 1 #set s2 to n+1

	#calc (row - 1)
	addi $t4, $t2, -1

	#calc (col - 1)
	addi $t5, $s2, -1

	# s = 4 bytes for IEEE 754 Single Precision Floating Point
	# calc s*(col-1)
	li $t6, 4

	mul $t5, $t5, $t6

	#calc s*(row-1)
	mul $t4, $t4,$t6

	#calc Ns(row-1)
	mul $t4, $t4,$s6

	#add the two products
	add $t7, $t5,$t4

	#now $t7 has final offset value to be added to the array starting
	#Let add the value to start address

	add $t7,$t7,$t0

	#now t7 has correct mem address of element A[r][c], Let's laod it
	l.s $f17, 0($t7)

	## reset all used intermediate registers so that the next address could be calculated.

	li $t4, 0
	li $t5, 0
	li $t7, 0

	# task 2 : calc (A[i][n+1]-sum)

	# sum = f15

	sub.s $f18,$f17,$f15


	# task 3 : calc x[i] = (A[i][n+1]-sum)/A[i][i]

	#perform division, f19 has x[i]
	div.s $f19, $f18, $f16

	# ======================= TEST CODE FOR x[i] SAVING =========================

	# Let $t9 be the starting addr of result vector
	la $t9, results

	#offset = s*(i-1)
	#for this step i = $t2
	add $t8, $0, $t2
	addi $t8, $t8, -1
	mul $t8, $t8, 4

	#add offset to starting addr of results vector
	add $t8, $t8, $t9

	#store X[n]
	s.s $f19,0($t8)

	#reset t8
	li $t8, 0
	# ======================== end test code =====================================

	#prints out statement
	#li $v0, 4
	#la $a0, result
	#syscall

	##MY added lines
	#li $v0, 2

	# print
	#mov.s $f12,$f19
	#syscall

	##MY added lines end

	b back_sub

	end_back_sub:

	# Now finally printing the results

	# output prompt

	li $v0, 4
	la $a0, result #prints out statement
	syscall

	#print new line

	li $v0, 4
	la $a0, newline
	syscall

	#printing results loop

	li $t2, 0 #control var for inner_Loop1,, named 'i' in c code

	Output_Loop: #goes from 1 to n

	addi $t2, $t2, 1 #increment control var

	bgt $t2, $t1, End_Output_Loop

	# print result
	# ======================= TEST CODE FOR x[j] loading =========================

	# Let $t9 be the starting addr of result vector
	la $t9, results

	#offset = s*(j-1)
	#for this step j = $t2
	add $t8, $0, $t2
	addi $t8, $t8, -1
	mul $t8, $t8, 4

	#add offset to starting addr of results vector
	add $t8, $t8, $t9

	#load X[n]
	l.s $f22,0($t8)

	li $v0, 2
	mov.s $f12, $f22
	syscall

	#reset t8
	li $t8, 0
	# ======================== end test code =====================================

	#print new line

	li $v0, 4
	la $a0, newline
	syscall

	b Output_Loop

	End_Output_Loop:

	jr $ra