rafiq/RNAProject2.py

## RNAProject2.py
# -*- coding: utf-8 -*-
"""
Spyder Editor

This is a temporary script file.
"""
from matplotlib import pyplot as plt
import math
import re

# Load the raw data file. Each line is expected to contain an integer
# frequency and a dot-bracket structure string made of '(', ')' and '.'.
# The context manager closes the file as soon as it has been read.
with open("/Users/rafiqkamal/Desktop/Data_Science/RNAProject210110/RNAL20StructuresGSSizes.txt") as f:
    contents = f.readlines()

# Design note: the original script did what it was written for, but after
# talking with the lead researcher it needs to be far more flexible so that
# different parts of the RNA structure data can be plotted while looking for
# patterns. Rather than hard-coding everything, the script relies on reusable
# functions and data structures: by changing a few numbers one can look at the
# twenty highest frequencies and their Hamming-distance histograms, the twenty
# lowest, or an arbitrary set, and quickly draw scatter plots for any of them.

# Main table holding the cleaned data; it is sorted ascending by frequency below.
frequency_structure_array = []
# 1-based row number (primary key) recording each row's rank in the sorted
# data (the original author reports roughly 11219 rows).
count = 0

for line in contents:
    # Raw strings keep "\d" and "\." as regex escapes instead of invalid
    # Python string escapes.
    x = re.findall(r"\d+", line)
    y = re.findall(r"[()\.]+", line)
    frequency = int(x[0])
    # Row layout at this point: [frequency, log10 of frequency, structure]
    frequency_structure_array.append([frequency, math.log10(frequency), y[0]])

# Sort ascending by log10(frequency), i.e. ascending by frequency.
frequency_structure_array.sort(key=lambda row: row[1])

# Prepend the 1-based rank to every row.
for count, line in enumerate(frequency_structure_array, start=1):
    line.insert(0, count)
# Row layout from here on: [1-index, frequency, log10 of frequency, structure]
# e.g. [11219, 364197924001, 11.561337465902943, '.....']

length = len(frequency_structure_array)
# Range of rows to analyse, in ascending-frequency order.
# NOTE(review): hamming_array_maker uses these values as 0-based list
# positions in range(start, end, increments), with `end` excluded, so
# start = 3596 selects the row whose 1-index is 3597 - confirm this is intended.
start = 3596
end = 3610
increments = 1  # alternative step for about twenty samples: int((end - start) / 20)

def hamming_array_maker(array,start,end=length,increments=1):
    """Attach a sorted list of Hamming distances to each selected row.

    For every list position in ``range(start, end, increments)`` the
    structure string of that row (element 3) is compared with the structure
    of every row in ``array``. The distances are sorted ascending and the
    list is appended to the selected row in place.

    Args:
        array: list of rows whose element 3 is the structure string.
        start: first list position to process (used directly as a list index).
        end: list position at which to stop (excluded); defaults to the
            module-level ``length`` captured when the function is defined.
        increments: step between processed positions.

    Returns:
        A list of the processed rows (the same list objects that live in
        ``array``), each now ending with its sorted distance list.
    """
    selected_rows = []
    for position in range(start, end, increments):
        row = array[position]
        reference = row[3]
        # Distance from the reference structure to every structure in the table.
        distances = sorted(hamming(reference, other[3]) for other in array)
        row.append(distances)
        selected_rows.append(row)
    return selected_rows
#output# 1-index,  frequency,  log10 of frequency      structure     hammdingArr
########[11219, 364197924001, 11.561337465902943, '....................',[...(sorted) hammingsArray...]]

def hamming(s1,s2):
    """Return the Hamming distance between two equal-length strings.

    Args:
        s1: first string.
        s2: second string.

    Returns:
        The number of positions at which the two strings differ.

    Raises:
        ValueError: if the strings have different lengths. Previously this
            case printed a message and returned 0, which is indistinguishable
            from "identical strings" and silently skews any statistic built
            on the distances.
    """
    if len(s1) != len(s2):
        raise ValueError(
            "Strings are not equal in length: {} and {}".format(len(s1), len(s2))
        )
    # Each mismatching pair contributes True (== 1) to the sum.
    return sum(a != b for a, b in zip(s1, s2))

def histogram_maker(array):
    """Display a 20-bin histogram of one row's Hamming distances.

    Args:
        array: a row whose element 0 is used as the label in the figure
            title and x-axis text, and whose element 4 is the list of
            Hamming distances. The first five entries of that list are
            averaged and shown in the y-axis label; the list is assumed to
            be sorted ascending (as hamming_array_maker produces it), so
            those are the five smallest distances.
    """
    row_label = str(array[0])
    distances = array[4]
    mean_of_first_five = get_mean(distances[:5])

    plt.figure(row_label + " Histogram")
    plt.hist(distances, bins=20, density=False)
    plt.xlabel("Structure 1_Index: " + row_label + " Hamming Distances")
    plt.ylabel("Number of Occurences    The mean: " + str(mean_of_first_five))
    plt.show()

def get_mean(arr):
    """Return the arithmetic mean of ``arr`` rounded to two decimal places."""
    total = sum(arr)
    size = len(arr)
    return round(total / size, 2)

# x axis = log10 of the frequencies (element 2 of each row in frequency_structure_array)
# y axis = the mean of the five smallest Hamming distances (the first five entries of the ascending-sorted list)
def scatterplot_maker (array_of_means,array_of_log10):
    """Display a scatter plot of mean Hamming distance against log10 frequency.

    The figure title is built from the module-level ``start``, ``end`` and
    ``increments`` values.

    Args:
        array_of_means: y values, one per point.
        array_of_log10: x values, one per point.
    """
    title_parts = (
        "1-Index: ",
        str(start),
        "to",
        str(end),
        " increments of ",
        str(increments),
        " Scatter Plot",
    )
    plt.figure("".join(title_parts))
    plt.scatter(array_of_log10, array_of_means)
    plt.xlabel("log10 Frequencies")
    plt.ylabel("1-Index: Mean of the Smallest Five Hammings")
    plt.show()

# Compute the sorted Hamming-distance list for every row in the selected range.
range_of_frequencies = hamming_array_maker(frequency_structure_array,start,end,increments)
# Row layout: [1-index, frequency, log10 of frequency, structure, hamming list]
# e.g. [11219, 364197924001, 11.561337465902943, '....................', [0, 2, 2, 2..]]
# NOTE: [el[2] for el in range_of_frequencies] is one way to pull the nth
# element out of every row of a 2D list.
array_of_means_of_smallest_five = []
log10_frequency_array = []

for arr in range_of_frequencies:
    histogram_maker(arr)
    # Fix: these two lists used to be filled with each other's values
    # (log10 went into the "means" list and vice versa), which swapped the
    # scatter plot's axes relative to its axis labels.
    array_of_means_of_smallest_five.append(get_mean(arr[4][0:5]))
    log10_frequency_array.append(arr[2])

# x = log10 frequency, y = mean of the first five (smallest) Hamming distances.
scatterplot_maker(array_of_means_of_smallest_five,log10_frequency_array)
	# -- coding: utf-8 --
	"""
	Spyder Editor

	This is a temporary script file.
	"""
	from matplotlib import pyplot as plt
	import math
	import re

	# Load the raw data file. Each line is expected to contain an integer
	# frequency and a dot-bracket structure string made of '(', ')' and '.'.
	# The context manager closes the file as soon as it has been read.
	with open("/Users/rafiqkamal/Desktop/Data_Science/RNAProject210110/RNAL20StructuresGSSizes.txt") as f:
	    contents = f.readlines()

	# Design note: the original script did what it was written for, but after
	# talking with the lead researcher it needs to be far more flexible so that
	# different parts of the RNA structure data can be plotted while looking for
	# patterns. Rather than hard-coding everything, the script relies on reusable
	# functions and data structures: by changing a few numbers one can look at the
	# twenty highest frequencies and their Hamming-distance histograms, the twenty
	# lowest, or an arbitrary set, and quickly draw scatter plots for any of them.

	# Main table holding the cleaned data; it is sorted ascending by frequency below.
	frequency_structure_array = []
	# 1-based row number (primary key) recording each row's rank in the sorted
	# data (the original author reports roughly 11219 rows).
	count = 0

	# The loop bodies below were restored: the pasted text had lost all nested
	# indentation, which made the loops syntactically invalid.
	for line in contents:
	    # Raw strings keep "\d" and "\." as regex escapes instead of invalid
	    # Python string escapes.
	    x = re.findall(r"\d+", line)
	    y = re.findall(r"[()\.]+", line)
	    frequency = int(x[0])
	    # Row layout at this point: [frequency, log10 of frequency, structure]
	    frequency_structure_array.append([frequency, math.log10(frequency), y[0]])

	# Sort ascending by log10(frequency), i.e. ascending by frequency.
	frequency_structure_array.sort(key=lambda row: row[1])

	# Prepend the 1-based rank to every row.
	for count, line in enumerate(frequency_structure_array, start=1):
	    line.insert(0, count)
	# Row layout from here on: [1-index, frequency, log10 of frequency, structure]
	# e.g. [11219, 364197924001, 11.561337465902943, '.....']

	length = len(frequency_structure_array)
	# Range of rows to analyse, in ascending-frequency order.
	# NOTE(review): hamming_array_maker uses these values as 0-based list
	# positions in range(start, end, increments), with `end` excluded, so
	# start = 3596 selects the row whose 1-index is 3597 - confirm this is intended.
	start = 3596
	end = 3610
	increments = 1  # alternative step for about twenty samples: int((end - start) / 20)

	def hamming_array_maker(array,start,end=length,increments=1):
	    """Attach a sorted list of Hamming distances to each selected row.

	    For every list position in ``range(start, end, increments)`` the
	    structure string of that row (element 3) is compared with the structure
	    of every row in ``array``. The distances are sorted ascending and the
	    list is appended to the selected row in place.

	    The nested indentation of this function was lost in the pasted text
	    and has been restored here.

	    Args:
	        array: list of rows whose element 3 is the structure string.
	        start: first list position to process (used directly as a list index).
	        end: list position at which to stop (excluded); defaults to the
	            module-level ``length`` captured when the function is defined.
	        increments: step between processed positions.

	    Returns:
	        A list of the processed rows (the same list objects that live in
	        ``array``), each now ending with its sorted distance list.
	    """
	    selected_rows = []
	    for position in range(start, end, increments):
	        row = array[position]
	        reference = row[3]
	        # Distance from the reference structure to every structure in the table.
	        distances = sorted(hamming(reference, other[3]) for other in array)
	        row.append(distances)
	        selected_rows.append(row)
	    return selected_rows
	#output# 1-index, frequency, log10 of frequency structure hammdingArr
	########[11219, 364197924001, 11.561337465902943, '....................',[...(sorted) hammingsArray...]]

	def hamming(s1,s2):
	    """Return the Hamming distance between two equal-length strings.

	    The nested indentation of this function was lost in the pasted text
	    and has been restored here.

	    Args:
	        s1: first string.
	        s2: second string.

	    Returns:
	        The number of positions at which the two strings differ.

	    Raises:
	        ValueError: if the strings have different lengths. Previously this
	            case printed a message and returned 0, which is indistinguishable
	            from "identical strings" and silently skews any statistic built
	            on the distances.
	    """
	    if len(s1) != len(s2):
	        raise ValueError(
	            "Strings are not equal in length: {} and {}".format(len(s1), len(s2))
	        )
	    # Each mismatching pair contributes True (== 1) to the sum.
	    return sum(a != b for a, b in zip(s1, s2))

	def histogram_maker(array):
	    """Display a 20-bin histogram of one row's Hamming distances.

	    The nested indentation of this function was lost in the pasted text
	    and has been restored here.

	    Args:
	        array: a row whose element 0 is used as the label in the figure
	            title and x-axis text, and whose element 4 is the list of
	            Hamming distances. The first five entries of that list are
	            averaged and shown in the y-axis label; the list is assumed to
	            be sorted ascending (as hamming_array_maker produces it), so
	            those are the five smallest distances.
	    """
	    indexNum = array[0]
	    hammingArray = array[4]
	    smallest_five = hammingArray[0:5]
	    mean = get_mean(smallest_five)
	    plt.figure(str(indexNum) + " Histogram")
	    plt.hist(hammingArray, density=False, bins=20)
	    plt.xlabel("Structure 1_Index: " + str(indexNum) + " Hamming Distances")
	    plt.ylabel("Number of Occurences The mean: " + str(mean))
	    plt.show()

	def get_mean(arr):
	    """Return the arithmetic mean of ``arr`` rounded to two decimal places.

	    The body's indentation was lost in the pasted text and has been
	    restored here.
	    """
	    return round(sum(arr) / len(arr), 2)

	# x axis = log10 of the frequencies (element 2 of each row in frequency_structure_array)
	# y axis = the mean of the five smallest Hamming distances (the first five entries of the ascending-sorted list)
	def scatterplot_maker (array_of_means,array_of_log10):
	    """Display a scatter plot of mean Hamming distance against log10 frequency.

	    The figure title is built from the module-level ``start``, ``end`` and
	    ``increments`` values. The body's indentation was lost in the pasted
	    text and has been restored here.

	    Args:
	        array_of_means: y values, one per point.
	        array_of_log10: x values, one per point.
	    """
	    plt.figure("1-Index: " + str(start) + "to" + str(end) + " increments of " + str(increments) + " Scatter Plot")
	    plt.xlabel("log10 Frequencies")
	    plt.ylabel("1-Index: Mean of the Smallest Five Hammings")
	    plt.scatter(array_of_log10, array_of_means)
	    plt.show()

	# Compute the sorted Hamming-distance list for every row in the selected range.
	range_of_frequencies = hamming_array_maker(frequency_structure_array,start,end,increments)
	# Row layout: [1-index, frequency, log10 of frequency, structure, hamming list]
	# e.g. [11219, 364197924001, 11.561337465902943, '....................', [0, 2, 2, 2..]]
	# NOTE: [el[2] for el in range_of_frequencies] is one way to pull the nth
	# element out of every row of a 2D list.
	array_of_means_of_smallest_five = []
	log10_frequency_array = []

	# The loop body was restored: the pasted text had lost its indentation.
	for arr in range_of_frequencies:
	    histogram_maker(arr)
	    # Fix: these two lists used to be filled with each other's values
	    # (log10 went into the "means" list and vice versa), which swapped the
	    # scatter plot's axes relative to its axis labels.
	    array_of_means_of_smallest_five.append(get_mean(arr[4][0:5]))
	    log10_frequency_array.append(arr[2])

	# x = log10 frequency, y = mean of the first five (smallest) Hamming distances.
	scatterplot_maker(array_of_means_of_smallest_five,log10_frequency_array)