Last active
November 17, 2021 04:30
-
-
Save rafiq/3ed6a5740e665a1f4dc5e04eef11ab4f to your computer and use it in GitHub Desktop.
This is the second iteration of the RNA structures project.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
""" | |
Spyder Editor | |
This is a temporary script file. | |
""" | |
from matplotlib import pyplot as plt | |
import math | |
import re | |
# Load every line of the structure/frequency data file into memory. The
# context manager closes the file handle as soon as the lines have been read
# instead of leaving it open for the rest of the run.
with open("/Users/rafiqkamal/Desktop/Data_Science/RNAProject210110/RNAL20StructuresGSSizes.txt") as f:
    contents = f.readlines()
# The original code worked for what it was made for, but after talking with the lead researcher, I see that the code needs to be a lot more robust in order to print out various graphs of various parts of the RNA structures to try to find patterns. To accomplish this, I will not hard-code everything, but rather use more flexible functions, data structures, and algorithms that would, for example, make it quick and easy to input a few numbers and look at the top twenty frequencies and their corresponding Hamming-distance graphs, the lowest twenty frequencies and their Hamming distances, or a random set of frequencies and their Hamming distances. In addition, this code needs to be able to quickly plot scatterplots using any of the above examples.
# Main table holding the cleaned data: one record per usable input line,
# sorted in ascending order of frequency.
frequency_structure_array = []
# 1-based running index (primary key) giving each record's rank in the sorted
# data; after the numbering loop below it equals the number of records.
count = 0
# Patterns are raw strings (a bare "\d" or "\." in a normal string literal is
# an invalid escape sequence) and are compiled once, outside the loop.
number_pattern = re.compile(r"\d+")
structure_pattern = re.compile(r"[()\.]+")
for line in contents:
    x = number_pattern.findall(line)
    y = structure_pattern.findall(line)
    if not x or not y:
        # Blank or malformed line: there is no frequency or no structure to
        # record, so skip it rather than fail on x[0] / y[0].
        continue
    frequency = int(x[0])
    frequency_structure_array.append([frequency, math.log10(frequency), y[0]])
# Sort ascending by log10 of the frequency (same ordering as the frequency).
frequency_structure_array.sort(key=lambda record: record[1])
# Put the 1-based rank at the front of every record.
for count, record in enumerate(frequency_structure_array, start=1):
    record.insert(0, count)
#########output######### 1-index, frequency, log10 of frequency, structure
# frequency_structure_array = [..., [11219, 364197924001, 11.561337465902943, '.....']]
length = len(frequency_structure_array)
# Range of records to analyse, chosen from the ascending frequency order
# (the original data has about 11219 entries).
# NOTE(review): hamming_array_maker uses these values directly as 0-based list
# positions, so position p selects the record whose 1-index is p + 1 - confirm
# that this is the intended selection.
start = 3596
end = 3610
# Step between selected records; int((end - start) / 20) is an alternative
# that samples roughly twenty records from the range.
increments = 1
def hamming_array_maker(array, start, end=length, increments=1):
    """Attach a sorted Hamming-distance list to each selected record.

    For every position in ``range(start, end, increments)``, the structure
    string (field 3) of ``array[position]`` is compared against the structure
    of every record in ``array`` (the record itself included), and the
    ascending list of distances is appended to that record.

    Args:
        array: list of records laid out as
            [1-index, frequency, log10 of frequency, structure].
        start: first list position to process.
        end: list position to stop before.
        increments: step between processed positions.

    Returns:
        The processed records, each now laid out as
        [1-index, frequency, log10 of frequency, structure, sorted hammings].
    """
    selected = []
    for position in range(start, end, increments):
        record = array[position]
        reference = record[3]
        distances = sorted(hamming(reference, other[3]) for other in array)
        # The record is extended in place, so the caller's list changes too.
        record.append(distances)
        selected.append(record)
    return selected
#output# 1-index, frequency, log10 of frequency, structure, hammingArr
########[11219, 364197924001, 11.561337465902943, '....................',[...(sorted) hammingsArray...]] | |
def hamming(s1, s2):
    """Count the positions at which two equal-length strings differ.

    When the lengths differ, a message is printed and 0 is returned.
    """
    if len(s1) != len(s2):
        print("Strings are not equal")
        return 0
    return sum(1 for left, right in zip(s1, s2) if left != right)
def histogram_maker(array):
    """Show a histogram of one record's Hamming distances.

    Args:
        array: a record whose field 0 is its 1-index and whose field 4 is its
            list of Hamming distances. The mean shown in the y-axis label is
            taken over the first five entries of that list, so the list is
            expected to be sorted ascending for them to be the smallest five.
    """
    record_index, distances = array[0], array[4]
    mean_of_first_five = get_mean(distances[:5])
    plt.figure(f"{record_index} Histogram")
    plt.hist(distances, bins=20, density=False)
    plt.xlabel(f"Structure 1_Index: {record_index} Hamming Distances")
    plt.ylabel(f"Number of Occurences The mean: {mean_of_first_five}")
    plt.show()
def get_mean(arr):
    """Return the arithmetic mean of ``arr`` rounded to two decimal places."""
    total = sum(arr)
    average = total / len(arr)
    return round(average, 2)
# x axis = log10 of frequencies (field 2 of each frequency_structure_array record)
# y axis = the mean of the five smallest hamming distances (sort ascending and take the first 5)
def scatterplot_maker(array_of_means, array_of_log10):
    """Show a scatter plot of mean Hamming distance against log10 frequency.

    The figure title is built from the module-level ``start``, ``end`` and
    ``increments`` settings.

    Args:
        array_of_means: y values, labelled as the mean of the five smallest
            Hamming distances.
        array_of_log10: x values, labelled as log10 frequencies.
    """
    title = f"1-Index: {start}to{end} increments of {increments} Scatter Plot"
    plt.figure(title)
    plt.xlabel("log10 Frequencies")
    plt.ylabel("1-Index: Mean of the Smallest Five Hammings")
    plt.scatter(array_of_log10, array_of_means)
    plt.show()
# Build the Hamming-distance lists for the selected range of records.
range_of_frequencies = hamming_array_maker(frequency_structure_array, start, end, increments)
# Each record: [1-index, frequency, log10 of frequency, structure, hammingArray]
# e.g. [..., [11219, 364197924001, 11.561337465902943, '....................', [0, 2, 2, 2, ...]], ...]
# NOTE: [el[2] for el in range_of_frequencies] is another way to pull the nth
# field out of every record of a 2D list.
array_of_means_of_smallest_five = []
log10_frequency_array = []
for arr in range_of_frequencies:
    histogram_maker(arr)
    # Field 4 is sorted ascending, so its first five entries are the smallest
    # distances. Each list receives the quantity it is named after; they were
    # previously filled with each other's data, which put the means on the
    # "log10 Frequencies" axis and the log10 values on the means axis.
    array_of_means_of_smallest_five.append(get_mean(arr[4][0:5]))
    log10_frequency_array.append(arr[2])
scatterplot_maker(array_of_means_of_smallest_five, log10_frequency_array)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment