Skip to content

Instantly share code, notes, and snippets.

@rafiq
Last active November 17, 2021 04:30
Show Gist options
  • Save rafiq/3ed6a5740e665a1f4dc5e04eef11ab4f to your computer and use it in GitHub Desktop.
Save rafiq/3ed6a5740e665a1f4dc5e04eef11ab4f to your computer and use it in GitHub Desktop.
This is the second iteration of the RNA structures project.
# -*- coding: utf-8 -*-
"""
Spyder Editor
This is a temporary script file.
"""
from matplotlib import pyplot as plt
import math
import re
# Read every line of the structures file up front.  The `with` block closes
# the handle as soon as the lines are in memory instead of leaving it open
# for the rest of the script.
with open("/Users/rafiqkamal/Desktop/Data_Science/RNAProject210110/RNAL20StructuresGSSizes.txt") as f:
    contents = f.readlines()
# The original code worked for what it was made for, but after talking with
# the lead researcher, I see that the code needs to be a lot more robust in
# order to print out various graphs of various parts of the RNA structures and
# try to find patterns. To accomplish this, I will not hard-code everything,
# but rather use more flexible functions, data structures, and algorithms that
# make it quick and easy to, for example, input a few numbers and look at the
# top twenty frequencies and their corresponding Hamming-distance graphs, the
# lowest twenty frequencies and their Hamming distances, or a random set of
# frequencies and their Hamming distances. In addition, this code needs to be
# able to quickly plot scatterplots using any of the above examples.
# Main table: one row per usable input line, sorted ascending by frequency and
# then prefixed with a 1-based rank.  Final row layout:
#   [1-index, frequency, log10 of frequency, structure]
frequency_structure_array = []
count = 0  # 1-based rank of the last row indexed; stays 0 if no rows were parsed

# Compile once outside the loop.  Raw strings keep "\d" and "\." as regex
# escapes rather than (invalid) string escapes.
_frequency_pattern = re.compile(r"\d+")
_structure_pattern = re.compile(r"[()\.]+")

for line in contents:
    x = _frequency_pattern.findall(line)
    y = _structure_pattern.findall(line)
    if not x or not y:
        # Blank or malformed line (e.g. a trailing newline): nothing to record.
        continue
    frequency = int(x[0])  # first run of digits on the line
    frequency_structure_array.append([frequency, math.log10(frequency), y[0]])

# Sort on the exact integer frequency; log10 is monotonic, so the order is the
# same as sorting on the log value, without depending on float precision.
frequency_structure_array.sort(key=lambda row: row[0])

# Put the 1-based rank at the front of every row.
for count, row in enumerate(frequency_structure_array, start=1):
    row.insert(0, count)
#########output######### 1-index, frequency, log10 of frequency, structure
#frequency_structure_array=...[11219, 364197924001, 11.561337465902943, '.....']
# Number of parsed rows (about 11219 for the original data set).
length = len(frequency_structure_array)
# Slice of the frequency-sorted table to analyse.  These values are handed to
# hamming_array_maker, which uses them as range() bounds over list positions.
start, end = 3596, 3610
increments = 1  # step between selected rows; alternative: int((end - start) / 20)
def hamming_array_maker(array, start, end=None, increments=1):
    """Attach a sorted list of Hamming distances to each selected row.

    For every list position in ``range(start, end, increments)`` the row's
    structure string (element ``[3]``) is compared against the structure of
    every row in ``array`` (itself included).  The resulting distances,
    sorted ascending, are appended to that row.

    Args:
        array: Rows shaped ``[1-index, frequency, log10 frequency, structure]``.
        start: First list position (0-based) to process.
        end: Position to stop before; ``None`` means ``len(array)``.
        increments: Step between processed positions.

    Returns:
        The processed rows, in order.  They are the same list objects held by
        ``array``, so ``array`` itself is modified.
    """
    if end is None:
        # Resolve the bound per call instead of relying on a module-level
        # length captured when the function was defined.
        end = len(array)

    selected_rows = []
    for index in range(start, end, increments):
        row = array[index]
        structure = row[3]
        distances = sorted(hamming(structure, other[3]) for other in array)
        row.append(distances)
        selected_rows.append(row)
    return selected_rows
#output# 1-index, frequency, log10 of frequency, structure, hammingArr
########[11219, 364197924001, 11.561337465902943, '....................',[...(sorted) hammingsArray...]]
def hamming(s1, s2):
    """Return the number of positions at which ``s1`` and ``s2`` differ.

    When the lengths differ no comparison is made: a message is printed and 0
    is returned, so the return value alone cannot distinguish that case from
    two identical strings.
    """
    if len(s1) != len(s2):
        print("Strings are not equal")
        return 0
    # Booleans sum as 0/1, so this counts the mismatching positions.
    return sum(a != b for a, b in zip(s1, s2))
def histogram_maker(array):
    """Show a 20-bin histogram of one row's Hamming distances.

    ``array`` is a single row whose element ``[0]`` is the row's 1-index and
    whose element ``[4]`` is its list of Hamming distances.  The mean of the
    first five distances is written into the y-axis label.
    """
    row_id = str(array[0])
    distances = array[4]
    closest_mean = get_mean(distances[:5])

    plt.figure(row_id + " Histogram")
    plt.hist(distances, bins=20, density=False)
    plt.xlabel("Structure 1_Index: " + row_id + " Hamming Distances")
    plt.ylabel("Number of Occurences The mean: " + str(closest_mean))
    plt.show()
def get_mean(arr):
    """Return the arithmetic mean of ``arr`` rounded to two decimal places.

    An empty ``arr`` raises ``ZeroDivisionError``.
    """
    total = sum(arr)
    return round(total / len(arr), 2)
# x axis = log10 of the frequencies (element [2] of each frequency_structure_array row)
# y axis = the mean of the five smallest hamming distances (the first five entries of the ascending-sorted list)
def scatterplot_maker(array_of_means, array_of_log10):
    """Show a scatter plot of the given means against the given log10 values.

    ``array_of_log10`` supplies the x values and ``array_of_means`` the y
    values.  The figure title is built from the module-level ``start``,
    ``end`` and ``increments`` settings.
    """
    range_label = "1-Index: " + str(start) + "to" + str(end)
    step_label = " increments of " + str(increments)

    plt.figure(range_label + step_label + " Scatter Plot")
    plt.xlabel("log10 Frequencies")
    plt.ylabel("1-Index: Mean of the Smallest Five Hammings")
    plt.scatter(array_of_log10, array_of_means)
    plt.show()
# Build the Hamming-distance rows for the configured slice of the table.
# Each returned row is:
#   [1-index, frequency, log10 of frequency, structure, sorted Hamming distances]
range_of_frequencies = hamming_array_maker(frequency_structure_array, start, end, increments)

array_of_means_of_smallest_five = []
log10_frequency_array = []
for arr in range_of_frequencies:
    histogram_maker(arr)
    # arr[4] is sorted ascending, so its first five entries are the smallest.
    array_of_means_of_smallest_five.append(get_mean(arr[4][0:5]))
    log10_frequency_array.append(arr[2])

# Each list now holds what its name says, so the plotted axes match the
# scatter plot's labels (x: log10 frequencies, y: mean of the smallest five).
scatterplot_maker(array_of_means_of_smallest_five, log10_frequency_array)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment