Last active
July 19, 2023 04:29
-
-
Save gravityfargo/536af0ca0ddaeacb292826420001792b to your computer and use it in GitHub Desktop.
Python Script to split data for the ECE498 project
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Import required modules | |
import os | |
import random | |
# Output locations for the train, test and firstlast data files.
# `ratio` selects the sub-directory under the shared storage root that
# receives the three files written by this script.
ratio = "50-50"
_outputDir = "/mnt/Storage/VMShare/" + ratio
trainFile = _outputDir + "/train.dat"
testFile = _outputDir + "/test.dat"
firstLastFile = _outputDir + "/firstlast.dat"
def main():
    """Interactively split a data file into train, test and firstlast files.

    Steps:
      1. List the working directory and ask which file holds the data.
      2. Optionally move the leading and trailing 5% of rows into
         ``firstLastFile`` (via ``firstLastRemove``).
      3. Ask what percentage of the remaining rows goes to training and
         write ``trainFile`` / ``testFile`` (via ``splitData``).
      4. Re-read the written files and compare the total row count with
         the row count of the selected file.

    Always terminates by raising ``SystemExit``: status 0 when the row
    counts match, status 1 when rows were lost or there is nothing to
    process.
    """
    # Scan the directory once so the printed list and the validation agree.
    availableFiles = listFiles()
    print("Files in Working Directory:")
    print(availableFiles)
    if not availableFiles:
        # Without this guard the selection prompt below could never succeed.
        print("No files found in the working directory.")
        raise SystemExit(1)

    selectedFile = ""
    while selectedFile not in availableFiles:
        selectedFile = input("Select the file containing the training data: ")

    # Read the file the user actually picked.
    contents = openFile(selectedFile)
    lenOG = len(contents)
    if lenOG == 0:
        print("The selected file is empty.")
        raise SystemExit(1)

    # The column count is taken from the second row when there is one
    # (presumably to skip a header line -- TODO confirm), otherwise from
    # the only row, so a one-line file no longer raises IndexError.
    sampleRow = contents[1] if lenOG > 1 else contents[0]
    print("The file contains " + str(lenOG) + " rows and " + str(len(sampleRow)) + " columns.")

    presplit = ""
    while presplit not in ['y', 'n']:
        presplit = input("Would you like to remove the leading and trailing 5% of data? (y/n)\n")
    if presplit == 'y':
        contents = firstLastRemove(contents)

    # Keep asking until the answer parses as a number between 0 and 100.
    percentage = None
    while percentage is None:
        answer = input("How much of the data should be used for training? ex. 30 = 30%\n")
        try:
            value = float(answer)
        except ValueError:
            continue
        if 0 <= value <= 100:
            percentage = answer
    splitData(percentage, contents)

    # Sanity check: count the rows that actually landed on disk.
    # firstlast.dat is only counted when this run wrote it; otherwise a
    # stale file from an earlier run would distort the total.
    lenFirstLast = len(openFile(firstLastFile)) if presplit == 'y' else 0
    lenTrain = len(openFile(trainFile))
    lenTest = len(openFile(testFile))
    lenTotal = lenFirstLast + lenTrain + lenTest

    print(str(lenOG) + " rows in the file og file.")
    print(str(lenFirstLast) + " rows in firstlast.dat")
    print(str(lenTrain) + " rows in train.dat")
    print(str(lenTest) + " rows in test.dat")
    print(str(lenTotal) + " rows total after processing")

    if lenTotal == lenOG:
        print("No lines lost.")
        raise SystemExit(0)
    print("Lines were lost")
    raise SystemExit(1)
# Return the names of the entries in the current working directory.
def listFiles():
    """Return a list with the name of every entry in the working directory.

    The names come straight from ``os.listdir('.')``, so sub-directories
    are included alongside regular files and the order is arbitrary.
    """
    # os.listdir already returns a fresh list; no manual copy is needed.
    return os.listdir('.')
# Read a text file into a list of rows, one list of fields per line.
def openFile(filename):
    """Read *filename* and return its lines split on whitespace.

    Each line becomes one list of string fields (``str.split()`` with no
    separator); a blank line becomes an empty list, so the length of the
    result equals the number of lines in the file.

    If the file cannot be opened or decoded, ``"Invalid File."`` is
    printed and ``SystemExit`` is raised with status 1.
    """
    try:
        # The context manager closes the handle even if reading fails.
        with open(filename, 'r') as file:
            return [line.split() for line in file]
    except (OSError, UnicodeDecodeError):
        print("Invalid File.")
        raise SystemExit(1)
# Move the leading and trailing rows of the data into the firstlast file.
def firstLastRemove(contents, fraction=0.05, outputPath=None):
    """Strip the first and last *fraction* of rows and save them to a file.

    Args:
        contents: list of rows, each row a sequence of fields.
        fraction: share of the rows removed from EACH end (default 0.05,
            i.e. 5% from the top and 5% from the bottom). Must lie in
            [0, 0.5] so the two removed slices cannot overlap.
        outputPath: file that receives the removed rows; defaults to the
            module-level ``firstLastFile``.

    Returns:
        A new list holding the rows that remain in the middle.
        ``contents`` itself is not modified.

    Raises:
        ValueError: if *fraction* is outside [0, 0.5].

    Removed rows are written space-separated, one per line; empty rows
    are skipped and therefore do not appear in the output file.
    """
    if not 0 <= fraction <= 0.5:
        raise ValueError("fraction must be between 0 and 0.5")
    if outputPath is None:
        outputPath = firstLastFile

    totallines = len(contents)
    numberofdata = round(fraction * totallines)
    print(str(numberofdata) + " rows will be removed from the top and bottom.")

    # Index arithmetic from the front avoids the contents[-0:] pitfall
    # when numberofdata is 0.
    tailStart = totallines - numberofdata
    firstLastData = contents[:numberofdata] + contents[tailStart:]
    remaining = contents[numberofdata:tailStart]

    with open(outputPath, "w") as firstLast:
        for row in firstLastData:
            if row:
                firstLast.write(' '.join(str(elem) for elem in row) + '\n')
    return remaining
# Write rows to `path`, space-separated, one row per line; skip empty rows.
def _writeRows(path, rows):
    with open(path, "w") as handle:
        for row in rows:
            if row:
                handle.write(' '.join(str(elem) for elem in row) + '\n')


# Shuffle the data and split it into training and testing files.
def splitData(percentage, contents, trainPath=None, testPath=None):
    """Randomly split *contents* into a training file and a testing file.

    Args:
        percentage: share of rows for training, as a number or numeric
            string where 30 means 30%. Values below 0 or above 100 are
            clamped to that range.
        contents: list of rows, each row a sequence of fields.
        trainPath: file for the training rows; defaults to the
            module-level ``trainFile``.
        testPath: file for the testing rows; defaults to the module-level
            ``testFile``.

    Raises:
        ValueError: if *percentage* cannot be converted to a float.

    The rows are shuffled with ``random.shuffle`` on a copy, so the
    caller's list keeps its order. Empty rows are not written.
    """
    if trainPath is None:
        trainPath = trainFile
    if testPath is None:
        testPath = testFile

    # Clamping keeps a negative value from turning into a negative slice
    # index, which would silently produce the wrong split.
    share = min(max(float(percentage), 0), 100)

    shuffled = list(contents)
    random.shuffle(shuffled)
    numberofdata = round((len(shuffled) / 100) * share)

    _writeRows(trainPath, shuffled[:numberofdata])
    _writeRows(testPath, shuffled[numberofdata:])
    print("The remaining data has been shuffled and split into train.dat and test.dat\n\n")
# Run the interactive workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Place this file in a directory along with the source data file and run it. Follow the prompts.