Skip to content

Instantly share code, notes, and snippets.

@gravityfargo
Last active July 19, 2023 04:29
Show Gist options
  • Save gravityfargo/536af0ca0ddaeacb292826420001792b to your computer and use it in GitHub Desktop.
Save gravityfargo/536af0ca0ddaeacb292826420001792b to your computer and use it in GitHub Desktop.
Python Script to split data for the ECE498 project
# Import required modules
import os
import random
# Output locations for the generated data sets. `ratio` names the
# sub-directory of the shared storage folder that receives the files.
ratio = "50-50"
trainFile = "/mnt/Storage/VMShare/{}/train.dat".format(ratio)
testFile = "/mnt/Storage/VMShare/{}/test.dat".format(ratio)
firstLastFile = "/mnt/Storage/VMShare/{}/firstlast.dat".format(ratio)
def main():
    """Run the interactive split workflow once, then exit the interpreter.

    Steps:
      1. List the working directory and ask the user to pick a data file.
      2. Read the chosen file with ``openFile`` and report its dimensions.
      3. Optionally move the leading/trailing 5% of rows to ``firstLastFile``
         via ``firstLastRemove``.
      4. Ask for the training percentage and call ``splitData``.
      5. Re-read the generated files and compare the summed row count with
         the original row count as a sanity check.
    """
    # Display files in the current working directory
    print("Files in Working Directory:")
    print(listFiles())

    # Keep prompting until the answer names an entry in the working directory
    selectedFile = ""
    while selectedFile not in listFiles():
        selectedFile = input("Select the file containing the training data: ")

    # Read the file the user actually selected (it used to be hard-coded
    # to "data.csv", which ignored the answer above).
    contents = openFile(selectedFile)
    lenOG = len(contents)

    # Column count is sampled from the second row, as before; fall back to
    # the first row (or 0) so very short files no longer raise IndexError.
    if lenOG > 1:
        sampleRow = contents[1]
    elif contents:
        sampleRow = contents[0]
    else:
        sampleRow = []
    print("The file contains " + str(lenOG) + " rows and " + str(len(sampleRow)) + " columns.")

    # Ask the user if they want to remove the leading and trailing 5% of data
    presplit = ""
    while presplit not in ['y', 'n']:
        presplit = input("Would you like to remove the leading and trailing 5% of data? (y/n)\n")
    if presplit == 'y':
        contents = firstLastRemove(contents)
        lenFirstLast = len(openFile(firstLastFile))
    else:
        # Nothing was written to firstlast.dat in this run, so do not count
        # a stale file from an earlier run (and do not restart the dialogue,
        # which the former `continue` did).
        lenFirstLast = 0

    # Ask for the training share until it parses as a number in [0, 100]
    percentage = None
    while percentage is None:
        answer = input("How much of the data should be used for training? ex. 30 = 30%\n")
        try:
            if 0 <= float(answer) <= 100:
                percentage = answer
        except ValueError:
            # Not a number: ask again instead of crashing inside splitData
            continue
    splitData(percentage, contents)

    # Sanity check: re-read the outputs and make sure no rows went missing
    lenTrain = len(openFile(trainFile))
    lenTest = len(openFile(testFile))
    lenTotal = lenFirstLast + lenTrain + lenTest

    # Display the number of rows in each file and the total
    print(str(lenOG) + " rows in the file og file.")
    print(str(lenFirstLast) + " rows in firstlast.dat")
    print(str(lenTrain) + " rows in train.dat")
    print(str(lenTest) + " rows in test.dat")
    print(str(lenTotal) + " rows total after processing")

    if lenTotal == lenOG:
        print("No lines lost.")
    else:
        print("Lines were lost")
    # Both outcomes terminate the program, exactly as the original did
    exit()
def listFiles():
    """Return the names of all entries in the current working directory.

    Returns:
        A new list of entry names (files and directories alike), in the
        order reported by ``os.listdir``.
    """
    # os.listdir already builds a fresh list, so no manual copy loop is needed
    return os.listdir('.')
def openFile(filename):
    """Read a text file into a list of whitespace-split rows.

    Args:
        filename: Path of the file to read.

    Returns:
        A list with one entry per line; each entry is the list of
        whitespace-separated tokens on that line (an empty list for a
        blank line).

    If the file cannot be opened or decoded, prints "Invalid File." and
    terminates the program via ``exit()``.
    """
    try:
        # The context manager closes the handle even when reading fails
        with open(filename, 'r') as handle:
            return [line.split() for line in handle]
    except (OSError, ValueError):
        # OSError: missing/unreadable path or a directory.
        # ValueError: undecodable bytes (UnicodeDecodeError) or a malformed path.
        # Unlike a bare `except`, Ctrl-C and SystemExit are left alone.
        print("Invalid File.")
        exit()
def firstLastRemove(contents, fraction=0.05, outputPath=None):
    """Strip the leading and trailing rows and save them to a file.

    Args:
        contents: List of rows (each row a list of values).
        fraction: Share of the rows to cut from EACH end; defaults to the
            original 5%. Must lie in [0, 0.5].
        outputPath: Where the removed rows are written; defaults to the
            module-level ``firstLastFile``.

    Returns:
        A new list holding the rows that remain between the two cuts.

    Raises:
        ValueError: If ``fraction`` is outside [0, 0.5].

    Removed rows are written one per line with values joined by a single
    space; empty rows are skipped when writing.
    """
    if not 0 <= fraction <= 0.5:
        raise ValueError("fraction must be between 0 and 0.5")
    if outputPath is None:
        outputPath = firstLastFile

    totallines = len(contents)
    # Never cut more than half from each end, so head and tail cannot overlap
    numberofdata = min(round(fraction * totallines), totallines // 2)
    print(str(numberofdata) + " rows will be removed from the top and bottom.")

    tailStart = totallines - numberofdata
    firstLastData = contents[:numberofdata] + contents[tailStart:]

    # `with` guarantees the file is closed even if a write fails
    with open(outputPath, "w") as firstLast:
        firstLast.writelines(
            ' '.join(str(elem) for elem in row) + '\n'
            for row in firstLastData
            if row
        )
    return contents[numberofdata:tailStart]
def splitData(percentage, contents, trainPath=None, testPath=None):
    """Shuffle the rows and write them out as training and testing files.

    Args:
        percentage: Share of rows for the training set, as a number or a
            numeric string (e.g. "30" means 30%).
        contents: List of rows (each row a list of values). It is not
            modified; a copy is shuffled.
        trainPath: Output path for the training rows; defaults to the
            module-level ``trainFile``.
        testPath: Output path for the testing rows; defaults to the
            module-level ``testFile``.

    Raises:
        ValueError: If ``percentage`` cannot be converted to a float.

    Rows are written one per line with values joined by a single space;
    empty rows are skipped when writing.
    """
    if trainPath is None:
        trainPath = trainFile
    if testPath is None:
        testPath = testFile

    totallines = len(contents)
    numberofdata = round((totallines/100)*float(percentage))
    # Clamp so a negative or >100 percentage cannot produce a wrong slice
    numberofdata = max(0, min(numberofdata, totallines))

    # Shuffle a copy so the caller's list keeps its original order
    shuffled = list(contents)
    random.shuffle(shuffled)

    _writeRows(trainPath, shuffled[:numberofdata])
    _writeRows(testPath, shuffled[numberofdata:])
    print("The remaining data has been shuffled and split into train.dat and test.dat\n\n")


def _writeRows(path, rows):
    """Write each non-empty row to ``path`` as one space-joined line."""
    # `with` guarantees the file is closed even if a write fails
    with open(path, "w") as outfile:
        outfile.writelines(
            ' '.join(str(elem) for elem in row) + '\n'
            for row in rows
            if row
        )
# Start the interactive workflow only when run as a script (not on import)
if __name__ == "__main__":
    main()
@gravityfargo
Copy link
Author

Place this file in a directory along with the source data file and run it. Follow the prompts.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment