# kamath/analyze.py

## analyze.py
def analyze(filename, columns=(), precision=1):
    """Print a min/avg/max table for the listed numeric columns of a CSV file.

    The first line of the file is treated as the header row of column names;
    every later non-blank line is a data row. One table row is printed per
    requested column, tab-separated, in the order: column name, min, avg,
    max. Nothing is returned.

    Args:
        filename: Path of the CSV file to read.
        columns: Names of the columns to analyze. A name listed more than
            once is reported once.
        precision: Number of decimal places the average is rounded to.

    Raises:
        KeyError: If a requested column is not in the header row.
        ValueError: If columns were requested but the file has no data
            rows, or if a cell cannot be converted to float.
        IndexError: If a data row is too short to hold a requested column.
    """
    # "with" closes the file even if reading raises halfway through, which a
    # manual open()/close() pair does not.
    with open(filename, 'r') as file:
        # readline() consumes the header line, so iterating over the file
        # afterwards starts at the first data row. strip() removes the
        # trailing newline; it is more flexible than slicing with [:-1],
        # which would eat a real character if the last line had no newline.
        column_names = file.readline().strip().split(',')

        # COMMON MISTAKE: calling split() on the whole list of lines. Each
        # line is a string and has to be split individually, giving a list
        # of lists where row[i] belongs to column_names[i]. Blank lines
        # (e.g. an extra newline at the end of the file) are skipped.
        stripped_lines = (line.strip() for line in file)
        rows = [line.split(',') for line in stripped_lines if line]

    # COMMON MISTAKE: columns is not the same as column_names! column_names
    # holds every column in the file; columns is just the ones we want stats
    # for. Keying a dictionary by the requested names (instead of hard-coding
    # variables like test1, test2, ...) keeps the code working when the file
    # gains or loses columns.
    column_data = {column: [] for column in columns}

    # Fail with a clear error before printing anything, rather than dividing
    # by zero further down.
    missing = [column for column in column_data if column not in column_names]
    if missing:
        raise KeyError('columns not found in {}: {}'.format(filename, missing))
    if column_data and not rows:
        raise ValueError('{} has no data rows to analyze'.format(filename))

    # Work out once which positions of a row we care about. Membership is
    # checked against the dictionary keys, which is faster than scanning the
    # columns list.
    wanted = [
        (index, name)
        for index, name in enumerate(column_names)
        if name in column_data
    ]
    for row in rows:
        for index, name in wanted:
            # Convert to float! Otherwise the value stays a string.
            column_data[name].append(float(row[index]))

    # COMMON MISTAKE: values were not printed nicely. Tabs keep the header
    # and every row aligned in the same four fields.
    print('\tmin\tavg\tmax')
    for column, values in column_data.items():
        # COMMON MISTAKE: not rounding the average, or hard-coding the number
        # of decimal places instead of using the precision parameter.
        average = round(sum(values) / len(values), precision)
        print('\t'.join([column, str(min(values)), str(average), str(max(values))]))

    # That's it! The printed table is the output, so nothing is returned.

# Here's the same thing, but with way less code if anyone's interested.
def golfed(filename, columns=(), precision=1):
    """Print a min/avg/max table for the listed CSV columns, compactly.

    The first line of the file is the header row; later non-blank lines are
    data rows. A header line is printed, then one tab-separated line per
    entry in columns (in the given order): column name, min, avg, max.
    Nothing is returned.

    Args:
        filename: Path of the CSV file to read.
        columns: Names of the columns to analyze.
        precision: Number of decimal places the average is rounded to.

    Raises:
        ValueError: If the file is completely empty, if a requested column
            has no data rows, or if a cell cannot be converted to float.
        KeyError: If a requested column is not in the header row.
        IndexError: If a data row is too short to hold a requested column.
    """
    # "with" makes sure the file handle is closed once the lines are read.
    with open(filename, 'r') as file:
        lines = [line.strip() for line in file]

    # Star-unpacking peels the header off the front; blank data lines (such
    # as an extra newline at the end of the file) are dropped.
    header, *body = lines
    column_names = header.split(',')
    rows = [line.split(',') for line in body if line]

    # Map each column name to its position in a row.
    column_indices = {name: index for index, name in enumerate(column_names)}

    # (label, function) pairs drive both the header line and the stats, so
    # adding another statistic only means adding another pair.
    funcs = [
        ('min', min),
        ('avg', lambda values: round(sum(values) / len(values), precision)),
        ('max', max),
    ]
    print('\t' + '\t'.join(label for label, _ in funcs))
    for column in columns:
        values = [float(row[column_indices[column]]) for row in rows]
        print('\t'.join([column] + [str(func(values)) for _, func in funcs]))
	def analyze(filename, columns=(), precision=1):
		"""Print a min/avg/max table for the listed numeric columns of a CSV file.

		The first line of the file is treated as the header row of column names;
		every later non-blank line is a data row. One table row is printed per
		requested column, tab-separated, in the order: column name, min, avg,
		max. Nothing is returned.

		Args:
			filename: Path of the CSV file to read.
			columns: Names of the columns to analyze. A name listed more than
				once is reported once.
			precision: Number of decimal places the average is rounded to.

		Raises:
			KeyError: If a requested column is not in the header row.
			ValueError: If columns were requested but the file has no data
				rows, or if a cell cannot be converted to float.
			IndexError: If a data row is too short to hold a requested column.
		"""
		# "with" closes the file even if reading raises halfway through, which a
		# manual open()/close() pair does not.
		with open(filename, 'r') as file:
			# readline() consumes the header line, so iterating over the file
			# afterwards starts at the first data row. strip() removes the
			# trailing newline; it is more flexible than slicing with [:-1],
			# which would eat a real character if the last line had no newline.
			column_names = file.readline().strip().split(',')

			# COMMON MISTAKE: calling split() on the whole list of lines. Each
			# line is a string and has to be split individually, giving a list
			# of lists where row[i] belongs to column_names[i]. Blank lines
			# (e.g. an extra newline at the end of the file) are skipped.
			stripped_lines = (line.strip() for line in file)
			rows = [line.split(',') for line in stripped_lines if line]

		# COMMON MISTAKE: columns is not the same as column_names! column_names
		# holds every column in the file; columns is just the ones we want stats
		# for. Keying a dictionary by the requested names (instead of hard-coding
		# variables like test1, test2, ...) keeps the code working when the file
		# gains or loses columns.
		column_data = {column: [] for column in columns}

		# Fail with a clear error before printing anything, rather than dividing
		# by zero further down.
		missing = [column for column in column_data if column not in column_names]
		if missing:
			raise KeyError('columns not found in {}: {}'.format(filename, missing))
		if column_data and not rows:
			raise ValueError('{} has no data rows to analyze'.format(filename))

		# Work out once which positions of a row we care about. Membership is
		# checked against the dictionary keys, which is faster than scanning the
		# columns list.
		wanted = [(index, name) for index, name in enumerate(column_names) if name in column_data]
		for row in rows:
			for index, name in wanted:
				# Convert to float! Otherwise the value stays a string.
				column_data[name].append(float(row[index]))

		# COMMON MISTAKE: values were not printed nicely. Tabs keep the header
		# and every row aligned in the same four fields.
		print('\tmin\tavg\tmax')
		for column, values in column_data.items():
			# COMMON MISTAKE: not rounding the average, or hard-coding the number
			# of decimal places instead of using the precision parameter.
			average = round(sum(values) / len(values), precision)
			print('\t'.join([column, str(min(values)), str(average), str(max(values))]))

		# That's it! The printed table is the output, so nothing is returned.

	# Here's the same thing, but with way less code if anyone's interested.
	def golfed(filename, columns=(), precision=1):
		"""Print a min/avg/max table for the listed CSV columns, compactly.

		The first line of the file is the header row; later non-blank lines are
		data rows. A header line is printed, then one tab-separated line per
		entry in columns (in the given order): column name, min, avg, max.
		Nothing is returned.

		Args:
			filename: Path of the CSV file to read.
			columns: Names of the columns to analyze.
			precision: Number of decimal places the average is rounded to.

		Raises:
			ValueError: If the file is completely empty, if a requested column
				has no data rows, or if a cell cannot be converted to float.
			KeyError: If a requested column is not in the header row.
			IndexError: If a data row is too short to hold a requested column.
		"""
		# "with" makes sure the file handle is closed once the lines are read.
		with open(filename, 'r') as file:
			lines = [line.strip() for line in file]

		# Star-unpacking peels the header off the front; blank data lines (such
		# as an extra newline at the end of the file) are dropped.
		header, *body = lines
		column_names = header.split(',')
		rows = [line.split(',') for line in body if line]

		# Map each column name to its position in a row.
		column_indices = {name: index for index, name in enumerate(column_names)}

		# (label, function) pairs drive both the header line and the stats, so
		# adding another statistic only means adding another pair.
		funcs = [
			('min', min),
			('avg', lambda values: round(sum(values) / len(values), precision)),
			('max', max),
		]
		print('\t' + '\t'.join(label for label, _ in funcs))
		for column in columns:
			values = [float(row[column_indices[column]]) for row in rows]
			print('\t'.join([column] + [str(func(values)) for _, func in funcs]))