khous/csv-repair.py

## csv-repair.py
#! /usr/bin/env python3
# Known so far: lab events and output events are corrupted
import argparse
import os
import sys
# Take input file
# stream correct output

def parse_args():
    """Parse the command line and return the resulting namespace.

    The only argument is the positional ``input_file``, the path of the CSV
    to repair; it is exposed as ``.input_file`` on the returned namespace.
    """
    cli = argparse.ArgumentParser()
    input_file_options = {
        "type": str,
        "help": "Only argument, this script returns a stream you should point at where you want the repaired csv to go.",
    }
    cli.add_argument("input_file", **input_file_options)
    return cli.parse_args()

# Discover the expected dimensionality of this csv
# based on the first line of the file
def discover_dimensions(first_line):
    """Return how many commas *first_line* contains.

    The count is taken over the raw characters, so a comma inside a quoted
    field is counted like any other. Note the result is the number of
    separators, not the number of fields.
    """
    return sum(1 for symbol in first_line if symbol == ",")

# When we encounter a byte that is neither printable ASCII (32-126) nor LF
# (this includes CR), we begin discarding input until we reach the next LF.
# So the output equals the input as long as every line is clean;
# a line containing such a byte is omitted and output resumes with the next line.

def validate_line():
    """Placeholder with no behavior yet; takes no arguments and returns None."""
    return None

def is_valid_utf8 (char):
    """Return True if *char* is a single character or byte that is valid UTF-8.

    Args:
        char: a ``str``, ``bytes`` or ``bytearray`` expected to have length 1.

    Returns:
        False when *char* is not a length-1 str/bytes/bytearray, when a byte
        value cannot be decoded as UTF-8 on its own (e.g. ``b"\\xff"``), or
        when a character cannot be encoded as UTF-8 (a lone surrogate);
        True otherwise.
    """
    try:
        # ord() rejects other types and anything not exactly one item long.
        ord(char)
    except TypeError:
        return False

    # ord() alone accepts every byte value 0-255 and every code point, so the
    # actual UTF-8 check has to go through the codec.
    try:
        if isinstance(char, (bytes, bytearray)):
            char.decode("utf-8")
        else:
            char.encode("utf-8")
    except UnicodeError:
        return False
    return True

def parse_file (input_file, columns):
    """Stream the well-formed lines of *input_file* to stdout, dropping the rest.

    A line is kept only if every byte before its LF is printable ASCII
    (32-126) and it contains exactly *columns* commas. Kept lines are written
    to ``sys.stdout`` unchanged. The first offending byte of a line is reported
    once on ``sys.stderr`` (1-based line number); lines with the wrong number
    of commas are dropped silently. A final line without a trailing LF is
    judged by the same rules and, if kept, written without an added LF.

    Args:
        input_file: binary file-like object providing ``readable()`` and
            ``read(size)``.
        columns: number of commas every kept line must contain.
    """
    newline, comma = 10, 44  # byte values of "\n" and ","
    chunk_size = 65536  # read in blocks instead of one byte per call

    valid = True  # False from the first bad byte until the next LF
    line_number = 1
    commas = 0
    pending = bytearray()  # bytes of the current line accepted so far

    while input_file.readable():
        chunk = input_file.read(chunk_size)
        if not chunk:  # empty read is the halt condition / EOF
            break

        for byte in chunk:
            if byte == newline:
                if valid and commas == columns:
                    pending.append(byte)
                    sys.stdout.write(pending.decode("ascii"))
                # A line break always starts a fresh, tentatively valid line.
                pending.clear()
                commas = 0
                valid = True
                line_number += 1
            elif not valid:
                continue  # skipping the remainder of a rejected line
            elif 32 <= byte <= 126:
                if byte == comma:
                    commas += 1
                pending.append(byte)
            else:
                # Report on the valid -> invalid transition so each bad line
                # is mentioned exactly once.
                valid = False
                pending.clear()
                sys.stderr.write("Invalid characters on line: " + str(line_number) + "\n")

    # Input that does not end in LF still carries a last line worth keeping.
    if valid and pending and commas == columns:
        sys.stdout.write(pending.decode("ascii"))
    sys.stdout.flush()
def main():
    """Repair the CSV named on the command line, writing the result to stdout.

    The expected comma count is taken from the file's first line; the whole
    file, first line included, is then filtered by ``parse_file``.
    """
    args = parse_args()

    # Binary mode throughout: strict text-mode decoding can raise
    # UnicodeDecodeError on the very bytes this tool exists to filter out.
    # The with-block also guarantees the handle is closed.
    with open(args.input_file, "rb") as input_file:
        first_line = input_file.readline().decode("utf-8", errors="replace")
        column_count = discover_dimensions(first_line)

        # Rewind so the first line is validated and emitted like any other.
        input_file.seek(0)
        parse_file(input_file, column_count)


# Run the repair only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
	#! /usr/bin/env python3
	# Known so far: lab events and output events are corrupted
	import argparse
	import os
	import sys
	# Take input file
	# stream correct output

	def parse_args():
	    """Parse the command line and return the resulting namespace.

	    The only argument is the positional ``input_file``, the path of the CSV
	    to repair; it is exposed as ``.input_file`` on the returned namespace.
	    """
	    parser = argparse.ArgumentParser()
	    parser.add_argument(
	        "input_file",
	        type=str,
	        help="Only argument, this script returns a stream you should point at where you want the repaired csv to go."
	    )

	    return parser.parse_args()

	# Discover the expected dimensionality of this csv
	# based on the first line of the file
	def discover_dimensions(first_line):
	    """Return how many commas *first_line* contains.

	    The count is taken over the raw characters, so a comma inside a quoted
	    field is counted like any other. Note the result is the number of
	    separators, not the number of fields.
	    """
	    comma_count = 0
	    for c in first_line:
	        if c == ",":
	            comma_count += 1

	    return comma_count

	# When we encounter a byte that is neither printable ASCII (32-126) nor LF
	# (this includes CR), we begin discarding input until we reach the next LF.
	# So the output equals the input as long as every line is clean;
	# a line containing such a byte is omitted and output resumes with the next line.

	def validate_line():
	    """Placeholder with no behavior yet; takes no arguments and returns None."""
	    pass

	def is_valid_utf8 (char):
	    """Return True if *char* is a single character or byte that is valid UTF-8.

	    Args:
	        char: a ``str``, ``bytes`` or ``bytearray`` expected to have length 1.

	    Returns:
	        False when *char* is not a length-1 str/bytes/bytearray, when a byte
	        value cannot be decoded as UTF-8 on its own (e.g. ``b"\\xff"``), or
	        when a character cannot be encoded as UTF-8 (a lone surrogate);
	        True otherwise.
	    """
	    try:
	        # ord() rejects other types and anything not exactly one item long.
	        ord(char)
	    except TypeError:
	        return False

	    # ord() alone accepts every byte value 0-255 and every code point, so
	    # the actual UTF-8 check has to go through the codec.
	    try:
	        if isinstance(char, (bytes, bytearray)):
	            char.decode("utf-8")
	        else:
	            char.encode("utf-8")
	    except UnicodeError:
	        return False
	    return True

	def parse_file (input_file, columns):
	    """Stream the well-formed lines of *input_file* to stdout, dropping the rest.

	    A line is kept only if every byte before its LF is printable ASCII
	    (32-126) and it contains exactly *columns* commas. Kept lines are written
	    to ``sys.stdout`` unchanged. The first offending byte of a line is
	    reported once on ``sys.stderr`` (1-based line number); lines with the
	    wrong number of commas are dropped silently. A final line without a
	    trailing LF is judged by the same rules and, if kept, written without an
	    added LF.

	    Args:
	        input_file: binary file-like object providing ``readable()`` and
	            ``read(size)``.
	        columns: number of commas every kept line must contain.
	    """
	    newline, comma = 10, 44  # byte values of "\n" and ","
	    chunk_size = 65536  # read in blocks instead of one byte per call

	    valid = True  # False from the first bad byte until the next LF
	    line_number = 1
	    commas = 0
	    pending = bytearray()  # bytes of the current line accepted so far

	    while input_file.readable():
	        chunk = input_file.read(chunk_size)
	        if not chunk:  # empty read is the halt condition / EOF
	            break

	        for byte in chunk:
	            if byte == newline:
	                if valid and commas == columns:
	                    pending.append(byte)
	                    sys.stdout.write(pending.decode("ascii"))
	                # A line break always starts a fresh, tentatively valid line.
	                pending.clear()
	                commas = 0
	                valid = True
	                line_number += 1
	            elif not valid:
	                continue  # skipping the remainder of a rejected line
	            elif 32 <= byte <= 126:
	                if byte == comma:
	                    commas += 1
	                pending.append(byte)
	            else:
	                # Report on the valid -> invalid transition so each bad
	                # line is mentioned exactly once.
	                valid = False
	                pending.clear()
	                sys.stderr.write("Invalid characters on line: " + str(line_number) + "\n")

	    # Input that does not end in LF still carries a last line worth keeping.
	    if valid and pending and commas == columns:
	        sys.stdout.write(pending.decode("ascii"))
	    sys.stdout.flush()
	def main():
	    """Repair the CSV named on the command line, writing the result to stdout.

	    The expected comma count is taken from the file's first line; the whole
	    file, first line included, is then filtered by ``parse_file``.
	    """
	    args = parse_args()

	    # Binary mode throughout: strict text-mode decoding can raise
	    # UnicodeDecodeError on the very bytes this tool exists to filter out.
	    # The with-block also guarantees the handle is closed.
	    with open(args.input_file, "rb") as input_file:
	        first_line = input_file.readline().decode("utf-8", errors="replace")
	        column_count = discover_dimensions(first_line)

	        # Rewind so the first line is validated and emitted like any other.
	        input_file.seek(0)
	        parse_file(input_file, column_count)


	# Run the repair only when this file is executed as a script, not on import.
	if __name__ == '__main__':
	    main()