Skip to content

Instantly share code, notes, and snippets.

@khous
Created May 20, 2018 21:20
Show Gist options
  • Save khous/6b66acdd15274b2614640d3cecfd6ea0 to your computer and use it in GitHub Desktop.
Save khous/6b66acdd15274b2614640d3cecfd6ea0 to your computer and use it in GitHub Desktop.
Removes lines of a csv which include characters outside of a certain range
#! /usr/bin/env python3
# So far lab events and output events are fucked up
import argparse
import os
import sys
# Take input file
# stream correct output
def parse_args(argv=None):
    """Parse the command-line arguments for the CSV repair script.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process arguments. When ``None`` (the default), argparse reads
            ``sys.argv[1:]``, which is what the script entry point relies on.

    Returns:
        The ``argparse.Namespace`` with a single attribute, ``input_file``
        (the path given as the only positional argument).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "input_file",
        type=str,
        help="Only argument, this script returns a stream you should point at where you want the repaired csv to go.",
    )
    return parser.parse_args(argv)
def discover_dimensions(first_line):
    """Discover the expected dimensionality of a CSV from its first line.

    The value returned is the number of commas in ``first_line`` (that is,
    one less than the number of fields). Commas inside quoted fields are
    counted like any other comma.

    Args:
        first_line: The first line of the file, as a ``str``.

    Returns:
        The number of ``","`` characters in ``first_line``.
    """
    return first_line.count(",")
# When we encounter a byte that is neither printable ASCII (32-126) nor a
# line feed, the line it appears on is considered invalid.
# An invalid line is dropped in its entirety, as is any line whose comma
# count differs from the expected one.
# Output resumes with the next line, so valid lines pass through unchanged.
def validate_line():
    """Placeholder that takes no arguments, does nothing and returns ``None``."""
def is_valid_utf8(char):
    """Report whether ``ord`` accepts ``char``.

    Despite the name, no UTF-8 decoding is attempted: the check only
    succeeds when ``char`` is something ``ord`` can convert, i.e. a string
    or bytes-like value of length one.

    Args:
        char: The value to test.

    Returns:
        ``True`` if ``ord(char)`` succeeds, ``False`` if it raises
        ``TypeError`` (wrong type, or length other than one).
    """
    try:
        ord(char)
    except TypeError:
        return False
    return True
def parse_file(input_file, columns):
    """Stream the valid lines of a binary CSV file to standard output.

    A line is written to ``sys.stdout`` unchanged when both of these hold:

    * every byte before its line feed is printable ASCII (32-126), and
    * it contains exactly ``columns`` commas.

    Any other line is dropped. For each line dropped because of an
    out-of-range byte, a message with its 1-based line number is written to
    ``sys.stderr``. Lines dropped only for a comma-count mismatch are
    skipped silently.

    A final line that is not terminated by a line feed is held to the same
    rules and written as-is (no line feed is appended).

    NOTE: a carriage return (byte 13) is outside the accepted range, so
    every line of a CRLF-terminated file is rejected.

    Args:
        input_file: A file object opened in binary mode. It must provide
            ``readable()`` and yield ``bytes`` lines when iterated.
        columns: The number of commas each valid line must contain.

    Returns:
        None. Output goes to ``sys.stdout`` and diagnostics to
        ``sys.stderr``.
    """
    if not input_file.readable():
        return

    # Deleting every accepted byte leaves only the offending ones, so a
    # non-empty result marks the line as invalid without a Python-level loop.
    printable_ascii = bytes(range(32, 127))

    for line_number, raw_line in enumerate(input_file, start=1):
        # Only the final line of a file can lack the trailing line feed.
        content = raw_line[:-1] if raw_line.endswith(b"\n") else raw_line

        if content.translate(None, printable_ascii):
            sys.stderr.write(
                "Invalid characters on line: " + str(line_number) + "\n"
            )
            continue

        if content.count(b",") != columns:
            continue

        # Safe to decode as ASCII: the line holds only bytes 32-126 and LF.
        sys.stdout.write(raw_line.decode("ascii"))

    sys.stdout.flush()
def main():
    """Run the repair: read the input path from the command line and stream
    the valid lines of that file to standard output.

    The file is opened once, in binary mode, and closed when processing
    ends. The expected comma count is taken from the first line, after
    which the file is rewound so that the first line is filtered like any
    other.
    """
    args = parse_args()
    with open(args.input_file, "rb") as input_file:
        # Decode leniently: a strict text-mode read can raise
        # UnicodeDecodeError on the very files this script is meant to
        # repair. Replacement characters never add or remove commas.
        first_line = input_file.readline().decode("utf-8", errors="replace")
        column_count = discover_dimensions(first_line)
        input_file.seek(0)
        parse_file(input_file, column_count)
# Run the repair only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment