Created
May 20, 2018 21:20
-
-
Save khous/6b66acdd15274b2614640d3cecfd6ea0 to your computer and use it in GitHub Desktop.
Removes lines of a csv which include characters outside of a certain range
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python3 | |
# So far lab events and output events are fucked up | |
import argparse | |
import os | |
import sys | |
# Take input file | |
# stream correct output | |
def parse_args(argv=None):
    """Parse the command-line arguments of the CSV repair script.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads ``sys.argv[1:]``, which is the
            behaviour callers relied on before this parameter existed.

    Returns:
        argparse.Namespace: namespace whose ``input_file`` attribute is the
        path given as the single positional argument.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "input_file",
        type=str,
        help="Only argument, this script returns a stream you should point at where you want the repaired csv to go.",
    )
    return parser.parse_args(argv)
# Discover the expected dimensionality of this csv | |
# based on the first line of the file | |
def discover_dimensions(first_line):
    """Return the number of commas in the first line of the CSV.

    The value is the comma count, not the column count: a header with three
    fields yields 2.

    Args:
        first_line: The first line of the file, as ``str`` or as
            ``bytes``/``bytearray``.

    Returns:
        int: how many ``,`` separators occur in ``first_line``.
    """
    # Pick a separator of the same type as the input so binary lines work too.
    separator = b"," if isinstance(first_line, (bytes, bytearray)) else ","
    return first_line.count(separator)
# When we encounter a byte outside printable ASCII (32-126) that is not a line feed,
# we stop accepting input until we reach the next line feed.
# So the output equals the input as long as every line stays inside that range;
# a line containing any other byte is omitted in full, and output resumes with the next line.
def validate_line():
    """Placeholder for per-line validation; it currently does nothing."""
    return None
def is_valid_utf8(char):
    """Report whether ``ord`` accepts ``char``.

    Despite the name, no UTF-8 decoding happens here: the result is ``True``
    when ``ord(char)`` succeeds and ``False`` when it raises ``TypeError``.
    """
    try:
        ord(char)
    except TypeError:
        return False
    else:
        return True
# Bytes allowed inside an output line: printable ASCII, 32 through 126.
_PRINTABLE_ASCII = bytes(range(32, 127))


def _has_only_printable_ascii(body):
    """Return True when every byte of ``body`` is printable ASCII (32-126)."""
    # Deleting every accepted byte leaves only the offending ones behind.
    return not body.translate(None, _PRINTABLE_ASCII)


def parse_file(input_file, columns):
    """Copy the acceptable lines of a binary CSV stream to ``sys.stdout``.

    A line is written unchanged when every byte before its line feed is
    printable ASCII (32-126) and it contains exactly ``columns`` commas.
    A line with any other byte is dropped and reported on ``sys.stderr``
    with its 1-based line number. A line with the wrong number of commas is
    dropped silently.

    A final line that lacks a trailing line feed is judged by the same rules
    and, when acceptable, written without a line feed being added.

    NOTE(review): a carriage return (13) is outside the accepted range, so
    CRLF-terminated lines are rejected -- confirm whether that is intended.

    Args:
        input_file: File object opened in binary mode; iterated line by line.
        columns: Number of commas every emitted line must contain.

    Returns:
        None
    """
    for line_number, raw_line in enumerate(input_file, start=1):
        # Strip only the terminator; it is not part of the validated content.
        body = raw_line[:-1] if raw_line.endswith(b"\n") else raw_line
        if not _has_only_printable_ascii(body):
            sys.stderr.write("Invalid characters on line: " + str(line_number) + "\n")
            continue
        if body.count(b",") != columns:
            continue
        # Safe to decode as ASCII: every byte was checked above.
        sys.stdout.write(raw_line.decode("ascii"))
    sys.stdout.flush()
def main():
    """Repair the CSV named on the command line, writing the result to stdout.

    The expected comma count is taken from the file's first line, then the
    file is re-read in binary mode and filtered by ``parse_file``.
    """
    args = parse_args()
    # errors="replace" keeps a corrupted header from raising UnicodeDecodeError
    # before any repair can happen; commas are ASCII, so the count is unaffected.
    with open(args.input_file, "r", encoding="utf-8", errors="replace") as header_file:
        column_count = discover_dimensions(header_file.readline())
    # Context managers close both handles even if filtering raises.
    with open(args.input_file, "rb") as input_file:
        parse_file(input_file, column_count)
# Run the repair only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment