Last active
September 18, 2020 14:07
-
-
Save srmds/523d12bdf1ce74be3c5944a88a133926 to your computer and use it in GitHub Desktop.
Replace SOH control character field separator (Hadoop) with semicolon in CSV file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# replace_soh_separator.py
r"""Replace the SOH field separator of a CSV file with a semicolon.

The SOH control character (^A, \x01) is often used in Hadoop as the field
separator in CSV files; this script rewrites such a file using a
semicolon (;) instead.
https://en.wikipedia.org/wiki/C0_and_C1_control_codes#SOH

Note: one can also use sed replace:
https://stackoverflow.com/questions/13180057/replacing-control-character-in-sed
"""
from pathlib import Path
import logging

logger = logging.getLogger(__name__)

# Delimiter / separator to convert to
DELIMITER = ";"
# Control character (code point 1) delimiter to replace
SOH = "\x01"  # aka ^A


def replace_separator(input_path, output_path, old=SOH, new=DELIMITER, encoding="utf-8"):
    """Copy ``input_path`` to ``output_path``, replacing ``old`` with ``new``.

    The input is streamed line by line, so the whole file is never held in
    memory at once.

    Args:
        input_path: Path of the file to read.
        output_path: Path of the file to create (overwritten if it exists).
        old: Separator text to replace; defaults to the SOH character.
        new: Replacement separator; defaults to a semicolon.
        encoding: Text encoding used for both files.

    Raises:
        OSError: If either file cannot be opened, read or written.
        UnicodeDecodeError: If the input is not valid in ``encoding``.
    """
    # newline="" disables newline translation on both sides, so the original
    # line endings (\n or \r\n) are written back unchanged.
    with open(input_path, "rt", encoding=encoding, newline="") as file_in:
        with open(output_path, "wt", encoding=encoding, newline="") as file_out:
            for line in file_in:
                file_out.write(line.replace(old, new))


def main():
    """Convert ``data/input.csv`` into ``data/input_converted.csv``."""
    # Without a configured handler/level the INFO messages below are dropped.
    logging.basicConfig(level=logging.INFO)

    # Setup path reference to input/output file (edit to reference correct path)
    data_folder = Path("data")  # set your reference directory
    file_input_path = data_folder / "input.csv"  # set your reference input file
    file_output_path = data_folder / "input_converted.csv"  # set your reference output file

    # %r renders the otherwise invisible control character as '\x01'.
    logger.info(
        "Reading input file %s to replace all occurrences of %r with delimiter %r",
        file_input_path,
        SOH,
        DELIMITER,
    )
    replace_separator(file_input_path, file_output_path)
    logger.info("Output file is written to: %s", file_output_path)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment