-
-
Save kimusan/db7b74ec9f413afd990ed58c2394011b to your computer and use it in GitHub Desktop.
Clean CSV files with multiline values in mixed quotation formats.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
import csv
import sys
def _strip_inner_quotes(row):
    """Re-quote the first ';'-separated field of *row* holding more than two quotes.

    All double quotes inside that field are removed, every literal ``\\n``
    marker in it is replaced by a space, and the result is wrapped in a
    single pair of double quotes. Only the first such field is rewritten;
    all other fields are returned unchanged.
    """
    parts = row.split(';')
    for i, part in enumerate(parts):
        if part.count('"') > 2:
            parts[i] = '"' + part.replace('"', '').replace("\\n", " ") + '"'
            break
    return ';'.join(parts)


def clean_multiline_csv(input_file, output_file):
    """Collapse multi-line records of a ';'-separated file onto single lines.

    A physical line containing an odd number of double quotes is treated as
    the start of a record that continues on the following lines. The
    continuation lines are stripped of surrounding whitespace and appended,
    joined by the two-character marker ``\\n`` (a backslash followed by
    ``n``, not a real newline), until the total number of double quotes in
    the record is even. Once a multi-line record is complete, the first field
    containing more than two double quotes has its quotes removed, its
    ``\\n`` markers replaced by spaces, and is wrapped in one pair of quotes.
    Lines that are complete on their own are copied through unchanged.

    Args:
        input_file: Path of the UTF-8 text file to read.
        output_file: Path of the UTF-8 text file to write, one record per
            line, each terminated by ``\\n``.

    Raises:
        OSError: If either file cannot be opened.
    """
    cleaned_rows = []
    current_row = ""
    inside_multiline = False

    with open(input_file, encoding='utf-8') as infile:
        for line in infile:
            line = line.rstrip('\n')
            if not inside_multiline:
                current_row = line
                # An odd quote count means a quoted field is still open.
                inside_multiline = current_row.count('"') % 2 == 1
            else:
                current_row += "\\n" + line.strip()
                if current_row.count('"') % 2 == 0:
                    inside_multiline = False
                    current_row = _strip_inner_quotes(current_row)
            if not inside_multiline:
                cleaned_rows.append(current_row)

    # A record still open at end of file would otherwise be lost silently;
    # keep what was collected so no input data disappears.
    if inside_multiline:
        cleaned_rows.append(current_row)

    # Rows are written only after the input is fully read and closed, so
    # passing the same path as input and output remains safe.
    with open(output_file, 'w', encoding='utf-8', newline='') as outfile:
        outfile.writelines(row + '\n' for row in cleaned_rows)
if __name__ == "__main__":
    # Script entry point. Optional positional arguments override the paths:
    #   argv[1] -> input file  (default 'input.csv')
    #   argv[2] -> output file (default 'output.csv')
    # With no arguments the behavior matches the original hard-coded call.
    source_path = sys.argv[1] if len(sys.argv) > 1 else 'input.csv'
    target_path = sys.argv[2] if len(sys.argv) > 2 else 'output.csv'
    clean_multiline_csv(source_path, target_path)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment