Skip to content

Instantly share code, notes, and snippets.

@kimusan
Created April 15, 2025 14:15
Show Gist options
  • Save kimusan/db7b74ec9f413afd990ed58c2394011b to your computer and use it in GitHub Desktop.
Save kimusan/db7b74ec9f413afd990ed58c2394011b to your computer and use it in GitHub Desktop.
Clean CSV files with multiline values in mixed quotation formats.
#!/usr/bin/env python3
import csv
def _strip_inner_quotes(row, delimiter=';'):
    """Normalise the first field of *row* that contains stray inner quotes.

    The row is split on *delimiter*; the first field holding more than two
    double quotes has every quote removed, every literal ``\\n`` marker
    (backslash + ``n``) replaced by a space, and is then re-wrapped in a
    single pair of double quotes. Only that first offending field is touched.
    """
    fields = row.split(delimiter)
    for index, field in enumerate(fields):
        if field.count('"') > 2:
            flattened = field.replace('"', '').replace("\\n", " ")
            fields[index] = '"' + flattened + '"'
            break
    return delimiter.join(fields)


def clean_multiline_csv(input_file, output_file):
    """Collapse multi-line quoted records of a ';'-separated file to one line.

    A physical line with an odd number of double quotes opens a record that
    continues on the following lines until the quotes balance again. The
    continuation lines are stripped of surrounding whitespace and glued on
    with a literal ``\\n`` marker (backslash + ``n``, not a newline); the
    merged record then goes through ``_strip_inner_quotes``. Lines whose
    quotes are already balanced are copied unchanged.

    Args:
        input_file: Path of the UTF-8 file to read.
        output_file: Path of the UTF-8 file to write. All input is read
            before the output is opened, so it may equal *input_file*.
    """
    cleaned_rows = []
    pieces = []  # physical lines of the record currently being assembled
    with open(input_file, encoding='utf-8') as infile:
        for line in infile:
            line = line.rstrip('\n')
            # The accumulated record flips between "open" and "balanced"
            # exactly when the new line carries an odd number of quotes, so
            # there is no need to re-count the whole record on every line.
            odd_quotes = line.count('"') % 2 == 1
            if not pieces:
                if odd_quotes:
                    pieces.append(line)
                else:
                    cleaned_rows.append(line)
                continue
            # strip() only removes whitespace, so the quote count is the same.
            pieces.append(line.strip())
            if odd_quotes:
                cleaned_rows.append(_strip_inner_quotes("\\n".join(pieces)))
                pieces = []
    if pieces:
        # The file ended inside an unterminated quoted field. Emit what was
        # collected instead of silently dropping the trailing record.
        cleaned_rows.append("\\n".join(pieces))
    with open(output_file, 'w', encoding='utf-8', newline='') as outfile:
        outfile.writelines(row + '\n' for row in cleaned_rows)
if __name__ == "__main__":
    # Usage: script.py [INPUT [OUTPUT]]
    # Without arguments the script keeps its historical behaviour of reading
    # 'input.csv' and writing 'output.csv' in the current directory.
    import sys

    cli_args = sys.argv[1:]
    source_path = cli_args[0] if len(cli_args) > 0 else 'input.csv'
    target_path = cli_args[1] if len(cli_args) > 1 else 'output.csv'
    clean_multiline_csv(source_path, target_path)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment