Last active
April 23, 2024 15:27
-
-
Save do-me/70f77ba75629bd41697a31c9f9d8fed1 to your computer and use it in GitHub Desktop.
Replace any run of whitespace characters with a single space in Python for data cleaning (useful for XML/HTML parsing)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re
# Default replaces white spaces, tabs, line breaks etc.
def replace_multiple_whitespaces(text: str) -> str:
    r"""Collapse every run of whitespace in ``text`` into a single space.

    The pattern ``\s+`` matches spaces, tabs and line breaks alike, so line
    breaks are removed as well. Leading and trailing whitespace is collapsed
    to one space each, not stripped.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` in which each whitespace run is one ``' '``.
    """
    # Use re.sub(r'[ \t]+', ' ', text) instead if line breaks should be preserved.
    return re.sub(r'\s+', ' ', text)
# Use this function if you want to preserve exactly one line break and remove the rest like above
def replace_multiple_whitespaces_keep_one_linebreak(text: str) -> str:
    r"""Collapse whitespace runs while keeping a single line break per run.

    Two passes are applied:

    1. Any spaces/tabs directly before a line break (``\n`` or ``\r\n``),
       the line break itself, and all spaces, tabs, ``\r`` and ``\n`` that
       follow it are replaced by one ``'\n'``.
    2. Every remaining run of spaces and tabs is replaced by one ``' '``.

    Whitespace characters other than space, tab, ``\r`` and ``\n`` (for
    example form feed) are not matched by either pattern and stay as they are.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string, with ``'\n'`` as the only line-break form
        produced by the first pass.
    """
    # Pass 1: squeeze each whitespace run containing a line break into one '\n'.
    text = re.sub(r'[ \t]*\r?\n[ \t\r\n]*', '\n', text)
    # Replace one or more spaces or tabs with a single space (for remaining white spaces)
    text = re.sub(r'[ \t]+', ' ', text)
    return text
# Example usage:
original_text = 'This is a text \n \n \n with multiple \n \t white\tspaces.'
print(original_text)

# The results are printed via repr() so that any remaining line breaks show up
# as '\n' and the output can be compared directly with the comments below.
print("Version 1")
print(repr(replace_multiple_whitespaces(original_text)))
# 'This is a text with multiple white spaces.'

print("Version 2")
print(repr(replace_multiple_whitespaces_keep_one_linebreak(original_text)))
# 'This is a text\nwith multiple\nwhite spaces.'
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment