Skip to content

Instantly share code, notes, and snippets.

@dynamicguy
Created April 10, 2023 11:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dynamicguy/afc2147b79bbd283a7a91b53eca5e299 to your computer and use it in GitHub Desktop.
Save dynamicguy/afc2147b79bbd283a7a91b53eca5e299 to your computer and use it in GitHub Desktop.
remove standard noise from text
def text_cleaner(text):
rules = [
{r'>\s+': u'>'}, # remove spaces after a tag opens or closes
{r'\s+': u' '}, # replace consecutive spaces
{r'\s*<br\s*/?>\s*': u'\n'}, # newline after a <br>
{r'</(div)\s*>\s*': u'\n'}, # newline after </p> and </div> and <h1/>...
{r'</(p|h\d)\s*>\s*': u'\n\n'}, # newline after </p> and </div> and <h1/>...
{r'<head>.*<\s*(/head|body)[^>]*>': u''}, # remove <head> to </head>
{r'<a\s+href="([^"]+)"[^>]*>.*</a>': r'\1'}, # show links instead of texts
{r'[ \t]*<[^<]*?/?>': u''}, # remove remaining tags
{r'^\s+': u''} # remove spaces at the beginning
]
for rule in rules:
for (k, v) in rule.items():
regex = re.compile(k)
text = regex.sub(v, text)
text = text.rstrip()
return text.lower()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment