Skip to content

Instantly share code, notes, and snippets.

@loretoparisi
Last active June 5, 2019 08:52
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save loretoparisi/56b140cc3c1e12b325cdb3cd7a17a564 to your computer and use it in GitHub Desktop.
Save loretoparisi/56b140cc3c1e12b325cdb3cd7a17a564 to your computer and use it in GitHub Desktop.
De-hyphenate a stuttered, hyphenated word into its final matched word - Repl: https://repl.it/@loretoparisi/DeHyphen
import re
# Demo input: two stutter styles ("oh-oh-oh", "c-c-c-c-come") plus a genuine
# compound ("fine-tuning") that should ideally survive de-hyphenation.
text = "oh-oh-oh c-c-c-c-come to home today c-c-c-c-come to me fine-tuning"

# Naive approach: keep only the last part of every hyphenated token. This also
# truncates genuine compounds ("fine-tuning" becomes "tuning").
print(re.sub(r'(\w+(?:-))+(\w+)', r'\2', text))

# Stricter approach: the token must be whitespace-delimited and start with a
# unit of at most three word characters that may repeat before the final word.
# "fine-tuning" is left alone because "fine" is longer than three characters.
print(re.sub(r'(?<!\S)(\w{1,3})(?:-\1)*-(\w+)(?!\S)', r'\2', text))

# List every token the stricter pattern matches, with its character offset.
pattern = r"(?<!\S)(\w{1,3})(?:-\1)*-(\w+)(?!\S)"
r = re.compile(pattern, flags=re.I | re.X | re.UNICODE)
for m in r.finditer(text):
    word = m.group()
    characterOffsetBegin = m.start()
    print(word, characterOffsetBegin)

# Step-by-step clean-up, narrowing the leading unit to a single character
# first: this collapses "c-c-c-c-come" but leaves "oh-oh-oh" untouched.
pattern = r"(?<!\S)(\w{1,1})(?:-\1)*-(\w+)(?!\S)"
text = re.sub(pattern, r'\2', text)
print(text)

# c-c-come
# Same single-character pass again; nothing is left for it to match, so the
# text is printed unchanged.
pattern = r"(?<!\S)(\w{1,1})(?:-\1)*-(\w+)(?!\S)"
text = re.sub(pattern, r'\2', text)
print(text)

# oh-oh-oh
# Allow a two-character unit so that "oh-oh-oh" collapses to "oh".
pattern = r"(?<!\S)(\w{1,2})(?:-\1)*-(\w+)(?!\S)"
text = re.sub(pattern, r'\2', text)
print(text)
def de_hyphen(s):
    """Collapse stuttered, hyphen-joined tokens to their final word.

    A token is rewritten only when all of the following hold:

    * it is delimited by whitespace or the string boundaries;
    * it starts with a unit of one or two word characters;
    * that unit is optionally repeated, separated by hyphens;
    * the last hyphen-separated part itself begins with the unit.

    Examples: ``c-c-come`` -> ``come`` and ``oh-oh-oh`` -> ``oh``.

    Requiring the last part to begin with the repeated unit keeps short
    genuine compounds intact (``e-mail``, ``to-do``, ``x-ray``); without it
    their one- or two-character prefix would be stripped.

    Args:
        s: Text to clean.

    Returns:
        The text with every stuttered token replaced by its last
        hyphen-separated part.

    TODO: words like ``pro-tip``, where the hyphenated part carries no
    repeating sub-word (unlike ``oh-oh`` or ``c-c-come``), are left
    unchanged. Tokens with punctuation attached (``c-c-come,``) are also
    skipped because the match must end at whitespace or end of string.
    """
    # The first hyphen-separated part is always the whole of group 1, so a
    # single {1,2} pass covers both the one-character (c-c-come) and the
    # two-character (oh-oh-oh) stutter.
    pattern = r"(?<!\S)(\w{1,2})(?:-\1)*-(\1\w*)(?!\S)"
    return re.sub(pattern, r"\2", s)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment