Last active
June 5, 2019 08:52
-
-
Save loretoparisi/56b140cc3c1e12b325cdb3cd7a17a564 to your computer and use it in GitHub Desktop.
De-hyphenate stuttered, hyphen-joined words (e.g. c-c-come, oh-oh-oh) to the final matched word - Repl: https://repl.it/@loretoparisi/DeHyphen
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
# Demo sentence: two kinds of stuttered tokens ("oh-oh-oh", "c-c-c-c-come")
# sitting next to a genuine hyphenated compound ("fine-tuning").
text = "oh-oh-oh c-c-c-c-come to home today c-c-c-c-come to me fine-tuning"

# Naive attempt: every hyphenated run is reduced to its last piece, so
# "fine-tuning" is cut down to "tuning" as well.
print(re.sub(r'(\w+(?:-))+(\w+)', '\\2', text))

# Stricter attempt: the token must be whitespace-delimited and start with a
# 1-3 character prefix; "fine-tuning" (4-character prefix) is left alone.
print(re.sub(r'(?<!\S)(\w{1,3})(?:-\1)*-(\w+)(?!\S)', '\\2', text))

# Earlier idea, kept for reference only: r"(?<=-)\w+(?=[^-\w])"
pattern = r"(?<!\S)(\w{1,3})(?:-\1)*-(\w+)(?!\S)"
r = re.compile(pattern, flags=re.I | re.X | re.UNICODE)

# Report each stuttered token together with its character offset in `text`.
for m in r.finditer(text):
    word, characterOffsetBegin = m.group(), m.start()
    print(word, characterOffsetBegin)

# Apply the substitutions in sequence, printing the text after each step.
for pattern in (
    r"(?<!\S)(\w{1,1})(?:-\1)*-(\w+)(?!\S)",  # c-c-come
    r"(?<!\S)(\w{1,1})(?:-\1)*-(\w+)(?!\S)",  # c-c-come (repeated pass)
    r"(?<!\S)(\w{1,2})(?:-\1)*-(\w+)(?!\S)",  # oh-oh-oh
):
    text = re.sub(pattern, r'\2', text)
    print(text)
def de_hyphen(s, max_prefix_len=2):
    """Collapse stuttered, hyphen-joined tokens to their final word.

    A token is rewritten when it is delimited by whitespace (or the string
    boundaries) and has the shape ``X-X-...-W``: a prefix ``X`` of 1 to
    ``max_prefix_len`` word characters, optionally repeated, then a hyphen
    and a final hyphen-free word ``W``. The whole token is replaced by ``W``.

    >>> de_hyphen("c-c-come to me")
    'come to me'
    >>> de_hyphen("oh-oh-oh")
    'oh'
    >>> de_hyphen("fine-tuning")
    'fine-tuning'

    Args:
        s: text to clean up.
        max_prefix_len: longest stuttered prefix (in word characters) that
            is recognised. The default of 2 covers ``c-c-come`` and
            ``oh-oh-oh``.

    Returns:
        The text with every matching token replaced by its final word.

    Raises:
        ValueError: if ``max_prefix_len`` is smaller than 1.

    Known limitations:
        * The repetition is optional, so a short-prefixed compound such as
          ``e-mail`` is also reduced (to ``mail``).
        * Tokens whose first part is longer than ``max_prefix_len``
          (``pro-tip``, ``fine-tuning``) are left unchanged.
        * Matching is case-sensitive (``C-c-come`` is not rewritten) and a
          token directly followed by punctuation is not matched.
    """
    if max_prefix_len < 1:
        raise ValueError("max_prefix_len must be >= 1")

    # One pass is enough: \w{1,N} already accepts a single-character prefix,
    # and every match spans a whole whitespace-delimited token, so a separate
    # {1,1} pass beforehand cannot change the result.
    pattern = r"(?<!\S)(\w{1,%d})(?:-\1)*-(\w+)(?!\S)" % max_prefix_len
    return re.sub(pattern, r'\2', s)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment