Skip to content

Instantly share code, notes, and snippets.

@dreulavelle
Last active April 8, 2024 04:40
Show Gist options
  • Save dreulavelle/44c36d47ffc99e8315b00ad8379ff58a to your computer and use it in GitHub Desktop.
Save dreulavelle/44c36d47ffc99e8315b00ad8379ff58a to your computer and use it in GitHub Desktop.
Clean Title
import regex
# Character-class body (no surrounding brackets) listing the Unicode ranges
# this module treats as "non-English".
_NON_ENGLISH_CHARS = (
    "\u3040-\u30ff"  # Hiragana and Katakana
    "\u3400-\u4dbf"  # CJK Unified Ideographs Extension A
    "\u4e00-\u9fff"  # CJK Unified Ideographs
    "\uf900-\ufaff"  # CJK Compatibility Ideographs
    "\uff66-\uff9f"  # Half-width Katakana
    "\u0400-\u04ff"  # Cyrillic
)

# Literal "[movie]" tag, matched case-insensitively.
_MOVIE_TAG_REGEX = regex.compile(r"\[(movie)\]", flags=regex.IGNORECASE)

# Trailing "(...)" group that contains a Cyrillic character, or any trailing
# "(...)" group that has a "/" somewhere before it. The second alternative
# uses a variable-length lookbehind, which needs `regex` rather than `re`.
_RUSSIAN_CAST_REGEX = regex.compile(
    r"\([^)]*[\u0400-\u04ff][^)]*\)$|(?<=\/.*)\(.*\)$"
)

# A "/"- or "|"-delimited segment containing a non-English character,
# removed together with the separator on its trailing or leading side.
_ALT_TITLES_REGEX = regex.compile(
    rf"[^/|(]*[{_NON_ENGLISH_CHARS}][^/|]*[/|]"
    rf"|[/|][^/|(]*[{_NON_ENGLISH_CHARS}][^/|]*"
)

# A span that starts and ends on a non-English character, matched only when
# ASCII letters also appear before it (first alternative) or after it (second
# alternative) -- so a title made up purely of non-English text is kept.
_NOT_ONLY_NON_ENGLISH_REGEX = regex.compile(
    rf"(?<=[a-zA-Z][^{_NON_ENGLISH_CHARS}]+)[{_NON_ENGLISH_CHARS}].*[{_NON_ENGLISH_CHARS}]"
    rf"|[{_NON_ENGLISH_CHARS}].*[{_NON_ENGLISH_CHARS}](?=[^{_NON_ENGLISH_CHARS}]+[a-zA-Z])"
)

# Leading characters that are not word characters, non-English characters or
# one of "#", "[", "【", "★"; and a trailing run of separator/bracket symbols.
_NOT_ALLOWED_SYMBOLS_AT_START_AND_END = regex.compile(
    rf"^[^\w{_NON_ENGLISH_CHARS}#[【★]+"
    rf"|[ \-:/\\[|{{(#$&^]+$"
)

# Second trimming pass, run after the alternative-title removals: this time a
# leading "[", "【" or "★" is no longer protected, and a single trailing "]"
# is dropped.
_REMAINING_NOT_ALLOWED_SYMBOLS_AT_START_AND_END = regex.compile(
    rf"^[^\w{_NON_ENGLISH_CHARS}#]+|]$"
)


def clean_title(raw_title: str) -> str:
    """Return *raw_title* with tags, alternative titles and stray symbols removed.

    The substitutions are order-dependent and are applied as follows:

    1. Replace every underscore with a space.
    2. Remove the literal tag ``[movie]`` (case-insensitive).
    3. Trim disallowed symbols from the start and the end.
    4. Remove a trailing parenthesised group that contains Cyrillic text, or
       any trailing parenthesised group preceded by a ``/``.
    5. Remove ``/``- or ``|``-separated segments containing non-English
       characters.
    6. Remove a span of non-English characters when ASCII letters are also
       present before or after it.
    7. Trim leftover leading symbols and a single trailing ``]``.
    8. Strip surrounding whitespace.

    For example, ``"The_Matrix_[Movie]"`` becomes ``"The Matrix"``.

    Args:
        raw_title: The unprocessed title string.

    Returns:
        The cleaned title; may be empty if everything was stripped.
    """
    # A plain character swap needs no regex.
    title = raw_title.replace("_", " ")
    title = _MOVIE_TAG_REGEX.sub("", title)

    # Each later pattern is anchored on the result of the previous step, so
    # the sequence below must not be reordered.
    for pattern in (
        _NOT_ALLOWED_SYMBOLS_AT_START_AND_END,
        _RUSSIAN_CAST_REGEX,
        _ALT_TITLES_REGEX,
        _NOT_ONLY_NON_ENGLISH_REGEX,
        _REMAINING_NOT_ALLOWED_SYMBOLS_AT_START_AND_END,
    ):
        title = pattern.sub("", title)

    return title.strip()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment