@rusty-snake
Last active November 24, 2021 23:59
ClearURLs to µBlock origin converter
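
The scripts below read the ClearURLs rules JSON on stdin and print uBlock Origin $removeparam filters on stdout. A minimal usage sketch, assuming one of the scripts has been saved as clearurls2ubo.py and the ClearURLs ruleset downloaded next to it (both file names are illustrative, not part of the gist):

    python3 clearurls2ubo.py < data.min.json > clearurls-ubo.txt
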
# Copyright © 2021 rusty-snake
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import json
import sys

KNOWN_BAD_FILTERS = []


def normalize_url_pattern(url_pattern: str) -> str:
    # No need for protocol and subdomain
    url_pattern = url_pattern.replace(r"^https?:\/\/(?:[a-z0-9-]+\.)*?", "", 1)
    url_pattern = url_pattern.replace(r"https?:\/\/([a-z0-9-.]*\.)", "", 1)
    url_pattern = url_pattern.replace(r"^https?:\/\/", "", 1)
    # adb TLD globbing
    url_pattern = url_pattern.replace(r"(?:\.[a-z]{2,}){1,}", ".*", 1)
    # Remove backslashes
    url_pattern = url_pattern.replace("\\", "")
    # Specific fixups
    url_pattern = url_pattern.replace("(?:accounts.)?", "", 1)
    url_pattern = url_pattern.replace("(?:support.)?", "", 1)
    url_pattern = url_pattern.replace("(?:yandex.*|ya.ru)", "yandex.*", 1)
    return url_pattern
def normalize_rule(rule: str) -> str:
    rule = rule.replace("(?:%3F)?", "", 1)
    rule = rule.replace("(?:", "(")
    rule = rule.replace(r"\$", r"\x24")
    return rule
def normalize_exception(exception: str) -> tuple[str, str]:
    orig_exception = exception
    exception = exception.replace(r"^https?:\/\/(?:[a-z0-9-]+\.)*?", "||", 1)
    exception = exception.replace(r"^https?:\/\/", "||", 1)
    # FIXME: |ws://
    exception = exception.replace(r"^wss?:\/\/(?:[a-z0-9-]+\.)*?", "|wss://", 1)
    exception = exception.replace(r"(?:\.[a-z]{2,}){1,}", "TLD_WILDCARD", 1)
    exception = exception.replace("=[^/?&]*", "=")
    exception = exception.replace("=.*?", "=")
    exception = exception.replace("=.", "=")
    exception = exception.replace("[^?]*\\?.*?", "*?*")
    exception = exception.replace("[^?]+.*?&?", "*?*")
    exception = exception.replace("\\?.*?", "?")
    exception = exception.replace(".*?&?", "*")
    exception = exception.replace(".*?", "*")
    exception = exception.replace("\\", "")
    if any(c in "([" for c in exception):
        exception = orig_exception
        exception = exception.replace("(?:", "(")
        return "regex", exception
    elif any(c in "/?" for c in exception):
        exception = exception.replace("TLD_WILDCARD", ".*", 1)
        exception = exception.replace("|wss://zoom.us", "|wss://zoom.us^", 1)
        return "path", exception
    else:
        exception = exception.replace("TLD_WILDCARD", ".*", 1)
        exception = exception.replace("||", "", 1)
        return "domain", exception
def expand_se(rule: str) -> list[str]:
    # TODO:
    # 1. "foo_(1|2)_(bar|baz)" -> ["foo_1_bar", "foo_2_bar", "foo_1_baz", "foo_2_baz"]
    # 2. "foo_[12]_bar" -> ["foo_1_bar", "foo_2_bar"]
    # 3. "foo_?bar" -> ["foobar", "foo_bar"]
    # But "foo_[a-z]*_bar" -> ["foo_[a-z]*_bar"]
    #
    # https://stackoverflow.com/questions/20061268/python-regex-string-expansion
    if rule.count("(") == 1 and rule.count(")") == 1 and "\\" not in rule:
        fixed_prefix, remains = rule.split("(")
        variants, fixed_suffix = remains.split(")")
        variants = variants.split("|")
        return [fixed_prefix + variant + fixed_suffix for variant in variants]
    return [rule]
def is_regex(rule: str) -> bool:
    return any(c in r".^$*+?{}[]\|()" for c in rule)
def print_rules(
    url_pattern: str, rules: list[str], regex_format: str, plain_format: str
) -> None:
    for rule in rules:
        filter_template = regex_format if is_regex(rule) else plain_format
        filter_ = filter_template.format(rule, url_pattern)
        if filter_ not in KNOWN_BAD_FILTERS:
            print(filter_)
def main() -> int:
    data_min_json = json.loads(sys.stdin.read())
    # TODO: referralMarketing
    providers = {
        provider["urlPattern"]: provider["rules"]
        for provider in data_min_json["providers"].values()
        if provider["rules"]
    }
    # TODO:
    # - URL encoded
    #   $removeparam=%24deep_link,domain=reddit.com
    # - Better is_regex
    #   $removeparam=/^p\[\]=/,domain=flipkart.com
    for url_pattern, rules in providers.items():
        url_pattern = normalize_url_pattern(url_pattern)
        rules = (expand_se(normalize_rule(rule)) for rule in rules)
        rules = [rule for expanded_rule in rules for rule in expanded_rule]
        if url_pattern == ".*":
            print_rules(url_pattern, rules, "$removeparam=/^{0}=/", "$removeparam={0}")
        elif "/" in url_pattern:
            print_rules(
                url_pattern, rules, "||{1}$removeparam=/^{0}=/", "||{1}$removeparam={0}"
            )
        else:
            print_rules(
                url_pattern,
                rules,
                "$removeparam=/^{0}=/,domain={1}",
                "$removeparam={0},domain={1}",
            )
    exceptions = [
        exception
        for provider in data_min_json["providers"].values()
        for exception in provider["exceptions"]
    ]
    for exception in exceptions:
        kind, exception = normalize_exception(exception.replace("\\\\", "\\"))
        if kind == "regex":
            print("@@/{0}/$removeparam".format(exception))
        elif kind == "path":
            print("@@{0}$removeparam".format(exception))
        elif kind == "domain":
            print("@@$removeparam,domain={0}".format(exception))
        else:
            raise ValueError
    return 0


if __name__ == "__main__":
    sys.exit(main())
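
For orientation, this is the kind of output the script above produces for a single provider entry; the provider name, urlPattern, and rule names below are made-up for illustration, not taken from the real ClearURLs ruleset:

    Hypothetical entry in the input JSON:
        "urlPattern": "^https?:\\/\\/(?:[a-z0-9-]+\\.)*?example(?:\\.[a-z]{2,}){1,}"
        "rules": ["utm_source", "ref"]
        "exceptions": []
    Printed filters:
        $removeparam=utm_source,domain=example.*
        $removeparam=ref,domain=example.*
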
# Copyright © 2021 rusty-snake
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import json
import sys


def normalize_url_pattern(url_pattern: str) -> str:
    # No need for protocol and subdomain
    url_pattern = url_pattern.replace(r"^https?:\/\/(?:[a-z0-9-]+\.)*?", "", 1)
    url_pattern = url_pattern.replace(r"https?:\/\/([a-z0-9-.]*\.)", "", 1)
    url_pattern = url_pattern.replace(r"^https?:\/\/", "", 1)
    # filterlist TLD globbing
    url_pattern = url_pattern.replace(r"(?:\.[a-z]{2,}){1,}", ".", 1)
    # Remove backslashes
    url_pattern = url_pattern.replace("\\", "")
    # Specific fixups
    url_pattern = url_pattern.replace("(?:accounts.)?", "", 1)
    url_pattern = url_pattern.replace("(?:support.)?", "", 1)
    url_pattern = url_pattern.replace("(?:yandex.|ya.ru)", "yandex.", 1)
    return "" if url_pattern == ".*" else f"||{url_pattern}"
def is_regex(rule: str) -> bool:
    # True if the rule contains an unescaped regex metacharacter.
    lastchar = ""
    for char in rule:
        if char in r".^$*+?{}[]\|()" and lastchar != "\\":
            return True
        lastchar = char
    return False
def print_rules(
    filter_template: str,
    exceptions_template: str,
    url_pattern: str,
    rules: str,
    exceptions: str,
) -> None:
    print(filter_template.format(url_pattern, rules))
    if exceptions:
        print(exceptions_template.format(exceptions, rules))
def main() -> int:
    data_min_json = json.loads(sys.stdin.read())
    providers = {
        provider["urlPattern"]: {
            "rules": provider["rules"],
            "exceptions": provider["exceptions"],
        }
        for provider in data_min_json["providers"].values()
        if provider["rules"]
    }
    for url_pattern in providers:
        rules = providers[url_pattern]["rules"]
        exceptions = providers[url_pattern]["exceptions"]
        url_pattern = normalize_url_pattern(url_pattern)
        rules = [rule.replace(",", r"\,").replace(r"\$", r"\x24") for rule in rules]
        rules_joined = "|".join(rules)
        if len(rules) == 1 and not is_regex(rules[0]):
            filter_template = "{0}$removeparam={1}"
            exceptions_template = "@@/({0})/$removeparam={1}"
        else:
            filter_template = "{0}$removeparam=/^({1})=/,all"
            exceptions_template = "@@/({0})/$removeparam=/^({1})=/,all"
        print_rules(
            filter_template,
            exceptions_template,
            url_pattern,
            rules_joined,
            "|".join(exceptions),
        )
    return 0


if __name__ == "__main__":
    sys.exit(main())
@rusty-snake (Author) commented Nov 6, 2021

A modified version with an automatically generated list can be found at
https://github.com/DandelionSprout/adfilt/tree/master/ClearURLs%20for%20uBo.

@bitsper2nd commented
OMG I have been waiting for this. Thx!
