Created
October 10, 2019 15:25
-
-
Save niyawe/23a9f5ea4fbd4660406fd8f45e826c8e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import requests | |
import re | |
from urllib import parse | |
# Example short url (from twitter, pointing at zeit.de): http://bit.ly/33jJ7Fu

# borrowed from https://github.com/Smile4ever/Neat-URL/blob/023ac7437810144c0d5389ec50435bba028effd3/options.js#L2
BLOCKED_PARAMS = ["utm_source", "utm_medium", "utm_term", "utm_content", "utm_campaign", "utm_reader", "utm_place", "utm_userid", "utm_cid", "utm_name", "utm_pubreferrer", "utm_swu", "utm_viz_id", "ga_source", "ga_medium", "ga_term", "ga_content", "ga_campaign", "ga_place", "yclid", "_openstat", "fb_action_ids", "fb_action_types", "fb_ref", "fb_source", "fbclid", "action_object_map", "action_type_map", "action_ref_map", "gs_l", "pd_rd_*@amazon.*", "_encoding@amazon.*", "psc@amazon.*", "ved@google.*", "ei@google.*", "sei@google.*", "gws_rd@google.*", "cvid@bing.com", "form@bing.com", "sk@bing.com", "sp@bing.com", "sc@bing.com", "qs@bing.com", "pq@bing.com", "feature@youtube.com", "gclid@youtube.com", "kw@youtube.com", "$/ref@amazon.*", "_hsenc", "mkt_tok", "hmb_campaign", "hmb_medium", "hmb_source", "source@sourceforge.net", "position@sourceforge.net", "callback@bilibili.com", "elqTrackId", "elqTrack", "assetType", "assetId", "recipientId", "campaignId", "siteId", "tag@amazon.*", "ref_@amazon.*", "pf_rd_*@amazon.*", "spm@*.aliexpress.com", "scm@*.aliexpress.com", "aff_platform", "aff_trace_key", "terminal_id", "_hsmi", "fbclid", "spReportId", "spJobID", "spUserID", "spMailingID", "utm_mailing", "utm_brand", "CNDID", "mbid", "trk", "trkCampaign", "sc_campaign", "sc_channel", "sc_content", "sc_medium", "sc_outcome", "sc_geo", "sc_country"]
CUSTOM_BLOCKED_PARAMS = ["wt_zmc"]
BLOCKED_PARAMS = BLOCKED_PARAMS + CUSTOM_BLOCKED_PARAMS

# requests waits indefinitely unless a timeout is given explicitly.
REQUEST_TIMEOUT_SECONDS = 10


def _glob_regex(glob, prefix=""):
    """Compile *glob* (``*`` is the only wildcard) into a full-match regex.

    *prefix* is a raw regex fragment placed in front of the translated glob.
    """
    body = ".*".join(re.escape(part) for part in glob.split("*"))
    return re.compile("^" + prefix + body + "$")


def _compile_rules(blocked_params):
    """Translate rule strings into ``(name_regex, host_regex_or_None)`` pairs.

    A rule is either ``name`` (applies on every host) or ``name@domain``
    (applies only on matching hosts). Both parts may contain ``*``.
    """
    compiled = []
    for rule in blocked_params:
        name, _, domain = rule.partition("@")
        if name.startswith("$"):
            # "$..." entries (e.g. "$/ref@amazon.*") do not name a query
            # parameter, so they can never match a query key; skip them.
            continue
        host_regex = None
        if domain:
            # Design choice: a rule's domain also covers its subdomains,
            # because resolved hosts usually look like "www.amazon.de"
            # rather than the bare "amazon.de" a strict match would need.
            host_regex = _glob_regex(domain, prefix=r"(?:.*\.)?")
        compiled.append((_glob_regex(name), host_regex))
    return compiled


def strip_blocked_params(query_components, hostname, blocked_params=None):
    """Split query parameters into kept and dropped according to the blocklist.

    Args:
        query_components: Mapping of parameter name to list of values, as
            returned by ``urllib.parse.parse_qs``. It is not modified.
        hostname: Host of the URL the query belongs to; domain-scoped rules
            are only applied when it matches the rule's domain.
        blocked_params: Rule strings to apply. Defaults to ``BLOCKED_PARAMS``.

    Returns:
        A ``(kept, dropped)`` tuple: ``kept`` is a new dict with the surviving
        parameters, ``dropped`` is the list of removed parameter names in
        their original order.
    """
    if blocked_params is None:
        blocked_params = BLOCKED_PARAMS
    rules = _compile_rules(blocked_params)
    hostname = hostname or ""

    def is_blocked(name):
        return any(
            name_regex.match(name)
            and (host_regex is None or host_regex.match(hostname))
            for name_regex, host_regex in rules
        )

    kept = {}
    dropped = []
    for name, values in query_components.items():
        if is_blocked(name):
            dropped.append(name)
        else:
            kept[name] = values
    return kept, dropped


def main():
    """Ask for a short URL, follow its redirects and print the cleaned target."""
    short_url = input("Please enter your short url: ")
    response = requests.head(short_url, allow_redirects=True, timeout=REQUEST_TIMEOUT_SECONDS)
    long_url = response.url
    if short_url == long_url:
        # Explicit check instead of `assert`, which is stripped under -O.
        raise SystemExit("The given url did not redirect anywhere.")

    url_components = parse.urlparse(long_url)
    query_components = {}
    if url_components.query:
        # keep_blank_values: otherwise "a=" style parameters would silently
        # vanish from the cleaned URL although they are not blocked.
        parsed = parse.parse_qs(url_components.query, keep_blank_values=True)
        query_components, dropped = strip_blocked_params(parsed, url_components.hostname)
        for name in dropped:
            print("Dropping blocked param \"{}\"".format(name))
        # Multiply before dividing so "everything dropped" is exactly 100,
        # and guard against a query that parsed to no components at all.
        percentage_dropped = 100 * len(dropped) / len(parsed) if parsed else 0
        print("We dropped {}% of the URLs components".format(int(percentage_dropped)))
        if percentage_dropped > 50 and not percentage_dropped == 100:
            print("Over 50% of the query components where blocked.")
            print("Therefore printing remaining components to maybe extend the blocklist:")
            for component, values in query_components.items():
                print("\t - {}: \"{}\"".format(component, values))
    else:
        print("No query components to clean found in url")

    clean_query = parse.urlencode(query_components, doseq=True)
    clean_url = parse.urlunparse((url_components.scheme, url_components.netloc, url_components.path, url_components.params, clean_query, url_components.fragment))
    print(clean_url)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment