Skip to content

Instantly share code, notes, and snippets.

@niyawe
Created October 10, 2019 15:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save niyawe/23a9f5ea4fbd4660406fd8f45e826c8e to your computer and use it in GitHub Desktop.
Save niyawe/23a9f5ea4fbd4660406fd8f45e826c8e to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
import requests
import re
from urllib import parse
# Example short URL (seen on Twitter, pointing to zeit.de): http://bit.ly/33jJ7Fu

# borrowed from https://github.com/Smile4ever/Neat-URL/blob/023ac7437810144c0d5389ec50435bba028effd3/options.js#L2
# Rule syntax as interpreted by this script:
#   "name"          -> blocked on every site
#   "name@domain"   -> blocked only when the resolved host matches "domain"
#   "*"             -> wildcard, allowed in both the name and the domain part
# NOTE(review): rules starting with "$" (e.g. "$/ref@amazon.*") appear to
# describe URL path endings upstream; here they are only compared against
# query parameter names, so in practice they never match.
BLOCKED_PARAMS = [
    "utm_source", "utm_medium", "utm_term", "utm_content", "utm_campaign",
    "utm_reader", "utm_place", "utm_userid", "utm_cid", "utm_name",
    "utm_pubreferrer", "utm_swu", "utm_viz_id", "ga_source", "ga_medium",
    "ga_term", "ga_content", "ga_campaign", "ga_place", "yclid", "_openstat",
    "fb_action_ids", "fb_action_types", "fb_ref", "fb_source", "fbclid",
    "action_object_map", "action_type_map", "action_ref_map", "gs_l",
    "pd_rd_*@amazon.*", "_encoding@amazon.*", "psc@amazon.*", "ved@google.*",
    "ei@google.*", "sei@google.*", "gws_rd@google.*", "cvid@bing.com",
    "form@bing.com", "sk@bing.com", "sp@bing.com", "sc@bing.com",
    "qs@bing.com", "pq@bing.com", "feature@youtube.com", "gclid@youtube.com",
    "kw@youtube.com", "$/ref@amazon.*", "_hsenc", "mkt_tok", "hmb_campaign",
    "hmb_medium", "hmb_source", "source@sourceforge.net",
    "position@sourceforge.net", "callback@bilibili.com", "elqTrackId",
    "elqTrack", "assetType", "assetId", "recipientId", "campaignId", "siteId",
    "tag@amazon.*", "ref_@amazon.*", "pf_rd_*@amazon.*",
    "spm@*.aliexpress.com", "scm@*.aliexpress.com", "aff_platform",
    "aff_trace_key", "terminal_id", "_hsmi", "fbclid", "spReportId",
    "spJobID", "spUserID", "spMailingID", "utm_mailing", "utm_brand", "CNDID",
    "mbid", "trk", "trkCampaign", "sc_campaign", "sc_channel", "sc_content",
    "sc_medium", "sc_outcome", "sc_geo", "sc_country",
]
CUSTOM_BLOCKED_PARAMS = ["wt_zmc"]
BLOCKED_PARAMS = BLOCKED_PARAMS + CUSTOM_BLOCKED_PARAMS

# Seconds to wait for a response while resolving the short URL; without a
# timeout the HEAD request could block forever on an unresponsive server.
REQUEST_TIMEOUT = 10


def _glob_to_regex(glob):
    """Translate a rule fragment into regex source; ``*`` matches any run of characters."""
    return ".*".join(re.escape(part) for part in glob.split("*"))


def _domain_matches(domain_glob, hostname):
    """Return True when *hostname* is covered by the rule's *domain_glob*.

    The glob is also accepted for any subdomain of a matching host, so that
    ``amazon.*`` covers ``www.amazon.de`` as well as ``amazon.de``.
    """
    if not hostname:
        return False
    pattern = r"(?:.*\.)?" + _glob_to_regex(domain_glob)
    return re.fullmatch(pattern, hostname) is not None


def _active_name_patterns(hostname, rules):
    """Compile the parameter-name patterns of every rule that applies to *hostname*."""
    patterns = []
    for rule in rules:
        name_glob, has_domain, domain_glob = rule.partition("@")
        if has_domain and not _domain_matches(domain_glob, hostname):
            continue
        patterns.append(re.compile(_glob_to_regex(name_glob)))
    return patterns


def clean_url(long_url, rules=None):
    """Return *long_url* with all blocked query parameters removed.

    A short report of what was dropped is printed to stdout. When more than
    half (but not all) of the parameters were dropped, the remaining ones are
    listed as candidates for extending the blocklist.

    Args:
        long_url: The fully resolved URL to clean.
        rules: Blocklist rules to apply; defaults to ``BLOCKED_PARAMS``.

    Returns:
        The URL rebuilt from its components with the cleaned query string.
    """
    if rules is None:
        rules = BLOCKED_PARAMS

    url_components = parse.urlparse(long_url)
    # keep_blank_values: parameters like "a=" are not tracking noise by
    # themselves and must survive the round trip unless a rule blocks them.
    query_components = parse.parse_qs(url_components.query, keep_blank_values=True)

    if query_components:
        amount_components_before = len(query_components)
        name_patterns = _active_name_patterns(url_components.hostname, rules)
        blocked_names = [
            name
            for name in query_components
            if any(pattern.fullmatch(name) for pattern in name_patterns)
        ]
        for name in blocked_names:
            del query_components[name]
            print("Dropping blocked param \"{}\"".format(name))

        amount_dropped_components = len(blocked_names)
        percentage_dropped = 100 * amount_dropped_components / amount_components_before
        print("We dropped {}% of the URLs components".format(int(percentage_dropped)))
        # Compare integer counts instead of the float percentage so that
        # rounding can never make "everything dropped" look like "almost".
        over_half = 2 * amount_dropped_components > amount_components_before
        if over_half and amount_dropped_components != amount_components_before:
            print("Over 50% of the query components where blocked.")
            print("Therefore printing remaining components to maybe extend the blocklist:")
            for component, values in query_components.items():
                shown = values if any(values) else "--EMPTY STRING--"
                print("\t - {}: \"{}\"".format(component, shown))
    else:
        print("No query components to clean found in url")

    clean_query = parse.urlencode(query_components, doseq=True)
    return url_components._replace(query=clean_query).geturl()


def main():
    """Prompt for a short URL, resolve its redirects and print the cleaned target."""
    short_url = input("Please enter your short url: ")
    response = requests.head(short_url, allow_redirects=True, timeout=REQUEST_TIMEOUT)
    long_url = response.url
    # Explicit check instead of ``assert``: assertions vanish under ``python -O``.
    if short_url == long_url:
        raise SystemExit("The given url did not redirect anywhere; it does not look like a short url.")
    print(clean_url(long_url))


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment