niyawe/unshort.py

## unshort.py
#!/usr/bin/env python3
import requests
import re
from urllib import parse

# Example short URL (tweeted by zeit.de): http://bit.ly/33jJ7Fu

# Tracking-parameter rules, borrowed from
# https://github.com/Smile4ever/Neat-URL/blob/023ac7437810144c0d5389ec50435bba028effd3/options.js#L2
#
# Rule forms handled by this script:
#   "name"       drop the query parameter `name` on every host
#   "name@host"  drop it only when the host matches `host`
# A "*" in either part matches any run of characters.
BLOCKED_PARAMS = [
    "utm_source", "utm_medium", "utm_term", "utm_content", "utm_campaign",
    "utm_reader", "utm_place", "utm_userid", "utm_cid", "utm_name",
    "utm_pubreferrer", "utm_swu", "utm_viz_id",
    "ga_source", "ga_medium", "ga_term", "ga_content", "ga_campaign", "ga_place",
    "yclid", "_openstat",
    "fb_action_ids", "fb_action_types", "fb_ref", "fb_source", "fbclid",
    "action_object_map", "action_type_map", "action_ref_map", "gs_l",
    "pd_rd_*@amazon.*", "_encoding@amazon.*", "psc@amazon.*",
    "ved@google.*", "ei@google.*", "sei@google.*", "gws_rd@google.*",
    "cvid@bing.com", "form@bing.com", "sk@bing.com", "sp@bing.com",
    "sc@bing.com", "qs@bing.com", "pq@bing.com",
    "feature@youtube.com", "gclid@youtube.com", "kw@youtube.com",
    "$/ref@amazon.*",
    "_hsenc", "mkt_tok", "hmb_campaign", "hmb_medium", "hmb_source",
    "source@sourceforge.net", "position@sourceforge.net",
    "callback@bilibili.com",
    "elqTrackId", "elqTrack", "assetType", "assetId", "recipientId",
    "campaignId", "siteId",
    "tag@amazon.*", "ref_@amazon.*", "pf_rd_*@amazon.*",
    "spm@*.aliexpress.com", "scm@*.aliexpress.com",
    "aff_platform", "aff_trace_key", "terminal_id", "_hsmi", "fbclid",
    "spReportId", "spJobID", "spUserID", "spMailingID",
    "utm_mailing", "utm_brand", "CNDID", "mbid", "trk", "trkCampaign",
    "sc_campaign", "sc_channel", "sc_content", "sc_medium", "sc_outcome",
    "sc_geo", "sc_country",
]

CUSTOM_BLOCKED_PARAMS = ["wt_zmc"]
BLOCKED_PARAMS = BLOCKED_PARAMS + CUSTOM_BLOCKED_PARAMS

# Seconds to wait for a server while following the redirect chain; without a
# timeout a single unresponsive host would hang the script forever.
REQUEST_TIMEOUT = 10


def wildcard_match(pattern, text):
    """Return True if *text* matches *pattern* as a whole.

    Every "*" in *pattern* matches any (possibly empty) run of characters;
    all other characters are compared literally.
    """
    regex = re.escape(pattern).replace(r"\*", ".*")
    return re.fullmatch(regex, text) is not None


def host_matches(host_pattern, host):
    """Return True if the rule's *host_pattern* applies to *host*.

    The host is tried both verbatim and with a leading "www." removed, so
    that a rule such as "amazon.*" also covers "www.amazon.de" while
    "*.aliexpress.com" still matches real subdomains.
    """
    if wildcard_match(host_pattern, host):
        return True
    return host.startswith("www.") and wildcard_match(host_pattern, host[4:])


def is_blocked(param, host, blocked_params):
    """Return True if query parameter *param* on *host* matches any rule."""
    for rule in blocked_params:
        name_pattern, _, host_pattern = rule.partition("@")
        if name_pattern.startswith("$"):
            # e.g. "$/ref@amazon.*": appears to be a Neat URL path rule, not
            # a query-parameter name, so there is nothing to compare against.
            continue
        if host_pattern and not host_matches(host_pattern, host):
            continue
        if wildcard_match(name_pattern, param):
            return True
    return False


def clean_long_url(long_url, blocked_params=None):
    """Strip blocked query parameters from *long_url*.

    Args:
        long_url: The fully resolved URL to clean.
        blocked_params: Rules to apply; defaults to ``BLOCKED_PARAMS``.

    Returns:
        A ``(clean_url, dropped, remaining)`` tuple: the rebuilt URL, the list
        of removed parameter names (in query order) and a dict mapping each
        kept parameter name to its list of values.
    """
    if blocked_params is None:
        blocked_params = BLOCKED_PARAMS
    parts = parse.urlparse(long_url)
    host = parts.hostname or ""
    # keep_blank_values: otherwise parameters such as "?q=" silently vanish
    # from the cleaned URL.
    query = parse.parse_qs(parts.query, keep_blank_values=True)
    dropped = [key for key in query if is_blocked(key, host, blocked_params)]
    remaining = {
        key: values for key, values in query.items() if key not in dropped
    }
    clean_query = parse.urlencode(remaining, doseq=True)
    return parse.urlunparse(parts._replace(query=clean_query)), dropped, remaining


def main():
    """Ask for a short URL, resolve it and print the cleaned long URL."""
    short_url = input("Please enter your short url: ")
    try:
        response = requests.head(
            short_url, allow_redirects=True, timeout=REQUEST_TIMEOUT
        )
    except requests.RequestException as err:
        raise SystemExit("Could not resolve \"{}\": {}".format(short_url, err))
    long_url = response.url
    if long_url == short_url:
        # Explicit check instead of `assert`, which is stripped under -O.
        raise SystemExit("The URL did not redirect anywhere; is it a short URL?")

    cleaned, dropped, remaining = clean_long_url(long_url)
    total = len(dropped) + len(remaining)
    if total:
        for key in dropped:
            print("Dropping blocked param \"{}\"".format(key))
        # Integer arithmetic avoids float rounding around the 50%/100% marks.
        print("We dropped {}% of the URLs components".format(100 * len(dropped) // total))
        if remaining and 2 * len(dropped) > total:
            print("Over 50% of the query components where blocked.")
            print("Therefore printing remaining components to maybe extend the blocklist:")
            for key, values in remaining.items():
                print("\t - {}: \"{}\"".format(key, values))
    else:
        print("No query components to clean found in url")

    print(cleaned)


if __name__ == "__main__":
    main()
	#!/usr/bin/env python3
	import requests
	import re
	from urllib import parse

	# Example short URL (tweeted by zeit.de): http://bit.ly/33jJ7Fu

	# Tracking-parameter rules, borrowed from
	# https://github.com/Smile4ever/Neat-URL/blob/023ac7437810144c0d5389ec50435bba028effd3/options.js#L2
	#
	# Rule forms handled by this script:
	#   "name"       drop the query parameter `name` on every host
	#   "name@host"  drop it only when the host matches `host`
	# A "*" in either part matches any run of characters.
	BLOCKED_PARAMS = [
	    "utm_source", "utm_medium", "utm_term", "utm_content", "utm_campaign",
	    "utm_reader", "utm_place", "utm_userid", "utm_cid", "utm_name",
	    "utm_pubreferrer", "utm_swu", "utm_viz_id",
	    "ga_source", "ga_medium", "ga_term", "ga_content", "ga_campaign", "ga_place",
	    "yclid", "_openstat",
	    "fb_action_ids", "fb_action_types", "fb_ref", "fb_source", "fbclid",
	    "action_object_map", "action_type_map", "action_ref_map", "gs_l",
	    "pd_rd_*@amazon.*", "_encoding@amazon.*", "psc@amazon.*",
	    "ved@google.*", "ei@google.*", "sei@google.*", "gws_rd@google.*",
	    "cvid@bing.com", "form@bing.com", "sk@bing.com", "sp@bing.com",
	    "sc@bing.com", "qs@bing.com", "pq@bing.com",
	    "feature@youtube.com", "gclid@youtube.com", "kw@youtube.com",
	    "$/ref@amazon.*",
	    "_hsenc", "mkt_tok", "hmb_campaign", "hmb_medium", "hmb_source",
	    "source@sourceforge.net", "position@sourceforge.net",
	    "callback@bilibili.com",
	    "elqTrackId", "elqTrack", "assetType", "assetId", "recipientId",
	    "campaignId", "siteId",
	    "tag@amazon.*", "ref_@amazon.*", "pf_rd_*@amazon.*",
	    "spm@*.aliexpress.com", "scm@*.aliexpress.com",
	    "aff_platform", "aff_trace_key", "terminal_id", "_hsmi", "fbclid",
	    "spReportId", "spJobID", "spUserID", "spMailingID",
	    "utm_mailing", "utm_brand", "CNDID", "mbid", "trk", "trkCampaign",
	    "sc_campaign", "sc_channel", "sc_content", "sc_medium", "sc_outcome",
	    "sc_geo", "sc_country",
	]

	CUSTOM_BLOCKED_PARAMS = ["wt_zmc"]
	BLOCKED_PARAMS = BLOCKED_PARAMS + CUSTOM_BLOCKED_PARAMS

	# Seconds to wait for a server while following the redirect chain; without a
	# timeout a single unresponsive host would hang the script forever.
	REQUEST_TIMEOUT = 10


	def wildcard_match(pattern, text):
	    """Return True if *text* matches *pattern* as a whole.

	    Every "*" in *pattern* matches any (possibly empty) run of characters;
	    all other characters are compared literally.
	    """
	    regex = re.escape(pattern).replace(r"\*", ".*")
	    return re.fullmatch(regex, text) is not None


	def host_matches(host_pattern, host):
	    """Return True if the rule's *host_pattern* applies to *host*.

	    The host is tried both verbatim and with a leading "www." removed, so
	    that a rule such as "amazon.*" also covers "www.amazon.de" while
	    "*.aliexpress.com" still matches real subdomains.
	    """
	    if wildcard_match(host_pattern, host):
	        return True
	    return host.startswith("www.") and wildcard_match(host_pattern, host[4:])


	def is_blocked(param, host, blocked_params):
	    """Return True if query parameter *param* on *host* matches any rule."""
	    for rule in blocked_params:
	        name_pattern, _, host_pattern = rule.partition("@")
	        if name_pattern.startswith("$"):
	            # e.g. "$/ref@amazon.*": appears to be a Neat URL path rule, not
	            # a query-parameter name, so there is nothing to compare against.
	            continue
	        if host_pattern and not host_matches(host_pattern, host):
	            continue
	        if wildcard_match(name_pattern, param):
	            return True
	    return False


	def clean_long_url(long_url, blocked_params=None):
	    """Strip blocked query parameters from *long_url*.

	    Args:
	        long_url: The fully resolved URL to clean.
	        blocked_params: Rules to apply; defaults to ``BLOCKED_PARAMS``.

	    Returns:
	        A ``(clean_url, dropped, remaining)`` tuple: the rebuilt URL, the list
	        of removed parameter names (in query order) and a dict mapping each
	        kept parameter name to its list of values.
	    """
	    if blocked_params is None:
	        blocked_params = BLOCKED_PARAMS
	    parts = parse.urlparse(long_url)
	    host = parts.hostname or ""
	    # keep_blank_values: otherwise parameters such as "?q=" silently vanish
	    # from the cleaned URL.
	    query = parse.parse_qs(parts.query, keep_blank_values=True)
	    dropped = [key for key in query if is_blocked(key, host, blocked_params)]
	    remaining = {
	        key: values for key, values in query.items() if key not in dropped
	    }
	    clean_query = parse.urlencode(remaining, doseq=True)
	    return parse.urlunparse(parts._replace(query=clean_query)), dropped, remaining


	def main():
	    """Ask for a short URL, resolve it and print the cleaned long URL."""
	    short_url = input("Please enter your short url: ")
	    try:
	        response = requests.head(
	            short_url, allow_redirects=True, timeout=REQUEST_TIMEOUT
	        )
	    except requests.RequestException as err:
	        raise SystemExit("Could not resolve \"{}\": {}".format(short_url, err))
	    long_url = response.url
	    if long_url == short_url:
	        # Explicit check instead of `assert`, which is stripped under -O.
	        raise SystemExit("The URL did not redirect anywhere; is it a short URL?")

	    cleaned, dropped, remaining = clean_long_url(long_url)
	    total = len(dropped) + len(remaining)
	    if total:
	        for key in dropped:
	            print("Dropping blocked param \"{}\"".format(key))
	        # Integer arithmetic avoids float rounding around the 50%/100% marks.
	        print("We dropped {}% of the URLs components".format(100 * len(dropped) // total))
	        if remaining and 2 * len(dropped) > total:
	            print("Over 50% of the query components where blocked.")
	            print("Therefore printing remaining components to maybe extend the blocklist:")
	            for key, values in remaining.items():
	                print("\t - {}: \"{}\"".format(key, values))
	    else:
	        print("No query components to clean found in url")

	    print(cleaned)


	if __name__ == "__main__":
	    main()