timtrueman/gist:296560

## gistfile1.py
def truncate_urls_to_domain_plus_tld(text):
    """
    Replace every URL found in ``text`` with its domain + TLD.

    Each URL (anything starting with ``scheme://`` or ``www.``) is located,
    its hostname is extracted, and the hostname is reduced to its last two
    labels ("example.com"), or its last three when the label before the TLD
    is a common second-level label ("example.co.uk").

    The replacement is written with a leading space and with every "." in
    the domain turned into "-----". That substitution is a deliberate hack
    kept from the original code so the dots survive later processing
    (presumably done by the caller -- confirm before changing it).

    URLs whose hostname cannot be determined, or whose hostname has fewer
    than two labels (e.g. "localhost"), are removed from the text and a
    message is printed.

    :param text: the string to scan for URLs.
    :returns: ``text`` with every URL replaced as described above.
    """
    # Imported locally so the function is self-contained even when the
    # enclosing module does not import these names itself.
    import re
    import string

    # Labels that, when they sit directly before the TLD, indicate a
    # two-part public suffix such as "co.uk", so three labels are kept.
    second_level_labels = ('co', 'ac', 'gov', 'com', 'mil', 'org', 'net', 'edu')

    # Liberal URL matcher. The "%s" placeholder must be filled with the
    # escaped punctuation set so that a URL never ends on punctuation
    # (a trailing "." or "," belongs to the sentence, not the URL).
    match_urls = re.compile(
        r'\b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\w\d]+\)|([^%s\s]|/)))'
        % re.escape(string.punctuation))

    # Splits a URL into named components. The user-info groups must not
    # cross a "/" or another "@", otherwise an "@" in the path or query
    # would be mistaken for the user-info delimiter.
    break_down_url = re.compile(
        r'^((?P<scheme>https?|ftp):\/)?\/?'
        r'((?P<username>[^\/@\s]*?)(:(?P<password>[^\/@\s]*?)|)@)?'
        r'(?P<hostname>[^:\/\s]+)(?P<port>:([^\/]*))?'
        r'(?P<path>(\/\w+)*\/)(?P<filename>[-\w.]+[^#?\s]*)?'
        r'(?P<query>\?([^#]*))?(?P<fragment>#(.*))?$')

    def _shorten(match):
        # Return the replacement text for a single matched URL.
        url = match.group(0)
        # The break-down pattern requires the path to end with a slash.
        fixed_url = url if url.endswith("/") else "%s/" % url
        result = break_down_url.search(fixed_url)
        chunks = result.group('hostname').split(".") if result else []
        if len(chunks) < 2:
            print("Failed to shorten URL to domain + TLD: %s (so it got removed)" % url)
            return ""
        if len(chunks) >= 3 and chunks[-2] in second_level_labels:
            domain_and_tld = ".".join(chunks[-3:])
        else:
            domain_and_tld = ".".join(chunks[-2:])
        print(domain_and_tld)
        # hack to pass on dots in the domain
        return " %s" % domain_and_tld.replace(".", "-----")

    # Substituting match-by-match (instead of str.replace on the whole text)
    # keeps a URL that is a prefix of a later URL from corrupting the latter.
    return match_urls.sub(_shorten, text)
	def truncate_urls_to_domain_plus_tld(text):
	    """
	    Replace every URL found in ``text`` with its domain + TLD.

	    Each URL (anything starting with ``scheme://`` or ``www.``) is located,
	    its hostname is extracted, and the hostname is reduced to its last two
	    labels ("example.com"), or its last three when the label before the TLD
	    is a common second-level label ("example.co.uk").

	    The replacement is written with a leading space and with every "." in
	    the domain turned into "-----". That substitution is a deliberate hack
	    kept from the original code so the dots survive later processing
	    (presumably done by the caller -- confirm before changing it).

	    URLs whose hostname cannot be determined, or whose hostname has fewer
	    than two labels (e.g. "localhost"), are removed from the text and a
	    message is printed.

	    :param text: the string to scan for URLs.
	    :returns: ``text`` with every URL replaced as described above.
	    """
	    # Imported locally so the function is self-contained even when the
	    # enclosing module does not import these names itself.
	    import re
	    import string

	    # Labels that, when they sit directly before the TLD, indicate a
	    # two-part public suffix such as "co.uk", so three labels are kept.
	    second_level_labels = ('co', 'ac', 'gov', 'com', 'mil', 'org', 'net', 'edu')

	    # Liberal URL matcher. The "%s" placeholder must be filled with the
	    # escaped punctuation set so that a URL never ends on punctuation
	    # (a trailing "." or "," belongs to the sentence, not the URL).
	    match_urls = re.compile(
	        r'\b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\w\d]+\)|([^%s\s]|/)))'
	        % re.escape(string.punctuation))

	    # Splits a URL into named components. The user-info groups must not
	    # cross a "/" or another "@", otherwise an "@" in the path or query
	    # would be mistaken for the user-info delimiter.
	    break_down_url = re.compile(
	        r'^((?P<scheme>https?|ftp):\/)?\/?'
	        r'((?P<username>[^\/@\s]*?)(:(?P<password>[^\/@\s]*?)|)@)?'
	        r'(?P<hostname>[^:\/\s]+)(?P<port>:([^\/]*))?'
	        r'(?P<path>(\/\w+)*\/)(?P<filename>[-\w.]+[^#?\s]*)?'
	        r'(?P<query>\?([^#]*))?(?P<fragment>#(.*))?$')

	    def _shorten(match):
	        # Return the replacement text for a single matched URL.
	        url = match.group(0)
	        # The break-down pattern requires the path to end with a slash.
	        fixed_url = url if url.endswith("/") else "%s/" % url
	        result = break_down_url.search(fixed_url)
	        chunks = result.group('hostname').split(".") if result else []
	        if len(chunks) < 2:
	            print("Failed to shorten URL to domain + TLD: %s (so it got removed)" % url)
	            return ""
	        if len(chunks) >= 3 and chunks[-2] in second_level_labels:
	            domain_and_tld = ".".join(chunks[-3:])
	        else:
	            domain_and_tld = ".".join(chunks[-2:])
	        print(domain_and_tld)
	        # hack to pass on dots in the domain
	        return " %s" % domain_and_tld.replace(".", "-----")

	    # Substituting match-by-match (instead of str.replace on the whole text)
	    # keeps a URL that is a prefix of a later URL from corrupting the latter.
	    return match_urls.sub(_shorten, text)