Skip to content

Instantly share code, notes, and snippets.

@timtrueman
Created February 6, 2010 05:32
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save timtrueman/296560 to your computer and use it in GitHub Desktop.
Save timtrueman/296560 to your computer and use it in GitHub Desktop.
Takes a body of text and truncates each full URL to its domain + TLD
# Labels that commonly sit directly in front of a country-code TLD
# (e.g. "co" in "bbc.co.uk"). When one of them is the second-to-last label,
# the domain needs three labels instead of two.
_SECOND_LEVEL_LABELS = frozenset(
    ['co', 'ac', 'gov', 'com', 'mil', 'org', 'net', 'edu']
)


def _domain_plus_tld(hostname):
    """Return the domain + TLD part of *hostname*.

    Returns None when the hostname has fewer than two dot-separated labels
    (e.g. "localhost"), because there is no TLD to keep.
    """
    chunks = hostname.split(".")
    if len(chunks) < 2:
        return None
    # Only reach for a third label when one exists: "gov.uk" is already the
    # whole domain even though "gov" is in the second-level list.
    if len(chunks) >= 3 and chunks[-2] in _SECOND_LEVEL_LABELS:
        return ".".join(chunks[-3:])
    return ".".join(chunks[-2:])


def truncate_urls_to_domain_plus_tld(text):
    """Replace every URL found in *text* with its domain + TLD.

    Each URL is replaced by a single space followed by the domain + TLD in
    which every "." has been turned into "-----" (a placeholder so that the
    dots survive later processing of the text). A URL whose hostname cannot
    be determined, or has no TLD, is removed from the text and a message is
    printed.

    Args:
        text: The body of text to scan.

    Returns:
        The text with all matched URLs replaced (or removed on failure).
    """
    # Function-scope imports keep this block self-contained.
    import re
    import string

    # The trailing character class must exclude punctuation so that
    # "see http://example.com/foo." does not swallow the final ".".
    match_urls = re.compile(
        r'\b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\w\d]+\)|([^%s\s]|/)))'
        % re.escape(string.punctuation)
    )
    break_down_url = re.compile(r'^((?P<scheme>https?|ftp):\/)?\/?((?P<username>.*?)(:(?P<password>.*?)|)@)?(?P<hostname>[^:\/\s]+)(?P<port>:([^\/]*))?(?P<path>(\/\w+)*\/)(?P<filename>[-\w.]+[^#?\s]*)?(?P<query>\?([^#]*))?(?P<fragment>#(.*))?$')

    def shorten(match):
        url = match.group(1)
        # break_down_url requires a path ending in "/", so add one if missing.
        fixed_url = url if url.endswith("/") else "%s/" % url
        result = break_down_url.search(fixed_url)
        domain_and_tld = None
        if result is not None:
            domain_and_tld = _domain_plus_tld(result.group('hostname'))
        if domain_and_tld is None:
            print("Failed to shorten URL to domain + TLD: %s (so it got removed)" % url)
            return ""
        print(domain_and_tld)
        # hack to pass on dots in the domain
        return " %s" % domain_and_tld.replace(".", "-----")

    # A single substitution pass rewrites each match at its own position, so
    # one URL being a prefix of another cannot corrupt the longer one.
    return match_urls.sub(shorten, text)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment