Skip to content

Instantly share code, notes, and snippets.

@omeinusch
Created June 20, 2012 19:39
Show Gist options
  • Save omeinusch/2961760 to your computer and use it in GitHub Desktop.
Save omeinusch/2961760 to your computer and use it in GitHub Desktop.
Replaces every URL in a given text with a number surrounded by brackets (e.g. "[1]") and returns a tuple containing the new text and a dict that maps each number to its URL.
def append_urls_to_text_bottom(text):
    """Replace every URL in *text* with a bracketed reference number.

    URLs are numbered from 1 in order of first appearance; repeated
    occurrences of the same URL reuse the same number. The caller can use
    the returned mapping to list the URLs elsewhere (e.g. at the bottom of
    the text).

    The pattern only recognizes ``http``, ``https`` and ``ftp`` URLs, is
    case-sensitive for scheme and host, and its lookaheads reject hosts in
    the 10.x, 127.x, 169.254.x, 172.16-31.x and 192.168.x address ranges.

    Args:
        text: The string to scan for URLs.

    Returns:
        A tuple ``(new_text, urls_by_number)`` where ``new_text`` is *text*
        with each URL replaced by ``"[n]"`` and ``urls_by_number`` is a
        dict mapping each number ``n`` to the URL it stands for.
    """
    import re

    url_expression = "(?:(?:https?|ftp)://)(?:\\S+(?::\\S*)?@)?(?:(?!10(?:\\.\\d{1,3}){3})(?!127(?:\\.\\d{1,3}){3})(?!169\\.254(?:\\.\\d{1,3}){2})(?!192\\.168(?:\\.\\d{1,3}){2})(?!172\\.(?:1[6-9]|2\\d|3[0-1])(?:\\.\\d{1,3}){2})(?:[1-9]\\d?|1\\d\\d|2[01]\\d|22[0-3])(?:\\.(?:1?\\d{1,2}|2[0-4]\\d|25[0-5])){2}(?:\\.(?:[1-9]\\d?|1\\d\\d|2[0-4]\\d|25[0-4]))|(?:(?:[a-z\\u00a1-\\uffff0-9]+-?)*[a-z\\u00a1-\\uffff0-9]+)(?:\\.(?:[a-z\\u00a1-\\uffff0-9]+-?)*[a-z\\u00a1-\\uffff0-9]+)*(?:\\.(?:[a-z\\u00a1-\\uffff]{2,})))(?::\\d{2,5})?(?:/[^\\s]*)?"

    # URL -> reference number, filled in as matches are encountered.
    number_by_url = dict()

    def _to_reference(match):
        # Assign the next free number the first time a URL is seen and
        # reuse it for every later occurrence of the same URL.
        url = match.group(0)
        number = number_by_url.setdefault(url, len(number_by_url) + 1)
        return "[%d]" % number

    # A single substitution pass replaces exactly the spans the URL pattern
    # matched. Re-using each found URL as a regex pattern would misread
    # characters such as "?" or "." and would let a short URL clobber a
    # longer URL that merely starts with it.
    text = re.sub(url_expression, _to_reference, text)

    appending_list = dict(
        (number, url) for url, number in number_by_url.items()
    )
    return (text, appending_list)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment