# wowkin2/url_parser.py

## url_parser.py
import re

# Sample input for main(): one <img> tag, bare root-relative,
# protocol-relative and absolute http/https URLs, followed by two HTML
# comments that also contain image paths.
content = '''
    <img src="/images/lol/hallo.png" />
    /images/lol/hallo.png
    /images/lol/hallo.png
    //example.com/images/lol/hallo.png
    http://example.com/images/lol/hallo.png
    https://example.com/images/lol/hallo.png
    <!-- /images/lol/commented.png -->
    <!-- <img src="/images/lol/commented2.png" /> -->
'''


def parse_active_urls(html_text):
    """Return the URLs in *html_text* that are not inside HTML comments.

    Recognised forms are root-relative (``/path/file.ext``),
    protocol-relative (``//host/path/file.ext``) and absolute ``http`` /
    ``https`` URLs. A URL spans a run of characters containing no
    whitespace, quotes or angle brackets, and ends at the last ``.suffix``
    in that run, so ``http://example.com/a/b.png`` is returned whole rather
    than being cut at ``http://example.com``.

    Args:
        html_text: HTML (or plain text) to scan.

    Returns:
        A list of URL strings in document order; duplicates are kept.
    """
    pattern = re.compile(
        # Comments are tried first and consumed without a capture, which
        # keeps any URL inside them from being reported.
        r'<!--[\s\S]*?-->'
        # Optional scheme, one or two slashes, then a greedy run so the
        # match backtracks to the *last* dot instead of stopping at the
        # first one (which used to split the host from the path).
        r'|(?P<url>(?:https?:)?//?[^\s"\'<>]+\.\w+)'
    )
    return [
        match.group('url')
        for match in pattern.finditer(html_text)
        if match.group('url')
    ]


def main():
    """Print each URL extracted from the sample ``content``, one per line."""
    for url in parse_active_urls(content):
        print(url)


# Run main() only when this file is executed as a script, not when it is
# imported.
if __name__ == '__main__':
    main()
	import re

	# Sample input for main(): one <img> tag, bare root-relative,
	# protocol-relative and absolute http/https URLs, followed by two HTML
	# comments that also contain image paths.
	content = '''
	<img src="/images/lol/hallo.png" />
	/images/lol/hallo.png
	/images/lol/hallo.png
	//example.com/images/lol/hallo.png
	http://example.com/images/lol/hallo.png
	https://example.com/images/lol/hallo.png
	<!-- /images/lol/commented.png -->
	<!-- <img src="/images/lol/commented2.png" /> -->
	'''


	def parse_active_urls(html_text):
	    """Return the URLs in *html_text* that are not inside HTML comments.

	    Recognised forms are root-relative (``/path/file.ext``),
	    protocol-relative (``//host/path/file.ext``) and absolute ``http`` /
	    ``https`` URLs. A URL spans a run of characters containing no
	    whitespace, quotes or angle brackets, and ends at the last
	    ``.suffix`` in that run, so ``http://example.com/a/b.png`` is
	    returned whole rather than being cut at ``http://example.com``.

	    Args:
	        html_text: HTML (or plain text) to scan.

	    Returns:
	        A list of URL strings in document order; duplicates are kept.
	    """
	    pattern = re.compile(
	        # Comments are tried first and consumed without a capture, which
	        # keeps any URL inside them from being reported.
	        r'<!--[\s\S]*?-->'
	        # The "|" must be a real alternation (not an escaped literal).
	        # Optional scheme, one or two slashes, then a greedy run so the
	        # match backtracks to the *last* dot instead of the first one.
	        r'|(?P<url>(?:https?:)?//?[^\s"\'<>]+\.\w+)'
	    )
	    return [
	        match.group('url')
	        for match in pattern.finditer(html_text)
	        if match.group('url')
	    ]


	def main():
	    """Print each URL extracted from the sample ``content``, one per line."""
	    for url in parse_active_urls(content):
	        print(url)


	# Run main() only when this file is executed as a script, not when it is
	# imported.
	if __name__ == '__main__':
	    main()