Skip to content

Instantly share code, notes, and snippets.

@baali
Created March 27, 2020 06:22
Show Gist options
  • Save baali/064cbc8c343c14be9db48a05901d7207 to your computer and use it in GitHub Desktop.
Save baali/064cbc8c343c14be9db48a05901d7207 to your computer and use it in GitHub Desktop.
Compare two URLs and check whether they are the same
from urllib.parse import urlparse
def check_url_similarity(url_1, url_2):
    '''Return True when two URLs refer to the same host and path.

    Two URLs are treated as the same when either:

    * the strings are identical, or
    * their network locations match, allowing one side to carry an
      extra leading ``www.``, and their paths match, allowing one side
      to carry an extra trailing ``/``.

    Only the ``netloc`` and ``path`` components produced by ``urlparse``
    are compared. The scheme, query string and fragment are ignored, so
    ``http://`` and ``https://`` variants compare equal. The comparison
    is a plain case-sensitive string comparison.

    Args:
        url_1: First URL string.
        url_2: Second URL string.

    Returns:
        bool: True if the URLs are considered the same, False otherwise.
    '''
    def check_path(path_1, path_2):
        # Paths match when equal, or when they differ only by a single
        # trailing '/' on one side.
        return (path_1 == path_2
                or path_1 == path_2 + '/'
                or path_1 + '/' == path_2)

    # Cheap shortcut for byte-identical input.
    if url_1 == url_2:
        return True

    # Always fall through to the structural comparison. Using equal string
    # length as a reason to skip it is wrong: "https://a.com/x" and
    # "http://a.com/x/" have the same length yet differ only in scheme
    # and trailing slash.
    url_1_struct = urlparse(url_1)
    url_2_struct = urlparse(url_2)
    netloc_1 = url_1_struct.netloc
    netloc_2 = url_2_struct.netloc

    same_host = (netloc_1 == netloc_2
                 or netloc_1 == 'www.' + netloc_2
                 or 'www.' + netloc_1 == netloc_2)
    return same_host and check_path(url_1_struct.path, url_2_struct.path)
import unittest
from compare_urls import check_url_similarity
class TestUrlSimilarity(unittest.TestCase):
    '''Unit tests for ``check_url_similarity``.'''

    def test_trailing_slash(self):
        '''A trailing slash on one URL is not treated as a difference.'''
        with_slash = "https://www.mygov.in/covid-19/"
        without_slash = "https://www.mygov.in/covid-19"
        outcome = check_url_similarity(with_slash, without_slash)
        self.assertTrue(outcome)

    def test_missing_www_subdomain(self):
        '''A missing ``www.`` prefix is not treated as a difference.'''
        bare_host = "https://mygov.in/covid-19"
        www_host = "https://www.mygov.in/covid-19"
        outcome = check_url_similarity(bare_host, www_host)
        self.assertTrue(outcome)

    def test_missing_www_subdomain_and_trailing_slash(self):
        '''Both variations together still compare as the same URL.'''
        # Slash on the bare-host side first, then on the www side.
        url_pairs = (
            ("https://mygov.in/covid-19/", "https://www.mygov.in/covid-19"),
            ("https://mygov.in/covid-19", "https://www.mygov.in/covid-19/"),
        )
        for first, second in url_pairs:
            self.assertTrue(check_url_similarity(first, second))

    def test_http_difference(self):
        '''URLs that differ in scheme (and ``www.``) compare as the same.'''
        secure = "https://mygov.in/covid-19"
        plain = "http://www.mygov.in/covid-19"
        outcome = check_url_similarity(secure, plain)
        self.assertTrue(outcome)

    def test_different_url(self):
        '''URLs on different hosts are reported as different.'''
        one_site = "https://mygov.in/covid-19"
        other_site = "https://www.india.gov.in/"
        outcome = check_url_similarity(one_site, other_site)
        self.assertFalse(outcome)
# Run the test suite when this module is executed as a script.
if __name__ == "__main__":
    unittest.main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment