Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Scrape DoH provider URLs from cURL's wiki page (see https://raw.githubusercontent.com/wiki/curl/curl/DNS-over-HTTPS)
#!/usr/bin/env python
#
# Scrape DoH provider URLs from curl's DNS-over-HTTPS wiki (https://raw.githubusercontent.com/wiki/curl/curl/DNS-over-HTTPS.md).
#
# Example usage: ./scrape_doh_providers.py '"{} - {}".format(o["url"], o["name"])'
#
import argparse
import re
import urllib.request
# Matches an https URL. With findall() each hit is a (hostname, port, path)
# tuple; the port keeps its leading ':' and both port and path are '' when
# the URL does not contain them.
HTTPS_URL_RE = re.compile(
    r'https://'
    r'(?P<hostname>[0-9a-zA-Z._~-]+)'
    r'(?P<port>:[0-9]+)?'
    r'(?P<path>[0-9a-zA-Z._~/-]+)?'
)

# Matches a provider cell. For a Markdown link "[name](website)" group 2 is
# the name and group 3 the website; for anything else group 4 is the whole
# cell text and the link groups are empty.
PROVIDER_RE = re.compile(
    r'(\[([^\]]+)\]\(([^)]+))\)|(.*)'
)

# URLs that are not DoH URLs (entries are compared against the hostname
# captured by HTTPS_URL_RE).
do_not_include = [
    'my.nextdns.io',
    'blog.cloudflare.com',
    'https://blog.cloudflare.com/welcome-hidden-resolver',
    'https://my.nextdns.io/start',
]
def get_doh_providers():
    """Yield one record per DoH endpoint found in curl's DNS-over-HTTPS wiki table.

    The wiki's Markdown source is downloaded and read line by line. The first
    line starting with ``|`` is taken to be the table header and skipped.
    For every later ``|`` line, the first cell gives the provider (either
    plain text or a ``[name](website)`` link) and the second cell is searched
    for ``https://`` URLs. Rows without a second cell, rows without any
    https URL, and URLs whose hostname is listed in ``do_not_include`` are
    skipped. Reading stops at the first line starting with ``#`` that
    follows the table.

    Yields:
        dict: with the keys
            ``name``     provider name, or ``None`` for an empty cell;
            ``website``  link target of the provider cell, or ``None``;
            ``url``      the rebuilt ``https://host[:port][path]`` URL;
            ``hostname`` host part of the URL;
            ``port``     port digits, ``'443'`` when the URL names no port;
            ``path``     path part of the URL, ``''`` when absent.
    """
    found_table = False
    with urllib.request.urlopen('https://raw.githubusercontent.com/wiki/curl/curl/DNS-over-HTTPS.md') as fp:
        for raw_line in fp:
            line = raw_line.decode()
            if not line.startswith('|'):
                # A heading after the table has started ends the provider list.
                if found_table and line.startswith('#'):
                    break
                continue
            if not found_table:
                # The first table line is the column header, not a provider.
                found_table = True
                continue

            # cols[0] is the empty text before the leading '|'.
            cols = line.split('|')
            if len(cols) < 3:
                # No URL cell on this row, so there is nothing to report.
                continue

            # PROVIDER_RE always yields at least one match; the link groups
            # and the plain-text group are mutually exclusive.
            _, link_text, link_url, plain_text = PROVIDER_RE.findall(cols[1].strip())[0]
            provider_name = link_text or plain_text or None
            website = link_url or None
            if provider_name is not None:
                # Drop everything from the first '[' on, then trailing
                # spaces. rstrip() is safe even if nothing but spaces is left.
                provider_name = re.sub(r'([^[]+)\s?(.*)', r'\1', provider_name).rstrip(' ')

            for hostname, port, path in HTTPS_URL_RE.findall(cols[2]):
                if hostname in do_not_include:
                    continue
                # HTTPS_URL_RE captures the port together with its ':'.
                # Strip it so the URL is not built with a double colon and
                # the 'port' value is digits only.
                port = port.lstrip(':')
                yield {
                    'name': provider_name,
                    'website': website,
                    'url': 'https://{}{}{}'.format(hostname, ':{}'.format(port) if port else '', path),
                    'hostname': hostname,
                    'port': port or '443',
                    'path': path,
                }
def main():
    """Print one line per scraped DoH endpoint.

    The optional positional ``format`` argument is a Python expression that
    is evaluated once per record with the record dict bound to ``o``; its
    value is printed. It defaults to ``o["url"]``. A syntactically invalid
    expression is reported as a usage error before anything is downloaded.
    """
    # example: ./scripts/scrape_doh_providers.py '"{} - {}".format(o["url"], o["name"])'
    parser = argparse.ArgumentParser(description='A script to parse DoH provider URLs from cURL\'s wiki page!')
    parser.add_argument('format', help='Format of output. Example: \'"{} - {}".format(o["url"], o["name"])\'',
                        default='o["url"]', nargs='?')
    args = parser.parse_args()

    # NOTE(security): the format argument is executed as arbitrary Python via
    # eval(). That is the documented interface of this script, so it is kept,
    # but never pass it text that comes from an untrusted source.
    try:
        # Compile once so the expression is not re-parsed for every record
        # and a typo is caught before the network request is made.
        formatter = compile(args.format, '<format>', 'eval')
    except SyntaxError as exc:
        parser.error('invalid format expression: {}'.format(exc))

    for o in get_doh_providers():
        print(eval(formatter, globals(), {'o': o}))


if __name__ == '__main__':
    main()
@djkutiger

This comment has been minimized.

Copy link

@djkutiger djkutiger commented Dec 17, 2020

The table format probably changed since you wrote this script.

#!/usr/bin/env python
#
# Scrape DoH provider URLs from curl's DNS-over-HTTPS wiki (https://raw.githubusercontent.com/wiki/curl/curl/DNS-over-HTTPS.md).
# 
# Example usage: ./scrape_doh_providers.py '"{} - {}".format(o["url"], o["name"])'
#
import argparse
import re
import urllib.request

# Matches an https URL. With findall() each hit is a (hostname, port, path)
# tuple; the port keeps its leading ':' and both port and path are '' when
# the URL does not contain them.
HTTPS_URL_RE = re.compile(
    r'https://'
    r'(?P<hostname>[0-9a-zA-Z._~-]+)'
    r'(?P<port>:[0-9]+)?'
    r'(?P<path>[0-9a-zA-Z._~/-]+)?'
)

# Matches a provider cell. For a Markdown link "[name](website)" group 2 is
# the name and group 3 the website; for anything else group 4 is the whole
# cell text and the link groups are empty.
PROVIDER_RE = re.compile(
    r'(\[([^\]]+)\]\(([^)]+))\)|(.*)'
)

# URLs that are not DoH URLs (entries are compared against the hostname
# captured by HTTPS_URL_RE).
do_not_include = [
    'my.nextdns.io',
    'blog.cloudflare.com',
    'https://blog.cloudflare.com/welcome-hidden-resolver',
    'https://my.nextdns.io/start',
]


def get_doh_providers():
    """Yield one record per DoH endpoint found in curl's DNS-over-HTTPS wiki table.

    The wiki's Markdown source is downloaded and read line by line. The first
    line starting with ``|`` is taken to be the table header and skipped.
    For every later ``|`` line, the first cell gives the provider (either
    plain text or a ``[name](website)`` link) and the second cell is searched
    for ``https://`` URLs. Rows without a second cell, rows without any
    https URL, and URLs whose hostname is listed in ``do_not_include`` are
    skipped. Reading stops at the first line starting with ``#`` that
    follows the table.

    Yields:
        dict: with the keys
            ``name``     provider name, or ``None`` for an empty cell;
            ``website``  link target of the provider cell, or ``None``;
            ``url``      the rebuilt ``https://host[:port][path]`` URL;
            ``hostname`` host part of the URL;
            ``port``     port digits, ``'443'`` when the URL names no port;
            ``path``     path part of the URL, ``''`` when absent.
    """
    found_table = False
    with urllib.request.urlopen('https://raw.githubusercontent.com/wiki/curl/curl/DNS-over-HTTPS.md') as fp:
        for raw_line in fp:
            line = raw_line.decode()
            if not line.startswith('|'):
                # A heading after the table has started ends the provider list.
                if found_table and line.startswith('#'):
                    break
                continue
            if not found_table:
                # The first table line is the column header, not a provider.
                found_table = True
                continue

            # cols[0] is the empty text before the leading '|'.
            cols = line.split('|')
            if len(cols) < 3:
                # No URL cell on this row. Skip it instead of reusing the
                # previous row's URL cell (or hitting an unbound name on the
                # first data row).
                continue

            # PROVIDER_RE always yields at least one match; the link groups
            # and the plain-text group are mutually exclusive.
            _, link_text, link_url, plain_text = PROVIDER_RE.findall(cols[1].strip())[0]
            provider_name = link_text or plain_text or None
            website = link_url or None
            if provider_name is not None:
                # Drop everything from the first '[' on, then trailing
                # spaces. rstrip() is safe even if nothing but spaces is left.
                provider_name = re.sub(r'([^[]+)\s?(.*)', r'\1', provider_name).rstrip(' ')

            for hostname, port, path in HTTPS_URL_RE.findall(cols[2]):
                if hostname in do_not_include:
                    continue
                # HTTPS_URL_RE captures the port together with its ':'.
                # Strip it so the URL is not built with a double colon and
                # the 'port' value is digits only.
                port = port.lstrip(':')
                yield {
                    'name': provider_name,
                    'website': website,
                    'url': 'https://{}{}{}'.format(hostname, ':{}'.format(port) if port else '', path),
                    'hostname': hostname,
                    'port': port or '443',
                    'path': path,
                }

def main():
    """Print one line per scraped DoH endpoint.

    The optional positional ``format`` argument is a Python expression that
    is evaluated once per record with the record dict bound to ``o``; its
    value is printed. It defaults to ``o["url"]``. A syntactically invalid
    expression is reported as a usage error before anything is downloaded.
    """
    # example: ./scripts/scrape_doh_providers.py '"{} - {}".format(o["url"], o["name"])'
    parser = argparse.ArgumentParser(description='A script to parse DoH provider URLs from cURL\'s wiki page!')
    parser.add_argument('format', help='Format of output. Example: \'"{} - {}".format(o["url"], o["name"])\'',
                        default='o["url"]', nargs='?')
    args = parser.parse_args()

    # NOTE(security): the format argument is executed as arbitrary Python via
    # eval(). That is the documented interface of this script, so it is kept,
    # but never pass it text that comes from an untrusted source.
    try:
        # Compile once so the expression is not re-parsed for every record
        # and a typo is caught before the network request is made.
        formatter = compile(args.format, '<format>', 'eval')
    except SyntaxError as exc:
        parser.error('invalid format expression: {}'.format(exc))

    for o in get_doh_providers():
        print(eval(formatter, globals(), {'o': o}))


if __name__ == '__main__':
    main()




@kimbo

This comment has been minimized.

Copy link
Owner Author

@kimbo kimbo commented Jan 24, 2021

Thanks @djkutiger, updated it to skip the rows with "A", "B", and so on.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment