Skip to content

Instantly share code, notes, and snippets.

@reneluria
Last active November 25, 2022 09:40
Show Gist options
  • Save reneluria/348cc68cc789fe38ec4f77fd3bb89b96 to your computer and use it in GitHub Desktop.
Save reneluria/348cc68cc789fe38ec4f77fd3bb89b96 to your computer and use it in GitHub Desktop.
Parse traceroute.org and return json with links by country
"""
Parse traceroute.org and return json with links by country
"""
import urllib.request
from lxml import html, etree # type: ignore[import]
import xml.etree.ElementTree as ET
import json
from typing import Dict, List
TRACEROUTE_URL = "http://www.traceroute.org"
# Upper bound on the HTTP fetch so an unresponsive server cannot hang the script.
FETCH_TIMEOUT_SECONDS = 30


def collect_links(root: ET.Element) -> Dict[str, List[Dict[str, str]]]:
    """Group the links found under ``<body>`` by country anchor name.

    The direct children of ``<body>`` are scanned in document order:

    * everything up to and including the first ``<hr>`` is skipped;
    * an ``<h3>`` whose first child is ``<a name="...">`` starts a new
      country, keyed by that ``name`` attribute;
    * each following ``<ul>`` contributes its ``li/a`` links to the
      current country.

    Links without text or without an ``href`` are ignored, and headings
    that carry no named anchor leave the current country unchanged.

    Args:
        root: The ``<html>`` element of the parsed page.

    Returns:
        A mapping from country anchor name to a list of
        ``{"text": ..., "url": ...}`` dictionaries, in document order.
    """
    links: Dict[str, List[Dict[str, str]]] = {}
    started = False
    country = None
    for element in root.findall("./body/*"):
        if not started:
            # Country sections only begin after the first <hr>.
            started = element.tag == "hr"
            continue

        if element.tag == "h3":
            # A country heading is an <h3> whose first child is a named anchor.
            # Compare against None explicitly: an Element without children
            # is falsy, so a truthiness test would be wrong here.
            anchor = element[0] if len(element) else None
            if anchor is not None and anchor.tag == "a":
                name = anchor.get("name")
                if name is not None:
                    country = name
                    links[country] = []
        elif element.tag == "ul" and country:
            for link in element.findall("./li/a"):
                # .get() tolerates anchors that have no href attribute.
                href = link.get("href")
                if link.text and href:
                    links[country].append({"text": link.text, "url": href})
    return links


def main() -> None:
    """Fetch traceroute.org and print its links, grouped by country, as JSON."""
    # For offline runs, read a saved copy of the page instead of fetching it.
    with urllib.request.urlopen(
        TRACEROUTE_URL, timeout=FETCH_TIMEOUT_SECONDS
    ) as response:
        htmldoc = html.fromstring(response.read())
    # Serialise the lxml tree and re-parse it with ElementTree, which is what
    # collect_links() walks.
    root = ET.fromstring(etree.tostring(htmldoc))
    print(json.dumps(collect_links(root), indent=2))


if __name__ == "__main__":
    main()
@reneluria
Copy link
Author

Run the script and redirect its output to a file to store the JSON:

python3 parse-traceroute.py > links.json

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment