Skip to content

Instantly share code, notes, and snippets.

@lopes
Created July 4, 2024 12:28
Show Gist options
  • Save lopes/efeda0f0556e57a683db024124984af9 to your computer and use it in GitHub Desktop.
Connects to the Tor Project's exit-addresses page and parses the exit node data into JSON format.
#!/usr/bin/env python3
#torpids.py
'''
Connects to the Tor Project's exit-addresses page and parses the
exit node data into JSON format.
Data is fetched from: https://check.torproject.org/exit-addresses
Data is "Node ID-centric" in Tor Project's page, but here it is
"IP address-centric" to make it easier to use the data for
detection purposes. In other words, while in the Tor Project's
we might have duplicate IP addresses with different Node IDs,
here we have unique IP addresses with a list of Node IDs --
the Node IDs might be duplicated, not the IP addresses.
Having the JSON data, it should be easy to feed any dataset
or database with the exit nodes data for further use in analytics.
Author.: Joe Lopes <lopes.id>
Date...: 2024-07-04
License: MIT
'''
from datetime import datetime
from urllib.request import urlopen
from re import compile, DOTALL
from json import dumps
from sys import stderr
# Source page listing active Tor exit nodes and their exit IP addresses.
url = 'https://check.torproject.org/exit-addresses'
# One "ExitNode" stanza: node ID, a Published and a LastStatus timestamp,
# then one or more "ExitAddress" lines.  The trailing group captures the
# whole run of ExitAddress lines so they can be re-scanned individually
# with re_exit_address below.
re_exit_node = compile(r'(?P<node>ExitNode\s(?P<node_id>[\dA-Z]+)\nPublished\s(?P<node_published_ts>[\s\d:-]+)\nLastStatus\s(?P<node_status_ts>[\s\d:-]+)\n(?P<exit_addresses>ExitAddress\s[\d.\s:-]+\n?(ExitAddress\s[\d.\s:-]+\n?)*))', DOTALL)
# A single "ExitAddress <ip> <timestamp>" line inside a node stanza.
re_exit_address = compile(r'ExitAddress\s(?P<address>[^\s]+)\s(?P<address_ts>[-:\d\b]+)')
# Output skeleton: one entry per unique IP address, each holding the list
# of exit nodes observed at that address (see module docstring).
tor_exit_nodes = {
'timestamp': datetime.now().isoformat(),
'source': f'Generated by {__file__} based on {url}',
'total_exit_addresses': 0,
'exit_addresses': list()
}
# IP addresses already appended to tor_exit_nodes['exit_addresses'],
# used to decide between "new entry" and "append to existing entry".
unique_addresses = set()
def fetch_nodes(url, timeout=30):
    '''Fetch the raw exit-addresses page.

    Args:
        url: address of the exit-addresses page (any urlopen-able URL).
        timeout: seconds to wait for the connection before giving up;
            the original had no timeout and could hang indefinitely.

    Returns:
        The page body decoded as UTF-8, or None on any network/OS error
        (an error message is printed to stderr).
    '''
    try:
        with urlopen(url, timeout=timeout) as response:
            return response.read().decode('utf-8')
    # OSError covers urllib.error.URLError / HTTPError and socket
    # timeouts -- narrower than the original blanket Exception catch.
    except OSError as e:
        print(f'Error fetching nodes: {e}', file=stderr)
        return None
##
# MAIN
#
raw = fetch_nodes(url)
if not raw:
    raise SystemExit(1)

# Index entries by IP address so a repeated address is found in O(1)
# instead of rescanning the whole exit_addresses list (was O(n^2)).
entries_by_address = {}
for match in re_exit_node.finditer(raw):
    # Node metadata shared by every ExitAddress line in this stanza.
    node_id = match.group('node_id')
    published = match.group('node_published_ts')
    last_status = match.group('node_status_ts')
    for a in re_exit_address.finditer(match.group('exit_addresses')):
        address = a.group('address')
        # One observation of this node at this address.
        record = {
            'timestamp': a.group('address_ts'),
            'exit_node': {
                'id': node_id,
                'published': published,
                'last_status': last_status
            }
        }
        if address not in entries_by_address:
            # First time we see this IP: create its entry.
            entry = {'address': address, 'exit_nodes': [record]}
            entries_by_address[address] = entry
            tor_exit_nodes['exit_addresses'].append(entry)
        else:
            # Known IP: attach this node observation to the existing entry.
            entries_by_address[address]['exit_nodes'].append(record)

tor_exit_nodes['total_exit_addresses'] = len(tor_exit_nodes['exit_addresses'])
print(dumps(tor_exit_nodes))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment