@darighost
Created April 29, 2023 15:35
Skeleton for Tor crawler (for Eamon)
import requests
import re
# This script assumes you already have Tor installed and running
# Snagged from StackOverflow, haven't tested it!
def get_tor_session():
    session = requests.session()
    # My Tor daemon is on port 9150;
    # on your computer, it's more likely 9050
    session.proxies = {'http': 'socks5h://127.0.0.1:9150',
                       'https': 'socks5h://127.0.0.1:9150'}
    return session
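
# A quick way to confirm the proxy actually works before crawling anything.
# This check is a sketch resting on one assumption: check.torproject.org
# includes the word "Congratulations" in its HTML when your request really
# did exit through Tor.
def tor_is_working():
    return "Congratulations" in get_tor_session().get("https://check.torproject.org").text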
# We start out with the URL for tor.taxi, a Tor link aggregator
seed_url = "http://tortaxi2dev6xjwbaydqzla77rrnth7yn2oqzjfmiuwn5h6vsk2a4syd.onion/"
# This regular expression will help us extract Tor links to crawl
# ChatGPT actually wrote this regex for me!
onion_regex = re.compile(r"[a-z0-9]+\.onion")  # note: the dot must be escaped
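# Quick sanity check of the regex on a made-up address:
assert onion_regex.findall('href="http://exampleonionaddr.onion/"') == ['exampleonionaddr.onion']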
def crawl(url, found_urls=None, dead_links=None, visited=None,
          search_term="hacking", session=None):
    # Avoid mutable default arguments -- a classic Python gotcha that would
    # silently share one list across every call to crawl()
    if found_urls is None:
        found_urls = []
    if dead_links is None:
        dead_links = []
    if visited is None:
        visited = set()  # every URL we've tried, so we never revisit dupes
    if session is None:
        session = get_tor_session()
    visited.add(url)
    try:
        # Todo: use Beautiful Soup to extract only the visible text from the page
        # (including the title and meta description of course) -- see the
        # visible_text() sketch below
        html = session.get('http://' + url).text
    except requests.RequestException:
        dead_links.append(url)
        return  # URL is a dead link; mark it as such so we don't visit it again
    # Eventually we'll replace all of these in-memory collections with proper databases...
    if search_term in html:
        print('Found search term:', url)
        found_urls.append(url)
    links = onion_regex.findall(html)
    for link in links:
        # visited already covers dupes and dead links, so skip anything seen before
        if link not in visited:
            crawl(link, found_urls, dead_links, visited, search_term, session)
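
# Sketch of the Beautiful Soup extraction mentioned in the TODO above --
# untested, and it assumes you've run `pip install beautifulsoup4`.
def visible_text(html):
    from bs4 import BeautifulSoup  # imported here so the rest of the script runs without it
    soup = BeautifulSoup(html, 'html.parser')
    # Script and style contents aren't "visible" text, so throw them away
    for tag in soup(['script', 'style']):
        tag.decompose()
    # Pull out the meta description separately -- it lives in an attribute,
    # not in the page text
    meta = soup.find('meta', attrs={'name': 'description'})
    description = meta['content'] if meta and meta.has_attr('content') else ''
    return description + ' ' + soup.get_text(separator=' ')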
# Make sure the basics work...
session = get_tor_session()
html = session.get(seed_url).text
links = onion_regex.findall(html)
print(links)
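# Once the smoke test looks right, kick off the real crawl from the seed's
# links (left commented out -- a full crawl can run for a very long time):
# for link in links:
#     crawl(link, session=session)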