@darighost
Created April 29, 2023 15:35
Skeleton for Tor crawler (for Eamon)
import requests
import re
# This script assumes you already have Tor installed and running
# Snagged from StackOverflow, haven't tested it!
def get_tor_session():
    session = requests.session()
    # My Tor daemon is on port 9150;
    # on your computer, it's more likely 9050
    session.proxies = {'http': 'socks5h://127.0.0.1:9150',
                       'https': 'socks5h://127.0.0.1:9150'}
    return session
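
# A quick way to confirm the proxy actually works before crawling anything.
# This check is a sketch resting on one assumption: check.torproject.org
# includes the word "Congratulations" in its HTML when your request really
# did exit through Tor.
def tor_is_working():
    return "Congratulations" in get_tor_session().get("https://check.torproject.org").text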
# We start out with the URL for tor.taxi, a Tor link aggregator
seed_url = "http://tortaxi2dev6xjwbaydqzla77rrnth7yn2oqzjfmiuwn5h6vsk2a4syd.onion/"
# This regular expression will help us extract Tor links to crawl
# ChatGPT actually wrote this regex for me!
onion_regex = re.compile(r"[a-z0-9]+\.onion")  # note: the dot must be escaped
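# Quick sanity check of the regex on a made-up address:
assert onion_regex.findall('href="http://exampleonionaddr.onion/"') == ['exampleonionaddr.onion']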
def crawl(url, found_urls=None, dead_links=None, visited=None,
          search_term="hacking", session=None):
    # Avoid mutable default arguments -- a classic Python gotcha that would
    # silently share one list across every call to crawl()
    if found_urls is None:
        found_urls = []
    if dead_links is None:
        dead_links = []
    if visited is None:
        visited = set()  # every URL we've tried, so we never revisit dupes
    if session is None:
        session = get_tor_session()
    visited.add(url)
    try:
        # Todo: use Beautiful Soup to extract only the visible text from the page
        # (including the title and meta description of course) -- see the
        # visible_text() sketch below
        html = session.get('http://' + url).text
    except requests.RequestException:
        dead_links.append(url)
        return  # URL is a dead link; mark it as such so we don't visit it again
    # Eventually we'll replace all of these in-memory collections with proper databases...
    if search_term in html:
        print('Found search term:', url)
        found_urls.append(url)
    links = onion_regex.findall(html)
    for link in links:
        # visited already covers dupes and dead links, so skip anything seen before
        if link not in visited:
            crawl(link, found_urls, dead_links, visited, search_term, session)
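
# Sketch of the Beautiful Soup extraction mentioned in the TODO above --
# untested, and it assumes you've run `pip install beautifulsoup4`.
def visible_text(html):
    from bs4 import BeautifulSoup  # imported here so the rest of the script runs without it
    soup = BeautifulSoup(html, 'html.parser')
    # Script and style contents aren't "visible" text, so throw them away
    for tag in soup(['script', 'style']):
        tag.decompose()
    # Pull out the meta description separately -- it lives in an attribute,
    # not in the page text
    meta = soup.find('meta', attrs={'name': 'description'})
    description = meta['content'] if meta and meta.has_attr('content') else ''
    return description + ' ' + soup.get_text(separator=' ')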
# Make sure the basics work...
session = get_tor_session()
html = session.get(seed_url).text
links = onion_regex.findall(html)
print(links)
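# Once the smoke test looks right, kick off the real crawl from the seed's
# links (left commented out -- a full crawl can run for a very long time):
# for link in links:
#     crawl(link, session=session)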