@mhmdiaa
Last active November 22, 2023 12:27
import re
import sys
from multiprocessing.dummy import Pool

import requests


def robots(host):
    """Query the Wayback Machine CDX API for archived robots.txt snapshots of a host."""
    query = (
        'https://web.archive.org/cdx/search/cdx?url=%s/robots.txt'
        '&output=json&fl=timestamp,original&filter=statuscode:200&collapse=digest' % host
    )
    r = requests.get(query)
    if not r.text.strip():  # the CDX API returns an empty body when there are no captures
        return []
    results = r.json()
    if len(results) == 0:  # might find nothing
        return []
    results.pop(0)  # the first row is the header: ['timestamp', 'original']
    return results


def getpaths(snapshot):
    """Fetch one archived snapshot and extract every path it mentions."""
    url = 'https://web.archive.org/web/{0}/{1}'.format(snapshot[0], snapshot[1])
    robotstext = requests.get(url).text
    if 'Disallow:' in robotstext:  # verify it's actually a robots.txt file, not a 404 page
        return re.findall('/.*', robotstext)
    return []


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print('Usage:\n\tpython3 waybackrobots.py <domain-name>')
        sys.exit()

    host = sys.argv[1]
    snapshots = robots(host)
    print('Found %s unique results' % len(snapshots))
    if len(snapshots) == 0:
        sys.exit()

    print('This may take some time...')
    pool = Pool(4)  # fetch snapshots with 4 worker threads
    paths = pool.map(getpaths, snapshots)

    unique_paths = set()
    for path_list in paths:
        unique_paths.update(path_list)

    filename = '%s-robots.txt' % host
    with open(filename, 'w') as f:
        f.write('\n'.join(unique_paths))
    print('[*] Saved results to %s' % filename)
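
For context, the CDX API's JSON output is a list of rows whose first row is a header naming the requested fields, which is why robots() drops the first item before returning. A minimal sketch of calling the two functions directly (example.com is a placeholder domain and the response values are hypothetical):

# Hypothetical shape of the CDX response for fl=timestamp,original:
# [["timestamp", "original"],
#  ["20170101000000", "http://example.com/robots.txt"],
#  ["20180305120000", "https://example.com/robots.txt"]]
snapshots = robots('example.com')  # -> [['20170101000000', 'http://example.com/robots.txt'], ...]
if snapshots:
    print(getpaths(snapshots[0]))  # paths extracted from the first snapshot returned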
@ashrafed commented Apr 6, 2020

any help for solving this error, please?

@Fawadkhanfk

any help for solving this error, please?

install the required libraries

@Harsh5922

python3 waybackrobots.py https://www.abc.in/
Found 4 unique results
This may take some time...
Traceback (most recent call last):
File "/home/wizard/BugBountyTool/waybackrobots.py", line 45, in
with open(filename, 'w') as f:
FileNotFoundError: [Errno 2] No such file or directory: 'https://www.abc.in/-robots.txt'

(here abc stands for the actual domain)

@MAyyan125

It's working fine on my end; just install the libraries it imports and save the file as waybackrobots.py.
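
For what it's worth, the FileNotFoundError above happens because a full URL was passed instead of a bare domain: the slashes from https:// end up in the output filename, and open() treats them as directory separators. A minimal sketch of normalizing the argument before using it (strip_scheme is a made-up helper name, not part of the original script):

import sys
from urllib.parse import urlparse

def strip_scheme(arg):
    # Accept 'example.com' or 'https://example.com/' and return just the host part
    parsed = urlparse(arg if '://' in arg else '//' + arg)
    return parsed.netloc

host = strip_scheme(sys.argv[1])  # 'https://www.abc.in/' -> 'www.abc.in'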
