Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
import requests
import re
import sys
from multiprocessing.dummy import Pool
def robots(host):
r = requests.get(
'https://web.archive.org/cdx/search/cdx\
?url=%s/robots.txt&output=json&fl=timestamp,original&filter=statuscode:200&collapse=digest' % host)
results = r.json()
if len(results) == 0: # might find nothing
return []
results.pop(0) # The first item is ['timestamp', 'original']
return results
def getpaths(snapshot):
url = 'https://web.archive.org/web/{0}/{1}'.format(snapshot[0], snapshot[1])
robotstext = requests.get(url).text
if 'Disallow:' in robotstext: # verify it's acually a robots.txt file, not 404 page
paths = re.findall('/.*', robotstext)
return paths
return []
if __name__ == '__main__':
if len(sys.argv) < 2:
print('Usage:\n\tpython3 waybackrobots.py <domain-name>')
sys.exit()
host = sys.argv[1]
snapshots = robots(host)
print('Found %s unique results' % len(snapshots))
if len(snapshots) == 0:
sys.exit()
print('This may take some time...')
pool = Pool(4)
paths = pool.map(getpaths, snapshots)
unique_paths = set()
for i in paths:
unique_paths.update(i)
filename = '%s-robots.txt' % host
with open(filename, 'w') as f:
f.write('\n'.join(unique_paths))
print('[*] Saved results to %s' % filename)
@Fawadkhanfk
Copy link

Fawadkhanfk commented Sep 2, 2021

any help for solving this error, please?

install the required libraries

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment