import requests | |
import re | |
import sys | |
from multiprocessing.dummy import Pool | |
def robots(host):
    """Query the Wayback Machine CDX API for archived robots.txt snapshots.

    Parameters:
        host: domain name to look up (e.g. 'example.com').

    Returns:
        A list of [timestamp, original_url] pairs, one per unique
        successfully archived (HTTP 200) robots.txt snapshot; an empty
        list when nothing was found.
    """
    # Let requests build and percent-encode the query string instead of
    # hand-concatenating it into the URL.
    r = requests.get(
        'https://web.archive.org/cdx/search/cdx',
        params={
            'url': '%s/robots.txt' % host,
            'output': 'json',
            'fl': 'timestamp,original',
            'filter': 'statuscode:200',   # only successfully archived copies
            'collapse': 'digest',         # de-duplicate identical file contents
        },
        timeout=30,  # don't hang forever on a stalled archive.org connection
    )
    results = r.json()
    if not results:  # the API may legitimately find nothing
        return []
    # The first row is the header ['timestamp', 'original']; skip it.
    return results[1:]
def getpaths(snapshot):
    """Download one archived robots.txt snapshot and extract its paths.

    Parameters:
        snapshot: a [timestamp, original_url] pair as returned by robots().

    Returns:
        A list of path strings taken from Allow:/Disallow: directives, or
        an empty list when the snapshot does not look like a robots.txt
        file (e.g. an archived 404 page).
    """
    url = 'https://web.archive.org/web/{0}/{1}'.format(snapshot[0], snapshot[1])
    robotstext = requests.get(url, timeout=30).text
    if 'Disallow:' not in robotstext:  # verify it's actually a robots.txt file, not a 404 page
        return []
    # Pull paths only from real directives; the old '/.*' pattern also
    # matched URLs inside comments and Sitemap: lines. Stop at whitespace
    # or an inline '#' comment.
    return re.findall(r'(?:Allow|Disallow):\s*(/[^\s#]*)', robotstext)
if __name__ == '__main__':
    if len(sys.argv) < 2:
        print('Usage:\n\tpython3 waybackrobots.py <domain-name>')
        sys.exit()
    host = sys.argv[1]
    snapshots = robots(host)
    print('Found %s unique results' % len(snapshots))
    if not snapshots:
        sys.exit()
    print('This may take some time...')
    # Fetching is I/O-bound, so a thread pool (multiprocessing.dummy)
    # is appropriate; close/join it so worker threads are not leaked.
    pool = Pool(4)
    try:
        paths = pool.map(getpaths, snapshots)
    finally:
        pool.close()
        pool.join()
    # Flatten the per-snapshot lists and drop duplicate paths.
    unique_paths = set()
    for path_list in paths:
        unique_paths.update(path_list)
    filename = '%s-robots.txt' % host
    with open(filename, 'w') as f:
        f.write('\n'.join(unique_paths))
    print('[*] Saved results to %s' % filename)
This comment has been minimized.
This comment has been minimized.
How can I use this? |
This comment has been minimized.
This comment has been minimized.
$ python3 waybackrobots.py |
This comment has been minimized.
This comment has been minimized.
This comment has been minimized.
This comment has been minimized.
Any help solving this error would be appreciated, please?
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This comment has been minimized.
It's giving me "Failed to establish a new connection: [Errno 111] Connection refused" — this error occurs most of the time.