Skip to content

Instantly share code, notes, and snippets.

@JustusAdam
Created July 23, 2015 14:47
Show Gist options
  • Save JustusAdam/6c610038503b6f0f4e3a to your computer and use it in GitHub Desktop.
Save JustusAdam/6c610038503b6f0f4e3a to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
import re
from urllib.request import urlopen
import json
from itertools import chain
# Absolute links to *.html pages on a sz-online.de subdomain.
# Group 1 is scheme + host, group 2 the subdomain part, group 3 the path.
linkregex = re.compile(r'(https?://([\w\.-]*)?\.sz-online\.de)(/[\w/-]*\.html)')
# Site-relative links of the form href="/....html"; group 1 is the path.
linkregex2 = re.compile(r'href=\"(/[\w/-]*\.html)\"')
# Page title. The group is lazy so that, with DOTALL, the match stops at the
# FIRST closing </title> tag instead of running to the last one in the page,
# and so that the trailing \s* actually strips whitespace before the tag.
titleregex = re.compile(r"""<title>(.+?)\s*</title>""", re.DOTALL)
def main_loop(queue, results):
    """Crawl the targets in ``queue`` and record page titles in ``results``.

    Targets are ``(domain, path)`` tuples. They are popped from the end of
    ``queue`` until the queue is empty or ``results`` holds 1000 entries.
    Discovered-but-unvisited targets count towards that limit, because they
    are stored with a ``None`` placeholder as soon as they are found.

    Args:
        queue: Mutable list of ``(domain, path)`` targets still to visit.
            Newly discovered targets are appended to it.
        results: Mutable dict mapping ``(domain, path)`` to the page title.
            The value is ``None`` for targets that were discovered but not
            fetched successfully, and for pages without a ``<title>``.

    Pages that cannot be fetched or decoded are reported and skipped so a
    single bad link does not abort the whole crawl.
    """
    # Local import: only needed for the error handling below.
    from http.client import HTTPException

    while len(results) < 1000 and queue:
        target = queue.pop()
        domain, path = target
        url = domain + path
        print('Next target: ', url)
        try:
            # The context manager closes the connection even if read() fails.
            with urlopen(url) as response:
                page = response.read().decode()
        except (OSError, HTTPException, UnicodeDecodeError) as error:
            # OSError covers URLError/HTTPError and socket-level failures.
            print('Skipping target: ', error)
            continue

        # A page without a <title> is still recorded (as None) and its links
        # are still followed, instead of crashing on ``None.group``.
        match = titleregex.search(page)
        title = match.group(1) if match else None
        print('Found title: ', title)
        results[target] = title

        absolute = (m.group(1, 3) for m in linkregex.finditer(page))
        relative = ((domain, m.group(1)) for m in linkregex2.finditer(page))
        for found in chain(absolute, relative):
            if found not in results:
                # Placeholder marks the target as seen so it is queued once.
                results[found] = None
                queue.append(found)
def main():
    """Crawl from the sz-online.de start page and dump titles to results.json.

    Runs ``main_loop`` with a queue seeded with the site root, prints how
    many entries were collected, and writes a JSON object mapping each full
    URL (domain + path) to its recorded title.
    """
    crawled = {}
    pending = [('http://www.sz-online.de', "")]
    main_loop(pending, crawled)

    # JSON object keys must be strings, so join each (domain, path) tuple.
    by_url = {}
    for (domain, path), title in crawled.items():
        by_url[domain + path] = title

    print(len(crawled), ' entries found')
    with open('results.json', 'w') as out:
        json.dump(by_url, out, indent=2)
# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment