JustusAdam/crawl.py

## crawl.py
#!/usr/bin/env python3

import re
from urllib.request import urlopen
import json
from itertools import chain


# Absolute links into an sz-online.de subdomain.
# Group 1 is the scheme plus host, group 3 is the path ending in ".html".
linkregex = re.compile(r'(https?://([\w\.-]*)?\.sz-online\.de)(/[\w/-]*\.html)')

# Site-relative links inside href attributes; group 1 is the ".html" path.
linkregex2 = re.compile(r'href=\"(/[\w/-]*\.html)\"')

# Page title; group 1 is the title text without surrounding whitespace.
# The capture is non-greedy so it stops at the first closing tag; a greedy
# capture under DOTALL would run on to the last "</title>" in the document
# (for example one belonging to an inline SVG).
titleregex = re.compile(
    r"""<title(?:\s[^>]*)?>\s*(.+?)\s*</title>""",
    re.DOTALL | re.IGNORECASE,
)


def main_loop(queue, results):
    """Crawl pages taken from *queue* and record their titles in *results*.

    Both arguments are mutated in place.

    Args:
        queue: list of ``(domain, path)`` tuples still to visit. Targets are
            popped from the end, so the most recently discovered link is
            visited first.
        results: dict mapping ``(domain, path)`` to the page title. Newly
            discovered links are inserted with ``None`` so they are queued
            only once; a visited page without a ``<title>`` also stays
            ``None``.

    The loop stops when *results* holds at least 1000 entries (visited or
    merely discovered) or when *queue* is empty.
    """
    while len(results) < 1000 and queue:
        target = queue.pop()
        domain, path = target

        print('Next target: ', domain + path)

        try:
            # The context manager closes the HTTP response even if reading
            # or decoding fails.
            with urlopen(domain + path) as response:
                page = response.read().decode()
        except (OSError, UnicodeDecodeError):
            # URLError and HTTPError derive from OSError. One unreachable or
            # undecodable page must not abort the whole crawl.
            continue

        found = titleregex.search(page)
        if found is None:
            # No <title> on the page: record the visit without a title
            # instead of failing on ``None.group``.
            title = None
        else:
            title = found.group(1)
            print('Found title: ', title)

        results[target] = title

        absolute = (m.group(1, 3) for m in linkregex.finditer(page))
        relative = ((domain, m.group(1)) for m in linkregex2.finditer(page))

        for t in chain(absolute, relative):
            if t not in results:
                # Mark as seen right away so the link is queued only once.
                results[t] = None
                queue.append(t)


def main():
    """Crawl from the sz-online.de front page and write titles to results.json.

    The crawl starts with a single seed target, prints how many entries were
    collected, and dumps a ``{url: title}`` mapping as indented JSON.
    """
    seed = ('http://www.sz-online.de', "")
    found = {}

    main_loop([seed], found)

    # JSON object keys must be strings, so join each (domain, path) tuple
    # back into a full URL.
    by_url = {}
    for (domain, path), title in found.items():
        by_url[domain + path] = title

    print(len(found), ' entries found')

    with open('results.json', 'w') as out:
        json.dump(by_url, out, indent=2)


# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
	#!/usr/bin/env python3

	import re
	from urllib.request import urlopen
	import json
	from itertools import chain


	# Absolute links into an sz-online.de subdomain.
	# Group 1 is the scheme plus host, group 3 is the path ending in ".html".
	# The "*" quantifiers are required: without them only a one-character
	# subdomain and a one-character path would match.
	linkregex = re.compile(r'(https?://([\w\.-]*)?\.sz-online\.de)(/[\w/-]*\.html)')

	# Site-relative links inside href attributes; group 1 is the ".html" path.
	linkregex2 = re.compile(r'href=\"(/[\w/-]*\.html)\"')

	# Page title; group 1 is the title text without surrounding whitespace.
	# The capture is non-greedy so it stops at the first closing tag; a greedy
	# capture under DOTALL would run on to the last "</title>" in the document.
	titleregex = re.compile(
	    r"""<title(?:\s[^>]*)?>\s*(.+?)\s*</title>""",
	    re.DOTALL | re.IGNORECASE,
	)


	def main_loop(queue, results):
	    """Crawl pages taken from *queue* and record their titles in *results*.

	    Both arguments are mutated in place.

	    Args:
	        queue: list of ``(domain, path)`` tuples still to visit. Targets are
	            popped from the end, so the most recently discovered link is
	            visited first.
	        results: dict mapping ``(domain, path)`` to the page title. Newly
	            discovered links are inserted with ``None`` so they are queued
	            only once; a visited page without a ``<title>`` also stays
	            ``None``.

	    The loop stops when *results* holds at least 1000 entries (visited or
	    merely discovered) or when *queue* is empty.
	    """
	    while len(results) < 1000 and queue:
	        target = queue.pop()
	        domain, path = target

	        print('Next target: ', domain + path)

	        try:
	            # The context manager closes the HTTP response even if reading
	            # or decoding fails.
	            with urlopen(domain + path) as response:
	                page = response.read().decode()
	        except (OSError, UnicodeDecodeError):
	            # URLError and HTTPError derive from OSError. One unreachable
	            # or undecodable page must not abort the whole crawl.
	            continue

	        found = titleregex.search(page)
	        if found is None:
	            # No <title> on the page: record the visit without a title
	            # instead of failing on ``None.group``.
	            title = None
	        else:
	            title = found.group(1)
	            print('Found title: ', title)

	        results[target] = title

	        absolute = (m.group(1, 3) for m in linkregex.finditer(page))
	        relative = ((domain, m.group(1)) for m in linkregex2.finditer(page))

	        for t in chain(absolute, relative):
	            if t not in results:
	                # Mark as seen right away so the link is queued only once.
	                results[t] = None
	                queue.append(t)


	def main():
	    """Crawl from the sz-online.de front page and write titles to results.json.

	    The crawl starts with a single seed target, prints how many entries
	    were collected, and dumps a ``{url: title}`` mapping as indented JSON.
	    """
	    results = {}

	    queue = [('http://www.sz-online.de', "")]

	    main_loop(queue, results)

	    # JSON object keys must be strings, so join each (domain, path) tuple
	    # back into a full URL.
	    jsonresults = {domain + path: title for (domain, path), title in results.items()}

	    print(len(results), ' entries found')

	    with open('results.json', 'w') as file:
	        json.dump(jsonresults, file, indent=2)





	# Run the crawler only when executed as a script, not when imported.
	if __name__ == '__main__':
	    main()