@capjamesg
Created September 14, 2021 11:15
concurrent.py
import concurrent.futures

import url_handling

# final_urls, namespaces_to_ignore, images_indexed, image_urls, links, external_links,
# discovered_urls, broken_urls, iterate_list_of_urls, site, and crawl_budget are
# assumed to be defined earlier in the crawler.
indexed = 0
all_links_final = set(iterate_list_of_urls)
indexed_list = {}
to_index = list(set(iterate_list_of_urls))

with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    # submit one crawl job per URL discovered so far
    pending = {executor.submit(url_handling.crawl_urls, final_urls, namespaces_to_ignore, indexed, images_indexed, image_urls, links, external_links, discovered_urls, broken_urls, iterate_list_of_urls, site, crawl_budget, url) for url in to_index}

    # wait() is re-called on each pass so jobs submitted for newly discovered URLs
    # are also awaited; as_completed() snapshots its argument, so appending new
    # futures to the original list would never be picked up
    while pending and indexed < crawl_budget:
        done, pending = concurrent.futures.wait(pending, return_when=concurrent.futures.FIRST_COMPLETED)
        for future in done:
            print(indexed)
            if indexed == crawl_budget:
                break
            _, images_indexed, iterate_list_of_urls, all_links, final_urls, url_indexed, discovered = future.result()
            indexed += 1
            indexed_list[url_indexed] = True
            # queue any newly discovered URLs that have not yet been crawled
            for item in discovered.keys():
                if not indexed_list.get(item):
                    print("{} not indexed, added".format(item))
                    pending.add(executor.submit(url_handling.crawl_urls, final_urls, namespaces_to_ignore, indexed, images_indexed, image_urls, links, external_links, discovered_urls, broken_urls, iterate_list_of_urls, site, crawl_budget, item))
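Below is a minimal, self-contained sketch of the same submit-as-you-discover pattern, for readers who want to run it without the rest of the crawler. fetch() and LINK_GRAPH are hypothetical stand-ins for url_handling.crawl_urls and a live site; only the concurrency structure mirrors the snippet above.

import concurrent.futures

# toy link graph standing in for the live web
LINK_GRAPH = {
    "https://example.com/": ["https://example.com/a", "https://example.com/b"],
    "https://example.com/a": ["https://example.com/b", "https://example.com/c"],
    "https://example.com/b": [],
    "https://example.com/c": [],
}

def fetch(url):
    # hypothetical stand-in for fetching and parsing a page
    return url, LINK_GRAPH.get(url, [])

crawl_budget = 10
seen = {"https://example.com/"}

with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    pending = {executor.submit(fetch, url) for url in seen}
    crawled = 0
    while pending and crawled < crawl_budget:
        # handle whichever crawls finish first, then submit any new links found
        done, pending = concurrent.futures.wait(pending, return_when=concurrent.futures.FIRST_COMPLETED)
        for future in done:
            url, links = future.result()
            crawled += 1
            print("crawled", url)
            for link in links:
                if link not in seen:
                    seen.add(link)
                    pending.add(executor.submit(fetch, link))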