Skip to content

Instantly share code, notes, and snippets.

@thisismattmiller
Created April 11, 2021 14:43
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save thisismattmiller/7206277ad137daf9aef9024f6d9b4490 to your computer and use it in GitHub Desktop.
Save thisismattmiller/7206277ad137daf9aef9024f6d9b4490 to your computer and use it in GitHub Desktop.
Download the Yahoo Answers sitemap: collect every URL listed under the US sitemap index and save the list as JSON.
import requests
import xml.etree.ElementTree as ET
import json
import multiprocessing
# Module-level accumulator for the URLs gathered from the individual sitemap
# files; the __main__ block below fills it and dumps it to yahoo_answers.json.
answer_urls = []
def do_work(url):
    """Download one sitemap XML file and return the locations it lists.

    Args:
        url: Address of the sitemap XML document to fetch.

    Returns:
        A list holding the text of every ``loc`` element found two levels
        below the document root (root -> entry -> ``loc``), in document
        order.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
        xml.etree.ElementTree.ParseError: If the response body is not
            well-formed XML.
    """
    print(url)  # progress trace: one line per sitemap file being fetched

    # Without a timeout a stalled connection would block this worker forever.
    r = requests.get(url, timeout=60)
    # Surface 4xx/5xx responses directly instead of parsing an error page.
    r.raise_for_status()

    # Parse the raw bytes so the XML declaration, not the HTTP layer's
    # charset guess, decides how the document is decoded.
    file_root = ET.fromstring(r.content)

    urls = []
    for child in file_root:
        for loc in child:
            # Keep only <loc> elements (namespace-agnostic), matching the
            # filtering the __main__ block applies to the sitemap index;
            # sibling elements such as <lastmod> are not URLs.
            if loc.tag.rsplit('}', 1)[-1] == 'loc':
                urls.append(loc.text)
    return urls
if __name__ == "__main__":
    # Script entry point: read the top-level sitemap index, fetch every
    # sitemap file it references in parallel, and write all collected URLs
    # to yahoo_answers.json. Keeping this behind the __main__ guard also lets
    # multiprocessing import the module in worker processes without
    # re-running the crawl.

    # Timeout + status check: fail fast rather than hang or parse an error page.
    allxml_requests = requests.get(
        'https://answers.yahoo.com/sitemaps/sitemap-us.xml', timeout=60)
    allxml_requests.raise_for_status()
    # Parse bytes so the XML declaration controls decoding.
    root = ET.fromstring(allxml_requests.content)

    # Collect the address of each sitemap file named in the index.
    urls_top_level = []
    for sitemap in root:
        for s in sitemap:
            if s.tag == '{http://www.sitemaps.org/schemas/sitemap/0.9}loc':
                urls_top_level.append(s.text)

    # The context manager guarantees the workers are torn down even if a
    # fetch raises while results are being consumed.
    with multiprocessing.Pool(multiprocessing.cpu_count()) as the_pool:
        # imap_unordered yields each file's URL list as soon as it is ready.
        for results in the_pool.imap_unordered(do_work, urls_top_level):
            # extend() appends in place; rebuilding the list with `+` would
            # re-copy everything gathered so far on every batch.
            answer_urls.extend(results)
            print(len(answer_urls))  # running total as a progress indicator
        the_pool.close()
        the_pool.join()

    # Use a context manager so the output file is flushed and closed
    # deterministically instead of relying on garbage collection.
    with open('yahoo_answers.json', 'w') as out_file:
        json.dump(answer_urls, out_file, indent=2)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment