Skip to content

Instantly share code, notes, and snippets.

@ZanSara
Last active December 6, 2022 11:42
Show Gist options
  • Save ZanSara/76653161a5e7974705394a38aa5c27e7 to your computer and use it in GitHub Desktop.
Save ZanSara/76653161a5e7974705394a38aa5c27e7 to your computer and use it in GitHub Desktop.
Download the content of every Wikipedia page named in a list
from pathlib import Path
import os
import logging
import wikipedia
# Download the text of every Wikipedia page named in LIST_FILE into
# OUTPUT_DIR, one "<page name>.txt" file per page, then report the pages
# that could not be fetched or saved.

# Directory that receives the downloaded page texts.
OUTPUT_DIR = Path(__file__).parent / "animals"
# Text file holding the page names to fetch, one per line.
LIST_FILE = Path(__file__).parent / "list_of_zoo_animals.txt"
# When False, a page whose output file already exists is not fetched again.
OVERWRITE_EXISTING = False

# Create the destination up front; without it every write below would fail
# on a fresh run.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

with open(LIST_FILE, 'r') as list_file:
    # Drop blank lines so they are neither requested as pages nor counted
    # in the progress total.
    pages = [line.strip() for line in list_file if line.strip()]

failed_pages = []
total = len(pages)
# Count from 1 so the progress output runs from 1/N to N/N.
for idx, page_name in enumerate(pages, start=1):
    target = OUTPUT_DIR / f"{page_name}.txt"
    try:
        if target.is_file() and not OVERWRITE_EXISTING:
            print(f"{idx}/{total}: {page_name} (exists)")
            continue
        page = wikipedia.page(page_name)
        print(f"{idx}/{total}: {page_name}")
        # Read the text before opening the file: if this raises, no empty
        # file is left behind for a later run to mistake for a finished
        # download.
        content = page.content
        # Explicit UTF-8 so article text with non-ASCII characters can be
        # written regardless of the platform's default encoding.
        with open(target, 'w', encoding="utf-8") as animal_file:
            animal_file.write(content)
    except Exception:
        # Deliberately broad: one bad page must not abort the whole batch.
        # The traceback is logged and the page is listed in the summary.
        logging.exception("%d/%d: %s --> ERROR!", idx, total, page_name)
        failed_pages.append(page_name)

print("-------------------------------")
print("FAILED:")
for failed_name in failed_pages:
    print("- ", failed_name)
print("-------------------------------")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment