Last active
December 6, 2022 11:42
-
-
Save ZanSara/76653161a5e7974705394a38aa5c27e7 to your computer and use it in GitHub Desktop.
Wikipedia get list of pages
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Download the plain-text content of every Wikipedia page named in LIST_FILE
# and store each one as "<page name>.txt" inside OUTPUT_DIR.
#
# Pages that cannot be fetched or written are logged with their traceback and
# listed again in a summary at the end of the run.
import logging
import os
from pathlib import Path

import wikipedia

# Directory that receives one text file per downloaded page.
OUTPUT_DIR = Path(__file__).parent / "animals"
# Text file holding one Wikipedia page title per line.
LIST_FILE = Path(__file__).parent / "list_of_zoo_animals.txt"
# When False, a page whose output file already exists is not fetched again.
OVERWRITE_EXISTING = False

# Create the output directory up front: without it every write below would
# fail and every page would end up in the failure list.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Explicit UTF-8 so the behaviour does not depend on the platform's default
# encoding. Blank lines are dropped so they are not sent to Wikipedia as
# empty page titles.
with open(LIST_FILE, "r", encoding="utf-8") as list_file:
    pages = [line.strip() for line in list_file if line.strip()]

failed_pages = []
total = len(pages)

# Count from 1 so the progress output runs from "1/N" to "N/N".
for idx, page_name in enumerate(pages, start=1):
    target = OUTPUT_DIR / f"{page_name}.txt"
    try:
        if os.path.isfile(target) and not OVERWRITE_EXISTING:
            print(f"{idx}/{total}: {page_name} (exists)")
        else:
            # Fetch before opening the output file, so a failed lookup does
            # not leave an empty file that a later run would treat as done.
            page = wikipedia.page(page_name)
            print(f"{idx}/{total}: {page_name}")
            # Page text regularly contains non-ASCII characters; write UTF-8
            # explicitly to avoid encoding errors on non-UTF-8 locales.
            with open(target, "w", encoding="utf-8") as animal_file:
                animal_file.write(page.content)
    except Exception:
        # Deliberately broad: this is the top-level boundary of a batch job,
        # and one bad page must not abort the remaining downloads. The
        # traceback is logged and the page is reported in the summary below.
        logging.exception("%d/%d: %s --> ERROR!", idx, total, page_name)
        failed_pages.append(page_name)

print("-------------------------------")
print("FAILED:")
for failed_name in failed_pages:
    print("- ", failed_name)
print("-------------------------------")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment