import wikipediaapi  # pip install wikipedia-api
import pandas as pd
import concurrent.futures
from tqdm import tqdm


def wiki_scrape(topic_name, verbose=True):
    """Scrape a Wikipedia page and every page it links to, returning a DataFrame."""

    def wiki_link(link):
        # Fetch a single linked page; return None if it is missing or the request fails.
        try:
            page = wiki_api.page(link)
            if page.exists():
                return {'page': link, 'text': page.text, 'link': page.fullurl,
                        'categories': list(page.categories.keys())}
        except Exception:
            return None

    wiki_api = wikipediaapi.Wikipedia(language='en',
                                      extract_format=wikipediaapi.ExtractFormat.WIKI)
    page_name = wiki_api.page(topic_name)
    if not page_name.exists():
        print('Page {} does not exist.'.format(topic_name))
        return
    page_links = list(page_name.links.keys())
    progress = tqdm(desc='Links Scraped', unit='', total=len(page_links)) if verbose else None
    # Seed the results with the topic page itself, then fetch linked pages concurrently.
    sources = [{'page': topic_name, 'text': page_name.text, 'link': page_name.fullurl,
                'categories': list(page_name.categories.keys())}]
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        future_link = {executor.submit(wiki_link, link): link for link in page_links}
        for future in concurrent.futures.as_completed(future_link):
            data = future.result()
            if data:
                sources.append(data)
            if verbose:
                progress.update(1)
    if verbose:
        progress.close()
    namespaces = ('Wikipedia', 'Special', 'Talk', 'LyricWiki', 'File', 'MediaWiki',
                  'Template', 'Help', 'User', 'Category talk', 'Portal talk')
    sources = pd.DataFrame(sources)
    # Keep pages with substantive text and drop maintenance/namespace pages.
    sources = sources[(sources['text'].str.len() > 20)
                      & ~(sources['page'].str.startswith(namespaces, na=True))]
    # Strip the leading 'Category:' prefix (9 characters) from each category name.
    sources['categories'] = sources.categories.apply(lambda x: [y[9:] for y in x])
    sources['topic'] = topic_name
    print('Wikipedia pages scraped:', len(sources))
    return sources
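

# Example usage: a minimal sketch. The topic name, the selected columns, and the
# output filename below are illustrative assumptions, not part of the original gist.
if __name__ == '__main__':
    df = wiki_scrape('Natural language processing')
    if df is not None:
        # Each row holds the page title, full plain text, URL, categories, and topic.
        print(df[['page', 'link']].head())
        df.to_csv('wiki_scrape_output.csv', index=False)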