Skip to content

Instantly share code, notes, and snippets.

@Christopher-Thornton
Last active August 27, 2021 23:44
Show Gist options
  • Save Christopher-Thornton/6e06b937ec63b46556f33a3e31df883f to your computer and use it in GitHub Desktop.
Save Christopher-Thornton/6e06b937ec63b46556f33a3e31df883f to your computer and use it in GitHub Desktop.
import wikipediaapi # pip install wikipedia-api
import pandas as pd
import concurrent.futures
from tqdm import tqdm
def wiki_scrape(topic_name, verbose=True):
    """Scrape a Wikipedia page and every page it links to into a DataFrame.

    The page named ``topic_name`` is fetched first; each of its outgoing
    links is then fetched on a small thread pool. Pages that do not exist
    or whose fetch raises are skipped.

    Args:
        topic_name: Title of the Wikipedia page to start from.
        verbose: When true, show a tqdm progress bar while links are fetched.

    Returns:
        A ``pandas.DataFrame`` with columns ``page``, ``text``, ``link``,
        ``categories`` (names with the ``Category:`` prefix removed) and
        ``topic`` (always ``topic_name``), restricted to pages whose text is
        longer than 20 characters and whose title is not in one of the
        excluded namespaces. Returns ``None`` (after printing a message)
        when the starting page does not exist.
    """
    category_prefix = 'Category:'
    # Titles in these namespaces are maintenance/meta pages, not articles.
    # Matching on "<namespace>:" avoids discarding real articles that merely
    # begin with the same word (e.g. "Talking Heads", "User interface").
    namespaces = ('Wikipedia', 'Special', 'Talk', 'LyricWiki', 'File', 'MediaWiki',
                  'Template', 'Help', 'User', 'Category talk', 'Portal talk',
                  'Wikipedia talk', 'File talk', 'MediaWiki talk',
                  'Template talk', 'Help talk', 'User talk')
    namespace_prefixes = tuple(ns + ':' for ns in namespaces)

    def page_record(title, page):
        # One output row for an already-resolved page object.
        return {'page': title, 'text': page.text, 'link': page.fullurl,
                'categories': list(page.categories.keys())}

    def wiki_link(link):
        # Best-effort fetch: a missing page or a failed request yields None
        # so that one bad link does not abort the whole scrape.
        try:
            page = wiki_api.page(link)
            if page.exists():
                return page_record(link, page)
        except Exception:
            return None
        return None

    # NOTE(review): newer wikipedia-api releases appear to require a
    # user_agent argument here -- confirm against the installed version.
    wiki_api = wikipediaapi.Wikipedia(language='en',
                                      extract_format=wikipediaapi.ExtractFormat.WIKI)
    page_name = wiki_api.page(topic_name)
    if not page_name.exists():
        print('Page {} does not exist.'.format(topic_name))
        return

    page_links = list(page_name.links.keys())
    progress = None
    if verbose:
        progress = tqdm(desc='Links Scraped', unit='', total=len(page_links))

    sources = [page_record(topic_name, page_name)]
    try:
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
            futures = [executor.submit(wiki_link, link) for link in page_links]
            for future in concurrent.futures.as_completed(futures):
                data = future.result()
                if data:
                    sources.append(data)
                if progress is not None:
                    progress.update(1)
    finally:
        if progress is not None:
            progress.close()

    sources = pd.DataFrame(sources)
    # Per-row text length; len(sources['text']) would be the row count.
    long_enough = sources['text'].str.len() > 20
    in_namespace = sources['page'].str.startswith(namespace_prefixes, na=True)
    # Copy so the column assignments below act on an independent frame
    # rather than on a slice of the unfiltered one.
    sources = sources[long_enough & ~in_namespace].copy()
    sources['categories'] = [
        [name[len(category_prefix):] if name.startswith(category_prefix) else name
         for name in names]
        for names in sources['categories']
    ]
    sources['topic'] = topic_name
    print('Wikipedia pages scraped:', len(sources))
    return sources
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment