Skip to content

Instantly share code, notes, and snippets.

@dnk8n
Last active August 3, 2021 14:50
Show Gist options
  • Save dnk8n/afcd8585865fa29abe625e8ecee94c68 to your computer and use it in GitHub Desktop.
Save dnk8n/afcd8585865fa29abe625e8ecee94c68 to your computer and use it in GitHub Desktop.
Download Wiki Dumps
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "862aa30c",
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"# Library for parsing HTML\n",
"from bs4 import BeautifulSoup\n",
"base_url = 'https://dumps.wikimedia.org'\n",
"enwiki_url = base_url + '/enwiki'\n",
"# Fetch the enwiki index page. Fail loudly on an HTTP error instead of\n",
"# parsing an error page as if it were the index.\n",
"index_response = requests.get(enwiki_url)\n",
"index_response.raise_for_status()\n",
"index = index_response.text\n",
"soup_index = BeautifulSoup(index, 'html.parser')\n",
"# Collect the target of every anchor on the page that carries an href\n",
"dumps = [a['href'] for a in soup_index.find_all('a') if a.has_attr('href')]\n",
"dumps"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f1a42fd4",
"metadata": {},
"outputs": [],
"source": [
"dump_url = enwiki_url + '/20210720'\n",
"# Retrieve the HTML of this dump's page. Fail loudly on an HTTP error so an\n",
"# error page is never scraped for file links.\n",
"dump_response = requests.get(dump_url)\n",
"dump_response.raise_for_status()\n",
"dump_html = dump_response.text\n",
"# Convert to a soup so the file entries can be searched\n",
"soup_dump = BeautifulSoup(dump_html, 'html.parser')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "87b626ee",
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"from pathlib import Path\n",
"from tqdm import tqdm\n",
"\n",
"# Directory under the user's home that receives the downloaded files;\n",
"# created (with any missing parents) if it does not exist yet.\n",
"wikipedia_dir = Path.home().joinpath('wikipedia-dev')\n",
"wikipedia_dir.mkdir(exist_ok=True, parents=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6f20bc9a",
"metadata": {},
"outputs": [],
"source": [
"# Take the list elements with the class 'file' whose markup mentions\n",
"# \"multistream\", and read the link target of each one.\n",
"file_items = soup_dump.find_all('li', {'class': 'file'})\n",
"targets = [item.a.attrs[\"href\"] for item in file_items if \"multistream\" in str(item)]\n",
"# The local file name is the last path segment of each link target.\n",
"destinations = [link.rsplit('/', 1)[-1] for link in targets]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "68e3a949",
"metadata": {},
"outputs": [],
"source": [
"# Stream every selected remote file into wikipedia_dir, one progress bar per file.\n",
"block_size = 1024 * 1024  # 1 MiB per chunk; loop-invariant, so set once\n",
"for target, destination in zip(targets, destinations):\n",
"    url = base_url + target\n",
"    destination_path = wikipedia_dir / destination\n",
"    print('target: ', url)\n",
"    print('destination: ', destination_path)\n",
"    # The context manager releases the connection even if the download fails.\n",
"    with requests.get(url, stream=True) as response:\n",
"        # Fail loudly on an HTTP error instead of saving the error body to disk.\n",
"        response.raise_for_status()\n",
"        # A missing content-length header is treated as size 0 (unknown).\n",
"        total_size_in_bytes = int(response.headers.get('content-length', 0))\n",
"        # Using tqdm as a context manager closes the bar on errors too.\n",
"        with tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True) as progress_bar:\n",
"            with destination_path.open('wb') as f:\n",
"                for data in response.iter_content(block_size):\n",
"                    progress_bar.update(len(data))\n",
"                    f.write(data)\n",
"    # Only compare sizes when the server actually reported one.\n",
"    if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:\n",
"        print(\"ERROR, something went wrong\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a1a80970",
"metadata": {},
"outputs": [],
"source": [
"from typing import Union\n",
"from pathlib import Path\n",
"from pprint import pprint\n",
"from hashlib import md5\n",
"\n",
"\n",
"\n",
"def pairwise(iterable):\n",
"    \"\"\"Group items into consecutive, non-overlapping pairs.\n",
"\n",
"    a, b, c, d -> (a, b), (c, d). Both zip arguments are the same iterator,\n",
"    so each pair consumes two items; an unpaired trailing item is dropped.\n",
"    \"\"\"\n",
"    iterator = iter(iterable)\n",
"    return zip(iterator, iterator)\n",
"\n",
"def check_downloads(wiki_dir, destinations, md5sum_url):\n",
"    \"\"\"Verify downloaded files against a published MD5 checksum list.\n",
"\n",
"    Prints an 'OK' line for every file whose checksum matches.\n",
"\n",
"    Args:\n",
"        wiki_dir: Directory that holds the downloaded files.\n",
"        destinations: File names, relative to wiki_dir, to verify.\n",
"        md5sum_url: URL of a text listing with one '<md5> <file name>'\n",
"            entry per line.\n",
"\n",
"    Raises:\n",
"        requests.HTTPError: If the checksum list cannot be fetched.\n",
"        KeyError: If a file name has no entry in the checksum list.\n",
"        AssertionError: If a file is missing or its checksum differs.\n",
"    \"\"\"\n",
"    response = requests.get(md5sum_url)\n",
"    # Without this check an HTTP error page would be parsed as checksums.\n",
"    response.raise_for_status()\n",
"    # Parse line by line so one malformed line cannot shift every later pair.\n",
"    md5sum_dict = {}\n",
"    for line in response.text.splitlines():\n",
"        fields = line.split()\n",
"        if len(fields) == 2:\n",
"            expected, name = fields\n",
"            md5sum_dict[name] = expected\n",
"    for dest in destinations:\n",
"        dest_path = Path(wiki_dir) / dest\n",
"        # Raise explicitly: a bare assert is skipped when Python runs with -O.\n",
"        if not dest_path.is_file():\n",
"            raise AssertionError(f'missing download: {dest_path}')\n",
"\n",
"        file_hash = md5()\n",
"        with dest_path.open('rb') as f:\n",
"            # Hash in chunks so the whole file is never held in memory.\n",
"            while chunk := f.read(8192):\n",
"                file_hash.update(chunk)\n",
"        actual_md5sum = file_hash.hexdigest()\n",
"        expected_md5sum = md5sum_dict[dest]\n",
"        if actual_md5sum != expected_md5sum:\n",
"            raise AssertionError(\n",
"                f'MD5 mismatch for {dest_path}: '\n",
"                f'expected {expected_md5sum}, got {actual_md5sum}'\n",
"            )\n",
"        print('\\033[1m' + 'OK' + '\\033[0m', dest_path)\n",
" \n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6d148eab",
"metadata": {},
"outputs": [],
"source": [
"# Verify every downloaded file against the checksum list published with this dump.\n",
"md5sum_url = 'https://dumps.wikimedia.org/enwiki/20210720/enwiki-20210720-md5sums.txt'\n",
"check_downloads(wikipedia_dir, destinations, md5sum_url)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment