@GokulNC
Last active May 30, 2020 14:18
Basic script to find old Wikipedia XML dump sizes and number of words/tokens in them
# Basically I was trying to find out the size of the mBERT model's Wikipedia data for Indian languages.
# Dump date based on: https://github.com/google-research/bert/blob/f18bd94b8fee9bda3c293e0932d100add35b780c/multilingual.md
import requests
from bs4 import BeautifulSoup
from time import sleep
ARCHIVE_API = 'http://web.archive.org/cdx/search/cdx?url=%s&output=json'
WIKIDUMP_URL = 'https://dumps.wikimedia.org/%swiki/%s'
ARCHIVE_URL = 'http://web.archive.org/web/%s/%s'
LANGS = ['as', 'bn', 'gu', 'kn', 'hi', 'ml', 'mr', 'or', 'pa', 'ta', 'te']
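
# Shape of the Wayback CDX API response assumed below (output=json): the first
# row is a header, each later row is one capture, with the capture timestamp at
# column index 1. Illustrative values only:
#
#   [["urlkey", "timestamp", "original", "mimetype", "statuscode", "digest", "length"],
#    ["org,wikimedia,dumps)/tawiki/20181001", "20181002120000",
#     "https://dumps.wikimedia.org/tawiki/20181001", "text/html", "200", "...", "..."]]
#
# Hence response[1][1] below is the timestamp of the earliest captured snapshot.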

def get_wikidump_size(lang_code, date):
    # Look up archived snapshots of the dump listing page for this language & date
    dump_url = WIKIDUMP_URL % (lang_code, date)
    query = ARCHIVE_API % dump_url
    response = requests.get(query).json()
    if len(response) < 2:  # header row only => no snapshot captured
        print('Failed for: ', lang_code, date)
        return None
    # Fetch the archived copy at the timestamp of the first snapshot
    dump_url = ARCHIVE_URL % (response[1][1], dump_url)
    dump_html = requests.get(dump_url).text
    soup_dump = BeautifulSoup(dump_html, 'html.parser')
    # Take the first <li class="file"> entry (the main dump file) and slice out
    # the size text that follows the download link
    main_dump = str(soup_dump.find_all('li', {'class': 'file'})[0])
    size = main_dump[main_dump.find('</a>') + len('</a>') : main_dump.find('</li>')]
    return size.strip()
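
# Example usage (hypothetical call; the returned string is whatever size text the
# archived dump page shows, e.g. something like '136.7 MB'):
#   print(get_wikidump_size('ta', '20181001'))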

def get_lang_sizes(dump_date, save_to='wikidump_sizes.txt'):
    # Write "<lang_code> <dump_size>" for every language to a file
    with open(save_to, 'w', encoding='utf-8') as f:
        for lang in LANGS:
            f.write('%s %s\n' % (lang, get_wikidump_size(lang, dump_date)))
    return

WIKI_STATS_URL = 'https://%s.wikipedia.org/wiki/Special:Statistics'

def get_wiki_words(lang_code, year):
    # Follow redirects to get the canonical Special:Statistics URL for this wiki
    actual_url = requests.get(WIKI_STATS_URL % lang_code).url
    archive_query = ARCHIVE_API % actual_url
    response = requests.get(archive_query).json()
    if len(response) < 2:
        print('Failed for: ', lang_code, year)
        return None
    year = str(year)
    stats_url = None
    # Pick the first archived snapshot whose timestamp falls within the requested year
    for row in response[1:]:
        if row[1].startswith(year):
            stats_url = ARCHIVE_URL % (row[1], actual_url)
            break
    if not stats_url:
        return None
    dump_html = requests.get(stats_url).content
    soup_dump = BeautifulSoup(dump_html, 'html.parser')
    print(stats_url)
    try:
        # "Words in all content pages" row, reported by the CirrusSearch extension
        stats_row = soup_dump.find_all('tr', {'id': 'mw-cirrussearch-article-words'})[0]
        num_words = stats_row.find_all('td', {'class': 'mw-statistics-numbers'})[0].text
        return num_words
    except (IndexError, AttributeError):
        # The statistics row is missing from this snapshot
        return None
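
# Example usage (hypothetical call; returns the count exactly as displayed on the
# archived Special:Statistics page, possibly in native numerals, e.g. '2,80,52,682'):
#   print(get_wiki_words('ta', 2018))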

def get_lang_words(year, save_to='wiki_words.txt'):
    # Write "<lang_code> <word_count>" for every language to a file
    with open(save_to, 'w', encoding='utf-8') as f:
        for lang in LANGS:
            f.write('%s %s\n' % (lang, get_wiki_words(lang, year)))
            sleep(0.5)  # be gentle with the Wayback Machine
    return

# Dump date per the mBERT reference above; word counts from 2018 snapshots
get_lang_sizes('20181001')
get_lang_words(2018)
wiki_words.txt (word count from each wiki's Special:Statistics, as displayed in that wiki's numerals):
as 28,18,782
bn ২,৪৯,১৯,৮৪৩
gu ૭૬,૧૪,૭૨૨
kn ೧,೫೬,೧೭,೦೭೮
hi 3,74,55,014
ml 1,54,66,075
mr ६६,२५,४४४
or None
pa 83,23,194
ta 2,80,52,682
te 3,47,61,585
wikidump_sizes.txt (size of the main dump file for each language's 20181001 dump):
as 17.9 MB
bn 135.2 MB
gu 26.7 MB
kn 68.0 MB
hi 131.9 MB
ml 109.8 MB
mr 47.8 MB
or 23.7 MB
pa 38.7 MB
ta 136.7 MB
te 110.3 MB
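
The word counts above are returned exactly as displayed, i.e. in each wiki's native numerals with Indian-style grouping. A minimal helper for turning them into plain integers, as a sketch (the name to_int is made up here; it assumes only Python's standard unicodedata module):

import unicodedata

def to_int(count_str):
    # Map every Unicode decimal digit (Bengali, Gujarati, Kannada, Devanagari,
    # ASCII, ...) to its value and drop the grouping commas/spaces.
    return int(''.join(str(unicodedata.digit(ch)) for ch in count_str if ch.isdigit()))

# e.g. to_int('২,৪৯,১৯,৮৪৩') == 24919843 and to_int('28,18,782') == 2818782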