Gist by @castdrian, created December 9, 2022 09:22
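"""Scrape species data for every Generation IX Pokémon from Bulbapedia
and write it to partialBulbaData.json.
"""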
import json
import re
import requests
import bs4

GEN_IX_URL = 'https://bulbapedia.bulbagarden.net/wiki/Category:Generation_IX_Pok%C3%A9mon'


def fetch_mon_urls():
    """Fetch the Bulbapedia URLs of all Generation IX Pokémon."""
    res = requests.get(GEN_IX_URL)
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.text, 'html.parser')
    mon_urls = []
    # The category listing sits in <div class="mw-category-generated"> -> <div id="mw-pages">;
    # grab the href of every link inside it and turn it into an absolute URL.
    for link in soup.find('div', class_='mw-category-generated').find('div', id='mw-pages').find_all('a'):
        mon_urls.append('https://bulbapedia.bulbagarden.net' + link.get('href'))
    return mon_urls


def get_bulba_data(url):
    """Get the data from the Bulbapedia page."""
    res = requests.get(url)
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.text, 'html.parser')
    return soup


def parse_bulba_data(soup):
    """Extract species data from the info box of a Pokémon's Bulbapedia page."""
    # The species info box is the first table with the 'roundy' class.
    tables = soup.find_all('table', class_='roundy')

    catch_rate = tables[0].find_all('a', href='/wiki/Catch_rate')
    merged_catch_rate = catch_rate[0].parent.next_sibling.next_sibling.text

    gender_ratio = tables[0].find_all('a', href='/wiki/List_of_Pok%C3%A9mon_by_gender_ratio')
    merged_gender_ratio = gender_ratio[0].parent.next_sibling.next_sibling.text
    # The cell text looks like "87.5% male, 12.5% female"; genderless species yield no match.
    percentages = re.findall(r'\d+\.?\d*%', merged_gender_ratio) or ['0%', '0%']
    # A single match means the species is single-gendered, so pad the missing side with 0%.
    # Check for 'female' first, since 'male' is a substring of 'female'.
    if len(percentages) == 1:
        if 'female' in merged_gender_ratio:
            percentages.insert(0, '0%')
        else:
            percentages.append('0%')

    egg = tables[0].find_all('a', href='/wiki/Pok%C3%A9mon_breeding')
    merged_steps = egg[0].parent.next_sibling.next_sibling.text
    no_group = tables[0].find_all('a', href='/wiki/No_Eggs_Discovered_(Egg_Group)')
    # No egg is obtainable if the cell says so or the species is in the
    # "No Eggs Discovered" egg group.
    has_egg = not ('Egg not obtainable' in merged_steps or no_group)

    # The hatch time cell looks like "5140 - 5396 steps"; keep the minimum.
    min_steps = re.search(r'\d+', merged_steps).group()

    ev_yield = tables[0].find_all('a', href='/wiki/List_of_Pok%C3%A9mon_by_effort_value_yield')
    merged_ev_yield = ev_yield[0].parent.next_sibling.next_sibling.text
    # Take the first seven numbers in the cell, then drop the first one so only
    # the six per-stat EV values remain.
    ev_yield = re.compile(r'\d+').findall(merged_ev_yield)[:7]
    ev_yield = ev_yield[1:]

    leveling_rate = tables[0].find_all('a', href='/wiki/Experience')
    merged_leveling_rate = leveling_rate[1].parent.next_sibling.next_sibling.text
    leveling_rate = merged_leveling_rate.strip()

    # The page title looks like
    # "Sprigatito (Pokémon) - Bulbapedia, the community-driven Pokémon encyclopedia";
    # keep only the species name before the first parenthesis.
    title = soup.find('title').text
    title = title.split('(', 1)[0].strip()

    return {
        'species': title.lower().replace(' ', ''),
        'genderRatio': {'male': percentages[0], 'female': percentages[1]},
        'evYields': {
            'hp': int(ev_yield[0]),
            'atk': int(ev_yield[1]),
            'def': int(ev_yield[2]),
            'spa': int(ev_yield[3]),
            'spd': int(ev_yield[4]),
            'spe': int(ev_yield[5]),
        },
        'isEggObtainable': has_egg,
        'catchRate': {
            'base': int(merged_catch_rate.split(' ')[0].strip()),
            'percentageWithOrdinaryPokeballAtFullHealth': merged_catch_rate.split(' ')[1][1:-1].replace(')', ''),
        },
        'levellingRate': leveling_rate,
        'minimumHatchTime': int(min_steps),
    }


def main():
    """Main function."""
    mon_urls = fetch_mon_urls()
    data_list = []
    for url in mon_urls:
        soup = get_bulba_data(url)
        data = parse_bulba_data(soup)
        data_list.append(data)
        print('Successfully processed: ' + url)
    # write pretty json to file
    with open('partialBulbaData.json', 'w') as f:
        json.dump(data_list, f, indent=4)
    print('Done!')


if __name__ == '__main__':
    main()
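For reference, here is a minimal sketch of how the generated file might be consumed afterwards. It assumes the script above has already been run in the same directory, and it only relies on keys the script itself writes (species, catchRate, minimumHatchTime); the snippet itself and the lookup name 'sprigatito' are illustrative and not part of the gist.

# Hypothetical companion snippet: read partialBulbaData.json back and look up one species.
import json

with open('partialBulbaData.json') as f:
    mons = json.load(f)

# Index the list by the 'species' key for quick lookups.
by_species = {mon['species']: mon for mon in mons}
sprigatito = by_species.get('sprigatito')
if sprigatito:
    print(sprigatito['catchRate']['base'], sprigatito['minimumHatchTime'])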