Gist by @castdrian, created December 9, 2022 09:22
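"""Scrape species data for every Generation IX Pokémon from Bulbapedia
and write it to partialBulbaData.json.
"""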
import json
import re
import requests
import bs4

GEN_IX_URL = 'https://bulbapedia.bulbagarden.net/wiki/Category:Generation_IX_Pok%C3%A9mon'


def fetch_mon_urls():
    """Fetch the Bulbapedia URLs of all Generation IX Pokémon."""
    res = requests.get(GEN_IX_URL)
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.text, 'html.parser')
    mon_urls = []
    # The category listing sits in <div class="mw-category-generated"> -> <div id="mw-pages">;
    # grab the href of every link inside it and turn it into an absolute URL.
    for link in soup.find('div', class_='mw-category-generated').find('div', id='mw-pages').find_all('a'):
        mon_urls.append('https://bulbapedia.bulbagarden.net' + link.get('href'))
    return mon_urls


def get_bulba_data(url):
    """Get the data from the Bulbapedia page."""
    res = requests.get(url)
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.text, 'html.parser')
    return soup


def parse_bulba_data(soup):
    """Extract species data from the info box of a Pokémon's Bulbapedia page."""
    # The species info box is the first table with the 'roundy' class.
    tables = soup.find_all('table', class_='roundy')

    catch_rate = tables[0].find_all('a', href='/wiki/Catch_rate')
    merged_catch_rate = catch_rate[0].parent.next_sibling.next_sibling.text

    gender_ratio = tables[0].find_all('a', href='/wiki/List_of_Pok%C3%A9mon_by_gender_ratio')
    merged_gender_ratio = gender_ratio[0].parent.next_sibling.next_sibling.text
    # The cell text looks like "87.5% male, 12.5% female"; genderless species yield no match.
    percentages = re.findall(r'\d+\.?\d*%', merged_gender_ratio) or ['0%', '0%']
    # A single match means the species is single-gendered, so pad the missing side with 0%.
    # Check for 'female' first, since 'male' is a substring of 'female'.
    if len(percentages) == 1:
        if 'female' in merged_gender_ratio:
            percentages.insert(0, '0%')
        else:
            percentages.append('0%')

    egg = tables[0].find_all('a', href='/wiki/Pok%C3%A9mon_breeding')
    merged_steps = egg[0].parent.next_sibling.next_sibling.text
    no_group = tables[0].find_all('a', href='/wiki/No_Eggs_Discovered_(Egg_Group)')
    # No egg is obtainable if the cell says so or the species is in the
    # "No Eggs Discovered" egg group.
    has_egg = not ('Egg not obtainable' in merged_steps or no_group)

    # The hatch time cell looks like "5140 - 5396 steps"; keep the minimum.
    min_steps = re.search(r'\d+', merged_steps).group()

    ev_yield = tables[0].find_all('a', href='/wiki/List_of_Pok%C3%A9mon_by_effort_value_yield')
    merged_ev_yield = ev_yield[0].parent.next_sibling.next_sibling.text
    # Take the first seven numbers in the cell, then drop the first one so only
    # the six per-stat EV values remain.
    ev_yield = re.compile(r'\d+').findall(merged_ev_yield)[:7]
    ev_yield = ev_yield[1:]

    leveling_rate = tables[0].find_all('a', href='/wiki/Experience')
    merged_leveling_rate = leveling_rate[1].parent.next_sibling.next_sibling.text
    leveling_rate = merged_leveling_rate.strip()

    # The page title looks like
    # "Sprigatito (Pokémon) - Bulbapedia, the community-driven Pokémon encyclopedia";
    # keep only the species name before the first parenthesis.
    title = soup.find('title').text
    title = title.split('(', 1)[0].strip()

    return {
        'species': title.lower().replace(' ', ''),
        'genderRatio': {'male': percentages[0], 'female': percentages[1]},
        'evYields': {
            'hp': int(ev_yield[0]),
            'atk': int(ev_yield[1]),
            'def': int(ev_yield[2]),
            'spa': int(ev_yield[3]),
            'spd': int(ev_yield[4]),
            'spe': int(ev_yield[5]),
        },
        'isEggObtainable': has_egg,
        'catchRate': {
            'base': int(merged_catch_rate.split(' ')[0].strip()),
            'percentageWithOrdinaryPokeballAtFullHealth': merged_catch_rate.split(' ')[1][1:-1].replace(')', ''),
        },
        'levellingRate': leveling_rate,
        'minimumHatchTime': int(min_steps),
    }


def main():
    """Main function."""
    mon_urls = fetch_mon_urls()
    data_list = []
    for url in mon_urls:
        soup = get_bulba_data(url)
        data = parse_bulba_data(soup)
        data_list.append(data)
        print('Successfully processed: ' + url)
    # write pretty json to file
    with open('partialBulbaData.json', 'w') as f:
        json.dump(data_list, f, indent=4)
    print('Done!')


if __name__ == '__main__':
    main()
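For reference, here is a minimal sketch of how the generated file might be consumed afterwards. It assumes the script above has already been run in the same directory, and it only relies on keys the script itself writes (species, catchRate, minimumHatchTime); the snippet itself and the lookup name 'sprigatito' are illustrative and not part of the gist.

# Hypothetical companion snippet: read partialBulbaData.json back and look up one species.
import json

with open('partialBulbaData.json') as f:
    mons = json.load(f)

# Index the list by the 'species' key for quick lookups.
by_species = {mon['species']: mon for mon in mons}
sprigatito = by_species.get('sprigatito')
if sprigatito:
    print(sprigatito['catchRate']['base'], sprigatito['minimumHatchTime'])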