Skip to content

Instantly share code, notes, and snippets.

@zardoru
Last active March 1, 2018 00:31
Show Gist options
  • Save zardoru/05dbf81c2432d5073c58e06c7afcbc83 to your computer and use it in GitHub Desktop.
Save zardoru/05dbf81c2432d5073c58e06c7afcbc83 to your computer and use it in GitHub Desktop.
from bs4 import BeautifulSoup
from urllib.request import urlopen
from urllib.parse import unquote, quote
# Category listing pages to crawl. The first URL is the category page itself;
# the remaining ones address later pages of the same listing through the
# `pagefrom=` query parameter.
#
# Every entry needs its own trailing comma: adjacent string literals are
# concatenated implicitly, which silently merges two URLs into one bogus one.
pages = [
    "https://wikimon.net/Category:Digimon_with_profiles_on_Digimon_Reference_Book",
    "https://wikimon.net/index.php?title=Category:Digimon_with_profiles_on_Digimon_Reference_Book&pagefrom=%E3%82%AB%0AGreymon+%282010+Anime+Version%29",
    "https://wikimon.net/index.php?title=Category:Digimon_with_profiles_on_Digimon_Reference_Book&pagefrom=%E3%82%BF%0ADeckerdramon#mw-pages",
    "https://wikimon.net/index.php?title=Category:Digimon_with_profiles_on_Digimon_Reference_Book&pagefrom=%E3%83%8F%0APalmon#mw-pages",
    "https://wikimon.net/index.php?title=Category:Digimon_with_profiles_on_Digimon_Reference_Book&pagefrom=%E3%83%A9%0ARosemon#mw-pages",
]
# Collect the absolute URL of every article linked from the category pages.
links = []
for i, page_url in enumerate(pages):
    print("Reading page... {}".format(i))
    print(page_url)
    # The context manager closes the HTTP response as soon as the body has
    # been read instead of leaving that to garbage collection.
    with urlopen(page_url) as response:
        # Bytes that are not valid ASCII are dropped by the 'ignore' handler.
        text = str(response.read(), 'ascii', 'ignore')
    print("Parsing...")
    soup = BeautifulSoup(text, "lxml")
    print("Going through wiki links...")
    # Each "mw-category-group" element holds the anchors of one listing group.
    for group in soup.find_all(class_="mw-category-group"):
        for link in group.find_all("a"):
            href = link.get('href')
            if href is None:
                # An anchor without href cannot be followed; concatenating
                # None to the host prefix would raise TypeError.
                continue
            # Prefix the site host; assumes the hrefs are site-relative
            # paths -- TODO confirm against the live markup.
            links.append("https://wikimon.net" + href)
    # Running total of links gathered so far.
    # NOTE(review): the original indentation was lost, so it is unclear
    # whether this was printed per page or once after the loop -- confirm.
    print(len(links))
# Element ids that may hold the profile block, tried in this order; the
# first one present on the page wins.
_profile_ids = (
    "pnDigimonRefBookMultiMorphContent1",
    "pn1aCurrentMultiMorphContent1",
    "pDigimonRefBookMultiMorphContent1",
)

# Fetch every collected article and write its profile text to lines.txt,
# one line per article. The `with` block guarantees the file is closed
# even if a request or parse step raises part-way through.
# An explicit UTF-8 encoding keeps writes from failing on locales whose
# default codec cannot represent characters produced by the HTML parser.
with open("lines.txt", "w", encoding="utf-8") as out:
    print("Starting to load links...")
    for mon in links:
        print("Reading {}...".format(mon).ljust(30) + "\r")
        # Close each HTTP response once its body has been read.
        with urlopen(mon) as response:
            # Bytes that are not valid ASCII are dropped by 'ignore'.
            text = str(response.read(), 'ascii', 'ignore')
        soup = BeautifulSoup(text, "lxml")

        # text, skipping "Japanese" bit
        tag = None
        for element_id in _profile_ids:
            tag = soup.find(id=element_id)
            if tag is not None:
                break

        # Skip pages with no matching container, and also pages whose
        # container has no <table> child (tag.table would be None and
        # reading .text from it would raise AttributeError).
        if tag is None or tag.table is None:
            print("Can't find table data?")
            continue

        txt = tag.table.text
        # Strip the language label and the section title, then collapse the
        # profile onto a single line.
        txt = txt.replace("\n\n Japanese", "")
        txt = txt.replace("Digimon Reference Book", "")
        txt = txt.replace('\n', '')
        out.write(txt + "\n")
        # Flush per entry so finished lines reach the file even if a later
        # request fails.
        out.flush()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment