Skip to content

Instantly share code, notes, and snippets.

@sc0tt
Created September 23, 2014 01:55
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sc0tt/f73292667a7c3264ae2b to your computer and use it in GitHub Desktop.
Save sc0tt/f73292667a7c3264ae2b to your computer and use it in GitHub Desktop.
Parses the Pokémon TCG website.
import re
import time
from selenium import webdriver
# Walk the paginated Pokemon TCG card search results and write each card's
# link (one per line) to "urls.txt".
#
# You will need to have the phantomjs exe in your path or alongside this script.
driver = webdriver.PhantomJS()
# Sentinels that differ from each other so the loop body runs at least once;
# both are replaced by the strings parsed from the page counter.
current_page = 0
total_pages = -1
parse_url = "http://www.pokemon.com/us/pokemon-tcg/pokemon-cards/?cardName=&cardText=&evolvesFrom=&simpleSubmit=&basic-pokemon=on&stage-1-pokemon=on&stage-2-pokemon=on&level-up-pokeomon=on&ex-pokemon=on&special-pokemon=on&pokemon-legend=on&restored-pokemon=on&trainer=on&trainer-pokemon-tool=on&trainer-stadium=on&trainer-supporter=on&trainer-technical-machine=on&trainer-rockets-secret-machine=on&basic-energy=on&special-energy=on&format=unlimited&hitPointsMin=0&hitPointsMax=200&retreatCostMin=0&retreatCostMax=5&totalAttackCostMin=0&totalAttackCostMax=10&particularArtist="
try:
    # The context manager closes the output file on every exit path.
    with open("urls.txt", "w") as pokemon_urls:
        driver.get(parse_url)
        while current_page != total_pages:
            # Go to the page navigation
            pages = driver.find_element_by_id("cards-load-more")
            # The second <span> inside the navigation holds the "<n> of <m>" text
            pages_container = pages.find_elements_by_tag_name("span")[1]
            page_text = pages_container.get_attribute("innerHTML") or ""
            page_match = re.match(r"(\d+) of (\d+)", page_text)
            if page_match is None:
                # Fail with a readable message instead of an AttributeError
                # on None when the counter markup is not what we expect.
                raise RuntimeError(
                    "Could not parse page counter from %r" % page_text
                )
            # Both values stay strings; they are only compared to each other.
            current_page, total_pages = page_match.groups()
            print("Page %s of %s" % (current_page, total_pages))
            # Go to the container of the pokemon cards
            card_container = driver.find_element_by_id("cardResults")
            # Every <li> in the container is treated as one card
            cards = card_container.find_elements_by_tag_name("li")
            # For each card, record the url of its first link
            for pkmn in cards:
                pokemon_urls.write("%s\n" % pkmn.find_element_by_tag_name('a').get_attribute("href"))
            if current_page == total_pages:
                # Last page already written: skip the pointless click + wait.
                break
            # Navigate to the next page (second <a> in the navigation)
            pages.find_elements_by_tag_name("a")[1].click()
            # Fixed pause before reading the page again; presumably this
            # gives the next batch of results time to load.
            time.sleep(10)
except KeyboardInterrupt:
    # Deliberate: Ctrl+C stops the crawl early but keeps what was written.
    pass
finally:
    # Always shut the browser down, even when scraping failed part-way.
    driver.quit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment