Last active
January 22, 2020 14:03
-
-
Save katzefudder/1a2b54d0dbbbfcf8263abbda0ad062a6 to your computer and use it in GitHub Desktop.
Scrape the rosters of the DEL2 teams from eliteprospects.com using Python and Scrapy
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
bn31 -31- Felix Bick (EC Bad Nauheim) | |
bn29 -29- David Böttcher (EC Bad Nauheim) | |
bn42 -42- Pierluigi Capo (EC Bad Nauheim) | |
bn53 -53- Bastian Kucis (EC Bad Nauheim) | |
bn85 -85- David-Lee Paton (EC Bad Nauheim) | |
bn19 -19- Mike Card (EC Bad Nauheim) | |
bn55 -55- Maximilian Glötzl (EC Bad Nauheim) | |
bn11 -11- Simon Gnyp (EC Bad Nauheim) | |
bn82 -82- Niklas Heyer (EC Bad Nauheim) | |
bn6 -6- Daniel Ketter (EC Bad Nauheim) | |
bn3 -3- Jesper Kokkila (EC Bad Nauheim) | |
bn4 -4- Aaron Reinig (EC Bad Nauheim) | |
bn22 -22- Steve Slaton (EC Bad Nauheim) | |
bn -- Colin Ugbekile (EC Bad Nauheim) | |
bn -- Jan Wächtershäuser (EC Bad Nauheim) | |
bn41 -41- Dani Bindels (EC Bad Nauheim) | |
bn81 -81- Andrej Bires (EC Bad Nauheim) | |
bn97 -97- Jack Combs (EC Bad Nauheim) | |
bn70 -70- Nicolas Cornett (EC Bad Nauheim) | |
bn91 -91- Marc El-Sayed (EC Bad Nauheim) | |
bn23 -23- Tyler Fiddler (EC Bad Nauheim) | |
bn34 -34- Zach Hamill (EC Bad Nauheim) | |
bn14 -14- Marcel Kahle (EC Bad Nauheim) | |
bn -- Leon Köhler (EC Bad Nauheim) | |
bn77 -77- Mick Köhler (EC Bad Nauheim) | |
bn26 -26- Robin Palka (EC Bad Nauheim) | |
bn10 -10- Andreas Pauli (EC Bad Nauheim) | |
bn18 -18- Marvin Ratmann (EC Bad Nauheim) | |
bn9 -9- Huba Sekesi (EC Bad Nauheim) | |
bn16 -16- Cody Sylvester (EC Bad Nauheim) | |
bn88 -88- Luis Üffing (EC Bad Nauheim) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import scrapy, re | |
from scrapy.http.request import Request | |
class EliteSpider(scrapy.Spider):
    """Scrape the rosters of the teams listed in ``teams`` from eliteprospects.

    For every team page one text file is written to the current working
    directory. The file is named after the last path segment of the page
    URL (for example ``ec-bad-nauheim.txt``) and contains one line per
    player: the team key followed by the jersey number, two tabs, the
    jersey number between dashes, the player name and the team name in
    parentheses.
    """

    name = "elite"

    # Short team key -> URL of the team's roster page.
    teams = {
        'bn': 'https://www.eliteprospects.com/team/438/ec-bad-nauheim',
        'bi': 'https://www.eliteprospects.com/team/440/bietigheim-steelers',
        'fl': 'https://www.eliteprospects.com/team/5065/lowen-frankfurt',
        'bt': 'https://www.eliteprospects.com/team/439/tolzer-lowen',
        'by': 'https://www.eliteprospects.com/team/746/bayreuth-tigers',
        'dd': 'https://www.eliteprospects.com/team/983/dresdner-eislowen',
        'ka': 'https://www.eliteprospects.com/team/8287/ec-kassel-huskies',
        'fr': 'https://www.eliteprospects.com/team/9328/ehc-freiburg',
        'cr': 'https://www.eliteprospects.com/team/659/eispiraten-crimmitschau',
        'kb': 'https://www.eliteprospects.com/team/677/esv-kaufbeuren',
        'lh': 'https://www.eliteprospects.com/team/642/ev-landshut',
        'hn': 'https://www.eliteprospects.com/team/444/heilbronner-falken',
        'lf': 'https://www.eliteprospects.com/team/448/lausitzer-fuchse',
        'rt': 'https://www.eliteprospects.com/team/747/ravensburg-towerstars',
    }

    # Matches a parenthesised hint appended to a player's name.
    # Raw string so the backslashes are regex escapes, not string escapes.
    _HINT_PATTERN = re.compile(r'\(.*\)')

    def parse(self, response):
        """Write the roster found on a team page to ``<page>.txt``.

        The team key is taken from ``response.meta['team_key']`` (set by
        ``start_requests``). When it is absent, the key is looked up by
        the response URL in ``teams``.

        Raises:
            KeyError: if no team key was passed along and the response
                URL is not one of the URLs in ``teams``.
        """
        team_key = response.meta.get('team_key')
        if team_key is None:
            # Fallback for requests not created by start_requests:
            # invert the dict and look the key up by URL.
            url_to_key = {url: key for key, url in self.teams.items()}
            team_key = url_to_key[response.url]

        page = response.url.split('/')[-1]
        filename = '%s.txt' % page

        # get the team's name
        team = response.css(
            '#name-and-logo div.semi-logo::text').get(default='').strip()

        # Player names contain non-ASCII characters, so do not rely on the
        # platform's default encoding.
        with open(filename, 'w', encoding='utf-8') as f:
            for row in response.css('table.roster tbody tr'):
                number = row.css('td.jersey::text').get()
                name = row.css('td.sorted a::text').get()
                # Rows lacking a jersey cell or a name link are not
                # player rows; skip them.
                if number is None or name is None:
                    continue

                number = number.strip().replace('#', '')
                # remove any hints on the player's name
                name = self._HINT_PATTERN.sub('', name.strip()).strip()

                f.write("%s%s\t\t-%s- %s (%s)\n"
                        % (team_key, number, number, name, team))

    # override start_requests to use an own dict instead of start_urls
    def start_requests(self):
        """Yield one request per entry in ``teams``.

        The team key travels with the request in ``meta`` so ``parse``
        does not depend on the final response URL matching the
        configured URL exactly.
        """
        for key, url in self.teams.items():
            yield Request(url, callback=self.parse, meta={'team_key': key})
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
hn20 -20- Jonas Gähr (Heilbronner Falken) | |
hn3 -3- Matthias Nemec (Heilbronner Falken) | |
hn30 -30- Mirko Pantkowski (Heilbronner Falken) | |
hn34 -34- Tom Schickedanz (Heilbronner Falken) | |
hn27 -27- Ian Brady (Heilbronner Falken) | |
hn44 -44- Marcus Götz (Heilbronner Falken) | |
hn6 -6- Kevin Maginot (Heilbronner Falken) | |
hn76 -76- Denis Majewski (Heilbronner Falken) | |
hn24 -24- Corey Mapes (Heilbronner Falken) | |
hn4 -4- Brock Maschmeyer (Heilbronner Falken) | |
hn16 -16- Tobias Möller (Heilbronner Falken) | |
hn7 -7- Jan Pavlu (Heilbronner Falken) | |
hn5 -5- Moritz Wirth (Heilbronner Falken) | |
hn50 -50- Louis Brune (Heilbronner Falken) | |
hn10 -10- Derek Damon (Heilbronner Falken) | |
hn19 -19- Stefan Della Rovere (Heilbronner Falken) | |
hn41 -41- Tim Detig (Heilbronner Falken) | |
hn71 -71- Bryce Gervais (Heilbronner Falken) | |
hn17 -17- Sebastian Hon (Heilbronner Falken) | |
hn40 -40- Valentino Klos (Heilbronner Falken) | |
hn89 -89- Michael Knaub (Heilbronner Falken) | |
hn98 -98- Davis Koch (Heilbronner Falken) | |
hn18 -18- Tim Miller (Heilbronner Falken) | |
hn22 -22- Alex Nikiforuk (Heilbronner Falken) | |
hn36 -36- Pierre Preto (Heilbronner Falken) | |
hn23 -23- Lukas Ribarik (Heilbronner Falken) | |
hn62 -62- Eero Savilahti (Heilbronner Falken) | |
hn66 -66- Jan-Luca Schumacher (Heilbronner Falken) | |
hn28 -28- Samuel Soramies (Heilbronner Falken) | |
hn26 -26- Yannik Valenti (Heilbronner Falken) | |
hn11 -11- Dylan Wruck (Heilbronner Falken) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment