@jkotra
Created July 24, 2017 10:21
SoccerScraper.py
"""SoccerScraper: look up soccerwiki.org and transfermarkt.co.uk profile
links for the players listed in engsoc.csv and append the results to
output.csv."""

import csv
from urllib.request import Request, urlopen  # Python 3
from bs4 import BeautifulSoup

# Parallel lists, one entry per row of engsoc.csv.
country = []  # row[0]: player's country
team = []     # row[1]: player's club
names1 = []   # row[2]: raw player name
names2 = []   # name split into words, commas stripped
names3 = []   # "<last word>+<first word>" search query
links1 = []   # resolved soccerwiki.org profile URLs
links2 = []   # resolved transfermarkt.co.uk profile URLs
def csv_writer(player_id):
    # Append one result row to output.csv; newline='' avoids blank rows on Windows.
    with open('output.csv', 'a', newline='') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=',',
                                quotechar='"', quoting=csv.QUOTE_MINIMAL)
        spamwriter.writerow([country[player_id], team[player_id], names1[player_id],
                             links1[player_id], links2[player_id]])

def url_scraper(player_name):
    # Search soccerwiki.org for the player and record the profile link.
    url_player_name = names3[player_name]
    url_player = 'https://en.soccerwiki.org/wiki.php?action=search&searchType=all&q=%s' % url_player_name
    print(url_player)
    url = Request(url_player)
    url.add_header('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0')
    url.add_header('Referer', 'https://en.soccerwiki.org/')
    url2 = urlopen(url).read()
    soup = BeautifulSoup(url2, 'html.parser')
    links_soccer_wiki = [n['href'] for n in soup.find_all('a', href=True)]
    # The first search hit sits at a fixed position in the page's link list;
    # index 17 is tied to the page layout at the time this was written.
    profile_link_1 = links_soccer_wiki[17]
    if profile_link_1 == 'http://www.grassrootsoccer.org/':
        # This external link in slot 17 is treated as "no results".
        profile_link_1 = "Not found"
    else:
        profile_link_1 = "https://en.soccerwiki.org/" + profile_link_1
    print(profile_link_1)
    links1.append(profile_link_1)

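# Note: links_soccer_wiki[17] hard-codes the hit's position in the page, so
# any layout change breaks it. A sketch of a less brittle lookup, assuming
# (unverified) that profile links contain 'player.php' in their href:
#
#   hit = soup.find('a', href=lambda h: h and 'player.php' in h)
#   profile_link_1 = ("https://en.soccerwiki.org/" + hit['href']
#                     if hit else "Not found")
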
def url_scraper_2(player_name):
    # Search transfermarkt.co.uk for the player and record the profile link.
    url_player_name = names3[player_name]
    url_player = 'https://www.transfermarkt.co.uk/schnellsuche/ergebnis/schnellsuche?query=%s' % url_player_name
    print(url_player)
    url = Request(url_player)
    url.add_header('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0')
    url.add_header('Referer', 'https://www.transfermarkt.co.uk/')
    page = urlopen(url).read()  # fetch once (the original fetched the page twice)
    soup = BeautifulSoup(page, 'html.parser')
    slot1 = [link.get('href') for link in soup.find_all('a')]
    # The candidate profile link sits at a fixed offset from the end of the
    # link list; these negative indices encode the 2017 search-page layout.
    slot1_url = "https://www.transfermarkt.co.uk" + slot1[-13]
    # When the page reports "2 Hits" the link positions shift, so re-check
    # against the player's team name.
    dc0 = '2 Hits' in page.decode('utf-8')
    print("result is " + str(dc0) + ".")
    if dc0:
        dc1 = team[player_name] in slot1[-14]
        print(dc1)
        if not dc1:
            slot1_url = "https://www.transfermarkt.co.uk" + slot1[-14]
            print("Fixed url: " + slot1_url)
            dc2 = team[player_name] in slot1[-9]
            if dc2:
                slot1_url = "https://www.transfermarkt.co.uk" + slot1[-9]
    # These sentinel URLs show up when the search found nothing.
    if slot1_url in ("https://www.transfermarkt.co.uk/profil/einstellungen",
                     "https://www.transfermarkt.co.uk#"):
        slot1_url = "Not found"
    links2.append(slot1_url)

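# Note: the negative indices (-13/-14/-9) likewise pin the scraper to the
# 2017 results-page layout. A selector-based sketch, where the
# '/profil/spieler/' href pattern is an assumption about Transfermarkt's URL
# scheme rather than anything confirmed by this gist:
#
#   hit = soup.find('a', href=lambda h: h and '/profil/spieler/' in h)
#   slot1_url = ("https://www.transfermarkt.co.uk" + hit['href']
#                if hit else "Not found")
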
def name_processor(player_index):
    # Strip commas from the raw name and split it into words.
    replace_comma = names1[player_index].replace(",", "")
    names_p1 = replace_comma.split()
    names2.append(names_p1)

def name_processor_2(player_index):
    # Build the "+"-joined query string from the last and first word.
    names_p2 = names2[player_index]
    names_p3 = names_p2[-1] + "+" + names_p2[0]
    names3.append(names_p3)

#def openpage():

def count(num):  # unused helper
    print(1 + num)

print ("----------SoccerScraper---------")
f1_read = open("engsoc.csv")
f1_csv = csv.reader(f1_read)
for row in f1_csv:
names1.append(row[2])
country.append(row[0])
team.append(row[1])
f1_read.close()
list_len = len(names1)
for i in range(list_len):
name_processor(i)
for i in range(list_len):
name_processor_2(i)
for i in range(list_len):
url_scraper(i)
for i in range(list_len):
url_scraper_2(i)
for i in range(len(links1)):
csv_writer(i)
exit()
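
Usage note: judging from the read loop, engsoc.csv is expected to hold one
player per row with columns country, team, name, the name written as
"Last, First" (name_processor_2 then rebuilds it as "First+Last" for the
search query). A hypothetical input row:

England,Arsenal,"Bellerin, Hector"

Each player is appended to output.csv as country, team, name, soccerwiki
link, transfermarkt link, with "Not found" recorded where a search failed.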