Skip to content

Instantly share code, notes, and snippets.

@alaakh42
Created October 2, 2018 22:14
Show Gist options
  • Save alaakh42/876833459eb3b045604d3950a3f6c194 to your computer and use it in GitHub Desktop.
Save alaakh42/876833459eb3b045604d3950a3f6c194 to your computer and use it in GitHub Desktop.
# Parallel accumulators: each list collects one attribute of the scraped clubs.
first_col, teams_links, summary, history = [], [], [], []
stadiums, locations, stadiums_capcity = [], [], []

# Find the table by its exact inline style attribute, then take its <tbody>.
table = soup.find("table", attrs={"style": "text-align: left;"})
table_body = table.find("tbody")
def _first_section(page, headings):
    """Return the first non-empty section of *page* among *headings*, else u''.

    ``page.section()`` yields None when the heading does not exist on the
    page, so the result is tested for truthiness instead of ``!= u''``.
    """
    for heading in headings:
        text = page.section(heading)
        if text:
            return text
    return u''


def _club_history(title, slug, fallback_heading):
    """Return the history text for one club, or u'' when nothing is found.

    The "History" section is tried first, then *fallback_heading* (when
    given), on the page named *title*.  If that page is ambiguous or yields
    no text, the same headings are tried on the page named by the
    URL-decoded *slug*.

    Each page object is built only once per lookup, because every
    ``wikipedia.WikipediaPage(...)`` construction loads the page again.
    A DisambiguationError raised for the slug page is not caught here.
    """
    headings = [u"History"]
    if fallback_heading:
        headings.append(fallback_heading)

    try:
        text = _first_section(wikipedia.WikipediaPage(title), headings)
    except wikipedia.exceptions.DisambiguationError as e:
        print("Error: {0}".format(e))
        text = u''
    if text:
        return text
    return _first_section(wikipedia.WikipediaPage(urllib.unquote(slug)), headings)


# Walk every table row after the first and collect the club data.
for row in table_body.find_all("tr")[1:]:
    cells = row.find_all("td")  # query the row once instead of once per column
    first_item = cells[0]
    second_item = cells[1]
    third_item = cells[2]
    forth_item = cells[3]

    first_col.append(first_item.text)
    locations.append(second_item.text.strip().replace("\n", ""))
    stadiums.append(third_item.text.strip().replace("\n", ""))
    # Commas are removed, the first 20 characters are skipped, and bracketed
    # footnote markers such as "[1]" are stripped before the int conversion.
    # NOTE(review): the [20:] offset presumably skips a hidden sort-key prefix
    # in the capacity cell -- confirm against the live table markup.
    stadiums_capcity.append(int(re.sub(r'\[.*?\]', '', forth_item.text.replace(",", "")[20:])))

    # NOTE(review): the lists below grow once per link while the lists above
    # grow once per row; they stay aligned only if each first cell holds
    # exactly one link -- confirm.
    for link in first_item.find_all("a", href=True):
        print(first_item.text)
        try:
            summary.append(wikipedia.summary(first_item.text, sentences=10))
        except wikipedia.exceptions.DisambiguationError as e:
            # Ambiguous title: retry with the article name taken from the URL.
            print("Error: {0}".format(e))
            summary.append(wikipedia.summary(link['href'].split("/")[2], sentences=10))
        teams_links.append(base_url + link['href'])

        # Reset per link so a failed request can never reuse the previous
        # club's HTML (or hit an unbound name on the very first club).
        club_page_data = None
        try:
            # An explicit timeout keeps a stalled server from hanging the run
            # and makes the Timeout handler below reachable.
            club_page = requests.get(base_url + link['href'], headers=hdr, timeout=30)
            if club_page.status_code == 200:
                club_page_data = club_page.text
            else:
                print('======== ERROR STATUS NUMBER ======== ', club_page.status_code)
        except requests.Timeout as e:
            print("IT IS TIME TO TIMEOUT")
            print(str(e))

        # The first <h3> heading of the club page is the fallback section
        # title used when the article has no "History" section.
        fallback_heading = None
        if club_page_data is not None:
            club_soup = BeautifulSoup(club_page_data, "html.parser")
            first_h3 = club_soup.find("h3")
            if first_h3 is not None:
                fallback_heading = first_h3.text.replace('[edit]', "").replace('\n', '')
                print(fallback_heading)

        # Exactly one history entry per link keeps `history` aligned with
        # `summary` and `teams_links`.
        history.append(_club_history(
            first_item.text.replace('\n', ''),
            link['href'].split("/")[2].replace('\n', ''),
            fallback_heading,
        ))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment