import re
import urllib
import requests
import wikipedia
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
# Request headers mimicking a desktop browser so Wikipedia serves the full page
hdr = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'none',
    'Accept-Language': 'en-US,en,ar;q=0.8',
    'Connection': 'keep-alive',
}
start_url = "https://en.wikipedia.org/wiki/La_Liga?oldformat=true"
base_url = "https://en.wikipedia.org"
data = None
try:
    # Fetch the La Liga article; a timeout is set so requests.Timeout can actually fire
    page_response = requests.get(start_url, headers=hdr, timeout=10)
    if page_response.status_code == 200:
        data = page_response.text
    else:
        print('======== ERROR STATUS NUMBER ======== ', page_response.status_code)
except requests.Timeout as e:
    print("IT IS TIME TO TIMEOUT")
    print(str(e))

# Parse the downloaded HTML only if the request succeeded
if data:
    soup = BeautifulSoup(data, "html.parser")
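
# --- Possible continuation (not part of the original gist) ---
# A minimal sketch of how the parsed page and the otherwise-unused base_url
# might be used next: collect absolute links to the articles referenced in the
# La Liga page's "wikitable" tables. The 'wikitable' class and the link filter
# below are assumptions about the page structure, not the author's code.
if data:
    article_links = set()
    for table in soup.find_all("table", class_="wikitable"):
        for anchor in table.find_all("a", href=True):
            href = anchor["href"]
            # Keep internal article links only (skip files, categories, anchors)
            if href.startswith("/wiki/") and ":" not in href:
                article_links.add(base_url + href)
    print("Found {} candidate article links".format(len(article_links)))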