Skip to content

Instantly share code, notes, and snippets.

@fnneves
Last active October 30, 2020 11:11
Show Gist options
  • Save fnneves/58e18be9be4a83abe8ee8bb0f094d5a5 to your computer and use it in GitHub Desktop.
Save fnneves/58e18be9be4a83abe8ee8bb0f094d5a5 to your computer and use it in GitHub Desktop.
import requests
from glob import glob
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
from time import sleep
# Browser-like request headers (Chrome-on-Windows user agent, English content).
# User-agent strings taken from:
# http://www.networkinghowtos.com/howto/common-user-agent-list/
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
    'Accept-Language': 'en-US, en;q=0.5',
}

# Load the semicolon-separated tracker file; its `url` column holds the URLs to scrape.
prod_tracker = pd.read_csv('trackers/TRACKER_PRODUCTS.csv', sep=';')
prod_tracker_URLS = prod_tracker.url

# Fetch the first tracked URL. The explicit timeout is required because
# requests never times out by default, so a stalled server would otherwise
# hang the script forever.
page = requests.get(prod_tracker_URLS[0], headers=HEADERS, timeout=30)

# Parse the raw response bytes with the lxml parser into a navigable tree.
soup = BeautifulSoup(page.content, features="lxml")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment