Zulko/nebscraper.py

## nebscraper.py
import urllib
import time

from Bio import Restriction
from bs4 import BeautifulSoup
from tqdm import tqdm
import pandas

def get_neb_soup(endpoint):
    """Download a page from www.neb.com and parse it with BeautifulSoup.

    Args:
        endpoint: Path appended verbatim to ``https://www.neb.com``, e.g.
            ``"/products/restriction-endonucleases"``.

    Returns:
        The response body parsed by ``BeautifulSoup`` with the
        ``'html.parser'`` backend.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the request
            cannot be completed.
    """
    # A bare ``import urllib`` does not load the ``request`` submodule, so
    # import it explicitly instead of relying on some other library having
    # imported it as a side effect.
    import urllib.request

    with urllib.request.urlopen("https://www.neb.com" + endpoint) as response:
        soup = BeautifulSoup(response.read(), 'html.parser')
    return soup

# Fetch the catalogue page that links to the individual enzyme product pages.
enzymes_catalogue = get_neb_soup("/products/restriction-endonucleases")

# Collect (link text, href) pairs for links that point under "/products/r"
# and whose text is a name defined in the Bio.Restriction module namespace.
# ``attrs.get`` is used for the filter because anchors without an ``href``
# attribute would otherwise raise KeyError.
enzymes = [
    (a.text, a.attrs["href"])
    for a in enzymes_catalogue.find_all("a")
    if a.attrs.get("href", "").startswith("/products/r")
    and a.text in Restriction.__dict__
]

# Visit every enzyme page and record one row per table row that contains a
# <td class="sku"> cell. The slice drops the first cell and the last two
# cells of that row; the remaining cells are assumed to be units,
# concentration and price (see the DataFrame columns below) -- TODO confirm
# against the current page layout.
results = []
for enz, endpoint in tqdm(enzymes):
    enzyme_page = get_neb_soup(endpoint)
    for td in enzyme_page.find_all("td", {"class": "sku"}):
        results.append([enz] + [t.text for t in td.parent.find_all("td")[1:-2]])
    # Pause between page requests (presumably to go easy on the server).
    time.sleep(2)


data = pandas.DataFrame(results, columns=["enzyme", "units", "concentration", "price"])
# Drop the first character of the price (assumed to be a currency symbol) and
# remove thousands separators, which float() cannot parse ("$1,234.00").
num_price = data["price"].apply(lambda s: float(s[1:].replace(",", "")))
# The unit count is the first space-separated token, with commas removed.
num_units = data["units"].apply(lambda s: int(s.replace(",", "").split(" ")[0]))
data["price_per_unit"] = num_price / num_units
	import urllib
	import time

	from Bio import Restriction
	from bs4 import BeautifulSoup
	from tqdm import tqdm
	import pandas

	def get_neb_soup(endpoint):
	    """Download a page from www.neb.com and parse it with BeautifulSoup.

	    Args:
	        endpoint: Path appended verbatim to ``https://www.neb.com``.

	    Returns:
	        The response body parsed by ``BeautifulSoup`` with the
	        ``'html.parser'`` backend.

	    Raises:
	        urllib.error.URLError: Propagated from ``urlopen`` when the
	            request cannot be completed.
	    """
	    # A bare ``import urllib`` does not load the ``request`` submodule,
	    # so import it explicitly here.
	    import urllib.request

	    with urllib.request.urlopen("https://www.neb.com" + endpoint) as response:
	        soup = BeautifulSoup(response.read(), 'html.parser')
	    return soup

	# Fetch the catalogue page that links to the individual enzyme pages.
	enzymes_catalogue = get_neb_soup("/products/restriction-endonucleases")

	# Collect (link text, href) pairs for links that point under
	# "/products/r" and whose text is a name defined in the Bio.Restriction
	# module namespace. ``attrs.get`` guards against anchors that have no
	# ``href`` attribute, which would otherwise raise KeyError.
	enzymes = [
	    (a.text, a.attrs["href"])
	    for a in enzymes_catalogue.find_all("a")
	    if a.attrs.get("href", "").startswith("/products/r")
	    and a.text in Restriction.__dict__
	]

	# One result row per table row containing a <td class="sku"> cell. The
	# slice drops the first cell and the last two cells; the rest are
	# assumed to be units, concentration and price -- TODO confirm against
	# the current page layout.
	results = []
	for enz, endpoint in tqdm(enzymes):
	    enzyme_page = get_neb_soup(endpoint)
	    for td in enzyme_page.find_all("td", {"class": "sku"}):
	        results.append([enz] + [t.text for t in td.parent.find_all("td")[1:-2]])
	    # Pause between page requests (presumably to go easy on the server).
	    time.sleep(2)


	data = pandas.DataFrame(results, columns=["enzyme", "units", "concentration", "price"])
	# Drop the first character of the price (assumed to be a currency
	# symbol) and remove thousands separators, which float() cannot parse.
	num_price = data["price"].apply(lambda s: float(s[1:].replace(",", "")))
	# The unit count is the first space-separated token, commas removed.
	num_units = data["units"].apply(lambda s: int(s.replace(",", "").split(" ")[0]))
	data["price_per_unit"] = num_price / num_units