Skip to content

Instantly share code, notes, and snippets.

@kamchy
Created June 16, 2020 11:05
Show Gist options
  • Save kamchy/d357ce88b673b476337285822d8120b7 to your computer and use it in GitHub Desktop.
scrape online dictionary
import requests
from bs4 import BeautifulSoup as bs
import time
# Base URL of the Old Polish dictionary, and the per-letter index endpoint.
url_base = "http://staropolska.pl/slownik/"
url_letter = url_base + "?litera="
# Letters that have index pages on the site (note: no Q or X; V present).
letters = "ABCĆDEFGHIJKLŁMNOPRSŚTUVWZŹŻ"
def parse(html):
    """Parse one dictionary entry page into a (headword, description) tuple.

    html -- raw HTML bytes/str of an entry page.
    Returns None when the page lacks the expected structure
    (no `.tytul` title, no `.bazaopis` block, or no nested menu/li text).
    """
    node = bs(html, 'html.parser')
    try:
        # `.tytul` holds the headword; `.bazaopis` wraps the definition list.
        # Both lookups live inside the try: the original `[0]` on `.tytul`
        # could raise IndexError before the handler was reached.
        haslo = node.select('.tytul')[0].string
        bo = node.select('.bazaopis')
        opis = bo[0].menu.li.string.strip()
        return (haslo, opis)
    except (IndexError, AttributeError):
        # IndexError: a selector matched nothing.
        # AttributeError: the menu/li/string chain is absent (or .string is
        # None, making .strip() fail). The original bare `except:` also
        # swallowed KeyboardInterrupt/SystemExit — narrowed here.
        return None
def parsenode(node):
    """Extract entry links and the next-page URL from a letter-index page.

    node -- a BeautifulSoup document for one index page.
    Returns (entry_hrefs, next_page_url), or None when there is no
    next-page arrow link or the expected left pane is missing.
    NOTE(review): when the last page of a letter has no "next" arrow its
    entry hrefs are discarded too — preserved from the original behavior,
    since callers use None as the loop-termination signal.
    """
    td = node.select('td[background="pics/bgr_lewa.gif"]')
    if not td:
        # Unexpected page layout: treat like "no more pages" instead of
        # raising IndexError on td[0] as the original did.
        return None
    # first link is useless, so I skip it
    anchors = td[0].select('a')[1:]
    # Plain entry links carry no <img>; the next-page link carries the
    # "dalej" (forward) arrow image.
    links_no_img = [a['href'] for a in anchors if a.img is None]
    links_img = [a for a in anchors
                 if a.img is not None and a.img["src"] == "pics/dalej.gif"]
    return (links_no_img, url_base + links_img[0]['href']) if links_img else None
def links(url):
    """Fetch a letter-index page and delegate parsing to parsenode.

    url -- full URL of the index page to fetch.
    Returns parsenode's (entry_hrefs, next_page_url) pair, or None.
    """
    # Timeout prevents the crawl from hanging forever on a stalled
    # connection; the original requests.get had no timeout at all.
    page = requests.get(url, timeout=30)
    node = bs(page.content, 'html.parser')
    return parsenode(node)
def process_href(h):
    """Fetch one entry page and print it as 'headword<TAB>description'.

    h -- href fragment relative to url_base. Pages that fail to parse
    are silently skipped (parse returns None).
    """
    # Timeout added: an unbounded requests.get can hang the whole crawl.
    p = parse(requests.get(url_base + h, timeout=30).content)
    if p is not None:
        print("%s\t%s" % p)
def all():
    """Crawl every letter's index, following next-page links, and print
    each parsed entry as a tab-separated line.

    NOTE(review): the name shadows the builtin `all`; kept unchanged so
    the existing `__main__` guard still works.
    """
    for letter in letters:
        nexturl = url_letter + letter
        pair = links(nexturl)
        while pair is not None:
            for href in pair[0]:
                process_href(href)
                # Be polite to the server between entry fetches — the file
                # imports `time` but the original never used it, which
                # suggests rate limiting was intended.
                time.sleep(0.1)
            nexturl = pair[1]
            pair = links(nexturl)
if __name__ == "__main__":
    # Entry point: scrape the whole dictionary and print TSV to stdout.
    all()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment