Created
June 16, 2020 11:05
-
-
Save kamchy/d357ce88b673b476337285822d8120b7 to your computer and use it in GitHub Desktop.
Scrape the Staropolska online dictionary (staropolska.pl/slownik), printing tab-separated headword/description pairs for every letter's index.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from bs4 import BeautifulSoup as bs | |
import time | |
# Root of the Staropolska (Old Polish) online dictionary.
url_base = "http://staropolska.pl/slownik/"
# Letter-index page; a letter is appended to list all entries for it.
url_letter = url_base + "?litera="
# Letters of the Polish alphabet used by the site's index
# (Q/X absent; includes diacritics Ć, Ł, Ś, Ź, Ż).
letters = "ABCĆDEFGHIJKLŁMNOPRSŚTUVWZŹŻ"
def parse(html):
    """Extract a (headword, description) pair from an entry page.

    Parameters
    ----------
    html : bytes | str
        Raw HTML of a single dictionary-entry page.

    Returns
    -------
    tuple | None
        ``(headword, first description line)``, or ``None`` when the
        page does not contain the expected description markup.
    """
    node = bs(html, 'html.parser')
    haslo = node.select('.tytul')[0].string
    bo = node.select('.bazaopis')
    try:
        opis = bo[0].menu.li.string.strip()
        return (haslo, opis)
    except (AttributeError, IndexError):
        # Narrowed from a bare `except:` which also swallowed
        # KeyboardInterrupt/SystemExit. IndexError: no `.bazaopis`
        # element; AttributeError: missing <menu>/<li> or a None
        # `.string` on that path.
        return None
def parsenode(node):
    """Collect entry links and the next-page URL from an index page.

    Parameters
    ----------
    node : bs4.BeautifulSoup
        Parsed HTML of a letter-index page.

    Returns
    -------
    tuple | None
        ``(entry hrefs without images, absolute next-page URL)``, or
        ``None`` when no "next page" arrow link is present (last page).
    """
    td = node.select('td[background="pics/bgr_lewa.gif"]')
    # first link is useless, so I skip it
    links_left_pane = td[0].select('a')[1:]
    # Plain conditional comprehensions replace the original redundant
    # `[l for l in filter(lambda ...)]` pattern; behavior is unchanged.
    links_no_img = [l['href'] for l in links_left_pane if l.img is None]
    links_img = [l for l in links_left_pane
                 if l.img is not None and l.img["src"] == "pics/dalej.gif"]
    return (links_no_img, url_base + links_img[0]['href']) if links_img else None
def links(url):
    """Fetch an index page and delegate to :func:`parsenode`.

    Returns the ``(entry hrefs, next-page URL)`` pair, or ``None`` on
    the last page of a letter's index.
    """
    response = requests.get(url)
    soup = bs(response.content, 'html.parser')
    return parsenode(soup)
def process_href(h):
    """Fetch one entry page and print it as 'headword<TAB>description'.

    Entries that cannot be parsed are silently skipped.
    """
    entry = parse(requests.get(url_base + h).content)
    if entry is None:
        return
    print("%s\t%s" % entry)
def scrape_all():
    """Walk every letter's index, following pagination, and print entries.

    For each letter, repeatedly fetches the index page, prints every
    linked entry via :func:`process_href`, and follows the "next page"
    link until :func:`links` returns ``None``.
    """
    for letter in letters:
        nexturl = url_letter + letter
        pair = links(nexturl)
        while pair is not None:
            for href in pair[0]:
                process_href(href)
            nexturl = pair[1]
            pair = links(nexturl)


# Backward-compatible alias: the original function name shadowed the
# builtin `all`, which breaks any later use of `all(iterable)`.
all = scrape_all

if __name__ == "__main__":
    scrape_all()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment