Created
September 25, 2021 06:36
-
-
Save RoyTakanen/d0118d976418190d7ab5a4748d28221f to your computer and use it in GitHub Desktop.
Kaikki suomenkielen sanat
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import string | |
import requests | |
import json | |
# Prefix characters to crawl: ASCII a-z followed by the extra letters å, ä, ö.
alphabets = [*string.ascii_lowercase, *"åäö"]
def crawl_with_prefix(alphabet, limit, start, timeout=30):
    """Query the kielitoimistonsanakirja.fi autocomplete endpoint for a prefix.

    Args:
        alphabet: Text sent as the ``keyword`` query parameter.
        limit: Value sent as the ``limit`` query parameter (converted with
            ``str``).
        start: Value sent as the ``start`` query parameter (converted with
            ``str``).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection, which would hang the whole crawl.

    Returns:
        The ``requests.Response`` object, unmodified. The status code is not
        checked here; callers inspect it themselves.

    Raises:
        requests.exceptions.RequestException: On connection failure or when
            the timeout expires.
    """
    headers = {}  # Find these from the kielitoimistonsanakirja.fi autocomplete request in browser

    params = (
        ('keyword', alphabet),
        ('limit', str(limit)),
        ('start', str(start)),
        ('searchMode', 'all'),
    )

    response = requests.get(
        'https://www.kielitoimistonsanakirja.fi/api/search/api/v1/autocomplete',
        headers=headers,
        params=params,
        timeout=timeout,
    )
    return response
def print_words(words):
    """Print the ``clean_headword`` value of every entry in *words*, one per line.

    Each entry must support lookup by the key ``"clean_headword"``.
    """
    for entry in words:
        headword = entry["clean_headword"]
        print(headword)
total = 0  # NOTE(review): never read or updated in the visible code; kept as-is.

# Crawl the dictionary one prefix at a time and print every headword found.
#
# For each single letter a 1-result probe is sent first. If the probe reports
# "overMaxLimit" (presumably: the prefix matches more entries than the API
# will return in one response -- TODO confirm), the letter is split into
# two-letter prefixes; otherwise the single-letter prefix is fetched directly.
#
# A 404 response is skipped for every request, not only the two-letter ones:
# calling .json()["..."] on such a response would otherwise abort the crawl.
#
# NOTE(review): two-letter prefixes are not checked for "overMaxLimit" and
# `start` is always "0", so results beyond the 5000 limit may be missed.
# Words whose second character is not in `alphabets` (e.g. a hyphen or
# space) are possibly skipped in the two-letter branch -- verify against
# the API.
for alphabet in alphabets:
    probe = crawl_with_prefix(alphabet, "1", "0")
    if probe.status_code == 404:
        continue

    if probe.json()["overMaxLimit"]:
        for another_alphabet in alphabets:
            data = crawl_with_prefix(alphabet + another_alphabet, "5000", "0")
            if data.status_code == 404:
                continue
            print_words(data.json()["result"])
    else:
        data = crawl_with_prefix(alphabet, "5000", "0")
        if data.status_code == 404:
            continue
        print_words(data.json()["result"])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment