Skip to content

Instantly share code, notes, and snippets.

@RoyTakanen
Created September 25, 2021 06:36
Show Gist options
  • Save RoyTakanen/d0118d976418190d7ab5a4748d28221f to your computer and use it in GitHub Desktop.
Save RoyTakanen/d0118d976418190d7ab5a4748d28221f to your computer and use it in GitHub Desktop.
Kaikki suomen kielen sanat
import string
import requests
import json
# Single-character search prefixes: ASCII a-z followed by the extra letters å, ä and ö.
alphabets = [*string.ascii_lowercase, *"åäö"]
def crawl_with_prefix(alphabet, limit, start, timeout=30):
    """Send one autocomplete request to kielitoimistonsanakirja.fi.

    Args:
        alphabet: Prefix sent as the ``keyword`` query parameter.
        limit: Sent as the ``limit`` query parameter (converted with ``str``).
        start: Sent as the ``start`` query parameter (converted with ``str``).
        timeout: Seconds ``requests`` waits on the server before giving up.
            Pass ``None`` to wait indefinitely, which is what happened before
            this parameter existed.

    Returns:
        The ``requests.Response`` exactly as returned by ``requests.get``.
        The status code is NOT checked here; callers must inspect it.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
    """
    # Find these from the kielitoimistonsanakirja.fi autocomplete request in browser
    headers = {}
    params = (
        ("keyword", alphabet),
        ("limit", str(limit)),
        ("start", str(start)),
        ("searchMode", "all"),
    )
    # Without an explicit timeout a stalled connection would block the whole
    # crawl forever.
    return requests.get(
        "https://www.kielitoimistonsanakirja.fi/api/search/api/v1/autocomplete",
        headers=headers,
        params=params,
        timeout=timeout,
    )
def print_words(words):
    """Print the ``clean_headword`` value of every entry in *words*, one per line.

    Each element of *words* is indexed with the key ``"clean_headword"``;
    a missing key raises ``KeyError`` at that element.
    """
    # Lazy generator: each lookup happens right before its print, in order.
    headwords = (entry["clean_headword"] for entry in words)
    for headword in headwords:
        print(headword)
# NOTE(review): `total` is never updated or read in the visible code; it is
# kept only so the module-level name continues to exist.
total = 0

# Crawl the dictionary one starting letter at a time and print every headword.
#
# For each letter a cheap probe request (limit 1) is sent first. When the
# response's `overMaxLimit` flag is set, the letter is split into two-letter
# prefixes and each is fetched separately; otherwise the letter is fetched in
# a single request. `overMaxLimit` presumably means the server has more
# matches than it will return for that prefix -- TODO confirm against the API.
#
# NOTE(review): the two-letter split only covers second characters from
# `alphabets`, so one-letter headwords and words whose second character is
# something else (hyphen, digit, space, ...) are never requested in that case.
for alphabet in alphabets:
    probe = crawl_with_prefix(alphabet, "1", "0")
    if probe.status_code == 404:
        # A 404 is treated as "nothing for this prefix", the same way the
        # two-letter requests below are handled, instead of failing on .json().
        continue
    info = probe.json()

    if info["overMaxLimit"]:
        prefixes = [alphabet + another_alphabet for another_alphabet in alphabets]
    else:
        prefixes = [alphabet]

    for prefix in prefixes:
        data = crawl_with_prefix(prefix, "5000", "0")
        if data.status_code == 404:
            continue
        words = data.json()["result"]
        print_words(words)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment