Skip to content

Instantly share code, notes, and snippets.

@savkov
Last active August 5, 2022 05:21
Show Gist options
  • Save savkov/3bcfecee852a02cbf7bc6426ac46efca to your computer and use it in GitHub Desktop.
Save savkov/3bcfecee852a02cbf7bc6426ac46efca to your computer and use it in GitHub Desktop.
Download IPA transcriptions from Wiktionary
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
def get_ipa(html):
    """Extract IPA transcriptions from a Wiktionary page.

    Args:
        html: Markup of the page, as a string.

    Returns:
        A list with the text of every ``<span class="IPA">`` element, in
        document order, with the first and last character of each removed
        (presumably the enclosing ``/.../`` or ``[...]`` delimiters --
        confirm against the live markup). Empty when no such span exists.
    """
    soup = BeautifulSoup(html, 'html5lib')
    # find_all is the supported spelling; findAll is a deprecated
    # Beautiful Soup 3 alias.
    ipa_spans = soup.find_all('span', class_='IPA')
    return [span.text[1:-1] for span in ipa_spans]
def get_html(word, timeout=10):
    """Fetch the English Wiktionary page for *word*.

    Args:
        word: Page title to look up.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely.

    Returns:
        The response body as text. The HTTP status is not checked, so the
        body of an error page is returned as-is.
    """
    from urllib.parse import quote

    # Percent-encode the title so characters such as '?' or '#' stay part
    # of the path instead of starting a query string or fragment.
    url = f'https://en.wiktionary.org/wiki/{quote(str(word))}'
    r = requests.get(url, timeout=timeout)
    return r.text
def scrape(target_words):
    """Collect IPA transcriptions for each word in *target_words*.

    Args:
        target_words: Iterable of words to look up; a progress bar is
            shown while iterating.

    Returns:
        A dict mapping each word to the list returned by ``get_ipa`` for
        its page, or to ``None`` when extracting the transcriptions raised
        an error.
    """
    words = {}
    for word in tqdm(target_words):
        html = get_html(word)
        try:
            ipa = get_ipa(html)
        except Exception:
            # Best effort: a page that cannot be parsed yields None. Only
            # Exception is caught so Ctrl-C and SystemExit still stop the
            # run instead of being silently swallowed.
            ipa = None
        words[word] = ipa
    return words
# Words to look up; replace these with your own.
words = [
    'баба',
    'дядо',
    'внуче',
]
transcriptions = scrape(words)
# Example of the resulting mapping:
# {
#     'баба': ['ˈbabə', 'ˈbaba', 'ˈbabə', 'bâba'],
#     'внуче': ['ˈvnut͡ʃɛ'],
#     'дядо': ['ˈdʲado']
# }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment