Skip to content

Instantly share code, notes, and snippets.

@BaksiLi
Last active June 16, 2019 11:18
Show Gist options
  • Save BaksiLi/d4576e7bf3f40b2d98173bbd27dbc9a5 to your computer and use it in GitHub Desktop.
Esperanto-English-Japanese-Etymology Dictionary
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# version: 0.1
import requests
from bs4 import BeautifulSoup
import pandas as pd
def download_from_web(url: str) -> str:
    """Download a dictionary page and return its decoded body text.

    The encoding is set from the content-sniffed ``apparent_encoding``
    because these legacy pages do not reliably declare their charset.

    Args:
        url: Address of the dictionary page to fetch.

    Returns:
        The decoded page body.

    Raises:
        requests.HTTPError: on a non-2xx response. (The original silently
            returned None in that case, which crashed callers later with
            an opaque AttributeError on ``.split``.)
    """
    response = requests.get(url)
    # Fail fast with a descriptive error instead of implicitly returning None.
    response.raise_for_status()
    response.encoding = response.apparent_encoding
    return response.text
def retrieve_entries() -> (dict, dict, dict):
    """Scrape three Esperanto dictionaries and return their entries.

    Returns:
        A tuple of three dicts, each mapping an Esperanto headword to its
        definition text:
          1. Esperanto-English Dictionary (plain-text page),
          2. エスペラント日本語基本辞書 (HTML, second <table>),
          3. Etymological Dictionary of the Esperanto Language
             (Etimologia Vortareto; <p class="MsoNormal"> blocks).
    """
    webpages = {
        'Esperanto-English Dictionary':
            'https://www.esperanto-panorama.net/vortaro/eo-en-u.htm',
        'エスペラント日本語基本辞書':
            'https://vastalto.com/kagi/zisyo.html#LINIO_CX',
        'Etymological Dictionary of the Esperanto Language':  # Etimologia Vortareto
            'https://web.archive.org/web/20100106014600/http://etymological.freeweb.hu/Esperanto.htm'
    }
    # Esperanto alphabet; '-' admits prefix/suffix entries.
    # BUG FIX: the uppercase run previously omitted 'R'
    # ('...NOPSŜ...'), silently dropping every entry that starts with a
    # capital R while lowercase 'r' was accepted.
    eo_alphabet = list('abcĉdefgĝhĥijĵklmnoprsŝtuŭvz-') \
        + list('ABCĈDEFGĜHĤIJĴKLMNOPRSŜTUŬVZ')
    dict1_entries, dict2_entries, dict3_entries = {}, {}, {}

    # 1) 'Esperanto-English Dictionary' -- data stored as plain text.
    dict1_name = 'Esperanto-English Dictionary'
    dict1_lines = download_from_web(
        url=webpages[dict1_name]).split('\n')
    # Lines 39..15410 hold the entries; the rest is page chrome.
    # NOTE(review): these bounds are tied to the archived page layout.
    for line in dict1_lines[39: 15411]:
        # BUG FIX: check the line is non-empty before indexing line[0],
        # which previously raised IndexError on blank lines.
        if line and line[0] in eo_alphabet:
            # Unpack: first token is the headword, the rest is the definition.
            line_contents = line.split()
            vorto = line_contents[0]
            defino = ' '.join(line_contents[1:])
            dict1_entries[vorto] = defino

    # 2) 'エスペラント日本語基本辞書' -- data stored in an HTML <table>.
    dict2_name = 'エスペラント日本語基本辞書'
    soup2 = BeautifulSoup(download_from_web(url=webpages[dict2_name]), 'html.parser')
    table = soup2.findAll('table')[1].findAll('tr')  # the second table holds the entries
    for i in table[1:-1]:  # skip the header row and the trailing row
        line_contents = i.get_text().split()
        if not line_contents:  # robustness: skip rows with no text
            continue
        # '/' in the headword marks a morpheme boundary -- strip it.
        vorto = line_contents[0].replace('/', '')
        defino = line_contents[-1]
        dict2_entries[vorto] = defino

    # 3) 'Etymological Dictionary of the Esperanto Language' -- data in <p>.
    dict3_name = 'Etymological Dictionary of the Esperanto Language'
    soup3 = BeautifulSoup(download_from_web(url=webpages[dict3_name]), 'html.parser')
    table = soup3.findAll('p', attrs={'class': 'MsoNormal'})
    for i in table[5:]:  # the first paragraphs are front matter
        line_contents = i.get_text().split('\n')
        if not line_contents:  # robustness: skip empty paragraphs
            continue
        vorto = line_contents[0]
        defino = ''.join(line_contents[1:])
        dict3_entries[vorto] = defino

    return (dict1_entries, dict2_entries, dict3_entries)
if __name__ == '__main__':
    # Scrape the three source dictionaries.
    eo_en, eo_jp, etym = retrieve_entries()
    # Build one single-column frame per dictionary, indexed by headword.
    frames = [
        pd.DataFrame.from_dict(eo_en, orient='index', columns=['Eo-E']),
        pd.DataFrame.from_dict(eo_jp, orient='index', columns=['Eo-Jp']),
        pd.DataFrame.from_dict(etym, orient='index', columns=['Etymology']),
    ]
    # TODO: combine using lowercases
    combined = frames[0].join(frames[1], how='left').join(frames[2], how='left')
    combined.to_csv('./dict.csv', index=True, header=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment