Skip to content

Instantly share code, notes, and snippets.

@lances101
Last active June 18, 2017 16:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lances101/5cfe430ff32d900454133f0a837e350f to your computer and use it in GitHub Desktop.
Save lances101/5cfe430ff32d900454133f0a837e350f to your computer and use it in GitHub Desktop.
Scrape locales from https://lh.2xlibre.net/locales/
###
# Downloads and parses https://lh.2xlibre.net/locales/ into a
# JSON file whose entries have the following fields:
# - code: locale code, e.g. 'en_GB'
# - suffix: part of the locale code after '@' (only present when the code has one), e.g. 'latin' from 'be_BY@latin'
# - name: locale name, e.g. 'English' for 'en_GB'
# - country: locale country in title case, e.g. 'United Kingdom' for 'en_GB'
# Settings for where to save the HTML file and the locale file can be found below.
###
# <===== CONFIG =====>
# Directory that holds both files below; the script creates it if it is missing.
STORAGE_PATH = 'tools/locale_parser'
# File name (inside STORAGE_PATH) of the cached copy of the downloaded page.
HTML_FILENAME = 'locales.html'
# File name (inside STORAGE_PATH) of the generated JSON output.
LOCALE_FILENAME = 'locales.json'
# <===== CONFIG END =====>
import json, os, re, requests
from bs4 import BeautifulSoup
# Make sure the storage directory exists. exist_ok=True avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(STORAGE_PATH, exist_ok=True)

# From here on both names hold full paths inside STORAGE_PATH.
HTML_FILENAME = os.path.join(STORAGE_PATH, HTML_FILENAME)
LOCALE_FILENAME = os.path.join(STORAGE_PATH, LOCALE_FILENAME)

# Load the locales page into `text`, preferring the copy cached on disk so the
# site is only hit once. The cache is read and written as UTF-8 explicitly:
# the page contains non-ASCII characters (the parser later matches em dashes),
# so relying on the platform default encoding is fragile.
# NOTE: a cache file written by an older run on a non-UTF-8 platform may fail
# to decode here; delete it to force a fresh download.
if os.path.exists(HTML_FILENAME):
    with open(HTML_FILENAME, 'r', encoding='utf-8') as file:
        text = file.read()
else:
    # Download BEFORE creating the cache file: if the request fails, no empty
    # or partial cache is left behind to be silently reused by the next run.
    resp = requests.get('https://lh.2xlibre.net/locales/', timeout=30)
    # Do not cache (or parse) an HTTP error page.
    resp.raise_for_status()
    text = resp.text
    with open(HTML_FILENAME, 'w', encoding='utf-8') as file:
        file.write(text)
# Parse the page and collect one entry per locale row into `result`.
node = BeautifulSoup(text, 'html.parser')
result = {'locales': []}
rows = node.find_all('tr', attrs={'class': 'glibc-HEAD'})
print(f'Found {len(rows)} rows')

# Longest value seen so far for each field.
max_len = {
    'code': 0,
    'suffix': 0,
    'name': 0,
    'country': 0,
}

# The locale name is the text between two em dashes in the second column.
# Compiled once here instead of being looked up again for every row.
name_pattern = re.compile('— (.*) —')

for row in rows:
    # NOTE(review): assumes the first three children of each row are the
    # code, name and country cells, in that order -- confirm against the page.
    first_column_text: str = row.contents[0].text
    second_column_text: str = row.contents[1].text
    third_column_text: str = row.contents[2].text

    locale_name_search_result = name_pattern.search(second_column_text)
    if locale_name_search_result is None:
        # A row without the expected "— name —" text used to crash the whole
        # run with an AttributeError; report it and carry on instead.
        print(f'Skipping {first_column_text!r}: no locale name found in {second_column_text!r}')
        continue

    # Keys are inserted in the order code, suffix, name, country so the
    # dumped JSON keeps the same field order as before.
    entry = {}
    locale_code_array = first_column_text.split('@')
    entry['code'] = locale_code_array[0]
    if len(locale_code_array) > 1:
        # Only locales written as 'code@suffix' get a 'suffix' field.
        entry['suffix'] = locale_code_array[1]
    entry['name'] = locale_name_search_result.group(1)
    entry['country'] = third_column_text.title()

    # Every key of `entry` is also a key of `max_len`.
    for field, value in entry.items():
        max_len[field] = max(max_len[field], len(value))

    result['locales'].append({'locale': entry})
# Serialize the collected locales and write them to LOCALE_FILENAME.
json_dumped = json.dumps(result, indent=4)
print('Dumping into file locale')
# Mode 'w' already truncates an existing file, so no separate os.remove() is
# needed; the `with` block closes the handle even if the write fails.
with open(LOCALE_FILENAME, 'w', encoding='utf-8') as file:
    file.write(json_dumped)
# Report the size in whole kilobytes (1 KB = 1000 bytes here), rounded down.
print(f'Dumped. File size {os.path.getsize(LOCALE_FILENAME) // 1000} KB')
# Longest value seen per field.
print(max_len)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment