Skip to content

Instantly share code, notes, and snippets.

@cyphar
Created July 7, 2021 08:49
Show Gist options
  • Save cyphar/42a3df666f4e934253e3a4fb09af4fae to your computer and use it in GitHub Desktop.
Save cyphar/42a3df666f4e934253e3a4fb09af4fae to your computer and use it in GitHub Desktop.
Forvo-based Audio Server for Yomichan
#!/usr/bin/env python3
# forvo-yomichan: Forvo-based Yomichan audio source
# Copyright (C) 2021 Aleksa Sarai <cyphar@cyphar.com>
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.
import base64
import os
import re
import urllib.parse

import flask
import requests
# Flask application object; the route handler further down is registered on it.
app = flask.Flask(__name__)
# Matches one Play(...) call in the text of a Forvo search page.
PLAY_PATTERN = re.compile(r"Play\([^\)]+\)")
# Captures the contents of each non-empty single-quoted argument.
SLUG_PATTERN = re.compile(r"'([^']+)'")

MEDIA_URL = "https://audio00.forvo.com"

# URL templates, indexed by the position of a slug within a Play(...) call.
# Every template is MEDIA_URL followed by one of the path templates below.
URLS = [
    MEDIA_URL + path_template
    for path_template in (
        "/%(extension)s/%(unbase64_slug)s",  # mp3 in /mp3
        "/%(extension)s/%(unbase64_slug)s",  # ogg in /ogg
        "/audios/%(extension)s/%(unbase64_slug)s",  # mp3 in /audios/mp3
        "/audios/%(extension)s/%(unbase64_slug)s",  # ogg in /audios/ogg
    )
]

# Headers sent with the Forvo search request.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0",
}
def forvo_fetch(search_term, timeout=10):
    """Yield one audio URL per pronunciation found by a Forvo search.

    The Japanese ("ja") Forvo search page for ``search_term`` is downloaded
    and every ``Play(...)`` call found in it is turned into a single media URL.

    Args:
        search_term: Text to search for; it is percent-encoded before being
            placed in the URL path.
        timeout: Seconds to wait for Forvo before giving up.

    Yields:
        One URL string per usable ``Play(...)`` call, in page order.

    This is a generator, so it never returns ``None`` to the caller: on a
    network failure or a non-OK HTTP status it logs the problem and simply
    yields nothing.
    """
    # Encode the term ourselves so that characters such as "?", "#" or "%"
    # cannot change the structure of the request URL.
    search_url = "https://forvo.com/search/%s/ja/" % (
        urllib.parse.quote(search_term, safe=""),
    )
    try:
        with requests.get(search_url, headers=HEADERS, timeout=timeout) as r:
            if not r.ok:
                print("[!] Got error from request: %s" % (r,))
                return
            # Read the body now so the connection is released before the
            # first yield instead of staying open while the caller iterates.
            page = r.text
    except requests.RequestException as e:
        # Treat connection errors and timeouts like an HTTP error response.
        print("[!] Got error from request: %s" % (e,))
        return

    for sample in PLAY_PATTERN.findall(page):
        # Currently the order of these is (mp3, ogg, mp3, ogg), with the
        # last two audio samples appearing to be better quality.
        slugs = SLUG_PATTERN.findall(sample)[:len(URLS)]
        if not slugs:
            print("[!] Got no slugs from '%s' search: %s" % (search_term, sample))
            continue
        # Only the last slug is used (it is usually the better quality one),
        # so that is the only one that needs decoding.
        try:
            unbase64_slug = base64.b64decode(slugs[-1]).decode("utf-8")
        except ValueError:
            # Covers both invalid base64 (binascii.Error) and invalid UTF-8
            # (UnicodeDecodeError); skip this sample rather than abort all.
            print("[!] Got undecodable slug from '%s' search: %s" % (search_term, sample))
            continue
        _, extension = os.path.splitext(unbase64_slug)
        # The template is chosen by the slug's position within the call.
        yield URLS[len(slugs) - 1] % {
            "unbase64_slug": unbase64_slug,
            "extension": extension[1:],  # drop the leading "."
        }
def forvo_find_best(term, reading):
    """Return the Forvo audio URLs that best match a term and its reading.

    Forvo doesn't have reading-based search, so the term is searched first
    and the reading is searched only when the term alone is not decisive:

    * exactly one result for the term: return it;
    * otherwise, exactly one result for the reading: return it;
    * otherwise, return every result (term URLs first, duplicates removed)
      and let the Yomichan user pick.

    Args:
        term: The word as written.
        reading: The reading of the word.

    Returns:
        A list of URL strings; empty when neither search found anything.
    """
    term_urls = list(forvo_fetch(term))
    if len(term_urls) == 1:
        return term_urls

    # When the reading is identical to the term (e.g. a word written only in
    # kana) a second search would repeat the same request and the same URLs.
    reading_urls = list(forvo_fetch(reading)) if reading != term else []
    if not (term_urls or reading_urls):
        # We didn't get anything.
        print("[!] Nothing from Forvo for %s(%s)." % (term, reading))
        return []
    if len(reading_urls) == 1:
        return reading_urls
    # dict.fromkeys() drops repeated URLs while keeping first-seen order, so
    # term URLs keep their preference over reading URLs.
    return list(dict.fromkeys(term_urls + reading_urls))
@app.route("/<term>/<reading>")
def forvo(term, reading):
    """Serve the Forvo audio sources for a term/reading pair as JSON.

    The response is an "audioSourceList" object whose "audioSources" entries
    each carry the name "Forvo" and one URL from forvo_find_best().
    """
    sources = []
    for url in forvo_find_best(term, reading):
        sources.append({"name": "Forvo", "url": url})
    payload = {
        "type": "audioSourceList",
        "audioSources": sources,
    }
    return flask.jsonify(payload)
if __name__ == "__main__":
    # Serve on the loopback interface only. The port is passed as an integer,
    # which is the type Flask's run() documents, rather than as a string.
    app.run(host="127.0.0.1", port=50505)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment