Skip to content

Instantly share code, notes, and snippets.

Created January 8, 2017 22:35
Show Gist options
  • Save anonymous/0bd76f5679e99b9eb73729fe0e291417 to your computer and use it in GitHub Desktop.
Save anonymous/0bd76f5679e99b9eb73729fe0e291417 to your computer and use it in GitHub Desktop.
"""
duolingo.com
"""
import json
import re
import urllib
from difflib import SequenceMatcher
from calibre.web.feeds.news import BasicNewsRecipe
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
"""
Download all Duolingo lesson tips and notes for your language
and turn them into a handy reference book!
USAGE:
1) you must have calibre installed. Download it from https://calibre-ebook.com/
Calibre is free, cross-platform e-book creation / conversion / management software.
2) go to https://www.duolingo.com/ , log in,
and make sure you have switched to the language you want to download
3) from the command line, type
ebook-convert duolingo.recipe <outputfilename>.<ext> --username <myduolingousername> --password <myduolingopassword> -vv --test
where <ext> is the book format you want, that is, epub azw3 mobi pdf
if you omit <outputfilename> it will use the same name as input, i.e., duolingo
Example:
ebook-convert duolingo.recipe duo_french.epub --username bob --password mary123 -vv --test
4) This creates a test book called duo_french.epub with only 2 lessons maximum.
Open it. If everything looks good, run the command again without
the --test this time, i.e.,
ebook-convert duolingo.recipe duo_french.epub --username bob --password mary123 -vv
Notes:
a) The -vv tells it to spit out some possibly useful debug info.
b) Once you have the epub, you can convert it to other formats without running
the script (and downloading everything) again, like this:
ebook-convert duo_french.epub .mobi --username bob --password mary123 -vv
ebook-convert duo_french.epub .azw3 --username bob --password mary123 -vv
ebook-convert duo_french.epub .pdf --username bob --password mary123 -vv
Good luck! Contact heybart on reddit if you have a problem
"""
class DuolingoLessons(BasicNewsRecipe):
    """Calibre recipe that builds a reference book from Duolingo lessons.

    The recipe logs in with the supplied username/password, reads the
    user's skill tree for the currently selected language, and turns the
    "tips and notes" of every skill (optionally with the lesson words and
    their definitions) into one article per skill.
    """

    # A few customizable options

    # how book title will appear
    # {} will be replaced with language name
    title_with_vocabs = u'{} grammar and vocabulary with Duolingo'
    title_no_vocabs = u'Learning {} with Duolingo'

    # URL of image to use as the book cover
    # set to a web address or local file
    # example: 'http://i.imgur.com/KDslMRP.jpg' or 'c:/pics/owl.png'
    # Windows users: use forward slashes / instead of backslashes \
    # set to 'auto' to use the default picture of the duolingo owl
    cover_url = 'auto'
    # cover_url = 'https://duolingo-images.s3.amazonaws.com/avatars/15224667/q13kbDuwyI/xlarge' # American English
    # cover_url = 'http://d2rhekw5qr4gcj.cloudfront.net/img/400sqf/from/uploads/course_photos/8947308000160730232415.png' # spanish
    # cover_url = 'http://i.imgur.com/ZUIigS0.png' # german
    # cover_url = 'http://i.imgur.com/KDslMRP.jpg' #french
    # cover_url = 'http://d2rhekw5qr4gcj.cloudfront.net/img/400sqf/from/uploads/course_photos/2656444000150612030056.png' # norwegian

    # description for your book, shows up in metadata
    description = 'Duolingo Lesson Tips and Notes'

    # include lesson words? either True or False
    include_vocabs = False

    # (if include_vocabs = True) include definitions of lessons words?
    include_defs = True

    # (if include_defs = True) put definitions inline instead of popup footnotes?
    inline_defs = False

    definition_color = '#222'

    # symbol to indicate lesson has tips & notes
    dagger = u'\u2020'

    # directory that receives a copy of every generated article (debug aid);
    # failures to write there are ignored. Set to None to disable the dump.
    debug_dump_dir = 'd:/python/duolingo/tmp'

    # ------------------------
    # don't mess with rest of this stuff unless you know what you're doing :)
    # see https://manual.calibre-ebook.com/news_recipe.html
    # for documentation on calibre recipe API
    # -------------------------
    __author__ = 'heybart on reddit'
    __version__ = '0.16.3a'

    # there should be no reason to change this
    index_url = 'https://www.duolingo.com'
    login_url = 'https://www.duolingo.com/login'

    max_articles_per_feed = 150
    no_stylesheets = False
    no_javascript = True
    needs_subscription = True

    extra_css = (
        '.calibre_navbar {display:none} '
        'a.sup, a sup { text-decoration: none !important; } '
        'sup.invis { color: white !important; } '
        'table { border-bottom: 1px solid #888; } '
        '.vocabs { border-bottom: 1px dotted #888; } '
        'li.vocab { margin-top: 3px; margin-bottom: 3px; } '
        '.vocab_word { font-size: 1em; font-weight: bold } '
        '.vocab_def { font-size: 0.90em; color: ' + definition_color + '; } '
        'th { border-bottom: 1px dotted #aaa; } td, th {padding: 5px; } '
        '.footnotes { page-break-before: always;} '
        'h1 { font-size: 1.3em; border-bottom: 1px solid #aaa;} '
        'h2 { font-size: 1.2em; } h3 { font-size: 1.15em; } '
        'h4 { font-size: 1.10em; } h5,h6 { font-size: 1.05em; } '
    )

    # green duolingo owl, no specific language
    default_cover_url = 'http://65.media.tumblr.com/5fd6b3ccc4e8c978c87f469b236558ad/tumblr_inline_mwkqv1OuOg1ss97ol.png'

    # both are filled in by parse_index()
    learning_language = None
    learning_language_id = None

    # indices to use for <a name=...> to ensure uniqueness
    # (class-level dict, i.e. shared by every instance of the recipe)
    a_indices = {}

    def get_browser(self):
        """Return a browser that has posted the login form data.

        The credentials come from ``self.username`` / ``self.password``.
        """
        # urlencode lives in a different module on Python 2 and Python 3
        try:
            from urllib.parse import urlencode
        except ImportError:
            from urllib import urlencode
        print('getbrowse')
        br = BasicNewsRecipe.get_browser(self)
        data = {'login': self.username, 'password': self.password}
        br.open(self.login_url, urlencode(data))
        return br

    def get_raw(self, url):
        """Fetch ``url`` with a fresh base-class browser and return the body."""
        br = BasicNewsRecipe.get_browser(self)
        return br.open(url).read()

    def get_json(self, url):
        """Fetch ``url`` and return its body parsed as JSON."""
        return json.loads(self.get_raw(url))

    def lookup_definitions(self, learning_lang, from_lang, words):
        """
        use API call to look up definition of words
        learning_lang := language to translate to
        from_lang := language to translate from
        words := list of lists of words (one list per lesson)
        example:
        learning_lang = "fr"
        from_lang = "en"
        words = [["me","femme"],["pays"]]
        https://d2.duolingo.com/api/1/dictionary/hints/fr/en?tokens=["me","femme","pays"]
        returns
        {"me": ["me", "myself"], "pays": ["country", "countries", "land", "region", "village"], "femme": ["woman", "wife"]}
        """
        # flatten list
        words = [item for sublist in words for item in sublist]
        params = json.dumps(words, separators=(',', ':'))
        url = 'https://d2.duolingo.com/api/1/dictionary/hints/{}/{}?tokens={}' \
            .format(learning_lang, from_lang, urlquote(params))
        return self.get_json(url)

    def make_anchor(self, prefix, name=''):
        """Return a unique anchor id: prefix + running index + cleaned name.

        A separate counter is kept per ``prefix``; every character of
        ``name`` that is not a word character is dropped.
        """
        idx = self.a_indices.get(prefix, 1)
        self.a_indices[prefix] = idx + 1
        return '{}{}{}'.format(prefix, idx, re.sub(r'[^\w\d]', '', name))

    # def postprocess_book(self, oeb, opts, log):
    #     # Remove the superfluous extra feed page at the beginning of the book, replacing it
    #     # with the proper credits
    #     for item in oeb.manifest.hrefs['index.html'].data.xpath('//*[local-name()="ul"]'):
    #         item.getparent().remove(item)
    #     for item in oeb.manifest.hrefs['index.html'].data.xpath('//*[local-name()="p"]'):
    #         item.getparent().remove(item)

    # def postprocess_html(self, soup, first_fetch):
    #     try:
    #         with open('d:/python/duolingo/tmp/'+self.make_anchor('post_')+'.html', 'w') as myfile:
    #             myfile.write(soup.prettify())
    #     except:
    #         pass
    #     return soup

    def _get_vocabs(self, lang_data):
        """Return ``(vocabs_html, endnotes_html)`` built from lang_data.

        Both strings are empty when vocabularies are disabled or the skill
        has no ``lessonWords``. ``endnotes_html`` is only non-empty when
        definitions are rendered as footnotes.
        """
        if not self.include_vocabs:
            return ('', '')
        # list of list of words, one list for each lesson
        word_lists = lang_data.get('lessonWords')
        if word_lists is None:
            return ('', '')

        vocab_section = self.make_anchor('voc_')
        defs = {}
        if self.include_defs:
            defs = self.lookup_definitions(
                self.learning_language,
                lang_data['fromLanguage'], word_lists) or {}

        lessons = []
        endnotes = []
        for words in word_lists:
            if self.include_defs:
                entries = []
                for word in words:
                    # a word missing from the lookup result gets an empty
                    # definition instead of aborting the whole article
                    meaning = ', '.join(defs.get(word) or [])
                    if self.inline_defs:
                        entries.append((
                            '&#9830;&nbsp;<span class="vocab_word">{}</span>: '
                            '<span class="vocab_def"> {}</span>').format(
                                word, meaning))
                    else:
                        # kindle formats require a superscripted link from noteref
                        # to the footnote and a link back from footnote to noteref
                        # to make popup footnote work
                        fn = self.make_anchor('fn_', word)  # name for footnote
                        ref = self.make_anchor('ref_', word)  # name for noteref
                        entries.append((
                            '{2}<a id="{0}" href="#{1}" epub:type="noteref">'
                            '<sup>*</sup></a>').format(ref, fn, word))
                        endnotes.append((
                            '<p id="{0}" epub:type="footnote">'
                            '<a href="#{3}">{1}</a>: {2}</p>').format(
                                fn, word, meaning, ref))
                strng = ',&nbsp; '.join(entries)
            else:
                strng = ', &nbsp;'.join(words)
            lessons.append('<li>{}</li>'.format(strng))

        vocabs = '<div class="vocabs" id="{}"><ol>{}</ol></div>'.format(
            vocab_section, ''.join(lessons))
        # only emit the footnotes section when there is something in it
        endnotes_html = ''
        if endnotes:
            endnotes_html = (
                '<aside epub:type="footnotes" class="footnotes">{}</aside>'
            ).format(''.join(endnotes))
        return (vocabs, endnotes_html)

    def _get_notes(self, lang_data):
        """Return the cleaned tips & notes from ``lang_data['explanation']``.

        Explanations shorter than 200 characters are treated as "no notes"
        and yield an empty string.
        """
        # the key may be absent or explicitly null
        notes = lang_data.get('explanation') or ''
        if len(notes) < 200:
            return ''
        # if notes contains a heading similar to title, remove it
        # because we'll add the title ourselves
        hreg = re.compile(r'^\s*<h\d>(.+?)</h\d>')
        m = hreg.match(notes)
        if m and similar(m.group(1), lang_data['name']):
            notes = hreg.sub('', notes)
        # strip out extraneous "<hr /> blah blah" near bottom
        notes = re.sub(r'<hr />.{0,5}a href(.+)?$', '', notes,
                       count=1, flags=re.DOTALL)
        # strip out extraneous "blah blah <hr />" near top
        notes = re.sub(r'^.{0,100}<hr />', '', notes,
                       count=1, flags=re.DOTALL)
        return notes

    def preprocess_raw_html(self, raw_html, url):
        """
        extract article title ('name') and the tips and notes ('explanation')
        and optionally vocabulary words ('lessonWords')
        from json and return result as html

        The article is aborted when the JSON cannot be read or when the
        skill has neither notes nor vocabulary.
        """
        try:
            lang_data = json.loads(raw_html).get('skills')[0]
        except (ValueError, TypeError, AttributeError, IndexError, KeyError):
            self.abort_article('Unexpected json data')
            return

        heading = lang_data['name']
        notes = self._get_notes(lang_data)
        (vocabs, endnotes) = self._get_vocabs(lang_data)

        if not (vocabs or notes):
            self.abort_article(heading + ' has no notes or vocabs.')
            return

        class_name = ''
        # if has both notes and vocabs add a link to the heading
        # so you can skip over the vocabs and jump to the notes
        if notes and vocabs:
            anchor = self.make_anchor('notes_', heading)
            heading += ' <a href="#{}">{}</a>'.format(anchor, self.dagger)
            notes = '<a name="{0}" id="{0}"></a>{1}'.format(anchor, notes)
            # picked up again by populate_article_metadata()
            class_name = 'has_notes'
        heading = '<h1 class="{}">{}</h1>'.format(class_name, heading)
        html = (
            '<?xml version="1.0" encoding="utf-8"?>'
            '<html xmlns:epub="http://www.idpf.org/2007/ops">'
            '<head><title></title></head><body>{}{}{}{}</body></html>') \
            .format(heading, vocabs, notes, endnotes)

        # best-effort debug copy of the generated article
        if self.debug_dump_dir:
            try:
                dump_path = '%s/%s.html' % (
                    self.debug_dump_dir, self.make_anchor('tmp_'))
                with open(dump_path, 'w') as myfile:
                    myfile.write(html)
            except (IOError, OSError, UnicodeError):
                pass
        return html

    def print_version(self, url):
        """
        change user facing url
        .../skill/<language>/<topic>
        to
        .../2016-04-13/skills?learningLanguage=<learning_lang>&urlName=<topic>
        this gives us the json data we really want
        """
        return re.sub(
            r'/skill/[^/]+/',
            '/2016-04-13/skills?learningLanguage=' + self.learning_language_id + '&urlName=',
            url, count=1)

    def populate_article_metadata(self, article, soup, first):
        """add dagger to title if h1 class == has_notes
        """
        h = soup.find('h1')
        if not h:
            return
        # depending on the HTML parser, the class attribute comes back as a
        # plain string or as a list of class names
        classes = h.get('class') or ''
        if not isinstance(classes, (list, tuple)):
            classes = classes.split()
        if 'has_notes' in classes:
            article.title = article.title + ' ' + self.dagger

    def get_learning_language_id(self):
        """
        look up the learning language id needed for the skill API call
        from the javascript struct duo.available_languages
        downloaded from home page. (Don't know of a more elegant way of getting this)
        usually it is the same as learning lang abbreviation but not always
        e.g., for Norwegian, language abbrev = nb, learning language id = no-BO

        Falls back to the language abbreviation when the struct cannot be
        found or parsed.
        """
        if self.learning_language_id is not None:
            return

        raw_str = self.get_raw(self.index_url)
        # on Python 3 the page arrives as bytes, which a str regex rejects
        if isinstance(raw_str, bytes) and not isinstance(raw_str, str):
            raw_str = raw_str.decode('utf-8', 'replace')

        langs = []
        m = re.match(
            r'.+duo\.available_languages\s*=\s*(\[(.+?)\])',
            raw_str, re.DOTALL)
        if m:
            try:
                langs = json.loads(m.group(1))
            except ValueError:
                self.log('Could not parse duo.available_languages')
        else:
            self.log('duo.available_languages not found on home page')

        for lang in langs:
            if isinstance(lang, dict) and lang.get('key') == self.learning_language:
                self.learning_language_id = lang.get('learning_language_id')
                self.log('learning_language_id: ', self.learning_language_id)
                break

        # find nothing? well, let's hope for the best!
        if self.learning_language_id is None:
            self.learning_language_id = self.learning_language

    def parse_index(self):
        """
        get user data from which we get list of skills (i.e., lessons)
        as well as learning language, then build article list

        Returns a single feed: ``[(book title, [article dicts])]``.
        """
        print('parsei')
        user_data = self.get_json('https://www.duolingo.com/users/' + self.username)
        self.learning_language = user_data.get('learning_language')
        if not self.learning_language:
            self.abort_recipe_processing('Failed to get learning_language')
        self.log('learning_language detected: ', self.learning_language)
        self.get_learning_language_id()

        lang_data = user_data['language_data'][self.learning_language]
        lang_str = lang_data['language_string']
        skills = lang_data['skills']

        if self.include_vocabs:
            self.title = self.title_with_vocabs.format(lang_str)
        else:
            self.title = self.title_no_vocabs.format(lang_str)
        if self.cover_url == 'auto':
            self.cover_url = self.default_cover_url

        articles = []
        # skills needs to be sorted by y coord (position in tree), then x coord
        for skill in sorted(skills, key=lambda x: (x['coords_y'], x['coords_x'])):
            # user facing url; print_version() turns it into the json API url
            url = '{}/skill/{}/{}'.format(
                self.index_url,
                self.learning_language, urlquote(skill['url_title']))
            articles.append({'title': skill['title'], 'url': url})
        return [(self.title, articles)]
def similar(a, b):
    """Tell whether strings a and b look alike, ignoring case.

    Spaces, digits and common punctuation are passed to SequenceMatcher
    as junk; the strings count as similar when the resulting ratio
    is >= 0.75.
    """
    junk_chars = ' -:_12345678890:/()[]?!'

    def is_junk(char):
        return char in junk_chars

    matcher = SequenceMatcher(is_junk, a.lower(), b.lower())
    return matcher.ratio() >= 0.75
def urlquote(params):
    """safely quote url params with UTF-8 encoding

    params := text (encoded to UTF-8 first) or an already encoded byte string
    returns the value quoted for use in a query string (spaces become '+')
    """
    # quote_plus lives in a different module on Python 2 and Python 3
    try:
        from urllib.parse import quote_plus
    except ImportError:
        from urllib import quote_plus
    # byte strings are passed through untouched: encoding them again would
    # fail (Python 3) or trigger an implicit ASCII decode (Python 2)
    if not isinstance(params, bytes):
        params = params.encode('UTF-8')
    return quote_plus(params)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment