Skip to content

Instantly share code, notes, and snippets.

@cryzed
Last active June 30, 2017 19:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cryzed/9319ac406cdb330e50a148a5dcdb604c to your computer and use it in GitHub Desktop.
Save cryzed/9319ac406cdb330e50a148a5dcdb604c to your computer and use it in GitHub Desktop.
Index: statistics.py
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- statistics.py (date 1498849346000)
+++ statistics.py (revision )
@@ -7,12 +7,13 @@
__copyright__ = '2011, Grant Drake <grant.drake@gmail.com>, 2017 additions by David Forrester <davidfor@internode.on.net>'
__docformat__ = 'restructuredtext en'
-import re, os, shutil
+import re, os, shutil, codecs
from calibre import prints
from calibre.constants import isosx
from calibre.ebooks.oeb.iterator import EbookIterator
from calibre.utils.ipc.simple_worker import fork_job, WorkerError
+from calibre.ebooks import BeautifulSoup
from calibre_plugins.count_pages.nltk_lite.textanalyzer import TextAnalyzer
@@ -61,6 +62,8 @@
count = _get_page_count_adobe(iterator, book_path)
elif page_algorithm == 3:
count = _get_page_count_custom(iterator, custom_chars_per_page)
+ elif page_algorithm == 4:
+ count = _get_page_count_accurate_experimental(iterator)
print('\tPage count:', count)
return iterator, count
@@ -71,7 +74,7 @@
Given an iterator for the epub (if already opened/converted), estimate a word count
'''
from calibre.utils.localization import get_lang
-
+
if iterator is None:
iterator = _open_epub_file(book_path)
@@ -124,6 +127,23 @@
return count
+def _get_page_count_accurate_experimental(iterator):
+ '''
+ Experimental page count estimate based on the text found inside the
+ book's <div> or <p> tags, whichever kind occurs more often.
+
+ Returns the larger of that tag-based estimate and a rough estimate
+ derived from the total length of the book's contents.
+ '''
+ epub_html = _read_epub_contents(iterator)
+ soup = BeautifulSoup.BeautifulSoup(epub_html)
+
+ # Use whichever tag kind is more numerous as the "paragraph" unit
+ divisions = soup('div')
+ paragraphs = soup('p')
+ tags = divisions if len(divisions) > len(paragraphs) else paragraphs
+
+ characters = 0
+ for tag in tags:
+ # NOTE(review): text nodes of nested tags are presumably counted once per enclosing tag - confirm this is acceptable
+ characters += sum(len(text) for text in tag(text=True))
+
+ # NOTE(review): 70 and 31 look like characters-per-line and lines-per-page - confirm
+ count = (len(tags) + characters // 70) // 31
+ # Lower bound: one page per 2400 characters of content, and never less than one page
+ fast_count = len(epub_html) // 2400 + 1
+ return max(count, fast_count)
+
+
def _get_page_count_accurate(iterator):
'''
The accurate algorithm attempts to apply a similar algorithm
@@ -222,9 +242,9 @@
'''
book_text = _read_epub_contents(iterator, strip_html=True)
-
+
wordcount = None
-
+
if icu_wordcount:
try:
from calibre.spell.break_iterator import count_words
@@ -255,23 +275,24 @@
'''
book_files = []
for path in iterator.spine:
- with open(path, 'rb') as f:
- html = f.read().decode('utf-8', 'replace')
+ with codecs.open(path, 'r', encoding='UTF-8', errors='replace') as f:
+ html = f.read()
if strip_html:
- html = unicode(_extract_body_text(html)).strip()
+ html = _extract_body_text(html).strip()
#print('FOUND HTML:', html)
book_files.append(html)
- return ' '.join(book_files)
+ return u' '.join(book_files)
def _extract_body_text(data):
'''
- Get the body text of this html content wit any html tags stripped
+ Get the body text of this html content with any html tags stripped
'''
- body = RE_HTML_BODY.findall(data)
+ body = RE_HTML_BODY.search(data)
if body:
- return RE_STRIP_MARKUP.sub('', body[0]).replace('.','. ')
- return ''
+ # NOTE(review): group(0) is the whole match, whereas findall()[0] returned the first capture group if RE_HTML_BODY defines one - confirm this is intended
+ stripped_markup = RE_STRIP_MARKUP.sub(u'', body.group(0))
+ # A literal '.' needs no regex; str.replace also avoids the Python 2-only ur'' prefix
+ return stripped_markup.replace(u'.', u'. ')
+ return u''
# ---------------------------------------------------------
# CBR/CBZ Page Count Functions
@@ -331,7 +352,7 @@
def get_flesch_reading_ease(text_analysis, lang=None):
if lang and lang == 'deu':
print('\tFlesch Reading Ease: language=%s' % lang)
- score = 180 - text_analysis['averageWordsPerSentence'] - (58.5 * (text_analysis['syllableCount']/ text_analysis['wordCount']))
+ score = 180 - text_analysis['averageWordsPerSentence'] - (58.5 * (text_analysis['syllableCount']/ text_analysis['wordCount']))
else:
score = 206.835 - (1.015 * (text_analysis['averageWordsPerSentence'])) - (84.6 * (text_analysis['syllableCount']/ text_analysis['wordCount']))
print('\tFlesch Reading Ease:', score)
Index: config.py
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- config.py (date 1498849346000)
+++ config.py (revision )
@@ -59,7 +59,7 @@
KEY_PAGES_ALGORITHM = 'algorithmPages'
KEY_CUSTOM_CHARS_PER_PAGE = 'customCharsPerPage'
-PAGE_ALGORITHMS = [_('Paragraphs (APNX accurate)'), _('E-book Viewer (calibre)'), _('Adobe Digital Editions (ADE)'), _('Custom (Chars Per Page)')]
+# NOTE(review): list position presumably matches the page_algorithm index checked in statistics.py (4 = experimental) - confirm before reordering
+PAGE_ALGORITHMS = [_('Paragraphs (APNX accurate)'), _('E-book Viewer (calibre)'), _('Adobe Digital Editions (ADE)'), _('Custom (Chars Per Page)'), _('Paragraphs (APNX accurate, experimental)')]
PAGE_DOWNLOADS = {
'goodreads':
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment