-
-
Save cryzed/9319ac406cdb330e50a148a5dcdb604c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Index: statistics.py | |
IDEA additional info: | |
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP | |
<+>UTF-8 | |
=================================================================== | |
--- statistics.py (date 1498849346000) | |
+++ statistics.py (revision ) | |
@@ -7,12 +7,13 @@ | |
__copyright__ = '2011, Grant Drake <grant.drake@gmail.com>, 2017 additions by David Forrester <davidfor@internode.on.net>' | |
__docformat__ = 'restructuredtext en' | |
-import re, os, shutil | |
+import re, os, shutil, codecs | |
from calibre import prints | |
from calibre.constants import isosx | |
from calibre.ebooks.oeb.iterator import EbookIterator | |
from calibre.utils.ipc.simple_worker import fork_job, WorkerError | |
+from calibre.ebooks import BeautifulSoup | |
from calibre_plugins.count_pages.nltk_lite.textanalyzer import TextAnalyzer | |
@@ -61,6 +62,8 @@ | |
count = _get_page_count_adobe(iterator, book_path) | |
elif page_algorithm == 3: | |
count = _get_page_count_custom(iterator, custom_chars_per_page) | |
+ elif page_algorithm == 4: | |
+ count = _get_page_count_accurate_experimental(iterator) | |
print('\tPage count:', count) | |
return iterator, count | |
@@ -71,7 +74,7 @@ | |
Given an iterator for the epub (if already opened/converted), estimate a word count | |
''' | |
from calibre.utils.localization import get_lang | |
- | |
+ | |
if iterator is None: | |
iterator = _open_epub_file(book_path) | |
@@ -124,6 +127,23 @@ | |
return count | |
+def _get_page_count_accurate_experimental(iterator): | |
+ ''' | |
+ Experimental page count estimate: combines the number of <div> or <p> | |
+ tags in the book with the amount of text they contain, and returns the | |
+ larger of that estimate and a raw-length estimate of the unstripped html. | |
+ The result is always at least 1, because the raw-length estimate adds 1. | |
+ ''' | |
+ # Read the spine contents without strip_html, so the markup is kept for parsing. | |
+ epub_html = _read_epub_contents(iterator) | |
+ soup = BeautifulSoup.BeautifulSoup(epub_html) | |
+ | |
+ divisions = soup('div') | |
+ paragraphs = soup('p') | |
+ # Use whichever of the two tag kinds occurs more often as the counting unit. | |
+ tags = divisions if len(divisions) > len(paragraphs) else paragraphs | |
+ | |
+ characters = 0 | |
+ for tag in tags: | |
+ # NOTE(review): tag(text=True) appears to search all descendants, so text inside | |
+ # nested tags of the chosen kind may be counted more than once - confirm. | |
+ characters += sum(len(text) for text in tag(text=True)) | |
+ | |
+ # NOTE(review): 70 and 31 are presumably characters per line and lines per page - confirm. | |
+ count = (len(tags) + characters // 70) // 31 | |
+ # Second estimate from the raw html length alone (2400 characters per page, plus one). | |
+ fast_count = len(epub_html) // 2400 + 1 | |
+ return max(count, fast_count) | |
+ | |
+ | |
def _get_page_count_accurate(iterator): | |
''' | |
The accurate algorithm attempts to apply a similar algorithm | |
@@ -222,9 +242,9 @@ | |
''' | |
book_text = _read_epub_contents(iterator, strip_html=True) | |
- | |
+ | |
wordcount = None | |
- | |
+ | |
if icu_wordcount: | |
try: | |
from calibre.spell.break_iterator import count_words | |
@@ -255,23 +275,24 @@ | |
''' | |
book_files = [] | |
for path in iterator.spine: | |
- with open(path, 'rb') as f: | |
- html = f.read().decode('utf-8', 'replace') | |
+ with codecs.open(path, 'r', encoding='UTF-8', errors='replace') as f: | |
+ html = f.read() | |
if strip_html: | |
- html = unicode(_extract_body_text(html)).strip() | |
+ html = _extract_body_text(html).strip() | |
#print('FOUND HTML:', html) | |
book_files.append(html) | |
- return ' '.join(book_files) | |
+ return u' '.join(book_files) | |
def _extract_body_text(data): | |
''' | |
Get the body text of this html content wit any html tags stripped | |
''' | |
- body = RE_HTML_BODY.findall(data) | |
+ # search() stops at the first match instead of collecting every match like findall(). | |
+ body = RE_HTML_BODY.search(data) | |
if body: | |
- return RE_STRIP_MARKUP.sub('', body[0]).replace('.','. ') | |
- return '' | |
+ # NOTE(review): group(0) is the whole match, whereas the removed findall(...)[0] | |
+ # yields the first capture group if RE_HTML_BODY defines one (pattern not visible | |
+ # in this patch) - confirm the <body> tags themselves still get stripped. | |
+ stripped_markup = RE_STRIP_MARKUP.sub(u'', body.group(0)) | |
+ # Follow every full stop with a space, as the removed .replace('.', '. ') did. | |
+ # NOTE(review): the ur'' literal prefix is Python 2 only syntax. | |
+ return re.sub(ur'\.', u'. ', stripped_markup) | |
+ return u'' | |
# --------------------------------------------------------- | |
# CBR/CBZ Page Count Functions | |
@@ -331,7 +352,7 @@ | |
def get_flesch_reading_ease(text_analysis, lang=None): | |
if lang and lang == 'deu': | |
print('\tFlesch Reading Ease: language=%s' % lang) | |
- score = 180 - text_analysis['averageWordsPerSentence'] - (58.5 * (text_analysis['syllableCount']/ text_analysis['wordCount'])) | |
+ score = 180 - text_analysis['averageWordsPerSentence'] - (58.5 * (text_analysis['syllableCount']/ text_analysis['wordCount'])) | |
else: | |
score = 206.835 - (1.015 * (text_analysis['averageWordsPerSentence'])) - (84.6 * (text_analysis['syllableCount']/ text_analysis['wordCount'])) | |
print('\tFlesch Reading Ease:', score) | |
Index: config.py | |
IDEA additional info: | |
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP | |
<+>UTF-8 | |
=================================================================== | |
--- config.py (date 1498849346000) | |
+++ config.py (revision ) | |
@@ -59,7 +59,7 @@ | |
KEY_PAGES_ALGORITHM = 'algorithmPages' | |
KEY_CUSTOM_CHARS_PER_PAGE = 'customCharsPerPage' | |
-PAGE_ALGORITHMS = [_('Paragraphs (APNX accurate)'), _('E-book Viewer (calibre)'), _('Adobe Digital Editions (ADE)'), _('Custom (Chars Per Page)')] | |
+# Display names of the page count algorithms, all marked translatable with _(). | |
+# NOTE(review): the list position is presumably the stored algorithm id (statistics.py checks page_algorithm == 4 for the experimental counter) - confirm before reordering. | |
+PAGE_ALGORITHMS = [_('Paragraphs (APNX accurate)'), _('E-book Viewer (calibre)'), _('Adobe Digital Editions (ADE)'), _('Custom (Chars Per Page)'), _('Paragraphs (APNX accurate, experimental)')] | |
PAGE_DOWNLOADS = { | |
'goodreads': |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment