cryzed/count-pages.patch Secret

## count-pages.patch
Index: statistics.py
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- statistics.py	(date 1498849346000)
+++ statistics.py	(revision )
@@ -7,12 +7,13 @@
 __copyright__ = '2011, Grant Drake <grant.drake@gmail.com>, 2017 additions by David Forrester <davidfor@internode.on.net>'
 __docformat__ = 'restructuredtext en'

-import re, os, shutil
+import re, os, shutil, codecs

 from calibre import prints
 from calibre.constants import isosx
 from calibre.ebooks.oeb.iterator import EbookIterator
 from calibre.utils.ipc.simple_worker import fork_job, WorkerError
+from calibre.ebooks import BeautifulSoup

 from calibre_plugins.count_pages.nltk_lite.textanalyzer import TextAnalyzer

@@ -61,6 +62,8 @@
         count = _get_page_count_adobe(iterator, book_path)
     elif page_algorithm == 3:
         count = _get_page_count_custom(iterator, custom_chars_per_page)
+    elif page_algorithm == 4:
+        count = _get_page_count_accurate_experimental(iterator)

     print('\tPage count:', count)
     return iterator, count
@@ -71,7 +74,7 @@
     Given an iterator for the epub (if already opened/converted), estimate a word count
     '''
     from calibre.utils.localization import get_lang
-
+
     if iterator is None:
         iterator = _open_epub_file(book_path)

@@ -124,6 +127,23 @@
     return count


+def _get_page_count_accurate_experimental(iterator):
+    epub_html = _read_epub_contents(iterator)
+    soup = BeautifulSoup.BeautifulSoup(epub_html)
+
+    divisions = soup('div')
+    paragraphs = soup('p')
+    tags = divisions if len(divisions) > len(paragraphs) else paragraphs
+
+    characters = 0
+    for tag in tags:
+        characters += sum(len(text) for text in tag(text=True))
+
+    count = (len(tags) + characters // 70) // 31
+    fast_count = len(epub_html) // 2400 + 1
+    return max(count, fast_count)
+
+
 def _get_page_count_accurate(iterator):
     '''
     The accurate algorithm attempts to apply a similar algorithm
@@ -222,9 +242,9 @@
     '''

     book_text = _read_epub_contents(iterator, strip_html=True)
-
+
     wordcount = None
-
+
     if icu_wordcount:
         try:
             from calibre.spell.break_iterator import count_words
@@ -255,23 +275,24 @@
     '''
     book_files = []
     for path in iterator.spine:
-        with open(path, 'rb') as f:
-            html = f.read().decode('utf-8', 'replace')
+        with codecs.open(path, 'r', encoding='UTF-8', errors='replace') as f:
+            html = f.read()
             if strip_html:
-                html = unicode(_extract_body_text(html)).strip()
+                html = _extract_body_text(html).strip()
                 #print('FOUND HTML:', html)
         book_files.append(html)
-    return ' '.join(book_files)
+    return u' '.join(book_files)


 def _extract_body_text(data):
     '''
     Get the body text of this html content wit any html tags stripped
     '''
-    body = RE_HTML_BODY.findall(data)
+    body = RE_HTML_BODY.search(data)
     if body:
-        return RE_STRIP_MARKUP.sub('', body[0]).replace('.','. ')
-    return ''
+        stripped_markup = RE_STRIP_MARKUP.sub(u'', body.group(0))
+        return re.sub(ur'\.', u'. ', stripped_markup)
+    return u''

 # ---------------------------------------------------------
 #    CBR/CBZ Page Count Functions
@@ -331,7 +352,7 @@
 def get_flesch_reading_ease(text_analysis, lang=None):
     if lang and lang == 'deu':
         print('\tFlesch Reading Ease: language=%s' % lang)
-        score = 180 - text_analysis['averageWordsPerSentence'] - (58.5 * (text_analysis['syllableCount']/ text_analysis['wordCount']))
+        score = 180 - text_analysis['averageWordsPerSentence'] - (58.5 * (text_analysis['syllableCount']/ text_analysis['wordCount']))
     else:
         score = 206.835 - (1.015 * (text_analysis['averageWordsPerSentence'])) - (84.6 * (text_analysis['syllableCount']/ text_analysis['wordCount']))
     print('\tFlesch Reading Ease:', score)
Index: config.py
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- config.py	(date 1498849346000)
+++ config.py	(revision )
@@ -59,7 +59,7 @@
 KEY_PAGES_ALGORITHM = 'algorithmPages'
 KEY_CUSTOM_CHARS_PER_PAGE = 'customCharsPerPage'

-PAGE_ALGORITHMS = [_('Paragraphs (APNX accurate)'), _('E-book Viewer (calibre)'), _('Adobe Digital Editions (ADE)'), _('Custom (Chars Per Page)')]
+PAGE_ALGORITHMS = [_('Paragraphs (APNX accurate)'), _('E-book Viewer (calibre)'), _('Adobe Digital Editions (ADE)'), _('Custom (Chars Per Page)'), _('Paragraphs (APNX accurate, experimental)')]

 PAGE_DOWNLOADS = {
                   'goodreads':
	Index: statistics.py
	IDEA additional info:
	Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
	<+>UTF-8
	===================================================================
	--- statistics.py (date 1498849346000)
	+++ statistics.py (revision )
	@@ -7,12 +7,13 @@
	__copyright__ = '2011, Grant Drake <grant.drake@gmail.com>, 2017 additions by David Forrester <davidfor@internode.on.net>'
	__docformat__ = 'restructuredtext en'

	-import re, os, shutil
	+import re, os, shutil, codecs

	from calibre import prints
	from calibre.constants import isosx
	from calibre.ebooks.oeb.iterator import EbookIterator
	from calibre.utils.ipc.simple_worker import fork_job, WorkerError
	+from calibre.ebooks import BeautifulSoup

	from calibre_plugins.count_pages.nltk_lite.textanalyzer import TextAnalyzer

	@@ -61,6 +62,8 @@
	count = _get_page_count_adobe(iterator, book_path)
	elif page_algorithm == 3:
	count = _get_page_count_custom(iterator, custom_chars_per_page)
	+ elif page_algorithm == 4:
	+ count = _get_page_count_accurate_experimental(iterator)

	print('\tPage count:', count)
	return iterator, count
	@@ -71,7 +74,7 @@
	Given an iterator for the epub (if already opened/converted), estimate a word count
	'''
	from calibre.utils.localization import get_lang
	-
	+
	if iterator is None:
	iterator = _open_epub_file(book_path)

	@@ -124,6 +127,23 @@
	return count


	+def _get_page_count_accurate_experimental(iterator):
	+ epub_html = _read_epub_contents(iterator)
	+ soup = BeautifulSoup.BeautifulSoup(epub_html)
	+
	+ divisions = soup('div')
	+ paragraphs = soup('p')
	+ tags = divisions if len(divisions) > len(paragraphs) else paragraphs
	+
	+ characters = 0
	+ for tag in tags:
	+ characters += sum(len(text) for text in tag(text=True))
	+
	+ count = (len(tags) + characters // 70) // 31
	+ fast_count = len(epub_html) // 2400 + 1
	+ return max(count, fast_count)
	+
	+
	def _get_page_count_accurate(iterator):
	'''
	The accurate algorithm attempts to apply a similar algorithm
	@@ -222,9 +242,9 @@
	'''

	book_text = _read_epub_contents(iterator, strip_html=True)
	-
	+
	wordcount = None
	-
	+
	if icu_wordcount:
	try:
	from calibre.spell.break_iterator import count_words
	@@ -255,23 +275,24 @@
	'''
	book_files = []
	for path in iterator.spine:
	- with open(path, 'rb') as f:
	- html = f.read().decode('utf-8', 'replace')
	+ with codecs.open(path, 'r', encoding='UTF-8', errors='replace') as f:
	+ html = f.read()
	if strip_html:
	- html = unicode(_extract_body_text(html)).strip()
	+ html = _extract_body_text(html).strip()
	#print('FOUND HTML:', html)
	book_files.append(html)
	- return ' '.join(book_files)
	+ return u' '.join(book_files)


	def _extract_body_text(data):
	'''
	Get the body text of this html content wit any html tags stripped
	'''
	- body = RE_HTML_BODY.findall(data)
	+ body = RE_HTML_BODY.search(data)
	if body:
	- return RE_STRIP_MARKUP.sub('', body[0]).replace('.','. ')
	- return ''
	+ stripped_markup = RE_STRIP_MARKUP.sub(u'', body.group(0))
	+ return re.sub(ur'\.', u'. ', stripped_markup)
	+ return u''

	# ---------------------------------------------------------
	# CBR/CBZ Page Count Functions
	@@ -331,7 +352,7 @@
	def get_flesch_reading_ease(text_analysis, lang=None):
	if lang and lang == 'deu':
	print('\tFlesch Reading Ease: language=%s' % lang)
	- score = 180 - text_analysis['averageWordsPerSentence'] - (58.5 * (text_analysis['syllableCount']/ text_analysis['wordCount']))
	+ score = 180 - text_analysis['averageWordsPerSentence'] - (58.5 * (text_analysis['syllableCount']/ text_analysis['wordCount']))
	else:
	score = 206.835 - (1.015 * (text_analysis['averageWordsPerSentence'])) - (84.6 * (text_analysis['syllableCount']/ text_analysis['wordCount']))
	print('\tFlesch Reading Ease:', score)
	Index: config.py
	IDEA additional info:
	Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
	<+>UTF-8
	===================================================================
	--- config.py (date 1498849346000)
	+++ config.py (revision )
	@@ -59,7 +59,7 @@
	KEY_PAGES_ALGORITHM = 'algorithmPages'
	KEY_CUSTOM_CHARS_PER_PAGE = 'customCharsPerPage'

	-PAGE_ALGORITHMS = [_('Paragraphs (APNX accurate)'), _('E-book Viewer (calibre)'), _('Adobe Digital Editions (ADE)'), _('Custom (Chars Per Page)')]
	+PAGE_ALGORITHMS = [_('Paragraphs (APNX accurate)'), _('E-book Viewer (calibre)'), _('Adobe Digital Editions (ADE)'), _('Custom (Chars Per Page)'), _('Paragraphs (APNX accurate, experimental)')]

	PAGE_DOWNLOADS = {
	'goodreads':