# mapmeld/kiwix-strip-html.py

## kiwix-strip-html.py
# pip3 install pyquery

import os
from pyquery import PyQuery as pq

# Folder holding the saved HTML articles; the extracted text is written
# next to each source file as "<article>.txt".
directory = './burmese-articles'
originalArticles = os.listdir(directory)

count = 0
total = len(originalArticles)

for article in originalArticles:
    count += 1
    # Progress report every 100 directory entries (skipped ones included).
    if count % 100 == 0:
        print('{} / {} articles'.format(count, total))

    # Skip text files produced by an earlier run of this script.
    if '.txt' in article:
        continue
    # Names containing ':' are category pages, not articles.
    if ':' in article:
        continue

    article_path = os.path.join(directory, article)

    # The articles are Burmese text, so read and write UTF-8 explicitly
    # rather than relying on the platform's default encoding.
    with open(article_path, 'r', encoding='utf-8') as htmlsource:
        try:
            html = pq(htmlsource.read())
        except Exception:
            # Best effort: a file that cannot be decoded or parsed is
            # skipped. `continue` is required here -- falling through would
            # reuse the previous article's document (or hit an undefined
            # name on the first iteration).
            continue

    # Paragraphs inside the element with id "mw-content-text".
    content = html('#mw-content-text p')
    if len(content) == 0:
        # No paragraphs found: do not create an empty .txt file.
        continue

    with open(article_path + '.txt', 'w', encoding='utf-8') as txtsource:
        for pr in content:
            para = pq(pr)
            # One paragraph per block, separated by a blank line.
            txtsource.write(para.text() + "\n\n")