Skip to content

Instantly share code, notes, and snippets.

@mapmeld
Created December 20, 2017 12:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mapmeld/625881849080b60217178529b141a4a4 to your computer and use it in GitHub Desktop.
Save mapmeld/625881849080b60217178529b141a4a4 to your computer and use it in GitHub Desktop.
Kiwix Strip HTML script
# pip3 install pyquery
import os
from pyquery import PyQuery as pq
# Strip the HTML from every extracted article in `directory`: for each article
# whose '#mw-content-text' element contains <p> paragraphs, write the text of
# those paragraphs to a sibling "<article>.txt" file, one paragraph per block
# separated by a blank line.
directory = './burmese-articles'
originalArticles = os.listdir(directory)
total = len(originalArticles)

for count, article in enumerate(originalArticles, start=1):
    # Progress report every 100 directory entries (skipped entries included).
    if count % 100 == 0:
        print(str(count) + ' / ' + str(total) + ' articles')

    # NOTE: the skips below use `continue`. A bare `next` statement only
    # names the builtin and does nothing, so it never skipped anything.
    if '.txt' in article:
        # avoid already converted text files
        continue
    if ':' in article:
        # categories
        continue

    path = os.path.join(directory, article)
    try:
        # Explicit UTF-8 so the script does not depend on the platform's
        # default encoding (assumes the extracted articles are UTF-8).
        with open(path, 'r', encoding='utf-8') as htmlsource:
            html = pq(htmlsource.read())
    except Exception:
        # Best effort: an entry that cannot be opened, decoded or parsed is
        # skipped so that one bad file does not abort the run, and so that a
        # previous article's parsed document is never reused for this one.
        continue

    content = html('#mw-content-text p')
    if len(content) == 0:
        # No body paragraphs: do not create an empty .txt file.
        continue

    with open(path + '.txt', 'w', encoding='utf-8') as txtsource:
        for pr in content:
            # Iterating a PyQuery result yields raw elements; wrap each one
            # again to get its text content.
            txtsource.write(pq(pr).text() + "\n\n")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment