Skip to content

Instantly share code, notes, and snippets.

@NickMirnov
Created September 24, 2014 11:48
Show Gist options
  • Save NickMirnov/757fba69237c58046db3 to your computer and use it in GitHub Desktop.
Save NickMirnov/757fba69237c58046db3 to your computer and use it in GitHub Desktop.
text2xml converter for sphinx engine
#coding:utf-8
import sys
import os
from os import listdir
from os.path import isfile, join
import codecs
import cgi
import sys
import xml
import xml.sax
import xml.sax.saxutils
# Translation table for unicode.translate(): maps a code point to the text
# that replaces it when file content is embedded in the XML stream.
#
# XML 1.0 forbids every C0 control character except TAB (0x09), LF (0x0A)
# and CR (0x0D), so by default all code points below 0x20 are deleted.
unicode_translate_map = dict.fromkeys(range(0x20), u'')
unicode_translate_map.update({
    # The three whitespace controls are legal in XML. They are emitted as
    # character references so line and word boundaries survive instead of
    # adjacent words being glued together.
    0x09: u'&#9;',   # TAB
    0x0A: u'&#10;',  # LF is 0x0A (decimal 10); 0x10 is DLE and is deleted
    0x0D: u'&#13;',  # CR is 0x0D (decimal 13); 0x13 is DC3 and is deleted
    # Characters with special meaning in XML markup.
    ord(u'"'): u'&quot;',
    ord(u'\''): u'&apos;',
    ord(u'<'): u'&lt;',
    ord(u'>'): u'&gt;',
    ord(u'&'): u'&amp;',
})
def recursive_walk_dir(dir):
    """Return the paths of all regular files found under ``dir``.

    Sub-directories are descended into recursively. Entries are visited in
    sorted name order, so the returned list is reproducible between runs on
    an unchanged tree. Entries that are neither a regular file nor a
    directory (dangling symlinks, sockets, FIFOs, ...) are skipped.

    dir -- path of the directory to scan. The name shadows the builtin but
           is kept because it is part of the function's signature.

    Returns a list of paths, each built by joining ``dir`` with the entry
    names below it.

    Raises OSError if ``dir`` or one of its sub-directories cannot be listed.
    """
    result = []
    for name in sorted(os.listdir(dir)):
        path = os.path.join(dir, name)
        if os.path.isfile(path):
            result.append(path)
        elif os.path.isdir(path):
            result.extend(recursive_walk_dir(path))
        # Anything else can neither be read as a document nor listed;
        # calling os.listdir() on it would raise, so it is ignored.
    return result
def dump_file(filename, doc_id):
    """Print one file as a <sphinx:document> element on standard output.

    The file is read as UTF-8 text, passed through ``unicode_translate_map``
    and written inside a <content> element. A file that cannot be opened or
    decoded is skipped: a warning is written to standard error and nothing
    is written to standard output for it.

    filename -- path of the file to dump.
    doc_id   -- value placed in the document's ``id`` attribute.
    """
    try:
        # ``with`` closes the handle even when read() fails while decoding.
        with codecs.open(filename, 'r', 'utf-8') as fstr:
            s = fstr.read()
    except (IOError, OSError, UnicodeError) as err:
        # Best effort: one unreadable or non-UTF-8 file must not abort the
        # whole dump. Only I/O and decoding failures are caught, so
        # KeyboardInterrupt and programming errors still propagate.
        sys.stderr.write('Error loading %r: %s\n' % (filename, err))
        return
    print('<sphinx:document id="%s">' % doc_id)
    print('<content>')
    # Delete/escape characters that may not appear verbatim in XML content.
    s = s.translate(unicode_translate_map)
    print(s)
    print('</content>')
    print('</sphinx:document>')
#onlyfiles = [ f for f in listdir(mypath) if isfile(join(mypath,f)) ]
#print onlyfiles
#
# Wrap stdout so that unicode text printed below is written as UTF-8 bytes.
sys.stdout = codecs.getwriter('utf-8')(sys.stdout)
# 1. init: XML prolog, docset root and a schema with a single "content" field
print('<?xml version="1.0" encoding="utf-8" ?>')
print('<sphinx:docset>')
print('<sphinx:schema>')
print('<sphinx:field name="content"/>')
print('</sphinx:schema>')
# 2. dump files as XMLs
# The directory to index may be given as the first command-line argument;
# without one the fixed default location is used.
root_dir = sys.argv[1] if len(sys.argv) > 1 else '/opt/test_for_index'
abs_path = os.path.abspath(root_dir)
lst = recursive_walk_dir(abs_path)
# Document ids are the file's position in the list, counted from 1000.
for doc_id, path in enumerate(lst, 1000):
    dump_file(path, doc_id)
# 3. finalize
print('</sphinx:docset>')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment