Word count with LuaTeX

 --- word_count.lua -- count the words that pass through a LuaTeX node list.
--
-- The module exposes its API in `packagedata.word_count`:
--   * callback(head)            node-list callback; tallies the words in `head`
--   * current_word_count()      prints the running total back to TeX
--   * set_threshold(n)          minimum number of characters for a word
--   * dump_total_word_count()   writes a small report with io.write
--
-- A "word" is a run of letter glyphs (Unicode categories ll, lm, lo, lt, lu),
-- possibly interrupted by discretionaries and font kerns.

packagedata = packagedata or { } -- namespace proposal for packages
local word_count = {
  threshold = 3,
}
packagedata.word_count = packagedata.word_count or word_count

--- Locate a support file through kpathsea and execute it.
-- `dofile(nil)` would silently execute standard input, so a failed lookup
-- is turned into an explicit error instead.
-- @param name  file name handed to kpse.find_file
local function load_support_file (name)
  local path = kpse.find_file(name)
  if not path then
    error("word_count: cannot locate support file " .. name)
  end
  return dofile(path)
end

load_support_file"char-def.lua"      -- unicode tables
load_support_file"lualibs-table.lua" -- old Context table code

local utf    = unicode.utf8
local node   = node
local type   = type
local unpack = unpack or table.unpack -- global `unpack` is absent in Lua 5.3

local utfchar, utfvalues    = utf.char, string.utfvalues
local gsub                  = string.gsub
local tableconcat, iowrite  = table.concat, io.write
local stringformat, texprint = string.format, tex.print

-- Running tallies, shared by every invocation of the callback.
local collected = {
  total  = 0,
  unique = 0,
}

local traverse_nodes = node.traverse
local chardata       = characters.data

local glyph_code   = node.id"glyph"
local disc_code    = node.id"disc"
local kern_code    = node.id"kern"
local kerning_code = 0 -- from font

-- Unicode general categories that count as letters.
local is_letter = table.tohash { "ll", "lm", "lo", "lt", "lu" }

local charcache = { } --- memo without metatable

--- Map a code point to its lowercase form as a UTF-8 string.
-- Code points without an `lccode` entry in `chardata` map to themselves.
-- @param code  code point (number); non-numbers are passed through unchanged
-- @return UTF-8 string, or nil when `code` is nil/false
local lcchar = function (code)
  if not code then
    return nil
  end
  local cached = charcache[code]
  if cached then
    return cached
  end
  local data = chardata[code]
  local lc   = data and data.lccode
  local c
  if lc then
    -- lccode is either a single code point or a list of them
    if type(lc) == "table" then
      c = utfchar(unpack(lc))
    else
      c = utfchar(lc)
    end
  elseif type(code) == "number" then
    c = utfchar(code)
  else
    c = code
  end
  charcache[code] = c
  return c
end

--- Lowercase a UTF-8 string character by character using `lcchar`.
local lowerchar = function (str)
  local new, n = { }, 0
  for val in utfvalues(str) do
    n = n + 1
    new[n] = lcchar(val) -- could be inlined here as well ..
  end
  return tableconcat(new)
end

--- Number of characters (not bytes) in a UTF-8 string.
-- Counts every byte that is not a UTF-8 continuation byte (0x80-0xBF).
local function utflength (str)
  local _, count = gsub(str, "[^\128-\191]", "")
  return count
end

--- Walk a node list, assemble words and report each one to `whenfound`.
-- @param head       first node of the list
-- @param whenfound  function(word); may return a function that is then
--                   applied to every node collected for that word
-- @return head, done  `done` is true once any word has been marked,
--                     nil otherwise
local function mark_words (head, whenfound)
  local current, done = head, nil
  local str, s = { }, 0 -- characters of the word being assembled
  local nds, n = { }, 0 -- nodes belonging to that word

  -- Flush the pending word (if any) and reset both buffers.
  local function action ()
    if s > 0 then
      local word = tableconcat(str, "", 1, s)
      local mark = whenfound(word)
      if mark then
        done = true
        for i = 1, n do
          mark(nds[i])
        end
      end
    end
    n, s = 0, 0
  end

  while current do -- iterate
    local id = current.id
    if id == glyph_code then
      local components = current.components
      if components then
        -- ligature: spell the word with the original characters
        n = n + 1
        nds[n] = current
        for g in traverse_nodes(components) do
          -- skip anything in the component list that is not a glyph;
          -- only glyphs carry a `char` field
          if g.id == glyph_code then
            s = s + 1
            str[s] = utfchar(g.char)
          end
        end
      else
        local code = current.char
        local data = chardata[code]
        -- `data` is nil for code points missing from the table; such a
        -- glyph is treated as a non-letter and ends the word
        if data and is_letter[data.category] then
          n = n + 1
          nds[n] = current
          s = s + 1
          str[s] = utfchar(code)
        elseif s > 0 then
          action()
        end
      end
    elseif id == disc_code then -- take the replace
      if n > 0 then
        n = n + 1
        nds[n] = current
      end
    elseif id == kern_code and current.subtype == kerning_code and s > 0 then
      -- ok: font kerning inside a word does not end it
    elseif s > 0 then
      action()
    end
    current = current.next
  end
  if s > 0 then
    action()
  end
  return head, done
end

local known = { } -- lowercased words seen so far

--- Tally one word if it has at least `word_count.threshold` characters.
-- The length is measured in characters, so multi-byte letters count once.
local function insert_word (str)
  if utflength(str) >= word_count.threshold then
    str = lowerchar(str)
    if not known[str] then
      collected.unique = collected.unique + 1
      known[str] = true
    end
    collected.total = collected.total + 1
  end
end

--- Node-list callback: counts the words in `head`, leaves the list untouched.
local callback = function (head)
  return mark_words(head, insert_word)
end

word_count.callback = callback

--- Print the running total into the TeX input stream.
local current_count = function ()
  texprint(collected.total)
end

word_count.current_word_count = current_count

--- Set the minimum word length; nil/false and non-numeric input are ignored.
word_count.set_threshold = function (n)
  local limit = n and tonumber(n)
  if limit then
    word_count.threshold = limit
  end
end

-- Report template: threshold, total words, unique words.
local f_dump = [[
········································································
Document stats.
········································································
Threshold: %d
Total number of words: %d
Number of unique words: %d
········································································
]]

--- Write the report to standard output.
local dump_total = function ()
  --print(table.serialize(collected))
  iowrite(stringformat(f_dump,
                       word_count.threshold,
                       collected.total,
                       collected.unique))
end

word_count.dump_total_word_count = dump_total
 %%% Plain LuaTeX front end for word_count.lua: defines the user macros and
%%% runs a small demonstration on knuth.tex.

\input{luatexbase.sty}%%% For the callback wrapper.
%%% Fail loudly if the module cannot be found: dofile(nil) would read stdin.
\directlua{dofile(assert(kpse.find_file("word_count.lua"),
                         "word_count.lua not found by kpathsea"))}

%%% \setwordthreshold{<n>}: minimum number of characters for a word.
\def\setwordthreshold#1{%
  \directlua{packagedata.word_count.set_threshold(\number#1)}%
}

%%% Register the counting callback; paragraphs broken from now on are counted.
\def\startwordcount{%
  \directlua{
    luatexbase.add_to_callback(
      "pre_linebreak_filter",
      packagedata.word_count.callback,
      "word_count"
    )
  }%
}

%%% Finish the current paragraph (so that it is still counted) and
%%% deregister the callback.
\def\stopwordcount{%
  \endgraf %% force paragraph
  \directlua{
    luatexbase.remove_from_callback(
      "pre_linebreak_filter",
      "word_count"
    )
  }%
}

%%% This outputs the word count to stdout.
\def\dumpwordcount{%
  \directlua{packagedata.word_count.dump_total_word_count()}% no stray space
}

%%% This returns the word count at the current position. Works only at
%%% the end of a paragraph.
\def\currentwordcount{%
  \directlua{packagedata.word_count.current_word_count()}%
}

\setwordthreshold{3}  %%% min chars in a row to count as word
\startwordcount       %%% start callback
\input knuth\par      %%% counted
\currentwordcount     %%% => 94 with threshold == 3
\input knuth          %%% counted
\stopwordcount        %%% deregister callback
\input knuth          %%% not counted
\dumpwordcount        %%% => 188

\bye
