public
Last active

Word count with LuaTeX

  • Download Gist
word_count.lua
Lua
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168
-- Module setup for the word counter: creates the shared namespace, loads the
-- external tables this file reads from, and caches library functions in locals.

-- Global namespace table shared between packages; reused if it already exists.
packagedata = packagedata or { } -- namespace proposal for packages
-- Module table; `threshold` is the minimum word length that gets counted
-- (compared against the word length in insert_word below).
local word_count = { threshold = 3, }
-- Publish the module table, keeping a previously registered one if present.
packagedata.word_count = packagedata.word_count or word_count
 
-- NOTE(review): kpse.find_file presumably returns nil when the file is not
-- found, in which case dofile would be called with nil -- confirm and guard.
dofile(kpse.find_file"char-def.lua") -- unicode tables
dofile(kpse.find_file"lualibs-table.lua") -- old Context table code
 
local utf = unicode.utf8
local node = node
local type = type
 
-- Localized library functions. `lower` and `texprint` are not referenced
-- anywhere else in the visible part of this file.
local lower, utfchar, utfvalues = string.lower, utf.char, string.utfvalues
local tableconcat, iowrite = table.concat, io.write
local stringformat, texprint = string.format, tex.print
 
-- Running statistics: `total` counts every accepted word, `unique` counts
-- distinct (lowercased) words; both are updated by insert_word.
local collected = { total = 0, unique = 0, }
 
local traverse_nodes = node.traverse
-- Per-codepoint character table; presumably populated by char-def.lua above.
local chardata = characters.data
 
-- Numeric node type ids used to classify nodes in mark_words.
local glyph_code = node.id"glyph"
local disc_code = node.id"disc"
local kern_code = node.id"kern"
-- Kern subtype that mark_words tolerates inside a word.
local kerning_code = 0 -- from font
 
-- Set of character categories treated as word constituents; looked up with
-- `data.category` in mark_words. Presumably the Unicode letter categories as
-- spelled in char-def.lua; table.tohash presumably comes from lualibs-table.lua.
local is_letter = table.tohash { "ll", "lm", "lo", "lt", "lu" }
 
-- Memo table for lcchar: maps an input code to its lowercased UTF-8 string.
-- A plain table, deliberately without a metatable.
local charcache = { }

--- Lowercase a single character.
-- Looks up the `lccode` field of the character's entry in `chardata`; that
-- field may be a single code point or a table of code points. When no
-- lowercase mapping exists, a numeric `code` is converted to its UTF-8 string
-- unchanged and any other value is passed through as-is.
-- @param code code point (number) or an arbitrary non-nil value
-- @return the cached/converted result, or nothing when `code` is nil or false
local lcchar = function(code)
  if not code then
    return
  end

  local cached = charcache[code]
  if cached then
    return cached
  end

  local entry = chardata[code]
  local lc = entry and entry.lccode
  local result
  if lc then
    if type(lc) == "table" then
      -- multi-codepoint lowercase mapping
      result = utfchar(unpack(lc))
    else
      result = utfchar(lc)
    end
  elseif type(code) == "number" then
    result = utfchar(code)
  else
    result = code
  end

  charcache[code] = result
  return result
end
--- Lowercase a UTF-8 string one code point at a time.
-- Each code point yielded by `utfvalues` is mapped through `lcchar` and the
-- pieces are joined back into a single string.
-- @param str UTF-8 encoded string
-- @return the lowercased string
local lowerchar = function (str)
  local pieces = { }
  local count = 0
  for codepoint in utfvalues(str) do
    count = count + 1
    pieces[count] = lcchar(codepoint)
  end
  return tableconcat(pieces)
end
 
--- Walk a node list, assemble words from consecutive letter glyphs and hand
-- each completed word to `whenfound`.
--
-- A word is a run of glyph nodes whose character category is in `is_letter`.
-- Glyphs that carry `components` contribute the characters of all their
-- components. Discretionary nodes and font kerns (kern subtype
-- `kerning_code`) inside a run do not end the word; any other node, or a
-- glyph that is not a letter, does.
--
-- @param head      first node of the list to scan
-- @param whenfound function(word); if it returns a function, that function is
--                  called once for every node recorded as part of the word
-- @return head (unchanged) and `true` if any word was marked, otherwise nil
local function mark_words (head, whenfound)
  local current, done = head, nil
  local str, s = { }, 0 -- characters of the word in progress, and their count
  local nds, n = { }, 0 -- nodes belonging to the word in progress, and count

  -- Finish the word in progress (if any) and reset both buffers. The buffers
  -- are reused; only the counters are reset, hence the explicit upper bound
  -- passed to tableconcat.
  local function action()
    if s > 0 then
      local word = tableconcat(str, "", 1, s)
      local mark = whenfound(word)
      if mark then
        done = true
        for i = 1, n do
          mark(nds[i])
        end
      end
    end
    n, s = 0, 0
  end

  while current do
    local id = current.id
    if id == glyph_code then
      local components = current.components
      if components then
        -- glyph with components: record the glyph once, spell out its parts
        n = n + 1
        nds[n] = current
        for g in traverse_nodes(components) do
          s = s + 1
          str[s] = utfchar(g.char)
        end
      else
        local code = current.char
        local data = chardata[code]
        -- `data` is nil for code points without an entry in the character
        -- table; treat those as non-letters instead of indexing nil.
        if data and is_letter[data.category] then
          n = n + 1
          nds[n] = current
          s = s + 1
          str[s] = utfchar(code)
        elseif s > 0 then
          action()
        end
      end
    elseif id == disc_code then
      -- a discretionary inside a word is recorded as one of the word's nodes
      -- but contributes no characters
      if n > 0 then
        n = n + 1
        nds[n] = current
      end
    elseif id == kern_code and current.subtype == kerning_code and s > 0 then
      -- font kern inside a word: keep collecting
    elseif s > 0 then
      action()
    end
    current = current.next
  end

  -- flush a word that runs up to the end of the list
  if s > 0 then
    action()
  end
  return head, done
end
 
-- Set of lowercased words that have already been counted as unique.
local known = { }

--- Register one word in the statistics.
-- Words shorter than `word_count.threshold` characters are ignored. Accepted
-- words are lowercased; `collected.total` is incremented for every accepted
-- word and `collected.unique` only the first time a given word is seen.
-- Returns nothing, so mark_words never receives a marker function from it.
-- @param str the word as a UTF-8 string
local function insert_word (str)
  -- The threshold is a character count; `#str` would give the UTF-8 byte
  -- length and let short non-ASCII words through, so count code points.
  local length = 0
  for _ in utfvalues(str) do
    length = length + 1
  end

  if length >= word_count.threshold then
    str = lowerchar(str)
    if not known[str] then
      collected.unique = collected.unique + 1
      known[str] = true
    end
    collected.total = collected.total + 1
  end
end
 
--- Node-list callback: scans the list at `head` for words and feeds each one
-- to insert_word. Exported as `word_count.callback`.
-- @param head first node of the list
-- @return both values returned by mark_words (the head and its `done` flag)
local callback = function (head)
  local list_head, done = mark_words(head, insert_word)
  return list_head, done
end

word_count.callback = callback
 
--- Print the number of words counted so far via tex.print.
-- Exported as `word_count.current_word_count`.
local current_count = function ()
  local total = collected.total
  tex.print(total)
end

word_count.current_word_count = current_count
 
--- Change the minimum length a word must have to be counted.
-- A nil or false argument leaves the current threshold untouched.
-- @param n new threshold
word_count.set_threshold = function (n)
  if not n then
    return
  end
  word_count.threshold = n
end
 
-- Format template for the statistics report written by dump_total. The three
-- %d slots are filled, in order, with the threshold, the total word count and
-- the unique word count.
-- NOTE(review): the leading `-A` on the first rule line looks like an encoding
-- artifact (the other two rule lines consist of dots only) -- confirm against
-- the original file before changing the string.
local f_dump = [[
 
-A········································································
Document stats.
········································································
Threshold: %d
Total number of words: %d
Number of unique words: %d
········································································
 
]]
 
--- Write the statistics report (threshold, total and unique word counts) with
-- io.write. Exported as `word_count.dump_total_word_count`.
local dump_total = function ()
  -- debugging aid: print(table.serialize(collected))
  local report = stringformat(
    f_dump,
    word_count.threshold,
    collected.total,
    collected.unique
  )
  iowrite(report)
end

word_count.dump_total_word_count = dump_total
word_count.tex
TeX
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
\input{luatexbase.sty}%%% For the callback wrapper.
\directlua{dofile(kpse.find_file"word_count.lua")}
 
%%% \setwordthreshold{<n>}: expands <n> with \number and passes it to the Lua
%%% function packagedata.word_count.set_threshold.
\def\setwordthreshold#1{%
\directlua{packagedata.word_count.set_threshold(\number#1)}%
}
 
%%% \startwordcount: registers packagedata.word_count.callback in the
%%% "pre_linebreak_filter" callback under the description "word_count".
\def\startwordcount{%
\directlua{
luatexbase.add_to_callback(
"pre_linebreak_filter",
packagedata.word_count.callback,
"word_count"
)
}%
}
 
%%% \stopwordcount: ends the current paragraph first, then removes the
%%% "word_count" entry from the "pre_linebreak_filter" callback again.
\def\stopwordcount{%
\endgraf %% force paragraph
\directlua{
luatexbase.remove_from_callback(
"pre_linebreak_filter",
"word_count"
)
}%
}
 
%%% \dumpwordcount: writes the statistics report (threshold, total and unique
%%% word counts) to stdout via the Lua function dump_total_word_count.
%%% The trailing % keeps the end of line from becoming a space token in the
%%% macro's replacement text, matching the other macros in this file.
\def\dumpwordcount{%
\directlua{packagedata.word_count.dump_total_word_count()}%
}
 
%%% \currentwordcount: inserts the word count at the current position (the Lua
%%% side hands the running total to tex.print). Works only at the end of a
%%% paragraph: the total is updated by the pre_linebreak_filter callback, so
%%% words of a paragraph still being collected are not included yet.
\def\currentwordcount{%
\directlua{packagedata.word_count.current_word_count()}%
}
 
%%% Demo: count the words of two copies of knuth.tex, leave a third copy
%%% uncounted, and print the statistics at the end.
\setwordthreshold{3} %%% min chars in a row to count as word
\startwordcount %%% start callback
\input knuth\par %%% counted
\currentwordcount %%% => 94 with threshold == 3
\input knuth %%% counted
\stopwordcount %%% deregister callback
\input knuth %%% not counted
\dumpwordcount %%% => 188
 
\bye

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.