A tool to dump the text of EPWING dictionaries.
Required libraries:
-
eb library http://www.sra.co.jp/people/m-kasahr/eb/
-
iconv (glibc common)
% make
% ./ebdumptext /run/media/cdrom/ path/to/output
--- | |
Language: Cpp | |
# BasedOnStyle: Mozilla | |
AccessModifierOffset: -2 | |
AlignAfterOpenBracket: Align | |
AlignConsecutiveAssignments: false | |
AlignConsecutiveDeclarations: false | |
AlignEscapedNewlinesLeft: false | |
AlignOperands: true | |
AlignTrailingComments: true | |
AllowAllParametersOfDeclarationOnNextLine: false | |
AllowShortBlocksOnASingleLine: false | |
AllowShortCaseLabelsOnASingleLine: false | |
AllowShortFunctionsOnASingleLine: Inline | |
AllowShortIfStatementsOnASingleLine: false | |
AllowShortLoopsOnASingleLine: false | |
AlwaysBreakAfterDefinitionReturnType: TopLevel | |
AlwaysBreakAfterReturnType: TopLevelDefinitions | |
AlwaysBreakBeforeMultilineStrings: false | |
AlwaysBreakTemplateDeclarations: true | |
BinPackArguments: true | |
BinPackParameters: true | |
BraceWrapping: | |
AfterClass: true | |
AfterControlStatement: false | |
AfterEnum: true | |
AfterFunction: true | |
AfterNamespace: false | |
AfterObjCDeclaration: false | |
AfterStruct: true | |
AfterUnion: true | |
BeforeCatch: false | |
BeforeElse: false | |
IndentBraces: false | |
BreakBeforeBinaryOperators: None | |
BreakBeforeBraces: Mozilla | |
BreakBeforeTernaryOperators: true | |
BreakConstructorInitializersBeforeComma: true | |
BreakAfterJavaFieldAnnotations: false | |
BreakStringLiterals: true | |
ColumnLimit: 80 | |
CommentPragmas: '^ IWYU pragma:' | |
ConstructorInitializerAllOnOneLineOrOnePerLine: false | |
ConstructorInitializerIndentWidth: 2 | |
ContinuationIndentWidth: 2 | |
Cpp11BracedListStyle: false | |
DerivePointerAlignment: false | |
DisableFormat: false | |
ExperimentalAutoDetectBinPacking: false | |
ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ] | |
IncludeCategories: | |
- Regex: '^"(llvm|llvm-c|clang|clang-c)/' | |
Priority: 2 | |
- Regex: '^(<|"(gtest|isl|json)/)' | |
Priority: 3 | |
- Regex: '.*' | |
Priority: 1 | |
IncludeIsMainRegex: '$' | |
IndentCaseLabels: true | |
IndentWidth: 2 | |
IndentWrappedFunctionNames: false | |
JavaScriptQuotes: Leave | |
JavaScriptWrapImports: true | |
KeepEmptyLinesAtTheStartOfBlocks: true | |
MacroBlockBegin: '' | |
MacroBlockEnd: '' | |
MaxEmptyLinesToKeep: 1 | |
NamespaceIndentation: None | |
ObjCBlockIndentWidth: 2 | |
ObjCSpaceAfterProperty: true | |
ObjCSpaceBeforeProtocolList: false | |
PenaltyBreakBeforeFirstCallParameter: 19 | |
PenaltyBreakComment: 300 | |
PenaltyBreakFirstLessLess: 120 | |
PenaltyBreakString: 1000 | |
PenaltyExcessCharacter: 1000000 | |
PenaltyReturnTypeOnItsOwnLine: 200 | |
PointerAlignment: Left | |
ReflowComments: true | |
SortIncludes: true | |
SpaceAfterCStyleCast: false | |
SpaceBeforeAssignmentOperators: true | |
SpaceBeforeParens: ControlStatements | |
SpaceInEmptyParentheses: false | |
SpacesBeforeTrailingComments: 1 | |
SpacesInAngles: false | |
SpacesInContainerLiterals: true | |
SpacesInCStyleCastParentheses: false | |
SpacesInParentheses: false | |
SpacesInSquareBrackets: false | |
Standard: Cpp11 | |
TabWidth: 8 | |
UseTab: Never | |
... | |
ebdumptext | |
# Created by https://www.gitignore.io/api/vim,emacs,c | |
### C ### | |
# Prerequisites | |
*.d | |
# Object files | |
*.o | |
*.ko | |
*.obj | |
*.elf | |
# Linker output | |
*.ilk | |
*.map | |
*.exp | |
# Precompiled Headers | |
*.gch | |
*.pch | |
# Libraries | |
*.lib | |
*.a | |
*.la | |
*.lo | |
# Shared objects (inc. Windows DLLs) | |
*.dll | |
*.so | |
*.so.* | |
*.dylib | |
# Executables | |
*.exe | |
*.out | |
*.app | |
*.i*86 | |
*.x86_64 | |
*.hex | |
# Debug files | |
*.dSYM/ | |
*.su | |
*.idb | |
*.pdb | |
# Kernel Module Compile Results | |
*.mod* | |
*.cmd | |
modules.order | |
Module.symvers | |
Mkfile.old | |
dkms.conf | |
### Emacs ### | |
# -*- mode: gitignore; -*- | |
*~ | |
\#*\# | |
/.emacs.desktop | |
/.emacs.desktop.lock | |
*.elc | |
auto-save-list | |
tramp | |
.\#* | |
# Org-mode | |
.org-id-locations | |
*_archive | |
# flymake-mode | |
*_flymake.* | |
# eshell files | |
/eshell/history | |
/eshell/lastdir | |
# elpa packages | |
/elpa/ | |
# reftex files | |
*.rel | |
# AUCTeX auto folder | |
/auto/ | |
# cask packages | |
.cask/ | |
dist/ | |
# Flycheck | |
flycheck_*.el | |
# server auth directory | |
/server/ | |
# projectiles files | |
.projectile | |
# directory configuration | |
.dir-locals.el | |
### Vim ### | |
# swap | |
[._]*.s[a-v][a-z] | |
[._]*.sw[a-p] | |
[._]s[a-v][a-z] | |
[._]sw[a-p] | |
# session | |
Session.vim | |
# temporary | |
.netrwhist | |
# auto-generated tag files | |
tags | |
# End of https://www.gitignore.io/api/vim,emacs,c |
A tool to dump the text of EPWING dictionaries.
Required libraries:
eb library http://www.sra.co.jp/people/m-kasahr/eb/
iconv (glibc common)
% make
% ./ebdumptext /run/media/cdrom/ path/to/output
#include <eb/eb.h>
#include <eb/error.h>
#include <eb/text.h>
#include <errno.h>
#include <iconv.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Debug tracing.
 *
 * debug(fmt, ...) prints "<file>:<line>: <message>" followed by a newline to
 * stderr. When NDEBUG is defined every debug() call expands to a no-op.
 * Calling debug() with a format string only relies on the `##__VA_ARGS__`
 * comma-swallowing extension.
 */
#ifndef NDEBUG
#define STRINGIFY(x) #x
#define TOSTRING(x) STRINGIFY(x)
#define LOCATION __FILE__ ":" TOSTRING(__LINE__)
#define debug(fmt, ...)                                                        \
  fprintf(stderr, "%s: " fmt "\n", LOCATION, ##__VA_ARGS__)
#else
#define debug(fmt, ...) ((void)0)
#endif /* !NDEBUG */
/*
 * ATTR_UNUSED marks a declaration (here: hook parameters) as intentionally
 * unused. It expands to __attribute__((unused)) when __GNUC__ is defined and
 * to nothing otherwise.
 */
#if !defined(__GNUC__)
#define ATTR_UNUSED
#else
#define ATTR_UNUSED __attribute__((unused))
#endif
/*
 * If `error_code` is not EB_SUCCESS: report it with print_eb_error(), emit a
 * debug trace, set the caller's local `exit_code` to 1 and jump to the
 * caller's `final` label.
 *
 * The enclosing function must therefore declare `int exit_code` and a
 * `final:` label. The body is wrapped in do { } while (0) so that the macro
 * behaves as a single statement (safe inside unbraced if/else); always
 * terminate the invocation with a semicolon.
 */
#define goto_final_if_eb_fail(error_code, msg, extra)                          \
  do {                                                                         \
    if ((error_code) != EB_SUCCESS) {                                          \
      print_eb_error((error_code), (msg), (extra));                            \
      debug("goto_final_if_eb_fail: %s", (msg));                               \
      exit_code = 1;                                                           \
      goto final;                                                              \
    }                                                                          \
  } while (0)
/* Program name used as the prefix of error messages; assigned in main(). */
static const char* progname = NULL;
/*
 * Convert the NUL-terminated string `src` from encoding `from_code` to
 * encoding `to_code` and store the result in `dst`.
 *
 * `dst_length` is the full size of `dst` in bytes. One byte is reserved for
 * the terminating NUL, so whenever `dst_length` is non-zero `dst` holds a
 * valid (possibly empty or partial) string on return, including on failure.
 *
 * Returns the value reported by iconv() on success, or (size_t)-1 when the
 * arguments are unusable, the converter cannot be opened, or the conversion
 * stops early (invalid input, incomplete input, or output buffer too small).
 */
static size_t
iconv_convert(const char* from_code, const char* to_code, char* src, char* dst,
              size_t dst_length)
{
  iconv_t cd;
  size_t ret;
  char* inbuf = src;
  char* outbuf = dst;
  size_t inbytesleft;
  size_t outbytesleft;

  if (dst == NULL || dst_length == 0) {
    debug("iconv error: Cannot terminate outbuf as a character string");
    return (size_t)-1;
  }
  /* Make sure callers never see an unterminated buffer. */
  dst[0] = '\0';
  if (src == NULL) {
    debug("iconv error: no input string");
    return (size_t)-1;
  }

  inbytesleft = strlen(src);
  /* Keep the last byte free for the terminator. */
  outbytesleft = dst_length - 1;

  cd = iconv_open(to_code, from_code);
  if (cd == (iconv_t)-1) {
    debug("failed iconv_open: to_code=\"%s\", from_code=\"%s\"", to_code,
          from_code);
    return (size_t)-1;
  }

  ret = iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
  if (ret == (size_t)-1) {
    switch (errno) {
      case EILSEQ:
        debug("iconv error: Input conversion stopped due to an input byte that "
              "does not belong to the input codeset.");
        break;
      case E2BIG:
        debug("iconv error: Input conversion stopped due to lack of space in "
              "the output buffer.");
        break;
      case EINVAL:
        debug("iconv error: Input conversion stopped due to an incomplete "
              "character or shift sequence at the end of the input buffer.");
        break;
      default:
        debug("iconv error: unexpected errno %d", errno);
        break;
    }
  }

  /* Terminate whatever was converted; the reserved byte guarantees room. */
  *outbuf = '\0';

  iconv_close(cd);
  return ret;
}
static void | |
eb_write_text_string_sprintf(EB_Book* book, const char* format, ...) | |
{ | |
va_list args; | |
char text[BUFSIZ] = { 0 }; | |
va_start(args, format); | |
vsnprintf(text, sizeof(text), format, args); | |
va_end(args); | |
if (strlen(text) > 0) { | |
eb_write_text_string(book, text); | |
} | |
} | |
static EB_Error_Code | |
hook_newline(EB_Book* book, ATTR_UNUSED EB_Appendix* appendix, | |
ATTR_UNUSED void* container, ATTR_UNUSED EB_Hook_Code code, | |
ATTR_UNUSED int argc, ATTR_UNUSED const unsigned int* argv) | |
{ | |
eb_write_text_string(book, "<br />"); | |
return EB_SUCCESS; | |
} | |
static EB_Error_Code | |
hook_begin_reference(EB_Book* book, ATTR_UNUSED EB_Appendix* appendix, | |
ATTR_UNUSED void* container, ATTR_UNUSED EB_Hook_Code code, | |
ATTR_UNUSED int argc, ATTR_UNUSED const unsigned int* argv) | |
{ | |
eb_write_text_string_sprintf(book, "<reference>"); | |
return EB_SUCCESS; | |
} | |
static EB_Error_Code | |
hook_end_reference(EB_Book* book, ATTR_UNUSED EB_Appendix* appendix, | |
ATTR_UNUSED void* container, ATTR_UNUSED EB_Hook_Code code, | |
ATTR_UNUSED int argc, const unsigned int* argv) | |
{ | |
eb_write_text_string_sprintf(book, "</reference page=0x%x offset=0x%x>", | |
argv[1], argv[2]); | |
return EB_SUCCESS; | |
} | |
static EB_Error_Code | |
hook_narrow_font(EB_Book* book, ATTR_UNUSED EB_Appendix* appendix, | |
ATTR_UNUSED void* container, ATTR_UNUSED EB_Hook_Code code, | |
ATTR_UNUSED int argc, const unsigned int* argv) | |
{ | |
eb_write_text_string_sprintf(book, "<gaiji code=h%04x />", argv[0]); | |
return EB_SUCCESS; | |
} | |
static EB_Error_Code | |
hook_wide_font(EB_Book* book, ATTR_UNUSED EB_Appendix* appendix, | |
ATTR_UNUSED void* container, ATTR_UNUSED EB_Hook_Code code, | |
ATTR_UNUSED int argc, const unsigned int* argv) | |
{ | |
eb_write_text_string_sprintf(book, "<gaiji code=z%04x />", argv[0]); | |
return EB_SUCCESS; | |
} | |
static EB_Error_Code | |
hook_begin_candidate(EB_Book* book, ATTR_UNUSED EB_Appendix* appendix, | |
ATTR_UNUSED void* container, ATTR_UNUSED EB_Hook_Code code, | |
ATTR_UNUSED int argc, ATTR_UNUSED const unsigned int* argv) | |
{ | |
eb_write_text_string(book, "<candidate>"); | |
return EB_SUCCESS; | |
} | |
static EB_Error_Code | |
hook_end_candidate_leaf(EB_Book* book, ATTR_UNUSED EB_Appendix* appendix, | |
ATTR_UNUSED void* container, | |
ATTR_UNUSED EB_Hook_Code code, ATTR_UNUSED int argc, | |
ATTR_UNUSED const unsigned int* argv) | |
{ | |
eb_write_text_string(book, "</candidate>"); | |
return EB_SUCCESS; | |
} | |
static EB_Error_Code | |
hook_end_candidate_group(EB_Book* book, ATTR_UNUSED EB_Appendix* appendix, | |
ATTR_UNUSED void* container, | |
ATTR_UNUSED EB_Hook_Code code, ATTR_UNUSED int argc, | |
const unsigned int* argv) | |
{ | |
eb_write_text_string_sprintf(book, "</candidate page=0x%x offset=0x%x>", | |
argv[1], argv[2]); | |
return EB_SUCCESS; | |
} | |
static EB_Error_Code | |
hook_begin_superscript(EB_Book* book, ATTR_UNUSED EB_Appendix* appendix, | |
ATTR_UNUSED void* container, | |
ATTR_UNUSED EB_Hook_Code code, ATTR_UNUSED int argc, | |
ATTR_UNUSED const unsigned int* argv) | |
{ | |
eb_write_text_string(book, "<sup>"); | |
return EB_SUCCESS; | |
} | |
static EB_Error_Code | |
hook_end_superscript(EB_Book* book, ATTR_UNUSED EB_Appendix* appendix, | |
ATTR_UNUSED void* container, ATTR_UNUSED EB_Hook_Code code, | |
ATTR_UNUSED int argc, ATTR_UNUSED const unsigned int* argv) | |
{ | |
eb_write_text_string(book, "</sup>"); | |
return EB_SUCCESS; | |
} | |
static EB_Error_Code | |
hook_begin_subscript(EB_Book* book, ATTR_UNUSED EB_Appendix* appendix, | |
ATTR_UNUSED void* container, ATTR_UNUSED EB_Hook_Code code, | |
ATTR_UNUSED int argc, ATTR_UNUSED const unsigned int* argv) | |
{ | |
eb_write_text_string(book, "<sub>"); | |
return EB_SUCCESS; | |
} | |
static EB_Error_Code | |
hook_end_subscript(EB_Book* book, ATTR_UNUSED EB_Appendix* appendix, | |
ATTR_UNUSED void* container, ATTR_UNUSED EB_Hook_Code code, | |
ATTR_UNUSED int argc, ATTR_UNUSED const unsigned int* argv) | |
{ | |
eb_write_text_string(book, "</sub>"); | |
return EB_SUCCESS; | |
} | |
static EB_Error_Code | |
hook_begin_emphasis(EB_Book* book, ATTR_UNUSED EB_Appendix* appendix, | |
ATTR_UNUSED void* container, ATTR_UNUSED EB_Hook_Code code, | |
ATTR_UNUSED int argc, ATTR_UNUSED const unsigned int* argv) | |
{ | |
eb_write_text_string(book, "<emphasis>"); | |
return EB_SUCCESS; | |
} | |
static EB_Error_Code | |
hook_end_emphasis(EB_Book* book, ATTR_UNUSED EB_Appendix* appendix, | |
ATTR_UNUSED void* container, ATTR_UNUSED EB_Hook_Code code, | |
ATTR_UNUSED int argc, ATTR_UNUSED const unsigned int* argv) | |
{ | |
eb_write_text_string(book, "</emphasis>"); | |
return EB_SUCCESS; | |
} | |
static EB_Error_Code | |
hook_begin_keyword(EB_Book* book, ATTR_UNUSED EB_Appendix* appendix, | |
ATTR_UNUSED void* container, ATTR_UNUSED EB_Hook_Code code, | |
ATTR_UNUSED int argc, const unsigned int* argv) | |
{ | |
eb_write_text_string_sprintf(book, "<keyword argv1=%x>", argv[1]); | |
return EB_SUCCESS; | |
} | |
static EB_Error_Code | |
hook_end_keyword(EB_Book* book, ATTR_UNUSED EB_Appendix* appendix, | |
ATTR_UNUSED void* container, ATTR_UNUSED EB_Hook_Code code, | |
ATTR_UNUSED int argc, ATTR_UNUSED const unsigned int* argv) | |
{ | |
eb_write_text_string(book, "</keyword>"); | |
return EB_SUCCESS; | |
} | |
static void | |
print_eb_error(EB_Error_Code error_code, const char* msg, const char* extra_msg) | |
{ | |
if (extra_msg != NULL) { | |
fprintf(stderr, "%s: %s, %s: %s\n", progname, eb_error_message(error_code), | |
msg, extra_msg); | |
} else { | |
fprintf(stderr, "%s: %s, %s\n", progname, eb_error_message(error_code), | |
msg); | |
} | |
} | |
static int | |
process_subbook(EB_Book* book, EB_Hookset* hookset, const char* output_file) | |
{ | |
int exit_code = 0; | |
EB_Error_Code error_code; | |
EB_Position text_position; | |
char title[EB_MAX_TITLE_LENGTH + 1] = { 0 }; | |
char title_conv[EB_MAX_TITLE_LENGTH * 2 + 1] = { 0 }; | |
FILE* output_fp = NULL; | |
error_code = eb_subbook_title(book, title); | |
goto_final_if_eb_fail(error_code, "failed to get subbook title", NULL); | |
iconv_convert("euc-jp", "utf-8", title, title_conv, sizeof(title_conv)); | |
debug("read subbook: %s", title_conv); | |
error_code = eb_text(book, &text_position); | |
if ((error_code != EB_ERR_NO_TEXT) && (error_code != EB_ERR_NO_SUCH_SEARCH)) { | |
char output_file_path[BUFSIZ] = { 0 }; | |
int output_file_path_length; | |
char text[BUFSIZ]; | |
char text_conv[BUFSIZ * 2 + 1]; | |
ssize_t text_length; | |
goto_final_if_eb_fail(error_code, "failed to get text information", NULL); | |
error_code = eb_seek_text(book, &text_position); | |
goto_final_if_eb_fail(error_code, "failed to get seek text", NULL); | |
output_file_path_length = | |
snprintf(output_file_path, sizeof(output_file_path), "%s_%s.txt", | |
output_file, title_conv); | |
if (output_file_path_length < 0) { | |
fprintf(stderr, "%s: %s: %s\n", progname, | |
"could not generate output file for", title_conv); | |
goto final; | |
} | |
output_fp = fopen(output_file_path, "w"); | |
if (output_fp == NULL) { | |
fprintf(stderr, "%s: %s: %s\n", progname, | |
"could not generate output file", output_file_path); | |
goto final; | |
} | |
while (1) { | |
do { | |
error_code = eb_read_text(book, NULL, hookset, NULL, sizeof(text), text, | |
&text_length); | |
goto_final_if_eb_fail(error_code, "failed to get text", NULL); | |
iconv_convert("euc-jp", "utf-8", text, text_conv, sizeof(text_conv)); | |
fprintf(output_fp, "%s\n", text_conv); | |
} while (eb_is_text_stopped(book) == 0); | |
error_code = eb_forward_text(book, NULL); | |
if (error_code == EB_ERR_END_OF_CONTENT) { | |
debug("EB_ERR_END_OF_CONTENT."); | |
break; | |
} else { | |
goto_final_if_eb_fail(error_code, "failed to forward next", NULL); | |
} | |
} | |
} else { | |
debug("skip subbook (no text): %s", title_conv); | |
} | |
final: | |
if (output_fp != NULL) { | |
fclose(output_fp); | |
} | |
return exit_code; | |
} | |
static int | |
process_eb(const char* source_dir, const char* output_file) | |
{ | |
int exit_code = 0; | |
EB_Hookset hookset; | |
const EB_Hook hooks[] = { | |
{ EB_HOOK_NEWLINE, hook_newline }, | |
{ EB_HOOK_BEGIN_REFERENCE, hook_begin_reference }, | |
{ EB_HOOK_END_REFERENCE, hook_end_reference }, | |
{ EB_HOOK_NARROW_FONT, hook_narrow_font }, | |
{ EB_HOOK_WIDE_FONT, hook_wide_font }, | |
{ EB_HOOK_BEGIN_CANDIDATE, hook_begin_candidate }, | |
{ EB_HOOK_END_CANDIDATE_LEAF, hook_end_candidate_leaf }, | |
{ EB_HOOK_END_CANDIDATE_GROUP, hook_end_candidate_group }, | |
{ EB_HOOK_BEGIN_SUPERSCRIPT, hook_begin_superscript }, | |
{ EB_HOOK_END_SUPERSCRIPT, hook_end_superscript }, | |
{ EB_HOOK_BEGIN_SUBSCRIPT, hook_begin_subscript }, | |
{ EB_HOOK_END_SUBSCRIPT, hook_end_subscript }, | |
{ EB_HOOK_BEGIN_EMPHASIS, hook_begin_emphasis }, | |
{ EB_HOOK_END_EMPHASIS, hook_end_emphasis }, | |
{ EB_HOOK_BEGIN_KEYWORD, hook_begin_keyword }, | |
{ EB_HOOK_END_KEYWORD, hook_end_keyword }, | |
{ EB_HOOK_NULL, NULL }, | |
}; | |
EB_Book book; | |
EB_Error_Code error_code; | |
// EB_Character_Code character_code; | |
EB_Subbook_Code sub_codes[EB_MAX_SUBBOOKS]; | |
int sub_count; | |
int i; | |
eb_initialize_hookset(&hookset); | |
error_code = eb_set_hooks(&hookset, hooks); | |
if (error_code != EB_SUCCESS) { | |
exit_code = 1; | |
return exit_code; | |
} | |
eb_initialize_book(&book); | |
error_code = eb_bind(&book, source_dir); | |
goto_final_if_eb_fail(error_code, "failed to bind the book", source_dir); | |
// error_code = eb_character_code(&book, &character_code); | |
// goto_final_if_eb_fail(error_code, "failed to get character code", | |
// NULL); | |
error_code = eb_subbook_list(&book, sub_codes, &sub_count); | |
goto_final_if_eb_fail(error_code, "failed to get subbook list", NULL); | |
for (i = 0; i < sub_count; i++) { | |
error_code = eb_set_subbook(&book, sub_codes[i]); | |
goto_final_if_eb_fail(error_code, "failed to set subbook", NULL); | |
exit_code = process_subbook(&book, &hookset, output_file); | |
if (exit_code != 0) { | |
goto final; | |
} | |
} | |
final: | |
eb_unset_subbook(&book); | |
eb_finalize_book(&book); | |
eb_finalize_hookset(&hookset); | |
return exit_code; | |
} | |
int | |
main(int argc, char* argv[]) | |
{ | |
EB_Error_Code error_code; | |
int exit_code = 0; | |
progname = argv[0]; | |
if (argc != 3) { | |
fprintf(stderr, "Usage: %s source-dir output.txt\n\n", argv[0]); | |
fprintf(stderr, | |
" source-dir: path to top directory of Book (directory which " | |
"contains catalogs file.)\n"); | |
fprintf(stderr, | |
" output.txt: path to output file. (if exists, overwrite existing " | |
"file.)\n"); | |
return 1; | |
} | |
error_code = eb_initialize_library(); | |
if (error_code != EB_SUCCESS) { | |
print_eb_error(error_code, "failed to initialize EB Library", argv[1]); | |
return 1; | |
} | |
exit_code = process_eb(argv[1], argv[2]); | |
eb_finalize_library(); | |
return exit_code; | |
} |
# Build rules for ebdumptext.
#   make        builds the ebdumptext executable
#   make clean  removes the object files and the executable
CC = gcc
CFLAGS = -O3 -g -W -Wall
LIBS = -leb

SRC = main.c
OBJ = $(SRC:%.c=%.o)

all: ebdumptext

# -f keeps `make clean` from failing when there is nothing to remove.
clean:
	rm -f $(OBJ) ebdumptext

.PHONY: all clean

# Objects are built by make's implicit %.c -> %.o rule, which uses $(CFLAGS).
ebdumptext: $(OBJ)
	$(CC) $(LDFLAGS) -o $@ $(OBJ) $(LIBS)