Skip to content

Instantly share code, notes, and snippets.

@cat-in-136
Last active March 19, 2017 14:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cat-in-136/9f0b7c16215fbd113794581456c03433 to your computer and use it in GitHub Desktop.
Save cat-in-136/9f0b7c16215fbd113794581456c03433 to your computer and use it in GitHub Desktop.
ebdumptext: a tool to dump texts of EPWING dictionary.
---
Language: Cpp
# BasedOnStyle: Mozilla
AccessModifierOffset: -2
AlignAfterOpenBracket: Align
AlignConsecutiveAssignments: false
AlignConsecutiveDeclarations: false
AlignEscapedNewlinesLeft: false
AlignOperands: true
AlignTrailingComments: true
AllowAllParametersOfDeclarationOnNextLine: false
AllowShortBlocksOnASingleLine: false
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: Inline
AllowShortIfStatementsOnASingleLine: false
AllowShortLoopsOnASingleLine: false
AlwaysBreakAfterDefinitionReturnType: TopLevel
AlwaysBreakAfterReturnType: TopLevelDefinitions
AlwaysBreakBeforeMultilineStrings: false
AlwaysBreakTemplateDeclarations: true
BinPackArguments: true
BinPackParameters: true
BraceWrapping:
  AfterClass: true
  AfterControlStatement: false
  AfterEnum: true
  AfterFunction: true
  AfterNamespace: false
  AfterObjCDeclaration: false
  AfterStruct: true
  AfterUnion: true
  BeforeCatch: false
  BeforeElse: false
  IndentBraces: false
BreakBeforeBinaryOperators: None
BreakBeforeBraces: Mozilla
BreakBeforeTernaryOperators: true
BreakConstructorInitializersBeforeComma: true
BreakAfterJavaFieldAnnotations: false
BreakStringLiterals: true
ColumnLimit: 80
CommentPragmas: '^ IWYU pragma:'
ConstructorInitializerAllOnOneLineOrOnePerLine: false
ConstructorInitializerIndentWidth: 2
ContinuationIndentWidth: 2
Cpp11BracedListStyle: false
DerivePointerAlignment: false
DisableFormat: false
ExperimentalAutoDetectBinPacking: false
ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ]
IncludeCategories:
  - Regex: '^"(llvm|llvm-c|clang|clang-c)/'
    Priority: 2
  - Regex: '^(<|"(gtest|isl|json)/)'
    Priority: 3
  - Regex: '.*'
    Priority: 1
IncludeIsMainRegex: '$'
IndentCaseLabels: true
IndentWidth: 2
IndentWrappedFunctionNames: false
JavaScriptQuotes: Leave
JavaScriptWrapImports: true
KeepEmptyLinesAtTheStartOfBlocks: true
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCBlockIndentWidth: 2
ObjCSpaceAfterProperty: true
ObjCSpaceBeforeProtocolList: false
PenaltyBreakBeforeFirstCallParameter: 19
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 200
PointerAlignment: Left
ReflowComments: true
SortIncludes: true
SpaceAfterCStyleCast: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 1
SpacesInAngles: false
SpacesInContainerLiterals: true
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
Standard: Cpp11
TabWidth: 8
UseTab: Never
...
ebdumptext
# Created by https://www.gitignore.io/api/vim,emacs,c
### C ###
# Prerequisites
*.d
# Object files
*.o
*.ko
*.obj
*.elf
# Linker output
*.ilk
*.map
*.exp
# Precompiled Headers
*.gch
*.pch
# Libraries
*.lib
*.a
*.la
*.lo
# Shared objects (inc. Windows DLLs)
*.dll
*.so
*.so.*
*.dylib
# Executables
*.exe
*.out
*.app
*.i*86
*.x86_64
*.hex
# Debug files
*.dSYM/
*.su
*.idb
*.pdb
# Kernel Module Compile Results
*.mod*
*.cmd
modules.order
Module.symvers
Mkfile.old
dkms.conf
### Emacs ###
# -*- mode: gitignore; -*-
*~
\#*\#
/.emacs.desktop
/.emacs.desktop.lock
*.elc
auto-save-list
tramp
.\#*
# Org-mode
.org-id-locations
*_archive
# flymake-mode
*_flymake.*
# eshell files
/eshell/history
/eshell/lastdir
# elpa packages
/elpa/
# reftex files
*.rel
# AUCTeX auto folder
/auto/
# cask packages
.cask/
dist/
# Flycheck
flycheck_*.el
# server auth directory
/server/
# projectiles files
.projectile
# directory configuration
.dir-locals.el
### Vim ###
# swap
[._]*.s[a-v][a-z]
[._]*.sw[a-p]
[._]s[a-v][a-z]
[._]sw[a-p]
# session
Session.vim
# temporary
.netrwhist
# auto-generated tag files
tags
# End of https://www.gitignore.io/api/vim,emacs,c
#include <eb/eb.h>
#include <eb/error.h>
#include <eb/text.h>
#include <errno.h>
#include <iconv.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * debug(fmt, ...): printf-style diagnostic written to stderr as
 * "<file>:<line>: <message>\n".
 *
 * When NDEBUG is defined the macro expands to a no-op expression and its
 * arguments are not evaluated.  The helper macros STRINGIFY/TOSTRING/LOCATION
 * exist only in the non-NDEBUG build.
 */
#ifndef NDEBUG
/* Two-level expansion so that __LINE__ is expanded before stringification. */
#define STRINGIFY(x) #x
#define TOSTRING(x) STRINGIFY(x)
#define LOCATION __FILE__ ":" TOSTRING(__LINE__)
#define debug(fmt, ...)                                                        \
  fprintf(stderr, "%s: " fmt "\n", LOCATION, ##__VA_ARGS__)
#else
#define debug(fmt, ...) ((void)0)
#endif /* !NDEBUG */
/*
 * ATTR_UNUSED: marks a declaration (here: hook parameters) as intentionally
 * unused.  Expands to the GNU `unused` attribute when __GNUC__ is defined and
 * to nothing otherwise.
 */
#if defined(__GNUC__)
#define ATTR_UNUSED __attribute__((unused))
#else
#define ATTR_UNUSED /* no equivalent available */
#endif
/*
 * goto_final_if_eb_fail(error_code, msg, extra)
 *
 * If `error_code` is not EB_SUCCESS: report it through print_eb_error(), emit
 * a debug trace, set the caller's `exit_code` to 1 and jump to the caller's
 * `final` label.  The enclosing function must therefore declare
 * `int exit_code` and a `final:` label.
 *
 * The body is wrapped in do { ... } while (0) so the macro behaves as one
 * statement: it needs a trailing ';' and is safe as the unbraced body of an
 * if/else.  `error_code` and `msg` are evaluated more than once, so pass
 * side-effect-free expressions.
 */
#define goto_final_if_eb_fail(error_code, msg, extra)                          \
  do {                                                                         \
    if ((error_code) != EB_SUCCESS) {                                          \
      print_eb_error((error_code), (msg), (extra));                            \
      debug("goto_final_if_eb_fail: %s", (msg));                               \
      exit_code = 1;                                                           \
      goto final;                                                              \
    }                                                                          \
  } while (0)
/* Program name used as the prefix of messages printed to stderr; main()
 * assigns argv[0] to it before anything else runs. */
static const char* progname = NULL;
/**
 * Convert a NUL-terminated string from one character encoding to another
 * using iconv(3).
 *
 * @param from_code  encoding name of `src`, as accepted by iconv_open().
 * @param to_code    encoding name wanted for `dst`.
 * @param src        NUL-terminated input string.
 * @param dst        output buffer.
 * @param dst_length size of `dst` in bytes, including room for the
 *                   terminating NUL.
 * @return the value returned by iconv() on success, or (size_t)-1 when
 *         `dst_length` is 0, the converter cannot be opened, or the
 *         conversion stops early (invalid input or `dst` too small).
 *
 * Whenever `dst_length` is non-zero, `dst` is left NUL-terminated: it holds
 * the complete result on success and whatever was converted before the
 * failure otherwise, so callers may always treat it as a C string.
 */
static size_t
iconv_convert(const char* from_code, const char* to_code, char* src, char* dst,
              size_t dst_length)
{
  iconv_t cd;
  size_t ret;
  char* inbuf = src;
  char* outbuf_p = dst;
  size_t inbytesleft = strlen(src);
  size_t outbytesleft;

  if (dst_length == 0) {
    debug("iconv error: Cannot terminate outbuf as a character string");
    return (size_t)-1;
  }

  /* Keep one byte back for the terminator, so that it always fits. */
  outbytesleft = dst_length - 1;
  dst[0] = '\0';

  cd = iconv_open(to_code, from_code);
  if (cd == (iconv_t)-1) {
    debug("failed iconv_open: to_code=\"%s\", from_code=\"%s\"", to_code,
          from_code);
    return (size_t)-1;
  }

  ret = iconv(cd, &inbuf, &inbytesleft, &outbuf_p, &outbytesleft);
  if (ret == (size_t)-1) {
    switch (errno) {
      case EILSEQ:
        debug("iconv error: Input conversion stopped due to an input byte that "
              "does not belong to the input codeset.");
        break;
      case E2BIG:
        debug("iconv error: Input conversion stopped due to lack of space in "
              "the output buffer.");
        break;
      case EINVAL:
        debug("iconv error: Input conversion stopped due to an incomplete "
              "character or shift sequence at the end of the input buffer.");
        break;
      default:
        debug("iconv error: %s", strerror(errno));
        break;
    }
  }

  /* outbuf_p points just past the last converted byte; the reserved byte
   * guarantees this write stays inside dst. */
  *outbuf_p = '\0';

  iconv_close(cd);
  return ret;
}
/**
 * Format a message printf-style and append it to the text being generated
 * for `book` through eb_write_text_string().
 *
 * The formatted message is limited to BUFSIZ - 1 bytes; longer output is
 * truncated.  Nothing is written when the result is empty or when
 * formatting fails.
 *
 * @param book   book whose text output receives the message.
 * @param format printf-style format string, followed by its arguments.
 */
static void
eb_write_text_string_sprintf(EB_Book* book, const char* format, ...)
{
  char text[BUFSIZ];
  va_list args;
  int length;

  va_start(args, format);
  length = vsnprintf(text, sizeof(text), format, args);
  va_end(args);

  if (length < 0) {
    /* Formatting failed: the buffer contents cannot be trusted. */
    debug("vsnprintf failed for format \"%s\"", format);
    return;
  }
  if ((size_t)length >= sizeof(text)) {
    debug("formatted text truncated to %lu bytes",
          (unsigned long)(sizeof(text) - 1));
  }
  if (length > 0) {
    eb_write_text_string(book, text);
  }
}
/**
 * Text hook registered for EB_HOOK_NEWLINE: writes "<br />" into the text
 * of `book`.
 *
 * @return the status of eb_write_text_string(), so a failure to buffer the
 *         tag reaches the caller instead of being reported as EB_SUCCESS.
 */
static EB_Error_Code
hook_newline(EB_Book* book, ATTR_UNUSED EB_Appendix* appendix,
             ATTR_UNUSED void* container, ATTR_UNUSED EB_Hook_Code code,
             ATTR_UNUSED int argc, ATTR_UNUSED const unsigned int* argv)
{
  return eb_write_text_string(book, "<br />");
}
/**
 * Text hook registered for EB_HOOK_BEGIN_REFERENCE: writes "<reference>"
 * into the text of `book`.
 *
 * The tag is constant, so it is written directly with
 * eb_write_text_string() like the other fixed-tag hooks rather than through
 * the printf-style helper.
 *
 * @return the status of eb_write_text_string().
 */
static EB_Error_Code
hook_begin_reference(EB_Book* book, ATTR_UNUSED EB_Appendix* appendix,
                     ATTR_UNUSED void* container, ATTR_UNUSED EB_Hook_Code code,
                     ATTR_UNUSED int argc, ATTR_UNUSED const unsigned int* argv)
{
  return eb_write_text_string(book, "<reference>");
}
/**
 * Text hook registered for EB_HOOK_END_REFERENCE: closes a reference with a
 * tag that carries argv[1] as `page` and argv[2] as `offset`, both in hex.
 *
 * NOTE(review): argc is not checked; this assumes the library always passes
 * at least three elements in argv for this hook -- confirm against the EB
 * Library hook documentation.
 *
 * @return always EB_SUCCESS.
 */
static EB_Error_Code
hook_end_reference(EB_Book* book, ATTR_UNUSED EB_Appendix* appendix,
                   ATTR_UNUSED void* container, ATTR_UNUSED EB_Hook_Code code,
                   ATTR_UNUSED int argc, const unsigned int* argv)
{
  const unsigned int page = argv[1];
  const unsigned int offset = argv[2];

  eb_write_text_string_sprintf(book, "</reference page=0x%x offset=0x%x>",
                               page, offset);
  return EB_SUCCESS;
}
/**
 * Text hook registered for EB_HOOK_NARROW_FONT: writes a "<gaiji />" tag
 * whose code is argv[0] as four hex digits prefixed with 'h'.
 *
 * @return always EB_SUCCESS.
 */
static EB_Error_Code
hook_narrow_font(EB_Book* book, ATTR_UNUSED EB_Appendix* appendix,
                 ATTR_UNUSED void* container, ATTR_UNUSED EB_Hook_Code code,
                 ATTR_UNUSED int argc, const unsigned int* argv)
{
  const unsigned int glyph = argv[0];

  eb_write_text_string_sprintf(book, "<gaiji code=h%04x />", glyph);
  return EB_SUCCESS;
}
/**
 * Text hook registered for EB_HOOK_WIDE_FONT: writes a "<gaiji />" tag
 * whose code is argv[0] as four hex digits prefixed with 'z'.
 *
 * @return always EB_SUCCESS.
 */
static EB_Error_Code
hook_wide_font(EB_Book* book, ATTR_UNUSED EB_Appendix* appendix,
               ATTR_UNUSED void* container, ATTR_UNUSED EB_Hook_Code code,
               ATTR_UNUSED int argc, const unsigned int* argv)
{
  const unsigned int glyph = argv[0];

  eb_write_text_string_sprintf(book, "<gaiji code=z%04x />", glyph);
  return EB_SUCCESS;
}
/**
 * Text hook registered for EB_HOOK_BEGIN_CANDIDATE: writes "<candidate>"
 * into the text of `book`.
 *
 * @return the status of eb_write_text_string(), so a failure to buffer the
 *         tag reaches the caller instead of being reported as EB_SUCCESS.
 */
static EB_Error_Code
hook_begin_candidate(EB_Book* book, ATTR_UNUSED EB_Appendix* appendix,
                     ATTR_UNUSED void* container, ATTR_UNUSED EB_Hook_Code code,
                     ATTR_UNUSED int argc, ATTR_UNUSED const unsigned int* argv)
{
  return eb_write_text_string(book, "<candidate>");
}
/**
 * Text hook registered for EB_HOOK_END_CANDIDATE_LEAF: writes
 * "</candidate>" into the text of `book`.
 *
 * @return the status of eb_write_text_string(), so a failure to buffer the
 *         tag reaches the caller instead of being reported as EB_SUCCESS.
 */
static EB_Error_Code
hook_end_candidate_leaf(EB_Book* book, ATTR_UNUSED EB_Appendix* appendix,
                        ATTR_UNUSED void* container,
                        ATTR_UNUSED EB_Hook_Code code, ATTR_UNUSED int argc,
                        ATTR_UNUSED const unsigned int* argv)
{
  return eb_write_text_string(book, "</candidate>");
}
/**
 * Text hook registered for EB_HOOK_END_CANDIDATE_GROUP: closes a candidate
 * with a tag that carries argv[1] as `page` and argv[2] as `offset`, both in
 * hex.
 *
 * NOTE(review): argc is not checked; this assumes the library always passes
 * at least three elements in argv for this hook -- confirm against the EB
 * Library hook documentation.
 *
 * @return always EB_SUCCESS.
 */
static EB_Error_Code
hook_end_candidate_group(EB_Book* book, ATTR_UNUSED EB_Appendix* appendix,
                         ATTR_UNUSED void* container,
                         ATTR_UNUSED EB_Hook_Code code, ATTR_UNUSED int argc,
                         const unsigned int* argv)
{
  const unsigned int page = argv[1];
  const unsigned int offset = argv[2];

  eb_write_text_string_sprintf(book, "</candidate page=0x%x offset=0x%x>",
                               page, offset);
  return EB_SUCCESS;
}
/**
 * Text hook registered for EB_HOOK_BEGIN_SUPERSCRIPT: writes "<sup>" into
 * the text of `book`.
 *
 * @return the status of eb_write_text_string(), so a failure to buffer the
 *         tag reaches the caller instead of being reported as EB_SUCCESS.
 */
static EB_Error_Code
hook_begin_superscript(EB_Book* book, ATTR_UNUSED EB_Appendix* appendix,
                       ATTR_UNUSED void* container,
                       ATTR_UNUSED EB_Hook_Code code, ATTR_UNUSED int argc,
                       ATTR_UNUSED const unsigned int* argv)
{
  return eb_write_text_string(book, "<sup>");
}
/**
 * Text hook registered for EB_HOOK_END_SUPERSCRIPT: writes "</sup>" into
 * the text of `book`.
 *
 * @return the status of eb_write_text_string(), so a failure to buffer the
 *         tag reaches the caller instead of being reported as EB_SUCCESS.
 */
static EB_Error_Code
hook_end_superscript(EB_Book* book, ATTR_UNUSED EB_Appendix* appendix,
                     ATTR_UNUSED void* container, ATTR_UNUSED EB_Hook_Code code,
                     ATTR_UNUSED int argc, ATTR_UNUSED const unsigned int* argv)
{
  return eb_write_text_string(book, "</sup>");
}
/**
 * Text hook registered for EB_HOOK_BEGIN_SUBSCRIPT: writes "<sub>" into the
 * text of `book`.
 *
 * @return the status of eb_write_text_string(), so a failure to buffer the
 *         tag reaches the caller instead of being reported as EB_SUCCESS.
 */
static EB_Error_Code
hook_begin_subscript(EB_Book* book, ATTR_UNUSED EB_Appendix* appendix,
                     ATTR_UNUSED void* container, ATTR_UNUSED EB_Hook_Code code,
                     ATTR_UNUSED int argc, ATTR_UNUSED const unsigned int* argv)
{
  return eb_write_text_string(book, "<sub>");
}
/**
 * Text hook registered for EB_HOOK_END_SUBSCRIPT: writes "</sub>" into the
 * text of `book`.
 *
 * @return the status of eb_write_text_string(), so a failure to buffer the
 *         tag reaches the caller instead of being reported as EB_SUCCESS.
 */
static EB_Error_Code
hook_end_subscript(EB_Book* book, ATTR_UNUSED EB_Appendix* appendix,
                   ATTR_UNUSED void* container, ATTR_UNUSED EB_Hook_Code code,
                   ATTR_UNUSED int argc, ATTR_UNUSED const unsigned int* argv)
{
  return eb_write_text_string(book, "</sub>");
}
/**
 * Text hook registered for EB_HOOK_BEGIN_EMPHASIS: writes "<emphasis>" into
 * the text of `book`.
 *
 * @return the status of eb_write_text_string(), so a failure to buffer the
 *         tag reaches the caller instead of being reported as EB_SUCCESS.
 */
static EB_Error_Code
hook_begin_emphasis(EB_Book* book, ATTR_UNUSED EB_Appendix* appendix,
                    ATTR_UNUSED void* container, ATTR_UNUSED EB_Hook_Code code,
                    ATTR_UNUSED int argc, ATTR_UNUSED const unsigned int* argv)
{
  return eb_write_text_string(book, "<emphasis>");
}
/**
 * Text hook registered for EB_HOOK_END_EMPHASIS: writes "</emphasis>" into
 * the text of `book`.
 *
 * @return the status of eb_write_text_string(), so a failure to buffer the
 *         tag reaches the caller instead of being reported as EB_SUCCESS.
 */
static EB_Error_Code
hook_end_emphasis(EB_Book* book, ATTR_UNUSED EB_Appendix* appendix,
                  ATTR_UNUSED void* container, ATTR_UNUSED EB_Hook_Code code,
                  ATTR_UNUSED int argc, ATTR_UNUSED const unsigned int* argv)
{
  return eb_write_text_string(book, "</emphasis>");
}
/**
 * Text hook registered for EB_HOOK_BEGIN_KEYWORD: writes an opening
 * "<keyword>" tag whose `argv1` attribute is argv[1] in hex.
 *
 * NOTE(review): argc is not checked; this assumes the library always passes
 * at least two elements in argv for this hook -- confirm against the EB
 * Library hook documentation.
 *
 * @return always EB_SUCCESS.
 */
static EB_Error_Code
hook_begin_keyword(EB_Book* book, ATTR_UNUSED EB_Appendix* appendix,
                   ATTR_UNUSED void* container, ATTR_UNUSED EB_Hook_Code code,
                   ATTR_UNUSED int argc, const unsigned int* argv)
{
  const unsigned int keyword_arg = argv[1];

  eb_write_text_string_sprintf(book, "<keyword argv1=%x>", keyword_arg);
  return EB_SUCCESS;
}
/**
 * Text hook registered for EB_HOOK_END_KEYWORD: writes "</keyword>" into
 * the text of `book`.
 *
 * @return the status of eb_write_text_string(), so a failure to buffer the
 *         tag reaches the caller instead of being reported as EB_SUCCESS.
 */
static EB_Error_Code
hook_end_keyword(EB_Book* book, ATTR_UNUSED EB_Appendix* appendix,
                 ATTR_UNUSED void* container, ATTR_UNUSED EB_Hook_Code code,
                 ATTR_UNUSED int argc, ATTR_UNUSED const unsigned int* argv)
{
  return eb_write_text_string(book, "</keyword>");
}
/**
 * Report an EB Library failure on stderr.
 *
 * The line has the form "<progname>: <EB error message>, <msg>", with
 * ": <extra_msg>" appended when `extra_msg` is not NULL.
 *
 * @param error_code EB error code, turned into text by eb_error_message().
 * @param msg        description of the operation that failed.
 * @param extra_msg  optional detail (may be NULL).
 */
static void
print_eb_error(EB_Error_Code error_code, const char* msg, const char* extra_msg)
{
  const char* reason = eb_error_message(error_code);

  if (extra_msg == NULL) {
    fprintf(stderr, "%s: %s, %s\n", progname, reason, msg);
    return;
  }
  fprintf(stderr, "%s: %s, %s: %s\n", progname, reason, msg, extra_msg);
}
/**
 * Dump the text of the currently selected subbook of `book` into a file.
 *
 * The subbook title and every chunk of text are converted from EUC-JP to
 * UTF-8.  The output file is named "<output_file>_<title>.txt" and is
 * overwritten if it exists.  A subbook for which eb_text() reports
 * EB_ERR_NO_TEXT or EB_ERR_NO_SUCH_SEARCH is skipped without creating a
 * file.
 *
 * @param book        book with a subbook already selected.
 * @param hookset     hooks applied by eb_read_text() while reading.
 * @param output_file prefix of the output file path.
 * @return 0 on success (including a skipped subbook), 1 on any failure; the
 *         failure is reported on stderr.
 */
static int
process_subbook(EB_Book* book, EB_Hookset* hookset, const char* output_file)
{
  int exit_code = 0;
  EB_Error_Code error_code;
  EB_Position text_position;
  char title[EB_MAX_TITLE_LENGTH + 1] = { 0 };
  char title_conv[EB_MAX_TITLE_LENGTH * 2 + 1] = { 0 };
  char output_file_path[BUFSIZ] = { 0 };
  int output_file_path_length;
  char text[BUFSIZ];
  char text_conv[BUFSIZ * 2 + 1];
  ssize_t text_length;
  FILE* output_fp = NULL;

  error_code = eb_subbook_title(book, title);
  goto_final_if_eb_fail(error_code, "failed to get subbook title", NULL);
  /* title_conv is zero-filled, so it stays a valid string even if the
   * conversion stops early. */
  iconv_convert("euc-jp", "utf-8", title, title_conv, sizeof(title_conv));
  debug("read subbook: %s", title_conv);

  error_code = eb_text(book, &text_position);
  if ((error_code == EB_ERR_NO_TEXT) || (error_code == EB_ERR_NO_SUCH_SEARCH)) {
    debug("skip subbook (no text): %s", title_conv);
    goto final;
  }
  goto_final_if_eb_fail(error_code, "failed to get text information", NULL);

  error_code = eb_seek_text(book, &text_position);
  goto_final_if_eb_fail(error_code, "failed to get seek text", NULL);

  output_file_path_length =
    snprintf(output_file_path, sizeof(output_file_path), "%s_%s.txt",
             output_file, title_conv);
  /* A result >= the buffer size means the path was truncated; writing to a
   * truncated path would silently target the wrong file. */
  if (output_file_path_length < 0 ||
      (size_t)output_file_path_length >= sizeof(output_file_path)) {
    fprintf(stderr, "%s: %s: %s\n", progname,
            "could not generate output file for", title_conv);
    exit_code = 1;
    goto final;
  }

  output_fp = fopen(output_file_path, "w");
  if (output_fp == NULL) {
    fprintf(stderr, "%s: %s: %s\n", progname,
            "could not generate output file", output_file_path);
    exit_code = 1;
    goto final;
  }

  while (1) {
    /* Read one entry in buffer-sized chunks until the library reports that
     * the text has stopped. */
    do {
      /* eb_read_text() stores up to the given number of bytes and then a
       * terminating NUL, so one byte of `text` is kept back for it. */
      error_code = eb_read_text(book, NULL, hookset, NULL, sizeof(text) - 1,
                                text, &text_length);
      goto_final_if_eb_fail(error_code, "failed to get text", NULL);

      /* Zero-fill first so text_conv is a valid string even if the
       * conversion stops part-way. */
      memset(text_conv, 0, sizeof(text_conv));
      if (iconv_convert("euc-jp", "utf-8", text, text_conv,
                        sizeof(text_conv)) == (size_t)-1) {
        debug("failed to convert text to utf-8; output may be incomplete");
      }

      if (fprintf(output_fp, "%s\n", text_conv) < 0) {
        fprintf(stderr, "%s: %s: %s\n", progname,
                "could not write to output file", output_file_path);
        exit_code = 1;
        goto final;
      }
    } while (eb_is_text_stopped(book) == 0);

    error_code = eb_forward_text(book, NULL);
    if (error_code == EB_ERR_END_OF_CONTENT) {
      debug("EB_ERR_END_OF_CONTENT.");
      break;
    }
    goto_final_if_eb_fail(error_code, "failed to forward next", NULL);
  }

final:
  /* fclose() flushes buffered data, so a failure here means lost output. */
  if (output_fp != NULL && fclose(output_fp) != 0) {
    fprintf(stderr, "%s: %s: %s\n", progname, "could not close output file",
            output_file_path);
    exit_code = 1;
  }
  return exit_code;
}
/**
 * Dump every subbook of the book located at `source_dir`.
 *
 * Registers the text hooks, binds the book, then calls process_subbook()
 * for each subbook in turn, stopping at the first failure.
 *
 * @param source_dir  directory passed to eb_bind().
 * @param output_file output path prefix handed to process_subbook().
 * @return 0 on success, 1 on failure; failures are reported on stderr.
 */
static int
process_eb(const char* source_dir, const char* output_file)
{
  int exit_code = 0;
  EB_Hookset hookset;
  const EB_Hook hooks[] = {
    { EB_HOOK_NEWLINE, hook_newline },
    { EB_HOOK_BEGIN_REFERENCE, hook_begin_reference },
    { EB_HOOK_END_REFERENCE, hook_end_reference },
    { EB_HOOK_NARROW_FONT, hook_narrow_font },
    { EB_HOOK_WIDE_FONT, hook_wide_font },
    { EB_HOOK_BEGIN_CANDIDATE, hook_begin_candidate },
    { EB_HOOK_END_CANDIDATE_LEAF, hook_end_candidate_leaf },
    { EB_HOOK_END_CANDIDATE_GROUP, hook_end_candidate_group },
    { EB_HOOK_BEGIN_SUPERSCRIPT, hook_begin_superscript },
    { EB_HOOK_END_SUPERSCRIPT, hook_end_superscript },
    { EB_HOOK_BEGIN_SUBSCRIPT, hook_begin_subscript },
    { EB_HOOK_END_SUBSCRIPT, hook_end_subscript },
    { EB_HOOK_BEGIN_EMPHASIS, hook_begin_emphasis },
    { EB_HOOK_END_EMPHASIS, hook_end_emphasis },
    { EB_HOOK_BEGIN_KEYWORD, hook_begin_keyword },
    { EB_HOOK_END_KEYWORD, hook_end_keyword },
    { EB_HOOK_NULL, NULL },
  };
  EB_Book book;
  EB_Error_Code error_code;
  EB_Subbook_Code sub_codes[EB_MAX_SUBBOOKS];
  int sub_count;
  int i;

  /* Initialize both objects up front so that every failure below can share
   * the single cleanup path at `final`. */
  eb_initialize_book(&book);
  eb_initialize_hookset(&hookset);

  error_code = eb_set_hooks(&hookset, hooks);
  goto_final_if_eb_fail(error_code, "failed to set hooks", NULL);

  error_code = eb_bind(&book, source_dir);
  goto_final_if_eb_fail(error_code, "failed to bind the book", source_dir);

  error_code = eb_subbook_list(&book, sub_codes, &sub_count);
  goto_final_if_eb_fail(error_code, "failed to get subbook list", NULL);

  for (i = 0; i < sub_count; i++) {
    error_code = eb_set_subbook(&book, sub_codes[i]);
    goto_final_if_eb_fail(error_code, "failed to set subbook", NULL);

    exit_code = process_subbook(&book, &hookset, output_file);
    if (exit_code != 0) {
      goto final;
    }
  }

final:
  eb_unset_subbook(&book);
  eb_finalize_book(&book);
  eb_finalize_hookset(&hookset);
  return exit_code;
}
/**
 * Entry point: ebdumptext source-dir output-prefix
 *
 * Initializes the EB Library, dumps every subbook of the book found in
 * source-dir through process_eb(), then finalizes the library.
 *
 * @return 0 on success, 1 on a usage error or any failure.
 */
int
main(int argc, char* argv[])
{
  EB_Error_Code error_code;
  int exit_code = 0;

  /* argv[0] may be missing when the program is exec'ed with an empty
   * argument vector; fall back to a fixed name for messages. */
  progname = (argc > 0 && argv[0] != NULL) ? argv[0] : "ebdumptext";

  if (argc != 3) {
    fprintf(stderr, "Usage: %s source-dir output-prefix\n\n", progname);
    fprintf(stderr,
            "  source-dir: path to top directory of Book (directory which "
            "contains catalogs file.)\n");
    fprintf(stderr,
            "  output-prefix: prefix of the output files. Each subbook is "
            "written to \"<output-prefix>_<subbook title>.txt\" (if exists, "
            "overwrite existing file.)\n");
    return 1;
  }

  error_code = eb_initialize_library();
  if (error_code != EB_SUCCESS) {
    print_eb_error(error_code, "failed to initialize EB Library", argv[1]);
    return 1;
  }

  exit_code = process_eb(argv[1], argv[2]);
  eb_finalize_library();
  return exit_code;
}
# Build configuration for ebdumptext (links against the EB Library).
CC=gcc
CFLAGS=-O3 -g -W -Wall
LIBS=-leb
SRC = main.c
OBJ = $(SRC:%.c=%.o)

all: ebdumptext

# $(RM) is make's built-in "rm -f", so a missing file is not an error.
clean:
	$(RM) $(OBJ) ebdumptext

.PHONY: all clean

# Recipe lines must start with a tab character.
ebdumptext: $(OBJ)
	$(CC) -o $@ $(OBJ) $(LIBS)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment