Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jjgod/0d4b6339d761a5423f82 to your computer and use it in GitHub Desktop.
Save jjgod/0d4b6339d761a5423f82 to your computer and use it in GitHub Desktop.
ToUnicode fixes for dvipdfm-x
From 6d8ad63addd9a384c036b6b9a8f6711512534667 Mon Sep 17 00:00:00 2001
From: Jiang Jiang <gzjjgod@gmail.com>
Date: Sun, 3 Aug 2014 19:53:45 +0200
Subject: [PATCH 1/2] Fix ToUnicode stream creation for non-subst glyphs
Non-subst glyphs generated with a Unicode -> CID CMap are now properly
handled by storing the GIDs actually used and matching them against the
CMAP in the font.
Subst glyphs (the ones produced when OpenType features have been applied)
still cannot be found, because they are not directly accessible through CMAP.
---
texk/dvipdfm-x/cff.c | 15 ++++++++-------
texk/dvipdfm-x/cff.h | 1 +
texk/dvipdfm-x/pdfdev.c | 20 +++++++++++++-------
3 files changed, 22 insertions(+), 14 deletions(-)
diff --git a/texk/dvipdfm-x/cff.c b/texk/dvipdfm-x/cff.c
index 596b1cc..f122044 100644
--- a/texk/dvipdfm-x/cff.c
+++ b/texk/dvipdfm-x/cff.c
@@ -1054,23 +1054,24 @@ card16 cff_glyph_lookup (cff_font *cff, const char *glyph)
card16
cff_charsets_lookup (cff_font *cff, card16 cid)
{
- card16 gid = 0;
- cff_charsets *charset;
- card16 i;
-
if (cff->flag & (CHARSETS_ISOADOBE|CHARSETS_EXPERT|CHARSETS_EXPSUB)) {
ERROR("Predefined CFF charsets not supported yet");
} else if (cff->charsets == NULL) {
ERROR("Charsets data not available");
}
+ return cff_charsets_lookup_gid(cff->charsets, cid);
+}
+
+card16 cff_charsets_lookup_gid (cff_charsets *charset, card16 cid)
+{
+ card16 gid = 0;
+ card16 i;
+
if (cid == 0) {
return 0; /* GID 0 (.notdef) */
}
- charset = cff->charsets;
-
- gid = 0;
switch (charset->format) {
case 0:
for (i = 0; i <charset->num_entries; i++) {
diff --git a/texk/dvipdfm-x/cff.h b/texk/dvipdfm-x/cff.h
index 50d019d..4339d02 100644
--- a/texk/dvipdfm-x/cff.h
+++ b/texk/dvipdfm-x/cff.h
@@ -121,6 +121,7 @@ extern card16 cff_glyph_lookup (cff_font *cff, const char *glyph);
extern char* cff_get_glyphname (cff_font *cff, card16 gid);
/* Returns GID of glyph with SID/CID "cid" */
extern card16 cff_charsets_lookup (cff_font *cff, card16 cid);
+extern card16 cff_charsets_lookup_gid (cff_charsets *charset, card16 cid);
extern void cff_release_charsets (cff_charsets *charset);
/* Returns SID or CID */
extern card16 cff_charsets_lookup_inverse (cff_font *cff, card16 gid);
diff --git a/texk/dvipdfm-x/pdfdev.c b/texk/dvipdfm-x/pdfdev.c
index 6c2d1c5..2819a3e 100644
--- a/texk/dvipdfm-x/pdfdev.c
+++ b/texk/dvipdfm-x/pdfdev.c
@@ -1158,19 +1158,25 @@ pdf_dev_set_string (spt_t xpos, spt_t ypos,
length = instr_len;
if (font->format == PDF_FONTTYPE_COMPOSITE) {
- if (real_font->used_glyphs != NULL) {
- for (i = 0; i < length; i += 2)
- add_to_used_chars2(real_font->used_glyphs,
- (unsigned short) (str_ptr[i] << 8)|str_ptr[i+1]);
+ if (real_font->used_glyphs != NULL && ctype == -1) {
+ for (i = 0; i < length; i += 2) {
+ unsigned short gid = (str_ptr[i] << 8) | str_ptr[i + 1];
+ add_to_used_chars2(real_font->used_glyphs, gid);
+ }
}
if (handle_multibyte_string(font, &str_ptr, &length, ctype) < 0) {
ERROR("Error in converting input string...");
return;
}
if (real_font->used_chars != NULL) {
- for (i = 0; i < length; i += 2)
- add_to_used_chars2(real_font->used_chars,
- (unsigned short) (str_ptr[i] << 8)|str_ptr[i+1]);
+ for (i = 0; i < length; i += 2) {
+ unsigned short cid = (str_ptr[i] << 8) | str_ptr[i + 1];
+ if (ctype == 2 && font->cff_charsets) {
+ unsigned short gid = cff_charsets_lookup_gid(font->cff_charsets, cid);
+ add_to_used_chars2(real_font->used_glyphs, gid);
+ }
+ add_to_used_chars2(real_font->used_chars, cid);
+ }
}
} else {
if (real_font->used_chars != NULL) {
--
2.0.0
From 4b309e64bcf23e99e792bdee9600b2b15f74dc0f Mon Sep 17 00:00:00 2001
From: Jiang Jiang <gzjjgod@gmail.com>
Date: Sun, 3 Aug 2014 23:01:20 +0200
Subject: [PATCH 2/2] Fix ToUnicode with reverse CMap lookup
When dvipdfmx is given a regular CMap with entries in the following format:
<unicode codepoint> cid
we can use that data to do a reverse lookup when all we know is the
CIDs that were used.
First, while parsing the specified CMap, we build a reverseMap which
maps each CID to the first Unicode codepoint it corresponds to (this
is pretty much the reverse of CMap_decode()).
Since this CMap has already been parsed by the time we create the
fontmap cache, we can simply pass its cmap_id all the way down to
otf_create_ToUnicode_stream() and retrieve the CMap* from that cmap_id.
Second, we refactor create_ToUnicode_cmap{4, 12}() functions into
one create_ToUnicode_cmap() and combine the common code.
Third, in create_ToUnicode_cmap() we check whether the font is indeed
a CID-keyed CFF font and whether we have a loaded CMap. If both hold,
we use the newly added CMap_reverse_decode() to convert all used CIDs
back to Unicode codepoints. With that mapping information we can
successfully create the ToUnicode PDF stream we need.
---
texk/dvipdfm-x/cmap.c | 23 +++++-
texk/dvipdfm-x/cmap.h | 2 +
texk/dvipdfm-x/cmap_p.h | 1 +
texk/dvipdfm-x/tt_cmap.c | 185 +++++++++++++++++++++++++++--------------------
texk/dvipdfm-x/tt_cmap.h | 5 +-
texk/dvipdfm-x/type0.c | 6 +-
6 files changed, 140 insertions(+), 82 deletions(-)
diff --git a/texk/dvipdfm-x/cmap.c b/texk/dvipdfm-x/cmap.c
index 0629ac1..d5c909c 100644
--- a/texk/dvipdfm-x/cmap.c
+++ b/texk/dvipdfm-x/cmap.c
@@ -109,6 +109,9 @@ CMap_new (void)
cmap->mapData->pos = 0;
cmap->mapData->data = NEW(MEM_ALLOC_SIZE, unsigned char);
+ cmap->reverseMap = NEW(65536, int);
+ memset(cmap->reverseMap, 0, 65536 * sizeof(int));
+
return cmap;
}
@@ -140,6 +143,9 @@ CMap_release (CMap *cmap)
}
}
+ if (cmap->reverseMap)
+ RELEASE(cmap->reverseMap);
+
RELEASE(cmap);
}
@@ -359,6 +365,14 @@ CMap_decode (CMap *cmap,
return count;
}
+int
+CMap_reverse_decode(CMap *cmap, CID cid) { /* Unicode code point recorded for cid, or -1 if none is known */
+ int ch = cmap->reverseMap ? cmap->reverseMap[cid] : -1;
+ if (ch == 0) /* 0 is the zero-fill "no entry" value from CMap_new(): defer to useCMap, else report not found */
+ return cmap->useCMap ? CMap_reverse_decode(cmap->useCMap, cid) : -1;
+ return ch;
+}
+
char *
CMap_get_name (CMap *cmap)
{
@@ -664,7 +678,7 @@ int
CMap_add_cidrange (CMap *cmap,
const unsigned char *srclo, const unsigned char *srchi, int srcdim, CID base)
{
- int c;
+ int i, c, v;
mapDef *cur;
ASSERT(cmap);
@@ -679,6 +693,11 @@ CMap_add_cidrange (CMap *cmap,
if (locate_tbl(&cur, srclo, srcdim) < 0)
return -1;
+ for (v = 0, i = 0; i < srcdim - 1; i++)
+ v = (v << 8) + srclo[i];
+
+ cmap->reverseMap[base] = v;
+
for (c = srclo[srcdim-1]; c <= srchi[srcdim-1]; c++) {
if (cur[c].flag != 0) {
if (!__silent)
@@ -689,6 +708,8 @@ CMap_add_cidrange (CMap *cmap,
cur[c].code = get_mem(cmap, 2);
cur[c].code[0] = base >> 8;
cur[c].code[1] = base & 0xff;
+
+ cmap->reverseMap[base] = (v << 8) + c;
}
if (base >= CID_MAX)
WARN("CID number too large.");
diff --git a/texk/dvipdfm-x/cmap.h b/texk/dvipdfm-x/cmap.h
index 9b0f6fa..75c39bd 100644
--- a/texk/dvipdfm-x/cmap.h
+++ b/texk/dvipdfm-x/cmap.h
@@ -107,6 +107,8 @@ extern long CMap_decode (CMap *cmap,
const unsigned char **inbuf, long *inbytesleft,
unsigned char **outbuf, long *outbytesleft);
+extern int CMap_reverse_decode(CMap *cmap, CID cid);
+
extern void CMap_cache_init (void);
extern CMap *CMap_cache_get (int id);
extern int CMap_cache_find (const char *cmap_name);
diff --git a/texk/dvipdfm-x/cmap_p.h b/texk/dvipdfm-x/cmap_p.h
index 63639d2..7b9828b 100644
--- a/texk/dvipdfm-x/cmap_p.h
+++ b/texk/dvipdfm-x/cmap_p.h
@@ -97,6 +97,7 @@ struct CMap {
int maxBytesOut;
} profile;
+ int *reverseMap;
};
#endif /* _CMAP_P_H_ */
diff --git a/texk/dvipdfm-x/tt_cmap.c b/texk/dvipdfm-x/tt_cmap.c
index 4638b86..c56ddea 100644
--- a/texk/dvipdfm-x/tt_cmap.c
+++ b/texk/dvipdfm-x/tt_cmap.c
@@ -984,27 +984,14 @@ prepare_CIDFont_from_sfnt(sfnt* sfont)
return cffont;
}
-static pdf_obj *
-create_ToUnicode_cmap4 (struct cmap4 *map,
- const char *cmap_name, CMap *cmap_add,
- const char *used_glyphs,
- sfnt *sfont)
+USHORT
+create_ToUnicode_cmap4 (CMap *cmap,
+ struct cmap4 *map,
+ char *used_glyphs,
+ cff_font *cffont)
{
- pdf_obj *stream = NULL;
- CMap *cmap;
USHORT c0, c1, gid, count, ch;
USHORT i, j, d, segCount;
- char used_glyphs_copy[8192];
- cff_font *cffont = prepare_CIDFont_from_sfnt(sfont);
-
- cmap = CMap_new();
- CMap_set_name (cmap, cmap_name);
- CMap_set_wmode(cmap, 0);
- CMap_set_type (cmap, CMAP_TYPE_TO_UNICODE);
- CMap_set_CIDSysInfo(cmap, &CSI_UNICODE);
- CMap_add_codespacerange(cmap, srange_min, srange_max, 2);
-
- memcpy(used_glyphs_copy, used_glyphs, 8192);
segCount = map->segCountX2 / 2;
for (count = 0, i = 0; i < segCount; i++) {
@@ -1022,7 +1009,7 @@ create_ToUnicode_cmap4 (struct cmap4 *map,
gid = (map->glyphIndexArray[j+d] +
map->idDelta[i]) & 0xffff;
}
- if (is_used_char2(used_glyphs_copy, gid)) {
+ if (is_used_char2(used_glyphs, gid)) {
unsigned int cid = cffont ? cff_charsets_lookup_inverse(cffont, gid) : gid;
count++;
@@ -1039,55 +1026,31 @@ create_ToUnicode_cmap4 (struct cmap4 *map,
* mapping of ligatures encoded in PUA in fonts like Linux Libertine
* and old Adobe fonts.
*/
- if (!is_PUA_or_presentation(ch))
- /* Avoid duplicate entry
- * There are problem when two Unicode code is mapped to
- * single glyph...
- */
- used_glyphs_copy[gid/8] &= ~(1 << (7 - (gid % 8)));
- count++;
+ if (!is_PUA_or_presentation(ch)) {
+ /* Avoid duplicate entry
+ * There are problem when two Unicode code is mapped to
+ * single glyph...
+ */
+ used_glyphs[gid/8] &= ~(1 << (7 - (gid % 8)));
+ }
+ count++;
}
}
}
- count += handle_subst_glyphs(cmap, cmap_add, used_glyphs_copy, sfont, cffont);
-
- if (count < 1)
- stream = NULL;
- else {
- stream = CMap_create_stream(cmap, 0);
- }
- CMap_release(cmap);
-
- if (cffont)
- cff_close(cffont);
-
- return stream;
+ return count;
}
-static pdf_obj *
-create_ToUnicode_cmap12 (struct cmap12 *map,
- const char *cmap_name, CMap *cmap_add,
- const char *used_glyphs,
- sfnt *sfont)
+USHORT
+create_ToUnicode_cmap12 (CMap *cmap,
+ struct cmap12 *map,
+ char *used_glyphs,
+ cff_font *cffont)
{
- pdf_obj *stream = NULL;
- CMap *cmap;
- ULONG i, ch;
- USHORT gid, count;
- char used_glyphs_copy[8192];
- cff_font *cffont = prepare_CIDFont_from_sfnt(sfont);
-
- cmap = CMap_new();
- CMap_set_name (cmap, cmap_name);
- CMap_set_wmode(cmap, 0);
- CMap_set_type (cmap, CMAP_TYPE_TO_UNICODE);
- CMap_set_CIDSysInfo(cmap, &CSI_UNICODE);
- CMap_add_codespacerange(cmap, srange_min, srange_max, 2);
+ USHORT i, gid, ch, count = 0;
- memcpy(used_glyphs_copy, used_glyphs, 8192);
- for (count = 0, i = 0; i < map->nGroups; i++) {
+ for (i = 0; i < map->nGroups; i++) {
for (ch = map->groups[i].startCharCode;
ch <= map->groups[i].endCharCode; ch++) {
unsigned char *p;
@@ -1097,7 +1060,7 @@ create_ToUnicode_cmap12 (struct cmap12 *map,
p = wbuf + 2;
d = ch - map->groups[i].startCharCode;
gid = (USHORT) ((map->groups[i].startGlyphID + d) & 0xffff);
- if (is_used_char2(used_glyphs_copy, gid)) {
+ if (is_used_char2(used_glyphs, gid)) {
unsigned int cid = cffont ? cff_charsets_lookup_inverse(cffont, gid) : gid;
count++;
wbuf[0] = (cid >> 8) & 0xff;
@@ -1111,18 +1074,85 @@ create_ToUnicode_cmap12 (struct cmap12 *map,
* mapping of ligatures encoded in PUA in fonts like Linux Libertine
* and old Adobe fonts.
*/
- if (!is_PUA_or_presentation(ch))
- /* Avoid duplicate entry
- * There are problem when two Unicode code is mapped to
- * single glyph...
- */
- used_glyphs_copy[gid/8] &= ~(1 << (7 - (gid % 8)));
+ if (!is_PUA_or_presentation(ch)) {
+ /* Avoid duplicate entry
+ * There are problem when two Unicode code is mapped to
+ * single glyph...
+ */
+ used_glyphs[gid/8] &= ~(1 << (7 - (gid % 8)));
+ }
count++;
}
}
}
- count += handle_subst_glyphs(cmap, cmap_add, used_glyphs_copy, sfont, cffont);
+ return count;
+}
+
+static pdf_obj *
+create_ToUnicode_cmap (tt_cmap *ttcmap,
+ const char *cmap_name,
+ CMap *cmap_add,
+ const char *used_glyphs,
+ sfnt *sfont,
+ CMap *cmap_loaded)
+{
+ pdf_obj *stream = NULL;
+ CMap *cmap;
+ USHORT i, gid, ch, count = 0;
+ char used_glyphs_copy[8192];
+ cff_font *cffont = prepare_CIDFont_from_sfnt(sfont);
+
+ cmap = CMap_new();
+ CMap_set_name (cmap, cmap_name);
+ CMap_set_wmode(cmap, 0);
+ CMap_set_type (cmap, CMAP_TYPE_TO_UNICODE);
+ CMap_set_CIDSysInfo(cmap, &CSI_UNICODE);
+ CMap_add_codespacerange(cmap, srange_min, srange_max, 2);
+
+ if (cmap_loaded && cffont) {
+ for (i = 0; i < 8192; i++) {
+ int j;
+ long len, inbytesleft, outbytesleft;
+ const unsigned char *inbuf;
+ unsigned char *outbuf;
+
+ if (used_glyphs[i] == 0)
+ continue;
+
+ for (j = 0; j < 8; j++) {
+ unsigned int cid;
+ gid = 8 * i + j;
+
+ if (!is_used_char2(used_glyphs, gid))
+ continue;
+
+ cid = cff_charsets_lookup_inverse(cffont, gid);
+ int ch = CMap_reverse_decode(cmap_loaded, cid);
+ if (ch >= 0) {
+ unsigned char *p = wbuf + 2;
+ wbuf[0] = (cid >> 8) & 0xff;
+ wbuf[1] = cid & 0xff;
+ len = UC_sput_UTF16BE((long)ch, &p, wbuf + WBUF_SIZE);
+ CMap_add_bfchar(cmap, wbuf, 2, wbuf + 2, len);
+ count++;
+ }
+ }
+ }
+ } else {
+ memcpy(used_glyphs_copy, used_glyphs, 8192);
+
+ switch (ttcmap->format) {
+ case 4:
+ count = create_ToUnicode_cmap4(cmap, ttcmap->map, used_glyphs_copy, cffont);
+ break;
+ case 12:
+ count = create_ToUnicode_cmap12(cmap, ttcmap->map, used_glyphs_copy, cffont);
+ break;
+ }
+
+ count += handle_subst_glyphs(cmap, cmap_add, used_glyphs_copy, sfont, cffont);
+ }
if (count < 1)
stream = NULL;
@@ -1152,13 +1182,14 @@ static cmap_plat_enc_rec cmap_plat_encs[] = {
pdf_obj *
otf_create_ToUnicode_stream (const char *font_name,
- int ttc_index, /* 0 for non-TTC */
- const char *used_glyphs)
+ int ttc_index, /* 0 for non-TTC */
+ const char *used_glyphs,
+ int cmap_id)
{
pdf_obj *cmap_ref = NULL;
long res_id;
pdf_obj *cmap_obj = NULL;
- CMap *cmap_add;
+ CMap *cmap_add, *cmap_loaded;
int cmap_add_id;
tt_cmap *ttcmap;
char *normalized_font_name;
@@ -1226,6 +1257,8 @@ otf_create_ToUnicode_stream (const char *font_name,
ERROR("Could not read OpenType/TrueType table directory.");
}
+ cmap_loaded = CMap_cache_get(cmap_id);
+
cmap_add_id = CMap_cache_find(cmap_name);
if (cmap_add_id < 0) {
cmap_add = NULL;
@@ -1238,14 +1271,10 @@ otf_create_ToUnicode_stream (const char *font_name,
ttcmap = tt_cmap_read(sfont, cmap_plat_encs[i].platform, cmap_plat_encs[i].encoding);
if (!ttcmap)
continue;
- if (ttcmap->format == 4) {
- cmap_obj = create_ToUnicode_cmap4(ttcmap->map,
- cmap_name, cmap_add, used_glyphs, sfont);
- break;
- }
- if (ttcmap->format == 12) {
- cmap_obj = create_ToUnicode_cmap12(ttcmap->map,
- cmap_name, cmap_add, used_glyphs, sfont);
+
+ if (ttcmap->format == 4 || ttcmap->format == 12) {
+ cmap_obj = create_ToUnicode_cmap(ttcmap, cmap_name, cmap_add, used_glyphs,
+ sfont, cmap_loaded);
break;
}
}
diff --git a/texk/dvipdfm-x/tt_cmap.h b/texk/dvipdfm-x/tt_cmap.h
index 7aefdbf..afc492c 100644
--- a/texk/dvipdfm-x/tt_cmap.h
+++ b/texk/dvipdfm-x/tt_cmap.h
@@ -67,8 +67,9 @@ extern void tt_cmap_release (tt_cmap *cmap);
/* Indirect reference */
extern pdf_obj *otf_create_ToUnicode_stream (const char *map_name,
- int ttc_index,
- const char *used_glyphs);
+ int ttc_index,
+ const char *used_glyphs,
+ int cmap_id);
/* CMap ID */
extern int otf_load_Unicode_CMap (const char *map_name,
int ttc_index,
diff --git a/texk/dvipdfm-x/type0.c b/texk/dvipdfm-x/type0.c
index 0a76898..9015941 100644
--- a/texk/dvipdfm-x/type0.c
+++ b/texk/dvipdfm-x/type0.c
@@ -93,6 +93,7 @@ struct Type0Font {
CIDFont *descendant; /* Only single descendant is allowed. */
int flags;
int wmode;
+ int cmap_id;
/*
* PDF Font Resource
@@ -116,6 +117,7 @@ Type0Font_init_font_struct (Type0Font *font)
font->used_glyphs = NULL;
font->descendant = NULL;
font->wmode = -1;
+ font->cmap_id = -1;
font->flags = FLAG_NONE;
return;
@@ -161,7 +163,8 @@ Type0Font_create_ToUnicode_stream(Type0Font *font) {
return otf_create_ToUnicode_stream(CIDFont_get_ident(cidfont),
CIDFont_get_opt_index(cidfont),
- used);
+ used,
+ font->cmap_id);
}
/* Try to load ToUnicode CMap from file system first, if not found fallback to
@@ -466,6 +469,7 @@ Type0Font_cache_find (const char *map_name, int cmap_id, fontmap_opt *fmap_opt)
strcpy(font->encoding, "Identity-H");
}
font->wmode = wmode;
+ font->cmap_id = cmap_id;
/*
* Now we start font dictionary.
--
2.0.0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment