Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Apple dictionaries
# Thanks to commenters for providing the base of this much nicer implementation!
# Save and run with $ python 0dedict.py
# You may need to hunt down the dictionary files yourself and change the awful path string below.
# This works for me on macOS 10.14 Mojave
from struct import unpack
from zlib import decompress
import re
# Location of the dictionary's Body.data file. The hash-named ".asset"
# directory is specific to this machine, so the path usually has to be edited.
filename = '/System/Library/Assets/com_apple_MobileAsset_DictionaryServices_dictionaryOSX/9f5862030e8f00af171924ebbc23ebfd6e91af78.asset/AssetData/Oxford Dictionary of English.dictionary/Contents/Resources/Body.data'
# Module-level binary handle that gen_entry() reads from.
f = open(filename, 'rb')
def gen_entry(fileobj=None):
    """Yield ``(title, entry)`` for every entry stored in a Body.data file.

    The reader follows this layout:

    * a 4-byte little-endian integer at offset 0x40; adding 0x40 to it gives
      the file offset at which reading stops;
    * starting at offset 0x60, a sequence of blocks, each a 4-byte size
      followed by that many bytes, of which the first 8 are skipped and the
      remainder is a zlib stream;
    * each decompressed block is a sequence of records, each a 4-byte length
      followed by that many bytes of entry markup.

    Args:
        fileobj: Seekable binary file object to read from.  Defaults to the
            module-level handle ``f``.

    Yields:
        ``(title, entry)`` where ``entry`` is the record decoded as UTF-8 text
        and ``title`` is the value of its first ``d:title="..."`` attribute.
    """
    src = f if fileobj is None else fileobj

    src.seek(0x40)
    # '<i' pins the byte order so the result does not depend on the host CPU.
    limit = 0x40 + unpack('<i', src.read(4))[0]
    src.seek(0x60)
    while src.tell() < limit:
        sz, = unpack('<i', src.read(4))
        # Skip the 8 bytes that precede the zlib stream inside the block.
        buf = decompress(src.read(sz)[8:])

        pos = 0
        while pos < len(buf):
            chunksize, = unpack('<i', buf[pos:pos + 4])
            pos += 4

            # Decode before matching: on Python 3 a str pattern cannot be
            # applied to a bytes object.
            entry = buf[pos:pos + chunksize].decode('utf-8')
            title = re.search('d:title="(.*?)"', entry).group(1)
            yield title, entry

            pos += chunksize
# Demo driver: print the title of every entry, one per line.
for word, definition in gen_entry():
    print(word)
// *** Old code - not needed given the python code above
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include "zlib.h"
#define CHUNK 16384
/*
40 Length of the zlib stream
4c 0020
54 0275 number of blocks
60 808c pointer to the next block
64 8088 length of the first block
68 047a4a length of the unpacked block
6c start of the zlib stream
80fc second block
13cd134
13cd174
*/
/*
 * Inflate the zlib stream held in in[0..len) and write the decompressed
 * bytes to stdout, CHUNK bytes at a time.  After the data, the total number
 * of bytes produced is printed to stdout as a 6-digit hex line.
 *
 * Returns Z_OK when the stream ended cleanly, Z_DATA_ERROR for empty,
 * corrupt or incomplete input, Z_MEM_ERROR when zlib runs out of memory,
 * Z_ERRNO when writing to stdout fails, or the inflateInit() error code.
 */
int unpack(unsigned char *in, int len)
{
    int ret, outed = 0;
    unsigned have;
    z_stream strm;
    unsigned char out[CHUNK];

    /* A non-positive length would become a huge unsigned avail_in below. */
    if (in == NULL || len <= 0)
        return Z_DATA_ERROR;

    strm.zalloc = Z_NULL;
    strm.zfree = Z_NULL;
    strm.opaque = Z_NULL;
    strm.avail_in = 0;
    strm.next_in = Z_NULL;
    ret = inflateInit(&strm);
    if (ret != Z_OK)
        return ret;

    strm.avail_in = len;
    strm.next_in = in;
    do {
        strm.avail_out = CHUNK;
        strm.next_out = out;
        ret = inflate(&strm, Z_NO_FLUSH);
        assert(ret != Z_STREAM_ERROR); /* state not clobbered */
        switch (ret) {
        case Z_NEED_DICT:
            ret = Z_DATA_ERROR; /* and fall through */
        case Z_DATA_ERROR:
        case Z_MEM_ERROR:
            (void)inflateEnd(&strm);
            return ret;
        }
        have = CHUNK - strm.avail_out;
        /* Diagnostic kept from the original: fires when this chunk holds
         * 3 bytes or fewer. */
        if (have <= 3) {
            fprintf(stderr, "could not find entry\n");
        }
        /* Count items of size 1 so that an empty final chunk (have == 0)
         * is not mistaken for a write error. */
        if (fwrite(out, 1, have, stdout) != have || ferror(stdout)) {
            (void)inflateEnd(&strm);
            return Z_ERRNO;
        }
        outed += have;
    } while (strm.avail_out == 0); /* a full buffer means more output may follow */

    printf("%06x\n", outed);
    (void)inflateEnd(&strm);
    return ret == Z_STREAM_END ? Z_OK : Z_DATA_ERROR;
}
/* Path of the Body.data file being read; filled in by main(). */
char filename[256];

/*
 * Dump the decompressed contents of an installed dictionary to stdout.
 *
 * argv[1] is the dictionary name; the file read is
 * /Library/Dictionaries/<name>.dictionary/Contents/Body.data.
 * The 4-byte value at offset 0x40 plus 0x40 gives the offset at which to
 * stop; blocks start at 0x60, each a 4-byte length followed by that many
 * bytes, of which the first 8 are skipped before handing the rest to
 * unpack().
 *
 * Returns EXIT_SUCCESS when every block was read, EXIT_FAILURE on a usage
 * error, an unreadable file, or a malformed/truncated block.
 */
int main(int argc,char **argv) {
    FILE *fin;
    int limit, blen = 0, p, l = 0, bcnt = 0, n;
    int status = EXIT_SUCCESS;
    unsigned char *buf = NULL;

    if (argc < 2) {
        fprintf(stderr, "usage: dedict <dictionary name>\n");
        return EXIT_FAILURE;
    }

    /* Bounded formatting: a long name must not overflow filename[]. */
    n = snprintf(filename, sizeof filename,
                 "/Library/Dictionaries/%s.dictionary/Contents/Body.data", argv[1]);
    if (n < 0 || (size_t)n >= sizeof filename) {
        fprintf(stderr, "dictionary name too long: %s\n", argv[1]);
        return EXIT_FAILURE;
    }

    fin = fopen(filename, "rb");
    if (fin == NULL) {
        perror(filename);
        return EXIT_FAILURE;
    }

    if (fseek(fin, 0x40, SEEK_SET) != 0 || fread(&l, 1, 4, fin) != 4) {
        fprintf(stderr, "%s: cannot read header\n", filename);
        fclose(fin);
        return EXIT_FAILURE;
    }
    limit = 0x40 + l;
    p = 0x60;

    do {
        if (fseek(fin, p, SEEK_SET) != 0 || fread(&l, 1, 4, fin) != 4) {
            fprintf(stderr, "block %d at 0x%x: cannot read length\n", bcnt, p);
            status = EXIT_FAILURE;
            break;
        }
        /* A block must be longer than the 8 bytes skipped before the
         * zlib stream. */
        if (l <= 8) {
            fprintf(stderr, "block %d at 0x%x: bad length %d\n", bcnt, p, l);
            status = EXIT_FAILURE;
            break;
        }
        /* Grow the reusable buffer only when this block does not fit. */
        if (blen < l) {
            free(buf);
            blen = 0;
            buf = (unsigned char *)malloc(l);
            if (buf == NULL) {
                fprintf(stderr, "block %d at 0x%x: out of memory (%d bytes)\n", bcnt, p, l);
                status = EXIT_FAILURE;
                break;
            }
            blen = l;
        }
        if (fread(buf, 1, l, fin) != (size_t)l) {
            fprintf(stderr, "block %d at 0x%x: truncated\n", bcnt, p);
            status = EXIT_FAILURE;
            break;
        }
        /* unpack() returns 0 (Z_OK) on success; keep going on failure, as
         * the original did, but say so. */
        if (unpack(buf + 8, l - 8) != 0) {
            fprintf(stderr, "block %d at 0x%x: decompression failed\n", bcnt, p);
        }
        p += 4 + l;
        ++bcnt;
    } while (p < limit);

    free(buf);
    fclose(fin);
    return status;
}
// This program strips the first 4 characters from each line in the input
#include <stdio.h>
/*
 * Copy stdin to stdout, dropping the first 4 bytes of every line.
 * Lines of 4 bytes or fewer (terminator included) produce no output.
 */
int main() {
    size_t n;
    char *row;

    for (;;) {
        if (ferror(stdin) || feof(stdin))
            break;
        n = 0;
        row = fgetln(stdin, &n);
        if (row == NULL)
            break;
        if (n <= 4)
            continue;
        fwrite(row + 4, 1, n - 4, stdout);
    }
    return 0;
}
@blasut

This comment has been minimized.

Copy link

commented Apr 4, 2016

If you are doing this on El Capitan the binary file is under: /Library/Dictionaries/%s.dictionary/Contents/Resources/Body.data

@soshial

This comment has been minimized.

Copy link

commented Aug 5, 2016

Does anyone know why this script gives Segmentation fault 11 on this dictionary (it's easily downloadable over a magnet link)?

@zuxfoucault

This comment has been minimized.

Copy link

commented Oct 7, 2016

Does anyone know where the Dictionary binary is located on macOS Sierra?
I can't find them in /Library/Dictionaries

update:
I found them in /System/Library/Assets/com_apple_MobileAsset_DictionaryServices_dictionaryOSX

@korakot

This comment has been minimized.

Copy link

commented Jun 22, 2017

Python version

from struct import unpack
from zlib import decompress
import re
# Location of the dictionary's Body.data file; the hash-named ".asset"
# directory is specific to this machine.
filename = '/System/Library/Assets/com_apple_MobileAsset_DictionaryServices_dictionaryOSX/558f4da14294a6eb6203c03e6bb582f0042eab9e.asset/AssetData/Thai.dictionary/Contents/Resources/Body.data'
# Module-level binary handle read by the generator functions below.
f = open(filename, 'rb')

After library import, the core code is only 8 lines.

def gen_chunk(fileobj=None):
    """Yield each decompressed block of a Body.data file as ``bytes``.

    Reading stops at 0x40 plus the 4-byte little-endian value stored at
    offset 0x40.  Blocks start at offset 0x60; each is a 4-byte size followed
    by that many bytes, of which the first 8 are skipped and the rest is a
    zlib stream.  In the decompressed data, any 4 bytes that sit at the start
    of a line directly before a ``<`` are removed.

    Args:
        fileobj: Seekable binary file object to read from.  Defaults to the
            module-level handle ``f``.

    Yields:
        The decompressed, prefix-stripped content of one block.
    """
    src = f if fileobj is None else fileobj

    src.seek(0x40)
    limit = 0x40 + unpack('<i', src.read(4))[0]
    src.seek(0x60)
    while src.tell() < limit:
        sz, = unpack('<i', src.read(4))
        buf = decompress(src.read(sz)[8:])
        # The "s" flag lets "." match 0x0A as well; without it a prefix
        # containing a newline byte (e.g. a length of 266) is left in place.
        yield re.sub(b'(?ms)^....<', b'<', buf)

You can call it and save output to a file:

# Stream every decompressed chunk into dict.xml.
with open('dict.xml', 'wb') as fo:
    fo.writelines(gen_chunk())
@korakot

This comment has been minimized.

Copy link

commented Jun 23, 2017

Or for those who want to iterate 1 word at a time

def gen_entry(fileobj=None):
    """Yield ``(title, entry)`` for every ``<d:entry`` line in a Body.data file.

    Reading stops at 0x40 plus the 4-byte little-endian value stored at
    offset 0x40.  Blocks start at offset 0x60; each is a 4-byte size followed
    by that many bytes, of which the first 8 are skipped and the rest is a
    zlib stream.  Inside each decompressed block, every run of bytes from
    ``<d:entry`` up to (not including) the next newline is one entry.

    Args:
        fileobj: Seekable binary file object to read from.  Defaults to the
            module-level handle ``f``.

    Yields:
        ``(title, entry)`` where ``entry`` is the matched markup decoded as
        UTF-8 and ``title`` is the value of its first ``d:title="..."``
        attribute.
    """
    src = f if fileobj is None else fileobj

    src.seek(0x40)
    # '<i' pins the byte order so the result does not depend on the host CPU.
    limit = 0x40 + unpack('<i', src.read(4))[0]
    src.seek(0x60)
    while src.tell() < limit:
        sz, = unpack('<i', src.read(4))
        buf = decompress(src.read(sz)[8:])
        for m in re.finditer(b'<d:entry[^\n]+', buf):
            entry = m.group().decode('utf-8')
            title = re.search('d:title="(.*?)"', entry).group(1)
            yield title, entry

To use it

for word, definition in gen_entry():
    # extract info from definition
    pass  # placeholder so the example parses; replace with real processing
@soshial

This comment has been minimized.

Copy link

commented Jul 7, 2017

Python script worked, thank you so much, I would never figure out this C code! Although some dictionaries have different start of the first pointer: instead of f.seek(0x60) I had to f.seek(0x44).

@EnzioChen

This comment has been minimized.

Copy link

commented Apr 17, 2018

This does not work on the latest version of macOS (10.13.4).

@RoadToDream

This comment has been minimized.

Copy link

commented Jun 2, 2018

Of course it still can work, @EnzioChen, Apple has not worked on it since 2011.

@josephg

This comment has been minimized.

Copy link
Owner Author

commented Oct 26, 2018

I ran into some problems using the Python code above on Mojave (10.14). I made some tweaks - this works for me:

from struct import unpack
from zlib import decompress
import re
# Location of the dictionary's Body.data file; the hash-named ".asset"
# directory is specific to this machine.
filename = '/System/Library/Assets/com_apple_MobileAsset_DictionaryServices_dictionaryOSX/9f5862030e8f00af171924ebbc23ebfd6e91af78.asset/AssetData/Oxford Dictionary of English.dictionary/Contents/Resources/Body.data'
# Module-level binary handle that gen_entry() reads from.
f = open(filename, 'rb')

def gen_entry(fileobj=None):
    """Yield ``(title, entry)`` for every length-prefixed entry in Body.data.

    Reading stops at 0x40 plus the 4-byte little-endian value stored at
    offset 0x40.  Blocks start at offset 0x60; each is a 4-byte size followed
    by that many bytes, of which the first 8 are skipped and the rest is a
    zlib stream.  A decompressed block is a sequence of records, each a
    4-byte length followed by that many bytes of entry markup.

    Args:
        fileobj: Seekable binary file object to read from.  Defaults to the
            module-level handle ``f``.

    Yields:
        ``(title, entry)`` where ``entry`` is the record decoded as UTF-8 text
        and ``title`` is the value of its first ``d:title="..."`` attribute.
    """
    src = f if fileobj is None else fileobj

    src.seek(0x40)
    # '<i' pins the byte order so the result does not depend on the host CPU.
    limit = 0x40 + unpack('<i', src.read(4))[0]
    src.seek(0x60)
    while src.tell() < limit:
        sz, = unpack('<i', src.read(4))
        buf = decompress(src.read(sz)[8:])

        pos = 0
        while pos < len(buf):
            chunksize, = unpack('<i', buf[pos:pos + 4])
            pos += 4

            # Decode before matching: on Python 3 a str pattern cannot be
            # applied to a bytes object.
            entry = buf[pos:pos + chunksize].decode('utf-8')
            title = re.search('d:title="(.*?)"', entry).group(1)
            yield title, entry

            pos += chunksize

# Demo driver: print the title of every entry, one per line.
for word, definition in gen_entry():
    print(word)

Thanks for the code base @korakot

@swadevs

This comment has been minimized.

Copy link

commented Nov 1, 2018

@josephg @korakot The Python program above provides the list of words but not their derivatives. For example, it has the word 'walk' but not words such as 'walks', 'walking', 'walked', etc. Is it possible to get those words as well somehow?
Do you know what the other data files (e.g. EntryID.data, KeyText.data) contain? Maybe they have such a mapping!

@wyatttu

This comment has been minimized.

Copy link

commented Nov 13, 2018

Following

@peheje

This comment has been minimized.

Copy link

commented May 28, 2019

I get the error: "cannot use a string pattern on a bytes-like object" on the line:
title = re.search('d:title="(.*?)"', entry).group(1)

Which version of python are you running?
Also would love to see derivates like @swadevs mentioned

@atylmo

This comment has been minimized.

Copy link

commented Jul 8, 2019

@peheje I got it to work with the built-in Python 2 on Mac. Python 3 gives me the same error as you.

@ctrngk

This comment has been minimized.

Copy link

commented Jul 21, 2019

Other than Body.data, there are two more *.data files used for indexing. For example, when you type something similar, it will be redirected to the right title. Is anybody able to extract them?

I tried to use the same code, but it does not work for the other *.data files.

@vinniec

This comment has been minimized.

Copy link

commented Aug 17, 2019

Today I tried this program because I wanted to convert an apple dictionary to another format but starting the script 0dedict.py I get this error:

$ python3 ./0dedict.py 
Traceback (most recent call last):
  File "./0dedict.py", line 33, in <module>
    for word, definition in gen_entry():
  File "./0dedict.py", line 20, in gen_entry
    buf = decompress(f.read(sz)[8:])
MemoryError

The same also with python2.
This is the source of dictionary: http://rssmac.altervista.org/download/files/italian.dictionary.zip

@vinniec

This comment has been minimized.

Copy link

commented Aug 17, 2019

I tried to go through the code a little bit to see if I could figure out where the mistake was.
It seems to me that the error occurs at line 25 18, in the instruction "f.read(sz)".
In this case I think that sz has a value too large (1919866155, sorry, I do not know the unit of measurement but I think they are bytes and so I think it is a wrong value, if I have calculated correctly I'm asking him to read 1.8gb but the file Body.data is large 34mb).
Is it possible that the dictionary is not well formed?

@vinniec

This comment has been minimized.

Copy link

commented Aug 18, 2019

After a lot of test I can say that maybe the dictionary that interests me is damaged, not well formed or the dictionaries can have different forms.
However to your script on line 25 I would postpend .decode('utf-8') and I would remove line 23

@ilius

This comment has been minimized.

Copy link

commented Aug 19, 2019

Any ideas how we can get the number of entries at the beginning, without reading the whole file?

@vinniec

This comment has been minimized.

Copy link

commented Aug 22, 2019

Based on the structure of the body.data file that I deduced from this script this information is not there, but I think that this information could be in the other files (one of these EntryID.data, EntryID.index, KeyText.data, KeyText.index)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.