Skip to content

Instantly share code, notes, and snippets.

Created July 23, 2014 20:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save anonymous/604f079206d7e5986d26 to your computer and use it in GitHub Desktop.
Save anonymous/604f079206d7e5986d26 to your computer and use it in GitHub Desktop.
diff -ruwb hatari-orig/src/gemdos.c hatari/src/gemdos.c
--- hatari-orig/src/gemdos.c 2014-07-23 22:06:55.833808766 +0200
+++ hatari/src/gemdos.c 2014-07-23 20:32:50.628566186 +0200
@@ -1114,7 +1114,11 @@
*/
modified = false;
- /* catch potentially invalid characters */
+ /* Commented out because this is risky: when deleting a file whose name
+ * contains invalid characters, the inserted wildcard '?' may also match
+ * other files, so an arbitrary file may be deleted.
+ *
+ * catch potentially invalid characters.
for (tmp = name; *tmp; tmp++)
{
if (*tmp == INVALID_CHAR)
@@ -1123,6 +1127,7 @@
modified = true;
}
}
+ */
/* catch potentially too long extension */
for (dot = 0; name[dot] && name[dot] != '.'; dot++);
@@ -1229,8 +1234,12 @@
void GemDOS_CreateHardDriveFileName(int Drive, const char *pszFileName,
char *pszDestName, int nDestNameLen)
{
- const char *s, *filename = pszFileName;
+ const char *s, *filename;
int minlen;
+ char pszFileNameHost[FILENAME_MAX];
+
+ Str_AtariToHost(pszFileName, pszFileNameHost, FILENAME_MAX, INVALID_CHAR);
+ filename = pszFileName = pszFileNameHost;
/* Is it a valid hard drive? */
if (Drive < 2)
diff -ruwb hatari-orig/src/includes/str.h hatari/src/includes/str.h
--- hatari-orig/src/includes/str.h 2014-07-23 22:07:06.663809232 +0200
+++ hatari/src/includes/str.h 2014-07-23 20:54:21.319988000 +0200
@@ -31,4 +31,15 @@
extern void Str_Filename2TOSname(const char *src, char *dst);
extern void Str_Dump_Hex_Ascii ( char *p , int Len , int Width , const char *Suffix , FILE *pFile );
+/* Interface of character set conversions.
+ *
+ * "Atari" is the 8-bit AtariST character set. "Host" is either the
+ * character set of the current locale or UTF-8; Str_AtariToHost() and
+ * Str_HostToAtari() pick one of the two at compile time in str.c.
+ *
+ * The functions converting from the AtariST character set take the size
+ * of dest[] in destLen. The functions converting to the AtariST character
+ * set take no destination size, so the caller has to provide a dest[]
+ * which is large enough. replacementChar is stored in place of characters
+ * for which there is no mapping.
+ */
+extern void initCharacterMappings(void);
+extern void Str_AtariToUtf8(const char *source, char *dest, int destLen);
+extern void Str_Utf8ToAtari(const char *source, char *dest, char replacementChar);
+extern void Str_AtariToLocal(const char *source, char *dest, int destLen, char replacementChar);
+extern void Str_LocalToAtari(const char *source, char *dest, char replacementChar);
+extern void Str_DecomposedToPrecomposedUtf8(const char *source, char * dest);
+extern void Str_AtariToHost(const char *source, char *dest, int destLen, char replacementChar);
+extern void Str_HostToAtari(const char *source, char *dest, char replacementChar);
+
+
#endif /* HATARI_STR_H */
diff -ruwb hatari-orig/src/str.c hatari/src/str.c
--- hatari-orig/src/str.c 2014-07-23 22:07:01.626809015 +0200
+++ hatari/src/str.c 2014-07-23 21:10:24.638663044 +0200
@@ -12,6 +12,7 @@
#include <ctype.h>
#include <stdbool.h>
#include <stdlib.h>
+#include <locale.h>
#include <SDL_types.h>
#include "configuration.h"
#include "str.h"
@@ -136,6 +137,9 @@
int len;
src = strdup(source); /* dup so that it can be modified */
+
+ /* convert host string encoding to AtariST character set */
+ Str_HostToAtari(source, src, INVALID_CHAR);
len = strlen(src);
/* does filename have an extension? */
@@ -164,7 +168,8 @@
/* upcase and replace rest of invalid characters */
for (tmp = dst; *tmp; tmp++)
{
- if (*tmp < 33 || *tmp > 126)
+ /* invalid characters above 0x80 have already been replaced */
+ if ((*tmp & 255) < 32 || *tmp == 127)
*tmp = INVALID_CHAR;
else
{
@@ -187,6 +192,326 @@
}
+/* ---------------------------------------------------------------------- */
+
+/* Implementation of character set conversions */
+
+/* Maps AtariST characters 0x80..0xFF to Unicode code points. The index is
+ * the AtariST character code minus 0x80.
+ * see http://www.unicode.org/Public/MAPPINGS/VENDORS/MISC/ATARIST.TXT
+ * The table is only ever read, hence it is const.
+ */
+static const int mapAtariToUnicode[128] =
+{
+    0x00C7, 0x00FC, 0x00E9, 0x00E2, 0x00E4, 0x00E0, 0x00E5, 0x00E7,
+    0x00EA, 0x00EB, 0x00E8, 0x00EF, 0x00EE, 0x00EC, 0x00C4, 0x00C5,
+    0x00C9, 0x00E6, 0x00C6, 0x00F4, 0x00F6, 0x00F2, 0x00FB, 0x00F9,
+    0x00FF, 0x00D6, 0x00DC, 0x00A2, 0x00A3, 0x00A5, 0x00DF, 0x0192,
+    0x00E1, 0x00ED, 0x00F3, 0x00FA, 0x00F1, 0x00D1, 0x00AA, 0x00BA,
+    0x00BF, 0x2310, 0x00AC, 0x00BD, 0x00BC, 0x00A1, 0x00AB, 0x00BB,
+    0x00E3, 0x00F5, 0x00D8, 0x00F8, 0x0153, 0x0152, 0x00C0, 0x00C3,
+    0x00D5, 0x00A8, 0x00B4, 0x2020, 0x00B6, 0x00A9, 0x00AE, 0x2122,
+    0x0133, 0x0132, 0x05D0, 0x05D1, 0x05D2, 0x05D3, 0x05D4, 0x05D5,
+    0x05D6, 0x05D7, 0x05D8, 0x05D9, 0x05DB, 0x05DC, 0x05DE, 0x05E0,
+    0x05E1, 0x05E2, 0x05E4, 0x05E6, 0x05E7, 0x05E8, 0x05E9, 0x05EA,
+    0x05DF, 0x05DA, 0x05DD, 0x05E3, 0x05E5, 0x00A7, 0x2227, 0x221E,
+    0x03B1, 0x03B2, 0x0393, 0x03C0, 0x03A3, 0x03C3, 0x00B5, 0x03C4,
+    0x03A6, 0x0398, 0x03A9, 0x03B4, 0x222E, 0x03C6, 0x2208, 0x2229,
+    0x2261, 0x00B1, 0x2265, 0x2264, 0x2320, 0x2321, 0x00F7, 0x2248,
+    0x00B0, 0x2219, 0x00B7, 0x221A, 0x207F, 0x00B2, 0x00B3, 0x00AF
+};
+
+/* Hashtable which maps Unicode code points to AtariST characters 0x80..0xFF.
+ * The low 9 bits of the Unicode code point provide a hash function
+ * without collisions. Each slot holds the AtariST character code minus
+ * 0x80 as stored by initCharacterMappings(). Unused slots stay 0, so a
+ * lookup has to compare mapAtariToUnicode[slot] with the code point to
+ * tell a real hit from a false one.
+ */
+static char mapUnicodeToAtari[512];
+static bool characterMappingsInitialized = false;
+
+/* Define this only for an old Linux system which does not store
+ * pathnames in UTF-8. If this is defined, pathnames are converted
+ * to the host character set as defined by the locale.
+ * It is normally not needed to define this. Do not define this
+ * for OSX, as the Unicode pathnames then won't be converted from
+ * the decomposed to the precomposed form.
+ *
+ * NOTE(review): the macro is defined unconditionally right below, so
+ * Str_AtariToHost() and Str_HostToAtari() always take the locale based
+ * branch and their UTF-8 branch is never selected. Confirm whether this
+ * is intended.
+ */
+#define USE_LOCALE_CHARSET 1
+
+
+/**
+ * This function initializes the mapUnicodeToAtari[] hashtable: for each of
+ * the 128 entries of mapAtariToUnicode[] the table index is stored in the
+ * slot selected by the low 9 bits of the Unicode code point.
+ *
+ * When the locale based conversions are compiled in, the character type
+ * category of the locale is also taken from the environment, as wctomb()
+ * and mbtowc() depend on it.
+ */
+void initCharacterMappings(void)
+{
+    int i;
+
+    for (i = 0; i < 128; i++)
+    {
+        mapUnicodeToAtari[mapAtariToUnicode[i] & 511] = (char)i;
+    }
+    characterMappingsInitialized = true;
+
+#if defined(WIN32) || defined(USE_LOCALE_CHARSET)
+    /* Only LC_CTYPE is required for the multibyte conversions. Setting
+     * LC_ALL would also switch e.g. LC_NUMERIC for the whole process and
+     * change how printf()/strtod() treat the decimal point.
+     */
+    setlocale(LC_CTYPE, "");
+#endif
+}
+
+/**
+ * Convert a 0-terminated string in the AtariST character set to a 0-terminated
+ * UTF-8 encoded string.
+ *
+ * source:  input string in the AtariST character set
+ * dest:    buffer which receives the UTF-8 encoded string
+ * destLen: number of available bytes in dest[], including the terminating 0
+ *
+ * A single character of the AtariST charset can consume up to 3 bytes in UTF-8.
+ * If dest[] is too small, the conversion stops at the first character which
+ * does not fit, so the result is a clean prefix of the full conversion.
+ * Nothing is written if destLen is not positive.
+ */
+void Str_AtariToUtf8(const char *source, char *dest, int destLen)
+{
+    int c, need;
+
+    if (destLen <= 0)
+    {
+        return;     /* no room, not even for the terminating 0 */
+    }
+
+    while (*source)
+    {
+        c = *source++ & 255;
+        if (c >= 128)
+        {
+            c = mapAtariToUnicode[c & 127];
+        }
+
+        /* number of UTF-8 bytes needed for code point c */
+        need = (c < 128) ? 1 : (c < 2048) ? 2 : 3;
+
+        /* One byte must stay free for the terminating 0. Stop here instead
+         * of skipping the character, otherwise a shorter character further
+         * down the string could still be stored and leave a hole.
+         */
+        if (destLen <= need)
+        {
+            break;
+        }
+
+        if (need == 1)
+        {
+            *dest++ = (char)c;                          /* 0xxxxxxx */
+        }
+        else if (need == 2)
+        {
+            *dest++ = (char)((c >> 6) | 192);           /* 110xxxxx */
+            *dest++ = (char)((c & 63) | 128);           /* 10xxxxxx */
+        }
+        else
+        {
+            *dest++ = (char)((c >> 12) | 224);          /* 1110xxxx */
+            *dest++ = (char)(((c >> 6) & 63) | 128);    /* 10xxxxxx */
+            *dest++ = (char)((c & 63) | 128);           /* 10xxxxxx */
+        }
+        destLen -= need;
+    }
+    *dest = 0;
+}
+
+/**
+ * Convert a 0-terminated UTF-8 encoded string to a 0-terminated string
+ * in the AtariST character set.
+ *
+ * source:          UTF-8 encoded input string
+ * dest:            buffer which receives the AtariST string. No size is
+ *                  passed; the output is never longer than the input, so a
+ *                  buffer of strlen(source) + 1 bytes is sufficient.
+ * replacementChar: inserted when there is no mapping, or when a byte
+ *                  sequence is not well-formed UTF-8.
+ *
+ * Every code point yields exactly one output character. A malformed or
+ * truncated multi-byte sequence yields one replacementChar; the byte which
+ * broke the sequence is not consumed, so the terminating 0 is never skipped.
+ */
+void Str_Utf8ToAtari(const char *source, char *dest, char replacementChar)
+{
+    int c, i, extra;
+    bool valid;
+
+    if (!characterMappingsInitialized)
+    {
+        initCharacterMappings();
+    }
+
+    while (*source)
+    {
+        c = *source++ & 255;
+        if (c < 128)            /* single-byte utf-8 code (0xxxxxxx) */
+        {
+            *dest++ = (char)c;
+            continue;
+        }
+        if (c < 192)            /* stray continuation byte (10xxxxxx) */
+        {
+            *dest++ = replacementChar;
+            continue;
+        }
+
+        /* multi-byte utf-8 code: split the lead byte into the number of
+         * continuation bytes to expect and its payload bits */
+        if (c < 224)            /* 110xxxxx, 10xxxxxx */
+        {
+            extra = 1;
+            c &= 31;
+        }
+        else if (c < 240)       /* 1110xxxx, 10xxxxxx, 10xxxxxx */
+        {
+            extra = 2;
+            c &= 15;
+        }
+        else                    /* 11110xxx and three continuation bytes */
+        {
+            extra = 3;
+            c &= 7;
+        }
+
+        valid = true;
+        while (extra-- > 0)
+        {
+            if ((*source & 0xC0) != 0x80)
+            {
+                /* not a continuation byte (may be the terminating 0) */
+                valid = false;
+                break;
+            }
+            c = (c << 6) | (*source++ & 63);
+        }
+
+        /* find AtariST character code for unicode code point c */
+        i = mapUnicodeToAtari[c & 511];
+        if (valid && mapAtariToUnicode[i] == c)
+        {
+            *dest++ = (char)(i + 128);
+        }
+        else
+        {
+            *dest++ = replacementChar;
+        }
+    }
+    *dest = 0;
+}
+
+
+/**
+ * Convert a string from the AtariST character set into the host representation as
+ * defined by the current locale.
+ *
+ * source:          0-terminated input string in the AtariST character set
+ * dest:            buffer which receives the converted string
+ * destLen:         number of available bytes in dest[]
+ * replacementChar: stored for characters which wctomb() cannot represent
+ *                  in the character set of the current locale
+ *
+ * The conversion stops as soon as fewer than MB_CUR_MAX + 1 bytes are left
+ * in dest[]. Nothing is written if destLen is not positive.
+ * NOTE(review): the Unicode code point is handed to wctomb() as is, which
+ * assumes that wchar_t values are Unicode code points on the host.
+ */
+void Str_AtariToLocal(const char *source, char *dest, int destLen, char replacementChar)
+{
+    int c, n;
+
+    if (!characterMappingsInitialized)
+    {
+        initCharacterMappings();
+    }
+    if (destLen <= 0)
+    {
+        return;     /* no room, not even for the terminating 0 */
+    }
+
+    while (*source && destLen > (int)MB_CUR_MAX)
+    {
+        c = *source++ & 255;
+        if (c >= 128)
+        {
+            c = mapAtariToUnicode[c & 127];
+        }
+        /* convert the unicode code point c to a character in the current locale */
+        n = wctomb(dest, (wchar_t)c);
+        if (n < 0)
+        {
+            /* bring a state-dependent encoding back to its initial state */
+            wctomb(NULL, 0);
+            *dest = replacementChar;
+            n = 1;
+        }
+        dest += n;
+        destLen -= n;
+    }
+    *dest = 0;
+}
+
+/**
+ * Convert a string from the character set defined by current host locale into the
+ * AtariST character set.
+ *
+ * source:          0-terminated input string in the encoding of the locale
+ * dest:            buffer which receives the AtariST string; no size is
+ *                  passed, one character is stored per converted input
+ *                  character
+ * replacementChar: stored for characters which do not exist in the AtariST
+ *                  character set and for bytes which mbtowc() rejects
+ */
+void Str_LocalToAtari(const char *source, char *dest, char replacementChar)
+{
+    int n, slot;
+    wchar_t wc;
+
+    if (!characterMappingsInitialized)
+    {
+        initCharacterMappings();
+    }
+
+    while (*source)
+    {
+        /* convert a character from the current locale into an unicode code
+         * point; MB_CUR_MAX is the longest character of that locale */
+        n = mbtowc(&wc, source, MB_CUR_MAX);
+        if (n <= 0)
+        {
+            /* invalid sequence: reset the conversion state and skip one byte */
+            mbtowc(NULL, NULL, 0);
+            wc = replacementChar;
+            n = 1;
+        }
+        source += n;
+        if (wc >= 128)
+        {
+            /* find AtariST character code for unicode code point wc */
+            slot = mapUnicodeToAtari[wc & 511];
+            wc = (mapAtariToUnicode[slot] == wc ? slot + 128 : replacementChar);
+        }
+        *dest++ = (char)wc;
+    }
+    *dest = 0;
+}
+
+/* This table is needed to convert the UTF-8 representation of paths with
+ * diacritical marks from the decomposed form (as returned by OSX) into the
+ * precomposed form. Combining unicode characters are 0x0300..0x036F.
+ * Each entry consists of the base letter, the combining character and the
+ * resulting precomposed code point (all results are in U+00C0..U+00FF).
+ * Not every result exists in the AtariST character set (e.g. U+00C1);
+ * those are dealt with by the conversion to the AtariST character set.
+ * The list is terminated by a single 0.
+ */
+static const int mapDecomposedPrecomposed[] =
+{
+    'A', 0x0300, 0xC0, /* À */
+    'A', 0x0301, 0xC1, /* Á */
+    'A', 0x0302, 0xC2, /* Â */
+    'A', 0x0303, 0xC3, /* Ã */
+    'A', 0x0308, 0xC4, /* Ä */
+    'A', 0x030A, 0xC5, /* Å */
+    'C', 0x0327, 0xC7, /* Ç */
+    'E', 0x0300, 0xC8, /* È */
+    'E', 0x0301, 0xC9, /* É */
+    'E', 0x0302, 0xCA, /* Ê */
+    'E', 0x0308, 0xCB, /* Ë */
+    'I', 0x0300, 0xCC, /* Ì */
+    'I', 0x0301, 0xCD, /* Í */
+    'I', 0x0302, 0xCE, /* Î */
+    'I', 0x0308, 0xCF, /* Ï */
+    'N', 0x0303, 0xD1, /* Ñ */
+    'O', 0x0300, 0xD2, /* Ò */
+    'O', 0x0301, 0xD3, /* Ó */
+    'O', 0x0302, 0xD4, /* Ô */
+    'O', 0x0303, 0xD5, /* Õ */
+    'O', 0x0308, 0xD6, /* Ö */
+    'U', 0x0300, 0xD9, /* Ù */
+    'U', 0x0301, 0xDA, /* Ú */
+    'U', 0x0302, 0xDB, /* Û */
+    'U', 0x0308, 0xDC, /* Ü */
+    'Y', 0x0301, 0xDD, /* Ý */
+    'a', 0x0300, 0xE0, /* à */
+    'a', 0x0301, 0xE1, /* á */
+    'a', 0x0302, 0xE2, /* â */
+    'a', 0x0303, 0xE3, /* ã */
+    'a', 0x0308, 0xE4, /* ä */
+    'a', 0x030A, 0xE5, /* å */
+    'c', 0x0327, 0xE7, /* ç */
+    'e', 0x0300, 0xE8, /* è */
+    'e', 0x0301, 0xE9, /* é */
+    'e', 0x0302, 0xEA, /* ê */
+    'e', 0x0308, 0xEB, /* ë */
+    'i', 0x0300, 0xEC, /* ì */
+    'i', 0x0301, 0xED, /* í */
+    'i', 0x0302, 0xEE, /* î */
+    'i', 0x0308, 0xEF, /* ï */
+    'n', 0x0303, 0xF1, /* ñ */
+    'o', 0x0300, 0xF2, /* ò */
+    'o', 0x0301, 0xF3, /* ó */
+    'o', 0x0302, 0xF4, /* ô */
+    'o', 0x0303, 0xF5, /* õ */
+    'o', 0x0308, 0xF6, /* ö */
+    'u', 0x0300, 0xF9, /* ù */
+    'u', 0x0301, 0xFA, /* ú */
+    'u', 0x0302, 0xFB, /* û */
+    'u', 0x0308, 0xFC, /* ü */
+    'y', 0x0301, 0xFD, /* ý */
+    'y', 0x0308, 0xFF, /* ÿ */
+    0
+};
+
+/**
+ * Convert decomposed characters in an UTF-8 encoded string into the
+ * precomposed form. This is needed as OSX returns filesystem paths in the
+ * decomposed form (NFD).
+ *
+ * source: 0-terminated UTF-8 encoded input string
+ * dest:   buffer which receives the UTF-8 encoded result
+ *
+ * A base letter followed by a combining character (3 bytes) is replaced by
+ * the 2-byte UTF-8 encoding of the precomposed code point; all other bytes
+ * are copied unchanged. The output is therefore never longer than the input,
+ * and dest may be the same buffer as source.
+ */
+void Str_DecomposedToPrecomposedUtf8(const char *source, char * dest)
+{
+    int c, c1, i;
+    bool composed;
+
+    while (*source)
+    {
+        c = *source++ & 255;
+        composed = false;
+        /* do we have a combining character behind the current character?
+         * 0x03XX is in UTF-8: 110011xx 10xxxxxx */
+        if ((source[0] & 0xFC) == 0xCC && (source[1] & 0xC0) == 0x80)
+        {
+            c1 = ((source[0] & 31) << 6) | (source[1] & 63);
+            for (i = 0; mapDecomposedPrecomposed[i]; i += 3)
+            {
+                if (mapDecomposedPrecomposed[i] == c && mapDecomposedPrecomposed[i + 1] == c1)
+                {
+                    c = mapDecomposedPrecomposed[i + 2];
+                    source += 2;
+                    composed = true;
+                    break;
+                }
+            }
+        }
+        if (composed)
+        {
+            /* the precomposed code point is >= 0x80 and has to be stored
+             * as a 2-byte UTF-8 sequence, not as a single raw byte */
+            *dest++ = (char)(0xC0 | (c >> 6));      /* 110xxxxx */
+            *dest++ = (char)(0x80 | (c & 63));      /* 10xxxxxx */
+        }
+        else
+        {
+            *dest++ = (char)c;
+        }
+    }
+    *dest = 0;
+}
+
+
+/**
+ * Convert a string from the AtariST character set into the host encoding:
+ * the character set of the locale if WIN32 or USE_LOCALE_CHARSET is defined,
+ * UTF-8 otherwise. destLen is the number of available bytes in dest[].
+ * replacementChar is only used by the locale based conversion.
+ */
+void Str_AtariToHost(const char *source, char *dest, int destLen, char replacementChar)
+{
+#if defined(WIN32) || defined(USE_LOCALE_CHARSET)
+    Str_AtariToLocal(source, dest, destLen, replacementChar);
+#else
+    Str_AtariToUtf8(source, dest, destLen);
+#endif
+}
+
+/**
+ * Convert a string from the host encoding into the AtariST character set:
+ * from the character set of the locale if WIN32 or USE_LOCALE_CHARSET is
+ * defined, from UTF-8 otherwise. Characters without a mapping are replaced
+ * by replacementChar. No destination size is passed; dest[] has to be
+ * provided by the caller.
+ */
+void Str_HostToAtari(const char *source, char *dest, char replacementChar)
+{
+#if defined(WIN32) || defined(USE_LOCALE_CHARSET)
+    Str_LocalToAtari(source, dest, replacementChar);
+#else
+    /* Precompose while the string is still UTF-8: once converted to the
+     * AtariST character set, the combining characters have already been
+     * replaced by replacementChar and cannot be merged any more.
+     * Both steps never produce more bytes than they consume, so the second
+     * one can work in place on dest.
+     */
+    Str_DecomposedToPrecomposedUtf8(source, dest); /* for OSX */
+    Str_Utf8ToAtari(dest, dest, replacementChar);
+#endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+
+
/**
* Print an Hex/Ascii dump of Len bytes located at *p
* Each line consists of Width bytes, printed as an hexa value and as a char
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment