From 9f8ecb91cbcf6694c19362f1eac3f0f800c86e75 Mon Sep 17 00:00:00 2001
From: "Ryan C. Gordon"
Date: Sun, 20 Aug 2017 01:18:41 -0400
Subject: [PATCH] unicode: Added UTF-16 and UCS-4 stricmp functions.

---
 src/physfs.h         |  48 +++++++++++++++
 src/physfs_unicode.c | 154 ++++++++++++++++++++++++++-----------------
 2 files changed, 149 insertions(+), 53 deletions(-)

diff --git a/src/physfs.h b/src/physfs.h
index 1de5e988..018b9f1a 100644
--- a/src/physfs.h
+++ b/src/physfs.h
@@ -2542,6 +2542,54 @@ PHYSFS_DECL void PHYSFS_utf8FromLatin1(const char *src, char *dst,
  */
 PHYSFS_DECL int PHYSFS_utf8stricmp(const char *str1, const char *str2);
 
+/**
+ * \fn int PHYSFS_utf16stricmp(const PHYSFS_uint16 *str1, const PHYSFS_uint16 *str2)
+ * \brief Case-insensitive compare of two UTF-16 strings.
+ *
+ * This is a strcasecmp/stricmp replacement that expects both strings
+ *  to be in UTF-16 encoding. It will do "case folding" to decide if the
+ *  Unicode codepoints in the strings match.
+ *
+ * It will report which string is "greater than" the other, but be aware that
+ *  this doesn't necessarily mean anything: 'a' may be "less than" 'b', but
+ *  a Japanese kuten has no meaningful alphabetical relationship to
+ *  a Greek lambda, but being able to assign a reliable "value" makes sorting
+ *  algorithms possible, if not entirely sane. Most cases should treat the
+ *  return value as "equal" or "not equal".
+ *
+ * Like stricmp, this expects both strings to be null-terminated.
+ *
+ *   \param str1 First string to compare.
+ *   \param str2 Second string to compare.
+ *  \return -1 if str1 is "less than" str2, 1 if "greater than", 0 if equal.
+ */
+PHYSFS_DECL int PHYSFS_utf16stricmp(const PHYSFS_uint16 *str1,
+                                    const PHYSFS_uint16 *str2);
+
+/**
+ * \fn int PHYSFS_ucs4stricmp(const PHYSFS_uint32 *str1, const PHYSFS_uint32 *str2)
+ * \brief Case-insensitive compare of two UCS-4 strings.
+ *
+ * This is a strcasecmp/stricmp replacement that expects both strings
+ *  to be in UCS-4 (aka UTF-32) encoding. It will do "case folding" to decide
+ *  if the Unicode codepoints in the strings match.
+ *
+ * It will report which string is "greater than" the other, but be aware that
+ *  this doesn't necessarily mean anything: 'a' may be "less than" 'b', but
+ *  a Japanese kuten has no meaningful alphabetical relationship to
+ *  a Greek lambda, but being able to assign a reliable "value" makes sorting
+ *  algorithms possible, if not entirely sane. Most cases should treat the
+ *  return value as "equal" or "not equal".
+ *
+ * Like stricmp, this expects both strings to be null-terminated.
+ *
+ *   \param str1 First string to compare.
+ *   \param str2 Second string to compare.
+ *  \return -1 if str1 is "less than" str2, 1 if "greater than", 0 if equal.
+ */
+PHYSFS_DECL int PHYSFS_ucs4stricmp(const PHYSFS_uint32 *str1,
+                                   const PHYSFS_uint32 *str2);
+
 
 /**
  * \typedef PHYSFS_EnumerateCallback
diff --git a/src/physfs_unicode.c b/src/physfs_unicode.c
index 2689fa99..f8ffec7c 100644
--- a/src/physfs_unicode.c
+++ b/src/physfs_unicode.c
@@ -190,6 +190,60 @@ static PHYSFS_uint32 utf8codepoint(const char **_str)
     return UNICODE_BOGUS_CHAR_VALUE;
 } /* utf8codepoint */
 
+/*
+ * Decode one codepoint from the UTF-16 string at *_str, advancing *_str
+ *  past the one or two code units consumed. A valid surrogate pair is
+ *  combined into a single codepoint at or above U+10000. Returns 0 at the
+ *  null terminator (*_str is left on it), or UNICODE_BOGUS_CHAR_CODEPOINT
+ *  for an unpaired surrogate.
+ */
+static PHYSFS_uint32 utf16codepoint(const PHYSFS_uint16 **_str)
+{
+    const PHYSFS_uint16 *src = *_str;
+    PHYSFS_uint32 cp = (PHYSFS_uint32) *(src++);
+
+    if (cp == 0) /* null terminator, end of string. */
+        return 0;
+    /* Orphaned second half of surrogate pair? */
+    else if ((cp >= 0xDC00) && (cp <= 0xDFFF))
+        cp = UNICODE_BOGUS_CHAR_CODEPOINT;
+    else if ((cp >= 0xD800) && (cp <= 0xDBFF)) /* start surrogate pair! */
+    {
+        const PHYSFS_uint32 pair = (PHYSFS_uint32) *src;
+        if (pair == 0)
+            cp = UNICODE_BOGUS_CHAR_CODEPOINT;
+        else if ((pair < 0xDC00) || (pair > 0xDFFF))
+            cp = UNICODE_BOGUS_CHAR_CODEPOINT;
+        else
+        {
+            src++; /* eat the other surrogate. */
+            cp = 0x10000 + (((cp - 0xD800) << 10) | (pair - 0xDC00));
+        } /* else */
+    } /* else if */
+
+    *_str = src;
+    return cp;
+} /* utf16codepoint */
+
+/*
+ * Read one codepoint from the UCS-4 string at *_str, advancing *_str past
+ *  it. Returns 0 at the null terminator (*_str is left on it), or
+ *  UNICODE_BOGUS_CHAR_CODEPOINT for a value beyond U+10FFFF.
+ */
+static PHYSFS_uint32 utf32codepoint(const PHYSFS_uint32 **_str)
+{
+    const PHYSFS_uint32 *src = *_str;
+    PHYSFS_uint32 cp = *(src++);
+
+    if (cp == 0) /* null terminator, end of string. */
+        return 0;
+    else if (cp > 0x10FFFF)
+        cp = UNICODE_BOGUS_CHAR_CODEPOINT;
+
+    *_str = src;
+    return cp;
+} /* utf32codepoint */
+
 
 void PHYSFS_utf8ToUcs4(const char *src, PHYSFS_uint32 *dst, PHYSFS_uint64 len)
 {
@@ -378,25 +432,9 @@ void PHYSFS_utf8FromUtf16(const PHYSFS_uint16 *src, char *dst, PHYSFS_uint64 len
     len--;
     while (len)
     {
-        PHYSFS_uint32 cp = (PHYSFS_uint32) *(src++);
-        if (cp == 0)
+        const PHYSFS_uint32 cp = utf16codepoint(&src);
+        if (!cp)
             break;
-
-        /* Orphaned second half of surrogate pair? */
-        if ((cp >= 0xDC00) && (cp <= 0xDFFF))
-            cp = UNICODE_BOGUS_CHAR_CODEPOINT;
-        else if ((cp >= 0xD800) && (cp <= 0xDBFF)) /* start surrogate pair! */
-        {
-            const PHYSFS_uint32 pair = (PHYSFS_uint32) *src;
-            if ((pair < 0xDC00) || (pair > 0xDFFF))
-                cp = UNICODE_BOGUS_CHAR_CODEPOINT;
-            else
-            {
-                src++; /* eat the other surrogate. */
-                cp = (((cp - 0xD800) << 10) | (pair - 0xDC00));
-            } /* else */
-        } /* else if */
-
         utf8fromcodepoint(cp, &dst, &len);
     } /* while */
 
@@ -492,46 +530,56 @@ static int locate_casefold_mapping(const PHYSFS_uint32 from, PHYSFS_uint32 *to)
 } /* locate_casefold_mapping */
 
 
+/*
+ * Shared body of the stricmp functions: expects str1 and str2 in scope and
+ *  decodes them with utf##bits##codepoint(). Codepoints are compared after
+ *  case folding, which may yield up to three codepoints for one input.
+ */
+#define UTFSTRICMP(bits) \
+    PHYSFS_uint32 folded1[3], folded2[3]; \
+    int head1 = 0, tail1 = 0, head2 = 0, tail2 = 0; \
+    while (1) { \
+        PHYSFS_uint32 cp1, cp2; \
+        if (head1 != tail1) { \
+            cp1 = folded1[tail1++]; \
+        } else { \
+            head1 = locate_casefold_mapping(utf##bits##codepoint(&str1), folded1); \
+            cp1 = folded1[0]; \
+            tail1 = 1; \
+        } \
+        if (head2 != tail2) { \
+            cp2 = folded2[tail2++]; \
+        } else { \
+            head2 = locate_casefold_mapping(utf##bits##codepoint(&str2), folded2); \
+            cp2 = folded2[0]; \
+            tail2 = 1; \
+        } \
+        if (cp1 < cp2) { \
+            return -1; \
+        } else if (cp1 > cp2) { \
+            return 1; \
+        } else if (cp1 == 0) { \
+            break; /* complete match. */ \
+        } \
+    } \
+    return 0
+
 int PHYSFS_utf8stricmp(const char *str1, const char *str2)
 {
-    PHYSFS_uint32 folded1[3], folded2[3];
-    int head1 = 0;
-    int tail1 = 0;
-    int head2 = 0;
-    int tail2 = 0;
-
-    while (1)
-    {
-        PHYSFS_uint32 cp1, cp2;
-
-        if (head1 != tail1)
-            cp1 = folded1[tail1++];
-        else
-        {
-            head1 = locate_casefold_mapping(utf8codepoint(&str1), folded1);
-            cp1 = folded1[0];
-            tail1 = 1;
-        } /* else */
+    UTFSTRICMP(8);
+} /* PHYSFS_utf8stricmp */
 
-        if (head2 != tail2)
-            cp2 = folded2[tail2++];
-        else
-        {
-            head2 = locate_casefold_mapping(utf8codepoint(&str2), folded2);
-            cp2 = folded2[0];
-            tail2 = 1;
-        } /* else */
+int PHYSFS_utf16stricmp(const PHYSFS_uint16 *str1, const PHYSFS_uint16 *str2)
+{
+    UTFSTRICMP(16);
+} /* PHYSFS_utf16stricmp */
 
-        if (cp1 < cp2)
-            return -1;
-        else if (cp1 > cp2)
-            return 1;
-        else if (cp1 == 0)
-            break; /* complete match. */
-    } /* while */
+int PHYSFS_ucs4stricmp(const PHYSFS_uint32 *str1, const PHYSFS_uint32 *str2)
+{
+    UTFSTRICMP(32);
+} /* PHYSFS_ucs4stricmp */
 
-    return 0;
-} /* PHYSFS_utf8stricmp */
+#undef UTFSTRICMP
 
 
 /* end of physfs_unicode.c ... */