utf8: big improvements to case-insensitive UTF-8 string compare.

- Dramatically reduce RAM usage: uses between 8 and 11 kilobytes less static memory for its internal case-folding tables. - Actually works now. It would fail unconditionally if a codepoint folded into multiple codepoints, even if the compared string contained those exact codepoints. - Now a public API! - Removed __PHYSFS_utf8strnicmp(): nothing was using it, it was incorrect anyhow, and what does 'n' represent when either string might case-fold to something larger in-flight, anyhow?
icculus · Aug 11, 2017 · d1f2637 · d1f2637
1 parent 587ec88
commit d1f2637
Show file tree

Hide file tree

Showing 7 changed files with 2,695 additions and 2,258 deletions.
diff --git a/extras/makecasefoldhashtable.pl b/extras/makecasefoldhashtable.pl
@@ -3,6 +3,11 @@
 use warnings;
 use strict;
 
+my $HASHBUCKETS1_16 = 256;
+my $HASHBUCKETS1_32 = 16;
+my $HASHBUCKETS2_16 = 16;
+my $HASHBUCKETS3_16 = 4;
+
 print <<__EOF__;
 /*
  * This file is part of PhysicsFS (https://icculus.org/physfs/)
@@ -13,17 +18,97 @@
  * Please see the file LICENSE.txt in the source's root directory.
  */
 
+#ifndef _INCLUDE_PHYSFS_CASEFOLDING_H_
+#define _INCLUDE_PHYSFS_CASEFOLDING_H_
+
 #ifndef __PHYSICSFS_INTERNAL__
 #error Do not include this header from your applications.
 #endif
 
+/* We build three simple hashmaps here: one that maps Unicode codepoints to
+a one, two, or three lowercase codepoints. To retrieve this info: look at
+case_fold_hashX, where X is 1, 2, or 3. Most foldable codepoints fold to one,
+a few dozen fold to two, and a handful fold to three. If the codepoint isn't
+in any of these hashes, it doesn't fold (no separate upper and lowercase).
+
+Almost all these codepoints fit into 16 bits, so we hash them as such to save
+memory. If a codepoint is > 0xFFFF, we have separate hashes for them,
+since there are (currently) only about 120 of them and (currently) all of them
+map to a single lowercase codepoint. */
+
+typedef struct CaseFoldMapping1_32
+{
+    PHYSFS_uint32 from;
+    PHYSFS_uint32 to0;
+} CaseFoldMapping1_32;
+
+typedef struct CaseFoldMapping1_16
+{
+    PHYSFS_uint16 from;
+    PHYSFS_uint16 to0;
+} CaseFoldMapping1_16;
+
+typedef struct CaseFoldMapping2_16
+{
+    PHYSFS_uint16 from;
+    PHYSFS_uint16 to0;
+    PHYSFS_uint16 to1;
+} CaseFoldMapping2_16;
+
+typedef struct CaseFoldMapping3_16
+{
+    PHYSFS_uint16 from;
+    PHYSFS_uint16 to0;
+    PHYSFS_uint16 to1;
+    PHYSFS_uint16 to2;
+} CaseFoldMapping3_16;
+
+typedef struct CaseFoldHashBucket1_16
+{
+    const CaseFoldMapping1_16 *list;
+    const PHYSFS_uint8 count;
+} CaseFoldHashBucket1_16;
+
+typedef struct CaseFoldHashBucket1_32
+{
+    const CaseFoldMapping1_32 *list;
+    const PHYSFS_uint8 count;
+} CaseFoldHashBucket1_32;
+
+typedef struct CaseFoldHashBucket2_16
+{
+    const CaseFoldMapping2_16 *list;
+    const PHYSFS_uint8 count;
+} CaseFoldHashBucket2_16;
+
+typedef struct CaseFoldHashBucket3_16
+{
+    const CaseFoldMapping3_16 *list;
+    const PHYSFS_uint8 count;
+} CaseFoldHashBucket3_16;
+
 __EOF__
 
 
-my @foldPairs;
+my @foldPairs1_16;
+my @foldPairs2_16;
+my @foldPairs3_16;
+my @foldPairs1_32;
+
+for (my $i = 0; $i < $HASHBUCKETS1_16; $i++) {
+    $foldPairs1_16[$i] = '';
+}
+
+for (my $i = 0; $i < $HASHBUCKETS1_32; $i++) {
+    $foldPairs1_32[$i] = '';
+}
+
+for (my $i = 0; $i < $HASHBUCKETS2_16; $i++) {
+    $foldPairs2_16[$i] = '';
+}
 
-for (my $i = 0; $i < 256; $i++) {
-    $foldPairs[$i] = '';
+for (my $i = 0; $i < $HASHBUCKETS3_16; $i++) {
+    $foldPairs3_16[$i] = '';
 }
 
 open(FH,'<','casefolding.txt') or die("failed to open casefolding.txt: $!\n");
@@ -38,47 +123,153 @@
 
     next if not /\A([a-fA-F0-9]+)\;\s*(.)\;\s*(.+)\;/;
     my ($code, $status, $mapping) = ($1, $2, $3);
+
     my $hexxed = hex($code);
-    my $hashed = (($hexxed ^ ($hexxed >> 8)) & 0xFF);
     #print("// code '$code'   status '$status'   mapping '$mapping'\n");
-    #print("// hexxed '$hexxed'  hashed '$hashed'\n");
 
     if (($status eq 'C') or ($status eq 'F')) {
-        my ($map1, $map2, $map3) = ('0000', '0000', '0000');
+        my ($map1, $map2, $map3) = (undef, undef, undef);
         $map1 = $1 if $mapping =~ s/\A([a-fA-F0-9]+)(\s*|\Z)//;
         $map2 = $1 if $mapping =~ s/\A([a-fA-F0-9]+)(\s*|\Z)//;
         $map3 = $1 if $mapping =~ s/\A([a-fA-F0-9]+)(\s*|\Z)//;
         die("mapping space too small for '$code'\n") if ($mapping ne '');
-        $foldPairs[$hashed] .= "    { 0x$code, 0x$map1, 0x$map2, 0x$map3 },\n";
+        die("problem parsing mapping for '$code'\n") if (not defined($map1));
+
+        if ($hexxed < 128) {
+            # Just ignore these, we'll handle the low-ASCII ones ourselves.
+        } elsif ($hexxed > 0xFFFF) {
+            # We just need to add the 32-bit 2 and/or 3 codepoint maps if this die()'s here.
+            die("Uhoh, a codepoint > 0xFFFF that folds to multiple codepoints! Fixme.") if defined($map2);
+            my $hashed = (($hexxed ^ ($hexxed >> 8)) & ($HASHBUCKETS1_32-1));
+            #print("// hexxed '$hexxed'  hashed1 '$hashed'\n");
+            $foldPairs1_32[$hashed] .= "    { 0x$code, 0x$map1 },\n";
+        } elsif (not defined($map2)) {
+            my $hashed = (($hexxed ^ ($hexxed >> 8)) & ($HASHBUCKETS1_16-1));
+            #print("// hexxed '$hexxed'  hashed1 '$hashed'\n");
+            $foldPairs1_16[$hashed] .= "    { 0x$code, 0x$map1 },\n";
+        } elsif (not defined($map3)) {
+            my $hashed = (($hexxed ^ ($hexxed >> 8)) & ($HASHBUCKETS2_16-1));
+            #print("// hexxed '$hexxed'  hashed2 '$hashed'\n");
+            $foldPairs2_16[$hashed] .= "    { 0x$code, 0x$map1, 0x$map2 },\n";
+        } else {
+            my $hashed = (($hexxed ^ ($hexxed >> 8)) & ($HASHBUCKETS3_16-1));
+            #print("// hexxed '$hexxed'  hashed3 '$hashed'\n");
+            $foldPairs3_16[$hashed] .= "    { 0x$code, 0x$map1, 0x$map2, 0x$map3 },\n";
+        }
     }
 }
 close(FH);
 
-for (my $i = 0; $i < 256; $i++) {
-    $foldPairs[$i] =~ s/,\n\Z//;
-    my $str = $foldPairs[$i];
+for (my $i = 0; $i < $HASHBUCKETS1_16; $i++) {
+    $foldPairs1_16[$i] =~ s/,\n\Z//;
+    my $str = $foldPairs1_16[$i];
+    next if $str eq '';
+    my $num = '000' . $i;
+    $num =~ s/\A.*?(\d\d\d)\Z/$1/;
+    my $sym = "case_fold1_16_${num}";
+    print("static const CaseFoldMapping1_16 ${sym}[] = {\n$str\n};\n\n");
+}
+
+for (my $i = 0; $i < $HASHBUCKETS1_32; $i++) {
+    $foldPairs1_32[$i] =~ s/,\n\Z//;
+    my $str = $foldPairs1_32[$i];
+    next if $str eq '';
+    my $num = '000' . $i;
+    $num =~ s/\A.*?(\d\d\d)\Z/$1/;
+    my $sym = "case_fold1_32_${num}";
+    print("static const CaseFoldMapping1_32 ${sym}[] = {\n$str\n};\n\n");
+}
+
+for (my $i = 0; $i < $HASHBUCKETS2_16; $i++) {
+    $foldPairs2_16[$i] =~ s/,\n\Z//;
+    my $str = $foldPairs2_16[$i];
+    next if $str eq '';
+    my $num = '000' . $i;
+    $num =~ s/\A.*?(\d\d\d)\Z/$1/;
+    my $sym = "case_fold2_16_${num}";
+    print("static const CaseFoldMapping2_16 ${sym}[] = {\n$str\n};\n\n");
+}
+
+for (my $i = 0; $i < $HASHBUCKETS3_16; $i++) {
+    $foldPairs3_16[$i] =~ s/,\n\Z//;
+    my $str = $foldPairs3_16[$i];
     next if $str eq '';
     my $num = '000' . $i;
     $num =~ s/\A.*?(\d\d\d)\Z/$1/;
-    my $sym = "case_fold_${num}";
-    print("static const CaseFoldMapping ${sym}[] = {\n$str\n};\n\n");
+    my $sym = "case_fold3_16_${num}";
+    print("static const CaseFoldMapping3_16 ${sym}[] = {\n$str\n};\n\n");
 }
 
-print("\nstatic const CaseFoldHashBucket case_fold_hash[256] = {\n");
+print("static const CaseFoldHashBucket1_16 case_fold_hash1_16[] = {\n");
 
-for (my $i = 0; $i < 256; $i++) {
-    my $str = $foldPairs[$i];
+for (my $i = 0; $i < $HASHBUCKETS1_16; $i++) {
+    my $str = $foldPairs1_16[$i];
     if ($str eq '') {
-        print("    { 0, NULL },\n");
+        print("    { NULL, 0 },\n");
     } else {
         my $num = '000' . $i;
         $num =~ s/\A.*?(\d\d\d)\Z/$1/;
-        my $sym = "case_fold_${num}";
-        print("    { __PHYSFS_ARRAYLEN($sym), $sym },\n");
+        my $sym = "case_fold1_16_${num}";
+        print("    { $sym, __PHYSFS_ARRAYLEN($sym) },\n");
     }
 }
 print("};\n\n");
 
+
+print("static const CaseFoldHashBucket1_32 case_fold_hash1_32[] = {\n");
+
+for (my $i = 0; $i < $HASHBUCKETS1_32; $i++) {
+    my $str = $foldPairs1_32[$i];
+    if ($str eq '') {
+        print("    { NULL, 0 },\n");
+    } else {
+        my $num = '000' . $i;
+        $num =~ s/\A.*?(\d\d\d)\Z/$1/;
+        my $sym = "case_fold1_32_${num}";
+        print("    { $sym, __PHYSFS_ARRAYLEN($sym) },\n");
+    }
+}
+print("};\n\n");
+
+
+print("static const CaseFoldHashBucket2_16 case_fold_hash2_16[] = {\n");
+
+for (my $i = 0; $i < $HASHBUCKETS2_16; $i++) {
+    my $str = $foldPairs2_16[$i];
+    if ($str eq '') {
+        print("    { NULL, 0 },\n");
+    } else {
+        my $num = '000' . $i;
+        $num =~ s/\A.*?(\d\d\d)\Z/$1/;
+        my $sym = "case_fold2_16_${num}";
+        print("    { $sym, __PHYSFS_ARRAYLEN($sym) },\n");
+    }
+}
+print("};\n\n");
+
+print("static const CaseFoldHashBucket3_16 case_fold_hash3_16[] = {\n");
+
+for (my $i = 0; $i < $HASHBUCKETS3_16; $i++) {
+    my $str = $foldPairs3_16[$i];
+    if ($str eq '') {
+        print("    { NULL, 0 },\n");
+    } else {
+        my $num = '000' . $i;
+        $num =~ s/\A.*?(\d\d\d)\Z/$1/;
+        my $sym = "case_fold3_16_${num}";
+        print("    { $sym, __PHYSFS_ARRAYLEN($sym) },\n");
+    }
+}
+print("};\n\n");
+
+print <<__EOF__;
+
+#endif  /* _INCLUDE_PHYSFS_CASEFOLDING_H_ */
+
+/* end of physfs_casefolding.h ... */
+
+__EOF__
+
 exit 0;
 
 # end of makecashfoldhashtable.pl ...

diff --git a/src/physfs.c b/src/physfs.c
@@ -891,14 +891,14 @@ static DirHandle *openDirectory(PHYSFS_Io *io, const char *d, int forWriting)
         /* Look for archivers with matching file extensions first... */
         for (i = archivers; (*i != NULL) && (retval == NULL); i++)
         {
-            if (__PHYSFS_utf8stricmp(ext, (*i)->info.extension) == 0)
+            if (PHYSFS_utf8stricmp(ext, (*i)->info.extension) == 0)
                 retval = tryOpenDir(io, *i, d, forWriting);
         } /* for */
 
         /* failing an exact file extension match, try all the others... */
         for (i = archivers; (*i != NULL) && (retval == NULL); i++)
         {
-            if (__PHYSFS_utf8stricmp(ext, (*i)->info.extension) != 0)
+            if (PHYSFS_utf8stricmp(ext, (*i)->info.extension) != 0)
                 retval = tryOpenDir(io, *i, d, forWriting);
         } /* for */
     } /* if */
@@ -1442,7 +1442,7 @@ static int doRegisterArchiver(const PHYSFS_Archiver *_archiver)
     ext = _archiver->info.extension;
     for (i = 0; i < numArchivers; i++)
     {
-        if (__PHYSFS_utf8stricmp(archiveInfo[i]->extension, ext) == 0)
+        if (PHYSFS_utf8stricmp(archiveInfo[i]->extension, ext) == 0)
             BAIL(PHYSFS_ERR_DUPLICATE, 0);
     } /* for */
 
@@ -1518,7 +1518,7 @@ int PHYSFS_deregisterArchiver(const char *ext)
     __PHYSFS_platformGrabMutex(stateLock);
     for (i = 0; i < numArchivers; i++)
     {
-        if (__PHYSFS_utf8stricmp(archiveInfo[i]->extension, ext) == 0)
+        if (PHYSFS_utf8stricmp(archiveInfo[i]->extension, ext) == 0)
         {
             const int retval = doDeregisterArchiver(i);
             __PHYSFS_platformReleaseMutex(stateLock);
@@ -1921,7 +1921,7 @@ int PHYSFS_setSaneConfig(const char *organization, const char *appName,
             if ((l > extlen) && ((*i)[l - extlen - 1] == '.'))
             {
                 ext = (*i) + (l - extlen);
-                if (__PHYSFS_utf8stricmp(ext, archiveExt) == 0)
+                if (PHYSFS_utf8stricmp(ext, archiveExt) == 0)
                     setSaneCfgAddPath(*i, l, dirsep, archivesFirst);
             } /* if */
         } /* for */

diff --git a/src/physfs.h b/src/physfs.h
@@ -2521,6 +2521,30 @@ PHYSFS_DECL void PHYSFS_utf8FromLatin1(const char *src, char *dst,
 
 /* Everything above this line is part of the PhysicsFS 2.0 API. */
 
+/**
+ * \fn int PHYSFS_utf8stricmp(const char *str1, const char *str2)
+ * \brief Case-insensitive compare of two UTF-8 strings.
+ *
+ * This is a strcasecmp/stricmp replacement that expects both strings
+ *  to be in UTF-8 encoding. It will do "case folding" to decide if the
+ *  Unicode codepoints in the strings match.
+ *
+ * It will report which string is "greater than" the other, but be aware that
+ *  this doesn't necessarily mean anything: 'a' may be "less than" 'b', but
+ *  a Japanese kuten has no meaningful alphabetically relationship to
+ *  a Greek lambda, but being able to assign a reliable "value" makes sorting
+ *  algorithms possible, if not entirely sane. Most cases should treat the
+ *  return value as "equal" or "not equal".
+ *
+ * Like stricmp, this expects both strings to be NULL-terminated.
+ *
+ *   \param str1 First string to compare.
+ *   \param str2 Second string to compare.
+ *  \return -1 if str1 is "less than" str2, 1 if "greater than", 0 if equal.
+ */
+PHYSFS_DECL int PHYSFS_utf8stricmp(const char *str1, const char *str2);
+
+
 /**
  * \fn int PHYSFS_unmount(const char *oldDir)
  * \brief Remove a directory or archive from the search path.