Skip to content

Commit

Permalink
utf8: big improvements to case-insensitive UTF-8 string compare.
Browse files Browse the repository at this point in the history
- Dramatically reduce RAM usage: uses between 8 and 11 kilobytes less static
memory for its internal case-folding tables.
- Actually works now. It would fail unconditionally if a codepoint folded
into multiple codepoints, even if the compared string contained those exact
codepoints.
- Now a public API!
- Removed __PHYSFS_utf8strnicmp(): nothing was using it, it was incorrect
anyhow, and what does 'n' represent when either string might case-fold to
something larger in-flight, anyhow?
  • Loading branch information
icculus committed Aug 11, 2017
1 parent 587ec88 commit d1f2637
Show file tree
Hide file tree
Showing 7 changed files with 2,695 additions and 2,258 deletions.
227 changes: 209 additions & 18 deletions extras/makecasefoldhashtable.pl
Expand Up @@ -3,6 +3,11 @@
use warnings;
use strict;

my $HASHBUCKETS1_16 = 256;
my $HASHBUCKETS1_32 = 16;
my $HASHBUCKETS2_16 = 16;
my $HASHBUCKETS3_16 = 4;

print <<__EOF__;
/*
* This file is part of PhysicsFS (https://icculus.org/physfs/)
Expand All @@ -13,17 +18,97 @@
* Please see the file LICENSE.txt in the source's root directory.
*/
#ifndef _INCLUDE_PHYSFS_CASEFOLDING_H_
#define _INCLUDE_PHYSFS_CASEFOLDING_H_
#ifndef __PHYSICSFS_INTERNAL__
#error Do not include this header from your applications.
#endif
/* We build three simple hashmaps here: one that maps Unicode codepoints to
a one, two, or three lowercase codepoints. To retrieve this info: look at
case_fold_hashX, where X is 1, 2, or 3. Most foldable codepoints fold to one,
a few dozen fold to two, and a handful fold to three. If the codepoint isn't
in any of these hashes, it doesn't fold (no separate upper and lowercase).
Almost all these codepoints fit into 16 bits, so we hash them as such to save
memory. If a codepoint is > 0xFFFF, we have separate hashes for them,
since there are (currently) only about 120 of them and (currently) all of them
map to a single lowercase codepoint. */
typedef struct CaseFoldMapping1_32
{
PHYSFS_uint32 from;
PHYSFS_uint32 to0;
} CaseFoldMapping1_32;
typedef struct CaseFoldMapping1_16
{
PHYSFS_uint16 from;
PHYSFS_uint16 to0;
} CaseFoldMapping1_16;
typedef struct CaseFoldMapping2_16
{
PHYSFS_uint16 from;
PHYSFS_uint16 to0;
PHYSFS_uint16 to1;
} CaseFoldMapping2_16;
typedef struct CaseFoldMapping3_16
{
PHYSFS_uint16 from;
PHYSFS_uint16 to0;
PHYSFS_uint16 to1;
PHYSFS_uint16 to2;
} CaseFoldMapping3_16;
typedef struct CaseFoldHashBucket1_16
{
const CaseFoldMapping1_16 *list;
const PHYSFS_uint8 count;
} CaseFoldHashBucket1_16;
typedef struct CaseFoldHashBucket1_32
{
const CaseFoldMapping1_32 *list;
const PHYSFS_uint8 count;
} CaseFoldHashBucket1_32;
typedef struct CaseFoldHashBucket2_16
{
const CaseFoldMapping2_16 *list;
const PHYSFS_uint8 count;
} CaseFoldHashBucket2_16;
typedef struct CaseFoldHashBucket3_16
{
const CaseFoldMapping3_16 *list;
const PHYSFS_uint8 count;
} CaseFoldHashBucket3_16;
__EOF__


my @foldPairs;
my @foldPairs1_16;
my @foldPairs2_16;
my @foldPairs3_16;
my @foldPairs1_32;

for (my $i = 0; $i < $HASHBUCKETS1_16; $i++) {
$foldPairs1_16[$i] = '';
}

for (my $i = 0; $i < $HASHBUCKETS1_32; $i++) {
$foldPairs1_32[$i] = '';
}

for (my $i = 0; $i < $HASHBUCKETS2_16; $i++) {
$foldPairs2_16[$i] = '';
}

for (my $i = 0; $i < 256; $i++) {
$foldPairs[$i] = '';
for (my $i = 0; $i < $HASHBUCKETS3_16; $i++) {
$foldPairs3_16[$i] = '';
}

open(FH,'<','casefolding.txt') or die("failed to open casefolding.txt: $!\n");
Expand All @@ -38,47 +123,153 @@

next if not /\A([a-fA-F0-9]+)\;\s*(.)\;\s*(.+)\;/;
my ($code, $status, $mapping) = ($1, $2, $3);

my $hexxed = hex($code);
my $hashed = (($hexxed ^ ($hexxed >> 8)) & 0xFF);
#print("// code '$code' status '$status' mapping '$mapping'\n");
#print("// hexxed '$hexxed' hashed '$hashed'\n");

if (($status eq 'C') or ($status eq 'F')) {
my ($map1, $map2, $map3) = ('0000', '0000', '0000');
my ($map1, $map2, $map3) = (undef, undef, undef);
$map1 = $1 if $mapping =~ s/\A([a-fA-F0-9]+)(\s*|\Z)//;
$map2 = $1 if $mapping =~ s/\A([a-fA-F0-9]+)(\s*|\Z)//;
$map3 = $1 if $mapping =~ s/\A([a-fA-F0-9]+)(\s*|\Z)//;
die("mapping space too small for '$code'\n") if ($mapping ne '');
$foldPairs[$hashed] .= " { 0x$code, 0x$map1, 0x$map2, 0x$map3 },\n";
die("problem parsing mapping for '$code'\n") if (not defined($map1));

if ($hexxed < 128) {
# Just ignore these, we'll handle the low-ASCII ones ourselves.
} elsif ($hexxed > 0xFFFF) {
# We just need to add the 32-bit 2 and/or 3 codepoint maps if this die()'s here.
die("Uhoh, a codepoint > 0xFFFF that folds to multiple codepoints! Fixme.") if defined($map2);
my $hashed = (($hexxed ^ ($hexxed >> 8)) & ($HASHBUCKETS1_32-1));
#print("// hexxed '$hexxed' hashed1 '$hashed'\n");
$foldPairs1_32[$hashed] .= " { 0x$code, 0x$map1 },\n";
} elsif (not defined($map2)) {
my $hashed = (($hexxed ^ ($hexxed >> 8)) & ($HASHBUCKETS1_16-1));
#print("// hexxed '$hexxed' hashed1 '$hashed'\n");
$foldPairs1_16[$hashed] .= " { 0x$code, 0x$map1 },\n";
} elsif (not defined($map3)) {
my $hashed = (($hexxed ^ ($hexxed >> 8)) & ($HASHBUCKETS2_16-1));
#print("// hexxed '$hexxed' hashed2 '$hashed'\n");
$foldPairs2_16[$hashed] .= " { 0x$code, 0x$map1, 0x$map2 },\n";
} else {
my $hashed = (($hexxed ^ ($hexxed >> 8)) & ($HASHBUCKETS3_16-1));
#print("// hexxed '$hexxed' hashed3 '$hashed'\n");
$foldPairs3_16[$hashed] .= " { 0x$code, 0x$map1, 0x$map2, 0x$map3 },\n";
}
}
}
close(FH);

for (my $i = 0; $i < 256; $i++) {
$foldPairs[$i] =~ s/,\n\Z//;
my $str = $foldPairs[$i];
for (my $i = 0; $i < $HASHBUCKETS1_16; $i++) {
$foldPairs1_16[$i] =~ s/,\n\Z//;
my $str = $foldPairs1_16[$i];
next if $str eq '';
my $num = '000' . $i;
$num =~ s/\A.*?(\d\d\d)\Z/$1/;
my $sym = "case_fold1_16_${num}";
print("static const CaseFoldMapping1_16 ${sym}[] = {\n$str\n};\n\n");
}

for (my $i = 0; $i < $HASHBUCKETS1_32; $i++) {
$foldPairs1_32[$i] =~ s/,\n\Z//;
my $str = $foldPairs1_32[$i];
next if $str eq '';
my $num = '000' . $i;
$num =~ s/\A.*?(\d\d\d)\Z/$1/;
my $sym = "case_fold1_32_${num}";
print("static const CaseFoldMapping1_32 ${sym}[] = {\n$str\n};\n\n");
}

for (my $i = 0; $i < $HASHBUCKETS2_16; $i++) {
$foldPairs2_16[$i] =~ s/,\n\Z//;
my $str = $foldPairs2_16[$i];
next if $str eq '';
my $num = '000' . $i;
$num =~ s/\A.*?(\d\d\d)\Z/$1/;
my $sym = "case_fold2_16_${num}";
print("static const CaseFoldMapping2_16 ${sym}[] = {\n$str\n};\n\n");
}

for (my $i = 0; $i < $HASHBUCKETS3_16; $i++) {
$foldPairs3_16[$i] =~ s/,\n\Z//;
my $str = $foldPairs3_16[$i];
next if $str eq '';
my $num = '000' . $i;
$num =~ s/\A.*?(\d\d\d)\Z/$1/;
my $sym = "case_fold_${num}";
print("static const CaseFoldMapping ${sym}[] = {\n$str\n};\n\n");
my $sym = "case_fold3_16_${num}";
print("static const CaseFoldMapping3_16 ${sym}[] = {\n$str\n};\n\n");
}

print("\nstatic const CaseFoldHashBucket case_fold_hash[256] = {\n");
print("static const CaseFoldHashBucket1_16 case_fold_hash1_16[] = {\n");

for (my $i = 0; $i < 256; $i++) {
my $str = $foldPairs[$i];
for (my $i = 0; $i < $HASHBUCKETS1_16; $i++) {
my $str = $foldPairs1_16[$i];
if ($str eq '') {
print(" { 0, NULL },\n");
print(" { NULL, 0 },\n");
} else {
my $num = '000' . $i;
$num =~ s/\A.*?(\d\d\d)\Z/$1/;
my $sym = "case_fold_${num}";
print(" { __PHYSFS_ARRAYLEN($sym), $sym },\n");
my $sym = "case_fold1_16_${num}";
print(" { $sym, __PHYSFS_ARRAYLEN($sym) },\n");
}
}
print("};\n\n");


print("static const CaseFoldHashBucket1_32 case_fold_hash1_32[] = {\n");

for (my $i = 0; $i < $HASHBUCKETS1_32; $i++) {
my $str = $foldPairs1_32[$i];
if ($str eq '') {
print(" { NULL, 0 },\n");
} else {
my $num = '000' . $i;
$num =~ s/\A.*?(\d\d\d)\Z/$1/;
my $sym = "case_fold1_32_${num}";
print(" { $sym, __PHYSFS_ARRAYLEN($sym) },\n");
}
}
print("};\n\n");


print("static const CaseFoldHashBucket2_16 case_fold_hash2_16[] = {\n");

for (my $i = 0; $i < $HASHBUCKETS2_16; $i++) {
my $str = $foldPairs2_16[$i];
if ($str eq '') {
print(" { NULL, 0 },\n");
} else {
my $num = '000' . $i;
$num =~ s/\A.*?(\d\d\d)\Z/$1/;
my $sym = "case_fold2_16_${num}";
print(" { $sym, __PHYSFS_ARRAYLEN($sym) },\n");
}
}
print("};\n\n");

print("static const CaseFoldHashBucket3_16 case_fold_hash3_16[] = {\n");

for (my $i = 0; $i < $HASHBUCKETS3_16; $i++) {
my $str = $foldPairs3_16[$i];
if ($str eq '') {
print(" { NULL, 0 },\n");
} else {
my $num = '000' . $i;
$num =~ s/\A.*?(\d\d\d)\Z/$1/;
my $sym = "case_fold3_16_${num}";
print(" { $sym, __PHYSFS_ARRAYLEN($sym) },\n");
}
}
print("};\n\n");

print <<__EOF__;
#endif /* _INCLUDE_PHYSFS_CASEFOLDING_H_ */
/* end of physfs_casefolding.h ... */
__EOF__

exit 0;

# end of makecashfoldhashtable.pl ...
Expand Down
10 changes: 5 additions & 5 deletions src/physfs.c
Expand Up @@ -891,14 +891,14 @@ static DirHandle *openDirectory(PHYSFS_Io *io, const char *d, int forWriting)
/* Look for archivers with matching file extensions first... */
for (i = archivers; (*i != NULL) && (retval == NULL); i++)
{
if (__PHYSFS_utf8stricmp(ext, (*i)->info.extension) == 0)
if (PHYSFS_utf8stricmp(ext, (*i)->info.extension) == 0)
retval = tryOpenDir(io, *i, d, forWriting);
} /* for */

/* failing an exact file extension match, try all the others... */
for (i = archivers; (*i != NULL) && (retval == NULL); i++)
{
if (__PHYSFS_utf8stricmp(ext, (*i)->info.extension) != 0)
if (PHYSFS_utf8stricmp(ext, (*i)->info.extension) != 0)
retval = tryOpenDir(io, *i, d, forWriting);
} /* for */
} /* if */
Expand Down Expand Up @@ -1442,7 +1442,7 @@ static int doRegisterArchiver(const PHYSFS_Archiver *_archiver)
ext = _archiver->info.extension;
for (i = 0; i < numArchivers; i++)
{
if (__PHYSFS_utf8stricmp(archiveInfo[i]->extension, ext) == 0)
if (PHYSFS_utf8stricmp(archiveInfo[i]->extension, ext) == 0)
BAIL(PHYSFS_ERR_DUPLICATE, 0);
} /* for */

Expand Down Expand Up @@ -1518,7 +1518,7 @@ int PHYSFS_deregisterArchiver(const char *ext)
__PHYSFS_platformGrabMutex(stateLock);
for (i = 0; i < numArchivers; i++)
{
if (__PHYSFS_utf8stricmp(archiveInfo[i]->extension, ext) == 0)
if (PHYSFS_utf8stricmp(archiveInfo[i]->extension, ext) == 0)
{
const int retval = doDeregisterArchiver(i);
__PHYSFS_platformReleaseMutex(stateLock);
Expand Down Expand Up @@ -1921,7 +1921,7 @@ int PHYSFS_setSaneConfig(const char *organization, const char *appName,
if ((l > extlen) && ((*i)[l - extlen - 1] == '.'))
{
ext = (*i) + (l - extlen);
if (__PHYSFS_utf8stricmp(ext, archiveExt) == 0)
if (PHYSFS_utf8stricmp(ext, archiveExt) == 0)
setSaneCfgAddPath(*i, l, dirsep, archivesFirst);
} /* if */
} /* for */
Expand Down
24 changes: 24 additions & 0 deletions src/physfs.h
Expand Up @@ -2521,6 +2521,30 @@ PHYSFS_DECL void PHYSFS_utf8FromLatin1(const char *src, char *dst,

/* Everything above this line is part of the PhysicsFS 2.0 API. */

/**
* \fn int PHYSFS_utf8stricmp(const char *str1, const char *str2)
* \brief Case-insensitive compare of two UTF-8 strings.
*
* This is a strcasecmp/stricmp replacement that expects both strings
* to be in UTF-8 encoding. It will do "case folding" to decide if the
* Unicode codepoints in the strings match.
*
* It will report which string is "greater than" the other, but be aware that
* this doesn't necessarily mean anything: 'a' may be "less than" 'b', but
* a Japanese kuten has no meaningful alphabetically relationship to
* a Greek lambda, but being able to assign a reliable "value" makes sorting
* algorithms possible, if not entirely sane. Most cases should treat the
* return value as "equal" or "not equal".
*
* Like stricmp, this expects both strings to be NULL-terminated.
*
* \param str1 First string to compare.
* \param str2 Second string to compare.
* \return -1 if str1 is "less than" str2, 1 if "greater than", 0 if equal.
*/
PHYSFS_DECL int PHYSFS_utf8stricmp(const char *str1, const char *str2);


/**
* \fn int PHYSFS_unmount(const char *oldDir)
* \brief Remove a directory or archive from the search path.
Expand Down

0 comments on commit d1f2637

Please sign in to comment.