[indexer] Use ICU regex to handle Unicode characters in Mastodon/Bluesky domains

Improve the regex (ICU is required for Unicode character support). Also add tests for umlauts, add the generated "Testing" folder to .gitignore, and clean up url::UrlEncode a bit.
TODO: Android doesn't build.
Signed-off-by: Harry Bond <me@hbond.xyz>
This commit is contained in:
Harry Bond
2025-07-26 17:12:37 +01:00
parent 8174eac134
commit 34e9b17c33
5 changed files with 65 additions and 35 deletions

View File

@@ -80,6 +80,7 @@ UNIT_TEST(Url_Encode)
TEST_EQUAL(UrlEncode("%% "), "%25%25%20", ());
TEST_EQUAL(UrlEncode("20"), "20", ());
TEST_EQUAL(UrlEncode("Guinea-Bissau"), "Guinea-Bissau", ());
TEST_EQUAL(UrlEncode("ümlaut"), "%C3%BCmlaut", ());
TEST_EQUAL(UrlEncode(orig1), enc1, ());
TEST_EQUAL(UrlEncode(orig2), enc2, ());
TEST_EQUAL(UrlEncode(orig3), enc3, ());
@@ -98,6 +99,8 @@ UNIT_TEST(Url_Decode)
TEST_EQUAL(UrlDecode(enc3), orig3, ());
TEST_EQUAL(UrlDecode(enc4), orig4, ());
TEST_EQUAL(UrlDecode("123+Main+St,+Seattle,+WA+98101"), "123 Main St, Seattle, WA 98101", ());
TEST_EQUAL(UrlDecode("%C3%BCmlaut"), "ümlaut", ());
}
UNIT_TEST(Url_Invalid)
@@ -127,6 +130,11 @@ UNIT_TEST(Url_Valid)
.Host("www.sandwichparlour.com.au")
.Path("");
TestUrl("https://www.ümlaut.org.de/")
.Scheme("https")
.Host("www.ümlaut.org.de")
.Path("");
TestUrl("cm:/&test").Scheme("cm").Host("&test").Path("");
}

View File

@@ -115,21 +115,19 @@ string Join(string const & lhs, string const & rhs)
string UrlEncode(string const & rawUrl)
{
size_t const count = rawUrl.size();
string result;
result.reserve(count);
result.reserve(rawUrl.size());
for (size_t i = 0; i < count; ++i)
for (unsigned char c : rawUrl)
{
char const c = rawUrl[i];
if (c < '-' || c == '/' || (c > '9' && c < 'A') || (c > 'Z' && c < '_') ||
c == '`' || (c > 'z' && c < '~') || c > '~')
// Allowed URI chars: https://www.ietf.org/rfc/rfc3986.txt
if (isalnum(c) || c == '-' || c == '.' || c == '_' || c == '~')
result += c;
else
{
result += '%';
result += NumToHex(c);
}
else
result += rawUrl[i];
}
return result;