[indexer] Use ICU regex to handle unicode characters in mastodon/bluesky domains

Improve the regex (ICU is required for Unicode character support). Also add tests for umlauts, add the generated "Testing" folder to .gitignore, and clean up url::UrlEncode a bit.
TODO: Android doesn't build yet.
Signed-off-by: Harry Bond <me@hbond.xyz>
This commit is contained in:
Harry Bond
2025-07-26 17:12:37 +01:00
parent 8174eac134
commit 34e9b17c33
5 changed files with 65 additions and 35 deletions

View File

@@ -115,21 +115,19 @@ string Join(string const & lhs, string const & rhs)
string UrlEncode(string const & rawUrl)
{
size_t const count = rawUrl.size();
string result;
result.reserve(count);
result.reserve(rawUrl.size());
for (size_t i = 0; i < count; ++i)
for (unsigned char c : rawUrl)
{
char const c = rawUrl[i];
if (c < '-' || c == '/' || (c > '9' && c < 'A') || (c > 'Z' && c < '_') ||
c == '`' || (c > 'z' && c < '~') || c > '~')
// Allowed URI chars: https://www.ietf.org/rfc/rfc3986.txt
if (isalnum(c) || c == '-' || c == '.' || c == '_' || c == '~')
result += c;
else
{
result += '%';
result += NumToHex(c);
}
else
result += rawUrl[i];
}
return result;