Compare commits

...

1 Commits

Author SHA1 Message Date
Harry Bond
34e9b17c33 [indexer] Use ICU regex to handle unicode characters in mastodon/bluesky domains
Improve the regexes (ICU is required for Unicode character support). Also add tests for umlauts, add the generated "Testing" folder to .gitignore, and clean up url::UrlEncode a bit.
TODO: Android doesn't build yet.
Signed-off-by: Harry Bond <me@hbond.xyz>
2025-07-29 13:36:17 +01:00
5 changed files with 65 additions and 35 deletions

4
.gitignore vendored
View File

@@ -27,6 +27,10 @@ data/edits.xml
data/World.mwm
data/WorldCoasts.mwm
# Generated when running tests. NOT the lowercase 'testing' directory, which must be kept.
Testing
!testing
# Compiled Python
*.pyc

View File

@@ -80,6 +80,7 @@ UNIT_TEST(Url_Encode)
TEST_EQUAL(UrlEncode("%% "), "%25%25%20", ());
TEST_EQUAL(UrlEncode("20"), "20", ());
TEST_EQUAL(UrlEncode("Guinea-Bissau"), "Guinea-Bissau", ());
TEST_EQUAL(UrlEncode("ümlaut"), "%C3%BCmlaut", ());
TEST_EQUAL(UrlEncode(orig1), enc1, ());
TEST_EQUAL(UrlEncode(orig2), enc2, ());
TEST_EQUAL(UrlEncode(orig3), enc3, ());
@@ -98,6 +99,8 @@ UNIT_TEST(Url_Decode)
TEST_EQUAL(UrlDecode(enc3), orig3, ());
TEST_EQUAL(UrlDecode(enc4), orig4, ());
TEST_EQUAL(UrlDecode("123+Main+St,+Seattle,+WA+98101"), "123 Main St, Seattle, WA 98101", ());
TEST_EQUAL(UrlDecode("%C3%BCmlaut"), "ümlaut", ());
}
UNIT_TEST(Url_Invalid)
@@ -127,6 +130,11 @@ UNIT_TEST(Url_Valid)
.Host("www.sandwichparlour.com.au")
.Path("");
TestUrl("https://www.ümlaut.org.de/")
.Scheme("https")
.Host("www.ümlaut.org.de")
.Path("");
TestUrl("cm:/&test").Scheme("cm").Host("&test").Path("");
}

View File

@@ -115,21 +115,19 @@ string Join(string const & lhs, string const & rhs)
string UrlEncode(string const & rawUrl)
{
size_t const count = rawUrl.size();
string result;
result.reserve(count);
result.reserve(rawUrl.size());
for (size_t i = 0; i < count; ++i)
for (unsigned char c : rawUrl)
{
char const c = rawUrl[i];
if (c < '-' || c == '/' || (c > '9' && c < 'A') || (c > 'Z' && c < '_') ||
c == '`' || (c > 'z' && c < '~') || c > '~')
// Allowed URI chars: https://www.ietf.org/rfc/rfc3986.txt
if (isalnum(c) || c == '-' || c == '.' || c == '_' || c == '~')
result += c;
else
{
result += '%';
result += NumToHex(c);
}
else
result += rawUrl[i];
}
return result;

View File

@@ -119,11 +119,17 @@ UNIT_TEST(EditableMapObject_ValidateAndFormat_fediverse)
TEST_EQUAL(osm::ValidateAndFormat_fediverse("@comaps@floss.social.uk"), "comaps@floss.social.uk", ());
TEST_EQUAL(osm::ValidateAndFormat_fediverse("pub.mastodon.org.uk/@comaps"), "comaps@pub.mastodon.org.uk", ());
TEST_EQUAL(osm::ValidateAndFormat_fediverse("pub.mastodon.org.uk/users/@comaps"), "comaps@pub.mastodon.org.uk", ());
TEST_EQUAL(osm::ValidateAndFormat_fediverse("https://bawü.social/@mannheim"), "mannheim@bawü.social", ());
TEST_EQUAL(osm::ValidateAndFormat_fediverse("@mannheim@bawü.social"), "mannheim@bawü.social", ());
TEST_EQUAL(osm::ValidateAndFormat_fediverse("comaps@fosstodon@mastodon.org"), "", ());
TEST_EQUAL(osm::ValidateAndFormat_fediverse("co$maps@mastodon.social"), "", ());
TEST_EQUAL(osm::ValidateAndFormat_fediverse("pub.mastodon.org.uk/comaps"), "", ());
TEST_EQUAL(osm::ValidateAndFormat_fediverse("pub.mastodon.org.uk/users/"), "", ());
TEST_EQUAL(osm::ValidateAndFormat_fediverse("@comaps.org"), "", ());
TEST_EQUAL(osm::ValidateAndFormat_fediverse("@comaps@.org"), "", ());
TEST_EQUAL(osm::ValidateAndFormat_fediverse("@comaps"), "", ());
}
UNIT_TEST(EditableMapObject_ValidateAndFormat_bluesky)
@@ -306,11 +312,16 @@ UNIT_TEST(EditableMapObject_ValidateFediversePage)
TEST(osm::ValidateFediversePage("@comaps@floss.social.uk"), ());
TEST(osm::ValidateFediversePage("pub.mastodon.org.uk/@comaps"), ());
TEST(osm::ValidateFediversePage("pub.mastodon.org.uk/users/@comaps"), ());
TEST(osm::ValidateFediversePage("https://bawü.social/@mannheim"), ());
TEST(osm::ValidateFediversePage("@mannheim@bawü.social"), ());
TEST(!osm::ValidateFediversePage("comaps@floss@mastodon.org"), ());
TEST(!osm::ValidateFediversePage("orga$nicmaps@mastodon.social"), ());
TEST(!osm::ValidateFediversePage("co$maps@mastodon.social"), ());
TEST(!osm::ValidateFediversePage("pub.mastodon.org.uk/comaps"), ());
TEST(!osm::ValidateFediversePage("pub.mastodon.org.uk/users/"), ());
TEST(!osm::ValidateFediversePage("@comaps.org"), ());
TEST(!osm::ValidateFediversePage("@comaps@.org"), ());
TEST(!osm::ValidateFediversePage("@comaps"), ());
}
UNIT_TEST(EditableMapObject_ValidateBlueskyPage)

View File

@@ -3,21 +3,22 @@
#include "coding/url.hpp"
#include "base/string_utils.hpp"
#include <unicode/regex.h>
#include <cstring> // strlen
#include <regex>
namespace osm
{
using namespace std;
static auto const s_instaRegex = regex(R"(^@?[A-Za-z0-9_][A-Za-z0-9_.]{0,28}[A-Za-z0-9_]$)");
static auto const s_twitterRegex = regex(R"(^@?[A-Za-z0-9_]{1,15}$)");
static auto const s_badVkRegex = regex(R"(^\d\d\d.+$)");
static auto const s_goodVkRegex = regex(R"(^[A-Za-z0-9_.]{5,32}$)");
static auto const s_lineRegex = regex(R"(^[a-z0-9-_.]{4,20}$)");
static auto const s_fediverseRegex = regex(R"(^@?[a-zA-Z0-9_]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$)");
static auto const s_blueskyRegex = regex(R"(^@?[A-Za-z0-9-]+(?:\.[A-Za-z0-9-]+)+$)");
// ICU error status handed to every RegexPattern::compile() call below.
// NOTE(review): nothing visible here inspects this value after compilation, and the same
// variable is reused for all patterns — presumably a failure in one compile() would leave
// it in a failing state for the following ones; confirm and consider checking it.
UErrorCode status = U_ZERO_ERROR;
// Precompiled ICU patterns used by the validators in this file. They are held as raw
// pointers; no code visible here releases them.
// Instagram: optional leading '@', then 2-30 characters from [A-Za-z0-9_.] where the first
// and the last character may not be '.'.
icu::RegexPattern* s_instaRegex = icu::RegexPattern::compile(R"(^@?[A-Za-z0-9_][A-Za-z0-9_.]{0,28}[A-Za-z0-9_]$)", 0, status);
// Twitter: optional leading '@', then 1-15 characters from [A-Za-z0-9_].
icu::RegexPattern* s_twitterRegex = icu::RegexPattern::compile(R"(^@?[A-Za-z0-9_]{1,15}$)", 0, status);
// VK (rejected form): three digits followed by at least one more character.
icu::RegexPattern* s_badVkRegex = icu::RegexPattern::compile(R"(^\d\d\d.+$)", 0, status);
// VK (accepted form): 5-32 characters from [A-Za-z0-9_.].
icu::RegexPattern* s_goodVkRegex = icu::RegexPattern::compile(R"(^[A-Za-z0-9_.]{4,20}$)", 0, status);
// LINE: 4-20 characters from lowercase letters, digits, '-', '_' and '.'.
// NOTE(review): the '-' after the "0-9" range is unescaped — confirm ICU treats it as a literal hyphen.
icu::RegexPattern* s_lineRegex = icu::RegexPattern::compile(R"(^[a-z0-9-_.]{4,20}$)", 0, status);
// Fediverse: optional leading '@', ASCII username from [a-zA-Z0-9_], '@', then a domain whose
// first label consists of Unicode letters (\p{L}), Unicode numbers (\p{N}) or '-', followed by
// '.' and one or more letters, numbers, '.' or '-'.
icu::RegexPattern* s_fediverseRegex = icu::RegexPattern::compile(R"(^@?[a-zA-Z0-9_]+@[\p{L}\p{N}-]+\.[\p{L}\p{N}.-]+$)", 0, status);
// Bluesky: optional leading '@', then at least two '.'-separated labels made of Unicode
// letters, Unicode numbers or '-'.
icu::RegexPattern* s_blueskyRegex = icu::RegexPattern::compile(R"(^@?[\p{L}\p{N}-]+(\.[\p{L}\p{N}-]+)+$)", 0, status);
constexpr string_view kFacebook{"contact:facebook"};
constexpr string_view kInstagram{"contact:instagram"};
@@ -59,6 +60,14 @@ constexpr string_view kUrlPanoramax{"https://api.panoramax.xyz/?pic="};
constexpr string_view kHttp{"http://"};
constexpr string_view kHttps{"https://"};
bool icuRegexMatches(const std::string& inputStr, icu::RegexPattern* pattern) {
UErrorCode status = U_ZERO_ERROR;
icu::UnicodeString input(inputStr.c_str(), "UTF-8");
std::unique_ptr<icu::RegexMatcher> matcher(pattern->matcher(input, status));
return U_SUCCESS(status) && matcher->matches(status);
}
size_t GetProtocolNameLength(string const & website)
{
if (website.starts_with(kHttps))
@@ -160,7 +169,7 @@ string ValidateAndFormat_instagram(string const & instagramPage)
return {};
// Check that instagramPage contains valid username.
// Rules are defined here: https://blog.jstassen.com/2016/03/code-regex-for-instagram-username-and-hashtags/
if (regex_match(instagramPage, s_instaRegex))
if (icuRegexMatches(instagramPage, s_instaRegex))
{
if (instagramPage.front() == '@')
return instagramPage.substr(1);
@@ -189,7 +198,7 @@ string ValidateAndFormat_twitter(string const & twitterPage)
return {};
// Check that twitterPage contains valid username.
// Rules took here: https://stackoverflow.com/q/11361044
if (regex_match(twitterPage, s_twitterRegex))
if (icuRegexMatches(twitterPage, s_twitterRegex))
{
if (twitterPage.front() == '@')
return twitterPage.substr(1);
@@ -231,9 +240,9 @@ string ValidateAndFormat_vk(string const & vkPage)
if (vkPageClean.front() == '@')
vkPageClean = vkPageClean.substr(1);
if ((vkPageClean.front() == '_' && vkPageClean.back() == '_') || regex_match(vkPageClean, s_badVkRegex))
if ((vkPageClean.front() == '_' && vkPageClean.back() == '_') || icuRegexMatches(vkPageClean, s_badVkRegex))
return {};
if (regex_match(vkPageClean, s_goodVkRegex))
if (icuRegexMatches(vkPageClean, s_goodVkRegex))
return vkPageClean;
}
if (!ValidateWebsite(vkPage))
@@ -279,7 +288,7 @@ string ValidateAndFormat_contactLine(string const & linePage)
string linePageClean = stripAtSymbol(linePage);
if (regex_match(linePageClean, s_lineRegex))
if (icuRegexMatches(linePageClean, s_lineRegex))
return linePageClean;
}
@@ -339,7 +348,7 @@ string ValidateAndFormat_fediverse(string const & fediPage)
return {};
// Parse {@?}{username}@{domain.name} format
if (regex_match(fediPage, s_fediverseRegex))
if (icuRegexMatches(fediPage, s_fediverseRegex))
return stripAtSymbol(fediPage);
// If it doesn't match the above format, it can only be an URL format.
@@ -368,7 +377,7 @@ string ValidateAndFormat_fediverse(string const & fediPage)
// Then construct the final username@domain.name format
path.append("@").append(parsedDomain);
// and make sure it's valid
if (regex_match(path, s_fediverseRegex))
if (icuRegexMatches(path, s_fediverseRegex))
return path;
else
return {};
@@ -380,7 +389,7 @@ string ValidateAndFormat_bluesky(string const & bskyPage)
return {};
// Try matching {@?}{user/domain.name} format to avoid doing the other stuff
if (regex_match(bskyPage, s_blueskyRegex))
if (icuRegexMatches(bskyPage, s_blueskyRegex))
return stripAtSymbol(bskyPage);
// If not, it must match the URL format
@@ -398,7 +407,7 @@ string ValidateAndFormat_bluesky(string const & bskyPage)
path.erase(path.find_last_not_of('/') + 1); // Strip last '/' symbol if exists
// Then make sure it matches {@?}{user/domain.name}
if (regex_match(path, s_blueskyRegex))
if (icuRegexMatches(path, s_blueskyRegex))
return stripAtSymbol(path);
}
}
@@ -458,7 +467,7 @@ bool ValidateInstagramPage(string const & page)
return true;
// Rules are defined here: https://blog.jstassen.com/2016/03/code-regex-for-instagram-username-and-hashtags/
if (regex_match(page, s_instaRegex))
if (icuRegexMatches(page, s_instaRegex))
return true;
if (!ValidateWebsite(page))
@@ -474,7 +483,7 @@ bool ValidateTwitterPage(string const & page)
return true;
if (!ValidateWebsite(page))
return regex_match(page, s_twitterRegex); // Rules are defined here: https://stackoverflow.com/q/11361044
return icuRegexMatches(page, s_twitterRegex); // Rules are defined here: https://stackoverflow.com/q/11361044
string const domain = strings::MakeLowerCase(url::Url::FromString(page).GetHost());
return domain == kXCom || domain.ends_with(kDotXCom) || domain == kTwitterCom || domain.ends_with(kDotTwitterCom);
@@ -500,9 +509,9 @@ bool ValidateVkPage(string const & page)
if (vkLogin.front() == '@')
vkLogin = vkLogin.substr(1);
if ((vkLogin.front() == '_' && vkLogin.back() == '_') || regex_match(vkLogin, s_badVkRegex))
if ((vkLogin.front() == '_' && vkLogin.back() == '_') || icuRegexMatches(vkLogin, s_badVkRegex))
return false;
if (regex_match(vkLogin, s_goodVkRegex))
if (icuRegexMatches(vkLogin, s_goodVkRegex))
return true;
}
@@ -525,7 +534,7 @@ bool ValidateLinePage(string const & page)
// The page name must be between 4 and 20 characters. Should contain alphanumeric characters
// and symbols '.', '-', and '_'
if (regex_match(stripAtSymbol(page), s_lineRegex))
if (icuRegexMatches(stripAtSymbol(page), s_lineRegex))
return true;
}
@@ -543,7 +552,7 @@ bool ValidateFediversePage(string const & page)
return true;
// Match @username@instance.name format
if (regex_match(page, s_fediverseRegex))
if (icuRegexMatches(page, s_fediverseRegex))
return true;
// If it doesn't match the above format, it can only be an URL format.
@@ -572,7 +581,7 @@ bool ValidateFediversePage(string const & page)
// Then construct the username@domain.name format
path.append("@").append(domain);
// And return if it's valid or not
return regex_match(path, s_fediverseRegex);
return icuRegexMatches(path, s_fediverseRegex);
}
bool ValidateBlueskyPage(string const & page)
@@ -582,7 +591,7 @@ bool ValidateBlueskyPage(string const & page)
return true;
// Match {@?}{user/domain.name} format
if (regex_match(page, s_blueskyRegex))
if (icuRegexMatches(page, s_blueskyRegex))
return true;
// Has to be an url format now
@@ -600,7 +609,7 @@ bool ValidateBlueskyPage(string const & page)
path.erase(0, 8); // Strip "profile/" part
path.erase(path.find_last_not_of('/') + 1); // Strip last '/' symbol if exists
// Then try to parse the remaining text as a username again
if (regex_match(path, s_blueskyRegex))
if (icuRegexMatches(path, s_blueskyRegex))
return true;
}