Compare commits

...

1 Commits

Author SHA1 Message Date
Harry Bond
34e9b17c33 [indexer] Use ICU regex to handle unicode characters in mastodon/bluesky domains
Improve the regex (ICU is required for Unicode character support). Also add tests for umlauts, add the generated "Testing" folder to .gitignore, and clean up url::UrlEncode a bit.
TODO: Android doesn't build yet.
Signed-off-by: Harry Bond <me@hbond.xyz>
2025-07-29 13:36:17 +01:00
5 changed files with 65 additions and 35 deletions

4
.gitignore vendored
View File

@@ -27,6 +27,10 @@ data/edits.xml
data/World.mwm data/World.mwm
data/WorldCoasts.mwm data/WorldCoasts.mwm
# generated when running tests. NOT the lowercase 'testing' which should be kept.
Testing
!testing
# Compiled Python # Compiled Python
*.pyc *.pyc

View File

@@ -80,6 +80,7 @@ UNIT_TEST(Url_Encode)
TEST_EQUAL(UrlEncode("%% "), "%25%25%20", ()); TEST_EQUAL(UrlEncode("%% "), "%25%25%20", ());
TEST_EQUAL(UrlEncode("20"), "20", ()); TEST_EQUAL(UrlEncode("20"), "20", ());
TEST_EQUAL(UrlEncode("Guinea-Bissau"), "Guinea-Bissau", ()); TEST_EQUAL(UrlEncode("Guinea-Bissau"), "Guinea-Bissau", ());
TEST_EQUAL(UrlEncode("ümlaut"), "%C3%BCmlaut", ());
TEST_EQUAL(UrlEncode(orig1), enc1, ()); TEST_EQUAL(UrlEncode(orig1), enc1, ());
TEST_EQUAL(UrlEncode(orig2), enc2, ()); TEST_EQUAL(UrlEncode(orig2), enc2, ());
TEST_EQUAL(UrlEncode(orig3), enc3, ()); TEST_EQUAL(UrlEncode(orig3), enc3, ());
@@ -98,6 +99,8 @@ UNIT_TEST(Url_Decode)
TEST_EQUAL(UrlDecode(enc3), orig3, ()); TEST_EQUAL(UrlDecode(enc3), orig3, ());
TEST_EQUAL(UrlDecode(enc4), orig4, ()); TEST_EQUAL(UrlDecode(enc4), orig4, ());
TEST_EQUAL(UrlDecode("123+Main+St,+Seattle,+WA+98101"), "123 Main St, Seattle, WA 98101", ()); TEST_EQUAL(UrlDecode("123+Main+St,+Seattle,+WA+98101"), "123 Main St, Seattle, WA 98101", ());
TEST_EQUAL(UrlDecode("%C3%BCmlaut"), "ümlaut", ());
} }
UNIT_TEST(Url_Invalid) UNIT_TEST(Url_Invalid)
@@ -127,6 +130,11 @@ UNIT_TEST(Url_Valid)
.Host("www.sandwichparlour.com.au") .Host("www.sandwichparlour.com.au")
.Path(""); .Path("");
TestUrl("https://www.ümlaut.org.de/")
.Scheme("https")
.Host("www.ümlaut.org.de")
.Path("");
TestUrl("cm:/&test").Scheme("cm").Host("&test").Path(""); TestUrl("cm:/&test").Scheme("cm").Host("&test").Path("");
} }

View File

@@ -115,21 +115,19 @@ string Join(string const & lhs, string const & rhs)
string UrlEncode(string const & rawUrl) string UrlEncode(string const & rawUrl)
{ {
size_t const count = rawUrl.size();
string result; string result;
result.reserve(count); result.reserve(rawUrl.size());
for (size_t i = 0; i < count; ++i) for (unsigned char c : rawUrl)
{ {
char const c = rawUrl[i]; // Allowed URI chars: https://www.ietf.org/rfc/rfc3986.txt
if (c < '-' || c == '/' || (c > '9' && c < 'A') || (c > 'Z' && c < '_') || if (isalnum(c) || c == '-' || c == '.' || c == '_' || c == '~')
c == '`' || (c > 'z' && c < '~') || c > '~') result += c;
else
{ {
result += '%'; result += '%';
result += NumToHex(c); result += NumToHex(c);
} }
else
result += rawUrl[i];
} }
return result; return result;

View File

@@ -119,11 +119,17 @@ UNIT_TEST(EditableMapObject_ValidateAndFormat_fediverse)
TEST_EQUAL(osm::ValidateAndFormat_fediverse("@comaps@floss.social.uk"), "comaps@floss.social.uk", ()); TEST_EQUAL(osm::ValidateAndFormat_fediverse("@comaps@floss.social.uk"), "comaps@floss.social.uk", ());
TEST_EQUAL(osm::ValidateAndFormat_fediverse("pub.mastodon.org.uk/@comaps"), "comaps@pub.mastodon.org.uk", ()); TEST_EQUAL(osm::ValidateAndFormat_fediverse("pub.mastodon.org.uk/@comaps"), "comaps@pub.mastodon.org.uk", ());
TEST_EQUAL(osm::ValidateAndFormat_fediverse("pub.mastodon.org.uk/users/@comaps"), "comaps@pub.mastodon.org.uk", ()); TEST_EQUAL(osm::ValidateAndFormat_fediverse("pub.mastodon.org.uk/users/@comaps"), "comaps@pub.mastodon.org.uk", ());
TEST_EQUAL(osm::ValidateAndFormat_fediverse("https://bawü.social/@mannheim"), "mannheim@bawü.social", ());
TEST_EQUAL(osm::ValidateAndFormat_fediverse("@mannheim@bawü.social"), "mannheim@bawü.social", ());
TEST_EQUAL(osm::ValidateAndFormat_fediverse("comaps@fosstodon@mastodon.org"), "", ()); TEST_EQUAL(osm::ValidateAndFormat_fediverse("comaps@fosstodon@mastodon.org"), "", ());
TEST_EQUAL(osm::ValidateAndFormat_fediverse("co$maps@mastodon.social"), "", ()); TEST_EQUAL(osm::ValidateAndFormat_fediverse("co$maps@mastodon.social"), "", ());
TEST_EQUAL(osm::ValidateAndFormat_fediverse("pub.mastodon.org.uk/comaps"), "", ()); TEST_EQUAL(osm::ValidateAndFormat_fediverse("pub.mastodon.org.uk/comaps"), "", ());
TEST_EQUAL(osm::ValidateAndFormat_fediverse("pub.mastodon.org.uk/users/"), "", ()); TEST_EQUAL(osm::ValidateAndFormat_fediverse("pub.mastodon.org.uk/users/"), "", ());
TEST_EQUAL(osm::ValidateAndFormat_fediverse("@comaps.org"), "", ());
TEST_EQUAL(osm::ValidateAndFormat_fediverse("@comaps@.org"), "", ());
TEST_EQUAL(osm::ValidateAndFormat_fediverse("@comaps"), "", ());
} }
UNIT_TEST(EditableMapObject_ValidateAndFormat_bluesky) UNIT_TEST(EditableMapObject_ValidateAndFormat_bluesky)
@@ -306,11 +312,16 @@ UNIT_TEST(EditableMapObject_ValidateFediversePage)
TEST(osm::ValidateFediversePage("@comaps@floss.social.uk"), ()); TEST(osm::ValidateFediversePage("@comaps@floss.social.uk"), ());
TEST(osm::ValidateFediversePage("pub.mastodon.org.uk/@comaps"), ()); TEST(osm::ValidateFediversePage("pub.mastodon.org.uk/@comaps"), ());
TEST(osm::ValidateFediversePage("pub.mastodon.org.uk/users/@comaps"), ()); TEST(osm::ValidateFediversePage("pub.mastodon.org.uk/users/@comaps"), ());
TEST(osm::ValidateFediversePage("https://bawü.social/@mannheim"), ());
TEST(osm::ValidateFediversePage("@mannheim@bawü.social"), ());
TEST(!osm::ValidateFediversePage("comaps@floss@mastodon.org"), ()); TEST(!osm::ValidateFediversePage("comaps@floss@mastodon.org"), ());
TEST(!osm::ValidateFediversePage("orga$nicmaps@mastodon.social"), ()); TEST(!osm::ValidateFediversePage("co$maps@mastodon.social"), ());
TEST(!osm::ValidateFediversePage("pub.mastodon.org.uk/comaps"), ()); TEST(!osm::ValidateFediversePage("pub.mastodon.org.uk/comaps"), ());
TEST(!osm::ValidateFediversePage("pub.mastodon.org.uk/users/"), ()); TEST(!osm::ValidateFediversePage("pub.mastodon.org.uk/users/"), ());
TEST(!osm::ValidateFediversePage("@comaps.org"), ());
TEST(!osm::ValidateFediversePage("@comaps@.org"), ());
TEST(!osm::ValidateFediversePage("@comaps"), ());
} }
UNIT_TEST(EditableMapObject_ValidateBlueskyPage) UNIT_TEST(EditableMapObject_ValidateBlueskyPage)

View File

@@ -3,21 +3,22 @@
#include "coding/url.hpp" #include "coding/url.hpp"
#include "base/string_utils.hpp" #include "base/string_utils.hpp"
#include <unicode/regex.h>
#include <cstring> // strlen #include <cstring> // strlen
#include <regex>
namespace osm namespace osm
{ {
using namespace std; using namespace std;
static auto const s_instaRegex = regex(R"(^@?[A-Za-z0-9_][A-Za-z0-9_.]{0,28}[A-Za-z0-9_]$)"); UErrorCode status = U_ZERO_ERROR;
static auto const s_twitterRegex = regex(R"(^@?[A-Za-z0-9_]{1,15}$)"); icu::RegexPattern* s_instaRegex = icu::RegexPattern::compile(R"(^@?[A-Za-z0-9_][A-Za-z0-9_.]{0,28}[A-Za-z0-9_]$)", 0, status);
static auto const s_badVkRegex = regex(R"(^\d\d\d.+$)"); icu::RegexPattern* s_twitterRegex = icu::RegexPattern::compile(R"(^@?[A-Za-z0-9_]{1,15}$)", 0, status);
static auto const s_goodVkRegex = regex(R"(^[A-Za-z0-9_.]{5,32}$)"); icu::RegexPattern* s_badVkRegex = icu::RegexPattern::compile(R"(^\d\d\d.+$)", 0, status);
static auto const s_lineRegex = regex(R"(^[a-z0-9-_.]{4,20}$)"); icu::RegexPattern* s_goodVkRegex = icu::RegexPattern::compile(R"(^[A-Za-z0-9_.]{5,32}$)", 0, status);
static auto const s_fediverseRegex = regex(R"(^@?[a-zA-Z0-9_]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$)"); icu::RegexPattern* s_lineRegex = icu::RegexPattern::compile(R"(^[a-z0-9-_.]{4,20}$)", 0, status);
static auto const s_blueskyRegex = regex(R"(^@?[A-Za-z0-9-]+(?:\.[A-Za-z0-9-]+)+$)"); icu::RegexPattern* s_fediverseRegex = icu::RegexPattern::compile(R"(^@?[a-zA-Z0-9_]+@[\p{L}\p{N}-]+\.[\p{L}\p{N}.-]+$)", 0, status);
icu::RegexPattern* s_blueskyRegex = icu::RegexPattern::compile(R"(^@?[\p{L}\p{N}-]+(\.[\p{L}\p{N}-]+)+$)", 0, status);
constexpr string_view kFacebook{"contact:facebook"}; constexpr string_view kFacebook{"contact:facebook"};
constexpr string_view kInstagram{"contact:instagram"}; constexpr string_view kInstagram{"contact:instagram"};
@@ -59,6 +60,14 @@ constexpr string_view kUrlPanoramax{"https://api.panoramax.xyz/?pic="};
constexpr string_view kHttp{"http://"}; constexpr string_view kHttp{"http://"};
constexpr string_view kHttps{"https://"}; constexpr string_view kHttps{"https://"};
bool icuRegexMatches(const std::string& inputStr, icu::RegexPattern* pattern) {
UErrorCode status = U_ZERO_ERROR;
icu::UnicodeString input(inputStr.c_str(), "UTF-8");
std::unique_ptr<icu::RegexMatcher> matcher(pattern->matcher(input, status));
return U_SUCCESS(status) && matcher->matches(status);
}
size_t GetProtocolNameLength(string const & website) size_t GetProtocolNameLength(string const & website)
{ {
if (website.starts_with(kHttps)) if (website.starts_with(kHttps))
@@ -160,7 +169,7 @@ string ValidateAndFormat_instagram(string const & instagramPage)
return {}; return {};
// Check that instagramPage contains valid username. // Check that instagramPage contains valid username.
// Rules are defined here: https://blog.jstassen.com/2016/03/code-regex-for-instagram-username-and-hashtags/ // Rules are defined here: https://blog.jstassen.com/2016/03/code-regex-for-instagram-username-and-hashtags/
if (regex_match(instagramPage, s_instaRegex)) if (icuRegexMatches(instagramPage, s_instaRegex))
{ {
if (instagramPage.front() == '@') if (instagramPage.front() == '@')
return instagramPage.substr(1); return instagramPage.substr(1);
@@ -189,7 +198,7 @@ string ValidateAndFormat_twitter(string const & twitterPage)
return {}; return {};
// Check that twitterPage contains valid username. // Check that twitterPage contains valid username.
// Rules took here: https://stackoverflow.com/q/11361044 // Rules took here: https://stackoverflow.com/q/11361044
if (regex_match(twitterPage, s_twitterRegex)) if (icuRegexMatches(twitterPage, s_twitterRegex))
{ {
if (twitterPage.front() == '@') if (twitterPage.front() == '@')
return twitterPage.substr(1); return twitterPage.substr(1);
@@ -231,9 +240,9 @@ string ValidateAndFormat_vk(string const & vkPage)
if (vkPageClean.front() == '@') if (vkPageClean.front() == '@')
vkPageClean = vkPageClean.substr(1); vkPageClean = vkPageClean.substr(1);
if ((vkPageClean.front() == '_' && vkPageClean.back() == '_') || regex_match(vkPageClean, s_badVkRegex)) if ((vkPageClean.front() == '_' && vkPageClean.back() == '_') || icuRegexMatches(vkPageClean, s_badVkRegex))
return {}; return {};
if (regex_match(vkPageClean, s_goodVkRegex)) if (icuRegexMatches(vkPageClean, s_goodVkRegex))
return vkPageClean; return vkPageClean;
} }
if (!ValidateWebsite(vkPage)) if (!ValidateWebsite(vkPage))
@@ -279,7 +288,7 @@ string ValidateAndFormat_contactLine(string const & linePage)
string linePageClean = stripAtSymbol(linePage); string linePageClean = stripAtSymbol(linePage);
if (regex_match(linePageClean, s_lineRegex)) if (icuRegexMatches(linePageClean, s_lineRegex))
return linePageClean; return linePageClean;
} }
@@ -339,7 +348,7 @@ string ValidateAndFormat_fediverse(string const & fediPage)
return {}; return {};
// Parse {@?}{username}@{domain.name} format // Parse {@?}{username}@{domain.name} format
if (regex_match(fediPage, s_fediverseRegex)) if (icuRegexMatches(fediPage, s_fediverseRegex))
return stripAtSymbol(fediPage); return stripAtSymbol(fediPage);
// If it doesn't match the above format, it can only be an URL format. // If it doesn't match the above format, it can only be an URL format.
@@ -368,7 +377,7 @@ string ValidateAndFormat_fediverse(string const & fediPage)
// Then construct the final username@domain.name format // Then construct the final username@domain.name format
path.append("@").append(parsedDomain); path.append("@").append(parsedDomain);
// and make sure it's valid // and make sure it's valid
if (regex_match(path, s_fediverseRegex)) if (icuRegexMatches(path, s_fediverseRegex))
return path; return path;
else else
return {}; return {};
@@ -380,7 +389,7 @@ string ValidateAndFormat_bluesky(string const & bskyPage)
return {}; return {};
// Try matching {@?}{user/domain.name} format to avoid doing the other stuff // Try matching {@?}{user/domain.name} format to avoid doing the other stuff
if (regex_match(bskyPage, s_blueskyRegex)) if (icuRegexMatches(bskyPage, s_blueskyRegex))
return stripAtSymbol(bskyPage); return stripAtSymbol(bskyPage);
// If not, it must match the URL format // If not, it must match the URL format
@@ -398,7 +407,7 @@ string ValidateAndFormat_bluesky(string const & bskyPage)
path.erase(path.find_last_not_of('/') + 1); // Strip last '/' symbol if exists path.erase(path.find_last_not_of('/') + 1); // Strip last '/' symbol if exists
// Then make sure it matches {@?}{user/domain.name} // Then make sure it matches {@?}{user/domain.name}
if (regex_match(path, s_blueskyRegex)) if (icuRegexMatches(path, s_blueskyRegex))
return stripAtSymbol(path); return stripAtSymbol(path);
} }
} }
@@ -458,7 +467,7 @@ bool ValidateInstagramPage(string const & page)
return true; return true;
// Rules are defined here: https://blog.jstassen.com/2016/03/code-regex-for-instagram-username-and-hashtags/ // Rules are defined here: https://blog.jstassen.com/2016/03/code-regex-for-instagram-username-and-hashtags/
if (regex_match(page, s_instaRegex)) if (icuRegexMatches(page, s_instaRegex))
return true; return true;
if (!ValidateWebsite(page)) if (!ValidateWebsite(page))
@@ -474,7 +483,7 @@ bool ValidateTwitterPage(string const & page)
return true; return true;
if (!ValidateWebsite(page)) if (!ValidateWebsite(page))
return regex_match(page, s_twitterRegex); // Rules are defined here: https://stackoverflow.com/q/11361044 return icuRegexMatches(page, s_twitterRegex); // Rules are defined here: https://stackoverflow.com/q/11361044
string const domain = strings::MakeLowerCase(url::Url::FromString(page).GetHost()); string const domain = strings::MakeLowerCase(url::Url::FromString(page).GetHost());
return domain == kXCom || domain.ends_with(kDotXCom) || domain == kTwitterCom || domain.ends_with(kDotTwitterCom); return domain == kXCom || domain.ends_with(kDotXCom) || domain == kTwitterCom || domain.ends_with(kDotTwitterCom);
@@ -500,9 +509,9 @@ bool ValidateVkPage(string const & page)
if (vkLogin.front() == '@') if (vkLogin.front() == '@')
vkLogin = vkLogin.substr(1); vkLogin = vkLogin.substr(1);
if ((vkLogin.front() == '_' && vkLogin.back() == '_') || regex_match(vkLogin, s_badVkRegex)) if ((vkLogin.front() == '_' && vkLogin.back() == '_') || icuRegexMatches(vkLogin, s_badVkRegex))
return false; return false;
if (regex_match(vkLogin, s_goodVkRegex)) if (icuRegexMatches(vkLogin, s_goodVkRegex))
return true; return true;
} }
@@ -525,7 +534,7 @@ bool ValidateLinePage(string const & page)
// The page name must be between 4 and 20 characters. Should contain alphanumeric characters // The page name must be between 4 and 20 characters. Should contain alphanumeric characters
// and symbols '.', '-', and '_' // and symbols '.', '-', and '_'
if (regex_match(stripAtSymbol(page), s_lineRegex)) if (icuRegexMatches(stripAtSymbol(page), s_lineRegex))
return true; return true;
} }
@@ -543,7 +552,7 @@ bool ValidateFediversePage(string const & page)
return true; return true;
// Match @username@instance.name format // Match @username@instance.name format
if (regex_match(page, s_fediverseRegex)) if (icuRegexMatches(page, s_fediverseRegex))
return true; return true;
// If it doesn't match the above format, it can only be an URL format. // If it doesn't match the above format, it can only be an URL format.
@@ -572,7 +581,7 @@ bool ValidateFediversePage(string const & page)
// Then construct the username@domain.name format // Then construct the username@domain.name format
path.append("@").append(domain); path.append("@").append(domain);
// And return if it's valid or not // And return if it's valid or not
return regex_match(path, s_fediverseRegex); return icuRegexMatches(path, s_fediverseRegex);
} }
bool ValidateBlueskyPage(string const & page) bool ValidateBlueskyPage(string const & page)
@@ -582,7 +591,7 @@ bool ValidateBlueskyPage(string const & page)
return true; return true;
// Match {@?}{user/domain.name} format // Match {@?}{user/domain.name} format
if (regex_match(page, s_blueskyRegex)) if (icuRegexMatches(page, s_blueskyRegex))
return true; return true;
// Has to be an url format now // Has to be an url format now
@@ -600,7 +609,7 @@ bool ValidateBlueskyPage(string const & page)
path.erase(0, 8); // Strip "profile/" part path.erase(0, 8); // Strip "profile/" part
path.erase(path.find_last_not_of('/') + 1); // Strip last '/' symbol if exists path.erase(path.find_last_not_of('/') + 1); // Strip last '/' symbol if exists
// Then try to parse the remaining text as a username again // Then try to parse the remaining text as a username again
if (regex_match(path, s_blueskyRegex)) if (icuRegexMatches(path, s_blueskyRegex))
return true; return true;
} }