diff --git a/.gitignore b/.gitignore index f3b1d23d8..106c818af 100644 --- a/.gitignore +++ b/.gitignore @@ -27,6 +27,10 @@ data/edits.xml data/World.mwm data/WorldCoasts.mwm +# generated when running tests. NOT the lowercase 'testing' which should be kept. +Testing +!testing + # Compiled Python *.pyc diff --git a/coding/coding_tests/url_tests.cpp b/coding/coding_tests/url_tests.cpp index f92afab51..a984f3e97 100644 --- a/coding/coding_tests/url_tests.cpp +++ b/coding/coding_tests/url_tests.cpp @@ -80,6 +80,7 @@ UNIT_TEST(Url_Encode) TEST_EQUAL(UrlEncode("%% "), "%25%25%20", ()); TEST_EQUAL(UrlEncode("20"), "20", ()); TEST_EQUAL(UrlEncode("Guinea-Bissau"), "Guinea-Bissau", ()); + TEST_EQUAL(UrlEncode("ümlaut"), "%C3%BCmlaut", ()); TEST_EQUAL(UrlEncode(orig1), enc1, ()); TEST_EQUAL(UrlEncode(orig2), enc2, ()); TEST_EQUAL(UrlEncode(orig3), enc3, ()); @@ -98,6 +99,8 @@ UNIT_TEST(Url_Decode) TEST_EQUAL(UrlDecode(enc3), orig3, ()); TEST_EQUAL(UrlDecode(enc4), orig4, ()); TEST_EQUAL(UrlDecode("123+Main+St,+Seattle,+WA+98101"), "123 Main St, Seattle, WA 98101", ()); + TEST_EQUAL(UrlDecode("%C3%BCmlaut"), "ümlaut", ()); + } UNIT_TEST(Url_Invalid) @@ -127,6 +130,11 @@ UNIT_TEST(Url_Valid) .Host("www.sandwichparlour.com.au") .Path(""); + TestUrl("https://www.ümlaut.org.de/") + .Scheme("https") + .Host("www.ümlaut.org.de") + .Path(""); + TestUrl("cm:/&test").Scheme("cm").Host("&test").Path(""); } diff --git a/coding/url.cpp b/coding/url.cpp index 4613e2f03..c4c4e5263 100644 --- a/coding/url.cpp +++ b/coding/url.cpp @@ -115,21 +115,19 @@ string Join(string const & lhs, string const & rhs) string UrlEncode(string const & rawUrl) { - size_t const count = rawUrl.size(); string result; - result.reserve(count); + result.reserve(rawUrl.size()); - for (size_t i = 0; i < count; ++i) + for (unsigned char c : rawUrl) { - char const c = rawUrl[i]; - if (c < '-' || c == '/' || (c > '9' && c < 'A') || (c > 'Z' && c < '_') || - c == '`' || (c > 'z' && c < '~') || c > '~') + // Allowed URI 
chars: https://www.ietf.org/rfc/rfc3986.txt + if (isalnum(c) || c == '-' || c == '.' || c == '_' || c == '~') + result += c; + else { result += '%'; result += NumToHex(c); } - else - result += rawUrl[i]; } return result; diff --git a/indexer/indexer_tests/validate_and_format_contacts_test.cpp b/indexer/indexer_tests/validate_and_format_contacts_test.cpp index 0c8d041dd..3d3b3676d 100644 --- a/indexer/indexer_tests/validate_and_format_contacts_test.cpp +++ b/indexer/indexer_tests/validate_and_format_contacts_test.cpp @@ -119,11 +119,17 @@ UNIT_TEST(EditableMapObject_ValidateAndFormat_fediverse) TEST_EQUAL(osm::ValidateAndFormat_fediverse("@comaps@floss.social.uk"), "comaps@floss.social.uk", ()); TEST_EQUAL(osm::ValidateAndFormat_fediverse("pub.mastodon.org.uk/@comaps"), "comaps@pub.mastodon.org.uk", ()); TEST_EQUAL(osm::ValidateAndFormat_fediverse("pub.mastodon.org.uk/users/@comaps"), "comaps@pub.mastodon.org.uk", ()); + TEST_EQUAL(osm::ValidateAndFormat_fediverse("https://bawü.social/@mannheim"), "mannheim@bawü.social", ()); + TEST_EQUAL(osm::ValidateAndFormat_fediverse("@mannheim@bawü.social"), "mannheim@bawü.social", ()); + TEST_EQUAL(osm::ValidateAndFormat_fediverse("comaps@fosstodon@mastodon.org"), "", ()); TEST_EQUAL(osm::ValidateAndFormat_fediverse("co$maps@mastodon.social"), "", ()); TEST_EQUAL(osm::ValidateAndFormat_fediverse("pub.mastodon.org.uk/comaps"), "", ()); TEST_EQUAL(osm::ValidateAndFormat_fediverse("pub.mastodon.org.uk/users/"), "", ()); + TEST_EQUAL(osm::ValidateAndFormat_fediverse("@comaps.org"), "", ()); + TEST_EQUAL(osm::ValidateAndFormat_fediverse("@comaps@.org"), "", ()); + TEST_EQUAL(osm::ValidateAndFormat_fediverse("@comaps"), "", ()); } UNIT_TEST(EditableMapObject_ValidateAndFormat_bluesky) @@ -306,11 +312,16 @@ UNIT_TEST(EditableMapObject_ValidateFediversePage) TEST(osm::ValidateFediversePage("@comaps@floss.social.uk"), ()); TEST(osm::ValidateFediversePage("pub.mastodon.org.uk/@comaps"), ()); 
TEST(osm::ValidateFediversePage("pub.mastodon.org.uk/users/@comaps"), ()); + TEST(osm::ValidateFediversePage("https://bawü.social/@mannheim"), ()); + TEST(osm::ValidateFediversePage("@mannheim@bawü.social"), ()); TEST(!osm::ValidateFediversePage("comaps@floss@mastodon.org"), ()); - TEST(!osm::ValidateFediversePage("orga$nicmaps@mastodon.social"), ()); + TEST(!osm::ValidateFediversePage("co$maps@mastodon.social"), ()); TEST(!osm::ValidateFediversePage("pub.mastodon.org.uk/comaps"), ()); TEST(!osm::ValidateFediversePage("pub.mastodon.org.uk/users/"), ()); + TEST(!osm::ValidateFediversePage("@comaps.org"), ()); + TEST(!osm::ValidateFediversePage("@comaps@.org"), ()); + TEST(!osm::ValidateFediversePage("@comaps"), ()); } UNIT_TEST(EditableMapObject_ValidateBlueskyPage) diff --git a/indexer/validate_and_format_contacts.cpp b/indexer/validate_and_format_contacts.cpp index 96a4609a2..525d434f8 100644 --- a/indexer/validate_and_format_contacts.cpp +++ b/indexer/validate_and_format_contacts.cpp @@ -3,21 +3,22 @@ #include "coding/url.hpp" #include "base/string_utils.hpp" +#include <unicode/regex.h> #include <cstring> // strlen -#include <regex> namespace osm { using namespace std; -static auto const s_instaRegex = regex(R"(^@?[A-Za-z0-9_][A-Za-z0-9_.]{0,28}[A-Za-z0-9_]$)"); -static auto const s_twitterRegex = regex(R"(^@?[A-Za-z0-9_]{1,15}$)"); -static auto const s_badVkRegex = regex(R"(^\d\d\d.+$)"); -static auto const s_goodVkRegex = regex(R"(^[A-Za-z0-9_.]{5,32}$)"); -static auto const s_lineRegex = regex(R"(^[a-z0-9-_.]{4,20}$)"); -static auto const s_fediverseRegex = regex(R"(^@?[a-zA-Z0-9_]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$)"); -static auto const s_blueskyRegex = regex(R"(^@?[A-Za-z0-9-]+(?:\.[A-Za-z0-9-]+)+$)"); +UErrorCode status = U_ZERO_ERROR; +icu::RegexPattern* s_instaRegex = icu::RegexPattern::compile(R"(^@?[A-Za-z0-9_][A-Za-z0-9_.]{0,28}[A-Za-z0-9_]$)", 0, status); +icu::RegexPattern* s_twitterRegex = icu::RegexPattern::compile(R"(^@?[A-Za-z0-9_]{1,15}$)", 0, status); +icu::RegexPattern* 
s_badVkRegex = icu::RegexPattern::compile(R"(^\d\d\d.+$)", 0, status); +icu::RegexPattern* s_goodVkRegex = icu::RegexPattern::compile(R"(^[A-Za-z0-9_.]{5,32}$)", 0, status); +icu::RegexPattern* s_lineRegex = icu::RegexPattern::compile(R"(^[a-z0-9-_.]{4,20}$)", 0, status); +icu::RegexPattern* s_fediverseRegex = icu::RegexPattern::compile(R"(^@?[a-zA-Z0-9_]+@[\p{L}\p{N}-]+\.[\p{L}\p{N}.-]+$)", 0, status); +icu::RegexPattern* s_blueskyRegex = icu::RegexPattern::compile(R"(^@?[\p{L}\p{N}-]+(\.[\p{L}\p{N}-]+)+$)", 0, status); constexpr string_view kFacebook{"contact:facebook"}; constexpr string_view kInstagram{"contact:instagram"}; @@ -59,6 +60,14 @@ constexpr string_view kUrlPanoramax{"https://api.panoramax.xyz/?pic="}; constexpr string_view kHttp{"http://"}; constexpr string_view kHttps{"https://"}; +bool icuRegexMatches(const std::string& inputStr, icu::RegexPattern* pattern) { + + UErrorCode status = U_ZERO_ERROR; + icu::UnicodeString input(inputStr.c_str(), "UTF-8"); + std::unique_ptr<icu::RegexMatcher> matcher(pattern->matcher(input, status)); + return U_SUCCESS(status) && matcher->matches(status); +} + size_t GetProtocolNameLength(string const & website) { if (website.starts_with(kHttps)) @@ -160,7 +169,7 @@ string ValidateAndFormat_instagram(string const & instagramPage) return {}; // Check that instagramPage contains valid username. // Rules are defined here: https://blog.jstassen.com/2016/03/code-regex-for-instagram-username-and-hashtags/ - if (regex_match(instagramPage, s_instaRegex)) + if (icuRegexMatches(instagramPage, s_instaRegex)) { if (instagramPage.front() == '@') return instagramPage.substr(1); @@ -189,7 +198,7 @@ string ValidateAndFormat_twitter(string const & twitterPage) return {}; // Check that twitterPage contains valid username. 
// Rules took here: https://stackoverflow.com/q/11361044 - if (regex_match(twitterPage, s_twitterRegex)) + if (icuRegexMatches(twitterPage, s_twitterRegex)) { if (twitterPage.front() == '@') return twitterPage.substr(1); @@ -231,9 +240,9 @@ string ValidateAndFormat_vk(string const & vkPage) if (vkPageClean.front() == '@') vkPageClean = vkPageClean.substr(1); - if ((vkPageClean.front() == '_' && vkPageClean.back() == '_') || regex_match(vkPageClean, s_badVkRegex)) + if ((vkPageClean.front() == '_' && vkPageClean.back() == '_') || icuRegexMatches(vkPageClean, s_badVkRegex)) return {}; - if (regex_match(vkPageClean, s_goodVkRegex)) + if (icuRegexMatches(vkPageClean, s_goodVkRegex)) return vkPageClean; } if (!ValidateWebsite(vkPage)) @@ -279,7 +288,7 @@ string ValidateAndFormat_contactLine(string const & linePage) string linePageClean = stripAtSymbol(linePage); - if (regex_match(linePageClean, s_lineRegex)) + if (icuRegexMatches(linePageClean, s_lineRegex)) return linePageClean; } @@ -339,7 +348,7 @@ string ValidateAndFormat_fediverse(string const & fediPage) return {}; // Parse {@?}{username}@{domain.name} format - if (regex_match(fediPage, s_fediverseRegex)) + if (icuRegexMatches(fediPage, s_fediverseRegex)) return stripAtSymbol(fediPage); // If it doesn't match the above format, it can only be an URL format. 
@@ -368,7 +377,7 @@ string ValidateAndFormat_fediverse(string const & fediPage) // Then construct the final username@domain.name format path.append("@").append(parsedDomain); // and make sure it's valid - if (regex_match(path, s_fediverseRegex)) + if (icuRegexMatches(path, s_fediverseRegex)) return path; else return {}; @@ -380,7 +389,7 @@ string ValidateAndFormat_bluesky(string const & bskyPage) return {}; // Try matching {@?}{user/domain.name} format to avoid doing the other stuff - if (regex_match(bskyPage, s_blueskyRegex)) + if (icuRegexMatches(bskyPage, s_blueskyRegex)) return stripAtSymbol(bskyPage); // If not, it must match the URL format @@ -398,7 +407,7 @@ string ValidateAndFormat_bluesky(string const & bskyPage) path.erase(path.find_last_not_of('/') + 1); // Strip last '/' symbol if exists // Then make sure it matches {@?}{user/domain.name} - if (regex_match(path, s_blueskyRegex)) + if (icuRegexMatches(path, s_blueskyRegex)) return stripAtSymbol(path); } } @@ -458,7 +467,7 @@ bool ValidateInstagramPage(string const & page) return true; // Rules are defined here: https://blog.jstassen.com/2016/03/code-regex-for-instagram-username-and-hashtags/ - if (regex_match(page, s_instaRegex)) + if (icuRegexMatches(page, s_instaRegex)) return true; if (!ValidateWebsite(page)) @@ -474,7 +483,7 @@ bool ValidateTwitterPage(string const & page) return true; if (!ValidateWebsite(page)) - return regex_match(page, s_twitterRegex); // Rules are defined here: https://stackoverflow.com/q/11361044 + return icuRegexMatches(page, s_twitterRegex); // Rules are defined here: https://stackoverflow.com/q/11361044 string const domain = strings::MakeLowerCase(url::Url::FromString(page).GetHost()); return domain == kXCom || domain.ends_with(kDotXCom) || domain == kTwitterCom || domain.ends_with(kDotTwitterCom); @@ -500,9 +509,9 @@ bool ValidateVkPage(string const & page) if (vkLogin.front() == '@') vkLogin = vkLogin.substr(1); - if ((vkLogin.front() == '_' && vkLogin.back() == '_') || 
regex_match(vkLogin, s_badVkRegex)) + if ((vkLogin.front() == '_' && vkLogin.back() == '_') || icuRegexMatches(vkLogin, s_badVkRegex)) return false; - if (regex_match(vkLogin, s_goodVkRegex)) + if (icuRegexMatches(vkLogin, s_goodVkRegex)) return true; } @@ -525,7 +534,7 @@ bool ValidateLinePage(string const & page) // The page name must be between 4 and 20 characters. Should contain alphanumeric characters // and symbols '.', '-', and '_' - if (regex_match(stripAtSymbol(page), s_lineRegex)) + if (icuRegexMatches(stripAtSymbol(page), s_lineRegex)) return true; } @@ -543,7 +552,7 @@ bool ValidateFediversePage(string const & page) return true; // Match @username@instance.name format - if (regex_match(page, s_fediverseRegex)) + if (icuRegexMatches(page, s_fediverseRegex)) return true; // If it doesn't match the above format, it can only be an URL format. @@ -572,7 +581,7 @@ bool ValidateFediversePage(string const & page) // Then construct the username@domain.name format path.append("@").append(domain); // And return if it's valid or not - return regex_match(path, s_fediverseRegex); + return icuRegexMatches(path, s_fediverseRegex); } bool ValidateBlueskyPage(string const & page) @@ -582,7 +591,7 @@ bool ValidateBlueskyPage(string const & page) return true; // Match {@?}{user/domain.name} format - if (regex_match(page, s_blueskyRegex)) + if (icuRegexMatches(page, s_blueskyRegex)) return true; // Has to be an url format now @@ -600,7 +609,7 @@ bool ValidateBlueskyPage(string const & page) path.erase(0, 8); // Strip "profile/" part path.erase(path.find_last_not_of('/') + 1); // Strip last '/' symbol if exists // Then try to parse the remaining text as a username again - if (regex_match(path, s_blueskyRegex)) + if (icuRegexMatches(path, s_blueskyRegex)) return true; }