mirror of
https://codeberg.org/comaps/comaps
synced 2025-12-19 13:03:36 +00:00
Compare commits
1 Commits
6a20269819
...
hb0nd-url-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
34e9b17c33 |
4
.gitignore
vendored
4
.gitignore
vendored
@@ -27,6 +27,10 @@ data/edits.xml
|
||||
data/World.mwm
|
||||
data/WorldCoasts.mwm
|
||||
|
||||
# generated when running tests. NOT the lowercase 'testing' which should be kept.
|
||||
Testing
|
||||
!testing
|
||||
|
||||
# Compiled Python
|
||||
*.pyc
|
||||
|
||||
|
||||
@@ -80,6 +80,7 @@ UNIT_TEST(Url_Encode)
|
||||
TEST_EQUAL(UrlEncode("%% "), "%25%25%20", ());
|
||||
TEST_EQUAL(UrlEncode("20"), "20", ());
|
||||
TEST_EQUAL(UrlEncode("Guinea-Bissau"), "Guinea-Bissau", ());
|
||||
TEST_EQUAL(UrlEncode("ümlaut"), "%C3%BCmlaut", ());
|
||||
TEST_EQUAL(UrlEncode(orig1), enc1, ());
|
||||
TEST_EQUAL(UrlEncode(orig2), enc2, ());
|
||||
TEST_EQUAL(UrlEncode(orig3), enc3, ());
|
||||
@@ -98,6 +99,8 @@ UNIT_TEST(Url_Decode)
|
||||
TEST_EQUAL(UrlDecode(enc3), orig3, ());
|
||||
TEST_EQUAL(UrlDecode(enc4), orig4, ());
|
||||
TEST_EQUAL(UrlDecode("123+Main+St,+Seattle,+WA+98101"), "123 Main St, Seattle, WA 98101", ());
|
||||
TEST_EQUAL(UrlDecode("%C3%BCmlaut"), "ümlaut", ());
|
||||
|
||||
}
|
||||
|
||||
UNIT_TEST(Url_Invalid)
|
||||
@@ -127,6 +130,11 @@ UNIT_TEST(Url_Valid)
|
||||
.Host("www.sandwichparlour.com.au")
|
||||
.Path("");
|
||||
|
||||
TestUrl("https://www.ümlaut.org.de/")
|
||||
.Scheme("https")
|
||||
.Host("www.ümlaut.org.de")
|
||||
.Path("");
|
||||
|
||||
TestUrl("cm:/&test").Scheme("cm").Host("&test").Path("");
|
||||
}
|
||||
|
||||
|
||||
@@ -115,21 +115,19 @@ string Join(string const & lhs, string const & rhs)
|
||||
|
||||
string UrlEncode(string const & rawUrl)
|
||||
{
|
||||
size_t const count = rawUrl.size();
|
||||
string result;
|
||||
result.reserve(count);
|
||||
result.reserve(rawUrl.size());
|
||||
|
||||
for (size_t i = 0; i < count; ++i)
|
||||
for (unsigned char c : rawUrl)
|
||||
{
|
||||
char const c = rawUrl[i];
|
||||
if (c < '-' || c == '/' || (c > '9' && c < 'A') || (c > 'Z' && c < '_') ||
|
||||
c == '`' || (c > 'z' && c < '~') || c > '~')
|
||||
// Allowed URI chars: https://www.ietf.org/rfc/rfc3986.txt
|
||||
if (isalnum(c) || c == '-' || c == '.' || c == '_' || c == '~')
|
||||
result += c;
|
||||
else
|
||||
{
|
||||
result += '%';
|
||||
result += NumToHex(c);
|
||||
}
|
||||
else
|
||||
result += rawUrl[i];
|
||||
}
|
||||
|
||||
return result;
|
||||
|
||||
@@ -119,11 +119,17 @@ UNIT_TEST(EditableMapObject_ValidateAndFormat_fediverse)
|
||||
TEST_EQUAL(osm::ValidateAndFormat_fediverse("@comaps@floss.social.uk"), "comaps@floss.social.uk", ());
|
||||
TEST_EQUAL(osm::ValidateAndFormat_fediverse("pub.mastodon.org.uk/@comaps"), "comaps@pub.mastodon.org.uk", ());
|
||||
TEST_EQUAL(osm::ValidateAndFormat_fediverse("pub.mastodon.org.uk/users/@comaps"), "comaps@pub.mastodon.org.uk", ());
|
||||
TEST_EQUAL(osm::ValidateAndFormat_fediverse("https://bawü.social/@mannheim"), "mannheim@bawü.social", ());
|
||||
TEST_EQUAL(osm::ValidateAndFormat_fediverse("@mannheim@bawü.social"), "mannheim@bawü.social", ());
|
||||
|
||||
|
||||
TEST_EQUAL(osm::ValidateAndFormat_fediverse("comaps@fosstodon@mastodon.org"), "", ());
|
||||
TEST_EQUAL(osm::ValidateAndFormat_fediverse("co$maps@mastodon.social"), "", ());
|
||||
TEST_EQUAL(osm::ValidateAndFormat_fediverse("pub.mastodon.org.uk/comaps"), "", ());
|
||||
TEST_EQUAL(osm::ValidateAndFormat_fediverse("pub.mastodon.org.uk/users/"), "", ());
|
||||
TEST_EQUAL(osm::ValidateAndFormat_fediverse("@comaps.org"), "", ());
|
||||
TEST_EQUAL(osm::ValidateAndFormat_fediverse("@comaps@.org"), "", ());
|
||||
TEST_EQUAL(osm::ValidateAndFormat_fediverse("@comaps"), "", ());
|
||||
}
|
||||
|
||||
UNIT_TEST(EditableMapObject_ValidateAndFormat_bluesky)
|
||||
@@ -306,11 +312,16 @@ UNIT_TEST(EditableMapObject_ValidateFediversePage)
|
||||
TEST(osm::ValidateFediversePage("@comaps@floss.social.uk"), ());
|
||||
TEST(osm::ValidateFediversePage("pub.mastodon.org.uk/@comaps"), ());
|
||||
TEST(osm::ValidateFediversePage("pub.mastodon.org.uk/users/@comaps"), ());
|
||||
TEST(osm::ValidateFediversePage("https://bawü.social/@mannheim"), ());
|
||||
TEST(osm::ValidateFediversePage("@mannheim@bawü.social"), ());
|
||||
|
||||
TEST(!osm::ValidateFediversePage("comaps@floss@mastodon.org"), ());
|
||||
TEST(!osm::ValidateFediversePage("orga$nicmaps@mastodon.social"), ());
|
||||
TEST(!osm::ValidateFediversePage("co$maps@mastodon.social"), ());
|
||||
TEST(!osm::ValidateFediversePage("pub.mastodon.org.uk/comaps"), ());
|
||||
TEST(!osm::ValidateFediversePage("pub.mastodon.org.uk/users/"), ());
|
||||
TEST(!osm::ValidateFediversePage("@comaps.org"), ());
|
||||
TEST(!osm::ValidateFediversePage("@comaps@.org"), ());
|
||||
TEST(!osm::ValidateFediversePage("@comaps"), ());
|
||||
}
|
||||
|
||||
UNIT_TEST(EditableMapObject_ValidateBlueskyPage)
|
||||
|
||||
@@ -3,21 +3,22 @@
|
||||
#include "coding/url.hpp"
|
||||
|
||||
#include "base/string_utils.hpp"
|
||||
#include <unicode/regex.h>
|
||||
|
||||
#include <cstring> // strlen
|
||||
#include <regex>
|
||||
|
||||
namespace osm
|
||||
{
|
||||
using namespace std;
|
||||
|
||||
static auto const s_instaRegex = regex(R"(^@?[A-Za-z0-9_][A-Za-z0-9_.]{0,28}[A-Za-z0-9_]$)");
|
||||
static auto const s_twitterRegex = regex(R"(^@?[A-Za-z0-9_]{1,15}$)");
|
||||
static auto const s_badVkRegex = regex(R"(^\d\d\d.+$)");
|
||||
static auto const s_goodVkRegex = regex(R"(^[A-Za-z0-9_.]{5,32}$)");
|
||||
static auto const s_lineRegex = regex(R"(^[a-z0-9-_.]{4,20}$)");
|
||||
static auto const s_fediverseRegex = regex(R"(^@?[a-zA-Z0-9_]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$)");
|
||||
static auto const s_blueskyRegex = regex(R"(^@?[A-Za-z0-9-]+(?:\.[A-Za-z0-9-]+)+$)");
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
icu::RegexPattern* s_instaRegex = icu::RegexPattern::compile(R"(^@?[A-Za-z0-9_][A-Za-z0-9_.]{0,28}[A-Za-z0-9_]$)", 0, status);
|
||||
icu::RegexPattern* s_twitterRegex = icu::RegexPattern::compile(R"(^@?[A-Za-z0-9_]{1,15}$)", 0, status);
|
||||
icu::RegexPattern* s_badVkRegex = icu::RegexPattern::compile(R"(^\d\d\d.+$)", 0, status);
|
||||
icu::RegexPattern* s_goodVkRegex = icu::RegexPattern::compile(R"(^[A-Za-z0-9_.]{5,32}$)", 0, status);
|
||||
icu::RegexPattern* s_lineRegex = icu::RegexPattern::compile(R"(^[a-z0-9-_.]{4,20}$)", 0, status);
|
||||
icu::RegexPattern* s_fediverseRegex = icu::RegexPattern::compile(R"(^@?[a-zA-Z0-9_]+@[\p{L}\p{N}-]+\.[\p{L}\p{N}.-]+$)", 0, status);
|
||||
icu::RegexPattern* s_blueskyRegex = icu::RegexPattern::compile(R"(^@?[\p{L}\p{N}-]+(\.[\p{L}\p{N}-]+)+$)", 0, status);
|
||||
|
||||
constexpr string_view kFacebook{"contact:facebook"};
|
||||
constexpr string_view kInstagram{"contact:instagram"};
|
||||
@@ -59,6 +60,14 @@ constexpr string_view kUrlPanoramax{"https://api.panoramax.xyz/?pic="};
|
||||
constexpr string_view kHttp{"http://"};
|
||||
constexpr string_view kHttps{"https://"};
|
||||
|
||||
bool icuRegexMatches(const std::string& inputStr, icu::RegexPattern* pattern) {
|
||||
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
icu::UnicodeString input(inputStr.c_str(), "UTF-8");
|
||||
std::unique_ptr<icu::RegexMatcher> matcher(pattern->matcher(input, status));
|
||||
return U_SUCCESS(status) && matcher->matches(status);
|
||||
}
|
||||
|
||||
size_t GetProtocolNameLength(string const & website)
|
||||
{
|
||||
if (website.starts_with(kHttps))
|
||||
@@ -160,7 +169,7 @@ string ValidateAndFormat_instagram(string const & instagramPage)
|
||||
return {};
|
||||
// Check that instagramPage contains valid username.
|
||||
// Rules are defined here: https://blog.jstassen.com/2016/03/code-regex-for-instagram-username-and-hashtags/
|
||||
if (regex_match(instagramPage, s_instaRegex))
|
||||
if (icuRegexMatches(instagramPage, s_instaRegex))
|
||||
{
|
||||
if (instagramPage.front() == '@')
|
||||
return instagramPage.substr(1);
|
||||
@@ -189,7 +198,7 @@ string ValidateAndFormat_twitter(string const & twitterPage)
|
||||
return {};
|
||||
// Check that twitterPage contains valid username.
|
||||
// Rules took here: https://stackoverflow.com/q/11361044
|
||||
if (regex_match(twitterPage, s_twitterRegex))
|
||||
if (icuRegexMatches(twitterPage, s_twitterRegex))
|
||||
{
|
||||
if (twitterPage.front() == '@')
|
||||
return twitterPage.substr(1);
|
||||
@@ -231,9 +240,9 @@ string ValidateAndFormat_vk(string const & vkPage)
|
||||
if (vkPageClean.front() == '@')
|
||||
vkPageClean = vkPageClean.substr(1);
|
||||
|
||||
if ((vkPageClean.front() == '_' && vkPageClean.back() == '_') || regex_match(vkPageClean, s_badVkRegex))
|
||||
if ((vkPageClean.front() == '_' && vkPageClean.back() == '_') || icuRegexMatches(vkPageClean, s_badVkRegex))
|
||||
return {};
|
||||
if (regex_match(vkPageClean, s_goodVkRegex))
|
||||
if (icuRegexMatches(vkPageClean, s_goodVkRegex))
|
||||
return vkPageClean;
|
||||
}
|
||||
if (!ValidateWebsite(vkPage))
|
||||
@@ -279,7 +288,7 @@ string ValidateAndFormat_contactLine(string const & linePage)
|
||||
|
||||
string linePageClean = stripAtSymbol(linePage);
|
||||
|
||||
if (regex_match(linePageClean, s_lineRegex))
|
||||
if (icuRegexMatches(linePageClean, s_lineRegex))
|
||||
return linePageClean;
|
||||
}
|
||||
|
||||
@@ -339,7 +348,7 @@ string ValidateAndFormat_fediverse(string const & fediPage)
|
||||
return {};
|
||||
|
||||
// Parse {@?}{username}@{domain.name} format
|
||||
if (regex_match(fediPage, s_fediverseRegex))
|
||||
if (icuRegexMatches(fediPage, s_fediverseRegex))
|
||||
return stripAtSymbol(fediPage);
|
||||
|
||||
// If it doesn't match the above format, it can only be an URL format.
|
||||
@@ -368,7 +377,7 @@ string ValidateAndFormat_fediverse(string const & fediPage)
|
||||
// Then construct the final username@domain.name format
|
||||
path.append("@").append(parsedDomain);
|
||||
// and make sure it's valid
|
||||
if (regex_match(path, s_fediverseRegex))
|
||||
if (icuRegexMatches(path, s_fediverseRegex))
|
||||
return path;
|
||||
else
|
||||
return {};
|
||||
@@ -380,7 +389,7 @@ string ValidateAndFormat_bluesky(string const & bskyPage)
|
||||
return {};
|
||||
|
||||
// Try matching {@?}{user/domain.name} format to avoid doing the other stuff
|
||||
if (regex_match(bskyPage, s_blueskyRegex))
|
||||
if (icuRegexMatches(bskyPage, s_blueskyRegex))
|
||||
return stripAtSymbol(bskyPage);
|
||||
|
||||
// If not, it must match the URL format
|
||||
@@ -398,7 +407,7 @@ string ValidateAndFormat_bluesky(string const & bskyPage)
|
||||
path.erase(path.find_last_not_of('/') + 1); // Strip last '/' symbol if exists
|
||||
|
||||
// Then make sure it matches {@?}{user/domain.name}
|
||||
if (regex_match(path, s_blueskyRegex))
|
||||
if (icuRegexMatches(path, s_blueskyRegex))
|
||||
return stripAtSymbol(path);
|
||||
}
|
||||
}
|
||||
@@ -458,7 +467,7 @@ bool ValidateInstagramPage(string const & page)
|
||||
return true;
|
||||
|
||||
// Rules are defined here: https://blog.jstassen.com/2016/03/code-regex-for-instagram-username-and-hashtags/
|
||||
if (regex_match(page, s_instaRegex))
|
||||
if (icuRegexMatches(page, s_instaRegex))
|
||||
return true;
|
||||
|
||||
if (!ValidateWebsite(page))
|
||||
@@ -474,7 +483,7 @@ bool ValidateTwitterPage(string const & page)
|
||||
return true;
|
||||
|
||||
if (!ValidateWebsite(page))
|
||||
return regex_match(page, s_twitterRegex); // Rules are defined here: https://stackoverflow.com/q/11361044
|
||||
return icuRegexMatches(page, s_twitterRegex); // Rules are defined here: https://stackoverflow.com/q/11361044
|
||||
|
||||
string const domain = strings::MakeLowerCase(url::Url::FromString(page).GetHost());
|
||||
return domain == kXCom || domain.ends_with(kDotXCom) || domain == kTwitterCom || domain.ends_with(kDotTwitterCom);
|
||||
@@ -500,9 +509,9 @@ bool ValidateVkPage(string const & page)
|
||||
if (vkLogin.front() == '@')
|
||||
vkLogin = vkLogin.substr(1);
|
||||
|
||||
if ((vkLogin.front() == '_' && vkLogin.back() == '_') || regex_match(vkLogin, s_badVkRegex))
|
||||
if ((vkLogin.front() == '_' && vkLogin.back() == '_') || icuRegexMatches(vkLogin, s_badVkRegex))
|
||||
return false;
|
||||
if (regex_match(vkLogin, s_goodVkRegex))
|
||||
if (icuRegexMatches(vkLogin, s_goodVkRegex))
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -525,7 +534,7 @@ bool ValidateLinePage(string const & page)
|
||||
// The page name must be between 4 and 20 characters. Should contain alphanumeric characters
|
||||
// and symbols '.', '-', and '_'
|
||||
|
||||
if (regex_match(stripAtSymbol(page), s_lineRegex))
|
||||
if (icuRegexMatches(stripAtSymbol(page), s_lineRegex))
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -543,7 +552,7 @@ bool ValidateFediversePage(string const & page)
|
||||
return true;
|
||||
|
||||
// Match @username@instance.name format
|
||||
if (regex_match(page, s_fediverseRegex))
|
||||
if (icuRegexMatches(page, s_fediverseRegex))
|
||||
return true;
|
||||
|
||||
// If it doesn't match the above format, it can only be an URL format.
|
||||
@@ -572,7 +581,7 @@ bool ValidateFediversePage(string const & page)
|
||||
// Then construct the username@domain.name format
|
||||
path.append("@").append(domain);
|
||||
// And return if it's valid or not
|
||||
return regex_match(path, s_fediverseRegex);
|
||||
return icuRegexMatches(path, s_fediverseRegex);
|
||||
}
|
||||
|
||||
bool ValidateBlueskyPage(string const & page)
|
||||
@@ -582,7 +591,7 @@ bool ValidateBlueskyPage(string const & page)
|
||||
return true;
|
||||
|
||||
// Match {@?}{user/domain.name} format
|
||||
if (regex_match(page, s_blueskyRegex))
|
||||
if (icuRegexMatches(page, s_blueskyRegex))
|
||||
return true;
|
||||
|
||||
// Has to be an url format now
|
||||
@@ -600,7 +609,7 @@ bool ValidateBlueskyPage(string const & page)
|
||||
path.erase(0, 8); // Strip "profile/" part
|
||||
path.erase(path.find_last_not_of('/') + 1); // Strip last '/' symbol if exists
|
||||
// Then try to parse the remaining text as a username again
|
||||
if (regex_match(path, s_blueskyRegex))
|
||||
if (icuRegexMatches(path, s_blueskyRegex))
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user