diff --git a/libs/search/house_numbers_matcher.cpp b/libs/search/house_numbers_matcher.cpp index 1c6ad8e6a..f0676a903 100644 --- a/libs/search/house_numbers_matcher.cpp +++ b/libs/search/house_numbers_matcher.cpp @@ -76,7 +76,7 @@ char const * g_strings[] = { // ./clusterize-tag-values.lisp house-number path-to-taginfo-db.db > numbers.txt // tail -n +2 numbers.txt | head -78 | sed 's/^.*) \(.*\) \[.*$/"\1"/g;s/[ -/]//g;s/$/,/' | // sort | uniq -vector const g_patterns = {"BL", "BLN", "BLNSL", "BN", "BNL", "BNSL", "L", "LL", "LN", "LNL", "LNLN", "LNN", +array constexpr g_patterns = {"BL", "BLN", "BLNSL", "BN", "BNL", "BNSL", "L", "LL", "LN", "LNL", "LNLN", "LNN", "N", "NBL", "NBLN", "NBN", "NBNBN", "NBNL", "NL", "NLBN", "NLL", "NLLN", "NLN", "NLNL", "NLS", "NLSN", "NN", "NNBN", "NNL", "NNLN", "NNN", "NNS", "NS", "NSN", "NSS", "S", "SL", "SLL", "SLN", "SN", "SNBNSS", "SNL", "SNN", "SS", "SSN", "SSS", "SSSS", @@ -85,13 +85,14 @@ vector const g_patterns = {"BL", "BLN", "BLNSL", "BN", "BNL", "BNSL", "L "NNBNL"}; // List of patterns which look like house numbers more than other patterns. Constructed by hand. -vector const g_patternsStrict = {"N", "NBN", "NBL", "NL"}; +array constexpr g_patternsStrict = {"N", "NBN", "NBL", "NL"}; // List of common synonyms for building parts. Constructed by hand. char const * g_buildingPartSynonyms[] = {"building", "bldg", "bld", "bl", "unit", "block", "blk", "корпус", "корп", "кор", "литер", "лит", "строение", "стр", "блок", "бл"}; // List of common stop words for buildings. Constructed by hand. +// TODO: add more stop words? UniString const g_stopWords[] = {MakeUniString("дом"), MakeUniString("house"), MakeUniString("д")}; bool IsStopWord(UniString const & s, bool isPrefix) @@ -167,7 +168,8 @@ class HouseNumberClassifier public: using Patterns = StringSet; - HouseNumberClassifier(vector const & patterns = g_patterns) + template + HouseNumberClassifier(array const & patterns) { for (auto const & p : patterns) m_patterns.Add(make_transform_iterator(p.begin(), &CharToType), make_transform_iterator(p.end(), &CharToType)); @@ -590,7 +592,7 @@ bool HouseNumbersMatchRange(std::string_view const & hnRange, TokensT const & qu bool LooksLikeHouseNumber(UniString const & s, bool isPrefix) { - static HouseNumberClassifier const classifier; + static HouseNumberClassifier const classifier(g_patterns); return classifier.LooksGood(s, isPrefix); } diff --git a/libs/search/locality_scorer.cpp b/libs/search/locality_scorer.cpp index 48fe41a7a..bccf1f07f 100644 --- a/libs/search/locality_scorer.cpp +++ b/libs/search/locality_scorer.cpp @@ -298,8 +298,7 @@ void LocalityScorer::GetDocVecs(uint32_t localityId, vector & dvs) const DocVec::Builder builder; ForEachNormalizedToken(name, [&](strings::UniString const & token) { - if (!IsStopWord(token)) - builder.Add(token); + builder.Add(token); }); dvs.emplace_back(std::move(builder)); } diff --git a/libs/search/processor.cpp b/libs/search/processor.cpp index f9b4c3bb4..77b44e523 100644 --- a/libs/search/processor.cpp +++ b/libs/search/processor.cpp @@ -69,21 +69,6 @@ m2::RectD GetRectAroundPosition(m2::PointD const & position) return mercator::RectByCenterXYAndSizeInMeters(position, kMaxPositionRadiusM); } -// Removes all full-token stop words from |tokens|. -// Does nothing if all tokens are non-prefix stop words. -void RemoveStopWordsIfNeeded(QueryTokens & tokens, strings::UniString & prefix) -{ - size_t numStopWords = 0; - for (auto const & token : tokens) - if (IsStopWord(token)) - ++numStopWords; - - if (numStopWords == tokens.size() && prefix.empty()) - return; - - tokens.erase_if(&IsStopWord); -} - void TrimLeadingSpaces(string & s) { while (!s.empty() && strings::IsASCIISpace(s.front())) @@ -277,9 +262,6 @@ void Processor::SetQuery(string const & query, bool categorialRequest /* = false } } - // Remove stopwords *after* FillCategories call (it makes exact tokens match). - RemoveStopWordsIfNeeded(m_query.m_tokens, m_query.m_prefix); - if (!m_isCategorialRequest) { // Assign tokens and prefix to scorer. diff --git a/libs/search/query_params.cpp b/libs/search/query_params.cpp index 7962926db..c998d323b 100644 --- a/libs/search/query_params.cpp +++ b/libs/search/query_params.cpp @@ -1482,13 +1482,7 @@ unordered_map> const kSynonyms = { // QueryParams::Token ------------------------------------------------------------------------------ void QueryParams::Token::AddSynonym(string const & s) { - AddSynonym(strings::MakeUniString(s)); -} - -void QueryParams::Token::AddSynonym(String const & s) -{ - if (!IsStopWord(s)) - m_synonyms.push_back(s); + m_synonyms.push_back(strings::MakeUniString(s)); } string DebugPrint(QueryParams::Token const & token) @@ -1510,10 +1504,11 @@ void QueryParams::ClearStreetIndices() AdditionalCommonTokens() { char const * arr[] = { - "the", // English - "der", "zum", "und", "auf", // German - "del", "les", // Spanish - "в", "на" // Cyrillic + "a", "and", "s", "the", // English + "am", "an", "auf", "der", "im", "und", "zum", // German + "as", "d", "da", "de", "del", "di", "do", "du", "e", "el", + "et", "la", "las", "le", "les", "los", "o", "os", "y", // French, Spanish, Italian + "в", "и", "на", "я" // Cyrillic }; for (char const * s : arr) m_strings.insert(NormalizeAndSimplifyString(s)); diff --git a/libs/search/query_params.hpp b/libs/search/query_params.hpp index bb7adc3bf..85d1a9519 100644 --- a/libs/search/query_params.hpp +++ b/libs/search/query_params.hpp @@ -28,7 +28,6 @@ public: Token(String const & original) : m_original(original) {} void AddSynonym(std::string const & s); - void AddSynonym(String const & s); template void ForEachSynonym(Fn && fn) const diff --git a/libs/search/ranking_utils.cpp b/libs/search/ranking_utils.cpp index 22f778196..3f8810fb9 100644 --- a/libs/search/ranking_utils.cpp +++ b/libs/search/ranking_utils.cpp @@ -94,42 +94,9 @@ ErrorsMade GetPrefixErrorsMade(QueryParams::Token const & token, strings::UniStr } } // namespace impl -bool IsStopWord(UniString const & s) -{ - /// @todo Get all common used stop words and take out this array into search_string_utils.cpp module for example. - /// Should skip this tokens when building search index? - class StopWordsChecker - { - set m_set; - - public: - StopWordsChecker() - { - // Don't want to put _full_ stopwords list, not to break current ranking. - // Only 2-letters and the most common. - char const * arr[] = { - "a", "s", "the", // English - "am", "im", "an", // German - "d", "da", "de", "di", "du", "la", "le", // French, Spanish, Italian - "и", "я" // Cyrillic - }; - for (char const * s : arr) - m_set.insert(MakeUniString(s)); - } - bool Has(UniString const & s) const { return m_set.count(s) > 0; } - }; - - static StopWordsChecker const swChecker; - return swChecker.Has(s); -} - TokensVector::TokensVector(string_view name) { - ForEachNormalizedToken(name, [this](strings::UniString && token) - { - if (!IsStopWord(token)) - m_tokens.push_back(std::move(token)); - }); + m_tokens = NormalizeAndTokenizeString(std::move(name)); Init(); } diff --git a/libs/search/ranking_utils.hpp b/libs/search/ranking_utils.hpp index 51d81e027..7826421ac 100644 --- a/libs/search/ranking_utils.hpp +++ b/libs/search/ranking_utils.hpp @@ -180,9 +180,6 @@ struct NameScores std::string DebugPrint(NameScore const & score); std::string DebugPrint(NameScores const & scores); -// Returns true when |s| is a stop-word and may be removed from a query. -bool IsStopWord(strings::UniString const & s); - class TokensVector { std::vector m_tokens; diff --git a/libs/search/search_tests/locality_scorer_test.cpp b/libs/search/search_tests/locality_scorer_test.cpp index 85d28e5ef..76085ac7e 100644 --- a/libs/search/search_tests/locality_scorer_test.cpp +++ b/libs/search/search_tests/locality_scorer_test.cpp @@ -44,11 +44,7 @@ public: m_scorer.SetPivotForTesting(pivot); vector tokens; - search::ForEachNormalizedToken(query, [&tokens](strings::UniString && token) - { - if (!IsStopWord(token)) - tokens.push_back(std::move(token)); - }); + tokens = NormalizeAndTokenizeString(query); m_params.Init(query, tokens, lastTokenIsPrefix); }