mirror of
https://codeberg.org/comaps/comaps
synced 2025-12-19 13:03:36 +00:00
[search] Remove stop words
Signed-off-by: x7z4w <x7z4w@noreply.codeberg.org>
This commit is contained in:
@@ -76,7 +76,7 @@ char const * g_strings[] = {
|
|||||||
// ./clusterize-tag-values.lisp house-number path-to-taginfo-db.db > numbers.txt
|
// ./clusterize-tag-values.lisp house-number path-to-taginfo-db.db > numbers.txt
|
||||||
// tail -n +2 numbers.txt | head -78 | sed 's/^.*) \(.*\) \[.*$/"\1"/g;s/[ -/]//g;s/$/,/' |
|
// tail -n +2 numbers.txt | head -78 | sed 's/^.*) \(.*\) \[.*$/"\1"/g;s/[ -/]//g;s/$/,/' |
|
||||||
// sort | uniq
|
// sort | uniq
|
||||||
vector<string> const g_patterns = {"BL", "BLN", "BLNSL", "BN", "BNL", "BNSL", "L", "LL", "LN", "LNL", "LNLN", "LNN",
|
array<string_view, 48> constexpr g_patterns = {"BL", "BLN", "BLNSL", "BN", "BNL", "BNSL", "L", "LL", "LN", "LNL", "LNLN", "LNN",
|
||||||
"N", "NBL", "NBLN", "NBN", "NBNBN", "NBNL", "NL", "NLBN", "NLL", "NLLN", "NLN",
|
"N", "NBL", "NBLN", "NBN", "NBNBN", "NBNL", "NL", "NLBN", "NLL", "NLLN", "NLN",
|
||||||
"NLNL", "NLS", "NLSN", "NN", "NNBN", "NNL", "NNLN", "NNN", "NNS", "NS", "NSN", "NSS",
|
"NLNL", "NLS", "NLSN", "NN", "NNBN", "NNL", "NNLN", "NNN", "NNS", "NS", "NSN", "NSS",
|
||||||
"S", "SL", "SLL", "SLN", "SN", "SNBNSS", "SNL", "SNN", "SS", "SSN", "SSS", "SSSS",
|
"S", "SL", "SLL", "SLN", "SN", "SNBNSS", "SNL", "SNN", "SS", "SSN", "SSS", "SSSS",
|
||||||
@@ -85,13 +85,14 @@ vector<string> const g_patterns = {"BL", "BLN", "BLNSL", "BN", "BNL", "BNSL", "L
|
|||||||
"NNBNL"};
|
"NNBNL"};
|
||||||
|
|
||||||
// List of patterns which look like house numbers more than other patterns. Constructed by hand.
|
// List of patterns which look like house numbers more than other patterns. Constructed by hand.
|
||||||
vector<string> const g_patternsStrict = {"N", "NBN", "NBL", "NL"};
|
array<string_view, 4> constexpr g_patternsStrict = {"N", "NBN", "NBL", "NL"};
|
||||||
|
|
||||||
// List of common synonyms for building parts. Constructed by hand.
|
// List of common synonyms for building parts. Constructed by hand.
|
||||||
char const * g_buildingPartSynonyms[] = {"building", "bldg", "bld", "bl", "unit", "block", "blk", "корпус",
|
char const * g_buildingPartSynonyms[] = {"building", "bldg", "bld", "bl", "unit", "block", "blk", "корпус",
|
||||||
"корп", "кор", "литер", "лит", "строение", "стр", "блок", "бл"};
|
"корп", "кор", "литер", "лит", "строение", "стр", "блок", "бл"};
|
||||||
|
|
||||||
// List of common stop words for buildings. Constructed by hand.
|
// List of common stop words for buildings. Constructed by hand.
|
||||||
|
// TODO: add more stop words?
|
||||||
UniString const g_stopWords[] = {MakeUniString("дом"), MakeUniString("house"), MakeUniString("д")};
|
UniString const g_stopWords[] = {MakeUniString("дом"), MakeUniString("house"), MakeUniString("д")};
|
||||||
|
|
||||||
bool IsStopWord(UniString const & s, bool isPrefix)
|
bool IsStopWord(UniString const & s, bool isPrefix)
|
||||||
@@ -167,7 +168,8 @@ class HouseNumberClassifier
|
|||||||
public:
|
public:
|
||||||
using Patterns = StringSet<Token::Type, 4>;
|
using Patterns = StringSet<Token::Type, 4>;
|
||||||
|
|
||||||
HouseNumberClassifier(vector<string> const & patterns = g_patterns)
|
template <size_t size>
|
||||||
|
HouseNumberClassifier(array<string_view, size> const & patterns)
|
||||||
{
|
{
|
||||||
for (auto const & p : patterns)
|
for (auto const & p : patterns)
|
||||||
m_patterns.Add(make_transform_iterator(p.begin(), &CharToType), make_transform_iterator(p.end(), &CharToType));
|
m_patterns.Add(make_transform_iterator(p.begin(), &CharToType), make_transform_iterator(p.end(), &CharToType));
|
||||||
@@ -590,7 +592,7 @@ bool HouseNumbersMatchRange(std::string_view const & hnRange, TokensT const & qu
|
|||||||
|
|
||||||
bool LooksLikeHouseNumber(UniString const & s, bool isPrefix)
|
bool LooksLikeHouseNumber(UniString const & s, bool isPrefix)
|
||||||
{
|
{
|
||||||
static HouseNumberClassifier const classifier;
|
static HouseNumberClassifier const classifier(g_patterns);
|
||||||
return classifier.LooksGood(s, isPrefix);
|
return classifier.LooksGood(s, isPrefix);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -298,7 +298,6 @@ void LocalityScorer::GetDocVecs(uint32_t localityId, vector<DocVec> & dvs) const
|
|||||||
DocVec::Builder builder;
|
DocVec::Builder builder;
|
||||||
ForEachNormalizedToken(name, [&](strings::UniString const & token)
|
ForEachNormalizedToken(name, [&](strings::UniString const & token)
|
||||||
{
|
{
|
||||||
if (!IsStopWord(token))
|
|
||||||
builder.Add(token);
|
builder.Add(token);
|
||||||
});
|
});
|
||||||
dvs.emplace_back(std::move(builder));
|
dvs.emplace_back(std::move(builder));
|
||||||
|
|||||||
@@ -69,21 +69,6 @@ m2::RectD GetRectAroundPosition(m2::PointD const & position)
|
|||||||
return mercator::RectByCenterXYAndSizeInMeters(position, kMaxPositionRadiusM);
|
return mercator::RectByCenterXYAndSizeInMeters(position, kMaxPositionRadiusM);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Removes all full-token stop words from |tokens|.
|
|
||||||
// Does nothing if all tokens are non-prefix stop words.
|
|
||||||
void RemoveStopWordsIfNeeded(QueryTokens & tokens, strings::UniString & prefix)
|
|
||||||
{
|
|
||||||
size_t numStopWords = 0;
|
|
||||||
for (auto const & token : tokens)
|
|
||||||
if (IsStopWord(token))
|
|
||||||
++numStopWords;
|
|
||||||
|
|
||||||
if (numStopWords == tokens.size() && prefix.empty())
|
|
||||||
return;
|
|
||||||
|
|
||||||
tokens.erase_if(&IsStopWord);
|
|
||||||
}
|
|
||||||
|
|
||||||
void TrimLeadingSpaces(string & s)
|
void TrimLeadingSpaces(string & s)
|
||||||
{
|
{
|
||||||
while (!s.empty() && strings::IsASCIISpace(s.front()))
|
while (!s.empty() && strings::IsASCIISpace(s.front()))
|
||||||
@@ -277,9 +262,6 @@ void Processor::SetQuery(string const & query, bool categorialRequest /* = false
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Remove stopwords *after* FillCategories call (it makes exact tokens match).
|
|
||||||
RemoveStopWordsIfNeeded(m_query.m_tokens, m_query.m_prefix);
|
|
||||||
|
|
||||||
if (!m_isCategorialRequest)
|
if (!m_isCategorialRequest)
|
||||||
{
|
{
|
||||||
// Assign tokens and prefix to scorer.
|
// Assign tokens and prefix to scorer.
|
||||||
|
|||||||
@@ -1482,13 +1482,7 @@ unordered_map<string, vector<string>> const kSynonyms = {
|
|||||||
// QueryParams::Token ------------------------------------------------------------------------------
|
// QueryParams::Token ------------------------------------------------------------------------------
|
||||||
void QueryParams::Token::AddSynonym(string const & s)
|
void QueryParams::Token::AddSynonym(string const & s)
|
||||||
{
|
{
|
||||||
AddSynonym(strings::MakeUniString(s));
|
m_synonyms.push_back(strings::MakeUniString(s));
|
||||||
}
|
|
||||||
|
|
||||||
void QueryParams::Token::AddSynonym(String const & s)
|
|
||||||
{
|
|
||||||
if (!IsStopWord(s))
|
|
||||||
m_synonyms.push_back(s);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
string DebugPrint(QueryParams::Token const & token)
|
string DebugPrint(QueryParams::Token const & token)
|
||||||
@@ -1510,10 +1504,11 @@ void QueryParams::ClearStreetIndices()
|
|||||||
AdditionalCommonTokens()
|
AdditionalCommonTokens()
|
||||||
{
|
{
|
||||||
char const * arr[] = {
|
char const * arr[] = {
|
||||||
"the", // English
|
"a", "and", "s", "the", // English
|
||||||
"der", "zum", "und", "auf", // German
|
"am", "an", "auf", "der", "im", "und", "zum", // German
|
||||||
"del", "les", // Spanish
|
"as", "d", "da", "de", "del", "di", "do", "du", "e", "el",
|
||||||
"в", "на" // Cyrillic
|
"et", "la", "las", "le", "les", "los", "o", "os", "y", // French, Spanish, Italian
|
||||||
|
"в", "и", "на", "я" // Cyrillic
|
||||||
};
|
};
|
||||||
for (char const * s : arr)
|
for (char const * s : arr)
|
||||||
m_strings.insert(NormalizeAndSimplifyString(s));
|
m_strings.insert(NormalizeAndSimplifyString(s));
|
||||||
|
|||||||
@@ -28,7 +28,6 @@ public:
|
|||||||
Token(String const & original) : m_original(original) {}
|
Token(String const & original) : m_original(original) {}
|
||||||
|
|
||||||
void AddSynonym(std::string const & s);
|
void AddSynonym(std::string const & s);
|
||||||
void AddSynonym(String const & s);
|
|
||||||
|
|
||||||
template <typename Fn>
|
template <typename Fn>
|
||||||
void ForEachSynonym(Fn && fn) const
|
void ForEachSynonym(Fn && fn) const
|
||||||
|
|||||||
@@ -94,42 +94,9 @@ ErrorsMade GetPrefixErrorsMade(QueryParams::Token const & token, strings::UniStr
|
|||||||
}
|
}
|
||||||
} // namespace impl
|
} // namespace impl
|
||||||
|
|
||||||
bool IsStopWord(UniString const & s)
|
|
||||||
{
|
|
||||||
/// @todo Get all common used stop words and take out this array into search_string_utils.cpp module for example.
|
|
||||||
/// Should skip this tokens when building search index?
|
|
||||||
class StopWordsChecker
|
|
||||||
{
|
|
||||||
set<UniString> m_set;
|
|
||||||
|
|
||||||
public:
|
|
||||||
StopWordsChecker()
|
|
||||||
{
|
|
||||||
// Don't want to put _full_ stopwords list, not to break current ranking.
|
|
||||||
// Only 2-letters and the most common.
|
|
||||||
char const * arr[] = {
|
|
||||||
"a", "s", "the", // English
|
|
||||||
"am", "im", "an", // German
|
|
||||||
"d", "da", "de", "di", "du", "la", "le", // French, Spanish, Italian
|
|
||||||
"и", "я" // Cyrillic
|
|
||||||
};
|
|
||||||
for (char const * s : arr)
|
|
||||||
m_set.insert(MakeUniString(s));
|
|
||||||
}
|
|
||||||
bool Has(UniString const & s) const { return m_set.count(s) > 0; }
|
|
||||||
};
|
|
||||||
|
|
||||||
static StopWordsChecker const swChecker;
|
|
||||||
return swChecker.Has(s);
|
|
||||||
}
|
|
||||||
|
|
||||||
TokensVector::TokensVector(string_view name)
|
TokensVector::TokensVector(string_view name)
|
||||||
{
|
{
|
||||||
ForEachNormalizedToken(name, [this](strings::UniString && token)
|
m_tokens = NormalizeAndTokenizeString(std::move(name));
|
||||||
{
|
|
||||||
if (!IsStopWord(token))
|
|
||||||
m_tokens.push_back(std::move(token));
|
|
||||||
});
|
|
||||||
|
|
||||||
Init();
|
Init();
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -180,9 +180,6 @@ struct NameScores
|
|||||||
std::string DebugPrint(NameScore const & score);
|
std::string DebugPrint(NameScore const & score);
|
||||||
std::string DebugPrint(NameScores const & scores);
|
std::string DebugPrint(NameScores const & scores);
|
||||||
|
|
||||||
// Returns true when |s| is a stop-word and may be removed from a query.
|
|
||||||
bool IsStopWord(strings::UniString const & s);
|
|
||||||
|
|
||||||
class TokensVector
|
class TokensVector
|
||||||
{
|
{
|
||||||
std::vector<strings::UniString> m_tokens;
|
std::vector<strings::UniString> m_tokens;
|
||||||
|
|||||||
@@ -44,11 +44,7 @@ public:
|
|||||||
m_scorer.SetPivotForTesting(pivot);
|
m_scorer.SetPivotForTesting(pivot);
|
||||||
|
|
||||||
vector<UniString> tokens;
|
vector<UniString> tokens;
|
||||||
search::ForEachNormalizedToken(query, [&tokens](strings::UniString && token)
|
tokens = NormalizeAndTokenizeString(query);
|
||||||
{
|
|
||||||
if (!IsStopWord(token))
|
|
||||||
tokens.push_back(std::move(token));
|
|
||||||
});
|
|
||||||
|
|
||||||
m_params.Init(query, tokens, lastTokenIsPrefix);
|
m_params.Init(query, tokens, lastTokenIsPrefix);
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user