diff --git a/generator/CMakeLists.txt b/generator/CMakeLists.txt index 18419b18f..1563f13e0 100644 --- a/generator/CMakeLists.txt +++ b/generator/CMakeLists.txt @@ -9,8 +9,6 @@ set(SRC affiliation.hpp altitude_generator.cpp altitude_generator.hpp - # Should precede booking_dataset.cpp because of Unity build + template instantiation order. - booking_scoring.cpp booking_dataset.cpp booking_dataset.hpp borders.cpp @@ -126,6 +124,8 @@ set(SRC isolines_generator.hpp isolines_section_builder.cpp isolines_section_builder.hpp + kayak_dataset.cpp + kayak_dataset.hpp maxspeeds_builder.cpp maxspeeds_builder.hpp maxspeeds_collector.cpp diff --git a/generator/booking_dataset.cpp b/generator/booking_dataset.cpp index 847f947aa..5c5684393 100644 --- a/generator/booking_dataset.cpp +++ b/generator/booking_dataset.cpp @@ -1,6 +1,5 @@ #include "generator/booking_dataset.hpp" #include "generator/feature_builder.hpp" -#include "generator/sponsored_scoring.hpp" #include "indexer/classificator.hpp" #include "indexer/ftypes_matcher.hpp" @@ -18,37 +17,42 @@ namespace generator using namespace feature; // BookingHotel ------------------------------------------------------------------------------------ -BookingHotel::BookingHotel(std::string const & src) +BookingHotel::BookingHotel(std::string src) { + /// @todo For fast parsing we can preprocess src (quotes) and return string_view's. std::vector rec; strings::ParseCSVRow(src, '\t', rec); - CHECK_EQUAL(rec.size(), FieldsCount(), ("Error parsing hotels.tsv line:", - boost::replace_all_copy(src, "\t", "\\t"))); - CLOG(LDEBUG, strings::to_uint(rec[FieldIndex(Fields::Id)], m_id.Get()), ()); - // TODO(mgsergio): Use ms::LatLon. 
- CLOG(LDEBUG, strings::to_double(rec[FieldIndex(Fields::Latitude)], m_latLon.m_lat), ()); - CLOG(LDEBUG, strings::to_double(rec[FieldIndex(Fields::Longtitude)], m_latLon.m_lon), ()); + CHECK_EQUAL(rec.size(), Fields::Counter, + ("Error parsing hotels entry:", boost::replace_all_copy(src, "\t", "\\t"))); - m_name = rec[FieldIndex(Fields::Name)]; - m_address = rec[FieldIndex(Fields::Address)]; + // Assign id in the end in case of possible errors. + uint32_t id; + CLOG(LDEBUG, strings::to_uint(rec[Fields::Id], id), ()); + CLOG(LDEBUG, strings::to_double(rec[Fields::Latitude], m_latLon.m_lat), ()); + CLOG(LDEBUG, strings::to_double(rec[Fields::Longitude], m_latLon.m_lon), ()); - CLOG(LDEBUG, strings::to_uint(rec[FieldIndex(Fields::Stars)], m_stars), ()); - CLOG(LDEBUG, strings::to_uint(rec[FieldIndex(Fields::PriceCategory)], m_priceCategory), ()); - CLOG(LDEBUG, strings::to_double(rec[FieldIndex(Fields::RatingBooking)], m_ratingBooking), ()); - CLOG(LDEBUG, strings::to_double(rec[FieldIndex(Fields::RatingUsers)], m_ratingUser), ()); + m_name = rec[Fields::Name]; + m_address = rec[Fields::Address]; - m_descUrl = rec[FieldIndex(Fields::DescUrl)]; + CLOG(LDEBUG, strings::to_uint(rec[Fields::Stars], m_stars), ()); + CLOG(LDEBUG, strings::to_uint(rec[Fields::PriceCategory], m_priceCategory), ()); + CLOG(LDEBUG, strings::to_double(rec[Fields::RatingBooking], m_ratingBooking), ()); + CLOG(LDEBUG, strings::to_double(rec[Fields::RatingUsers], m_ratingUser), ()); - CLOG(LDEBUG, strings::to_uint(rec[FieldIndex(Fields::Type)], m_type), ()); + m_descUrl = rec[Fields::DescUrl]; - m_translations = rec[FieldIndex(Fields::Translations)]; + CLOG(LDEBUG, strings::to_uint(rec[Fields::Type], m_type), ()); + + m_translations = rec[Fields::Translations]; + + m_id.Set(id); } // BookingDataset ---------------------------------------------------------------------------------- template <> -bool BookingDataset::NecessaryMatchingConditionHolds(FeatureBuilder const & fb) const +bool 
BookingDataset::IsSponsoredCandidate(FeatureBuilder const & fb) const { if (fb.GetName(StringUtf8Multilang::kDefaultCode).empty()) return false; @@ -173,28 +177,4 @@ void BookingDataset::BuildObject(Object const & hotel, FBuilderFnT const & fn) c fn(fb); } -/// @todo It looks like quite common FindMatchingObjectId function implementation. -template <> -BookingDataset::ObjectId BookingDataset::FindMatchingObjectIdImpl(FeatureBuilder const & fb) const -{ - auto const name = fb.GetName(StringUtf8Multilang::kDefaultCode); - - if (name.empty()) - return Object::InvalidObjectId(); - - // Find |kMaxSelectedElements| nearest values to a point, sorted by distance? - auto const bookingIndexes = m_storage.GetNearestObjects(mercator::ToLatLon(fb.GetKeyPoint())); - - /// @todo Select best candidate? Assume we match "Foo Resort SPA hotel" feature. Have candidates: - /// - "Bar SPA hotel" in 10 meters (first); - /// - "Foo SPA hotel" in 100 meters (second, but best); - /// I suspect that first "Bar hotel" will be selected (wrong). 
- for (auto const j : bookingIndexes) - { - if (sponsored_scoring::Match(m_storage.GetObjectById(j), fb).IsMatched()) - return j; - } - - return Object::InvalidObjectId(); -} -} // namespace generator +} // namespace generator diff --git a/generator/booking_dataset.hpp b/generator/booking_dataset.hpp index 0d412207b..6ece11fdd 100644 --- a/generator/booking_dataset.hpp +++ b/generator/booking_dataset.hpp @@ -7,29 +7,30 @@ namespace generator { -struct BookingHotel : SponsoredObjectBase +class BookingHotel : public SponsoredObjectBase { - enum class Fields + enum Fields { Id = 0, - Latitude = 1, - Longtitude = 2, - Name = 3, - Address = 4, - Stars = 5, - PriceCategory = 6, - RatingBooking = 7, - RatingUsers = 8, - DescUrl = 9, - Type = 10, - Translations = 11, + Latitude, + Longitude, + Name, + Address, + Stars, + PriceCategory, + RatingBooking, + RatingUsers, + DescUrl, + Type, + Translations, + Counter }; - explicit BookingHotel(std::string const & src); +public: + explicit BookingHotel(std::string src); - static constexpr size_t FieldIndex(Fields field) { return SponsoredObjectBase::FieldIndex(field); } - static constexpr size_t FieldsCount() { return SponsoredObjectBase::FieldsCount(); } + static constexpr size_t FieldsCount() { return Fields::Counter; } uint32_t m_stars = 0; uint32_t m_priceCategory = 0; @@ -37,6 +38,7 @@ struct BookingHotel : SponsoredObjectBase double m_ratingUser = 0.0; uint32_t m_type = 0; std::string m_translations; + std::string m_descUrl; }; using BookingDataset = SponsoredDataset; diff --git a/generator/booking_quality_check/booking_quality_check.cpp b/generator/booking_quality_check/booking_quality_check.cpp index 30878ba02..93198e277 100644 --- a/generator/booking_quality_check/booking_quality_check.cpp +++ b/generator/booking_quality_check/booking_quality_check.cpp @@ -1,22 +1,18 @@ -#include "generator/booking_dataset.hpp" - +//#include "generator/booking_dataset.hpp" #include "generator/feature_builder.hpp" +#include 
"generator/feature_maker.hpp" //#include "generator/opentable_dataset.hpp" +#include "generator/kayak_dataset.hpp" #include "generator/osm_source.hpp" -#include "generator/processor_booking.hpp" #include "generator/raw_generator.hpp" -#include "generator/sponsored_scoring.hpp" -#include "generator/translator_collection.hpp" -#include "generator/translator_factory.hpp" +#include "generator/sponsored_dataset_inl.hpp" +#include "generator/translator.hpp" #include "indexer/classificator_loader.hpp" -#include "geometry/distance_on_sphere.hpp" - #include "base/file_name_utils.hpp" #include "base/exception.hpp" #include "base/geo_object_id.hpp" -#include "base/stl_helpers.hpp" #include "base/string_utils.hpp" #include @@ -39,7 +35,7 @@ DEFINE_string(factors, "", "Factors output path"); DEFINE_string(sample, "", "Path so sample file"); DEFINE_uint64(seed, minstd_rand::default_seed, "Seed for random shuffle"); -DEFINE_uint64(selection_size, 1000, "Selection size"); +DEFINE_uint64(selection_size, 10000, "Selection size"); DEFINE_bool(generate, false, "Generate unmarked sample"); using namespace generator; @@ -100,7 +96,7 @@ GenerateInfo GetGenerateInfo() info.SetNodeStorageType("map"); info.SetOsmFileType("o5m"); - info.m_intermediateDir = base::GetDirectory(FLAGS_factors); + info.m_cacheDir = info.m_intermediateDir = base::GetDirectory(FLAGS_osm); // Set other info params here. 
@@ -189,36 +185,37 @@ vector> ReadSampleFromFile(string const & name) return ReadSample(ist); } +void PrintOsmUrl(std::ostream & os, ms::LatLon const & ll) +{ + os << "# URL: https://www.openstreetmap.org/?mlat=" << ll.m_lat << "&mlon=" << ll.m_lon + << "#map=18/" << ll.m_lat << "/" << ll.m_lon << endl; +}; + template void GenerateFactors(Dataset const & dataset, map const & features, vector> const & sampleItems, ostream & ost) { + ost << fixed << setprecision(6); + for (auto const & item : sampleItems) { auto const & object = dataset.GetStorage().GetObjectById(item.m_sponsoredId); auto const & feature = features.at(item.m_osmId); - auto const score = generator::sponsored_scoring::Match(object, feature); + auto const score = dataset.CalcScore(object, feature); - auto const center = mercator::ToLatLon(feature.GetKeyPoint()); - double const distanceMeters = ms::DistanceOnEarth(center, object.m_latLon); - auto const matched = score.IsMatched(); - - ost << "# ------------------------------------------" << fixed << setprecision(6) - << endl; - ost << (matched ? 'y' : 'n') << " \t" << DebugPrint(feature.GetMostGenericOsmId()) - << "\t " << object.m_id - << "\tdistance: " << distanceMeters + ost << "# ------------------------------------------" << endl; + ost << (score.IsMatched() ? 
"YES" : "NO") << "\t" << DebugPrint(feature.GetMostGenericOsmId()) + << "\t" << object.m_id + << "\tdistance: " << score.m_distance << "\tdistance score: " << score.m_linearNormDistanceScore << "\tname score: " << score.m_nameSimilarityScore << "\tresult score: " << score.GetMatchingScore() << endl; ost << "# " << PrintBuilder(feature) << endl; ost << "# " << object << endl; - ost << "# URL: https://www.openstreetmap.org/?mlat=" - << object.m_latLon.m_lat << "&mlon=" << object.m_latLon.m_lon << "#map=18/" - << object.m_latLon.m_lat << "/" << object.m_latLon.m_lon << endl; + PrintOsmUrl(ost, object.m_latLon); } } @@ -241,38 +238,33 @@ void GenerateSample(Dataset const & dataset, if (FLAGS_selection_size < elementIndexes.size()) elementIndexes.resize(FLAGS_selection_size); - stringstream outStream; - + ost << fixed << setprecision(6); for (auto osmId : elementIndexes) { auto const & fb = features.at(osmId); - auto const sponsoredIndexes = dataset.GetStorage().GetNearestObjects(mercator::ToLatLon(fb.GetKeyPoint())); + auto const ll = mercator::ToLatLon(fb.GetKeyPoint()); + auto const sponsoredIndexes = dataset.GetStorage().GetNearestObjects(ll); + + ost << "# ------------------------------------------" << endl + << "# " << PrintBuilder(fb) << endl; + PrintOsmUrl(ost, ll); for (auto const sponsoredId : sponsoredIndexes) { auto const & object = dataset.GetStorage().GetObjectById(sponsoredId); - auto const score = sponsored_scoring::Match(object, fb); + auto const score = dataset.CalcScore(object, fb); - auto const center = mercator::ToLatLon(fb.GetKeyPoint()); - double const distanceMeters = ms::DistanceOnEarth(center, object.m_latLon); - auto const matched = score.IsMatched(); - - ost << "# ------------------------------------------" << fixed << setprecision(6) - << endl; - ost << (matched ? 'y' : 'n') << " \t" << DebugPrint(osmId) << "\t " << sponsoredId - << "\tdistance: " << distanceMeters + ost << (score.IsMatched() ? 
"YES" : "NO") << "\t" << sponsoredId + << "\tdistance: " << score.m_distance << "\tdistance score: " << score.m_linearNormDistanceScore << "\tname score: " << score.m_nameSimilarityScore << "\tresult score: " << score.GetMatchingScore() - << endl; - ost << "# " << PrintBuilder(fb) << endl; - ost << "# " << object << endl; - ost << "# URL: https://www.openstreetmap.org/?mlat=" - << object.m_latLon.m_lat << "&mlon=" << object.m_latLon.m_lon - << "#map=18/" << object.m_latLon.m_lat << "/" << object.m_latLon.m_lon << endl; + << endl + << "# " << object << endl; + PrintOsmUrl(ost, object.m_latLon); } - if (!sponsoredIndexes.empty()) - ost << endl << endl; + + ost << endl; } } @@ -280,7 +272,7 @@ template string GetDatasetFilePath(GenerateInfo const & info); template <> -string GetDatasetFilePath(GenerateInfo const & info) +string GetDatasetFilePath(GenerateInfo const & info) { return info.m_bookingDataFilename; } @@ -291,6 +283,75 @@ string GetDatasetFilePath(GenerateInfo const & info) // return info.m_opentableDataFilename; //} +class TranslatorMock : public Translator +{ +public: + TranslatorMock(std::shared_ptr const & processor, + std::shared_ptr const & cache) + : Translator(processor, cache, std::make_shared(cache->GetCache())) + { + } + + /// @name TranslatorInterface overrides. + /// @{ + std::shared_ptr Clone() const override + { + UNREACHABLE(); + return nullptr; + } + void Merge(TranslatorInterface const &) override + { + UNREACHABLE(); + } + /// @} +}; + +class AggregateProcessor : public FeatureProcessorInterface +{ +public: + /// @name FeatureProcessorInterface overrides. 
+ /// @{ + std::shared_ptr Clone() const override + { + UNREACHABLE(); + return nullptr; + } + void Process(feature::FeatureBuilder & fb) override + { + auto const id = fb.GetMostGenericOsmId(); + m_features.emplace(id, std::move(fb)); + } + void Finish() override {} + /// @} + + std::map m_features; +}; + +template class DatasetFilter : public FilterInterface +{ + Dataset const & m_dataset; +public: + DatasetFilter(Dataset const & dataset) : m_dataset(dataset) {} + + /// @name FilterInterface overrides. + /// @{ + std::shared_ptr Clone() const override + { + UNREACHABLE(); + return nullptr; + } + bool IsAccepted(OsmElement const & e) const override + { + // All hotels under tourism tag. + return !e.GetTag("tourism").empty(); + } + bool IsAccepted(feature::FeatureBuilder const & fb) const override + { + return m_dataset.IsSponsoredCandidate(fb); + } + /// @} +}; + template void RunImpl(GenerateInfo & info) { @@ -298,16 +359,17 @@ void RunImpl(GenerateInfo & info) Dataset dataset(dataSetFilePath); LOG_SHORT(LINFO, (dataset.GetStorage().Size(), "objects are loaded from a file:", dataSetFilePath)); - map features; LOG_SHORT(LINFO, ("OSM data:", FLAGS_osm)); generator::cache::IntermediateDataObjectsCache objectsCache; - generator::cache::IntermediateData cacheLoader(objectsCache, info); - auto translators = make_shared(); - auto processor = make_shared>(dataset, features); - translators->Append(CreateTranslator(TranslatorType::Country, processor, cacheLoader.GetCache(), info)); + auto cache = std::make_shared(objectsCache, info); + auto processor = make_shared(); + auto translator = std::make_shared(processor, cache); + translator->SetFilter(std::make_shared>(dataset)); + RawGenerator generator(info); - generator.GenerateCustom(translators); + generator.GenerateCustom(translator); + CHECK(generator.Execute(), ()); if (FLAGS_generate) { @@ -319,7 +381,7 @@ void RunImpl(GenerateInfo & info) CHECK(ofst->is_open(), ("Can't open file", FLAGS_sample, strerror(errno))); ost = 
ofst.get(); } - GenerateSample(dataset, features, *ost); + GenerateSample(dataset, processor->m_features, *ost); } else { @@ -327,7 +389,7 @@ void RunImpl(GenerateInfo & info) LOG_SHORT(LINFO, ("Sample size is", sample.size())); ofstream ost(FLAGS_factors); CHECK(ost.is_open(), ("Can't open file", FLAGS_factors, strerror(errno))); - GenerateFactors(dataset, features, sample, ost); + GenerateFactors(dataset, processor->m_features, sample, ost); } } @@ -335,7 +397,7 @@ void Run(DatasetType const datasetType, GenerateInfo & info) { switch (datasetType) { - case DatasetType::Booking: RunImpl(info); break; + case DatasetType::Booking: RunImpl(info); break; //case DatasetType::Opentable: RunImpl(info); break; } } diff --git a/generator/booking_scoring.cpp b/generator/booking_scoring.cpp deleted file mode 100644 index cd7802c34..000000000 --- a/generator/booking_scoring.cpp +++ /dev/null @@ -1,51 +0,0 @@ -#include "generator/sponsored_scoring.hpp" - -#include "generator/booking_dataset.hpp" -#include "generator/feature_builder.hpp" - -#include "geometry/mercator.hpp" - - -namespace -{ -// Calculated with tools/python/booking_hotels_quality.py. -double constexpr kOptimalThreshold = 0.304875; -} // namespace - -namespace generator -{ -namespace sponsored_scoring -{ -template <> -double MatchStats::GetMatchingScore() const -{ - // TODO(mgsergio): Use tuner to get optimal function. - return m_linearNormDistanceScore * m_nameSimilarityScore; -} - -template <> -bool MatchStats::IsMatched() const -{ - return GetMatchingScore() > kOptimalThreshold; -} - -/// @todo It looks like quite common Match function implementation, -/// because GetLatLon and GetName() needed. 
-template <> -MatchStats Match(BookingHotel const & h, feature::FeatureBuilder const & fb) -{ - MatchStats score; - - auto const fbCenter = mercator::ToLatLon(fb.GetKeyPoint()); - auto const distance = ms::DistanceOnEarth(fbCenter, h.m_latLon); - score.m_linearNormDistanceScore = - impl::GetLinearNormDistanceScore(distance, BookingDataset::kDistanceLimitInMeters); - - // TODO(mgsergio): Check all translations and use the best one. - score.m_nameSimilarityScore = impl::GetNameSimilarityScore( - h.m_name, std::string(fb.GetName(StringUtf8Multilang::kDefaultCode))); - - return score; -} -} // namespace sponsored_scoring -} // namespace generator diff --git a/generator/final_processor_country.cpp b/generator/final_processor_country.cpp index f5152b98f..e9b93f091 100644 --- a/generator/final_processor_country.cpp +++ b/generator/final_processor_country.cpp @@ -3,7 +3,8 @@ #include "generator/addresses_collector.hpp" #include "generator/address_enricher.hpp" #include "generator/affiliation.hpp" -#include "generator/booking_dataset.hpp" +//#include "generator/booking_dataset.hpp" +#include "generator/kayak_dataset.hpp" #include "generator/coastlines_generator.hpp" #include "generator/feature_builder.hpp" #include "generator/final_processor_utils.hpp" @@ -97,7 +98,8 @@ void CountryFinalProcessor::Order() void CountryFinalProcessor::ProcessBooking() { - BookingDataset dataset(m_hotelsFilename); + KayakDataset dataset(m_hotelsFilename); + LOG(LINFO, ("Loaded", dataset.GetStorage().Size(), "hotels from", m_hotelsFilename)); std::ofstream matchingLogStream; matchingLogStream.exceptions(std::fstream::failbit | std::fstream::badbit); @@ -110,38 +112,46 @@ void CountryFinalProcessor::ProcessBooking() return; std::stringstream sstream; + sstream << std::fixed << std::setprecision(7); + + size_t total = 0, matched = 0; + FeatureBuilderWriter writer(path, true /* mangleName */); ForEachFeatureRawFormat(path, [&](FeatureBuilder && fb, uint64_t) { - auto const id = 
dataset.FindMatchingObjectId(fb); - if (id == BookingHotel::InvalidObjectId()) + bool hotelProcessed = false; + if (dataset.IsSponsoredCandidate(fb)) { - writer.Write(fb); - } - else - { - dataset.PreprocessMatchedOsmObject(id, fb, [&](FeatureBuilder & newFeature) + ++total; + auto const id = dataset.FindMatchingObjectId(fb); + if (id != KayakHotel::InvalidObjectId()) { - if (newFeature.PreSerialize()) - writer.Write(newFeature); - }); - } + ++matched; + hotelProcessed = true; + + dataset.PreprocessMatchedOsmObject(id, fb, [&](FeatureBuilder & newFeature) + { + if (newFeature.PreSerialize()) + writer.Write(newFeature); + }); - auto const & isHotelChecker = ftypes::IsHotelChecker::Instance(); - if (isHotelChecker(fb.GetTypes())) - { - if (id != BookingHotel::InvalidObjectId()) sstream << id; + } + else + sstream << "NO"; - auto const latLon = mercator::ToLatLon(fb.GetKeyPoint()); - sstream << ',' << fb.GetMostGenericOsmId().GetEncodedId() << ',' - << strings::to_string_dac(latLon.m_lat, 7) << ',' - << strings::to_string_dac(latLon.m_lon, 7) << ',' << name << '\n'; + auto const ll = mercator::ToLatLon(fb.GetKeyPoint()); + sstream << ",\t" << DebugPrint(fb.GetMostGenericOsmId()) << ",\t" << ll.m_lat << ',' << ll.m_lon << std::endl; } + + if (!hotelProcessed) + writer.Write(fb); }); std::lock_guard guard(m); matchingLogStream << sstream.str(); + LOG(LINFO, ("Hotels (MWM, total, matched):", name, total, matched)); + }, m_threadsCount); std::vector fbs; diff --git a/generator/final_processor_utils.hpp b/generator/final_processor_utils.hpp index 8ce2d47af..e4ea4ad2c 100644 --- a/generator/final_processor_utils.hpp +++ b/generator/final_processor_utils.hpp @@ -43,6 +43,9 @@ std::vector> AppendToMwmTmp(std::vector> countryToFbsIndexes; for (size_t i = 0; i < fbs.size(); ++i) diff --git a/generator/generator_tests/CMakeLists.txt b/generator/generator_tests/CMakeLists.txt index 65780e830..6337ad506 100644 --- a/generator/generator_tests/CMakeLists.txt +++ 
b/generator/generator_tests/CMakeLists.txt @@ -46,6 +46,7 @@ set(SRC source_data.hpp source_to_element_test.cpp speed_cameras_test.cpp + sponsored_scoring_tests.cpp srtm_parser_test.cpp tag_admixer_test.cpp tesselator_test.cpp diff --git a/generator/generator_tests/sponsored_scoring_tests.cpp b/generator/generator_tests/sponsored_scoring_tests.cpp new file mode 100644 index 000000000..45c77c8d6 --- /dev/null +++ b/generator/generator_tests/sponsored_scoring_tests.cpp @@ -0,0 +1,41 @@ +#include "testing/testing.hpp" + +#include "generator/sponsored_scoring.hpp" + +#include "geometry/distance_on_sphere.hpp" +#include "geometry/latlon.hpp" + +namespace sponsored_scoring_tests +{ + +generator::sponsored::MatchStats GetMatch(ms::LatLon osmLL, std::string const & osmName, + ms::LatLon hotelLL, std::string const & hotelName) +{ + // The same as SponsoredDataset::kDistanceLimitMeters + return { ms::DistanceOnEarth(osmLL, hotelLL), 150.0, hotelName, osmName }; +} + +UNIT_TEST(SponsoredScoring_Paris) +{ + TEST(!GetMatch({48.8474633, 2.3712106}, "Hôtel de Marseille", + {48.8473730, 2.3712020}, "Hotel Riesner").IsMatched(), ()); + + TEST(GetMatch({48.8760697, 2.3456749}, "Holiday Villa", + {48.8761570, 2.3455750}, "Hotel Villa Lafayette Paris IX").IsMatched(), ()); + + TEST(GetMatch({48.8664199, 2.2892440}, "Hôtel Baltimore", + {48.8663780, 2.2895710}, "Sofitel Paris Baltimore Tour Eiffel").IsMatched(), ()); + + TEST(!GetMatch({48.8808205, 2.3517253}, "Grand Hotel Magenta", + {48.8806950, 2.3521320}, "Hotel Cambrai").IsMatched(), ()); + + // But may be false on the ground. 
+ TEST(GetMatch({48.8733283, 2.3004615}, "Hôtel Balzac", + {48.8735222, 2.3004904}, "Apart Inn Paris - Balzac").IsMatched(), ()); + + TEST(!GetMatch({48.8470895, 2.3710844}, "Hôtel Mignon", + {48.8473730, 2.3712020}, "Hotel Riesner").IsMatched(), ()); + +} + +} // namespace sponsored_scoring_tests diff --git a/generator/kayak_dataset.cpp b/generator/kayak_dataset.cpp new file mode 100644 index 000000000..b74ee521c --- /dev/null +++ b/generator/kayak_dataset.cpp @@ -0,0 +1,74 @@ +#include "generator/kayak_dataset.hpp" + +#include "generator/feature_builder.hpp" + +#include "indexer/ftypes_matcher.hpp" + +#include "base/logging.hpp" +#include "base/string_utils.hpp" + + +namespace generator +{ +using namespace feature; + +// BookingHotel ------------------------------------------------------------------------------------ +KayakHotel::KayakHotel(std::string src) +{ + using namespace strings; + + // Patch strange entries. + if (src.starts_with("\",")) + src.erase(0, 1); + + /// @todo For fast parsing we can preprocess src (quotes) and return string_view's. + std::vector rec; + strings::ParseCSVRow(src, ',', rec); + + // Skip bad entries and header. + if (rec.size() != Fields::Counter || rec[0] == "ChainID") + return; + + // Assign id in the end in case of possible errors. + uint32_t id; + CLOG(LDEBUG, to_uint(rec[Fields::KayakHotelID], id), ()); /// @todo HotelID ? 
+ CLOG(LDEBUG, to_double(rec[Fields::Latitude], m_latLon.m_lat), (rec[Fields::Latitude])); + CLOG(LDEBUG, to_double(rec[Fields::Longitude], m_latLon.m_lon), (rec[Fields::Longitude])); + + if (!to_double(rec[Fields::OverallRating], m_overallRating)) + m_overallRating = kInvalidRating; + + m_name = rec[Fields::HotelName]; + m_address = rec[Fields::HotelAddress]; + + m_id.Set(id); +} + + +// KayakDataset ---------------------------------------------------------------------------------- +template <> +bool KayakDataset::IsSponsoredCandidate(FeatureBuilder const & fb) const +{ + if (fb.GetName(StringUtf8Multilang::kDefaultCode).empty()) + return false; + + return ftypes::IsHotelChecker::Instance()(fb.GetTypes()); +} + +template <> +void KayakDataset::PreprocessMatchedOsmObject(ObjectId id, FeatureBuilder & fb, FBuilderFnT const fn) const +{ + auto const & hotel = m_storage.GetObjectById(id); + + fb.SetHotelInfo(Metadata::SRC_KAYAK, hotel.m_id.Get(), hotel.m_overallRating, 0 /* priceCategory */); + + fn(fb); +} + +template <> +void KayakDataset::BuildObject(Object const &, FBuilderFnT const &) const +{ + // Don't create new objects. 
+} + +} // namespace generator diff --git a/generator/kayak_dataset.hpp b/generator/kayak_dataset.hpp new file mode 100644 index 000000000..6294f74f0 --- /dev/null +++ b/generator/kayak_dataset.hpp @@ -0,0 +1,66 @@ +#pragma once + +#include "generator/sponsored_dataset.hpp" +#include "generator/sponsored_object_base.hpp" + +#include + +namespace generator +{ +class KayakHotel : public SponsoredObjectBase +{ + enum Fields + { + ChainID = 0, + ChainName, + Checkin, + Checkout, + CountryCode, + CountryFileName, + CountryName, + CurrencyCode, + DateCreated, + Facilities, + HotelAddress, + HotelFileName, + HotelID, + HotelName, + HotelPostcode, + IataPlaceCode, + ImageID, + KayakHotelID, + LastUpdated, + Latitude, + Longitude, + MinRate, + OverallRating, + PlaceFileName, + PlaceID, + PlaceName, + PlaceType, + Popularity, + PropertyType, + PropertyTypeID, + SelfRated, + StarRating, + StateName, + StatePlaceID, + StatePlacefilename, + Themes, + Trademarked, + TransliteratedHotelName, + + Counter + }; + +public: + explicit KayakHotel(std::string src); + + static constexpr size_t FieldsCount() { return Fields::Counter; } + + static double constexpr kInvalidRating = 0; + double m_overallRating = kInvalidRating; +}; + +using KayakDataset = SponsoredDataset; +} // namespace generator diff --git a/generator/processor_booking.hpp b/generator/processor_booking.hpp deleted file mode 100644 index bbf3ed531..000000000 --- a/generator/processor_booking.hpp +++ /dev/null @@ -1,46 +0,0 @@ -#pragma once - -#include "generator/feature_builder.hpp" -#include "generator/feature_generator.hpp" -#include "generator/processor_interface.hpp" - -#include "indexer/feature_data.hpp" - -#include "base/assert.hpp" -#include "base/geo_object_id.hpp" - -#include -#include - -namespace generator -{ -template -class ProcessorBooking : public FeatureProcessorInterface -{ -public: - ProcessorBooking(Dataset const & dataset, - std::map & features) - : m_dataset(dataset), m_features(features) - { - } - - 
// FeatureProcessorInterface overrides: - std::shared_ptr Clone() const override - { - CHECK(false, ()); - return {}; - } - - void Process(feature::FeatureBuilder & fb) override - { - if (m_dataset.NecessaryMatchingConditionHolds(fb)) - m_features.emplace(fb.GetMostGenericOsmId(), fb); - } - - void Finish() override {} - -private: - Dataset const & m_dataset; - std::map & m_features; -}; -} // namespace generator diff --git a/generator/processor_factory.hpp b/generator/processor_factory.hpp index c12e7aef7..3cef118a0 100644 --- a/generator/processor_factory.hpp +++ b/generator/processor_factory.hpp @@ -1,7 +1,6 @@ #pragma once #include "generator/factory_utils.hpp" -#include "generator/processor_booking.hpp" #include "generator/processor_coastline.hpp" //#include "generator/processor_complex.hpp" #include "generator/processor_country.hpp" diff --git a/generator/raw_generator.cpp b/generator/raw_generator.cpp index f8d91d9b8..695310ac1 100644 --- a/generator/raw_generator.cpp +++ b/generator/raw_generator.cpp @@ -141,6 +141,7 @@ void RawGenerator::GenerateCoasts() void RawGenerator::GenerateCustom(std::shared_ptr const & translator) { + CHECK(translator, ()); m_translators->Append(translator); } @@ -148,6 +149,7 @@ void RawGenerator::GenerateCustom( std::shared_ptr const & translator, std::shared_ptr const & finalProcessor) { + CHECK(translator && finalProcessor, ()); m_translators->Append(translator); m_finalProcessors.emplace(finalProcessor); } diff --git a/generator/sponsored_dataset.hpp b/generator/sponsored_dataset.hpp index bdeca2fcd..3ca2f9d95 100644 --- a/generator/sponsored_dataset.hpp +++ b/generator/sponsored_dataset.hpp @@ -1,17 +1,16 @@ #pragma once #include "generator/sponsored_object_storage.hpp" +#include "generator/sponsored_scoring.hpp" #include #include -namespace feature -{ -class FeatureBuilder; -} // namespace feature +namespace feature { class FeatureBuilder; } namespace generator { + template class SponsoredDataset { @@ -19,32 +18,34 @@ 
public: using Object = SponsoredObject; using ObjectId = typename Object::ObjectId; - static double constexpr kDistanceLimitInMeters = 150; + static double constexpr kDistanceLimitMeters = 150; static size_t constexpr kMaxSelectedElements = 3; explicit SponsoredDataset(std::string const & dataPath); - /// @return true if |fb| satisfies some necessary conditions to match one or serveral - /// objects from dataset. - bool NecessaryMatchingConditionHolds(feature::FeatureBuilder const & fb) const; - ObjectId FindMatchingObjectId(feature::FeatureBuilder const & e) const; + /// @return true if |fb| satisfies some necessary conditions to match one or serveral objects from dataset. + bool IsSponsoredCandidate(feature::FeatureBuilder const & fb) const; + ObjectId FindMatchingObjectId(feature::FeatureBuilder const & fb) const; using FBuilderFnT = std::function; // Applies changes to a given osm object (for example, remove hotel type) // and passes the result to |fn|. - void PreprocessMatchedOsmObject(ObjectId matchedObjId, feature::FeatureBuilder & fb, - FBuilderFnT const fn) const; + void PreprocessMatchedOsmObject(ObjectId matchedObjId, feature::FeatureBuilder & fb, FBuilderFnT const fn) const; // Creates objects and adds them to the map (MWM) via |fn|. void BuildOsmObjects(FBuilderFnT const & fn) const; + static sponsored::MatchStats CalcScore(Object const & obj, feature::FeatureBuilder const & fb); + sponsored::MatchStats CalcScore(ObjectId objId, feature::FeatureBuilder const & fb) const + { + return CalcScore(m_storage.GetObjectById(objId), fb); + } + SponsoredObjectStorage const & GetStorage() const { return m_storage; } private: void BuildObject(Object const & object, FBuilderFnT const & fn) const; - /// @return an id of a matched object or kInvalidObjectId on failure. 
- ObjectId FindMatchingObjectIdImpl(feature::FeatureBuilder const & fb) const; - SponsoredObjectStorage m_storage; }; + } // namespace generator diff --git a/generator/sponsored_dataset_inl.hpp b/generator/sponsored_dataset_inl.hpp index 5cf52c637..fd3c5a703 100644 --- a/generator/sponsored_dataset_inl.hpp +++ b/generator/sponsored_dataset_inl.hpp @@ -1,17 +1,21 @@ #pragma once +#include "generator/feature_builder.hpp" #include "generator/sponsored_dataset.hpp" +#include "geometry/mercator.hpp" + #include #include + namespace generator { // SponsoredDataset -------------------------------------------------------------------------------- template SponsoredDataset::SponsoredDataset(std::string const & dataPath) - : m_storage(kDistanceLimitInMeters, kMaxSelectedElements) + : m_storage(kDistanceLimitMeters, kMaxSelectedElements) { m_storage.LoadData(dataPath); } @@ -23,12 +27,46 @@ void SponsoredDataset::BuildOsmObjects(FBuilderFnT const & fn) BuildObject(item.second, fn); } +template +sponsored::MatchStats SponsoredDataset::CalcScore( + Object const & obj, feature::FeatureBuilder const & fb) +{ + auto const fbCenter = mercator::ToLatLon(fb.GetKeyPoint()); + auto const distance = ms::DistanceOnEarth(fbCenter, obj.m_latLon); + + /// @todo Input dataset is in English language. + auto name = fb.GetName(StringUtf8Multilang::kEnglishCode); + if (name.empty()) + name = fb.GetName(StringUtf8Multilang::kDefaultCode); + + return { distance, kDistanceLimitMeters, obj.m_name, std::string(name) }; +} + template typename SponsoredDataset::ObjectId SponsoredDataset::FindMatchingObjectId(feature::FeatureBuilder const & fb) const { - if (NecessaryMatchingConditionHolds(fb)) - return FindMatchingObjectIdImpl(fb); - return Object::InvalidObjectId(); + // Find |kMaxSelectedElements| nearest values to a point, sorted by distance? + auto const indices = m_storage.GetNearestObjects(mercator::ToLatLon(fb.GetKeyPoint())); + + // Select best candidate by score. 
+ double bestScore = -1; + auto res = Object::InvalidObjectId(); + for (auto const i : indices) + { + auto const r = CalcScore(i, fb); + if (r.IsMatched()) + { + double const score = r.GetMatchingScore(); + if (score > bestScore) + { + bestScore = score; + res = i; + } + } + } + + return res; } + } // namespace generator diff --git a/generator/sponsored_object_base.hpp b/generator/sponsored_object_base.hpp index bc3f68fa9..1b47d95a3 100644 --- a/generator/sponsored_object_base.hpp +++ b/generator/sponsored_object_base.hpp @@ -22,22 +22,15 @@ struct SponsoredObjectBase virtual ~SponsoredObjectBase() = default; - template - static constexpr size_t FieldIndex(Fields field) { return static_cast(field); } - - template - static constexpr size_t FieldsCount() { return static_cast(Fields::Counter); } - bool HasAddresParts() const { return !m_street.empty() || !m_houseNumber.empty(); } ObjectId m_id{InvalidObjectId()}; ms::LatLon m_latLon = ms::LatLon::Zero(); std::string m_name; + std::string m_street; std::string m_houseNumber; - std::string m_address; - std::string m_descUrl; }; NEWTYPE_SIMPLE_OUTPUT(SponsoredObjectBase::ObjectId); diff --git a/generator/sponsored_object_storage.hpp b/generator/sponsored_object_storage.hpp index c6bcf079a..292ce06ef 100644 --- a/generator/sponsored_object_storage.hpp +++ b/generator/sponsored_object_storage.hpp @@ -107,7 +107,9 @@ public: for (std::string line; std::getline(src, line);) { - Object object(line); + Object object(std::move(line)); + line.clear(); + if (object.m_id != Object::InvalidObjectId() && excludedIds.find(object.m_id) == excludedIds.cend()) { diff --git a/generator/sponsored_scoring.cpp b/generator/sponsored_scoring.cpp index 41a3c8f2f..d8412418c 100644 --- a/generator/sponsored_scoring.cpp +++ b/generator/sponsored_scoring.cpp @@ -1,5 +1,7 @@ #include "generator/sponsored_scoring.hpp" +#include "search/ranking_utils.hpp" + #include "indexer/search_string_utils.hpp" #include "base/math.hpp" @@ -7,18 +9,50 @@ #include 
#include + +namespace generator +{ +namespace sponsored +{ namespace { -using WeightedBagOfWords = std::vector>; +using StringT = strings::UniString; +class SkipTokens +{ + std::set m_skip; +public: + SkipTokens() + { + /// @todo Add other common terms? + m_skip.insert(strings::MakeUniString("hotel")); + } + bool Has(StringT const & s) const + { + return m_skip.count(s) > 0; + } +}; -std::vector StringToWords(std::string const & str) +using WeightedBagOfWords = std::vector>; + +std::vector StringToWords(std::string const & str) { auto result = search::NormalizeAndTokenizeString(str); - std::sort(std::begin(result), std::end(result)); + + static SkipTokens toSkip; + auto it = std::remove_if(result.begin(), result.end(), [](StringT const & s) + { + return toSkip.Has(s) || search::IsStopWord(s); + }); + + // In case if name is like "The Hotel". + if (std::distance(result.begin(), it) > 0) + result.erase(it, result.end()); + + std::sort(result.begin(), result.end()); return result; } -WeightedBagOfWords MakeWeightedBagOfWords(std::vector const & words) +WeightedBagOfWords MakeWeightedBagOfWords(std::vector const & words) { // TODO(mgsergio): Calculate tf-idsf score for every word. 
auto constexpr kTfIdfScorePlaceholder = 1; @@ -38,7 +72,7 @@ WeightedBagOfWords MakeWeightedBagOfWords(std::vector const double WeightedBagsDotProduct(WeightedBagOfWords const & lhs, WeightedBagOfWords const & rhs) { - double result{}; + double result = 0; auto lhsIt = begin(lhs); auto rhsIt = begin(rhs); @@ -77,12 +111,7 @@ double WeightedBagOfWordsCos(WeightedBagOfWords const & lhs, WeightedBagOfWords return product / (lhsLength * rhsLength); } -} // namespace -namespace generator -{ -namespace impl -{ double GetLinearNormDistanceScore(double distance, double const maxDistance) { CHECK_NOT_EQUAL(maxDistance, 0.0, ("maxDistance cannot be 0.")); @@ -102,5 +131,16 @@ double GetNameSimilarityScore(std::string const & booking_name, std::string cons return WeightedBagOfWordsCos(aws, bws); } -} // namespace impl -} // namespace generator +} // namespace + +MatchStats::MatchStats(double distM, double distLimitM, std::string const & name, std::string const & fbName) + : m_distance(distM) +{ + m_linearNormDistanceScore = GetLinearNormDistanceScore(distM, distLimitM); + + // TODO(mgsergio): Check all translations and use the best one. + m_nameSimilarityScore = GetNameSimilarityScore(name, fbName); +} + +} // namespace sponsored +} // namespace generator diff --git a/generator/sponsored_scoring.hpp b/generator/sponsored_scoring.hpp index 74f99185e..beb0e240c 100644 --- a/generator/sponsored_scoring.hpp +++ b/generator/sponsored_scoring.hpp @@ -2,36 +2,40 @@ #include -namespace feature -{ -class FeatureBuilder; -} // namespace feature - namespace generator { -namespace impl -{ -double GetLinearNormDistanceScore(double distance, double maxDistance); -double GetNameSimilarityScore(std::string const & booking_name, std::string const & osm_name); -} // namespace impl +struct SponsoredObjectBase; -namespace sponsored_scoring +namespace sponsored { -/// Represents a match scoring statystics of a sponsored object agains osm object. 
-template -struct MatchStats -{ - /// Returns some score based on geven fields and classificator tuning. - double GetMatchingScore() const; - /// Returns true if GetMatchingScore is greater then some theshold. - bool IsMatched() const; - double m_linearNormDistanceScore{}; - double m_nameSimilarityScore{}; +/// Represents the match scoring statistics of a sponsored object against an OSM object. +class MatchStats +{ + // Calculated with tools/python/booking_hotels_quality.py. + static double constexpr kOptimalThreshold = 0.304875; + +public: + MatchStats(double distM, double distLimitM, std::string const & name, std::string const & fbName); + + /// @return some score based on given fields and classificator tuning. + double GetMatchingScore() const + { + // TODO(mgsergio): Use tuner to get optimal function. + return m_linearNormDistanceScore * m_nameSimilarityScore; + } + + /// @return true if GetMatchingScore is greater than some threshold. + bool IsMatched() const + { + return GetMatchingScore() > kOptimalThreshold; + } + +public: + double m_distance; + double m_linearNormDistanceScore; + double m_nameSimilarityScore; }; -/// Matches a given sponsored object against a given OSM object. -template -MatchStats Match(SponsoredObject const & o, feature::FeatureBuilder const & fb); -} // namespace booking_scoring -} // namespace generator +} // namespace sponsored +} // namespace generator