diff --git a/generator/CMakeLists.txt b/generator/CMakeLists.txt index 9da16f131..18419b18f 100644 --- a/generator/CMakeLists.txt +++ b/generator/CMakeLists.txt @@ -9,6 +9,10 @@ set(SRC affiliation.hpp altitude_generator.cpp altitude_generator.hpp + # Should precede booking_dataset.cpp because of Unity build + template instantiation order. + booking_scoring.cpp + booking_dataset.cpp + booking_dataset.hpp borders.cpp borders.hpp boundary_postcodes_enricher.cpp @@ -196,6 +200,12 @@ set(SRC routing_world_roads_generator.hpp search_index_builder.cpp search_index_builder.hpp + sponsored_dataset.hpp + sponsored_dataset_inl.hpp + sponsored_object_base.hpp + sponsored_object_storage.hpp + sponsored_scoring.cpp + sponsored_scoring.hpp srtm_parser.cpp srtm_parser.hpp statistics.cpp @@ -274,5 +284,6 @@ omim_add_tool_subdirectory(generator_tool) #omim_add_tool_subdirectory(complex_generator) omim_add_tool_subdirectory(feature_segments_checker) omim_add_tool_subdirectory(srtm_coverage_checker) +omim_add_tool_subdirectory(booking_quality_check) add_subdirectory(world_roads_builder) add_subdirectory(address_parser) diff --git a/generator/booking_dataset.cpp b/generator/booking_dataset.cpp new file mode 100644 index 000000000..847f947aa --- /dev/null +++ b/generator/booking_dataset.cpp @@ -0,0 +1,200 @@ +#include "generator/booking_dataset.hpp" +#include "generator/feature_builder.hpp" +#include "generator/sponsored_scoring.hpp" + +#include "indexer/classificator.hpp" +#include "indexer/ftypes_matcher.hpp" + +#include "geometry/mercator.hpp" + +#include "base/logging.hpp" +#include "base/string_utils.hpp" + +#include "boost/algorithm/string/replace.hpp" + + +namespace generator +{ +using namespace feature; + +// BookingHotel ------------------------------------------------------------------------------------ +BookingHotel::BookingHotel(std::string const & src) +{ + std::vector rec; + strings::ParseCSVRow(src, '\t', rec); + CHECK_EQUAL(rec.size(), FieldsCount(), ("Error parsing 
hotels.tsv line:", + boost::replace_all_copy(src, "\t", "\\t"))); + + CLOG(LDEBUG, strings::to_uint(rec[FieldIndex(Fields::Id)], m_id.Get()), ()); + // TODO(mgsergio): Use ms::LatLon. + CLOG(LDEBUG, strings::to_double(rec[FieldIndex(Fields::Latitude)], m_latLon.m_lat), ()); + CLOG(LDEBUG, strings::to_double(rec[FieldIndex(Fields::Longtitude)], m_latLon.m_lon), ()); + + m_name = rec[FieldIndex(Fields::Name)]; + m_address = rec[FieldIndex(Fields::Address)]; + + CLOG(LDEBUG, strings::to_uint(rec[FieldIndex(Fields::Stars)], m_stars), ()); + CLOG(LDEBUG, strings::to_uint(rec[FieldIndex(Fields::PriceCategory)], m_priceCategory), ()); + CLOG(LDEBUG, strings::to_double(rec[FieldIndex(Fields::RatingBooking)], m_ratingBooking), ()); + CLOG(LDEBUG, strings::to_double(rec[FieldIndex(Fields::RatingUsers)], m_ratingUser), ()); + + m_descUrl = rec[FieldIndex(Fields::DescUrl)]; + + CLOG(LDEBUG, strings::to_uint(rec[FieldIndex(Fields::Type)], m_type), ()); + + m_translations = rec[FieldIndex(Fields::Translations)]; +} + + +// BookingDataset ---------------------------------------------------------------------------------- +template <> +bool BookingDataset::NecessaryMatchingConditionHolds(FeatureBuilder const & fb) const +{ + if (fb.GetName(StringUtf8Multilang::kDefaultCode).empty()) + return false; + + return ftypes::IsHotelChecker::Instance()(fb.GetTypes()); +} + +template <> +void BookingDataset::PreprocessMatchedOsmObject(ObjectId, FeatureBuilder & fb, FBuilderFnT const fn) const +{ + // Turn a hotel into a simple building. + if (fb.GetGeomType() == GeomType::Area) + { + // Remove all information about the hotel. 
+ auto & meta = fb.GetMetadata(); + meta.Drop(Metadata::EType::FMD_STARS); + meta.Drop(Metadata::EType::FMD_WEBSITE); + meta.Drop(Metadata::EType::FMD_PHONE_NUMBER); + + auto & params = fb.GetParams(); + params.ClearName(); + + auto const tourism = classif().GetTypeByPath({"tourism"}); + base::EraseIf(params.m_types, [tourism](uint32_t type) + { + ftype::TruncValue(type, 1); + return type == tourism; + }); + } + + fn(fb); +} + +template <> +void BookingDataset::BuildObject(Object const & hotel, FBuilderFnT const & fn) const +{ + FeatureBuilder fb; + + fb.SetCenter(mercator::FromLatLon(hotel.m_latLon.m_lat, hotel.m_latLon.m_lon)); + + /// @todo SRC_BOOKING + fb.SetHotelInfo(Metadata::SRC_KAYAK, hotel.m_id.Get(), hotel.m_ratingUser, hotel.m_priceCategory); + auto & metadata = fb.GetMetadata(); + metadata.Set(Metadata::FMD_WEBSITE, hotel.m_descUrl); + metadata.Set(Metadata::FMD_STARS, strings::to_string(hotel.m_stars)); + + auto & params = fb.GetParams(); + if (!hotel.m_street.empty()) + params.SetStreet(hotel.m_street); + + if (!hotel.m_houseNumber.empty()) + params.AddHouseNumber(hotel.m_houseNumber); + + if (!hotel.m_translations.empty()) + { + // TODO(mgsergio): Move parsing to the hotel costruction stage. + std::vector parts; + strings::ParseCSVRow(hotel.m_translations, '|', parts); + CHECK_EQUAL(parts.size() % 3, 0, ("Invalid translation string:", hotel.m_translations)); + for (size_t i = 0; i < parts.size(); i += 3) + { + auto const langCode = StringUtf8Multilang::GetLangIndex(parts[i]); + params.AddName(StringUtf8Multilang::GetLangByCode(langCode), parts[i + 1]); + // TODO(mgsergio): e.AddTag("addr:full:" + parts[i], parts[i + 2]); + } + } + params.AddName(StringUtf8Multilang::GetLangByCode(StringUtf8Multilang::kEnglishCode), hotel.m_name); + + auto const & clf = classif(); + params.AddType(clf.GetTypeByPath({"sponsored", "booking"})); + // Matching booking.com hotel types to OpenStreetMap values. + // Booking types are listed in the closed API docs. 
+ switch (hotel.m_type) + { + case 19: + case 205: params.AddType(clf.GetTypeByPath({"tourism", "motel"})); break; + + case 21: + case 206: + case 212: params.AddType(clf.GetTypeByPath({"tourism", "resort"})); break; + + case 3: + case 23: + case 24: + case 25: + case 202: + case 207: + case 208: + case 209: + case 210: + case 216: + case 220: + case 223: params.AddType(clf.GetTypeByPath({"tourism", "guest_house"})); break; + + case 14: + case 204: + case 213: + case 218: + case 219: + case 226: + case 222: params.AddType(clf.GetTypeByPath({"tourism", "hotel"})); break; + + case 211: + case 224: + case 228: params.AddType(clf.GetTypeByPath({"tourism", "chalet"})); break; + + case 13: + case 225: + case 203: params.AddType(clf.GetTypeByPath({"tourism", "hostel"})); break; + + case 215: + case 221: + case 227: + case 2: + case 201: params.AddType(clf.GetTypeByPath({"tourism", "apartment"})); break; + + case 214: params.AddType(clf.GetTypeByPath({"tourism", "camp_site"})); break; + + default: params.AddType(clf.GetTypeByPath({"tourism", "hotel"})); break; + } + + fn(fb); +} + +/// @todo It looks like quite common FindMatchingObjectId function implementation. +template <> +BookingDataset::ObjectId BookingDataset::FindMatchingObjectIdImpl(FeatureBuilder const & fb) const +{ + auto const name = fb.GetName(StringUtf8Multilang::kDefaultCode); + + if (name.empty()) + return Object::InvalidObjectId(); + + // Find |kMaxSelectedElements| nearest values to a point, sorted by distance? + auto const bookingIndexes = m_storage.GetNearestObjects(mercator::ToLatLon(fb.GetKeyPoint())); + + /// @todo Select best candidate? Assume we match "Foo Resort SPA hotel" feature. Have candidates: + /// - "Bar SPA hotel" in 10 meters (first); + /// - "Foo SPA hotel" in 100 meters (second, but best); + /// I suspect that first "Bar hotel" will be selected (wrong). 
+ for (auto const j : bookingIndexes) + { + if (sponsored_scoring::Match(m_storage.GetObjectById(j), fb).IsMatched()) + return j; + } + + return Object::InvalidObjectId(); +} +} // namespace generator diff --git a/generator/booking_dataset.hpp b/generator/booking_dataset.hpp new file mode 100644 index 000000000..0d412207b --- /dev/null +++ b/generator/booking_dataset.hpp @@ -0,0 +1,43 @@ +#pragma once + +#include "generator/sponsored_dataset.hpp" +#include "generator/sponsored_object_base.hpp" + +#include + +namespace generator +{ +struct BookingHotel : SponsoredObjectBase +{ + enum class Fields + { + Id = 0, + Latitude = 1, + Longtitude = 2, + Name = 3, + Address = 4, + Stars = 5, + PriceCategory = 6, + RatingBooking = 7, + RatingUsers = 8, + DescUrl = 9, + Type = 10, + Translations = 11, + Counter + }; + + explicit BookingHotel(std::string const & src); + + static constexpr size_t FieldIndex(Fields field) { return SponsoredObjectBase::FieldIndex(field); } + static constexpr size_t FieldsCount() { return SponsoredObjectBase::FieldsCount(); } + + uint32_t m_stars = 0; + uint32_t m_priceCategory = 0; + double m_ratingBooking = 0.0; + double m_ratingUser = 0.0; + uint32_t m_type = 0; + std::string m_translations; +}; + +using BookingDataset = SponsoredDataset; +} // namespace generator diff --git a/generator/booking_quality_check/CMakeLists.txt b/generator/booking_quality_check/CMakeLists.txt new file mode 100644 index 000000000..8bfa5f020 --- /dev/null +++ b/generator/booking_quality_check/CMakeLists.txt @@ -0,0 +1,11 @@ +project(booking_quality_check) + +set(SRC booking_quality_check.cpp) +#set(SRC booking_addr_match.cpp) + +omim_add_executable(${PROJECT_NAME} ${SRC}) + +target_link_libraries(${PROJECT_NAME} + generator + gflags::gflags +) diff --git a/generator/booking_quality_check/booking_addr_match.cpp b/generator/booking_quality_check/booking_addr_match.cpp new file mode 100644 index 000000000..228054fc7 --- /dev/null +++ 
b/generator/booking_quality_check/booking_addr_match.cpp @@ -0,0 +1,96 @@ +#include "generator/booking_dataset.hpp" +#include "generator/utils.hpp" + +#include "search/reverse_geocoder.hpp" + +#include "indexer/data_source.hpp" + +#include "geometry/mercator.hpp" + +#include "platform/platform.hpp" + +#include + +#include + + +DEFINE_string(booking_data, "", "Path to booking data in .tsv format"); +DEFINE_string(user_resource_path, "", "Path to data directory (resources dir)"); +DEFINE_string(data_path, "", "Path to mwm files (writable dir)"); +DEFINE_string(locale, "en", "Locale of all the search queries"); +DEFINE_int32(num_threads, 1, "Number of search engine threads"); + +namespace +{ + +class AddressMatcher +{ +public: + AddressMatcher() + { + LoadDataSource(m_dataSource); + m_coder = std::make_unique(m_dataSource); + } + + template + void operator()(SponsoredObject & object) + { + search::ReverseGeocoder::Address addr; + m_coder->GetNearbyAddress(mercator::FromLatLon(object.m_latLon), addr); + object.m_street = addr.GetStreetName(); + object.m_houseNumber = addr.GetHouseNumber(); + } + +private: + FrozenDataSource m_dataSource; + std::unique_ptr m_coder; +}; + +} // namespace + +int main(int argc, char * argv[]) +{ + gflags::SetUsageMessage( + "Takes OSM XML data from stdin and creates" + " data and index files in several passes."); + gflags::ParseCommandLineFlags(&argc, &argv, true); + + Platform & platform = GetPlatform(); + + if (!FLAGS_user_resource_path.empty()) + platform.SetResourceDir(FLAGS_user_resource_path); + + if (!FLAGS_data_path.empty()) + platform.SetWritableDirForTests(FLAGS_data_path); + + LOG(LINFO, ("writable dir =", platform.WritableDir())); + LOG(LINFO, ("resources dir =", platform.ResourcesDir())); + + LOG_SHORT(LINFO, ("Booking data:", FLAGS_booking_data)); + + generator::BookingDataset bookingDataset(FLAGS_booking_data); + AddressMatcher addressMatcher; + + size_t matchedNum = 0; + size_t emptyAddr = 0; + auto const & storage = 
bookingDataset.GetStorage(); + for (auto [_, hotel] : storage.GetObjects()) + { + addressMatcher(hotel); + + if (hotel.m_address.empty()) + ++emptyAddr; + + if (hotel.HasAddresParts()) + { + ++matchedNum; + std::cout << "Hotel: " << hotel.m_address << " AddLoc: " << hotel.m_translations << " --> " + << hotel.m_street << " " << hotel.m_houseNumber << std::endl; + } + } + + std::cout << "Num of hotels: " << storage.Size() << " matched: " << matchedNum + << " Empty addresses: " << emptyAddr << std::endl; + + return 0; +} diff --git a/generator/booking_quality_check/booking_quality_check.cpp b/generator/booking_quality_check/booking_quality_check.cpp new file mode 100644 index 000000000..30878ba02 --- /dev/null +++ b/generator/booking_quality_check/booking_quality_check.cpp @@ -0,0 +1,373 @@ +#include "generator/booking_dataset.hpp" + +#include "generator/feature_builder.hpp" +//#include "generator/opentable_dataset.hpp" +#include "generator/osm_source.hpp" +#include "generator/processor_booking.hpp" +#include "generator/raw_generator.hpp" +#include "generator/sponsored_scoring.hpp" +#include "generator/translator_collection.hpp" +#include "generator/translator_factory.hpp" + +#include "indexer/classificator_loader.hpp" + +#include "geometry/distance_on_sphere.hpp" + +#include "base/file_name_utils.hpp" +#include "base/exception.hpp" +#include "base/geo_object_id.hpp" +#include "base/stl_helpers.hpp" +#include "base/string_utils.hpp" + +#include +#include +#include +#include +#include + +#include + +#include "boost/range/adaptor/map.hpp" +#include "boost/range/algorithm/copy.hpp" + +using namespace std; + +DEFINE_string(osm, "", "Input .o5m file"); +DEFINE_string(booking, "", "Path to booking data in .tsv format"); +DEFINE_string(opentable, "", "Path to opentable data in .tsv format"); +DEFINE_string(factors, "", "Factors output path"); +DEFINE_string(sample, "", "Path so sample file"); + +DEFINE_uint64(seed, minstd_rand::default_seed, "Seed for random shuffle"); 
+DEFINE_uint64(selection_size, 1000, "Selection size"); +DEFINE_bool(generate, false, "Generate unmarked sample"); + +using namespace generator; +using namespace feature; + +namespace +{ +string PrintBuilder(FeatureBuilder const & fb) +{ + ostringstream s; + + s << "Id: " << DebugPrint(fb.GetMostGenericOsmId()) << '\t' + << "Name: " << fb.GetName(StringUtf8Multilang::kDefaultCode) << '\t'; + + s << "Params: " << DebugPrint(fb.GetParams()) << '\t'; + + auto const center = mercator::ToLatLon(fb.GetKeyPoint()); + s << "lat: " << center.m_lat << " lon: " << center.m_lon << '\t'; + + if (fb.GetGeomType() == GeomType::Point) + s << "GeomType: Point"; + else if (fb.GetGeomType() == GeomType::Area) + s << "GeomType: Area"; + else + CHECK(false, ()); + + return s.str(); +} + +DECLARE_EXCEPTION(ParseError, RootException); + +base::GeoObjectId ReadDebuggedPrintedOsmId(string const & str) +{ + istringstream sstr(str); + string type; + uint64_t id; + sstr >> type >> id; + + if (sstr.fail()) + MYTHROW(ParseError, ("Can't make osmId from string", str)); + + if (type == "node") + return base::MakeOsmNode(id); + if (type == "way") + return base::MakeOsmWay(id); + if (type == "relation") + return base::MakeOsmRelation(id); + + MYTHROW(ParseError, ("Can't make osmId from string", str)); +} + +GenerateInfo GetGenerateInfo() +{ + GenerateInfo info; + info.m_bookingDataFilename = FLAGS_booking; + //info.m_opentableDataFilename = FLAGS_opentable; + info.m_osmFileName = FLAGS_osm; + info.SetNodeStorageType("map"); + info.SetOsmFileType("o5m"); + + info.m_intermediateDir = base::GetDirectory(FLAGS_factors); + + // Set other info params here. 
+ + return info; +} + +template +struct SampleItem +{ + enum MatchStatus {Uninitialized, Yes, No}; + using ObjectId = typename Object::ObjectId; + + SampleItem() = default; + + SampleItem(base::GeoObjectId const & osmId, ObjectId const sponsoredId, + MatchStatus match = Uninitialized) + : m_osmId(osmId), m_sponsoredId(sponsoredId), m_match(match) + { + } + + base::GeoObjectId m_osmId; + ObjectId m_sponsoredId = Object::InvalidObjectId(); + + MatchStatus m_match = Uninitialized; +}; + +template +typename SampleItem::MatchStatus ReadMatchStatus(string_view str) +{ + if (str == "Yes") + return SampleItem::Yes; + + if (str == "No") + return SampleItem::No; + + if (str == "Uninitialized") + return SampleItem::Uninitialized; + + MYTHROW(ParseError, ("Can't make SampleItem::MatchStatus from string:", str)); +} + +template +SampleItem ReadSampleItem(string const & str) +{ + SampleItem item; + + auto const parts = strings::Tokenize(str, "\t"); + CHECK_EQUAL(parts.size(), 3, ("Cant't make SampleItem from string:", str, + "due to wrong number of fields.")); + + item.m_osmId = ReadDebuggedPrintedOsmId(string(parts[0])); + if (!strings::to_uint(parts[1], item.m_sponsoredId.Get())) + MYTHROW(ParseError, ("Can't make uint32 from string:", parts[1])); + item.m_match = ReadMatchStatus(parts[2]); + + return item; +} + +template +vector> ReadSample(istream & ist) +{ + vector> result; + + size_t lineNumber = 1; + try + { + for (string line; getline(ist, line); ++lineNumber) + { + result.emplace_back(ReadSampleItem(line)); + } + } + catch (ParseError const & e) + { + LOG_SHORT(LERROR, ("Wrong format: line", lineNumber, e.Msg())); + exit(1); + } + + return result; +} + +template +vector> ReadSampleFromFile(string const & name) +{ + ifstream ist(name); + CHECK(ist.is_open(), ("Can't open file:", name, strerror(errno))); + return ReadSample(ist); +} + +template +void GenerateFactors(Dataset const & dataset, + map const & features, + vector> const & sampleItems, ostream & ost) +{ + for 
(auto const & item : sampleItems) + { + auto const & object = dataset.GetStorage().GetObjectById(item.m_sponsoredId); + auto const & feature = features.at(item.m_osmId); + + auto const score = generator::sponsored_scoring::Match(object, feature); + + auto const center = mercator::ToLatLon(feature.GetKeyPoint()); + double const distanceMeters = ms::DistanceOnEarth(center, object.m_latLon); + auto const matched = score.IsMatched(); + + ost << "# ------------------------------------------" << fixed << setprecision(6) + << endl; + ost << (matched ? 'y' : 'n') << " \t" << DebugPrint(feature.GetMostGenericOsmId()) + << "\t " << object.m_id + << "\tdistance: " << distanceMeters + << "\tdistance score: " << score.m_linearNormDistanceScore + << "\tname score: " << score.m_nameSimilarityScore + << "\tresult score: " << score.GetMatchingScore() + << endl; + ost << "# " << PrintBuilder(feature) << endl; + ost << "# " << object << endl; + ost << "# URL: https://www.openstreetmap.org/?mlat=" + << object.m_latLon.m_lat << "&mlon=" << object.m_latLon.m_lon << "#map=18/" + << object.m_latLon.m_lat << "/" << object.m_latLon.m_lon << endl; + } +} + +enum class DatasetType +{ + Booking, + Opentable +}; + +template +void GenerateSample(Dataset const & dataset, + map const & features, ostream & ost) +{ + LOG_SHORT(LINFO, ("Num of elements:", features.size())); + vector elementIndexes(features.size()); + boost::copy(features | boost::adaptors::map_keys, begin(elementIndexes)); + + // TODO(mgsergio): Try RandomSample (from search:: at the moment of writing). 
+ shuffle(elementIndexes.begin(), elementIndexes.end(), minstd_rand(static_cast(FLAGS_seed))); + if (FLAGS_selection_size < elementIndexes.size()) + elementIndexes.resize(FLAGS_selection_size); + + stringstream outStream; + + for (auto osmId : elementIndexes) + { + auto const & fb = features.at(osmId); + auto const sponsoredIndexes = dataset.GetStorage().GetNearestObjects(mercator::ToLatLon(fb.GetKeyPoint())); + + for (auto const sponsoredId : sponsoredIndexes) + { + auto const & object = dataset.GetStorage().GetObjectById(sponsoredId); + auto const score = sponsored_scoring::Match(object, fb); + + auto const center = mercator::ToLatLon(fb.GetKeyPoint()); + double const distanceMeters = ms::DistanceOnEarth(center, object.m_latLon); + auto const matched = score.IsMatched(); + + ost << "# ------------------------------------------" << fixed << setprecision(6) + << endl; + ost << (matched ? 'y' : 'n') << " \t" << DebugPrint(osmId) << "\t " << sponsoredId + << "\tdistance: " << distanceMeters + << "\tdistance score: " << score.m_linearNormDistanceScore + << "\tname score: " << score.m_nameSimilarityScore + << "\tresult score: " << score.GetMatchingScore() + << endl; + ost << "# " << PrintBuilder(fb) << endl; + ost << "# " << object << endl; + ost << "# URL: https://www.openstreetmap.org/?mlat=" + << object.m_latLon.m_lat << "&mlon=" << object.m_latLon.m_lon + << "#map=18/" << object.m_latLon.m_lat << "/" << object.m_latLon.m_lon << endl; + } + if (!sponsoredIndexes.empty()) + ost << endl << endl; + } +} + +template +string GetDatasetFilePath(GenerateInfo const & info); + +template <> +string GetDatasetFilePath(GenerateInfo const & info) +{ + return info.m_bookingDataFilename; +} + +//template <> +//string GetDatasetFilePath(GenerateInfo const & info) +//{ +// return info.m_opentableDataFilename; +//} + +template +void RunImpl(GenerateInfo & info) +{ + auto const & dataSetFilePath = GetDatasetFilePath(info); + Dataset dataset(dataSetFilePath); + LOG_SHORT(LINFO, 
(dataset.GetStorage().Size(), "objects are loaded from a file:", dataSetFilePath)); + + map features; + LOG_SHORT(LINFO, ("OSM data:", FLAGS_osm)); + + generator::cache::IntermediateDataObjectsCache objectsCache; + generator::cache::IntermediateData cacheLoader(objectsCache, info); + auto translators = make_shared(); + auto processor = make_shared>(dataset, features); + translators->Append(CreateTranslator(TranslatorType::Country, processor, cacheLoader.GetCache(), info)); + RawGenerator generator(info); + generator.GenerateCustom(translators); + + if (FLAGS_generate) + { + ostream * ost = &cout; + unique_ptr ofst; + if (!FLAGS_sample.empty()) + { + ofst = std::make_unique(FLAGS_sample); + CHECK(ofst->is_open(), ("Can't open file", FLAGS_sample, strerror(errno))); + ost = ofst.get(); + } + GenerateSample(dataset, features, *ost); + } + else + { + auto const sample = ReadSampleFromFile(FLAGS_sample); + LOG_SHORT(LINFO, ("Sample size is", sample.size())); + ofstream ost(FLAGS_factors); + CHECK(ost.is_open(), ("Can't open file", FLAGS_factors, strerror(errno))); + GenerateFactors(dataset, features, sample, ost); + } +} + +void Run(DatasetType const datasetType, GenerateInfo & info) +{ + switch (datasetType) + { + case DatasetType::Booking: RunImpl(info); break; + //case DatasetType::Opentable: RunImpl(info); break; + } +} +} // namespace + +int main(int argc, char * argv[]) +{ + gflags::SetUsageMessage("Calculates factors for given samples."); + + if (argc == 1) + { + gflags::ShowUsageWithFlags(argv[0]); + exit(0); + } + + gflags::ParseCommandLineFlags(&argc, &argv, true); + + CHECK(!FLAGS_sample.empty(), ("Please specify sample path.")); + CHECK(!FLAGS_osm.empty(), ("Please specify osm path.")); + CHECK(!FLAGS_booking.empty() || !FLAGS_opentable.empty(), + ("Please specify either booking or opentable path.")); + CHECK(!FLAGS_factors.empty() || FLAGS_generate, ("Please either specify factors path" + "or use -generate.")); + + auto const datasetType = 
FLAGS_booking.empty() ? DatasetType::Opentable : DatasetType::Booking; + + classificator::Load(); + + auto info = GetGenerateInfo(); + GenerateIntermediateData(info); + + Run(datasetType, info); + + return 0; +} diff --git a/generator/booking_scoring.cpp b/generator/booking_scoring.cpp new file mode 100644 index 000000000..cd7802c34 --- /dev/null +++ b/generator/booking_scoring.cpp @@ -0,0 +1,51 @@ +#include "generator/sponsored_scoring.hpp" + +#include "generator/booking_dataset.hpp" +#include "generator/feature_builder.hpp" + +#include "geometry/mercator.hpp" + + +namespace +{ +// Calculated with tools/python/booking_hotels_quality.py. +double constexpr kOptimalThreshold = 0.304875; +} // namespace + +namespace generator +{ +namespace sponsored_scoring +{ +template <> +double MatchStats::GetMatchingScore() const +{ + // TODO(mgsergio): Use tuner to get optimal function. + return m_linearNormDistanceScore * m_nameSimilarityScore; +} + +template <> +bool MatchStats::IsMatched() const +{ + return GetMatchingScore() > kOptimalThreshold; +} + +/// @todo It looks like quite common Match function implementation, +/// because GetLatLon and GetName() needed. +template <> +MatchStats Match(BookingHotel const & h, feature::FeatureBuilder const & fb) +{ + MatchStats score; + + auto const fbCenter = mercator::ToLatLon(fb.GetKeyPoint()); + auto const distance = ms::DistanceOnEarth(fbCenter, h.m_latLon); + score.m_linearNormDistanceScore = + impl::GetLinearNormDistanceScore(distance, BookingDataset::kDistanceLimitInMeters); + + // TODO(mgsergio): Check all translations and use the best one. 
+ score.m_nameSimilarityScore = impl::GetNameSimilarityScore( + h.m_name, std::string(fb.GetName(StringUtf8Multilang::kDefaultCode))); + + return score; +} +} // namespace sponsored_scoring +} // namespace generator diff --git a/generator/feature_builder.cpp b/generator/feature_builder.cpp index 01b1042c6..0d8ea4e32 100644 --- a/generator/feature_builder.cpp +++ b/generator/feature_builder.cpp @@ -2,6 +2,7 @@ #include "routing/routing_helpers.hpp" +#include "indexer/custom_keyvalue.hpp" #include "indexer/feature_algo.hpp" #include "indexer/feature_visibility.hpp" #include "indexer/ftypes_matcher.hpp" @@ -593,6 +594,29 @@ size_t FeatureBuilder::GetPointsCount() const return counter; } +void FeatureBuilder::SetHotelInfo(Metadata::ESource src, uint64_t id, double rating, uint8_t priceCategory) +{ + // Normalize rating [0, 100] + if (rating < 0 || rating > 10) + rating = 0; + else + rating *= 10; + + auto & meta = GetMetadata(); + auto const append = [src, &meta](Metadata::EType type, auto val) + { + indexer::CustomKeyValue kv(meta.Get(type)); + kv.Add(src, val); + meta.Set(type, kv.ToString()); + }; + + append(Metadata::FMD_CUSTOM_IDS, id); + if (rating > 0) + append(Metadata::FMD_RATINGS, static_cast(std::round(rating))); + if (priceCategory > 0) + append(Metadata::FMD_PRICE_RATES, priceCategory); +} + bool FeatureBuilder::IsDrawableInRange(int lowScale, int highScale) const { auto const types = GetTypesHolder(); diff --git a/generator/feature_builder.hpp b/generator/feature_builder.hpp index ec21926c1..df19a6a30 100644 --- a/generator/feature_builder.hpp +++ b/generator/feature_builder.hpp @@ -159,6 +159,8 @@ public: Metadata const & GetMetadata() const { return m_params.GetMetadata(); } Metadata & GetMetadata() { return m_params.GetMetadata(); } + void SetHotelInfo(Metadata::ESource src, uint64_t id, double rating, uint8_t priceCategory); + // To work with types and names based on drawing. 
// Check classificator types for their compatibility with feature geometry type. // Need to call when using any classificator types manipulating. diff --git a/generator/final_processor_country.cpp b/generator/final_processor_country.cpp index 86ad25d01..f5152b98f 100644 --- a/generator/final_processor_country.cpp +++ b/generator/final_processor_country.cpp @@ -3,6 +3,7 @@ #include "generator/addresses_collector.hpp" #include "generator/address_enricher.hpp" #include "generator/affiliation.hpp" +#include "generator/booking_dataset.hpp" #include "generator/coastlines_generator.hpp" #include "generator/feature_builder.hpp" #include "generator/final_processor_utils.hpp" @@ -12,6 +13,8 @@ #include "generator/osm2type.hpp" #include "generator/region_meta.hpp" +#include "generator/sponsored_dataset_inl.hpp" + #include "routing/speed_camera_prohibition.hpp" #include "indexer/classificator.hpp" @@ -50,6 +53,9 @@ void CountryFinalProcessor::Process() if (!m_coastlineGeomFilename.empty()) ProcessCoastline(); + if (!m_hotelsFilename.empty()) + ProcessBooking(); + // 1. Process roundabouts and addr:interpolation first. 
if (!m_miniRoundaboutsFilename.empty() || !m_addrInterpolFilename.empty()) ProcessRoundabouts(); @@ -89,6 +95,60 @@ void CountryFinalProcessor::Order() } */ +void CountryFinalProcessor::ProcessBooking() +{ + BookingDataset dataset(m_hotelsFilename); + + std::ofstream matchingLogStream; + matchingLogStream.exceptions(std::fstream::failbit | std::fstream::badbit); + matchingLogStream.open(m_hotelsStatusFilename); + + std::mutex m; + ForEachMwmTmp(m_temporaryMwmPath, [&](auto const & name, auto const & path) + { + if (!IsCountry(name)) + return; + + std::stringstream sstream; + FeatureBuilderWriter writer(path, true /* mangleName */); + ForEachFeatureRawFormat(path, [&](FeatureBuilder && fb, uint64_t) + { + auto const id = dataset.FindMatchingObjectId(fb); + if (id == BookingHotel::InvalidObjectId()) + { + writer.Write(fb); + } + else + { + dataset.PreprocessMatchedOsmObject(id, fb, [&](FeatureBuilder & newFeature) + { + if (newFeature.PreSerialize()) + writer.Write(newFeature); + }); + } + + auto const & isHotelChecker = ftypes::IsHotelChecker::Instance(); + if (isHotelChecker(fb.GetTypes())) + { + if (id != BookingHotel::InvalidObjectId()) + sstream << id; + + auto const latLon = mercator::ToLatLon(fb.GetKeyPoint()); + sstream << ',' << fb.GetMostGenericOsmId().GetEncodedId() << ',' + << strings::to_string_dac(latLon.m_lat, 7) << ',' + << strings::to_string_dac(latLon.m_lon, 7) << ',' << name << '\n'; + } + }); + + std::lock_guard guard(m); + matchingLogStream << sstream.str(); + }, m_threadsCount); + + std::vector fbs; + dataset.BuildOsmObjects([&](auto && fb) { fbs.emplace_back(std::move(fb)); }); + AppendToMwmTmp(fbs, *m_affiliations, m_temporaryMwmPath, m_threadsCount); +} + void CountryFinalProcessor::ProcessRoundabouts() { auto const roundabouts = ReadMiniRoundabouts(m_miniRoundaboutsFilename); diff --git a/generator/final_processor_country.hpp b/generator/final_processor_country.hpp index a0152287c..c2d84c721 100644 --- a/generator/final_processor_country.hpp 
+++ b/generator/final_processor_country.hpp @@ -41,6 +41,12 @@ public: m_addressPath = dir; } + void SetHotels(std::string const & hotelsFile, std::string const & statusFile) + { + m_hotelsFilename = hotelsFile; + m_hotelsStatusFilename = statusFile; + } + void SetCityBoundariesFiles(std::string const & collectorFile) { m_boundariesCollectorFile = collectorFile; @@ -54,6 +60,7 @@ public: private: //void Order(); void ProcessCoastline(); + void ProcessBooking(); void ProcessRoundabouts(); void AddFakeNodes(); void AddIsolines(); @@ -63,12 +70,13 @@ private: bool IsCountry(std::string const & filename); - std::string m_borderPath; std::string m_temporaryMwmPath; std::string m_intermediateDir; std::string m_isolinesPath, m_addressPath; std::string m_boundariesCollectorFile; std::string m_coastlineGeomFilename; + std::string m_hotelsFilename; + std::string m_hotelsStatusFilename; std::string m_worldCoastsFilename; std::string m_fakeNodesFilename; std::string m_miniRoundaboutsFilename; diff --git a/generator/generate_info.hpp b/generator/generate_info.hpp index 8bdfe943a..a9242ff3e 100644 --- a/generator/generate_info.hpp +++ b/generator/generate_info.hpp @@ -1,7 +1,5 @@ #pragma once -#include "generator/cities_boundaries_builder.hpp" - #include "base/file_name_utils.hpp" #include "base/logging.hpp" @@ -49,6 +47,7 @@ struct GenerateInfo OsmSourceType m_osmFileType = OsmSourceType::XML; std::string m_osmFileName; + std::string m_bookingDataFilename; std::string m_brandsFilename; std::string m_brandsTranslationsFilename; diff --git a/generator/generator_tests/feature_builder_test.cpp b/generator/generator_tests/feature_builder_test.cpp index 9f75d74ea..994c920a1 100644 --- a/generator/generator_tests/feature_builder_test.cpp +++ b/generator/generator_tests/feature_builder_test.cpp @@ -7,6 +7,7 @@ #include "generator/geometry_holder.hpp" #include "indexer/data_header.hpp" +#include "indexer/custom_keyvalue.hpp" #include "indexer/feature_visibility.hpp" #include 
"indexer/ftypes_matcher.hpp" @@ -397,4 +398,21 @@ UNIT_CLASS_TEST(TestWithClassificator, FBuilder_RemoveInconsistentTypes) TEST(!params.IsTypeExist(classif().GetTypeByPath({"hwtag", "nobicycle"})), ()); } +UNIT_CLASS_TEST(TestWithClassificator, FBuilder_Hotel) +{ + FeatureBuilder fb; + auto const src = Metadata::SRC_KAYAK; + + auto const & meta = fb.GetMetadata(); + auto const isEqual = [&meta, src](Metadata::EType type, uint64_t val) + { + return indexer::CustomKeyValue(meta.Get(type)).Get(src) == val; + }; + + fb.SetHotelInfo(src, 777, 6.3, 4); + + TEST(isEqual(Metadata::FMD_CUSTOM_IDS, 777), ()); + TEST(isEqual(Metadata::FMD_RATINGS, 63), ()); + TEST(isEqual(Metadata::FMD_PRICE_RATES, 4), ()); +} } // namespace feature_builder_test diff --git a/generator/generator_tests_support/test_generator.cpp b/generator/generator_tests_support/test_generator.cpp index 58e755249..929305084 100644 --- a/generator/generator_tests_support/test_generator.cpp +++ b/generator/generator_tests_support/test_generator.cpp @@ -1,11 +1,12 @@ #include "test_generator.hpp" #include "generator/borders.hpp" -#include "generator/camera_info_collector.hpp" #include "generator/feature_sorter.hpp" #include "generator/osm_source.hpp" #include "generator/raw_generator.hpp" +#include "generator/camera_info_collector.hpp" +#include "generator/cities_boundaries_builder.hpp" #include "generator/maxspeeds_builder.hpp" #include "generator/restriction_generator.hpp" #include "generator/road_access_generator.hpp" diff --git a/generator/generator_tool/generator_tool.cpp b/generator/generator_tool/generator_tool.cpp index d292aaa40..db29542e0 100644 --- a/generator/generator_tool/generator_tool.cpp +++ b/generator/generator_tool/generator_tool.cpp @@ -139,6 +139,7 @@ DEFINE_bool( DEFINE_bool(generate_maxspeed, false, "Generate section with maxspeed of road features."); // Sponsored-related. 
+DEFINE_string(booking_data, "", "Path to booking data in tsv format."); DEFINE_string(complex_hierarchy_data, "", "Path to complex hierarchy in csv format."); DEFINE_string(wikipedia_pages, "", "Input dir with wikipedia pages."); @@ -240,6 +241,7 @@ MAIN_WITH_ERROR_HANDLING([](int argc, char ** argv) genInfo.m_osmFileName = FLAGS_osm_file_name; genInfo.m_failOnCoasts = FLAGS_fail_on_coasts; genInfo.m_preloadCache = FLAGS_preload_cache; + genInfo.m_bookingDataFilename = FLAGS_booking_data; genInfo.m_popularPlacesFilename = FLAGS_popular_places_data; genInfo.m_brandsFilename = FLAGS_brands_data; genInfo.m_brandsTranslationsFilename = FLAGS_brands_translations_data; diff --git a/generator/raw_generator.cpp b/generator/raw_generator.cpp index 29426a3ee..f8d91d9b8 100644 --- a/generator/raw_generator.cpp +++ b/generator/raw_generator.cpp @@ -189,6 +189,8 @@ RawGenerator::FinalProcessorPtr RawGenerator::CreateCountryFinalProcessor( auto finalProcessor = std::make_shared(affiliations, m_genInfo.m_tmpDir, m_threadsCount); finalProcessor->SetIsolinesDir(m_genInfo.m_isolinesDir); finalProcessor->SetAddressesDir(m_genInfo.m_addressesDir); + + finalProcessor->SetHotels(m_genInfo.m_bookingDataFilename, m_genInfo.GetIntermediateFileName("hotels_status.csv")); finalProcessor->SetMiniRoundabouts(m_genInfo.GetIntermediateFileName(MINI_ROUNDABOUTS_FILENAME)); finalProcessor->SetAddrInterpolation(m_genInfo.GetIntermediateFileName(ADDR_INTERPOL_FILENAME)); if (addAds) diff --git a/generator/sponsored_dataset.hpp b/generator/sponsored_dataset.hpp new file mode 100644 index 000000000..bdeca2fcd --- /dev/null +++ b/generator/sponsored_dataset.hpp @@ -0,0 +1,50 @@ +#pragma once + +#include "generator/sponsored_object_storage.hpp" + +#include +#include + +namespace feature +{ +class FeatureBuilder; +} // namespace feature + +namespace generator +{ +template +class SponsoredDataset +{ +public: + using Object = SponsoredObject; + using ObjectId = typename Object::ObjectId; + + static double 
constexpr kDistanceLimitInMeters = 150; + static size_t constexpr kMaxSelectedElements = 3; + + explicit SponsoredDataset(std::string const & dataPath); + + /// @return true if |fb| satisfies some necessary conditions to match one or several + /// objects from dataset. + bool NecessaryMatchingConditionHolds(feature::FeatureBuilder const & fb) const; + ObjectId FindMatchingObjectId(feature::FeatureBuilder const & e) const; + + using FBuilderFnT = std::function; + // Applies changes to a given osm object (for example, remove hotel type) + // and passes the result to |fn|. + void PreprocessMatchedOsmObject(ObjectId matchedObjId, feature::FeatureBuilder & fb, + FBuilderFnT const fn) const; + // Creates objects and adds them to the map (MWM) via |fn|. + void BuildOsmObjects(FBuilderFnT const & fn) const; + + SponsoredObjectStorage const & GetStorage() const { return m_storage; } + +private: + void BuildObject(Object const & object, FBuilderFnT const & fn) const; + + /// @return an id of a matched object or Object::InvalidObjectId() on failure.
+ ObjectId FindMatchingObjectIdImpl(feature::FeatureBuilder const & fb) const; + + SponsoredObjectStorage m_storage; +}; +} // namespace generator diff --git a/generator/sponsored_dataset_inl.hpp b/generator/sponsored_dataset_inl.hpp new file mode 100644 index 000000000..5cf52c637 --- /dev/null +++ b/generator/sponsored_dataset_inl.hpp @@ -0,0 +1,34 @@ +#pragma once + +#include "generator/sponsored_dataset.hpp" + +#include +#include + +namespace generator +{ + +// SponsoredDataset -------------------------------------------------------------------------------- +template +SponsoredDataset::SponsoredDataset(std::string const & dataPath) + : m_storage(kDistanceLimitInMeters, kMaxSelectedElements) +{ + m_storage.LoadData(dataPath); +} + +template +void SponsoredDataset::BuildOsmObjects(FBuilderFnT const & fn) const +{ + for (auto const & item : m_storage.GetObjects()) + BuildObject(item.second, fn); +} + +template +typename SponsoredDataset::ObjectId +SponsoredDataset::FindMatchingObjectId(feature::FeatureBuilder const & fb) const +{ + if (NecessaryMatchingConditionHolds(fb)) + return FindMatchingObjectIdImpl(fb); + return Object::InvalidObjectId(); +} +} // namespace generator diff --git a/generator/sponsored_object_base.hpp b/generator/sponsored_object_base.hpp new file mode 100644 index 000000000..bc3f68fa9 --- /dev/null +++ b/generator/sponsored_object_base.hpp @@ -0,0 +1,52 @@ +#pragma once + +#include "geometry/latlon.hpp" + +#include "base/newtype.hpp" + +#include +#include +#include +#include + +namespace generator +{ +struct SponsoredObjectBase +{ + NEWTYPE(uint32_t, ObjectId); + + static constexpr ObjectId InvalidObjectId() + { + return ObjectId(std::numeric_limits::max()); + } + + virtual ~SponsoredObjectBase() = default; + + template + static constexpr size_t FieldIndex(Fields field) { return static_cast(field); } + + template + static constexpr size_t FieldsCount() { return static_cast(Fields::Counter); } + + bool HasAddresParts() const { return 
!m_street.empty() || !m_houseNumber.empty(); } + + ObjectId m_id{InvalidObjectId()}; + ms::LatLon m_latLon = ms::LatLon::Zero(); + std::string m_name; + std::string m_street; + std::string m_houseNumber; + + std::string m_address; + std::string m_descUrl; +}; + +NEWTYPE_SIMPLE_OUTPUT(SponsoredObjectBase::ObjectId); + +inline std::ostream & operator<<(std::ostream & s, SponsoredObjectBase const & h) +{ + s << std::fixed << std::setprecision(7); + s << "Id: " << h.m_id << "\t Name: " << h.m_name << "\t Address: " << h.m_address + << "\t lat: " << h.m_latLon.m_lat << " lon: " << h.m_latLon.m_lon; + return s; +} +} // namespace generator diff --git a/generator/sponsored_object_storage.hpp b/generator/sponsored_object_storage.hpp new file mode 100644 index 000000000..c6bcf079a --- /dev/null +++ b/generator/sponsored_object_storage.hpp @@ -0,0 +1,176 @@ +#pragma once + +#include "platform/platform.hpp" + +#include "geometry/distance_on_sphere.hpp" +#include "geometry/latlon.hpp" + +#include "base/logging.hpp" +#include "base/string_utils.hpp" + +#include +#include +#include +#include +#include +#include + +#include "std/boost_geometry.hpp" +#include + + +namespace generator +{ +template +class SponsoredObjectStorage +{ +public: + using ObjectId = typename Object::ObjectId; + using ObjectsContainer = std::map; + using ExcludedIdsContainer = std::unordered_set; + + SponsoredObjectStorage(double distanceLimitMeters, size_t maxSelectedElements) + : m_distanceLimitMeters(distanceLimitMeters) + , m_maxSelectedElements(maxSelectedElements) + { + } + + double GetDistanceLimitInMeters() const + { + return m_distanceLimitMeters; + } + + size_t GetMaxSelectedElements() const + { + return m_maxSelectedElements; + } + + ObjectsContainer const & GetObjects() const + { + return m_objects; + } + + size_t Size() const + { + return m_objects.size(); + } + + void LoadData(std::string const & dataPath) + { + if (dataPath.empty()) + return; + + std::ifstream dataSource(dataPath); + if 
(!dataSource) + { + LOG(LERROR, ("Error while opening", dataPath, ":", strerror(errno))); + return; + } + + LoadData(dataSource, LoadExcludedIds({})); // empty exclude path + } + + ExcludedIdsContainer LoadExcludedIds(std::string const & excludedIdsPath) + { + if (excludedIdsPath.empty()) + return {}; + + std::ifstream source(excludedIdsPath); + if (!source) + { + LOG(LERROR, ("Error while opening", excludedIdsPath, ":", strerror(errno))); + return {}; + } + + ExcludedIdsContainer result; + for (std::string line; std::getline(source, line);) + { + ObjectId id{Object::InvalidObjectId()}; + + if (!strings::to_any(line, id.Get())) + { + LOG(LWARNING, ("Incorrect excluded sponsored id:", line)); + continue; + } + + if (id != Object::InvalidObjectId()) + result.emplace(id); + } + + return result; + } + + void LoadData(std::istream & src, ExcludedIdsContainer const & excludedIds) + { + m_objects.clear(); + m_rtree.clear(); + + for (std::string line; std::getline(src, line);) + { + Object object(line); + if (object.m_id != Object::InvalidObjectId() && + excludedIds.find(object.m_id) == excludedIds.cend()) + { + m_objects.emplace(object.m_id, object); + } + } + + for (auto const & item : m_objects) + { + auto const & object = item.second; + Box b(Point(object.m_latLon.m_lat, object.m_latLon.m_lon), + Point(object.m_latLon.m_lat, object.m_latLon.m_lon)); + m_rtree.insert(make_pair(b, object.m_id)); + } + } + + Object const & GetObjectById(ObjectId id) const + { + auto const it = m_objects.find(id); + CHECK(it != end(m_objects), ("Got wrong object id:", id)); + return it->second; + } + + Object & GetObjectById(ObjectId id) + { + auto const it = m_objects.find(id); + CHECK(it != end(m_objects), ("Got wrong object id:", id)); + return it->second; + } + + std::vector GetNearestObjects(ms::LatLon const & latLon) const + { + namespace bgi = boost::geometry::index; + + std::vector indexes; + for_each(bgi::qbegin(m_rtree, bgi::nearest(Point(latLon.m_lat, latLon.m_lon), + 
static_cast(m_maxSelectedElements))), + bgi::qend(m_rtree), [this, &latLon, &indexes](Value const & v) + { + auto const & object = GetObjectById(v.second); + double const dist = ms::DistanceOnEarth(latLon, object.m_latLon); + if (m_distanceLimitMeters != 0.0 && dist > m_distanceLimitMeters) + return; + + indexes.emplace_back(v.second); + }); + + return indexes; + } + +private: + // TODO(mgsergio): Get rid of Box since boost::rtree supports point as value type. + // TODO(mgsergio): Use mercator instead of latlon or boost::geometry::cs::spherical_equatorial + // instead of boost::geometry::cs::cartesian. + using Point = boost::geometry::model::point; + using Box = boost::geometry::model::box; + using Value = std::pair; + + // Create the rtree using default constructor. + boost::geometry::index::rtree> m_rtree; + ObjectsContainer m_objects; + + double const m_distanceLimitMeters; + size_t const m_maxSelectedElements; +}; +} // namespace generator diff --git a/generator/sponsored_scoring.cpp b/generator/sponsored_scoring.cpp new file mode 100644 index 000000000..41a3c8f2f --- /dev/null +++ b/generator/sponsored_scoring.cpp @@ -0,0 +1,106 @@ +#include "generator/sponsored_scoring.hpp" + +#include "indexer/search_string_utils.hpp" + +#include "base/math.hpp" + +#include +#include + +namespace +{ +using WeightedBagOfWords = std::vector>; + +std::vector StringToWords(std::string const & str) +{ + auto result = search::NormalizeAndTokenizeString(str); + std::sort(std::begin(result), std::end(result)); + return result; +} + +WeightedBagOfWords MakeWeightedBagOfWords(std::vector const & words) +{ + // TODO(mgsergio): Calculate tf-idf score for every word.
+ auto constexpr kTfIdfScorePlaceholder = 1; + + WeightedBagOfWords result; + for (size_t i = 0; i < words.size(); ++i) + { + result.emplace_back(words[i], kTfIdfScorePlaceholder); + while (i + 1 < words.size() && words[i] == words[i + 1]) + { + result.back().second += kTfIdfScorePlaceholder; // TODO(mgsergio): tf-idf score for result.back().first; + ++i; + } + } + return result; +} + +double WeightedBagsDotProduct(WeightedBagOfWords const & lhs, WeightedBagOfWords const & rhs) +{ + double result{}; + + auto lhsIt = begin(lhs); + auto rhsIt = begin(rhs); + + while (lhsIt != end(lhs) && rhsIt != end(rhs)) + { + if (lhsIt->first == rhsIt->first) + { + result += lhsIt->second * rhsIt->second; + ++lhsIt; + ++rhsIt; + } + else if (lhsIt->first < rhsIt->first) + { + ++lhsIt; + } + else + { + ++rhsIt; + } + } + + return result; +} + +double WeightedBagOfWordsCos(WeightedBagOfWords const & lhs, WeightedBagOfWords const & rhs) +{ + auto const product = WeightedBagsDotProduct(lhs, rhs); + auto const lhsLength = sqrt(WeightedBagsDotProduct(lhs, lhs)); + auto const rhsLength = sqrt(WeightedBagsDotProduct(rhs, rhs)); + + // WeightedBagsDotProduct returns 0.0 if lhs.empty() || rhs.empty() or + // if every element of either lhs or rhs is 0.0, or if they share no words.
+ if (product == 0.0) + return 0.0; + + return product / (lhsLength * rhsLength); +} +} // namespace + +namespace generator +{ +namespace impl +{ +double GetLinearNormDistanceScore(double distance, double const maxDistance) +{ + CHECK_NOT_EQUAL(maxDistance, 0.0, ("maxDistance cannot be 0.")); + distance = base::Clamp(distance, 0.0, maxDistance); + return 1.0 - distance / maxDistance; +} + +double GetNameSimilarityScore(std::string const & booking_name, std::string const & osm_name) +{ + auto const aws = MakeWeightedBagOfWords(StringToWords(booking_name)); + auto const bws = MakeWeightedBagOfWords(StringToWords(osm_name)); + + if (aws.empty() && bws.empty()) + return 1.0; + if (aws.empty() || bws.empty()) + return 0.0; + + return WeightedBagOfWordsCos(aws, bws); +} +} // namespace impl +} // namespace generator diff --git a/generator/sponsored_scoring.hpp b/generator/sponsored_scoring.hpp new file mode 100644 index 000000000..74f99185e --- /dev/null +++ b/generator/sponsored_scoring.hpp @@ -0,0 +1,37 @@ +#pragma once + +#include + +namespace feature +{ +class FeatureBuilder; +} // namespace feature + +namespace generator +{ +namespace impl +{ +double GetLinearNormDistanceScore(double distance, double maxDistance); +double GetNameSimilarityScore(std::string const & booking_name, std::string const & osm_name); +} // namespace impl + +namespace sponsored_scoring +{ +/// Represents match scoring statistics of a sponsored object against an osm object. +template +struct MatchStats +{ + /// Returns some score based on given fields and classificator tuning. + double GetMatchingScore() const; + /// Returns true if GetMatchingScore is greater than some threshold. + bool IsMatched() const; + + double m_linearNormDistanceScore{}; + double m_nameSimilarityScore{}; +}; + +/// Matches a given sponsored object against a given OSM object.
+template +MatchStats Match(SponsoredObject const & o, feature::FeatureBuilder const & fb); +} // namespace sponsored_scoring +} // namespace generator diff --git a/generator/utils.cpp b/generator/utils.cpp index 1f13f0320..dc7a57789 100644 --- a/generator/utils.cpp +++ b/generator/utils.cpp @@ -98,6 +98,27 @@ std::unique_ptr FeatureGetter::GetFeatureByIndex(uint32_t index) co return m_guard->GetFeatureByIndex(index); } +void LoadDataSource(DataSource & dataSource) +{ + std::vector localFiles; + + Platform & platform = GetPlatform(); + platform::FindAllLocalMapsInDirectoryAndCleanup(platform.WritableDir(), 0 /* version */, + -1 /* latestVersion */, localFiles); + for (auto const & localFile : localFiles) + { + LOG(LINFO, ("Found mwm:", localFile)); + try + { + dataSource.RegisterMap(localFile); + } + catch (RootException const & ex) + { + CHECK(false, (ex.Msg(), "Bad mwm file:", localFile)); + } + } +} + bool ParseFeatureIdToOsmIdMapping(std::string const & path, std::unordered_map & mapping) { diff --git a/generator/utils.hpp b/generator/utils.hpp index 5923bca81..7456b4cc2 100644 --- a/generator/utils.hpp +++ b/generator/utils.hpp @@ -55,6 +55,8 @@ private: MwmSet::MwmId m_mwmId; }; +void LoadDataSource(DataSource & dataSource); + class FeatureGetter { public: diff --git a/tools/python/maps_generator/generator/stages_declaration.py b/tools/python/maps_generator/generator/stages_declaration.py index a2ed95be1..b3eb41c0f 100644 --- a/tools/python/maps_generator/generator/stages_declaration.py +++ b/tools/python/maps_generator/generator/stages_declaration.py @@ -104,24 +104,26 @@ class StagePreprocess(Stage): @outer_stage @depends_from_internal( - D(settings.HOTELS_URL, PathProvider.hotels_path, "p"), + D(settings.HOTELS_URL, PathProvider.hotels_path), D(settings.PROMO_CATALOG_CITIES_URL, PathProvider.promo_catalog_cities_path, "p"), D(settings.POPULARITY_URL, PathProvider.popularity_path, "p"), D(settings.FOOD_URL, PathProvider.food_paths, "p"),
D(settings.FOOD_TRANSLATIONS_URL, PathProvider.food_translations_path, "p"), ) -@test_stage( - Test(st.make_test_booking_data(max_days=7), lambda e, _: e.production, True) -) +# @test_stage( +# Test(st.make_test_booking_data(max_days=7), lambda e, _: e.production, True) +# ) class StageFeatures(Stage): def apply(self, env: Env): extra = {} if is_accepted(env, StageDescriptions): extra.update({"idToWikidata": env.paths.id_to_wikidata_path}) + + extra.update({"booking_data": env.paths.hotels_path}) + if env.production: extra.update( { - "booking_data": env.paths.hotels_path, "promo_catalog_cities": env.paths.promo_catalog_cities_path, "popular_places_data": env.paths.popularity_path, "brands_data": env.paths.food_paths,