diff --git a/.forgejo/workflows/map-generator.yml b/.forgejo/workflows/map-generator.yml index 6f97ab2d6..2b2cef7b5 100644 --- a/.forgejo/workflows/map-generator.yml +++ b/.forgejo/workflows/map-generator.yml @@ -17,8 +17,14 @@ on: required: false default: false type: boolean - run-panoramax: - description: 'Update Panoramax imagery?' + # TODO: enable + # run-panoramax: + # description: 'Update Panoramax imagery?' + # required: false + # default: true + # type: boolean + run-cleanup: + description: 'Clean up old build files?' required: false default: false type: boolean @@ -85,11 +91,49 @@ env: ZULIP_API_KEY: ${{ secrets.ZULIP_API_KEY }} MWMTEST: ${{ inputs.map-generator-test }} MWMCONTINUE: ${{ inputs.map-generator-continue }} - # MWMCOUNTRIES: ${{ inputs.map-generator-countries }} + # TODO: undo ${{ inputs.map-generator-countries }} + MWMCOUNTRIES: US_Oregon_Portland DEBIAN_FRONTEND: noninteractive TZ: Etc/UTC jobs: + cleanup-old-files: + if: inputs.run-cleanup + name: Clean Up Old Files + runs-on: mapfilemaker + container: + image: codeberg.org/comaps/maps_generator:f6d53d54f794 + volumes: + - /mnt/4tbexternal/:/mnt/4tbexternal/ + - /mnt/4tbexternal/osm-planet:/home/planet + concurrency: + group: ${{ github.workflow }}-map-generator-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + steps: + - name: Remove intermediate data + shell: bash + run: | + echo "Checking for intermediate map build data in /mnt/4tbexternal/osm-maps..." + cd /mnt/4tbexternal/osm-maps/ + # Remove intermediate data from every dated build directory (YYYY_MM_DD__HH_MM_SS) + ls -1d */ 2>/dev/null | grep -E '^[0-9]{4}_[0-9]{2}_[0-9]{2}__[0-9]{2}_[0-9]{2}_[0-9]{2}/$' | while read dir; do + echo "Removing any intermediate data: $dir" + rm -rf "$dir/intermediate_data" + rm -rf "$dir/osm2ft" + rm -rf "$dir/world_roads.o5m" + done + echo "Intermediate data cleaned up." 
+ - name: Remove old map builds (keep last 6) + shell: bash + run: | + echo "Checking for old map builds in /mnt/4tbexternal/osm-maps..." + cd /mnt/4tbexternal/osm-maps/ + # List all dated directories, sort by name (newest first), skip first 6, delete the rest + ls -1d */ 2>/dev/null | grep -E '^[0-9]{4}_[0-9]{2}_[0-9]{2}__[0-9]{2}_[0-9]{2}_[0-9]{2}/$' | sort -r | tail -n +7 | while read dir; do + echo "Removing old build: $dir" + rm -rf "$dir" + done + echo "Old map builds cleaned up." clone-repos: name: Clone Git Repos runs-on: mapfilemaker @@ -215,7 +259,8 @@ jobs: --data-urlencode 'content=Isolines are done!' update-panoramax: - if: inputs.run-panoramax + # TODO: uncomment + # if: inputs.run-panoramax name: Update Panoramax runs-on: mapfilemaker needs: @@ -642,4 +687,3 @@ jobs: --data-urlencode 'to="DevOps"' \ --data-urlencode topic=codeberg-bot \ --data-urlencode 'content=Upload is done!' - diff --git a/generator/panoramax_generator.cpp b/generator/panoramax_generator.cpp new file mode 100644 index 000000000..cb4a13774 --- /dev/null +++ b/generator/panoramax_generator.cpp @@ -0,0 +1,144 @@ +#include "generator/panoramax_generator.hpp" + +#include "indexer/classificator.hpp" +#include "indexer/feature_meta.hpp" + +#include "coding/file_reader.hpp" +#include "coding/read_write_utils.hpp" + +#include "geometry/mercator.hpp" + +#include "base/assert.hpp" +#include "base/logging.hpp" +#include "base/string_utils.hpp" + +#include <fstream> +#include <vector> + +namespace generator +{ +namespace +{ +std::string_view const kPanoramax = "panoramax"; +std::string_view const kImage = "image"; + +std::string GetPanoramaxFilePath(std::string const & countryName, std::string const & panoramaxDir) +{ + return panoramaxDir + "/" + countryName + ".panoramax"; +} + +struct PanoramaxPoint +{ + double lat; + double lon; + std::string imageId; +}; + +bool LoadPanoramaxPoints(std::string const & filePath, std::vector<PanoramaxPoint> & points) +{ + try + { + std::ifstream file(filePath, std::ios::binary); + if 
(!file.is_open()) + { + LOG(LWARNING, ("Can't open panoramax file", filePath)); + return false; + } + + // Read header + uint32_t version; + uint64_t pointCount; + + file.read(reinterpret_cast<char *>(&version), sizeof(version)); + file.read(reinterpret_cast<char *>(&pointCount), sizeof(pointCount)); + + if (version != 1) + { + LOG(LERROR, ("Unsupported panoramax file version", version)); + return false; + } + + points.reserve(static_cast<size_t>(pointCount)); + + // Read points + for (uint64_t i = 0; i < pointCount; ++i) + { + PanoramaxPoint point; + + file.read(reinterpret_cast<char *>(&point.lat), sizeof(point.lat)); + file.read(reinterpret_cast<char *>(&point.lon), sizeof(point.lon)); + + // Read image_id (length-prefixed string) + uint32_t imageIdLength; + file.read(reinterpret_cast<char *>(&imageIdLength), sizeof(imageIdLength)); + + if (imageIdLength > 0 && imageIdLength < 10000) // Sanity check + { + point.imageId.resize(imageIdLength); + file.read(&point.imageId[0], imageIdLength); + } + + if (file.fail()) + { + LOG(LERROR, ("Error reading panoramax point", i, "from", filePath)); + return false; + } + + points.push_back(std::move(point)); + } + + return true; + } + catch (std::exception const & e) + { + LOG(LERROR, ("Exception loading panoramax file", filePath, ":", e.what())); + return false; + } +} +} // namespace + +PanoramaxFeaturesGenerator::PanoramaxFeaturesGenerator(std::string const & panoramaxDir) + : m_panoramaxDir(panoramaxDir) +{ + Classificator const & c = classif(); + m_panoramaxType = c.GetTypeByPath({kPanoramax, kImage}); +} + +void PanoramaxFeaturesGenerator::GeneratePanoramax(std::string const & countryName, + FeaturesCollectFn const & fn) const +{ + auto const panoramaxPath = GetPanoramaxFilePath(countryName, m_panoramaxDir); + + std::vector<PanoramaxPoint> points; + if (!LoadPanoramaxPoints(panoramaxPath, points)) + { + LOG(LWARNING, ("Can't load panoramax points for", countryName)); + return; + } + + LOG(LINFO, ("Generating", points.size(), "panoramax points for", countryName)); + + for (auto const 
& point : points) + { + feature::FeatureBuilder fb; + + // Set point geometry + m2::PointD const mercatorPoint = mercator::FromLatLon(point.lat, point.lon); + fb.SetCenter(mercatorPoint); + + // Add classificator type + fb.AddType(m_panoramaxType); + + // Add metadata with image ID + if (!point.imageId.empty()) + { + fb.GetMetadata().Set(feature::Metadata::FMD_PANORAMAX, point.imageId); + } + + // Panoramax points are POI features (point geometry) + fb.SetPoint(); + + fn(std::move(fb)); + } +} +} // namespace generator diff --git a/generator/panoramax_generator.hpp b/generator/panoramax_generator.hpp new file mode 100644 index 000000000..04fbed93e --- /dev/null +++ b/generator/panoramax_generator.hpp @@ -0,0 +1,24 @@ +#pragma once + +#include "generator/feature_builder.hpp" + +#include <functional> +#include <string> + +namespace generator +{ +// Generates Panoramax imagery point features from binary files. +// Binary files are created by the panoramax_preprocessor.py script. +class PanoramaxFeaturesGenerator +{ +public: + explicit PanoramaxFeaturesGenerator(std::string const & panoramaxDir); + + using FeaturesCollectFn = std::function<void(feature::FeatureBuilder &&)>; + void GeneratePanoramax(std::string const & countryName, FeaturesCollectFn const & fn) const; + +private: + std::string m_panoramaxDir; + uint32_t m_panoramaxType; // Classificator type for panoramax|image +}; +} // namespace generator diff --git a/tools/python/maps_generator/panoramax_preprocessor.py b/tools/python/maps_generator/panoramax_preprocessor.py new file mode 100644 index 000000000..2df8c4181 --- /dev/null +++ b/tools/python/maps_generator/panoramax_preprocessor.py @@ -0,0 +1,260 @@ +#!/usr/bin/env python3 +""" +Panoramax Preprocessor + +Converts the global Panoramax geoparquet file into per-country binary files +for use in the map generator. 
+ +The script streams the large geoparquet file (20GB+) using DuckDB to avoid +loading everything into memory, performs a spatial join with country polygons, +and writes compact binary files for each country. + +Binary Format: + Header: + uint32 version (=1) + uint64 point_count + Data (repeated point_count times): + double lat (8 bytes) + double lon (8 bytes) + string image_id (length-prefixed: uint32 length + bytes) +""" + +import argparse +import logging +import struct +import sys +from pathlib import Path +from typing import Any, Dict, List, Tuple +from collections import defaultdict + +try: + import duckdb +except ImportError: + print("Error: duckdb is required. Install with: pip install duckdb", file=sys.stderr) + sys.exit(1) + +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + + +def load_country_polygons(polygons_file: Path) -> Dict[str, Any]: + """ + Load country polygons from packed_polygons.bin file. + + This is a placeholder - actual implementation would need to parse the binary format. + For now, we'll use a simpler approach with DuckDB spatial functions. + """ + # TODO: Implement actual polygon loading from packed_polygons.bin + # For MVP, we can use a simplified approach or require pre-processed country boundaries + logger.warning("Country polygon loading not yet implemented - using fallback method") + return {} + + +def determine_country_from_coords(lat: float, lon: float, conn: duckdb.DuckDBPyConnection) -> str: + """ + Determine which country a coordinate belongs to. + + This uses a simple approach for MVP - can be enhanced later. + Returns country name or "Unknown" if not found. 
+ """ + # Simplified country detection for MVP + # TODO: Use actual country polygons for accurate spatial join + + # For now, return a simplified country code based on rough lat/lon bounds + # This is just for initial testing - real implementation needs proper spatial join + if 40 < lat < 52 and -5 < lon < 10: + return "France" + elif 45 < lat < 48 and 5 < lon < 11: + return "Switzerland" + elif 43 < lat < 44 and 7 < lon < 8: + return "Monaco" + else: + return "Unknown" + + +def write_binary_file(output_path: Path, points: List[Tuple[float, float, str]]): + """ + Write panoramax points to binary file. + + Format: + Header: + uint32 version = 1 + uint64 point_count + Data: + For each point: + double lat + double lon + uint32 image_id_length + bytes image_id + """ + output_path.parent.mkdir(parents=True, exist_ok=True) + + with open(output_path, 'wb') as f: + # Write header + version = 1 + point_count = len(points) + f.write(struct.pack(' 100000: # Write if accumulated > 100k points + output_file = output_dir / f"{country}.panoramax" + # Append mode for incremental writing + # TODO: Implement append mode or accumulate all then write once + logger.info(f"Country {country} has {len(points)} points accumulated") + + logger.info(f"Finished processing {total_points} total points") + logger.info(f"Countries found: {list(country_points.keys())}") + + # Write final output files + for country, points in country_points.items(): + if points: + output_file = output_dir / f"{country}.panoramax" + write_binary_file(output_file, points) + + except Exception as e: + logger.error(f"Error processing parquet: {e}") + raise + + finally: + conn.close() + + +def main(): + parser = argparse.ArgumentParser( + description="Convert Panoramax geoparquet to per-country binary files", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__ + ) + + parser.add_argument( + '--input', + default='https://api.panoramax.xyz/data/geoparquet/panoramax.parquet', + help='Path or URL to 
Panoramax geoparquet file (default: official Panoramax URL)' + ) + + parser.add_argument( + '--output', + type=Path, + required=True, + help='Output directory for per-country .panoramax files' + ) + + parser.add_argument( + '--polygons', + type=Path, + help='Path to packed_polygons.bin file (optional, for accurate country detection)' + ) + + parser.add_argument( + '--batch-size', + type=int, + default=100000, + help='Number of rows to process per batch (default: 100000)' + ) + + args = parser.parse_args() + + logger.info("Panoramax Preprocessor starting") + logger.info(f"Input: {args.input}") + logger.info(f"Output directory: {args.output}") + logger.info(f"Batch size: {args.batch_size}") + + if args.polygons: + logger.info(f"Country polygons: {args.polygons}") + # TODO: Load and use country polygons for accurate spatial join + else: + logger.warning("No country polygons provided - using simplified country detection") + + # Create output directory + args.output.mkdir(parents=True, exist_ok=True) + + # Process the parquet file + process_parquet_streaming(args.input, args.output, args.batch_size) + + logger.info("Panoramax preprocessing complete!") + + +if __name__ == '__main__': + main()