add missing files, add map build cleanup, temporarily build only one region

Signed-off-by: zyphlar <zyphlar@gmail.com>
zyphlar committed 2026-01-03 22:12:16 -08:00
parent 5eeeaeb288
commit 7722cc7d46
4 changed files with 477 additions and 5 deletions


@@ -17,8 +17,14 @@ on:
       required: false
       default: false
       type: boolean
-    run-panoramax:
-      description: 'Update Panoramax imagery?'
+    # TODO: enable
+    # run-panoramax:
+    # description: 'Update Panoramax imagery?'
+    # required: false
+    # default: true
+    # type: boolean
+    run-cleanup:
+      description: 'Clean up old build files?'
       required: false
       default: false
       type: boolean
@@ -85,11 +91,49 @@ env:
   ZULIP_API_KEY: ${{ secrets.ZULIP_API_KEY }}
   MWMTEST: ${{ inputs.map-generator-test }}
   MWMCONTINUE: ${{ inputs.map-generator-continue }}
-  # MWMCOUNTRIES: ${{ inputs.map-generator-countries }}
+  #TODO: undo ${{ inputs.map-generator-countries }}
+  MWMCOUNTRIES: US_Oregon_Portland
   DEBIAN_FRONTEND: noninteractive
   TZ: Etc/UTC
 
 jobs:
+  cleanup-old-files:
+    if: inputs.run-cleanup
+    name: Clean Up Old Files
+    runs-on: mapfilemaker
+    container:
+      image: codeberg.org/comaps/maps_generator:f6d53d54f794
+      volumes:
+        - /mnt/4tbexternal/:/mnt/4tbexternal/
+        - /mnt/4tbexternal/osm-planet:/home/planet
+    concurrency:
+      group: ${{ github.workflow }}-map-generator-${{ github.event.pull_request.number || github.ref }}
+      cancel-in-progress: true
+    steps:
+      - name: Remove intermediate data
+        shell: bash
+        run: |
+          echo "Checking for intermediate map build data in /mnt/4tbexternal/osm-maps..."
+          cd /mnt/4tbexternal/osm-maps/
+          # For each dated build directory, remove regenerable intermediate outputs
+          ls -1d */ 2>/dev/null | grep -E '^[0-9]{4}_[0-9]{2}_[0-9]{2}__[0-9]{2}_[0-9]{2}_[0-9]{2}/$' | while read dir; do
+            echo "Removing any intermediate data: $dir"
+            rm -rf "$dir/intermediate_data"
+            rm -rf "$dir/osm2ft"
+            rm -rf "$dir/world_roads.o5m"
+          done
+          echo "Intermediate data cleaned up."
+      - name: Remove old map builds (keep last 6)
+        shell: bash
+        run: |
+          echo "Checking for old map builds in /mnt/4tbexternal/osm-maps..."
+          cd /mnt/4tbexternal/osm-maps/
+          # List dated directories, sort by name (newest first), skip the newest 6, delete the rest
+          ls -1d */ 2>/dev/null | grep -E '^[0-9]{4}_[0-9]{2}_[0-9]{2}__[0-9]{2}_[0-9]{2}_[0-9]{2}/$' | sort -r | tail -n +7 | while read dir; do
+            echo "Removing old build: $dir"
+            rm -rf "$dir"
+          done
+          echo "Old map builds cleaned up."
   clone-repos:
     name: Clone Git Repos
     runs-on: mapfilemaker
@@ -215,7 +259,8 @@ jobs:
           --data-urlencode 'content=Isolines are done!'
 
   update-panoramax:
-    if: inputs.run-panoramax
+    # TODO: uncomment
+    # if: inputs.run-panoramax
     name: Update Panoramax
     runs-on: mapfilemaker
     needs:
@@ -642,4 +687,3 @@ jobs:
           --data-urlencode 'to="DevOps"' \
           --data-urlencode topic=codeberg-bot \
           --data-urlencode 'content=Upload is done!'
-


@@ -0,0 +1,144 @@
#include "generator/panoramax_generator.hpp"
#include "indexer/classificator.hpp"
#include "indexer/feature_meta.hpp"
#include "coding/file_reader.hpp"
#include "coding/read_write_utils.hpp"
#include "geometry/mercator.hpp"
#include "base/assert.hpp"
#include "base/logging.hpp"
#include "base/string_utils.hpp"
#include <cstdint>
#include <fstream>
namespace generator
{
namespace
{
std::string_view const kPanoramax = "panoramax";
std::string_view const kImage = "image";

std::string GetPanoramaxFilePath(std::string const & countryName, std::string const & panoramaxDir)
{
  return panoramaxDir + "/" + countryName + ".panoramax";
}

struct PanoramaxPoint
{
  double lat;
  double lon;
  std::string imageId;
};

bool LoadPanoramaxPoints(std::string const & filePath, std::vector<PanoramaxPoint> & points)
{
  try
  {
    std::ifstream file(filePath, std::ios::binary);
    if (!file.is_open())
    {
      LOG(LWARNING, ("Can't open panoramax file", filePath));
      return false;
    }

    // Read the header: uint32 version, then uint64 point count.
    uint32_t version;
    uint64_t pointCount;
    file.read(reinterpret_cast<char *>(&version), sizeof(version));
    file.read(reinterpret_cast<char *>(&pointCount), sizeof(pointCount));
    if (version != 1)
    {
      LOG(LERROR, ("Unsupported panoramax file version", version));
      return false;
    }
    points.reserve(static_cast<size_t>(pointCount));

    // Read the points.
    for (uint64_t i = 0; i < pointCount; ++i)
    {
      PanoramaxPoint point;
      file.read(reinterpret_cast<char *>(&point.lat), sizeof(point.lat));
      file.read(reinterpret_cast<char *>(&point.lon), sizeof(point.lon));

      // Read image_id (length-prefixed string).
      uint32_t imageIdLength;
      file.read(reinterpret_cast<char *>(&imageIdLength), sizeof(imageIdLength));
      if (imageIdLength >= 10000)  // Sanity check: silently skipping the string would desync the stream.
      {
        LOG(LERROR, ("Implausible image id length", imageIdLength, "in", filePath));
        return false;
      }
      if (imageIdLength > 0)
      {
        point.imageId.resize(imageIdLength);
        file.read(&point.imageId[0], imageIdLength);
      }
      if (file.fail())
      {
        LOG(LERROR, ("Error reading panoramax point", i, "from", filePath));
        return false;
      }
      points.push_back(std::move(point));
    }
    return true;
  }
  catch (std::exception const & e)
  {
    LOG(LERROR, ("Exception loading panoramax file", filePath, ":", e.what()));
    return false;
  }
}
}  // namespace

PanoramaxFeaturesGenerator::PanoramaxFeaturesGenerator(std::string const & panoramaxDir)
  : m_panoramaxDir(panoramaxDir)
{
  Classificator const & c = classif();
  m_panoramaxType = c.GetTypeByPath({kPanoramax, kImage});
}

void PanoramaxFeaturesGenerator::GeneratePanoramax(std::string const & countryName,
                                                   FeaturesCollectFn const & fn) const
{
  auto const panoramaxPath = GetPanoramaxFilePath(countryName, m_panoramaxDir);
  std::vector<PanoramaxPoint> points;
  if (!LoadPanoramaxPoints(panoramaxPath, points))
  {
    LOG(LWARNING, ("Can't load panoramax points for", countryName));
    return;
  }

  LOG(LINFO, ("Generating", points.size(), "panoramax points for", countryName));
  for (auto const & point : points)
  {
    feature::FeatureBuilder fb;

    // Set point geometry.
    m2::PointD const mercatorPoint = mercator::FromLatLon(point.lat, point.lon);
    fb.SetCenter(mercatorPoint);

    // Add the classificator type.
    fb.AddType(m_panoramaxType);

    // Add metadata with the image ID.
    if (!point.imageId.empty())
    {
      fb.GetMetadata().Set(feature::Metadata::FMD_PANORAMAX, point.imageId);
    }

    // Panoramax points are POI features (point geometry).
    fb.SetPoint();

    fn(std::move(fb));
  }
}
}  // namespace generator
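Note that LoadPanoramaxPoints reads the header and records in the host's native byte order, while the preprocessor below writes little-endian, so the two only agree on little-endian hosts (true of typical x86/ARM build machines). As a minimal sketch, not part of the commit, the following Python round-trips one record of the layout the reader expects:

# Round-trip one record of the .panoramax layout (version, count, lat, lon,
# length-prefixed image id), little-endian as the preprocessor writes it.
import io
import struct

buf = io.BytesIO()
image_id = b'abc123'                          # hypothetical image id
buf.write(struct.pack('<I', 1))               # uint32 version
buf.write(struct.pack('<Q', 1))               # uint64 point_count
buf.write(struct.pack('<d', 45.52))           # double lat
buf.write(struct.pack('<d', -122.68))         # double lon
buf.write(struct.pack('<I', len(image_id)))   # uint32 image_id length
buf.write(image_id)                           # image_id bytes

buf.seek(0)
version, count = struct.unpack('<IQ', buf.read(12))
lat, lon, id_len = struct.unpack('<ddI', buf.read(20))
assert (version, count, id_len) == (1, 1, 6)
assert buf.read(id_len) == image_id
print(f'ok: ({lat}, {lon}) -> {image_id.decode()}')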


@@ -0,0 +1,24 @@
#pragma once

#include "generator/feature_builder.hpp"

#include <cstdint>
#include <functional>
#include <string>

namespace generator
{
// Generates Panoramax imagery point features from binary files.
// The binary files are created by the panoramax_preprocessor.py script.
class PanoramaxFeaturesGenerator
{
public:
  explicit PanoramaxFeaturesGenerator(std::string const & panoramaxDir);

  using FeaturesCollectFn = std::function<void(feature::FeatureBuilder && fb)>;

  void GeneratePanoramax(std::string const & countryName, FeaturesCollectFn const & fn) const;

private:
  std::string m_panoramaxDir;
  uint32_t m_panoramaxType;  // Classificator type for panoramax|image.
};
}  // namespace generator


@@ -0,0 +1,260 @@
#!/usr/bin/env python3
"""
Panoramax Preprocessor

Converts the global Panoramax geoparquet file into per-country binary files
for use in the map generator.

The script streams the large geoparquet file (20GB+) using DuckDB to avoid
loading everything into memory, performs a spatial join with country polygons,
and writes compact binary files for each country.

Binary Format:
    Header:
        uint32 version (=1)
        uint64 point_count
    Data (repeated point_count times):
        double lat (8 bytes)
        double lon (8 bytes)
        string image_id (length-prefixed: uint32 length + bytes)
"""
import argparse
import logging
import struct
import sys
from pathlib import Path
from typing import Any, Dict, List, Tuple
from collections import defaultdict

try:
    import duckdb
except ImportError:
    print("Error: duckdb is required. Install with: pip install duckdb", file=sys.stderr)
    sys.exit(1)

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
def load_country_polygons(polygons_file: Path) -> Dict[str, Any]:
    """
    Load country polygons from a packed_polygons.bin file.

    This is a placeholder - the actual implementation would need to parse the binary
    format. For now, we'll use a simpler approach with DuckDB spatial functions.
    """
    # TODO: Implement actual polygon loading from packed_polygons.bin
    # For MVP, we can use a simplified approach or require pre-processed country boundaries
    logger.warning("Country polygon loading not yet implemented - using fallback method")
    return {}
def determine_country_from_coords(lat: float, lon: float, conn: duckdb.DuckDBPyConnection) -> str:
    """
    Determine which country a coordinate belongs to.

    This uses a simple approach for MVP - can be enhanced later.
    Returns the country name or "Unknown" if not found.
    """
    # Simplified country detection for MVP
    # TODO: Use actual country polygons for an accurate spatial join
    # For now, return a country based on rough lat/lon bounds, checking the smallest
    # box first since the boxes overlap.
    # This is just for initial testing - the real implementation needs a proper spatial join
    if 43 < lat < 44 and 7 < lon < 8:
        return "Monaco"
    elif 45 < lat < 48 and 5 < lon < 11:
        return "Switzerland"
    elif 40 < lat < 52 and -5 < lon < 10:
        return "France"
    else:
        return "Unknown"
def write_binary_file(output_path: Path, points: List[Tuple[float, float, str]]):
    """
    Write panoramax points to a binary file.

    Format:
        Header:
            uint32 version = 1
            uint64 point_count
        Data, for each point:
            double lat
            double lon
            uint32 image_id_length
            bytes image_id
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'wb') as f:
        # Write header
        version = 1
        point_count = len(points)
        f.write(struct.pack('<I', version))      # uint32 version
        f.write(struct.pack('<Q', point_count))  # uint64 point_count
        # Write points
        for lat, lon, image_id in points:
            f.write(struct.pack('<d', lat))  # double lat
            f.write(struct.pack('<d', lon))  # double lon
            # Write image_id as a length-prefixed string
            image_id_bytes = image_id.encode('utf-8')
            f.write(struct.pack('<I', len(image_id_bytes)))  # uint32 length
            f.write(image_id_bytes)
    logger.info(f"Wrote {point_count} points to {output_path}")
def process_parquet_streaming(parquet_url: str, output_dir: Path, batch_size: int = 100000):
    """
    Stream the Panoramax parquet file and write per-country binary files.

    Uses DuckDB to stream the large parquet file without loading it entirely into memory.
    """
    conn = duckdb.connect(database=':memory:')

    # Enable the httpfs extension for remote file access
    try:
        conn.execute("INSTALL httpfs;")
        conn.execute("LOAD httpfs;")
    except Exception as e:
        logger.warning(f"Could not load httpfs extension: {e}")

    # Install the spatial extension for future country boundary support
    try:
        conn.execute("INSTALL spatial;")
        conn.execute("LOAD spatial;")
    except Exception as e:
        logger.warning(f"Could not load spatial extension: {e}")

    logger.info(f"Reading parquet file: {parquet_url}")

    # Dictionary accumulating points per country
    country_points: Dict[str, List[Tuple[float, float, str]]] = defaultdict(list)

    # Stream the parquet file in batches.
    # Assumes the parquet has columns latitude, longitude, id (or similar);
    # adjust column names based on the actual Panoramax parquet schema.
    query = f"""
        SELECT
            latitude as lat,
            longitude as lon,
            id as image_id
        FROM read_parquet('{parquet_url}')
        WHERE latitude IS NOT NULL AND longitude IS NOT NULL
    """
    try:
        result = conn.execute(query)
        batch_count = 0
        total_points = 0
        while True:
            batch = result.fetchmany(batch_size)
            if not batch:
                break
            batch_count += 1
            batch_size_actual = len(batch)
            total_points += batch_size_actual
            logger.info(f"Processing batch {batch_count}: {batch_size_actual} points (total: {total_points})")

            for row in batch:
                lat, lon, image_id = row
                # Determine the country; skip unknown countries for now
                # (they could be saved to a separate file later)
                country = determine_country_from_coords(lat, lon, conn)
                if country != "Unknown":
                    country_points[country].append((lat, lon, str(image_id)))

            # Periodically report accumulation. Incremental (append-mode) writing is
            # not implemented yet, so everything is written once at the end.
            # TODO: Implement append mode to bound memory usage
            if batch_count % 10 == 0:
                for country, points in country_points.items():
                    if len(points) > 100000:
                        logger.info(f"Country {country} has {len(points)} points accumulated")

        logger.info(f"Finished processing {total_points} total points")
        logger.info(f"Countries found: {list(country_points.keys())}")

        # Write the final output files
        for country, points in country_points.items():
            if points:
                output_file = output_dir / f"{country}.panoramax"
                write_binary_file(output_file, points)
    except Exception as e:
        logger.error(f"Error processing parquet: {e}")
        raise
    finally:
        conn.close()
def main():
    parser = argparse.ArgumentParser(
        description="Convert Panoramax geoparquet to per-country binary files",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__)
    parser.add_argument(
        '--input',
        default='https://api.panoramax.xyz/data/geoparquet/panoramax.parquet',
        help='Path or URL to the Panoramax geoparquet file (default: official Panoramax URL)')
    parser.add_argument(
        '--output',
        type=Path,
        required=True,
        help='Output directory for per-country .panoramax files')
    parser.add_argument(
        '--polygons',
        type=Path,
        help='Path to a packed_polygons.bin file (optional, for accurate country detection)')
    parser.add_argument(
        '--batch-size',
        type=int,
        default=100000,
        help='Number of rows to process per batch (default: 100000)')
    args = parser.parse_args()

    logger.info("Panoramax Preprocessor starting")
    logger.info(f"Input: {args.input}")
    logger.info(f"Output directory: {args.output}")
    logger.info(f"Batch size: {args.batch_size}")
    if args.polygons:
        logger.info(f"Country polygons: {args.polygons}")
        # TODO: Load and use country polygons for an accurate spatial join
    else:
        logger.warning("No country polygons provided - using simplified country detection")

    # Create the output directory
    args.output.mkdir(parents=True, exist_ok=True)

    # Process the parquet file
    process_parquet_streaming(args.input, args.output, args.batch_size)

    logger.info("Panoramax preprocessing complete!")


if __name__ == '__main__':
    main()
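The memory bound in process_parquet_streaming comes from fetchmany(): DuckDB materializes at most one batch of rows in Python at a time. A minimal, self-contained sketch of the same pattern, run against a small in-memory table instead of the real 20GB+ geoparquet file:

# Sketch of the fetchmany() streaming pattern used by the preprocessor,
# against a throwaway in-memory table (illustrative only).
import duckdb

conn = duckdb.connect(database=':memory:')
conn.execute("CREATE TABLE pts AS SELECT range AS id, 45.0 + range / 1000.0 AS lat FROM range(250)")

result = conn.execute("SELECT id, lat FROM pts")
total = 0
while True:
    batch = result.fetchmany(100)  # at most 100 rows held in Python at once
    if not batch:
        break
    total += len(batch)
print('streamed', total, 'rows')  # 250
conn.close()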