From e9406c0f3693ff0956f94369285427b8dfcc1f28 Mon Sep 17 00:00:00 2001
From: zyphlar <zyphlar@gmail.com>
Date: Sat, 3 Jan 2026 23:38:14 -0800
Subject: [PATCH] update parquet schema

Signed-off-by: zyphlar <zyphlar@gmail.com>
---
 .forgejo/workflows/map-generator.yml            |  1 +
 .../maps_generator/panoramax_preprocessor.py    | 17 ++++++++++++-----
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/.forgejo/workflows/map-generator.yml b/.forgejo/workflows/map-generator.yml
index 5389ea83f..9fadd4be6 100644
--- a/.forgejo/workflows/map-generator.yml
+++ b/.forgejo/workflows/map-generator.yml
@@ -290,6 +290,7 @@ jobs:
         run: |
           mkdir -p /home/planet/panoramax
           cd /home/planet/panoramax
+          # TODO: force a re-download when the cached file is old, when explicitly requested, or when it is missing
           # Download the global Panoramax geoparquet file (20GB)
           if [ ! -f panoramax.parquet ]; then
             echo "Downloading Panoramax geoparquet..."
diff --git a/tools/python/maps_generator/panoramax_preprocessor.py b/tools/python/maps_generator/panoramax_preprocessor.py
index 2df8c4181..dbcea58d6 100644
--- a/tools/python/maps_generator/panoramax_preprocessor.py
+++ b/tools/python/maps_generator/panoramax_preprocessor.py
@@ -133,19 +133,26 @@ def process_parquet_streaming(parquet_url: str, output_dir: Path, batch_size: in
 
     logger.info(f"Reading parquet file: {parquet_url}")
 
+    # First, inspect the schema to understand the columns
+    try:
+        schema_result = conn.execute(f"DESCRIBE SELECT * FROM read_parquet('{parquet_url}') LIMIT 0").fetchall()
+        logger.info(f"Parquet schema: {[col[0] for col in schema_result]}")
+    except Exception as e:
+        logger.warning(f"Could not read schema: {e}")
+
     # Dictionary to accumulate points per country
     country_points: Dict[str, List[Tuple[float, float, str]]] = defaultdict(list)
 
     # Stream the parquet file in batches
-    # Assuming parquet has columns: latitude, longitude, id (or similar)
-    # Adjust column names based on actual Panoramax parquet schema
+    # GeoParquet stores each point in a `geometry` column (GEOMETRY type).
+    # Use DuckDB spatial functions to extract lat (ST_Y) and lon (ST_X).
     query = f"""
         SELECT
-            latitude as lat,
-            longitude as lon,
+            ST_Y(geometry) as lat,
+            ST_X(geometry) as lon,
             id as image_id
         FROM read_parquet('{parquet_url}')
-        WHERE latitude IS NOT NULL AND longitude IS NOT NULL
+        WHERE geometry IS NOT NULL
     """
 
     try: