From e9406c0f3693ff0956f94369285427b8dfcc1f28 Mon Sep 17 00:00:00 2001 From: zyphlar Date: Sat, 3 Jan 2026 23:38:14 -0800 Subject: [PATCH] update parquet schema Signed-off-by: zyphlar --- .forgejo/workflows/map-generator.yml | 1 + .../maps_generator/panoramax_preprocessor.py | 17 ++++++++++++----- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/.forgejo/workflows/map-generator.yml b/.forgejo/workflows/map-generator.yml index 5389ea83f..9fadd4be6 100644 --- a/.forgejo/workflows/map-generator.yml +++ b/.forgejo/workflows/map-generator.yml @@ -290,6 +290,7 @@ jobs: run: | mkdir -p /home/planet/panoramax cd /home/planet/panoramax + #TODO: force/redownload if old/desired/nonexistent # Download the global Panoramax geoparquet file (20GB) if [ ! -f panoramax.parquet ]; then echo "Downloading Panoramax geoparquet..." diff --git a/tools/python/maps_generator/panoramax_preprocessor.py b/tools/python/maps_generator/panoramax_preprocessor.py index 2df8c4181..dbcea58d6 100644 --- a/tools/python/maps_generator/panoramax_preprocessor.py +++ b/tools/python/maps_generator/panoramax_preprocessor.py @@ -133,19 +133,26 @@ def process_parquet_streaming(parquet_url: str, output_dir: Path, batch_size: in logger.info(f"Reading parquet file: {parquet_url}") + # First, inspect the schema to understand the columns + try: + schema_result = conn.execute(f"DESCRIBE SELECT * FROM read_parquet('{parquet_url}') LIMIT 0").fetchall() + logger.info(f"Parquet schema: {[col[0] for col in schema_result]}") + except Exception as e: + logger.warning(f"Could not read schema: {e}") + # Dictionary to accumulate points per country country_points: Dict[str, List[Tuple[float, float, str]]] = defaultdict(list) # Stream the parquet file in batches - # Assuming parquet has columns: latitude, longitude, id (or similar) - # Adjust column names based on actual Panoramax parquet schema + # Geoparquet stores geometry as GEOMETRY type + # Use DuckDB spatial functions to extract lat/lon query = f""" SELECT - latitude as lat, - longitude as lon, + ST_Y(geometry) as lat, + ST_X(geometry) as lon, id as image_id FROM read_parquet('{parquet_url}') - WHERE latitude IS NOT NULL AND longitude IS NOT NULL + WHERE geometry IS NOT NULL """ try: