From a4059ab9f9d1ee80ddd2832cab81e3c162189a1e Mon Sep 17 00:00:00 2001 From: AdrianSolberg Date: Thu, 23 Oct 2025 08:34:34 +0200 Subject: [PATCH 1/7] feat(#3): group and crop surveys based on h3 cells --- .gitignore | 8 +- lasConverter.py | 299 ++++++++++++++++++++++++++++++++++-------------- 2 files changed, 220 insertions(+), 87 deletions(-) diff --git a/.gitignore b/.gitignore index b30bf2c..0b9945f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,7 @@ -#survey content -/surveys/* -/output_las/* +surveys +converted_surveys +temp_cells +metadata +h3_cells .DS_Store .venv \ No newline at end of file diff --git a/lasConverter.py b/lasConverter.py index b8fc424..1b5d5d7 100644 --- a/lasConverter.py +++ b/lasConverter.py @@ -1,102 +1,233 @@ -import sys +from collections import defaultdict +import json +import shutil from pathlib import Path import dask.dataframe as dd +import h3 import numpy as np import laspy import pandas as pd +import pdal from pyproj import Transformer - - -def CSV_2_LAS(surveys_folders, output_folder, output_name="merged_final1.las", chunk_size_bytes="64MB"): +from shapely import MultiPolygon, Polygon +from shapely.ops import transform +from shapely import wkt + +def prepare_cells(metadata): + # Relevant level 1 H3 cells + level1_cells = pd.read_csv("metadata/level1_h3_cells.csv")["cell_id"].tolist() + + # Transformer for polygons + transformer = Transformer.from_crs("EPSG:4326", "EPSG:4978", always_xy=True) + + # Generate level 2 cells & polygons + level2_cells = [] + cell_polygons = {} + for c1 in level1_cells: + children = h3.cell_to_children(c1, 2) + for c2 in children: + coords = h3.cell_to_boundary(c2) + poly = Polygon([(lng, lat) for lat, lng in coords]) + cell_polygons[c2] = transform(transformer.transform, poly) + level2_cells.append(c2) + + metadata["geom"] = metadata["geom"].apply(wkt.loads) + transform_4258_to_4978 = Transformer.from_crs("EPSG:4258", "EPSG:4978", always_xy=True) + metadata["geom_4978"] = metadata["geom"].apply(lambda g: transform(transform_4258_to_4978.transform, g)) + + # Assign surveys to cells + cell_to_surveys = defaultdict(list) + for c2, cell_poly in cell_polygons.items(): + for _, survey in metadata.iterrows(): + geom = survey["geom_4978"] + + if geom.is_empty: + continue + + # Handle both Polygon and MultiPolygon + if isinstance(geom, Polygon) and cell_poly.intersects(geom): + cell_to_surveys[c2].append(survey["fid"]) + elif isinstance(geom, MultiPolygon) and any(cell_poly.intersects(subgeom) for subgeom in geom.geoms): + cell_to_surveys[c2].append(survey["fid"]) + + return cell_to_surveys, cell_polygons + +def CSV_2_LAS(surveys_folders, output_folder, metadata, chunk_size_bytes="64MB"): surveys_path = Path(surveys_folders) output_folder = Path(output_folder) output_folder.mkdir(parents=True, exist_ok=True) - transformer = Transformer.from_crs("EPSG:25832", "EPSG:4978", always_xy=True) - - - csv_files = list(surveys_path.rglob("*.csv")) - if not csv_files: - print("No CSV files found.") - return - print(f"Found {len(csv_files)} CSV files.") - - # Map surveys to IDs - top_dirs = sorted({f.relative_to(surveys_path).parts[0] for f in csv_files}) - dir_to_id = {name: i+1 for i, name in enumerate(top_dirs)} - pd.DataFrame(list(dir_to_id.items()), columns=["top_dir","id"]).to_csv(output_folder / "dir_mapping.csv", index=False) - temp_folder = output_folder / "tmp" temp_folder.mkdir(exist_ok=True) - # Step 1: Write chunked LAS files - print("Step 1: Writing LAS chunks...") - file_counter = 1 - - for f in csv_files: - top_dir_id = 
dir_to_id[f.relative_to(surveys_path).parts[0]] - # Read CSV in Dask partitions (out-of-core) - ddf = dd.read_csv(str(f), header=None, usecols=[0,1,2,3,4,5], blocksize=chunk_size_bytes) - - for partition in ddf.to_delayed(): - df = partition.compute() - x, y, z = df.iloc[:,0].to_numpy(), df.iloc[:,1].to_numpy(), df.iloc[:,2].to_numpy() - - # Transform coordinates into EPSG:4978 - x, y, z = transformer.transform(x, y, z) - accepted = ( - df.iloc[:, 3].astype(str) - .str.strip() - .str.lower() - .eq("accepted") - .astype(np.uint8) - .to_numpy() - ) - tvu = df.iloc[:, 4].to_numpy(dtype=np.float32) - thu = df.iloc[:, 5].to_numpy(dtype=np.float32) - ids = np.full(len(df), top_dir_id, dtype=np.uint16) - - # Create LAS header template - header = laspy.LasHeader(point_format=6, version="1.4") - - # Add extra dimensions - header.add_extra_dim(laspy.ExtraBytesParams(name="accepted", type=np.uint8)) - header.add_extra_dim(laspy.ExtraBytesParams(name="TVU", type=np.float32)) - header.add_extra_dim(laspy.ExtraBytesParams(name="THU", type=np.float32)) - - las = laspy.LasData(header) - las.x = x - las.y = y - las.z = z - - # Add extra dimensions - las["accepted"] = accepted - las["TVU"] = tvu - las["THU"] = thu - las.point_source_id = ids - - las_path = temp_folder / f"{output_name}_chunk_{file_counter}.las" - las.write(las_path) - print(f"✅ Wrote {las_path}") - file_counter += 1 - - # Merging chunked LAS files into single LAS - print("Step 2: Merging LAS chunks into final LAS...") - las_files = sorted(temp_folder.glob(f"{output_name}_chunk_*.las")) - first_las = laspy.read(str(las_files[0])) - merged_header = first_las.header - - with laspy.open(str(output_folder / output_name), mode="w", header=merged_header) as merged_writer: + # Set EPSG:25832 as default when crs is missing + metadata["epsg"] = metadata["epsg"].fillna(25832).astype(int) + + for source_id, row in metadata.iterrows(): + survey_name = row["survey_name"] + survey_folder = surveys_path / survey_name + + if not survey_folder.exists() or not survey_folder.is_dir(): + print(f"⚠️ Survey folder '{survey_folder}' does not exist, skipping.") + continue + + csv_files = sorted(survey_folder.glob("*.csv")) + if not csv_files: + print(f"⚠️ No CSV files for {survey_name}, skipping.") + continue + + epsg = row["epsg"] + transformer = Transformer.from_crs(f"EPSG:{epsg}", "EPSG:4978", always_xy=True) + file_counter = 1 + print(f"Writing LAS chunks for {survey_name}...") + for csv_file in csv_files: + # Read CSV in Dask partitions (out-of-core) + ddf = dd.read_csv(str(csv_file), header=None, blocksize=chunk_size_bytes) + + for partition in ddf.to_delayed(): + df = partition.compute() + x, y, z = df.iloc[:,0].to_numpy(), df.iloc[:,1].to_numpy(), df.iloc[:,2].to_numpy() + + # Transform coordinates into EPSG:4978 + x, y, z = transformer.transform(x, y, z) + + accepted = ( + df.iloc[:, 3].astype(str) + .str.strip() + .str.lower() + .eq("accepted") + .astype(np.uint8) + .to_numpy() + ) + + # Default to 0 if TVU/THU missing + tvu = df.iloc[:, 4].to_numpy(dtype=np.float32) if df.shape[1] > 4 else np.zeros(len(df), dtype=np.float32) + thu = df.iloc[:, 5].to_numpy(dtype=np.float32) if df.shape[1] > 5 else np.zeros(len(df), dtype=np.float32) + + ids = np.full(len(df), source_id, dtype=np.uint16) + + # Create LAS header template + header = laspy.LasHeader(point_format=6, version="1.4") + + # Add extra dimensions + header.add_extra_dim(laspy.ExtraBytesParams(name="accepted", type=np.uint8)) + header.add_extra_dim(laspy.ExtraBytesParams(name="TVU", type=np.float32)) 
+ header.add_extra_dim(laspy.ExtraBytesParams(name="THU", type=np.float32)) + + las = laspy.LasData(header) + las.x = x + las.y = y + las.z = z + + # Add extra dimensions + las["accepted"] = accepted + las["TVU"] = tvu + las["THU"] = thu + las.point_source_id = ids + + las_path = temp_folder / f"{survey_name}_chunk_{file_counter}.las" + las.write(las_path) + print(f"✅ Wrote {las_path}") + file_counter += 1 + + # Merging chunked LAS files into single LAS + print("Step 2: Merging LAS chunks into final LAS...") + las_files = sorted(temp_folder.glob(f"{survey_name}_chunk_*.las")) + first_las = laspy.read(str(las_files[0])) + merged_header = first_las.header + + with laspy.open(str(output_folder / f"{survey_name}.las"), mode="w", header=merged_header) as merged_writer: + for f in las_files: + las = laspy.read(str(f)) + merged_writer.write_points(las.points) + print(f"✅ Merged {f.name} ({len(las.points)} points)") + + # Cleanup temporary LAS files for f in las_files: - las = laspy.read(str(f)) - merged_writer.write_points(las.points) - print(f"Merged {f.name} ({len(las.points)} points)") + f.unlink() - # Cleanup temporary LAS files - for f in las_files: - f.unlink() temp_folder.rmdir() - print(f"Final merged LAS saved at {output_folder / output_name}") + +def group_by_h3_cell(cell_to_surveys, cell_polygons): + output_folder = Path("h3_cells") + output_folder.mkdir(exist_ok=True) + + temp_folder_base = Path("temp_cells") # temporary folder base + temp_folder_base.mkdir(exist_ok=True) + + for c2, survey_fids in cell_to_surveys.items(): + if not survey_fids: + continue + + # Create a temporary folder for this H3 cell + temp_folder = temp_folder_base / c2 + temp_folder.mkdir(exist_ok=True) + + input_files = [] + for fid in survey_fids: + survey_name = metadata.loc[metadata['fid'] == fid, 'survey_name'].item() + + src = Path("converted_surveys") / f"{survey_name}.las" + if not src.exists(): + continue + + dst = temp_folder / src.name + if dst.exists(): + dst.unlink() # remove the existing file or symlink first + + dst.symlink_to(src.resolve()) + input_files.append(str(dst)) + + + # Skip cell if no files exist + if not input_files: + shutil.rmtree(temp_folder) + print(f"⚠️ No survey files for cell {c2}, skipping") + continue + + # PDAL filename glob pattern + input_pattern = str(temp_folder / "*.las") + cell_poly = cell_polygons[c2] + + # PDAL JSON pipeline + pdal_pipeline = { + "pipeline": [ + { + "type": "readers.las", + "filename": input_pattern, + "extra_dims": "Accepted=uint8,TVU=float32,THU=float32" + }, + { + "type": "filters.crop", + "polygon": cell_poly.wkt + }, + { + "type": "writers.las", + "filename": str(output_folder / f"{c2}.las"), + "extra_dims": "Accepted=uint8,TVU=float32,THU=float32" + } + ] + } + + pipeline_json = json.dumps(pdal_pipeline) + + p = pdal.Pipeline(pipeline_json) + try: + p.execute_streaming(chunk_size=500000) + print(f"Written H3 cell {c2}.") + except KeyboardInterrupt: + print("Keyboard interrupt detected, cleaning up and exiting...") + break + except RuntimeError as e: + print(f"Pipeline failed for cell {c2}: {e}") + finally: + shutil.rmtree(temp_folder) if __name__ == "__main__": - CSV_2_LAS(sys.argv[1], sys.argv[2]) \ No newline at end of file + metadata = pd.read_csv("./metadata/metadata.csv") # includes fid, survey_name, survey_area, geom, epsg + + cell_to_surveys, cell_polygons = prepare_cells(metadata) + + CSV_2_LAS("surveys", "converted_surveys", metadata) + + group_by_h3_cell(cell_to_surveys, cell_polygons) \ No newline at end of file From 
ffe92c6f1de0536bbee1b5916551b9d5e8432944 Mon Sep 17 00:00:00 2001 From: AdrianSolberg Date: Thu, 23 Oct 2025 21:06:38 +0200 Subject: [PATCH 2/7] refactor(#3): clean up code --- lasConverter.py | 73 ++++++++++++++++++++++++------------------------- 1 file changed, 36 insertions(+), 37 deletions(-) diff --git a/lasConverter.py b/lasConverter.py index 1b5d5d7..431633b 100644 --- a/lasConverter.py +++ b/lasConverter.py @@ -13,29 +13,31 @@ from shapely.ops import transform from shapely import wkt -def prepare_cells(metadata): - # Relevant level 1 H3 cells - level1_cells = pd.read_csv("metadata/level1_h3_cells.csv")["cell_id"].tolist() +BASE_PATH = Path(".") +METADATA_PATH = BASE_PATH / "metadata" +SURVEYS_PATH = BASE_PATH / "surveys" +CONVERTED_PATH = BASE_PATH / "converted_surveys" +H3_CELLS_PATH = BASE_PATH / "h3_cells" - # Transformer for polygons - transformer = Transformer.from_crs("EPSG:4326", "EPSG:4978", always_xy=True) +def prepare_cells(metadata, level1_cells): + # Transformer for h3 cell polygons + transformer_4326_to_4978 = Transformer.from_crs("EPSG:4326", "EPSG:4978", always_xy=True) # Generate level 2 cells & polygons - level2_cells = [] cell_polygons = {} for c1 in level1_cells: children = h3.cell_to_children(c1, 2) for c2 in children: coords = h3.cell_to_boundary(c2) poly = Polygon([(lng, lat) for lat, lng in coords]) - cell_polygons[c2] = transform(transformer.transform, poly) - level2_cells.append(c2) + cell_polygons[c2] = transform(transformer_4326_to_4978.transform, poly) + # Transfrom survey polygons from EPSG:4258 to EPSG:4978 metadata["geom"] = metadata["geom"].apply(wkt.loads) transform_4258_to_4978 = Transformer.from_crs("EPSG:4258", "EPSG:4978", always_xy=True) metadata["geom_4978"] = metadata["geom"].apply(lambda g: transform(transform_4258_to_4978.transform, g)) - # Assign surveys to cells + # Assign surveys to cells based on overlap cell_to_surveys = defaultdict(list) for c2, cell_poly in cell_polygons.items(): for _, survey in metadata.iterrows(): @@ -52,11 +54,9 @@ def prepare_cells(metadata): return cell_to_surveys, cell_polygons -def CSV_2_LAS(surveys_folders, output_folder, metadata, chunk_size_bytes="64MB"): - surveys_path = Path(surveys_folders) - output_folder = Path(output_folder) - output_folder.mkdir(parents=True, exist_ok=True) - temp_folder = output_folder / "tmp" +def CSV_2_LAS(metadata, chunk_size_bytes="64MB"): + CONVERTED_PATH.mkdir(parents=True, exist_ok=True) + temp_folder = CONVERTED_PATH / "tmp" temp_folder.mkdir(exist_ok=True) # Set EPSG:25832 as default when crs is missing @@ -64,7 +64,7 @@ def CSV_2_LAS(surveys_folders, output_folder, metadata, chunk_size_bytes="64MB") for source_id, row in metadata.iterrows(): survey_name = row["survey_name"] - survey_folder = surveys_path / survey_name + survey_folder = SURVEYS_PATH / survey_name if not survey_folder.exists() or not survey_folder.is_dir(): print(f"⚠️ Survey folder '{survey_folder}' does not exist, skipping.") @@ -76,9 +76,11 @@ def CSV_2_LAS(surveys_folders, output_folder, metadata, chunk_size_bytes="64MB") continue epsg = row["epsg"] - transformer = Transformer.from_crs(f"EPSG:{epsg}", "EPSG:4978", always_xy=True) + transformer_to_4978 = Transformer.from_crs(f"EPSG:{epsg}", "EPSG:4978", always_xy=True) file_counter = 1 + print(f"Writing LAS chunks for {survey_name}...") + for csv_file in csv_files: # Read CSV in Dask partitions (out-of-core) ddf = dd.read_csv(str(csv_file), header=None, blocksize=chunk_size_bytes) @@ -88,8 +90,9 @@ def CSV_2_LAS(surveys_folders, output_folder, metadata, 
chunk_size_bytes="64MB") x, y, z = df.iloc[:,0].to_numpy(), df.iloc[:,1].to_numpy(), df.iloc[:,2].to_numpy() # Transform coordinates into EPSG:4978 - x, y, z = transformer.transform(x, y, z) + x, y, z = transformer_to_4978.transform(x, y, z) + # Encode 'Accepted' to 0 and 1 accepted = ( df.iloc[:, 3].astype(str) .str.strip() @@ -103,6 +106,7 @@ def CSV_2_LAS(surveys_folders, output_folder, metadata, chunk_size_bytes="64MB") tvu = df.iloc[:, 4].to_numpy(dtype=np.float32) if df.shape[1] > 4 else np.zeros(len(df), dtype=np.float32) thu = df.iloc[:, 5].to_numpy(dtype=np.float32) if df.shape[1] > 5 else np.zeros(len(df), dtype=np.float32) + # Set source_id ids = np.full(len(df), source_id, dtype=np.uint16) # Create LAS header template @@ -118,7 +122,7 @@ def CSV_2_LAS(surveys_folders, output_folder, metadata, chunk_size_bytes="64MB") las.y = y las.z = z - # Add extra dimensions + # Set extra dimensions las["accepted"] = accepted las["TVU"] = tvu las["THU"] = thu @@ -130,12 +134,12 @@ def CSV_2_LAS(surveys_folders, output_folder, metadata, chunk_size_bytes="64MB") file_counter += 1 # Merging chunked LAS files into single LAS - print("Step 2: Merging LAS chunks into final LAS...") + print("Merging LAS chunks into final LAS...") las_files = sorted(temp_folder.glob(f"{survey_name}_chunk_*.las")) first_las = laspy.read(str(las_files[0])) merged_header = first_las.header - with laspy.open(str(output_folder / f"{survey_name}.las"), mode="w", header=merged_header) as merged_writer: + with laspy.open(str(CONVERTED_PATH / f"{survey_name}.las"), mode="w", header=merged_header) as merged_writer: for f in las_files: las = laspy.read(str(f)) merged_writer.write_points(las.points) @@ -149,18 +153,14 @@ def CSV_2_LAS(surveys_folders, output_folder, metadata, chunk_size_bytes="64MB") def group_by_h3_cell(cell_to_surveys, cell_polygons): - output_folder = Path("h3_cells") - output_folder.mkdir(exist_ok=True) - - temp_folder_base = Path("temp_cells") # temporary folder base - temp_folder_base.mkdir(exist_ok=True) + H3_CELLS_PATH.mkdir(exist_ok=True) for c2, survey_fids in cell_to_surveys.items(): if not survey_fids: continue # Create a temporary folder for this H3 cell - temp_folder = temp_folder_base / c2 + temp_folder = H3_CELLS_PATH / f"tmp_{c2}" temp_folder.mkdir(exist_ok=True) input_files = [] @@ -187,6 +187,7 @@ def group_by_h3_cell(cell_to_surveys, cell_polygons): # PDAL filename glob pattern input_pattern = str(temp_folder / "*.las") + cell_poly = cell_polygons[c2] # PDAL JSON pipeline @@ -203,7 +204,7 @@ def group_by_h3_cell(cell_to_surveys, cell_polygons): }, { "type": "writers.las", - "filename": str(output_folder / f"{c2}.las"), + "filename": str(H3_CELLS_PATH / f"{c2}.las"), "extra_dims": "Accepted=uint8,TVU=float32,THU=float32" } ] @@ -214,20 +215,18 @@ def group_by_h3_cell(cell_to_surveys, cell_polygons): p = pdal.Pipeline(pipeline_json) try: p.execute_streaming(chunk_size=500000) - print(f"Written H3 cell {c2}.") - except KeyboardInterrupt: - print("Keyboard interrupt detected, cleaning up and exiting...") - break - except RuntimeError as e: - print(f"Pipeline failed for cell {c2}: {e}") + print(f"✅ Written H3 cell {c2}.") + except Exception as e: + print(f"⚠️ Pipeline failed for cell {c2}: {e}") finally: shutil.rmtree(temp_folder) if __name__ == "__main__": - metadata = pd.read_csv("./metadata/metadata.csv") # includes fid, survey_name, survey_area, geom, epsg + metadata = pd.read_csv(METADATA_PATH / 'metadata.csv') # includes fid, survey_name, survey_area, geom, epsg + level1_cells = 
pd.read_csv(METADATA_PATH / "level1_h3_cells.csv")["cell_id"].tolist() # Relevant level 1 H3 cells

-    cell_to_surveys, cell_polygons = prepare_cells(metadata)
+    cell_to_surveys, cell_polygons = prepare_cells(metadata, level1_cells)

-    CSV_2_LAS("surveys", "converted_surveys", metadata)
+    CSV_2_LAS(metadata)

-    group_by_h3_cell(cell_to_surveys, cell_polygons)
\ No newline at end of file
+    group_by_h3_cell(cell_to_surveys, cell_polygons)
\ No newline at end of file

From 5874832b3b44062f8f1214690928026380eb49ef Mon Sep 17 00:00:00 2001
From: AdrianSolberg
Date: Thu, 23 Oct 2025 21:07:10 +0200
Subject: [PATCH 3/7] docs(#3): update readme

---
 README.MD | 73 +++++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 63 insertions(+), 10 deletions(-)

diff --git a/README.MD b/README.MD
index d7dd955..fff1341 100644
--- a/README.MD
+++ b/README.MD
@@ -1,25 +1,78 @@
+# LAS Converter
+
+`lasConverter.py` converts survey CSV files into LAS files using EPSG:4978 and groups/crops them by H3 cells.
+The main steps are:
+
+1. Prepare H3 Cells and Assign Surveys
+2. Convert CSV Surveys to LAS using EPSG:4978
+3. Group and Crop by H3 Cells
+
 # How to run
-## Activate python environment
+## 1. Prepare Files and Folders
+
+Before running the script, make sure your project folder has the following structure:
+
+```
+lasConverter.py
+surveys/
+    ├── survey_name_1/
+    │   ├── file1.csv
+    │   └── file2.csv
+    ├── survey_name_2/
+    │   └── ...
+    └── ...
+metadata/
+    ├── metadata.csv
+    └── level1_h3_cells.csv
+```
+
+### `surveys/`
+- Contains **one folder per survey**, using the **survey name** as the folder name.
+- Each survey folder contains the CSV files for that survey (arbitrarily named).
+
+### `metadata/metadata.csv`
+- Must include the columns: `fid`, `survey_name`, `survey_area`, `geom`, `epsg`.
+- Each row corresponds to a survey.
+
+### `metadata/level1_h3_cells.csv`
+- Must contain a single column `cell_id`, listing all relevant **level 1 H3 cells**.
+
-Create a python environment with
+## 2. Activate Python environment
+
+Create a Python virtual environment with
 ```zsh
 python -m venv <env_name>
 ```
-and run it with:
+Activate it:
-Windows: `<env_name>\Scripts\activate`
+* **Windows:**
+```zsh
+<env_name>\Scripts\activate
+```
-macOS/Linux:
-`source <env_name>/bin/activate`
+* **macOS/Linux:**
+```zsh
+source <env_name>/bin/activate
+```
-## Run script
-First ensure that all requirements are installed: `pip install -r requirements.txt`
+## 3. Install requirements
-Then run the command below to create the potree structure.
+Install the required Python packages:
+```zsh
+pip install -r requirements.txt
+```
+
+
+## 4. Run the script
+
+Run the converter:
 ```zsh
-python lasConverter.py <surveys_folder> <output_folder>
+python lasConverter.py
 ```
+
+The processed LAS files, cropped to level 2 H3 cells, will be saved in the `./h3_cells` folder.
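For concreteness, here is a single illustrative `metadata.csv` row that satisfies the layout the README describes. Every value is made up; the `geom` polygon is WKT in EPSG:4258 (lon/lat) as the later docs note, and `epsg` names the CRS of the survey's CSV points:

```text
fid,survey_name,survey_area,geom,epsg
1,survey_name_1,Example area,"POLYGON ((5.0 60.0, 5.2 60.0, 5.2 60.1, 5.0 60.1, 5.0 60.0))",25832
```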
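A note for reviewers on the cell-assignment step in `prepare_cells`: two details are easy to miss in the diff. Level-1 cells are subdivided with `h3.cell_to_children`, and `h3.cell_to_boundary` returns vertices as `(lat, lng)` pairs while shapely expects `(x, y) = (lng, lat)`, hence the swap in the comprehension. A minimal standalone sketch of just that step (assuming h3-py v4 and shapely 2.x; the seed point is arbitrary):

```python
import h3
from shapely.geometry import Polygon

# Derive a valid level-1 cell from an arbitrary point (roughly southern Norway),
# standing in for one entry of level1_h3_cells.csv.
parent = h3.latlng_to_cell(60.0, 5.0, 1)

# Subdivide into level-2 children, as prepare_cells() does for each listed cell.
for child in h3.cell_to_children(parent, 2):
    # cell_to_boundary() yields (lat, lng) pairs; shapely wants (x, y) = (lng, lat).
    ring = [(lng, lat) for lat, lng in h3.cell_to_boundary(child)]
    print(child, Polygon(ring).is_valid)
```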
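Similarly, EPSG:4978 is a geocentric (Earth-centred, Earth-fixed) CRS, which is why `CSV_2_LAS` feeds all three coordinates through the transformer instead of reprojecting x/y alone. A small sketch with a made-up UTM 32N point:

```python
from pyproj import Transformer

# EPSG:25832 (ETRS89 / UTM zone 32N, the fallback CRS) -> EPSG:4978 (geocentric).
# always_xy=True keeps the (easting, northing) axis order used by the CSV columns.
t = Transformer.from_crs("EPSG:25832", "EPSG:4978", always_xy=True)

# Made-up easting, northing and ellipsoidal height in metres, for illustration only.
x, y, z = t.transform(500000.0, 6700000.0, 0.0)
print(x, y, z)  # geocentric metres; magnitudes on the order of the Earth's radius
```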
From 33b6ad710d5c1e3e335d8d2a838627c75935c5a1 Mon Sep 17 00:00:00 2001 From: AdrianSolberg Date: Fri, 24 Oct 2025 19:40:33 +0200 Subject: [PATCH 4/7] docs(#3): add documentation for functions --- lasConverter.py | 64 +++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 51 insertions(+), 13 deletions(-) diff --git a/lasConverter.py b/lasConverter.py index 431633b..8d517e9 100644 --- a/lasConverter.py +++ b/lasConverter.py @@ -2,6 +2,7 @@ import json import shutil from pathlib import Path +from typing import DefaultDict, Dict, List, Tuple import dask.dataframe as dd import h3 import numpy as np @@ -9,9 +10,8 @@ import pandas as pd import pdal from pyproj import Transformer -from shapely import MultiPolygon, Polygon +from shapely import MultiPolygon, Polygon, wkt from shapely.ops import transform -from shapely import wkt BASE_PATH = Path(".") METADATA_PATH = BASE_PATH / "metadata" @@ -19,7 +19,19 @@ CONVERTED_PATH = BASE_PATH / "converted_surveys" H3_CELLS_PATH = BASE_PATH / "h3_cells" -def prepare_cells(metadata, level1_cells): +def prepare_cells(metadata: pd.DataFrame, level1_cells: List[str]) -> Tuple[DefaultDict[str, List[str]], Dict[str, Polygon]]: + """ + Prepares H3 level-2 cell polygons and maps surveys to the cells they intersect. + + Args: + metadata: DataFrame containing survey metadata with 'geom' WKT polygons. + level1_cells: List of level-1 H3 cell IDs to subdivide. + + Returns: + cell_to_surveys: Mapping of H3 cell IDs to survey FIDs that overlap them. + cell_polygons: Mapping of H3 cell IDs to their shapely Polygons (EPSG:4978). + """ + # Transformer for h3 cell polygons transformer_4326_to_4978 = Transformer.from_crs("EPSG:4326", "EPSG:4978", always_xy=True) @@ -54,7 +66,15 @@ def prepare_cells(metadata, level1_cells): return cell_to_surveys, cell_polygons -def CSV_2_LAS(metadata, chunk_size_bytes="64MB"): +def CSV_2_LAS(metadata: pd.DataFrame, chunk_size_bytes: str ="64MB") -> None: + """ + Converts survey CSV files into LAS format using EPSG:4978, one per survey. + + Args: + metadata: Survey metadata DataFrame with survey_name and epsg columns. + chunk_size_bytes: Dask partition size for reading CSVs. 
+ """ + CONVERTED_PATH.mkdir(parents=True, exist_ok=True) temp_folder = CONVERTED_PATH / "tmp" temp_folder.mkdir(exist_ok=True) @@ -87,20 +107,30 @@ def CSV_2_LAS(metadata, chunk_size_bytes="64MB"): for partition in ddf.to_delayed(): df = partition.compute() + + # Coordinate validation + if df.shape[1] < 3: + print(f"⚠️ File {csv_file} has fewer than 3 columns (x, y, z missing), skipping.") + continue + x, y, z = df.iloc[:,0].to_numpy(), df.iloc[:,1].to_numpy(), df.iloc[:,2].to_numpy() # Transform coordinates into EPSG:4978 x, y, z = transformer_to_4978.transform(x, y, z) # Encode 'Accepted' to 0 and 1 - accepted = ( - df.iloc[:, 3].astype(str) - .str.strip() - .str.lower() - .eq("accepted") - .astype(np.uint8) - .to_numpy() - ) + # Default to 'Accepted' if column is missing + if df.shape[1] > 3: + accepted = ( + df.iloc[:, 3].astype(str) + .str.strip() + .str.lower() + .eq("accepted") + .astype(np.uint8) + .to_numpy() + ) + else: + accepted = np.ones(len(df), dtype=np.uint8) # Default to 0 if TVU/THU missing tvu = df.iloc[:, 4].to_numpy(dtype=np.float32) if df.shape[1] > 4 else np.zeros(len(df), dtype=np.float32) @@ -152,7 +182,15 @@ def CSV_2_LAS(metadata, chunk_size_bytes="64MB"): temp_folder.rmdir() -def group_by_h3_cell(cell_to_surveys, cell_polygons): +def group_by_h3_cell(cell_to_surveys: DefaultDict[str, List[int]], cell_polygons: Dict[str, Polygon]): + """ + Groups survey LAS files by H3 level-2 cells and crops them using PDAL. + + Args: + cell_to_surveys: Mapping of H3 cell IDs to survey FIDs that overlap them. + cell_polygons: Mapping of H3 cell IDs to their shapely Polygons (EPSG:4978). + """ + H3_CELLS_PATH.mkdir(exist_ok=True) for c2, survey_fids in cell_to_surveys.items(): From 4d72a31863c972e377fd746245f418e91b276cb0 Mon Sep 17 00:00:00 2001 From: AdrianSolberg Date: Fri, 24 Oct 2025 19:41:21 +0200 Subject: [PATCH 5/7] chore(#3): update requirements.txt --- requirements.txt | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index a4eb13c..ac28ae5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,8 @@ pandas numpy laspy -pathlib dask[dataframe] -pyproj \ No newline at end of file +pyproj +h3 +pdal +shapely \ No newline at end of file From fc1d7072a1892ac2d06f0b1590d1071ae7a45d1c Mon Sep 17 00:00:00 2001 From: AdrianSolberg Date: Sun, 26 Oct 2025 13:09:17 +0100 Subject: [PATCH 6/7] style(#3): small coderabbit fixes --- README.MD | 5 +++-- lasConverter.py | 15 +++++++++------ 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/README.MD b/README.MD index fff1341..c892b34 100644 --- a/README.MD +++ b/README.MD @@ -13,7 +13,7 @@ The main steps are: Before running the script, make sure your project folder has the following structure: -``` +```text lasConverter.py surveys/ ├── survey_name_1/ @@ -32,7 +32,8 @@ metadata/ - Each survey folder contains the CSV files for that survey (arbitrarily named). ### `metadata/metadata.csv` -- Must include the columns: `fid`, `survey_name`, `survey_area`, `geom`, `epsg`. +- Must include the columns: `fid`, `survey_name`, `survey_area`, `geom`, `epsg`. +- Script assumes bounding polygons in `geom` use EPSG:4258. - Each row corresponds to a survey. 
### `metadata/level1_h3_cells.csv` diff --git a/lasConverter.py b/lasConverter.py index 8d517e9..18476bc 100644 --- a/lasConverter.py +++ b/lasConverter.py @@ -44,7 +44,7 @@ def prepare_cells(metadata: pd.DataFrame, level1_cells: List[str]) -> Tuple[Defa poly = Polygon([(lng, lat) for lat, lng in coords]) cell_polygons[c2] = transform(transformer_4326_to_4978.transform, poly) - # Transfrom survey polygons from EPSG:4258 to EPSG:4978 + # Transform survey polygons from EPSG:4258 to EPSG:4978 metadata["geom"] = metadata["geom"].apply(wkt.loads) transform_4258_to_4978 = Transformer.from_crs("EPSG:4258", "EPSG:4978", always_xy=True) metadata["geom_4978"] = metadata["geom"].apply(lambda g: transform(transform_4258_to_4978.transform, g)) @@ -143,7 +143,7 @@ def CSV_2_LAS(metadata: pd.DataFrame, chunk_size_bytes: str ="64MB") -> None: header = laspy.LasHeader(point_format=6, version="1.4") # Add extra dimensions - header.add_extra_dim(laspy.ExtraBytesParams(name="accepted", type=np.uint8)) + header.add_extra_dim(laspy.ExtraBytesParams(name="Accepted", type=np.uint8)) header.add_extra_dim(laspy.ExtraBytesParams(name="TVU", type=np.float32)) header.add_extra_dim(laspy.ExtraBytesParams(name="THU", type=np.float32)) @@ -153,7 +153,7 @@ def CSV_2_LAS(metadata: pd.DataFrame, chunk_size_bytes: str ="64MB") -> None: las.z = z # Set extra dimensions - las["accepted"] = accepted + las["Accepted"] = accepted las["TVU"] = tvu las["THU"] = thu las.point_source_id = ids @@ -166,6 +166,9 @@ def CSV_2_LAS(metadata: pd.DataFrame, chunk_size_bytes: str ="64MB") -> None: # Merging chunked LAS files into single LAS print("Merging LAS chunks into final LAS...") las_files = sorted(temp_folder.glob(f"{survey_name}_chunk_*.las")) + if not las_files: + print(f"⚠️ No LAS chunks produced for {survey_name}, skipping merge.") + continue first_las = laspy.read(str(las_files[0])) merged_header = first_las.header @@ -182,7 +185,7 @@ def CSV_2_LAS(metadata: pd.DataFrame, chunk_size_bytes: str ="64MB") -> None: temp_folder.rmdir() -def group_by_h3_cell(cell_to_surveys: DefaultDict[str, List[int]], cell_polygons: Dict[str, Polygon]): +def group_by_h3_cell(metadata: pd.DataFrame, cell_to_surveys: DefaultDict[str, List[str]], cell_polygons: Dict[str, Polygon]): """ Groups survey LAS files by H3 level-2 cells and crops them using PDAL. 
@@ -205,7 +208,7 @@ def group_by_h3_cell(cell_to_surveys: DefaultDict[str, List[int]], cell_polygons for fid in survey_fids: survey_name = metadata.loc[metadata['fid'] == fid, 'survey_name'].item() - src = Path("converted_surveys") / f"{survey_name}.las" + src = CONVERTED_PATH / f"{survey_name}.las" if not src.exists(): continue @@ -267,4 +270,4 @@ def group_by_h3_cell(cell_to_surveys: DefaultDict[str, List[int]], cell_polygons CSV_2_LAS(metadata) - group_by_h3_cell(cell_to_surveys, cell_polygons) \ No newline at end of file + group_by_h3_cell(metadata, cell_to_surveys, cell_polygons) \ No newline at end of file From 4393a81f9d893814e09e7b6e1fcbea2851e073df Mon Sep 17 00:00:00 2001 From: AdrianSolberg Date: Sun, 26 Oct 2025 13:29:41 +0100 Subject: [PATCH 7/7] refactor(#3): catch RuntimeError instead of Exception --- lasConverter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lasConverter.py b/lasConverter.py index 18476bc..fa6f051 100644 --- a/lasConverter.py +++ b/lasConverter.py @@ -257,7 +257,7 @@ def group_by_h3_cell(metadata: pd.DataFrame, cell_to_surveys: DefaultDict[str, L try: p.execute_streaming(chunk_size=500000) print(f"✅ Written H3 cell {c2}.") - except Exception as e: + except RuntimeError as e: print(f"⚠️ Pipeline failed for cell {c2}: {e}") finally: shutil.rmtree(temp_folder)
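With the series applied, one way to sanity-check the result is to read a produced cell file back with laspy and confirm that the extra dimensions and source IDs survived the PDAL round trip. A quick sketch, assuming at least one LAS was written under `./h3_cells` (the file name below is hypothetical):

```python
import numpy as np
import laspy

# Hypothetical cell file name; substitute any LAS written under ./h3_cells.
las = laspy.read("h3_cells/822d57fffffffff.las")

print(len(las.points), "points")
print("dimensions:", list(las.point_format.dimension_names))

# Extra dimensions written by CSV_2_LAS and declared to PDAL via extra_dims.
accepted = np.asarray(las["Accepted"])
print("accepted fraction:", accepted.mean())
print("TVU range:", float(np.min(las["TVU"])), float(np.max(las["TVU"])))

# point_source_id carries the metadata row index that CSV_2_LAS used as source_id.
print("surveys in this cell:", sorted(set(np.asarray(las.point_source_id))))
```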