diff --git a/.github/workflows/linting_and_testing.yml b/.github/workflows/linting_and_testing.yml
index dc2ed2fe..1f8f32a8 100644
--- a/.github/workflows/linting_and_testing.yml
+++ b/.github/workflows/linting_and_testing.yml
@@ -9,6 +9,10 @@ jobs:
     runs-on: ubuntu-24.04
     steps:
       - uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.12'
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
@@ -25,25 +29,39 @@ jobs:
     runs-on: ubuntu-24.04
     steps:
      - uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.12'
      - name: Install GDAL
        run: |
          sudo add-apt-repository ppa:ubuntugis/ubuntugis-unstable
          sudo apt-get update
          sudo apt-get install -y libgdal-dev gdal-bin
+      - name: Set up Miniconda
+        uses: conda-incubator/setup-miniconda@v2
+        with:
+          python-version: '3.12'
+          miniforge-version: latest
+          activate-environment: test-env
+          use-mamba: true
+          auto-activate-base: false
+      - name: Install dependencies
+        shell: bash -l {0}
        run: |
-          conda update -n base -c defaults conda -y
-          conda install -n base conda-libmamba-solver -c conda-forge -y
-          conda install -c conda-forge gdal -y
-          conda install -c conda-forge -c loop3d --file dependencies.txt -y
-          conda install pytest -y
+          mamba install python=3.12 -y
+          mamba install -c conda-forge gdal geopandas shapely networkx owslib beartype pytest scikit-learn -y
+          pip install map2model loopprojectfile==0.2.2
      - name: Install map2loop
+        shell: bash -l {0}
        run: |
          python -m pip install .
      - name: Run tests
+        shell: bash -l {0}
        run: |
-          pytest
-
+          python -c "import map2model" || echo "map2model not available, tests will use fallback mode"
+          pytest -v
\ No newline at end of file
diff --git a/docs/examples/plot_data_checks_on_fault.py b/docs/examples/plot_data_checks_on_fault.py
new file mode 100644
index 00000000..ff9cefb3
--- /dev/null
+++ b/docs/examples/plot_data_checks_on_fault.py
@@ -0,0 +1,403 @@
+# %%
+import geopandas as gpd
+import shapely.geometry
+from map2loop.mapdata import MapData
+from map2loop.data_checks import check_fault_fields_validity
+
+
+# Mock Datatype Enum
+class Datatype:
+    GEOLOGY = 0
+    STRUCTURE = 1
+    FAULT = 2
+
+
+# Mock Config class
+class MockConfig:
+    def __init__(self):
+        self.fault_config = {
+            "structtype_column": "FEATURE",
+            "fault_text": "Fault",
+            "objectid_column": "ID",
+        }
+
+
+# Mock data for the fault dataset
+valid_fault_data = {
+    "geometry": [
+        shapely.geometry.LineString([(0, 0), (1, 1)]),
+        shapely.geometry.MultiLineString([[(0, 0), (1, 1)], [(1, 1), (2, 2)]]),
+    ],
+    "FEATURE": ["Fault A", "Fault B"],
+    "ID": [1, 2],
+}
+
+# Create a GeoDataFrame for valid fault data
+valid_fault_gdf = gpd.GeoDataFrame(valid_fault_data, crs="EPSG:4326")
+
+# Instantiate the MapData class with the mock config and data
+map_data = MapData()
+map_data.config = MockConfig()
+
+# Test with valid fault data
+map_data.raw_data = [None] * len(Datatype.__dict__)
+map_data.raw_data[Datatype.FAULT] = valid_fault_gdf
+validity_check, message = check_fault_fields_validity(map_data)
+print(f"Validity Check: {validity_check}, Message: {message}")
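+
+# %%
+# A quick sanity check on the result above; this cell is an illustrative
+# addition and assumes the data_checks convention that the first return value
+# is False when validation passes and True when it fails.
+assert validity_check is False, "valid fault data should pass the check"
+
+# %%
+# Mock data with invalid geometry
+invalid_geometry_fault_data = valid_fault_data.copy()
+invalid_geometry_fault_data["geometry"] = [
+    shapely.geometry.LineString([(0, 0), (1, 1)]),
+    shapely.geometry.Polygon([(0, 0), (1, 1), (1, 0), (0, 1), (0, 0)]),  # Invalid geometry
+]
+invalid_geometry_fault_gdf = gpd.GeoDataFrame(invalid_geometry_fault_data, crs="EPSG:4326")
+
+# Test 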
with invalid geometry +map_data.raw_data[Datatype.FAULT] = invalid_geometry_fault_gdf +validity_check, message = check_fault_fields_validity(map_data) +print(f"Validity Check: {validity_check}, Message: {message}") + +# %% +# Mock data for the fault dataset +valid_fault_data = { + "geometry": [ + shapely.geometry.LineString([(0, 0), (1, 1)]), + shapely.geometry.MultiLineString([[(0, 0), (1, 1)], [(1, 1), (2, 2)]]), + ], + "FEATURE2": ["f A", "Fault B"], + "ID": [1, 2], +} + +# Create a GeoDataFrame for valid fault data +valid_fault_gdf = gpd.GeoDataFrame(valid_fault_data, crs="EPSG:4326") + +# Instantiate the MapData class with the mock config and data +map_data = MapData() +map_data.config = MockConfig() + +# Test with valid fault data +map_data.raw_data = [None] * len(Datatype.__dict__) +map_data.raw_data[Datatype.FAULT] = valid_fault_gdf +validity_check, message = check_fault_fields_validity(map_data) +print(f"Validity Check: {validity_check}, Message: {message}") + +# %% +# Mock data for the fault dataset +valid_fault_data = { + "geometry": [ + shapely.geometry.LineString([(0, 0), (1, 1)]), + shapely.geometry.MultiLineString([[(0, 0), (1, 1)], [(1, 1), (2, 2)]]), + ], + "FEATURE": [5, 2], + "ID": [1, 2], +} + +# Create a GeoDataFrame for valid fault data +valid_fault_gdf = gpd.GeoDataFrame(valid_fault_data, crs="EPSG:4326") + +# Instantiate the MapData class with the mock config and data +map_data = MapData() +map_data.config = MockConfig() + +# Test with valid fault data +map_data.raw_data = [None] * len(Datatype.__dict__) +map_data.raw_data[Datatype.FAULT] = valid_fault_gdf +validity_check, message = check_fault_fields_validity(map_data) +print(f"Validity Check: {validity_check}, Message: {message}") + +# %% +# Mock data for the fault dataset +valid_fault_data = { + "geometry": [ + shapely.geometry.LineString([(0, 0), (1, 1)]), + shapely.geometry.MultiLineString([[(0, 0), (1, 1)], [(1, 1), (2, 2)]]), + ], + "FEATURE": ["ult A", "faultB"], + "ID": [1, 2], +} + +# Create a GeoDataFrame for valid fault data +valid_fault_gdf = gpd.GeoDataFrame(valid_fault_data, crs="EPSG:4326") + +# Instantiate the MapData class with the mock config and data +map_data = MapData() +map_data.config = MockConfig() + +# Test with valid fault data +map_data.raw_data = [None] * len(Datatype.__dict__) +map_data.raw_data[Datatype.FAULT] = valid_fault_gdf +validity_check, message = check_fault_fields_validity(map_data) +print(f"Validity Check: {validity_check}, Message: {message}") + +# %% +# Mock data for the fault dataset +valid_fault_data = { + "geometry": [ + shapely.geometry.LineString([(0, 0), (1, 1)]), + shapely.geometry.MultiLineString([[(0, 0), (1, 1)], [(1, 1), (2, 2)]]), + ], + "FEATURE": ["Fault", "Fault"], + "NAME": ['Zuleika_1', 'Zuleika'], + "ID": [1, 2], +} + + +class MockConfig: + def __init__(self): + self.fault_config = { + "structtype_column": "FEATURE", + "fault_text": "Fault", + "objectid_column": "ID", + "name_column": 'tEST', + } + + +# Create a GeoDataFrame for valid fault data +valid_fault_gdf = gpd.GeoDataFrame(valid_fault_data, crs="EPSG:4326") + +# Instantiate the MapData class with the mock config and data +map_data = MapData() +map_data.config = MockConfig() + +# Test with valid fault data +map_data.raw_data = [None] * len(Datatype.__dict__) +map_data.raw_data[Datatype.FAULT] = valid_fault_gdf +validity_check, message = check_fault_fields_validity(map_data) +print(f"Validity Check: {validity_check}, Message: {message}") + +# %% +# Mock data for the fault dataset +valid_fault_data = 
{ + "geometry": [ + shapely.geometry.LineString([(0, 0), (1, 1)]), + shapely.geometry.MultiLineString([[(0, 0), (1, 1)], [(1, 1), (2, 2)]]), + ], + "FEATURE": ["Fault", "Fault"], + "NAME": [1, 'Zuleika'], + "ID": [1, 2], +} + + +class MockConfig: + def __init__(self): + self.fault_config = { + "structtype_column": "FEATURE", + "fault_text": "Fault", + "objectid_column": "ID", + "name_column": 'NAME', + } + + +# Create a GeoDataFrame for valid fault data +valid_fault_gdf = gpd.GeoDataFrame(valid_fault_data, crs="EPSG:4326") + +# Instantiate the MapData class with the mock config and data +map_data = MapData() +map_data.config = MockConfig() + +# Test with valid fault data +map_data.raw_data = [None] * len(Datatype.__dict__) +map_data.raw_data[Datatype.FAULT] = valid_fault_gdf +validity_check, message = check_fault_fields_validity(map_data) +print(f"Validity Check: {validity_check}, Message: {message}") + +# %% +# Mock data for the fault dataset +valid_fault_data = { + "geometry": [ + shapely.geometry.LineString([(0, 0), (1, 1)]), + shapely.geometry.MultiLineString([[(0, 0), (1, 1)], [(1, 1), (2, 2)]]), + ], + "FEATURE": ["Fault", "Fault"], + "NAME": [None, 'Zuleika'], + "ID": [1, 2], +} + + +class MockConfig: + def __init__(self): + self.fault_config = { + "structtype_column": "FEATURE", + "fault_text": "Fault", + "objectid_column": "ID", + "name_column": 'NAME', + } + + +# Create a GeoDataFrame for valid fault data +valid_fault_gdf = gpd.GeoDataFrame(valid_fault_data, crs="EPSG:4326") + +# Instantiate the MapData class with the mock config and data +map_data = MapData() +map_data.config = MockConfig() + +# Test with valid fault data +map_data.raw_data = [None] * len(Datatype.__dict__) +map_data.raw_data[Datatype.FAULT] = valid_fault_gdf +validity_check, message = check_fault_fields_validity(map_data) +print(f"Validity Check: {validity_check}, Message: {message}") + +# %% +# Mock data for the fault dataset +valid_fault_data = { + "geometry": [ + shapely.geometry.LineString([(0, 0), (1, 1)]), + shapely.geometry.MultiLineString([[(0, 0), (1, 1)], [(1, 1), (2, 2)]]), + ], + "FEATURE": ["Fault", "Fault"], + "NAME": ['Zuleika', 'Zuleika'], + "ID": [1, 2], + "DIP": [45, 50], +} + + +class MockConfig: + def __init__(self): + self.fault_config = { + "structtype_column": "FEATURE", + "fault_text": "Fault", + "objectid_column": "ID", + "name_column": 'NAME', + "dip_column": 'DIP2', + } + + +# Create a GeoDataFrame for valid fault data +valid_fault_gdf = gpd.GeoDataFrame(valid_fault_data, crs="EPSG:4326") + +# Instantiate the MapData class with the mock config and data +map_data = MapData() +map_data.config = MockConfig() + +# Test with valid fault data +map_data.raw_data = [None] * len(Datatype.__dict__) +map_data.raw_data[Datatype.FAULT] = valid_fault_gdf +validity_check, message = check_fault_fields_validity(map_data) +print(f"Validity Check: {validity_check}, Message: {message}") + +# %% +# Mock data for the fault dataset +valid_fault_data = { + "geometry": [ + shapely.geometry.LineString([(0, 0), (1, 1)]), + shapely.geometry.MultiLineString([[(0, 0), (1, 1)], [(1, 1), (2, 2)]]), + ], + "FEATURE": ["Fault", "Fault"], + "NAME": ['Zuleika', 'Zuleika'], + "ID": [1, 2], + "DIP": ['A', 50], +} + + +class MockConfig: + def __init__(self): + self.fault_config = { + "structtype_column": "FEATURE", + "fault_text": "Fault", + "objectid_column": "ID", + "name_column": 'NAME', + "dip_column": 'DIP', + } + + +# Create a GeoDataFrame for valid fault data +valid_fault_gdf = gpd.GeoDataFrame(valid_fault_data, 
crs="EPSG:4326") + +# Instantiate the MapData class with the mock config and data +map_data = MapData() +map_data.config = MockConfig() + +# Test with valid fault data +map_data.raw_data = [None] * len(Datatype.__dict__) +map_data.raw_data[Datatype.FAULT] = valid_fault_gdf +validity_check, message = check_fault_fields_validity(map_data) +print(f"Validity Check: {validity_check}, Message: {message}") + +# %% +# Mock data for the fault dataset +valid_fault_data = { + "geometry": [ + shapely.geometry.LineString([(0, 0), (1, 1)]), + shapely.geometry.MultiLineString([[(0, 0), (1, 1)], [(1, 1), (2, 2)]]), + ], + "FEATURE": ["Fault", "Fault"], + "NAME": ['Zuleika', 'Zuleika'], + "ID": [1, 2], + "DIP": [70, 50], + "STRIKE": [150, None], + 'DEC': ["north_east", "southt"], +} + + +class MockConfig: + def __init__(self): + self.fault_config = { + "structtype_column": "FEATURE", + "fault_text": "Fault", + "objectid_column": "ID", + "name_column": 'NAME', + "dip_column": 'DIP', + "dipdir_column": 'STRIKE', + "dip_estimate_column": 'DEC', + } + + +# Create a GeoDataFrame for valid fault data +valid_fault_gdf = gpd.GeoDataFrame(valid_fault_data, crs="EPSG:4326") + +# Instantiate the MapData class with the mock config and data +map_data = MapData() +map_data.config = MockConfig() + +# Test with valid fault data +map_data.raw_data = [None] * len(Datatype.__dict__) +map_data.raw_data[Datatype.FAULT] = valid_fault_gdf +validity_check, message = check_fault_fields_validity(map_data) +print(f"Validity Check: {validity_check}, Message: {message}") + +# %% +# Mock data for the fault dataset +fhg = None +valid_fault_data = { + "geometry": [ + shapely.geometry.LineString([(0, 0), (1, 1)]), + shapely.geometry.MultiLineString([[(0, 0), (1, 1)], [(1, 1), (2, 2)]]), + ], + "FEATURE": ["Fault", "Fault"], + "NAME": ['Zuleika', 'Zuleika'], + "ID": [fhg, 2], + "DIP": [70, 50], + "STRIKE": [150, None], + 'DEC': ["north_east", "southt"], +} + + +class MockConfig: + def __init__(self): + self.fault_config = { + "structtype_column": "FEATURE", + "fault_text": "Fault", + "objectid_column": "ID", + "name_column": 'NAME', + "dip_column": 'DIP', + "dipdir_column": 'STRIKE', + # "dip_estimate_column": 'DEC' + } + + +# Create a GeoDataFrame for valid fault data +valid_fault_gdf = gpd.GeoDataFrame(valid_fault_data, crs="EPSG:4326") + +# Instantiate the MapData class with the mock config and data +map_data = MapData() +map_data.config = MockConfig() + +# Test with valid fault data +map_data.raw_data = [None] * len(Datatype.__dict__) +map_data.raw_data[Datatype.FAULT] = valid_fault_gdf +validity_check, message = check_fault_fields_validity(map_data) +print(f"Validity Check: {validity_check}, Message: {message}") + +# %% diff --git a/docs/examples/plot_data_checks_on_structure.py b/docs/examples/plot_data_checks_on_structure.py new file mode 100644 index 00000000..a1dcb8bc --- /dev/null +++ b/docs/examples/plot_data_checks_on_structure.py @@ -0,0 +1,179 @@ +# %% +import geopandas as gpd +from map2loop.mapdata import MapData +from map2loop import data_checks +import shapely.geometry + + +# Mock Datatype Enum +class Datatype: + GEOLOGY = 0 + STRUCTURE = 1 + + +# Mock Config class +class MockConfig: + def __init__(self): + self.structure_config = { + "dipdir_column": "DIPDIR", + "dip_column": "DIP", + "description_column": "DESCRIPTION", + "overturned_column": "OVERTURNED", + "objectid_column": "ID", + } + + +# Mock data for the structure dataset +valid_structure_data = { + "geometry": [shapely.geometry.Point(0, 0), 
shapely.geometry.Point(1, 1)], + "DIPDIR": [45.0, 135.0], + "DIP": [30.0, 45.0], + "DESCRIPTION": ["Description1", "Description2"], + "OVERTURNED": ["Yes", "No"], + "ID": [1, 2], +} + +# Create a GeoDataFrame for valid structure data +valid_structure_gdf = gpd.GeoDataFrame(valid_structure_data, crs="EPSG:4326") + + +# Instantiate the MapData class with the mock config and data +map_data = MapData() +map_data.config = MockConfig() + +# Test with valid structure data +map_data.raw_data = [None] * len(Datatype.__dict__) +map_data.raw_data[Datatype.STRUCTURE] = valid_structure_gdf +validity_check, message = data_checks.check_structure_fields_validity(map_data) +print("Test 1 - Valid Data:") +print(f"Validity Check: {validity_check}, Message: {message}") + +# %% +# Mock data with invalid geometry +invalid_geometry_structure_data = valid_structure_data.copy() +invalid_geometry_structure_data["geometry"] = [ + shapely.geometry.Point(0, 0), + shapely.geometry.Polygon([(0, 0), (1, 1), (1, 0), (0, 1), (0, 0)]), # Invalid geometry +] +invalid_geometry_structure_gdf = gpd.GeoDataFrame(invalid_geometry_structure_data, crs="EPSG:4326") + + +# Test with invalid geometry +map_data.raw_data[Datatype.STRUCTURE] = invalid_geometry_structure_gdf +validity_check, message = data_checks.check_structure_fields_validity(map_data) +print("\nTest 3 - Invalid Geometry:") +print(f"Validity Check: {validity_check}, Message: {message}") + +# %% +# Mock data with missing required columns +missing_column_structure_data = valid_structure_data.copy() +del missing_column_structure_data["DIPDIR"] +missing_column_structure_gdf = gpd.GeoDataFrame(missing_column_structure_data, crs="EPSG:4326") + +# Test with missing required column +map_data.raw_data[Datatype.STRUCTURE] = missing_column_structure_gdf +validity_check, message = data_checks.check_structure_fields_validity(map_data) +print("\nTest 2 - Missing Required Column:") +print(f"Validity Check: {validity_check}, Message: {message}") + +# %% +# Mock data for the structure dataset +invalid_structure_data = { + "geometry": [shapely.geometry.Point(0, 0), shapely.geometry.Point(1, 1)], + "DIPDIR": ["A", "B"], + "DIP": [30.0, 45.0], + "DESCRIPTION": ["Description1", "Description2"], + "OVERTURNED": ["Yes", "No"], + "ID": [1, 2], +} + +map_data.raw_data[Datatype.STRUCTURE] = invalid_structure_data +validity_check, message = data_checks.check_structure_fields_validity(map_data) +print(f"Validity Check: {validity_check}, Message: {message}") + +# %% +# Mock data for the structure dataset +invalid_structure_data = gpd.GeoDataFrame( + { + "geometry": [shapely.geometry.Point(0, 0), shapely.geometry.Point(1, 1)], + "DIPDIR": ["A", "B"], + "DIP": [30.0, 45.0], + "DESCRIPTION": ["Description1", "Description2"], + "OVERTURNED": ["Yes", "No"], + "ID": [1, 2], + } +) + +map_data.raw_data[Datatype.STRUCTURE] = invalid_structure_data +validity_check, message = data_checks.check_structure_fields_validity(map_data) +print(f"Validity Check: {validity_check}, Message: {message}") + +# %% +# Mock data for the structure dataset +invalid_structure_data = gpd.GeoDataFrame( + { + "geometry": [shapely.geometry.Point(0, 0), shapely.geometry.Point(1, 1)], + "DIPDIR": [None, 3], + "DIP": [30.0, 45.0], + "DESCRIPTION": ["Description1", "Description2"], + "OVERTURNED": ["Yes", "No"], + "ID": [1, 2], + } +) + +map_data.raw_data[Datatype.STRUCTURE] = invalid_structure_data +validity_check, message = data_checks.check_structure_fields_validity(map_data) +print(f"Validity Check: {validity_check}, Message: 
{message}") + +# %% +# Mock data for the structure dataset +invalid_structure_data = gpd.GeoDataFrame( + { + "geometry": [shapely.geometry.Point(0, 0), shapely.geometry.Point(1, 1)], + "DIPDIR": [5, 3], + "DIP": [120.0, 45.0], + "DESCRIPTION": ["Description1", "Description2"], + "OVERTURNED": ["Yes", "No"], + "ID": [1, 2], + } +) + +map_data.raw_data[Datatype.STRUCTURE] = invalid_structure_data +validity_check, message = data_checks.check_structure_fields_validity(map_data) +print(f"Validity Check: {validity_check}, Message: {message}") + +# %% +# Mock data for the structure dataset +invalid_structure_data = gpd.GeoDataFrame( + { + "geometry": [shapely.geometry.Point(0, 0), shapely.geometry.Point(1, 1)], + "DIPDIR": [5, 3], + "DIP": [90, 45.0], + "DESCRIPTION": [None, "Description2"], + "OVERTURNED": ["Yes", "No"], + "ID": [1, 2], + } +) + +map_data.raw_data[Datatype.STRUCTURE] = invalid_structure_data +validity_check, message = data_checks.check_structure_fields_validity(map_data) +print(f"Validity Check: {validity_check}, Message: {message}") + +# %% +# Mock data for the structure dataset +invalid_structure_data = gpd.GeoDataFrame( + { + "geometry": [shapely.geometry.Point(0, 0), shapely.geometry.Point(1, 1)], + "DIPDIR": [5, 3], + "DIP": [90, 45.0], + "DESCRIPTION": [None, "Description2"], + "OVERTURNED": ["Yes", "No"], + "ID": [1, 1], + } +) + +map_data.raw_data[Datatype.STRUCTURE] = invalid_structure_data +validity_check, message = data_checks.check_structure_fields_validity(map_data) +print(f"Validity Check: {validity_check}, Message: {message}") + +# %% diff --git a/docs/examples/plot_m2l_data_checks_on_datatype_geology.py b/docs/examples/plot_m2l_data_checks_on_datatype_geology.py new file mode 100644 index 00000000..82203112 --- /dev/null +++ b/docs/examples/plot_m2l_data_checks_on_datatype_geology.py @@ -0,0 +1,341 @@ +# %% +import geopandas as gpd +import shapely.geometry +from map2loop import data_checks + + +# Mock Datatype Enum +class Datatype: + GEOLOGY = 0 + + +# Mock Config class +class MockConfig: + def __init__(self): + self.geology_config = { + "unitname_column": "UNITNAME", + "alt_unitname_column": "CODE", + "group_column": "GROUP", + "supergroup_column": "SUPERGROUP", + "description_column": "DESCRIPTION", + "rocktype_column": "ROCKTYPE1", + "alt_rocktype_column": "ROCKTYPE2", + "minage_column": "MIN_AGE", + "maxage_column": "MAX_AGE", + "objectid_column": "ID", + "ignore_lithology_codes": [], + } + + +# Mock data for the geology dataset +geology_data = { + "geometry": [shapely.geometry.Polygon([(0, 0), (1, 0), (1, 1), (0, 1)])], + "UNITNAME": ["Sandstone"], + "CODE": ["SST"], + "GROUP": ["Sedimentary"], + "SUPERGROUP": ["Mesozoic"], + "DESCRIPTION": ["A type of sandstone"], + "ROCKTYPE1": ["Clastic"], + "ROCKTYPE2": ["Quartz"], + "MIN_AGE": [150.0], + "MAX_AGE": [200.0], + "ID": [1], +} + +# Create a GeoDataFrame for geology +geology_gdf = gpd.GeoDataFrame(geology_data, crs="EPSG:4326") + +# Ensure that all string columns are of dtype str +for col in ["UNITNAME", "CODE", "GROUP", "SUPERGROUP", "DESCRIPTION", "ROCKTYPE1", "ROCKTYPE2"]: + geology_gdf[col] = geology_gdf[col].astype(str) + +from map2loop.mapdata import MapData + +map_data = MapData() +map_data.config = MockConfig() +map_data.raw_data = [None] * len(Datatype.__dict__) +map_data.raw_data[Datatype.GEOLOGY] = geology_gdf + +# Test the check_geology_fields_validity function +validity_check, message = data_checks.check_geology_fields_validity(map_data) +print(f"Validity Check: {validity_check}, Message: 
{message}") + +########### should run with no issues + +# %% +######## invalid geometries + +invalid_geometry = shapely.geometry.Polygon( + [(0, 0), (1, 1), (1, 0), (0, 1), (0, 0)] # This creates a self-intersecting polygon (bowtie) +) + + +geology_data = { + "geometry": [invalid_geometry], + "UNITNAME": ["Sandstone"], + "CODE": ["SST"], + "GROUP": ["Sedimentary"], + "SUPERGROUP": ["Mesozoic"], + "DESCRIPTION": ["A type of sandstone"], + "ROCKTYPE1": ["Clastic"], + "ROCKTYPE2": ["Quartz"], + "MIN_AGE": [150.0], + "MAX_AGE": [200.0], + "ID": [1], +} +# Create a GeoDataFrame for geology +geology_gdf = gpd.GeoDataFrame(geology_data, crs="EPSG:4326") + +# Ensure that all string columns are of dtype str +for col in ["UNITNAME", "CODE", "GROUP", "SUPERGROUP", "DESCRIPTION", "ROCKTYPE1", "ROCKTYPE2"]: + geology_gdf[col] = geology_gdf[col].astype(str) + +from map2loop.mapdata import MapData + +map_data = MapData() +map_data.config = MockConfig() +map_data.raw_data = [None] * len(Datatype.__dict__) +map_data.raw_data[Datatype.GEOLOGY] = geology_gdf + +# Test the check_geology_fields_validity function +validity_check, message = data_checks.check_geology_fields_validity(map_data) +print(f"Validity Check: {validity_check}, Message: {message}") + +# %% +geology_data = { + "geometry": [shapely.geometry.Polygon([(0, 0), (1, 0), (1, 1), (0, 1)])], + "UNITNAME": ["Sandstone"], + # "CODE": ["SST"], ########################## + "GROUP": ["Sedimentary"], + "SUPERGROUP": ["Mesozoic"], + "DESCRIPTION": ["A type of sandstone"], + "ROCKTYPE1": ["Clastic"], + "ROCKTYPE2": ["Quartz"], + "MIN_AGE": [150.0], + "MAX_AGE": [200.0], + "ID": [1], +} +# Create a GeoDataFrame for geology +geology_gdf = gpd.GeoDataFrame(geology_data, crs="EPSG:4326") + +# Ensure that all string columns are of dtype str +# for col in ["UNITNAME", "CODE", "GROUP", "SUPERGROUP", "DESCRIPTION", "ROCKTYPE1", "ROCKTYPE2"]: +# geology_gdf[col] = geology_gdf[col].astype(str) + +from map2loop.mapdata import MapData + +map_data = MapData() +map_data.config = MockConfig() +map_data.raw_data = [None] * len(Datatype.__dict__) +map_data.raw_data[Datatype.GEOLOGY] = geology_gdf + +# Test the check_geology_fields_validity function +validity_check, message = data_checks.check_geology_fields_validity(map_data) +print(f"Validity Check: {validity_check}, Message: {message}") + +# %% +geology_data = { + "geometry": [shapely.geometry.Polygon([(0, 0), (1, 0), (1, 1), (0, 1)])], + "UNITNAME": ["Sandstone"], + "CODE": [2], ################################ + "GROUP": ["Sedimentary"], + "SUPERGROUP": ["Mesozoic"], + "DESCRIPTION": ["A type of sandstone"], + "ROCKTYPE1": ["Clastic"], + "ROCKTYPE2": ["Quartz"], + "MIN_AGE": [150.0], + "MAX_AGE": [200.0], + "ID": [1], +} +# Create a GeoDataFrame for geology +geology_gdf = gpd.GeoDataFrame(geology_data, crs="EPSG:4326") + +# Ensure that all string columns are of dtype str +# for col in ["UNITNAME", "CODE", "GROUP", "SUPERGROUP", "DESCRIPTION", "ROCKTYPE1", "ROCKTYPE2"]: +# geology_gdf[col] = geology_gdf[col].astype(str) + +from map2loop.mapdata import MapData + +map_data = MapData() +map_data.config = MockConfig() +map_data.raw_data = [None] * len(Datatype.__dict__) +map_data.raw_data[Datatype.GEOLOGY] = geology_gdf + + +validity_check, message = data_checks.check_geology_fields_validity(map_data) +print(f"Validity Check: {validity_check}, Message: {message}") + +# %% + +geology_data = { + "geometry": [shapely.geometry.Polygon([(0, 0), (1, 0), (1, 1), (0, 1)])], + "UNITNAME": [''], 
################################################### + "CODE": ['SSt'], + "GROUP": ["Sedimentary"], + "SUPERGROUP": ["Mesozoic"], + "DESCRIPTION": ["A type of sandstone"], + "ROCKTYPE1": ["Clastic"], + "ROCKTYPE2": ["Quartz"], + "MIN_AGE": [150.0], + "MAX_AGE": [200.0], + "ID": [1], +} +# Create a GeoDataFrame for geology +geology_gdf = gpd.GeoDataFrame(geology_data, crs="EPSG:4326") + +# Ensure that all string columns are of dtype str +# for col in ["UNITNAME", "CODE", "GROUP", "SUPERGROUP", "DESCRIPTION", "ROCKTYPE1", "ROCKTYPE2"]: +# geology_gdf[col] = geology_gdf[col].astype(str) + +from map2loop.mapdata import MapData + +map_data = MapData() +map_data.config = MockConfig() +map_data.raw_data = [None] * len(Datatype.__dict__) +map_data.raw_data[Datatype.GEOLOGY] = geology_gdf + +validity_check, message = data_checks.check_geology_fields_validity(map_data) +print(f"Validity Check: {validity_check}, Message: {message}") + +# %% + +geology_data = { + "geometry": [shapely.geometry.Polygon([(0, 0), (1, 0), (1, 1), (0, 1)])], + "UNITNAME": ['fr'], + "CODE": ['SSt'], + "GROUP": ["Sedimentary"], + "SUPERGROUP": ["Mesozoic"], + "DESCRIPTION": ["A"], + "ROCKTYPE1": ["A"], + "ROCKTYPE2": ["Quartz"], + "MIN_AGE": ["150.0"], + "MAX_AGE": [200.0], + "ID": [1], +} +# Create a GeoDataFrame for geology +geology_gdf = gpd.GeoDataFrame(geology_data, crs="EPSG:4326") + +# Ensure that all string columns are of dtype str +# for col in ["UNITNAME", "CODE", "GROUP", "SUPERGROUP", "DESCRIPTION", "ROCKTYPE1", "ROCKTYPE2"]: +# geology_gdf[col] = geology_gdf[col].astype(str) + +from map2loop.mapdata import MapData + +map_data = MapData() +map_data.config = MockConfig() +map_data.raw_data = [None] * len(Datatype.__dict__) +map_data.raw_data[Datatype.GEOLOGY] = geology_gdf + + +validity_check, message = data_checks.check_geology_fields_validity(map_data) +print(f"Validity Check: {validity_check}, Message: {message}") + +# %% +message + +# %% + +geology_data = { + "geometry": [ + shapely.geometry.Polygon([(0, 0), (1, 0), (1, 1), (0, 1)]), + shapely.geometry.Polygon([(0, 0), (10, 0), (1, 1), (0, 10)]), + ], + "UNITNAME": ['fr', 'df'], + "CODE": ['SSt', 'fgh'], + "GROUP": ["Sedimentary", "ign"], + "SUPERGROUP": ["Mesozoic", "arc"], + "DESCRIPTION": ["A", "B"], + "ROCKTYPE1": ["A", "B"], + "ROCKTYPE2": ["Quartz", "FDs"], + "MIN_AGE": [150.0, 200], + "MAX_AGE": [200.0, 250], + "ID": [1, 1], +} +# Create a GeoDataFrame for geology +geology_gdf = gpd.GeoDataFrame(geology_data, crs="EPSG:4326") + +# Ensure that all string columns are of dtype str +# for col in ["UNITNAME", "CODE", "GROUP", "SUPERGROUP", "DESCRIPTION", "ROCKTYPE1", "ROCKTYPE2"]: +# geology_gdf[col] = geology_gdf[col].astype(str) + +from map2loop.mapdata import MapData + +map_data = MapData() +map_data.config = MockConfig() +map_data.raw_data = [None] * len(Datatype.__dict__) +map_data.raw_data[Datatype.GEOLOGY] = geology_gdf + +validity_check, message = data_checks.check_geology_fields_validity(map_data) +print(f"Validity Check: {validity_check}, Message: {message}") + +# %% + +geology_data = { + "geometry": [ + shapely.geometry.Polygon([(0, 0), (1, 0), (1, 1), (0, 1)]), + shapely.geometry.Polygon([(0, 0), (10, 0), (1, 1), (0, 10)]), + ], + "UNITNAME": ['fr', 'df'], + "CODE": ['SSt', 'fgh'], + "GROUP": ["Sedimentary", "ign"], + "SUPERGROUP": ["Mesozoic", "arc"], + "DESCRIPTION": ["A", "B"], + "ROCKTYPE1": ["A", None], + "ROCKTYPE2": ["Quartz", "FDs"], + "MIN_AGE": [150.0, 200], + "MAX_AGE": [200.0, 250], + "ID": [1, None], +} +# Create a GeoDataFrame for 
geology +geology_gdf = gpd.GeoDataFrame(geology_data, crs="EPSG:4326") + +# Ensure that all string columns are of dtype str +# for col in ["UNITNAME", "CODE", "GROUP", "SUPERGROUP", "DESCRIPTION", "ROCKTYPE1", "ROCKTYPE2"]: +# geology_gdf[col] = geology_gdf[col].astype(str) + +from map2loop.mapdata import MapData + +map_data = MapData() +map_data.config = MockConfig() +map_data.raw_data = [None] * len(Datatype.__dict__) +map_data.raw_data[Datatype.GEOLOGY] = geology_gdf + +validity_check, message = data_checks.check_geology_fields_validity(map_data) +print(f"Validity Check: {validity_check}, Message: {message}") + +# %% + +geology_data = { + "geometry": [ + shapely.geometry.Polygon([(0, 0), (1, 0), (1, 1), (0, 1)]), + shapely.geometry.Polygon([(0, 0), (10, 0), (1, 1), (0, 10)]), + ], + "UNITNAME": ['fr', None], + "CODE": ['SSt', 'fgh'], + "GROUP": ["Sedimentary", "ign"], + "SUPERGROUP": ["Mesozoic", "arc"], + "DESCRIPTION": ["A", "B"], + "ROCKTYPE1": ["A", None], + "ROCKTYPE2": ["Quartz", "FDs"], + "MIN_AGE": [150.0, 200], + "MAX_AGE": [200.0, 250], + "ID": [1, 4], +} +# Create a GeoDataFrame for geology +geology_gdf = gpd.GeoDataFrame(geology_data, crs="EPSG:4326") + +# Ensure that all string columns are of dtype str +# for col in ["UNITNAME", "CODE", "GROUP", "SUPERGROUP", "DESCRIPTION", "ROCKTYPE1", "ROCKTYPE2"]: +# geology_gdf[col] = geology_gdf[col].astype(str) + +from map2loop.mapdata import MapData + +map_data = MapData() +map_data.config = MockConfig() +map_data.raw_data = [None] * len(Datatype.__dict__) +map_data.raw_data[Datatype.GEOLOGY] = geology_gdf + +validity_check, message = data_checks.check_geology_fields_validity(map_data) +print(f"Validity Check: {validity_check}, Message: {message}") + +# %% diff --git a/map2loop/config.py b/map2loop/config.py index 48d017d3..edff2e20 100644 --- a/map2loop/config.py +++ b/map2loop/config.py @@ -100,11 +100,6 @@ def update_from_dictionary(self, dictionary: dict, lower: bool = True): Args: dictionary (dict): The dictionary to update from """ - # make sure dictionary doesn't contain legacy keys - self.check_for_legacy_keys(dictionary) - - # make sure it has the minimum requirements - self.validate_config_dictionary(dictionary) if "structure" in dictionary: self.structure_config.update(dictionary["structure"]) @@ -214,59 +209,4 @@ def update_from_file( else: err_string += "Please check the file exists and is accessible then\n" err_string += "Check the contents for mismatched quotes or brackets!" - raise Exception(err_string) - - @beartype.beartype - def validate_config_dictionary(self, config_dict: dict) -> None: - """ - Validate the structure and keys of the configuration dictionary. - - Args: - config_dict (dict): The config dictionary to validate. - - Raises: - ValueError: If the dictionary does not meet the minimum requirements for ma2p2loop. - """ - required_keys = { - "structure": {"dipdir_column", "dip_column"}, - "geology": {"unitname_column", "alt_unitname_column"}, - } - - for section, keys in required_keys.items(): - if section not in config_dict: - logger.error(f"Missing required section '{section}' in config dictionary.") - raise ValueError(f"Missing required section '{section}' in config dictionary.") - - for key in keys: - if key not in config_dict[section]: - logger.error( - f"Missing required key '{key}' for '{section}' section of the config dictionary." - ) - raise ValueError( - f"Missing required key '{key}' for '{section}' section of the config dictionary." 
-                )
-
-    @beartype.beartype
-    def check_for_legacy_keys(self, config_dict: dict) -> None:
-
-        legacy_keys = {
-            "otype", "dd", "d", "sf", "bedding", "bo", "btype", "gi", "c", "u",
-            "g", "g2", "ds", "min", "max", "r1", "r2", "sill", "intrusive", "volcanic",
-            "f", "fdipnull", "fdipdip_flag", "fdipdir", "fdip", "fdipest",
-            "fdipest_vals", "n", "ff", "t", "syn"
-        }
-
-        # Recursively search for keys in the dictionary
-        def check_keys(d: dict, parent_key=""):
-            for key, value in d.items():
-                if key in legacy_keys:
-                    logger.error(
-                        f"Legacy key found in config - '{key}' at '{parent_key + key}'. Please use the new config format. Use map2loop.utils.update_from_legacy_file to convert between the formats if needed"
-                    )
-                    raise ValueError(
-                        f"Legacy key found in config - '{key}' at '{parent_key + key}'. Please use the new config format. Use map2loop.utils.update_from_legacy_file to convert between the formats if needed"
-                    )
-                if isinstance(value, dict):
-                    check_keys(value, parent_key=f"{parent_key}{key}.")
-
-        check_keys(config_dict)
\ No newline at end of file
+        raise Exception(err_string)
\ No newline at end of file
diff --git a/map2loop/data_checks.py b/map2loop/data_checks.py
new file mode 100644
index 00000000..6c7af1db
--- /dev/null
+++ b/map2loop/data_checks.py
@@ -0,0 +1,880 @@
+# internal imports
+from .m2l_enums import Datatype
+
+# external imports
+import beartype
+from beartype.typing import Tuple, Optional, List, Dict, Type, Union
+import geopandas
+import shapely
+import pandas
+
+from .logging import getLogger
+logger = getLogger(__name__)
+
+@beartype.beartype
+def check_geology_fields_validity(mapdata) -> tuple[bool, str]:
+    # TODO (AR) - add check for gaps in geology data (inspo here: https://medium.com/@achm.firmansyah/an-approach-for-checking-overlaps-and-gaps-in-polygons-using-geopandas-ebd6606e7f70 )
+    """
+    Validate the columns in the GEOLOGY GeoDataFrame.
+
+    Several checks to ensure that the geology data:
+    - Is loaded and valid.
+    - Contains required columns with appropriate types and no missing or blank values.
+    - Has optional columns with valid types, if present.
+    - Does not contain duplicate IDs.
+    - Has a geometry column with valid geometries.
+
+    Returns:
+        Tuple[bool, str]: A tuple where the first value indicates if validation failed (True = failed),
+        and the second value provides a message describing the issue.
+    """
+    # Check if geology data is loaded and valid
+    if (
+        mapdata.raw_data[Datatype.GEOLOGY] is None
+        or type(mapdata.raw_data[Datatype.GEOLOGY]) is not geopandas.GeoDataFrame
+    ):
+        logger.error("GEOLOGY data is not loaded or is not a valid GeoDataFrame")
+        return (True, "GEOLOGY data is not loaded or is not a valid GeoDataFrame")
+
+    geology_data = mapdata.raw_data[Datatype.GEOLOGY]
+    config = mapdata.config.geology_config
+
+    # 1. Validate geometry
+    failed, message = validate_geometry(
+        geodata=geology_data,
+        expected_geom_types=[shapely.Polygon, shapely.MultiPolygon],
+        datatype_name="GEOLOGY"
+    )
+    if failed:
+        return (failed, message)
+
+    # 2. Check required columns in geology
+    required_columns = ["unitname_column", "alt_unitname_column"]
+
+    failed, message = validate_required_columns(
+        geodata=geology_data,
+        config=config,
+        required_columns=required_columns,
+        expected_type=str,
+        check_blank=True,
+        datatype_name="GEOLOGY"
+    )
+    if failed:
+        return (failed, message)
+
+    # 3. Check optional string columns (only emit warnings for optional columns)
+    optional_string_columns = [
+        "group_column", "supergroup_column", "description_column",
+        "rocktype_column", "alt_rocktype_column",
+    ]
+
+    string_warnings = validate_optional_columns(
+        geodata=geology_data,
+        config=config,
+        optional_columns=optional_string_columns,
+        expected_type=str,
+        check_blank=True,
+        datatype_name="GEOLOGY"
+    )
+    for warning in string_warnings:
+        logger.warning(warning)
+
+    # 4. Check optional numeric columns (only emit warnings for optional columns)
+    optional_numeric_columns = ["minage_column", "maxage_column"]
+    numeric_warnings = validate_optional_columns(
+        geodata=geology_data,
+        config=config,
+        optional_columns=optional_numeric_columns,
+        expected_type=(int, float),
+        check_blank=False,
+        datatype_name="GEOLOGY"
+    )
+    for warning in numeric_warnings:
+        logger.warning(warning)
+
+    # 5. Check the ID column
+    if "objectid_column" in config:
+        failed, message = validate_id_column(
+            geodata=geology_data,
+            config=config,
+            id_config_key="objectid_column",
+            geodata_name="GEOLOGY")
+
+        if failed:
+            return (failed, message)
+
+    logger.info("Geology fields validation passed.")
+    return (False, "")
+
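+# An illustrative, self-contained sketch of how the checker above can be
+# exercised. It mirrors the mock pattern used in the docs/examples scripts;
+# the _Demo* shims and column names are assumptions for illustration only,
+# not part of the map2loop API.
+def _demo_geology_fields_check() -> Tuple[bool, str]:
+    class _DemoConfig:
+        geology_config = {"unitname_column": "UNITNAME", "alt_unitname_column": "CODE"}
+
+    class _DemoMapData:
+        def __init__(self):
+            self.config = _DemoConfig()
+            self.raw_data = [None] * len(Datatype)
+
+    gdf = geopandas.GeoDataFrame(
+        {
+            "geometry": [shapely.Polygon([(0, 0), (1, 0), (1, 1), (0, 1)])],
+            "UNITNAME": ["Sandstone"],
+            "CODE": ["SST"],
+        },
+        crs="EPSG:4326",
+    )
+    demo = _DemoMapData()
+    demo.raw_data[Datatype.GEOLOGY] = gdf
+    # Expected: (False, "") -- False means no validation failure was found.
+    return check_geology_fields_validity(demo)
+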
+@beartype.beartype
+def check_structure_fields_validity(mapdata) -> Tuple[bool, str]:
+    """
+    Validate the structure data for required and optional fields.
+
+    Performs the following checks:
+    - Ensures the structure map is loaded, valid, and contains at least two structures.
+    - Validates the geometry column.
+    - Checks required numeric columns (`dip_column`, `dipdir_column`) for existence, dtype, range, and null values.
+    - Checks optional string columns (`description_column`, `overturned_column`) for type and null/empty values.
+    - Validates the optional numeric `objectid_column` for type, null values, and duplicates.
+
+    Returns:
+        Tuple[bool, str]: A tuple where the first value indicates if validation failed (True = failed),
+        and the second value provides a message describing the issue.
+    """
+
+    # Check type and size of loaded structure map
+    if (
+        mapdata.raw_data[Datatype.STRUCTURE] is None
+        or type(mapdata.raw_data[Datatype.STRUCTURE]) is not geopandas.GeoDataFrame
+    ):
+        logger.warning("Structure map is not loaded or valid")
+        return (True, "Structure map is not loaded or valid")
+
+    if len(mapdata.raw_data[Datatype.STRUCTURE]) < 2:
+        logger.warning(
+            "Datatype STRUCTURE: map does not have enough orientations to complete calculations (need at least 2); projection may be inconsistent"
+        )
+
+    structure_data = mapdata.raw_data[Datatype.STRUCTURE]
+    config = mapdata.config.structure_config
+
+    # 1. Validate geometry
+    failed, message = validate_geometry(
+        geodata=structure_data,
+        expected_geom_types=[shapely.Point, shapely.MultiPoint],
+        datatype_name="STRUCTURE"
+    )
+    if failed:
+        return (failed, message)
+
+    # 2. Check required columns in structure (numeric dip & dip direction)
+    required_columns = ["dipdir_column", "dip_column"]
+    failed, message = validate_required_columns(
+        geodata=structure_data,
+        config=config,
+        required_columns=required_columns,
+        expected_type=(int, float),
+        check_blank=False,
+        datatype_name="STRUCTURE"
+    )
+    if failed:
+        return (failed, message)
+
+    # 3. Validate dip and dip direction value ranges
+    dip_columns = ["dip_column", "dipdir_column"]
+    dip_validation_failed, dip_message = validate_dip_columns(
+        geodata=structure_data,
+        config=config,
+        dip_columns=dip_columns,
+        datatype_name="STRUCTURE",
+        allow_nulls=False  # Dip and dipdir cannot have nulls in structure data
+    )
+    if dip_validation_failed:
+        logger.warning(dip_message)
+
+    # 4. Check optional columns (only emit warnings for optional columns)
+    optional_string_columns = ["description_column", "overturned_column"]
+    string_warnings = validate_optional_columns(
+        geodata=structure_data,
+        config=config,
+        optional_columns=optional_string_columns,
+        expected_type=str,
+        check_blank=True,
+        datatype_name="STRUCTURE"
+    )
+    for warning in string_warnings:
+        logger.warning(warning)
+
+    # 5. Check the ID column
+    if "objectid_column" in config:
+        failed, id_message = validate_id_column(
+            geodata=structure_data,
+            config=config,
+            id_config_key="objectid_column",
+            geodata_name="STRUCTURE")
+
+        if failed:
+            return (failed, id_message)
+
+    return (False, "")
+
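+# A companion sketch for the structure checker, under the same assumptions as
+# the geology demo above (illustrative only; not part of the map2loop API).
+def _demo_structure_fields_check() -> Tuple[bool, str]:
+    class _DemoConfig:
+        structure_config = {"dipdir_column": "DIPDIR", "dip_column": "DIP"}
+
+    class _DemoMapData:
+        def __init__(self):
+            self.config = _DemoConfig()
+            self.raw_data = [None] * len(Datatype)
+
+    gdf = geopandas.GeoDataFrame(
+        {
+            "geometry": [shapely.Point(0, 0), shapely.Point(1, 1)],
+            "DIPDIR": [45.0, 135.0],
+            "DIP": [30.0, 45.0],
+        },
+        crs="EPSG:4326",
+    )
+    demo = _DemoMapData()
+    demo.raw_data[Datatype.STRUCTURE] = gdf
+    # Expected: (False, "") -- two in-range, non-null orientations pass.
+    return check_structure_fields_validity(demo)
+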
+@beartype.beartype
+def check_fault_fields_validity(mapdata) -> Tuple[bool, str]:
+    """
+    Validate the FAULT data for required and optional fields.
+
+    Returns:
+        Tuple[bool, str]: A tuple where the first value indicates if validation failed (True = failed),
+        and the second value provides a message describing the issue.
+    """
+    # Check type of loaded fault map
+    if (
+        mapdata.raw_data[Datatype.FAULT] is None
+        or type(mapdata.raw_data[Datatype.FAULT]) is not geopandas.GeoDataFrame
+    ):
+        logger.warning("Fault map is not loaded or valid")
+        return (True, "Fault map is not loaded or valid")
+
+    fault_data = mapdata.raw_data[Datatype.FAULT]
+    config = mapdata.config.fault_config
+
+    # Validate geometry
+    failed, message = validate_geometry(
+        geodata=fault_data,
+        expected_geom_types=[shapely.LineString, shapely.MultiLineString],
+        datatype_name="FAULT"
+    )
+    if failed:
+        return (failed, message)
+
+    # Check the structtype column and fault text, if present
+    text_keys = {
+        "fault_text": "fault_text"
+    }
+    structtype_validation_failed, structtype_message = validate_structtype_column(
+        geodata=fault_data,
+        config=config,
+        datatype_name="FAULT",
+        required=True,  # Assuming structtype_column is required in FAULT
+        text_keys=text_keys
+    )
+    if structtype_validation_failed:
+        return (structtype_validation_failed, structtype_message)
+
+    # Checks on the name column
+    name_column = config.get("name_column")
+    if name_column and name_column not in fault_data.columns:
+        logger.warning(
+            f"Datatype FAULT: Column '{name_column}' (config key 'name_column') is missing from the fault data. "
+            "Please ensure it is present, or remove that key from the config."
+        )
+
+    if name_column and name_column in fault_data.columns:
+        # Check if the column contains non-string values
+        if not fault_data[name_column].apply(lambda x: isinstance(x, str) or pandas.isnull(x)).all():
+            logger.error(
+                f"Datatype FAULT: Column '{name_column}' (config key 'name_column') contains non-string values. Ensure all values are valid strings."
+ ) + return (True, f"Datatype FAULT: Column '{name_column}' (config key 'name_column') contains non-string values.") + + # Check for NaN values + if fault_data[name_column].isnull().any(): + logger.warning( + f"Datatype FAULT: Column '{name_column}' (config key 'name_column') contains NaN or empty values. This may affect processing." + ) + + # Check for duplicate values + if fault_data[name_column].duplicated().any(): + logger.warning( + f"Datatype FAULT: Column '{name_column}' contains duplicate values. This may affect processing." + ) + + # # dips & strikes + dip_columns = ["dip_column", "dipdir_column"] + dip_validation_failed, dip_message = validate_dip_columns( + geodata=fault_data, + config=config, + dip_columns=dip_columns, + datatype_name="FAULT", + allow_nulls=True # Dip fields can be empty + ) + if dip_validation_failed: + logger.warning(dip_message) + + # dip estimates + dip_estimate_column = config.get("dip_estimate_column") + valid_directions = [ + "north_east", "south_east", "south_west", "north_west", + "north", "east", "south", "west" + ] + + if dip_estimate_column: + if dip_estimate_column in fault_data.columns: + # Ensure all values are in the set of valid directions or are NaN + invalid_values = fault_data[dip_estimate_column][ + ~fault_data[dip_estimate_column].apply(lambda x: x in valid_directions or pandas.isnull(x)) + ] + + if not invalid_values.empty: + logger.error( + f"Datatype FAULT: Column '{dip_estimate_column}' contains invalid values not in the set of allowed dip estimates: {valid_directions}." + ) + return ( + True, + f"Datatype FAULT: Column '{dip_estimate_column}' contains invalid values. Allowed values: {valid_directions}.", + ) + + # Warn if there are NaN or empty values + if fault_data[dip_estimate_column].isnull().any(): + logger.warning( + f"Datatype FAULT: Column '{dip_estimate_column}' contains NaN or empty values. This may affect processing." + ) + else: + logger.error( + f"Datatype FAULT: Column '{dip_estimate_column}' is missing from the fault data. Please ensure the column name is correct or remove that key from the config." + ) + return (True, f"Datatype FAULT: Column '{dip_estimate_column}' is missing from the fault data.") + + + # # 4. check ID column + if "objectid_column" in config: + id_validation_failed, id_message = validate_id_column( + geodata=fault_data, + config=config, + id_config_key="objectid_column", + geodata_name="FAULT") + + if id_validation_failed: + return (id_validation_failed, id_message) + + return (False, "") + +@beartype.beartype +def check_fold_fields_validity(mapdata) -> Tuple[bool, str]: + # Check type of loaded fold map + if ( + mapdata.raw_data[Datatype.FOLD] is None + or type(mapdata.raw_data[Datatype.FOLD]) is not geopandas.GeoDataFrame + ): + logger.warning("Fold map is not loaded or valid") + return (True, "Fold map is not loaded or valid") + + folds = mapdata.raw_data[Datatype.FOLD] + config = mapdata.config.fold_config + + # Debugging: Print column names in the fold_data + logger.debug(f"Fold data columns: {folds.columns.tolist()}") + + # 2. 
Validate geometry + failed, message = validate_geometry( + geodata=folds, + expected_geom_types=[shapely.LineString, shapely.MultiLineString], + datatype_name="FOLD" + ) + if failed: + return (failed, message) + + ## check structtype column if it exists + text_keys = { + "fold_text": "fold_text", + "synform_text": "synform_text" + } + structtype_validation_failed, structtype_message = validate_structtype_column( + geodata=folds, + config=config, + datatype_name="FOLD", + required=True, # Assuming structtype_column is required in FOLD + text_keys=text_keys + ) + if structtype_validation_failed: + return (structtype_validation_failed, structtype_message) + + # check description column + description_column = config.get("description_column", None) + if description_column: + # Ensure the column exists in the data + if description_column not in folds.columns: + logger.warning( + f"Datatype FOLD: Column '{description_column}' (config key: 'description_column') is missing from the fold data. Consider removing that key from the config." + ) + else: + # Check if all entries in the column are strings + if not folds[description_column].apply(lambda x: isinstance(x, str) or pandas.isnull(x)).all(): + logger.error( + f"Datatype FOLD: Column '{description_column}' (config key: 'description_column') contains non-string values. Please ensure all values in this column are strings." + ) + return (True, f"Datatype FOLD: Column '{description_column}' (config key: 'description_column') contains non-string values.") + + # Warn about empty or null cells + if folds[description_column].isnull().any() or folds[description_column].str.strip().eq("").any(): + logger.warning( + f"Datatype FOLD: Column '{description_column}' contains NaN, empty, or blank values. Processing might not work as expected." + ) + + + # # 4. check ID column + if "objectid_column" in config: + id_validation_failed, id_message = validate_id_column( + geodata=folds, + config=config, + id_config_key="objectid_column", + geodata_name="FOLD") + + if id_validation_failed: + return (id_validation_failed, id_message) + + return (False, "") + + +@beartype.beartype +def validate_config_dictionary(config_dict: dict) -> None: + + # 1) check mandatory keys for "structure" and "geology" + required_keys = { + "structure": {"dipdir_column", "dip_column"}, + "geology": {"unitname_column", "alt_unitname_column"}, + } + + # Loop over "structure" and "geology" + for section, keys in required_keys.items(): + + # 1) Check that "section" exists + if section not in config_dict: + logger.error(f"Missing required section '{section}' in config dictionary.") + raise ValueError(f"Missing required section '{section}' in config dictionary.") + + # 2) Check that each required key is in config_dict[section] + for key in keys: + if key not in config_dict[section]: + logger.error(f"Missing required key '{key}' for '{section}' section of the config dictionary.") + raise ValueError(f"Missing required key '{key}' for '{section}' section of the config dictionary.") + + # 2) check for legacy keys first: + legacy_keys = { + "otype", "dd", "d", "sf", "bedding", "bo", "btype", "gi", "c", "u", + "g", "g2", "ds", "min", "max", "r1", "r2", "sill", "intrusive", "volcanic", + "f", "fdipnull", "fdipdip_flag", "fdipdir", "fdip", "fdipest", + "fdipest_vals", "n", "ff", "t", "syn" + } + + def check_keys(d: dict, parent_key=""): + for key, value in d.items(): + if key in legacy_keys: + logger.error( + f"Legacy key found in config - '{key}' at '{parent_key}'. Please use the new config format. 
Use map2loop.utils.update_from_legacy_file to convert between the formats if needed" + ) + raise ValueError( + f"Legacy key found in config - '{key}' at '{parent_key}'. Please use the new config format. Use map2loop.utils.update_from_legacy_file to convert between the formats if needed" + ) + if isinstance(value, dict): + check_keys(value, parent_key=f"{parent_key}{key}.") + + check_keys(config_dict) + + # 3) check if all keys are valid: + allowed_keys_by_section = { + "structure": { + "orientation_type", "dipdir_column", "dip_column", + "description_column", "bedding_text", "overturned_column", "overturned_text", + "objectid_column", "desciption_column", + }, + "geology": { + "unitname_column", "alt_unitname_column", "group_column", + "supergroup_column", "description_column", "minage_column", + "maxage_column", "rocktype_column", "alt_rocktype_column", + "sill_text", "intrusive_text", "volcanic_text", "objectid_column", "ignore_lithology_codes", + }, + "fault": { + "structtype_column", "fault_text", "dip_null_value", + "dipdir_flag", "dipdir_column", "dip_column", "orientation_type", + "dipestimate_column", "dipestimate_text", "name_column", + "objectid_column", "minimum_fault_length", "ignore_fault_codes", + }, + "fold": { + "structtype_column", "fold_text", "description_column", + "synform_text", "foldname_column","objectid_column", + }, + } + + for section_name, section_dict in config_dict.items(): + # check section + if section_name not in allowed_keys_by_section: + logger.error(f"Unrecognized section '{section_name}' in config dictionary.") + raise ValueError(f"Unrecognized section '{section_name}' in config dictionary.") + + # check keys + allowed_keys = allowed_keys_by_section[section_name] + for key in section_dict.keys(): + if key not in allowed_keys: + logger.error(f"Key '{key}' is not an allowed key in the '{section_name}' section.") + raise ValueError(f"Key '{key}' is not an allowed key in the '{section_name}' section.") + + # 4) check if minimum fault length is a number + mfl = config_dict.get("fault", {}).get("minimum_fault_length", None) + if mfl is not None and not isinstance(mfl, (int, float)): + logger.error("minimum_fault_length must be a number.") + raise ValueError(f"minimum_fault_length must be a number, instead got: {type(mfl)}") + +@beartype.beartype +def validate_geometry( + geodata: geopandas.GeoDataFrame, + expected_geom_types: List[type], + datatype_name: str +) -> Tuple[bool, str]: + geodata.geometry = geodata.geometry.make_valid() + # 1. Check if all geometries are valid + if not geodata.geometry.is_valid.all(): + logger.error(f"Invalid geometries found in datatype {datatype_name}. Please fix them before proceeding.") + return True, f"Invalid geometries found in datatype {datatype_name}" + + # 2. Check if all geometries are of the expected types + if not geodata.geometry.apply(lambda geom: isinstance(geom, tuple(expected_geom_types))).all(): + invalid_types = geodata[~geodata.geometry.apply(lambda geom: isinstance(geom, tuple(expected_geom_types)))] + invalid_indices = invalid_types.index.tolist() + expected_types_names = ', '.join([geom_type.__name__ for geom_type in expected_geom_types]) + logger.error( + f"Datatype {datatype_name}: Invalid geometry types found. Expected types: {expected_types_names}. " + f"Rows with invalid types: {invalid_indices}" + ) + return True, ( + f"Invalid geometry types found in datatype {datatype_name}. " + f"All geometries must be {expected_types_names}." 
+ ) + + # If all checks pass + logger.debug(f"Geometry validation passed for datatype {datatype_name}") + return False, "" + + +@beartype.beartype +def validate_id_column( + geodata: geopandas.GeoDataFrame, + config: dict, + id_config_key: str, + geodata_name: str +) -> Tuple[bool, str]: + + # Retrieve the ID column name from the configuration + id_column = config.get(id_config_key) + + if not id_column: + error_msg = f"Configuration key '{id_config_key}' is missing." + logger.error(error_msg) + return (True, error_msg) + + if id_column in geodata.columns: + geodata[id_column] = pandas.to_numeric(geodata[id_column], errors='coerce') + + # Check for non-numeric values (which are now NaN after coercion) + if geodata[id_column].isnull().any(): + error_msg = ( + f"Datatype {geodata_name}: Column '{id_column}' " + f"(config key: '{id_config_key}') contains non-numeric or NaN values. " + "Please rectify the values, or remove this key from the config dictionary to let map2loop assign IDs." + ) + logger.error(error_msg) + return (True, error_msg) + + if not (geodata[id_column] == geodata[id_column].astype(int)).all(): + error_msg = ( + f"Datatype {geodata_name}: Column '{id_column}' " + f"(config key: '{id_config_key}') contains non-integer values." + ) + logger.error(error_msg) + return (True, error_msg) + + if geodata[id_column].duplicated().any(): + error_msg = ( + f"Datatype {geodata_name}: Column '{id_column}' " + f"(config key: '{id_config_key}') contains duplicate values." + ) + logger.error(error_msg) + return (True, error_msg) + + + elif id_column not in geodata.columns: + msg = ( + f"Datatype {geodata_name}: Column '{id_column}' " + f"(config key: '{id_config_key}') is missing from the data. " + "Map2loop will automatically generate IDs." + ) + logger.warning(msg) + + return (False, "") + +@beartype.beartype +def validate_required_columns( + geodata: geopandas.GeoDataFrame, + config: dict, + required_columns: List[str], + expected_type: Union[Type, Tuple[Type, ...]], + check_blank: bool = False, + datatype_name: str = "UNKNOWN" +) -> Tuple[bool, str]: + + for config_key in required_columns: + column_name = config.get(config_key) + + if not column_name: + error_msg = ( + f"Configuration key '{config_key}' is missing for datatype '{datatype_name}'." + ) + logger.error(error_msg) + return (True, error_msg) + + if column_name not in geodata.columns: + error_msg = ( + f"Datatype {datatype_name.upper()}: Required column with config key '{config_key}' " + f"(column: '{column_name}') is missing from the data." + ) + logger.error(error_msg) + return (True, error_msg) + + # Check data type + if not geodata[column_name].apply(lambda x: isinstance(x, expected_type)).all(): + error_msg = ( + f"Datatype {datatype_name.upper()}: Column '{config_key}' (column: '{column_name}') " + f"must contain only {expected_type if isinstance(expected_type, type) else 'numeric'} values." + ) + logger.error(error_msg) + return (True, error_msg) + + # Check for null values + if geodata[column_name].isnull().any(): + error_msg = ( + f"Datatype {datatype_name.upper()}: Column '{config_key}' (column: '{column_name}') " + f"contains null values. Please ensure all values are present." + ) + logger.error(error_msg) + return (True, error_msg) + + # Optionally check for blank strings + if check_blank and issubclass(expected_type, str): + if geodata[column_name].str.strip().eq("").any(): + error_msg = ( + f"Datatype {datatype_name.upper()}: Column '{config_key}' (column: '{column_name}') " + f"contains blank (empty) values. 
Please ensure all values are populated."
+                )
+                logger.error(error_msg)
+                return (True, error_msg)
+
+    # If all required columns pass validation
+    logger.info(f"Datatype {datatype_name.upper()}: All required columns validated successfully.")
+    return (False, "")
+
+
+def validate_optional_columns(
+    geodata: geopandas.GeoDataFrame,
+    config: Dict[str, str],
+    optional_columns: List[str],
+    expected_type: Union[Type, Tuple[Type, ...]],
+    check_blank: bool = False,
+    datatype_name: str = "UNKNOWN"
+) -> List[str]:
+
+    warnings = []
+
+    for config_key in optional_columns:
+        column_name = config.get(config_key)
+
+        if not column_name:
+            warning_msg = (
+                f"Configuration key '{config_key}' is missing for datatype '{datatype_name}'. "
+                "Optional column validation for this key is skipped."
+            )
+            logger.warning(warning_msg)
+            warnings.append(warning_msg)
+            continue
+
+        if column_name in geodata.columns:
+            # Type check
+            if not geodata[column_name].apply(lambda x: isinstance(x, expected_type) or pandas.isnull(x)).all():
+                warning_msg = (
+                    f"Datatype {datatype_name.upper()}: Optional column '{column_name}' "
+                    f"(config key: '{config_key}') contains values that are not of type {expected_type}. "
+                    "Map2loop processing might not work as expected."
+                )
+                logger.warning(warning_msg)
+                warnings.append(warning_msg)
+
+            # Blank string check (only applies when a single string type is expected)
+            if check_blank and isinstance(expected_type, type) and issubclass(expected_type, str):
+                if geodata[column_name].str.strip().eq("").any():
+                    warning_msg = (
+                        f"Datatype {datatype_name.upper()}: Optional column '{column_name}' "
+                        f"(config key: '{config_key}') contains blank (empty) string values. "
+                        "Map2loop processing might not work as expected."
+                    )
+                    logger.warning(warning_msg)
+                    warnings.append(warning_msg)
+
+            # Null value check
+            if geodata[column_name].isnull().any():
+                warning_msg = (
+                    f"Datatype {datatype_name.upper()}: Optional column '{column_name}' "
+                    f"(config key: '{config_key}') contains NaN or null values. "
+                    "Map2loop processing might not work as expected."
+                )
+                logger.warning(warning_msg)
+                warnings.append(warning_msg)
+
+        else:
+            info_msg = (
+                f"Datatype {datatype_name.upper()}: Optional column '{column_name}' "
+                f"(config key: '{config_key}') is missing from the data. Skipping its validation."
+            )
+            logger.info(info_msg)
+
+    return warnings
+
+
+@beartype.beartype
+def validate_dip_columns(
+    geodata: geopandas.GeoDataFrame,
+    config: Dict[str, str],
+    dip_columns: List[str],
+    datatype_name: str = "UNKNOWN",
+    allow_nulls: bool = False
+) -> Tuple[bool, str]:
+
+    validation_failed = False
+    messages = []
+
+    # Define fixed ranges
+    fixed_ranges = {
+        "dip_column": (0, 90),
+        "dipdir_column": (0, 360)
+    }
+
+    for key in dip_columns:
+        column_name = config.get(key)
+        if not column_name and datatype_name == "STRUCTURE":  # only mandatory for structure, not faults!
+            warning_msg = (
+                f"Configuration key '{key}' is missing for datatype '{datatype_name}'. "
+                f"Dip column validation for this key is skipped."
+
+
+@beartype.beartype
+def validate_dip_columns(
+    geodata: geopandas.GeoDataFrame,
+    config: Dict[str, str],
+    dip_columns: List[str],
+    datatype_name: str = "UNKNOWN",
+    allow_nulls: bool = False
+) -> Tuple[bool, str]:
+
+    validation_failed = False
+    messages = []
+
+    # Define fixed ranges
+    fixed_ranges = {
+        "dip_column": (0, 90),
+        "dipdir_column": (0, 360)
+    }
+
+    for key in dip_columns:
+        column_name = config.get(key)
+        if not column_name and datatype_name == "STRUCTURE":  # only mandatory for structure, not faults!
+            warning_msg = (
+                f"Configuration key '{key}' is missing for datatype '{datatype_name}'. "
+                f"This key is required for structure data, so this counts as a failed check."
+            )
+            logger.warning(warning_msg)
+            messages.append(warning_msg)
+            validation_failed = True
+            continue
+
+        if column_name in geodata.columns:
+            # Coerce to numeric
+            geodata[column_name] = pandas.to_numeric(geodata[column_name], errors='coerce')
+
+            # Check for non-numeric or NaN values
+            if geodata[column_name].isnull().any():
+                if not allow_nulls:
+                    warning_msg = (
+                        f"Datatype {datatype_name.upper()}: Column '{column_name}' "
+                        f"(config key: '{key}') contains non-numeric or NaN values."
+                    )
+                    logger.warning(warning_msg)
+                    messages.append(warning_msg)
+                    validation_failed = True
+
+            # Check if all values are numeric
+            if not geodata[column_name].apply(lambda x: isinstance(x, (int, float)) or pandas.isnull(x)).all():
+                warning_msg = (
+                    f"Datatype {datatype_name.upper()}: Column '{column_name}' "
+                    f"(config key: '{key}') must contain only numeric values."
+                )
+                logger.warning(warning_msg)
+                messages.append(warning_msg)
+                validation_failed = True
+
+            # Range validation
+            min_val, max_val = fixed_ranges.get(key, (None, None))
+            if min_val is not None and max_val is not None:
+                invalid_values = ~geodata[column_name].between(min_val, max_val, inclusive='both')
+                if invalid_values.any():
+                    warning_msg = (
+                        f"Datatype {datatype_name.upper()}: Column '{column_name}' "
+                        f"(config key: '{key}') contains values outside the range [{min_val}, {max_val}]. "
+                        "Is this intentional?"
+                    )
+                    logger.warning(warning_msg)
+                    messages.append(warning_msg)
+
+    summary_message = "\n".join(messages)
+    return (validation_failed, summary_message)
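+
+# The accepted ranges are fixed by convention: dips in [0, 90] degrees and dip
+# directions in [0, 360] degrees. Note that an out-of-range value only adds a
+# warning message here (it does not set validation_failed), e.g. a dip of 120
+# in a column mapped to "dip_column" is reported but still lets the check pass.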
+
+
+@beartype.beartype
+def validate_structtype_column(
+    geodata: geopandas.GeoDataFrame,
+    config: Dict[str, str],
+    datatype_name: str,
+    required: bool = True,
+    text_keys: Optional[Dict[str, str]] = None
+) -> Tuple[bool, str]:
+
+    structtype_key = "structtype_column"
+    structtype_column = config.get(structtype_key)
+
+    if not structtype_column:
+        if required:
+            error_msg = (
+                f"Configuration key '{structtype_key}' is missing for datatype '{datatype_name}'. "
+                f"This key is required, so validation fails."
+            )
+            logger.error(error_msg)
+            return (True, error_msg)
+        else:
+            warning_msg = (
+                f"Configuration key '{structtype_key}' is missing for datatype '{datatype_name}'. "
+                f"Optional 'structtype_column' validation is skipped."
+            )
+            logger.warning(warning_msg)
+            return (False, "")
+
+    if structtype_column not in geodata.columns:
+        if required:
+            error_msg = (
+                f"Datatype {datatype_name.upper()}: '{structtype_column}' (config key: '{structtype_key}') "
+                f"is missing from the data. Please add this column to the data, or fix the config."
+            )
+            logger.error(error_msg)
+            return (True, error_msg)
+        else:
+            warning_msg = (
+                f"Datatype {datatype_name.upper()}: '{structtype_column}' (config key: '{structtype_key}') "
+                f"is missing from the data. Consider removing that key from the config."
+            )
+            logger.warning(warning_msg)
+            return (False, "")
+
+    # Check if all entries are strings or nulls
+    if not geodata[structtype_column].apply(lambda x: isinstance(x, str) or pandas.isnull(x)).all():
+        error_msg = (
+            f"Datatype {datatype_name.upper()}: Column '{structtype_column}' "
+            f"(config key: '{structtype_key}') contains non-string values. "
+            "Please ensure all values in this column are strings."
+        )
+        logger.error(error_msg)
+        return (True, error_msg)
+
+    # Warn about empty or null cells
+    if geodata[structtype_column].isnull().any() or geodata[structtype_column].str.strip().eq("").any():
+        warning_msg = (
+            f"Datatype {datatype_name.upper()}: Column '{structtype_column}' contains NaN, empty, or blank values. "
+            "Processing might not work as expected."
+        )
+        logger.warning(warning_msg)
+
+    # Check for specific text keys
+    if text_keys:
+        for text_key, config_key in text_keys.items():
+            text_value = config.get(config_key, None)
+            if text_value:
+                if not isinstance(text_value, str):
+                    error_msg = (
+                        f"Datatype {datatype_name.upper()}: '{config_key}' must be a string. "
+                        "Please ensure it is defined correctly in the config."
+                    )
+                    logger.error(error_msg)
+                    return (True, error_msg)
+
+                if not geodata[structtype_column].str.contains(text_value, na=False).any():
+                    if text_key == "synform_text":
+                        warning_msg = (
+                            f"Datatype {datatype_name.upper()}: The '{text_key}' value '{text_value}' is not found in column '{structtype_column}'. "
+                            "This may impact processing."
+                        )
+                        logger.warning(warning_msg)
+                    else:
+                        error_msg = (
+                            f"Datatype {datatype_name.upper()}: The '{text_key}' value '{text_value}' is not found in column '{structtype_column}'. "
+                            f"The project may end up with no {datatype_name.lower()} features."
+                        )
+                        logger.error(error_msg)
+                        return (True, error_msg)
+
+    return (False, "")
diff --git a/map2loop/mapdata.py b/map2loop/mapdata.py
index 6948d3c3..884ef3d7 100644
--- a/map2loop/mapdata.py
+++ b/map2loop/mapdata.py
@@ -3,6 +3,7 @@
 from .config import Config
 from .aus_state_urls import AustraliaStateUrls
 from .utils import generate_random_hex_colors, calculate_minimum_fault_length
+from .data_checks import check_geology_fields_validity, check_structure_fields_validity, check_fault_fields_validity, check_fold_fields_validity
 
 # external imports
 import geopandas
@@ -689,16 +690,40 @@ def check_map(self, datatype: Datatype):
             The datatype to check
         """
         func = None
+        # check and parse geology data
         if datatype == Datatype.GEOLOGY:
+            validity_check, message = check_geology_fields_validity(mapdata=self)
+            if validity_check:
+                logger.error(f"Datatype GEOLOGY - data validation failed: {message}")
+                raise ValueError(f"Datatype GEOLOGY - data validation failed: {message}")
             func = self.parse_geology_map
+
+        # check and parse structure data
         elif datatype == Datatype.STRUCTURE:
+            validity_check, message = check_structure_fields_validity(mapdata=self)
+            if validity_check:
+                logger.error(f"Datatype STRUCTURE - data validation failed: {message}")
+                raise ValueError(f"Datatype STRUCTURE - data validation failed: {message}")
             func = self.parse_structure_map
+
+        # check and parse fault data
         elif datatype == Datatype.FAULT:
+            validity_check, message = check_fault_fields_validity(mapdata=self)
+            if validity_check:
+                logger.error(f"Datatype FAULT - data validation failed: {message}")
+                raise ValueError(f"Datatype FAULT - data validation failed: {message}")
             func = self.parse_fault_map
-        elif datatype == Datatype.FOLD:
-            func = self.parse_fold_map
+
         elif datatype == Datatype.FAULT_ORIENTATION:
             func = self.parse_fault_orientations
+
+        # check and parse fold data
+        elif datatype == Datatype.FOLD:
+            validity_check, message = check_fold_fields_validity(mapdata=self)
+            if validity_check:
+                logger.error(f"Datatype FOLD - data validation failed: {message}")
+                raise ValueError(f"Datatype FOLD - data validation failed: {message}")
+            func = self.parse_fold_map
+
         if func:
             error, message = func()
             if error:
@@ -773,77 +798,7 @@ def parse_fault_orientations(self) -> tuple:
         return (False, "")
 
-
@beartype.beartype - def parse_structure_map(self) -> tuple: - """ - Parse the structure shapefile data into a consistent format - - Returns: - tuple: A tuple of (bool: success/fail, str: failure message) - """ - # Check type and size of loaded structure map - if ( - self.raw_data[Datatype.STRUCTURE] is None - or type(self.raw_data[Datatype.STRUCTURE]) is not geopandas.GeoDataFrame - ): - logger.warning("Structure map is not loaded or valid") - return (True, "Structure map is not loaded or valid") - - if len(self.raw_data[Datatype.STRUCTURE]) < 2: - logger.warning( - "Stucture map does not enough orientations to complete calculations (need at least 2), projection may be inconsistent" - ) - - # Create new geodataframe - structure = geopandas.GeoDataFrame(self.raw_data[Datatype.STRUCTURE]["geometry"]) - config = self.config.structure_config - - # Parse dip direction and dip columns - if config["dipdir_column"] in self.raw_data[Datatype.STRUCTURE]: - if config["orientation_type"] == "strike": - structure["DIPDIR"] = self.raw_data[Datatype.STRUCTURE].apply( - lambda row: (row[config["dipdir_column"]] + 90.0) % 360.0, axis=1 - ) - else: - structure["DIPDIR"] = self.raw_data[Datatype.STRUCTURE][config["dipdir_column"]] - else: - print(f"Structure map does not contain dipdir_column '{config['dipdir_column']}'") - - # Ensure all DIPDIR values are within [0, 360] - structure["DIPDIR"] = structure["DIPDIR"] % 360.0 - - if config["dip_column"] in self.raw_data[Datatype.STRUCTURE]: - structure["DIP"] = self.raw_data[Datatype.STRUCTURE][config["dip_column"]] - else: - print(f"Structure map does not contain dip_column '{config['dip_column']}'") - - # Add bedding and overturned booleans - if config["overturned_column"] in self.raw_data[Datatype.STRUCTURE]: - structure["OVERTURNED"] = ( - self.raw_data[Datatype.STRUCTURE][config["overturned_column"]] - .astype(str) - .str.contains(config["overturned_text"]) - ) - else: - structure["OVERTURNED"] = False - - if config["description_column"] in self.raw_data[Datatype.STRUCTURE]: - structure["BEDDING"] = ( - self.raw_data[Datatype.STRUCTURE][config["description_column"]] - .astype(str) - .str.contains(config["bedding_text"]) - ) - else: - structure["BEDDING"] = False - - # Add object id - if config["objectid_column"] in self.raw_data[Datatype.STRUCTURE]: - structure["ID"] = self.raw_data[Datatype.STRUCTURE][config["objectid_column"]] - else: - structure["ID"] = numpy.arange(len(structure)) - - self.data[Datatype.STRUCTURE] = structure - return (False, "") + @beartype.beartype def parse_geology_map(self) -> tuple: @@ -853,13 +808,6 @@ def parse_geology_map(self) -> tuple: Returns: tuple: A tuple of (bool: success/fail, str: failure message) """ - # Check type of loaded geology map - if ( - self.raw_data[Datatype.GEOLOGY] is None - or type(self.raw_data[Datatype.GEOLOGY]) is not geopandas.GeoDataFrame - ): - logger.warning("Geology map is not loaded or valid") - return (True, "Geology map is not loaded or valid") # Create new geodataframe geology = geopandas.GeoDataFrame(self.raw_data[Datatype.GEOLOGY]["geometry"]) @@ -870,22 +818,11 @@ def parse_geology_map(self) -> tuple: geology["UNITNAME"] = self.raw_data[Datatype.GEOLOGY][config["unitname_column"]].astype( str ) - else: - msg = f"Geology map does not contain unitname_column {config['unitname_column']}" - print(msg) - logger.warning(msg) - return (True, msg) + if config["alt_unitname_column"] in self.raw_data[Datatype.GEOLOGY]: geology["CODE"] = 
self.raw_data[Datatype.GEOLOGY][config["alt_unitname_column"]].astype(
                 str
             )
-        else:
-            msg = (
-                f"Geology map does not contain alt_unitname_column {config['alt_unitname_column']}"
-            )
-            print(msg)
-            logger.warning(msg)
-            return (True, msg)
 
         # Parse group and supergroup columns
         if config["group_column"] in self.raw_data[Datatype.GEOLOGY]:
@@ -955,11 +892,9 @@ def parse_geology_map(self) -> tuple:
         else:
             geology["ID"] = numpy.arange(len(geology))
 
-        # TODO: Check for duplicates in "ID"
         # TODO: Check that the exploded geology has more than 1 unit
         # Do we need to explode the geometry at this stage for geology/faults/folds???
         # If not subsequent classes will need to be able to deal with them
-        # TODO: Check for Nans or blanks in "UNITNAME", "GROUP", "SUPERGROUP", "DESCRIPTION", "CODE", "ROCKTYPE"
         # Strip out whitespace (/n /t) and '-', ',', '?' from "UNITNAME", "CODE" "GROUP" "SUPERGROUP"
         geology["UNITNAME"] = geology["UNITNAME"].str.replace("[ -/?]", "_", regex=True)
         geology["CODE"] = geology["CODE"].str.replace("[ -/?]", "_", regex=True)
@@ -978,12 +913,62 @@ def parse_geology_map(self) -> tuple:
         return (False, "")
 
     @beartype.beartype
-    def get_minimum_fault_length(self) -> Union[float, int, None]:
+    def parse_structure_map(self) -> tuple:
         """
-        Get the minimum fault length
+        Parse the structure shapefile data into a consistent format
+
+        Returns:
+            tuple: A tuple of (bool: success/fail, str: failure message)
         """
-        return self.minimum_fault_length
+        # Create new geodataframe
+        structure = geopandas.GeoDataFrame(self.raw_data[Datatype.STRUCTURE]["geometry"])
+        config = self.config.structure_config
+
+        # Parse dip direction and dip columns
+        if config["dipdir_column"] in self.raw_data[Datatype.STRUCTURE]:
+            if config["orientation_type"] == "strike":
+                structure["DIPDIR"] = self.raw_data[Datatype.STRUCTURE].apply(
+                    lambda row: (row[config["dipdir_column"]] + 90.0) % 360.0, axis=1
+                )
+            else:
+                structure["DIPDIR"] = self.raw_data[Datatype.STRUCTURE][config["dipdir_column"]]
+
+            # Ensure all DIPDIR values are within [0, 360]
+            structure["DIPDIR"] = structure["DIPDIR"] % 360.0
+
+        if config["dip_column"] in self.raw_data[Datatype.STRUCTURE]:
+            structure["DIP"] = self.raw_data[Datatype.STRUCTURE][config["dip_column"]]
+
+        # Add bedding and overturned booleans
+        if config["overturned_column"] in self.raw_data[Datatype.STRUCTURE]:
+            structure["OVERTURNED"] = (
+                self.raw_data[Datatype.STRUCTURE][config["overturned_column"]]
+                .astype(str)
+                .str.contains(config["overturned_text"])
+            )
+        else:
+            structure["OVERTURNED"] = False
+
+        if config["description_column"] in self.raw_data[Datatype.STRUCTURE]:
+            structure["BEDDING"] = (
+                self.raw_data[Datatype.STRUCTURE][config["description_column"]]
+                .astype(str)
+                .str.contains(config["bedding_text"])
+            )
+        else:
+            structure["BEDDING"] = False
+
+        # Add object id
+        if config["objectid_column"] in self.raw_data[Datatype.STRUCTURE]:
+            structure["ID"] = self.raw_data[Datatype.STRUCTURE][config["objectid_column"]]
+        else:
+            structure["ID"] = numpy.arange(len(structure))
+
+        self.data[Datatype.STRUCTURE] = structure
+        return (False, "")
+
 
     @beartype.beartype
     def parse_fault_map(self) -> tuple:
@@ -993,14 +978,6 @@ def parse_fault_map(self) -> tuple:
 
         Returns:
             tuple: A tuple of (bool: success/fail, str: failure message)
         """
-        # Check type of loaded fault map
-        if (
-            self.raw_data[Datatype.FAULT] is None
-            or type(self.raw_data[Datatype.FAULT]) is not geopandas.GeoDataFrame
-        ):
-            logger.warning("Fault map is not loaded or valid")
-            return (True, "Fault map is not 
loaded or valid") - # Create a new geodataframe faults = geopandas.GeoDataFrame(self.raw_data[Datatype.FAULT]["geometry"]) @@ -1013,11 +990,12 @@ def parse_fault_map(self) -> tuple: self.minimum_fault_length = calculate_minimum_fault_length( bbox=self.bounding_box, area_percentage=0.05 ) - + logger.info(f"Calculated minimum fault length - {self.minimum_fault_length}") + # crop faults = faults.loc[faults.geometry.length >= self.minimum_fault_length] - - if config["structtype_column"] in self.raw_data[Datatype.FAULT]: + + if config["structtype_column"] in self.raw_data[Datatype.FAULT]: faults["FEATURE"] = self.raw_data[Datatype.FAULT][config["structtype_column"]] faults = faults[faults["FEATURE"].astype(str).str.contains(config["fault_text"])] if self.verbose_level > VerboseLevel.NONE: @@ -1047,7 +1025,7 @@ def parse_fault_map(self) -> tuple: # Filter the DataFrame to remove rows where 'NAME' is in the existing_codes if existing_codes: faults = faults[~faults["NAME"].isin(existing_codes)] - logger.info(f"The following codes were found and removed: {existing_codes}") + logger.info(f"The following faults were found and removed as per the config: {existing_codes}") else: logger.info("None of the fault ignore codes exist in the original fault data.") pass @@ -1134,6 +1112,9 @@ def parse_fault_map(self) -> tuple: return (False, "") + + + @beartype.beartype def parse_fold_map(self) -> tuple: """ @@ -1142,12 +1123,6 @@ def parse_fold_map(self) -> tuple: Returns: tuple: A tuple of (bool: success/fail, str: failure message) """ - # Check type of loaded fold map - if ( - self.raw_data[Datatype.FOLD] is None - or type(self.raw_data[Datatype.FOLD]) is not geopandas.GeoDataFrame - ): - return (True, "Fold map is not loaded or valid") # Create new geodataframe folds = geopandas.GeoDataFrame(self.raw_data[Datatype.FOLD]["geometry"]) @@ -1690,4 +1665,4 @@ def STRUCTURE(self): @property def FAULT(self): - return self.get_map_data(Datatype.FAULT) + return self.get_map_data(Datatype.FAULT) \ No newline at end of file diff --git a/map2loop/project.py b/map2loop/project.py index 84aa0eea..d9cfbb83 100644 --- a/map2loop/project.py +++ b/map2loop/project.py @@ -11,6 +11,7 @@ from .stratigraphic_column import StratigraphicColumn from .deformation_history import DeformationHistory from .map2model_wrapper import Map2ModelWrapper +from .data_checks import validate_config_dictionary # external imports import LoopProjectFile as LPF @@ -18,7 +19,7 @@ gdal.UseExceptions() import geopandas import beartype -from beartype.typing import Union, List +from beartype.typing import Union, List, Dict, Any import pathlib import numpy import pandas @@ -34,7 +35,7 @@ class Project(object): """ The main entry point into using map2loop - Attiributes + Attributes ----------- verbose_level: m2l_enums.VerboseLevel A selection that defines how much console logging is output @@ -74,8 +75,7 @@ def __init__( save_pre_checked_map_data: bool = False, loop_project_filename: str = "", overwrite_loopprojectfile: bool = False, - **kwargs, - ): + ): """ The initialiser for the map2loop project @@ -119,6 +119,19 @@ def __init__( TypeError: Type of bounding_box not a dict or tuple ValueError: use_australian_state_data not in state list ['WA', 'SA', 'QLD', 'NSW', 'TAS', 'VIC', 'ACT', 'NT'] """ + + # make sure all the needed arguments are provided + if not use_australian_state_data: # this check has to skip if using Loop server data + self.validate_required_inputs( + bounding_box=bounding_box, + working_projection=working_projection, + 
geology_filename=geology_filename,
+                structure_filename=structure_filename,
+                dtm_filename=dtm_filename,
+                config_dictionary=config_dictionary,
+                config_filename=config_filename,
+            )
+
         self._error_state = ErrorState.NONE
         self._error_state_msg = ""
         self.verbose_level = verbose_level
@@ -145,11 +158,6 @@ def __init__(
         self.fold_samples = pandas.DataFrame(columns=["ID", "X", "Y", "Z", "featureId"])
         self.geology_samples = pandas.DataFrame(columns=["ID", "X", "Y", "Z", "featureId"])
 
-
-        # Check for alternate config filenames in kwargs
-        if "metadata_filename" in kwargs and config_filename == "":
-            config_filename = kwargs["metadata_filename"]
-
         # Sanity check on working projection parameter
         if issubclass(type(working_projection), str) or issubclass(type(working_projection), int):
             self.map_data.set_working_projection(working_projection)
@@ -207,12 +215,14 @@ def __init__(
             self.map_data.set_config_filename(config_filename)
         if config_dictionary != {}:
+            validate_config_dictionary(config_dictionary)
             self.map_data.config.update_from_dictionary(config_dictionary)
         if clut_filename != "":
             self.map_data.set_colour_filename(clut_filename)
 
-
         # Load all data (both shape and raster)
         self.map_data.load_all_map_data()
@@ -230,9 +240,59 @@ def __init__(
         self.stratigraphic_column.populate(self.map_data.get_map_data(Datatype.GEOLOGY))
         self.deformation_history.populate(self.map_data.get_map_data(Datatype.FAULT))
 
-        if len(kwargs):
-            logger.warning(f"Unused keyword arguments: {kwargs}")
+    @beartype.beartype
+    def validate_required_inputs(
+        self,
+        bounding_box: Dict[str, Union[float, int]],
+        working_projection: str,
+        geology_filename: str,
+        structure_filename: str,
+        dtm_filename: str,
+        config_filename: Union[str, None] = None,
+        config_dictionary: Dict[str, Any] = {},
+    ) -> None:
+
+        required_inputs = {
+            "bounding_box": bounding_box,
+            "working_projection": working_projection,  # this may be removed when fix is added for https://github.com/Loop3D/map2loop/issues/103
+            "geology_filename": geology_filename,
+            "structure_filename": structure_filename,
+            "dtm_filename": dtm_filename,
+        }
+
+        # Check for missing required inputs in project
+        missing_inputs = [key for key, value in required_inputs.items() if not value]
+
+        if missing_inputs:
+            missing_list = ", ".join(missing_inputs)
+            logger.error(
+                f"Project construction is missing required inputs: {missing_list}. "
+                "Please add them to the Project()."
+            )
+            raise ValueError(
+                f"Project construction is missing required inputs: {missing_list}. "
+                "Please add them to the Project()."
+            )
+
+        # Either config_filename or config_dictionary must be provided (but not both or neither)
+        if not config_filename and not config_dictionary:
+            logger.error(
+                "A config file is required to run map2loop - use either 'config_filename' or 'config_dictionary' to initialise the project."
+            )
+            raise ValueError(
+                "A config file is required to run map2loop - use either 'config_filename' or 'config_dictionary' to initialise the project."
+            )
+        if config_filename and config_dictionary:
+            logger.error(
+                "Both 'config_filename' and 'config_dictionary' were provided. Please specify only one config."
+            )
+            raise ValueError(
+                "Both 'config_filename' and 'config_dictionary' were provided. Please specify only one config."
+ ) + + + # Getters and Setters @beartype.beartype def set_ignore_lithology_codes(self, codes: list): @@ -734,9 +794,10 @@ def save_into_projectfile(self): logger.info('Saving data into loop project file') if not self.loop_filename: logger.info('No loop project file specified, creating a new one') - self.loop_filename = os.path.join( - self.map_data.tmp_path, os.path.basename(self.map_data.tmp_path) + ".loop3d" - ) + output_dir = pathlib.Path.cwd() + output_dir.mkdir(parents=True, exist_ok=True) + filename = "new_project.loop3d" + self.loop_filename = str(output_dir / filename) file_exists = os.path.isfile(self.loop_filename) @@ -1012,7 +1073,7 @@ def draw_geology_map(self, points: pandas.DataFrame = None, overlay: str = ""): gdf.plot(ax=base, marker="o", color="red", markersize=5) @beartype.beartype - def save_mapdata_to_files(self, save_path: str = ".", extension: str = ".shp.zip"): + def save_mapdata_to_files(self, save_path: Union[pathlib.Path,str], extension: str = ".shp.zip"): """ Saves the map data frames to csv files @@ -1022,8 +1083,10 @@ def save_mapdata_to_files(self, save_path: str = ".", extension: str = ".shp.zip extension (str, optional): An alternate extension to save the GeoDataFrame in. Defaults to ".csv". """ - if not os.path.exists(save_path): - os.mkdir(save_path) + + save_path=pathlib.Path(save_path) + if not save_path.exists(): + os.makedirs(save_path) self.map_data.save_all_map_data(save_path, extension) @beartype.beartype diff --git a/tests/data_checks/test_config.py b/tests/data_checks/test_config.py new file mode 100644 index 00000000..cc6f01ba --- /dev/null +++ b/tests/data_checks/test_config.py @@ -0,0 +1,135 @@ +import pytest +from map2loop.data_checks import validate_config_dictionary + + +@pytest.fixture +def valid_config(): + return { + "structure": { + "orientation_type": "dip direction", + "dipdir_column": "azimuth", + "dip_column": "inclinatn", + "description_column": "DESCRIPTION", + "bedding_text": "bed", + "overturned_column": "no_col", + "overturned_text": "blah", + "objectid_column": "geographic", + "desciption_column": "sub_type" + }, + "geology": { + "unitname_column": "formatted_", + "alt_unitname_column": "abbreviate", + "group_column": "no_col", + "supergroup_column": "interpreta", + "description_column": "text_descr", + "minage_column": "no_col", + "maxage_column": "no_col", + "rocktype_column": "rank", + "alt_rocktype_column": "type", + "sill_text": "sill", + "intrusive_text": "intrusion", + "volcanic_text": "volc", + "objectid_column": "ID", + "ignore_lithology_codes": ["cover"] + }, + "fault": { + "structtype_column": "featuretyp", + "fault_text": "s", + "dip_null_value": "0", + "dipdir_flag": "num", + "dipdir_column": "no_col", + "dip_column": "no_col", + "orientation_type": "dip direction", + "dipestimate_column": "no_col", + "dipestimate_text": "no_col", + "name_column": "no_col", + "objectid_column": "geographic", + "minimum_fault_length": 100.0, + "ignore_fault_codes": [] + }, + "fold": { + "structtype_column": "featuretyp", + "fold_text": "fold", + "description_column": "no_col", + "synform_text": "syn", + "foldname_column": "NAME", + "objectid_column": "geographic" + } + } + + +def test_valid_config_no_errors(valid_config): + # Should not raise any error + validate_config_dictionary(valid_config) + + +def test_missing_required_section(valid_config): + + config_missing_structure = dict(valid_config) + del config_missing_structure["structure"] # remove required section + + with pytest.raises(ValueError) as exc_info: + 
validate_config_dictionary(config_missing_structure) + assert "Missing required section 'structure'" in str(exc_info.value) + + +def test_missing_required_key(valid_config): + + config_missing_dip = dict(valid_config) + + del config_missing_dip["structure"]["dip_column"] # remove required key + + with pytest.raises(ValueError) as exc_info: + validate_config_dictionary(config_missing_dip) + assert "Missing required key 'dip_column' for 'structure'" in str(exc_info.value) + + +def test_unrecognized_section(valid_config): + + config_extra_section = dict(valid_config) + config_extra_section["random_section"] = {"random_key": "random_value"} + + with pytest.raises(ValueError) as exc_info: + validate_config_dictionary(config_extra_section) + assert "Unrecognized section 'random_section'" in str(exc_info.value) + + +def test_unrecognized_key_in_section(valid_config): + + config_extra_key = dict(valid_config) + config_extra_key["structure"]["random_key"] = "random_value" + + with pytest.raises(ValueError) as exc_info: + validate_config_dictionary(config_extra_key) + assert "Key 'random_key' is not an allowed key in the 'structure' section." in str(exc_info.value) + + +def test_legacy_key_detected(valid_config): + + config_with_legacy = dict(valid_config) + config_with_legacy["structure"]["otype"] = "legacy_value" # 'otype' --> legacy key + with pytest.raises(ValueError) as exc_info: + validate_config_dictionary(config_with_legacy) + assert "Legacy key found in config - 'otype'" in str(exc_info.value) + + +def test_minimum_fault_length_wrong_type(valid_config): + + config_wrong_mfl = dict(valid_config) + config_wrong_mfl["fault"]["minimum_fault_length"] = "one_hundred" # invalid type + + with pytest.raises(ValueError) as exc_info: + validate_config_dictionary(config_wrong_mfl) + assert "minimum_fault_length must be a number" in str(exc_info.value) + + +def test_minimum_fault_length_missing(valid_config): + """ + Remove minimum_fault_length entirely. That should be fine (None -> no check). + """ + config_no_mfl = dict(valid_config) + del config_no_mfl["fault"]["minimum_fault_length"] + + # Should not raise any error, as it's optional + validate_config_dictionary(config_no_mfl) + diff --git a/tests/data_checks/test_input_data_faults.py b/tests/data_checks/test_input_data_faults.py new file mode 100644 index 00000000..7cdd8fc7 --- /dev/null +++ b/tests/data_checks/test_input_data_faults.py @@ -0,0 +1,108 @@ +import pytest +import geopandas as gpd +import shapely.geometry +from map2loop.mapdata import MapData +from map2loop.m2l_enums import Datatype +from map2loop.data_checks import check_fault_fields_validity + + +@pytest.mark.parametrize( + "fault_data, fault_config, expected_validity, expected_message", + [ + # Valid data + ( + { + "geometry": [ + shapely.geometry.LineString([(0, 0), (1, 1)]), + shapely.geometry.MultiLineString([[(0, 0), (1, 1)], [(1, 1), (2, 2)]]), + ], + "FEATURE": ["Fault A", "Fault B"], + "ID": [1, 2], + }, + {"structtype_column": "FEATURE", "fault_text": "Fault", "objectid_column": "ID"}, + False, + "", + ), + # Invalid geometry + ( + { + "geometry": [ + shapely.geometry.LineString([(0, 0), (1, 1)]), + shapely.geometry.Polygon( + [(0, 0), (1, 1), (1, 0), (0, 1), (0, 0)] + ), # Invalid geometry + ], + "FEATURE": ["Fault A", "Fault B"], + "ID": [1, 2], + }, + {"structtype_column": "FEATURE", "fault_text": "Fault", "objectid_column": "ID"}, + True, + "Invalid geometry types found in datatype FAULT. 
All geometries must be LineString, MultiLineString.", + ), + # Non-string FEATURE column + ( + { + "geometry": [ + shapely.geometry.LineString([(0, 0), (1, 1)]), + shapely.geometry.MultiLineString([[(0, 0), (1, 1)], [(1, 1), (2, 2)]]), + ], + "FEATURE": [5, 2], + "ID": [1, 2], + }, + {"structtype_column": "FEATURE", "fault_text": "Fault", "objectid_column": "ID"}, + True, + "Datatype FAULT: Column 'FEATURE' (config key: 'structtype_column') contains non-string values. Please ensure all values in this column are strings.", + ), + # Invalid values in DIP estimate column + ( + { + "geometry": [ + shapely.geometry.LineString([(0, 0), (1, 1)]), + shapely.geometry.MultiLineString([[(0, 0), (1, 1)], [(1, 1), (2, 2)]]), + ], + "FEATURE": ["Fault", "Fault"], + "NAME": ["Zuleika", "Zuleika"], + "ID": [1, 2], + "DIP": [70, 50], + "STRIKE": [150, None], + "DEC": ["north_east", "southt"], + }, + { + "structtype_column": "FEATURE", + "fault_text": "Fault", + "objectid_column": "ID", + "name_column": "NAME", + "dip_column": "DIP", + "dipdir_column": "STRIKE", + "dip_estimate_column": "DEC", + }, + True, + "Datatype FAULT: Column 'DEC' contains invalid values. Allowed values: ['north_east', 'south_east', 'south_west', 'north_west', 'north', 'east', 'south', 'west'].", + ), + ], + ids=[ + "Valid fault data", + "Invalid geometry", + "Non-string FEATURE column", + "Invalid DIP estimate column", + ], +) +def test_check_fault_fields_validity(fault_data, fault_config, expected_validity, expected_message): + # Dynamically create the mock config for this test case + class MockConfig: + def __init__(self, config): + self.fault_config = config + + # Create a GeoDataFrame + fault_gdf = gpd.GeoDataFrame(fault_data, crs="EPSG:4326") + + # Instantiate the MapData class with the dynamic mock config and data + map_data = MapData() + map_data.config = MockConfig(fault_config) + map_data.raw_data = [None] * len(Datatype.__dict__) + map_data.raw_data[Datatype.FAULT] = fault_gdf + + # Test the check_fault_fields_validity function + validity_check, message = check_fault_fields_validity(map_data) + assert validity_check == expected_validity + assert message == expected_message diff --git a/tests/data_checks/test_input_data_fold.py b/tests/data_checks/test_input_data_fold.py new file mode 100644 index 00000000..8d5e5b69 --- /dev/null +++ b/tests/data_checks/test_input_data_fold.py @@ -0,0 +1,112 @@ +import pytest +import geopandas as gpd +import shapely.geometry +from map2loop.mapdata import MapData +from map2loop.m2l_enums import Datatype +from map2loop.data_checks import check_fold_fields_validity + +@pytest.mark.parametrize( + "fold_data, fold_config, expected_validity, expected_message", + [ + # Valid data + ( + { + "geometry": [ + shapely.geometry.LineString([(0, 0), (1, 1)]), + shapely.geometry.MultiLineString([[(0, 0), (1, 1)], [(1, 1), (2, 2)]]) + ], + "FEATURE": ["fold A", "fold B"], + "ID": [1, 2], + "description": ["desc1", "desc2"] + }, + {"structtype_column": "FEATURE", "fold_text": "fold", "objectid_column": "ID", "description_column": "description"}, + False, + "" + ), + # Missing geometry + ( + { + "geometry": [ + shapely.geometry.LineString([(0,0), (0,0)]), # Invalid type + shapely.geometry.LineString([(0, 0), (1, 1)]) + ], + "FEATURE": ["fold A", "fold B"], + "ID": [1, 2], + "description": ["desc1", "desc2"] + }, + {"structtype_column": "FEATURE", "fold_text": "fold", "objectid_column": "ID", "description_column": "description"}, + True, + "Invalid geometry types found in datatype FOLD. 
All geometries must be LineString, MultiLineString." + ), + # Non-string FEATURE column + ( + { + "geometry": [ + shapely.geometry.LineString([(0, 0), (1, 1)]), + shapely.geometry.MultiLineString([[(0, 0), (1, 1)], [(1, 1), (2, 2)]]) + ], + "FEATURE": [123, 456], # Invalid type + "ID": [1, 2], + "description": ["desc1", "desc2"] + }, + {"structtype_column": "FEATURE", "fold_text": "fold", "objectid_column": "ID", "description_column": "description"}, + True, + "Datatype FOLD: Column 'FEATURE' (config key: 'structtype_column') contains non-string values. Please ensure all values in this column are strings." + ), + # Missing ID column + ( + { + "geometry": [ + shapely.geometry.LineString([(0, 0), (1, 1)]), + shapely.geometry.MultiLineString([[(0, 0), (1, 1)], [(1, 1), (2, 2)]]) + ], + "FEATURE": ["fold A", "fold B"], + "description": ["desc1", "desc2"] + }, + {"structtype_column": "FEATURE", "fold_text": "fold", "objectid_column": "ID", "description_column": "description"}, + False, + "" + ), + # Duplicate ID values + ( + { + "geometry": [ + shapely.geometry.LineString([(0, 0), (1, 1)]), + shapely.geometry.MultiLineString([[(0, 0), (1, 1)], [(1, 1), (2, 2)]]) + ], + "FEATURE": ["fold A", "fold B"], + "ID": [1, 1], # Duplicate values + "description": ["desc1", "desc2"] + }, + {"structtype_column": "FEATURE", "fold_text": "fold", "objectid_column": "ID", "description_column": "description"}, + True, + "Datatype FOLD: Column 'ID' (config key: 'objectid_column') contains duplicate values." + ), + ], + ids=[ + "Valid fold data", + "Invalid geometry", + "Non-string FEATURE column", + "Missing ID column", + "Duplicate ID values" + ] +) +def test_check_fold_fields_validity(fold_data, fold_config, expected_validity, expected_message): + # Dynamically create the mock config for this test case + class MockConfig: + def __init__(self, config): + self.fold_config = config + + # Create a GeoDataFrame + fold_gdf = gpd.GeoDataFrame(fold_data, crs="EPSG:4326") + + # Instantiate the MapData class with the dynamic mock config and data + map_data = MapData() + map_data.config = MockConfig(fold_config) + map_data.raw_data = [None] * len(Datatype.__dict__) + map_data.raw_data[Datatype.FOLD] = fold_gdf + + # Test the check_fold_fields_validity function + validity_check, message = check_fold_fields_validity(map_data) + assert validity_check == expected_validity + assert message == expected_message diff --git a/tests/data_checks/test_input_data_geology.py b/tests/data_checks/test_input_data_geology.py new file mode 100644 index 00000000..2cc42408 --- /dev/null +++ b/tests/data_checks/test_input_data_geology.py @@ -0,0 +1,203 @@ +import pytest +import geopandas as gpd +import shapely.geometry +from map2loop.mapdata import MapData +from map2loop.data_checks import check_geology_fields_validity + +# Datatype Enum +class Datatype: + GEOLOGY = 0 + +# Config +class MockConfig: + def __init__(self): + self.geology_config = { + "unitname_column": "UNITNAME", + "alt_unitname_column": "CODE", + "group_column": "GROUP", + "supergroup_column": "SUPERGROUP", + "description_column": "DESCRIPTION", + "rocktype_column": "ROCKTYPE1", + "alt_rocktype_column": "ROCKTYPE2", + "minage_column": "MIN_AGE", + "maxage_column": "MAX_AGE", + "objectid_column": "ID", + "ignore_lithology_codes": [], + } + +@pytest.mark.parametrize( + "geology_data, expected_validity, expected_message", + [ + # Valid data + ( + { + "geometry": [shapely.geometry.Polygon([(0, 0), (1, 0), (1, 1), (0, 1)])], + "UNITNAME": ["Sandstone"], + "CODE": ["SST"], + "GROUP": 
["Sedimentary"], + "SUPERGROUP": ["Mesozoic"], + "DESCRIPTION": ["A type of sandstone"], + "ROCKTYPE1": ["Clastic"], + "ROCKTYPE2": ["Quartz"], + "MIN_AGE": [150.0], + "MAX_AGE": [200.0], + "ID": [1], + }, + False, + "", + ), + # Invalid geometry + ( + { + "geometry": [shapely.geometry.Polygon([(0, 0), (1, 1), (1, 0), (0, 1), (0, 0)])], + "UNITNAME": ["Sandstone"], + "CODE": ["SST"], + "GROUP": ["Sedimentary"], + "SUPERGROUP": ["Mesozoic"], + "DESCRIPTION": ["A type of sandstone"], + "ROCKTYPE1": ["Clastic"], + "ROCKTYPE2": ["Quartz"], + "MIN_AGE": [150.0], + "MAX_AGE": [200.0], + "ID": [1], + }, + False, + "", + ), + # Missing required column + ( + { + "geometry": [shapely.geometry.Polygon([(0, 0), (1, 0), (1, 1), (0, 1)])], + "UNITNAME": ["Sandstone"], + # "CODE": ["SST"], # Missing required column + "GROUP": ["Sedimentary"], + "SUPERGROUP": ["Mesozoic"], + "DESCRIPTION": ["A type of sandstone"], + "ROCKTYPE1": ["Clastic"], + "ROCKTYPE2": ["Quartz"], + "MIN_AGE": [150.0], + "MAX_AGE": [200.0], + "ID": [1], + }, + True, + "Datatype GEOLOGY: Required column with config key 'alt_unitname_column' (column: 'CODE') is missing from the data.", + ), + # Non-string value in required column + ( + { + "geometry": [shapely.geometry.Polygon([(0, 0), (1, 0), (1, 1), (0, 1)])], + "UNITNAME": ["Sandstone"], + "CODE": [2], # Non-string value + "GROUP": ["Sedimentary"], + "SUPERGROUP": ["Mesozoic"], + "DESCRIPTION": ["A type of sandstone"], + "ROCKTYPE1": ["Clastic"], + "ROCKTYPE2": ["Quartz"], + "MIN_AGE": [150.0], + "MAX_AGE": [200.0], + "ID": [1], + }, + True, + "Datatype GEOLOGY: Column 'alt_unitname_column' (column: 'CODE') must contain only values.", + ), + # NaN or blank value in required column + ( + { + "geometry": [shapely.geometry.Polygon([(0, 0), (1, 0), (1, 1), (0, 1)])], + "UNITNAME": [""], # Blank value + "CODE": ["SST"], + "GROUP": ["Sedimentary"], + "SUPERGROUP": ["Mesozoic"], + "DESCRIPTION": ["A type of sandstone"], + "ROCKTYPE1": ["Clastic"], + "ROCKTYPE2": ["Quartz"], + "MIN_AGE": [150.0], + "MAX_AGE": [200.0], + "ID": [1], + }, + True, + "Datatype GEOLOGY: Column 'unitname_column' (column: 'UNITNAME') contains blank (empty) values. Please ensure all values are populated.", + ), + # Duplicate ID values + ( + { + "geometry": [ + shapely.geometry.Polygon([(0, 0), (1, 0), (1, 1), (0, 1)]), + shapely.geometry.Polygon([(0, 0), (10, 0), (1, 1), (0, 10)]), + ], + "UNITNAME": ["fr", "df"], + "CODE": ["SST", "FGH"], + "GROUP": ["Sedimentary", "Ign"], + "SUPERGROUP": ["Mesozoic", "Arc"], + "DESCRIPTION": ["A", "B"], + "ROCKTYPE1": ["A", "B"], + "ROCKTYPE2": ["Quartz", "FDS"], + "MIN_AGE": [150.0, 200], + "MAX_AGE": [200.0, 250], + "ID": [1, 1], # Duplicate ID + }, + True, + "Datatype GEOLOGY: Column 'ID' (config key: 'objectid_column') contains duplicate values.", + ), + # nan in id + ( + { + "geometry": [ + shapely.geometry.Polygon([(0, 0), (1, 0), (1, 1), (0, 1)]), + shapely.geometry.Polygon([(0, 0), (10, 0), (1, 1), (0, 10)]), + ], + "UNITNAME": ["fr", "df"], + "CODE": ["SST", "FGH"], + "GROUP": ["Sedimentary", "Ign"], + "SUPERGROUP": ["Mesozoic", "Arc"], + "DESCRIPTION": ["A", "B"], + "ROCKTYPE1": ["A", "B"], + "ROCKTYPE2": ["Quartz", "FDS"], + "MIN_AGE": [150.0, 200], + "MAX_AGE": [200.0, 250], + "ID": [1, None], + }, + True, + "Datatype GEOLOGY: Column 'ID' (config key: 'objectid_column') contains non-numeric or NaN values. 
Please rectify the values, or remove this key from the config dictionary to let map2loop assign IDs.", + ), + # nan in unit name + ( + { + "geometry": [ + shapely.geometry.Polygon([(0, 0), (1, 0), (1, 1), (0, 1)]), + shapely.geometry.Polygon([(0, 0), (10, 0), (1, 1), (0, 10)]), + ], + "UNITNAME": ["fr", None], + "CODE": ["SST", "FGH"], + "GROUP": ["Sedimentary", "Ign"], + "SUPERGROUP": ["Mesozoic", "Arc"], + "DESCRIPTION": ["A", "B"], + "ROCKTYPE1": ["A", "B"], + "ROCKTYPE2": ["Quartz", "FDS"], + "MIN_AGE": [150.0, 200], + "MAX_AGE": [200.0, 250], + "ID": [1, 1], # Duplicate ID + }, + True, + "Datatype GEOLOGY: Column 'unitname_column' (column: 'UNITNAME') must contain only values.", + ), + ], +) + + + +def test_check_geology_fields_validity(geology_data, expected_validity, expected_message): + # Create a GeoDataFrame + geology_gdf = gpd.GeoDataFrame(geology_data, crs="EPSG:4326") + + # Instantiate the MapData class with the mock config and data + map_data = MapData() + map_data.config = MockConfig() + map_data.raw_data = [None] * len(Datatype.__dict__) + map_data.raw_data[Datatype.GEOLOGY] = geology_gdf + + # Test the check_geology_fields_validity function + validity_check, message = check_geology_fields_validity(map_data) + + assert validity_check == expected_validity + assert message == expected_message \ No newline at end of file diff --git a/tests/data_checks/test_input_data_structure.py b/tests/data_checks/test_input_data_structure.py new file mode 100644 index 00000000..4cabca28 --- /dev/null +++ b/tests/data_checks/test_input_data_structure.py @@ -0,0 +1,124 @@ +import pytest +import geopandas as gpd +import shapely.geometry +from map2loop.mapdata import MapData +from map2loop.data_checks import check_structure_fields_validity + +# Datatype Enum +class Datatype: + STRUCTURE = 1 + +# Config +class MockConfig: + def __init__(self): + self.structure_config = { + "dipdir_column": "DIPDIR", + "dip_column": "DIP", + "description_column": "DESCRIPTION", + "overturned_column": "OVERTURNED", + "objectid_column": "ID", + } + + +@pytest.mark.parametrize( + "structure_data, expected_validity, expected_message", + [ + # Valid data + ( + { + "geometry": [shapely.geometry.Point(0, 0), shapely.geometry.Point(1, 1)], + "DIPDIR": [45.0, 135.0], + "DIP": [30.0, 45.0], + "DESCRIPTION": ["Description1", "Description2"], + "OVERTURNED": ["Yes", "No"], + "ID": [1, 2], + }, + False, + "", + ), + # Invalid geometry + ( + { + "geometry": [ + shapely.geometry.Point(0, 0), + shapely.geometry.Polygon( + [(0, 0), (1, 1), (1, 0), (0, 1), (0, 0)] + ), # Invalid geometry + ], + "DIPDIR": [45.0, 135.0], + "DIP": [30.0, 45.0], + "DESCRIPTION": ["Description1", "Description2"], + "OVERTURNED": ["Yes", "No"], + "ID": [1, 2], + }, + True, + "Invalid geometry types found in datatype STRUCTURE. 
All geometries must be Point, MultiPoint.", + ), + # Missing required column + ( + { + "geometry": [shapely.geometry.Point(0, 0), shapely.geometry.Point(1, 1)], + # "DIPDIR": [45.0, 135.0], # Missing required column + "DIP": [30.0, 45.0], + "DESCRIPTION": ["Description1", "Description2"], + "OVERTURNED": ["Yes", "No"], + "ID": [1, 2], + }, + True, + "Datatype STRUCTURE: Required column with config key 'dipdir_column' (column: 'DIPDIR') is missing from the data.", + ), + # Non-numeric value in numeric column + ( + { + "geometry": [shapely.geometry.Point(0, 0), shapely.geometry.Point(1, 1)], + "DIPDIR": ["A", "B"], # Non-numeric value + "DIP": [30.0, 45.0], + "DESCRIPTION": ["Description1", "Description2"], + "OVERTURNED": ["Yes", "No"], + "ID": [1, 2], + }, + True, + "Datatype STRUCTURE: Column 'dipdir_column' (column: 'DIPDIR') must contain only numeric values.", + ), + # NaN or blank value in required column + ( + { + "geometry": [shapely.geometry.Point(0, 0), shapely.geometry.Point(1, 1)], + "DIPDIR": [None, 3], # NaN value + "DIP": [30.0, 45.0], + "DESCRIPTION": ["Description1", "Description2"], + "OVERTURNED": ["Yes", "No"], + "ID": [1, 2], + }, + True, + "Datatype STRUCTURE: Column 'dipdir_column' (column: 'DIPDIR') contains null values. Please ensure all values are present.", + ), + # Duplicate ID column + ( + { + "geometry": [shapely.geometry.Point(0, 0), shapely.geometry.Point(1, 1)], + "DIPDIR": [45.0, 135.0], + "DIP": [30.0, 45.0], + "DESCRIPTION": ["Description1", "Description2"], + "OVERTURNED": ["Yes", "No"], + "ID": [1, 1], # Duplicate ID + }, + True, + "Datatype STRUCTURE: Column 'ID' (config key: 'objectid_column') contains duplicate values.", + ), + ], +) +def test_check_structure_fields_validity(structure_data, expected_validity, expected_message): + # Create a GeoDataFrame + structure_gdf = gpd.GeoDataFrame(structure_data, crs="EPSG:4326") + + # Instantiate the MapData class with the mock config and data + map_data = MapData() + map_data.config = MockConfig() + map_data.raw_data = [None] * len(Datatype.__dict__) + map_data.raw_data[Datatype.STRUCTURE] = structure_gdf + + # Test the check_structure_fields_validity function + validity_check, message = check_structure_fields_validity(map_data) + assert validity_check == expected_validity + assert message == expected_message diff --git a/tests/project/test_config_arguments.py b/tests/project/test_config_arguments.py new file mode 100644 index 00000000..53e6ce35 --- /dev/null +++ b/tests/project/test_config_arguments.py @@ -0,0 +1,148 @@ +import pytest +import pathlib +from map2loop.project import Project +import map2loop + +# ------------------------------------------------------------------------------ +# Common fixtures or helper data (bounding box, minimal filenames, etc.) 
+# ------------------------------------------------------------------------------ + +@pytest.fixture +def minimal_bounding_box(): + return { + "minx": 515687.31005864, + "miny": 7493446.76593407, + "maxx": 562666.860106543, + "maxy": 7521273.57407786, + "base": -3200, + "top": 3000, + } + +@pytest.fixture +def geology_file(): + return str( + pathlib.Path(map2loop.__file__).parent + / pathlib.Path('_datasets/geodata_files/hamersley/geology.geojson') + ) + +@pytest.fixture +def structure_file(): + return str( + pathlib.Path(map2loop.__file__).parent + / pathlib.Path('_datasets/geodata_files/hamersley/structure.geojson') + ) + +@pytest.fixture +def dtm_file(): + return str( + pathlib.Path(map2loop.__file__).parent + / pathlib.Path('_datasets/geodata_files/hamersley/dtm_rp.tif') + ) + +@pytest.fixture +def valid_config_dictionary(): + """ + A valid config dictionary that meets the 'structure' and 'geology' requirements + """ + return { + "structure": { + "dipdir_column": "azimuth2", + "dip_column": "dip" + }, + "geology": { + "unitname_column": "unitname", + "alt_unitname_column": "code", + } + } + + + +# 1) config_filename and config_dictionary both present should raise ValueError +def test_config_filename_and_dictionary_raises_error( + minimal_bounding_box, geology_file, dtm_file, structure_file, valid_config_dictionary +): + + with pytest.raises(ValueError, match="Both 'config_filename' and 'config_dictionary' were provided"): + Project( + bounding_box=minimal_bounding_box, + working_projection="EPSG:28350", + geology_filename=geology_file, + dtm_filename=dtm_file, + structure_filename=structure_file, + config_filename="dummy_config.json", + config_dictionary=valid_config_dictionary, + ) + +# 2) No config_filename or config_dictionary should raise ValueError +def test_no_config_provided_raises_error( + minimal_bounding_box, geology_file, dtm_file, structure_file +): + + with pytest.raises(ValueError, match="A config file is required to run map2loop"): + Project( + bounding_box=minimal_bounding_box, + working_projection="EPSG:28350", + geology_filename=geology_file, + dtm_filename=dtm_file, + structure_filename=structure_file, + ) + +# 3) Passing an unexpected argument should raise TypeError +def test_unexpected_argument_raises_error( + minimal_bounding_box, geology_file, dtm_file, structure_file, valid_config_dictionary +): + + with pytest.raises(TypeError, match="unexpected keyword argument 'config_file'"): + Project( + bounding_box=minimal_bounding_box, + working_projection="EPSG:28350", + geology_filename=geology_file, + dtm_filename=dtm_file, + structure_filename=structure_file, + config_dictionary=valid_config_dictionary, + config_file="wrong_kwarg.json", + ) + +# 4) Dictionary missing a required key should raise ValueError + +def test_dictionary_missing_required_key_raises_error( + minimal_bounding_box, geology_file, dtm_file, structure_file +): + + invalid_dictionary = { + "structure": {"dipdir_column": "azimuth2", "dip_column": "dip"}, + "geology": {"unitname_column": "unitname"} # alt_unitname_column missing + } + + with pytest.raises(ValueError, match="Missing required key 'alt_unitname_column' for 'geology'"): + Project( + bounding_box=minimal_bounding_box, + working_projection="EPSG:28350", + geology_filename=geology_file, + dtm_filename=dtm_file, + structure_filename=structure_file, + config_dictionary=invalid_dictionary, + ) + +# 5) All good => The Project should be created without errors +def test_good_config_runs_successfully( + minimal_bounding_box, geology_file, dtm_file, 
structure_file, valid_config_dictionary +): + project = None + try: + project = Project( + bounding_box=minimal_bounding_box, + working_projection="EPSG:28350", + geology_filename=geology_file, + dtm_filename=dtm_file, + structure_filename=structure_file, + config_dictionary=valid_config_dictionary, + ) + except Exception as e: + pytest.fail(f"Project initialization raised an unexpected exception: {e}") + + assert project is not None, "Project was not created." + assert project.map_data.config.structure_config["dipdir_column"] == "azimuth2" + assert project.map_data.config.structure_config["dip_column"] == "dip" + assert project.map_data.config.geology_config["unitname_column"] == "unitname" + assert project.map_data.config.geology_config["alt_unitname_column"] == "code" \ No newline at end of file diff --git a/tests/project/test_ignore_codes_setters_getters.py b/tests/project/test_ignore_codes_setters_getters.py index 4cebdba7..0674ca95 100644 --- a/tests/project/test_ignore_codes_setters_getters.py +++ b/tests/project/test_ignore_codes_setters_getters.py @@ -2,6 +2,7 @@ from map2loop.project import Project from map2loop.m2l_enums import Datatype import map2loop +from unittest.mock import patch # Sample test function for lithology and fault ignore codes @@ -20,25 +21,27 @@ def test_set_get_ignore_codes(): config_dictionary = { "structure": {"dipdir_column": "azimuth2", "dip_column": "dip"}, "geology": {"unitname_column": "unitname", "alt_unitname_column": "code"}, + "fault": {'structtype_column': 'feature', 'fault_text': 'Fault'}, } - - project = Project( - working_projection='EPSG:28350', - bounding_box=bbox_3d, - geology_filename=str( - pathlib.Path(map2loop.__file__).parent - / pathlib.Path('_datasets/geodata_files/hamersley/geology.geojson') - ), - fault_filename=str( - pathlib.Path(map2loop.__file__).parent - / pathlib.Path('_datasets/geodata_files/hamersley/faults.geojson') - ), - dtm_filename=str( - pathlib.Path(map2loop.__file__).parent - / pathlib.Path('_datasets/geodata_files/hamersley/dtm_rp.tif') - ), - config_dictionary=config_dictionary, - ) + with patch.object(Project, 'validate_required_inputs', return_value=None): + project = Project( + working_projection='EPSG:28350', + bounding_box=bbox_3d, + geology_filename=str( + pathlib.Path(map2loop.__file__).parent + / pathlib.Path('_datasets/geodata_files/hamersley/geology.geojson') + ), + fault_filename=str( + pathlib.Path(map2loop.__file__).parent + / pathlib.Path('_datasets/geodata_files/hamersley/faults.geojson') + ), + dtm_filename=str( + pathlib.Path(map2loop.__file__).parent + / pathlib.Path('_datasets/geodata_files/hamersley/dtm_rp.tif') + ), + config_dictionary=config_dictionary, + structure_filename="", + ) # Define test ignore codes for lithology and faults lithology_codes = ["cover", "Fortescue_Group", "A_FO_od"]