diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md
new file mode 100644
index 0000000..4a88ce5
--- /dev/null
+++ b/.github/copilot-instructions.md
@@ -0,0 +1,62 @@
+# GitHub Copilot Instructions
+
+## General Guidelines
+
+- Prefer to return early from functions to reduce nesting and improve code readability.
+- Always follow good security practices
+- Use f-strings for string formatting
+- Use logging instead of print statements
+- Follow the DRY principle (Don't Repeat Yourself)
+- Use pathlib for all file operations
+- Follow defensive programming practices
+ - Validate all function inputs
+ - Handle exceptions gracefully
+- Use retry logic for any network calls
+- Add a blank line before return statements in functions
+- Use list/dict/set comprehensions where appropriate
+- Avoid using wildcard imports (e.g., from module import *)
+- Use context managers for file operations (e.g., with open(...) as f:)
+- Prefer using built-in functions and libraries over custom implementations when possible
+- Prefer smaller, focused functions that do one thing well rather than large, monolithic functions
+
+## Commit Message Format
+
+All commits must follow the Conventional Commits format using the Angular preset.
+
+For detailed guidelines on commit types, scopes, and formatting rules, see the release-composite-action README.
+
+## Code Style and Conventions
+
+### Python Style
+- Line length: 120 characters (configured in ruff)
+- Indentation: 4 spaces for Python files
+- Use type hints for all new work
+- Follow PEP 8 conventions
+- Follow ruff style guide and linting rules
+- Use pylint disable comments sparingly and only when necessary (e.g., `# pylint: disable=invalid-name`)
+
+### Documentation
+- Use docstrings for all classes and public methods
+- Follow NumPy/SciPy docstring format with sections:
+ - Brief description
+ - `Attributes` for class attributes
+ - `Parameters` for method parameters
+ - `Returns` for return values
+ - `Methods` for public methods in class docstrings
+
+## Testing Guidelines
+
+- Unit tests are required and must pass before a PR is submitted
+- Mock external services
+- Test both success and failure paths
+- Verify warning messages for invalid configurations
+- Code coverage should be maintained at a high level (tracked via codecov)
+- Test names should be descriptive and follow the pattern `test_<method>_<scenario>` (e.g., `test_try_fix_no_oids_returns_empty_report`)
+
+## Code Quality
+
+- Run `ruff` for linting before committing
+- Maintain test coverage (tracked via codecov)
+- Follow existing patterns in the codebase
+- Keep methods focused and single-purpose
+- Use static methods when methods don't need instance state
diff --git a/.vscode/launch.json b/.vscode/launch.json
index 3b62c65..19bac35 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -1,20 +1,24 @@
{
- // Use IntelliSense to learn about possible attributes.
- // Hover to view descriptions of existing attributes.
- // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
- "version": "0.2.0",
- "configurations": [
- {
- "name": "Python Debugger: Module",
- "type": "debugpy",
- "request": "launch",
- "module": "sweeper",
- "args": [
- "sweep",
- "--workspace",
- "C:\\temp\\locators.gdb"
- ],
- "cwd": "${workspaceFolder}"
- }
- ]
+ // Use IntelliSense to learn about possible attributes.
+ // Hover to view descriptions of existing attributes.
+ // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+ "version": "0.2.0",
+ "configurations": [
+
+ {
+ "name": "Python Debugger: Module",
+ "type": "debugpy",
+ "request": "launch",
+ "module": "sweeper.__main__",
+ "args": [
+ "sweep",
+ "duplicates",
+ "--workspace=C:\\temp\\sweeper_test\\Pre1978.gdb",
+ "--table-name=DAQPre1978LeadInHomes",
+ "--try-fix",
+ "--verbose",
+ "--save-report=C:\\temp\\sweeper_test\\"
+ ]
+ }
+ ]
}
diff --git a/src/sweeper/sweepers/duplicates.py b/src/sweeper/sweepers/duplicates.py
index c8faa20..044d5b8 100644
--- a/src/sweeper/sweepers/duplicates.py
+++ b/src/sweeper/sweepers/duplicates.py
@@ -2,6 +2,8 @@
# * coding: utf8 *
import logging
import re
+import typing
+from typing import Generator
import arcpy
from xxhash import xxh64
@@ -99,30 +101,94 @@ def try_fix(self):
if len(self.oids_with_issues) == 0:
return report
- sql = f'"OBJECTID" IN ({",".join([str(oid) for oid in self.oids_with_issues])})'
+ chunk_size = 1000
+
+ lists_of_oids = list(self._chunk_oid_list(self.oids_with_issues, chunk_size))
temp_feature_layer = "temp_layer"
log.info(f"Workspace is: {self.workspace}")
- #: Delete duplicate rows using different arcpy tools for tables and feature classes
+ log.info(
+ f"Attempting to delete a total of {len(self.oids_with_issues)} duplicate records in {len(lists_of_oids)} batch(es) of {chunk_size} records each"
+ )
+
+ successful_deletes = 0
+ failed_deletes = 0
+ failed_batches = 0
+
with arcpy.EnvManager(workspace=self.workspace):
- if self.is_table:
- duplicate_features = arcpy.management.MakeTableView(self.table_name, temp_feature_layer, sql)
- else:
- duplicate_features = arcpy.management.MakeFeatureLayer(self.table_name, temp_feature_layer, sql)
-
- try:
- log.info(f"attempting to delete {len(self.oids_with_issues)} duplicate records")
- if self.is_table:
- arcpy.management.DeleteRows(duplicate_features)
- else:
- arcpy.management.DeleteFeatures(duplicate_features)
- except Exception as error:
- error_message = f"unable to delete features {error}"
- report["issues"].append(error_message)
- finally:
- if arcpy.Exists(temp_feature_layer):
- arcpy.management.Delete(temp_feature_layer)
-
- report["fixes"].append(f"{len(self.oids_with_issues)} records deleted successfully")
+ all_features = self._make_feature_layer(temp_feature_layer)
+
+ for index, list_of_oids in enumerate(lists_of_oids, start=1):
+ sql = f'"OBJECTID" IN ({",".join([str(oid) for oid in list_of_oids])})'
+
+ try:
+ log.info(f"Batch {index}: attempting to delete {len(list_of_oids)} duplicate records")
+ arcpy.management.SelectLayerByAttribute(all_features, "NEW_SELECTION", sql)
+ if not self._valid_selection(all_features, list_of_oids):
+ raise RuntimeError(
+ f"Invalid selection for batch {index}. The OIDs in the selection do not match the expected OIDs."
+ )
+ self._delete_features_or_rows(all_features)
+ successful_deletes += len(list_of_oids)
+ except Exception as error:
+ error_message = f"unable to delete features in batch {index}: {error}"
+ log.info(error_message)
+ report["issues"].append(error_message)
+ failed_deletes += len(list_of_oids)
+ failed_batches += 1
+ finally:
+ arcpy.management.SelectLayerByAttribute(all_features, "CLEAR_SELECTION")
+
+ if arcpy.Exists(temp_feature_layer):
+ arcpy.management.Delete(temp_feature_layer)
+ if successful_deletes > 0:
+ report["fixes"].append(f"{successful_deletes} records deleted successfully")
+ if failed_deletes > 0:
+ report["issues"].append(
+ f"{failed_batches} batch(es) with {failed_deletes} total records had errors deleting."
+ )
return report
+
+ @staticmethod
+ def _chunk_oid_list(lst: list, chunk_size: int) -> Generator[list, None, None]:
+ """Breaks a list into chunks of chunk_size
+
+ Args:
+ lst (list): Input List
+ chunk_size (int): The desired size per chunk
+
+ Yields:
+ Generator[list, None, None]: The next chunk_size sized chunk
+ """
+ if len(lst) <= chunk_size:
+ yield lst
+ return
+ for i in range(0, len(lst), chunk_size):
+ yield lst[i : i + chunk_size]
+
+ def _make_feature_layer(self, temp_layer_name: str) -> typing.Any:
+ """Single method to handle table or layer creation based on is_table parameter"""
+ #: arcpy's typing gets really convoluted, so we're just using typing.Any.
+
+ if self.is_table:
+ return arcpy.management.MakeTableView(self.table_name, temp_layer_name)
+ else:
+ return arcpy.management.MakeFeatureLayer(self.table_name, temp_layer_name)
+
+ def _delete_features_or_rows(self, feature_layer: typing.Any):
+ """Single method to handle deleting features or rows based on is_table parameter"""
+ if self.is_table:
+ arcpy.management.DeleteRows(feature_layer)
+ else:
+ arcpy.management.DeleteFeatures(feature_layer)
+
+ @staticmethod
+ def _valid_selection(feature_layer_or_table: typing.Any, list_of_oids: list) -> bool:
+ """Makes sure the selection is valid by comparing the OBJECTIDs in the selection to the list of OBJECTIDs used to create the where clause. This is a safeguard to prevent deleting the entire table if something goes wrong with the where clause."""
+ with arcpy.da.SearchCursor(feature_layer_or_table, ["OID@"]) as cursor:
+ selected_oids = {str(row[0]) for row in cursor}
+ if set([str(oid) for oid in list_of_oids]) != selected_oids:
+ log.info("Selected OIDs do not match expected OIDs.")
+ return False
+ return True
diff --git a/tests/test_duplicates.py b/tests/test_duplicates.py
new file mode 100644
index 0000000..e3acb88
--- /dev/null
+++ b/tests/test_duplicates.py
@@ -0,0 +1,493 @@
+import sys
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+# ruff: isort: off
+#: Mock arcpy and its submodules before importing the module under test
+arcpy_mock = MagicMock()
+_MOCK_MODULE_NAMES = [
+ "arcpy",
+ "arcpy.da",
+ "arcpy._mp",
+ "arcpy.typing",
+ "arcpy.typing.gp",
+ "xxhash",
+]
+_originals = {name: sys.modules.get(name) for name in _MOCK_MODULE_NAMES}
+sys.modules["arcpy"] = arcpy_mock
+sys.modules["arcpy.da"] = arcpy_mock.da
+sys.modules["arcpy._mp"] = arcpy_mock._mp
+sys.modules["arcpy.typing"] = arcpy_mock.typing
+sys.modules["arcpy.typing.gp"] = arcpy_mock.typing.gp
+sys.modules["xxhash"] = MagicMock()
+
+from sweeper.sweepers.duplicates import DuplicateTest # noqa: E402
+
+# ruff: isort: on
+
+
+@pytest.fixture(autouse=True, scope="module")
+def _restore_sys_modules():
+ """Restore sys.modules entries modified by this module after all tests complete."""
+ yield
+ for name, original in _originals.items():
+ if original is None:
+ sys.modules.pop(name, None)
+ else:
+ sys.modules[name] = original
+
+
+class TestChunkOidList:
+ def test_chunk_oid_list_small_list_returns_single_chunk(self):
+ lst = [1, 2, 3]
+ result = list(DuplicateTest._chunk_oid_list(lst, 10))
+
+ assert result == [[1, 2, 3]]
+
+ def test_chunk_oid_list_exact_chunk_size_returns_single_chunk(self):
+ lst = list(range(10))
+ result = list(DuplicateTest._chunk_oid_list(lst, 10))
+
+ assert result == [list(range(10))]
+
+ def test_chunk_oid_list_large_list_returns_multiple_chunks(self):
+ lst = list(range(25))
+ result = list(DuplicateTest._chunk_oid_list(lst, 10))
+
+ assert len(result) == 3
+ assert result[0] == list(range(10))
+ assert result[1] == list(range(10, 20))
+ assert result[2] == list(range(20, 25))
+
+ def test_chunk_oid_list_empty_list_returns_single_empty_chunk(self):
+ result = list(DuplicateTest._chunk_oid_list([], 10))
+
+ assert result == [[]]
+
+ def test_chunk_oid_list_exactly_two_chunks(self):
+ lst = list(range(20))
+ result = list(DuplicateTest._chunk_oid_list(lst, 10))
+
+ assert len(result) == 2
+ assert result[0] == list(range(10))
+ assert result[1] == list(range(10, 20))
+
+
+class TestMakeFeatureLayer:
+ def setup_method(self):
+ arcpy_mock.reset_mock()
+
+ def test_make_feature_layer_is_table_calls_make_table_view(self):
+ sweeper = DuplicateTest("workspace", "my_table")
+ sweeper.is_table = True
+ sweeper._make_feature_layer("temp_layer")
+
+ arcpy_mock.management.MakeTableView.assert_called_once_with("my_table", "temp_layer")
+
+ def test_make_feature_layer_is_not_table_calls_make_feature_layer(self):
+ sweeper = DuplicateTest("workspace", "my_fc")
+ sweeper._make_feature_layer("temp_layer")
+
+ arcpy_mock.management.MakeFeatureLayer.assert_called_once_with("my_fc", "temp_layer")
+
+
+class TestDeleteFeaturesOrRows:
+ def setup_method(self):
+ arcpy_mock.reset_mock()
+
+ def test_delete_features_or_rows_is_table_calls_delete_rows(self):
+ sweeper = DuplicateTest("workspace", "my_table")
+ sweeper.is_table = True
+ layer_mock = MagicMock()
+ sweeper._delete_features_or_rows(layer_mock)
+
+ arcpy_mock.management.DeleteRows.assert_called_once_with(layer_mock)
+
+ def test_delete_features_or_rows_is_not_table_calls_delete_features(self):
+ sweeper = DuplicateTest("workspace", "my_fc")
+ layer_mock = MagicMock()
+ sweeper._delete_features_or_rows(layer_mock)
+
+ arcpy_mock.management.DeleteFeatures.assert_called_once_with(layer_mock)
+
+
+class TestTryFix:
+ def setup_method(self):
+ arcpy_mock.reset_mock()
+ arcpy_mock.management.SelectLayerByAttribute.side_effect = None
+ arcpy_mock.EnvManager.return_value.__enter__ = MagicMock(return_value=None)
+ arcpy_mock.EnvManager.return_value.__exit__ = MagicMock(return_value=False)
+ self._valid_selection_patcher = patch.object(DuplicateTest, "_valid_selection", return_value=True)
+ self._valid_selection_patcher.start()
+
+ def teardown_method(self):
+ self._valid_selection_patcher.stop()
+
+ def test_try_fix_no_oids_returns_empty_report(self):
+ sweeper = DuplicateTest("workspace", "my_table")
+
+ report = sweeper.try_fix()
+
+ assert report["title"] == "Duplicate Try Fix"
+ assert report["feature_class"] == "my_table"
+ assert report["issues"] == []
+ assert report["fixes"] == []
+
+ def test_try_fix_successful_deletes_reports_count(self):
+ sweeper = DuplicateTest("workspace", "my_table")
+ sweeper.oids_with_issues = [1, 2, 3]
+ layer_mock = MagicMock()
+ arcpy_mock.management.MakeFeatureLayer.return_value = layer_mock
+
+ report = sweeper.try_fix()
+
+ assert "3 records deleted successfully" in report["fixes"]
+ assert report["issues"] == []
+
+ def test_try_fix_failed_batch_reports_error(self):
+ sweeper = DuplicateTest("workspace", "my_table")
+ sweeper.oids_with_issues = [1, 2, 3]
+ layer_mock = MagicMock()
+ arcpy_mock.management.MakeFeatureLayer.return_value = layer_mock
+ arcpy_mock.management.SelectLayerByAttribute.side_effect = [Exception("delete error"), MagicMock()]
+
+ report = sweeper.try_fix()
+
+ assert any("unable to delete features in batch 1" in issue for issue in report["issues"])
+ assert any("1 batch(es) with 3 total records had errors deleting." in issue for issue in report["issues"])
+
+ def test_try_fix_clear_selection_called_in_finally(self):
+ sweeper = DuplicateTest("workspace", "my_table")
+ sweeper.oids_with_issues = [1, 2]
+ layer_mock = MagicMock()
+ arcpy_mock.management.MakeFeatureLayer.return_value = layer_mock
+ arcpy_mock.management.SelectLayerByAttribute.side_effect = [Exception("fail"), None]
+
+ sweeper.try_fix()
+
+ clear_calls = [
+ c for c in arcpy_mock.management.SelectLayerByAttribute.call_args_list if "CLEAR_SELECTION" in c.args
+ ]
+ assert len(clear_calls) >= 1
+
+ def test_try_fix_is_table_uses_make_table_view(self):
+ sweeper = DuplicateTest("workspace", "my_table")
+ sweeper.oids_with_issues = [1]
+ sweeper.is_table = True
+ table_view_mock = MagicMock()
+ arcpy_mock.management.MakeTableView.return_value = table_view_mock
+
+ report = sweeper.try_fix()
+
+ arcpy_mock.management.MakeTableView.assert_called_once_with("my_table", "temp_layer")
+ assert "1 records deleted successfully" in report["fixes"]
+
+ def test_try_fix_multiple_batches_accumulates_successful_deletes(self):
+ sweeper = DuplicateTest("workspace", "my_table")
+ sweeper.oids_with_issues = list(range(1500))
+ layer_mock = MagicMock()
+ arcpy_mock.management.MakeFeatureLayer.return_value = layer_mock
+ arcpy_mock.management.SelectLayerByAttribute.side_effect = None
+ arcpy_mock.management.SelectLayerByAttribute.return_value = MagicMock()
+
+ report = sweeper.try_fix()
+
+ assert "1500 records deleted successfully" in report["fixes"]
+
+ def test_try_fix_partial_failure_reports_correct_counts(self):
+ sweeper = DuplicateTest("workspace", "my_table")
+ sweeper.oids_with_issues = list(range(1500))
+ layer_mock = MagicMock()
+ arcpy_mock.management.MakeFeatureLayer.return_value = layer_mock
+
+ call_count = 0
+
+ def select_side_effect(*args, **kwargs):
+ nonlocal call_count
+ if "CLEAR_SELECTION" in args:
+ return MagicMock()
+ call_count += 1
+ if call_count == 1:
+ raise Exception("batch 1 failed")
+
+ return MagicMock()
+
+ arcpy_mock.management.SelectLayerByAttribute.side_effect = select_side_effect
+
+ report = sweeper.try_fix()
+
+ assert any("500 records deleted successfully" in fix for fix in report["fixes"])
+ assert any("1 batch(es) with 1000 total records had errors deleting." in issue for issue in report["issues"])
+
+ def test_try_fix_invalid_selection_reports_error(self):
+ sweeper = DuplicateTest("workspace", "my_table")
+ sweeper.oids_with_issues = [1, 2, 3]
+ layer_mock = MagicMock()
+ arcpy_mock.management.MakeFeatureLayer.return_value = layer_mock
+
+ with patch.object(DuplicateTest, "_valid_selection", return_value=False):
+ report = sweeper.try_fix()
+
+ assert any("Invalid selection for batch 1" in issue for issue in report["issues"])
+ assert any("1 batch(es) with 3 total records had errors deleting." in issue for issue in report["issues"])
+
+ def test_try_fix_continues_after_invalid_selection_batch(self):
+ sweeper = DuplicateTest("workspace", "my_table")
+ sweeper.oids_with_issues = list(range(1500))
+ layer_mock = MagicMock()
+ arcpy_mock.management.MakeFeatureLayer.return_value = layer_mock
+
+ #: Return False for batch 1, True for all subsequent batches
+ valid_selection_results = iter([False, True, True])
+ with patch.object(DuplicateTest, "_valid_selection", side_effect=valid_selection_results):
+ report = sweeper.try_fix()
+
+ assert any("500 records deleted successfully" in fix for fix in report["fixes"])
+ assert any("Invalid selection for batch 1" in issue for issue in report["issues"])
+ assert any("1 batch(es) with 1000 total records had errors deleting." in issue for issue in report["issues"])
+
+
+class TestValidSelection:
+ def setup_method(self):
+ arcpy_mock.reset_mock()
+
+ def _make_cursor_mock(self, rows):
+ cursor_mock = MagicMock()
+ cursor_mock.__enter__ = MagicMock(return_value=iter(rows))
+ cursor_mock.__exit__ = MagicMock(return_value=False)
+ arcpy_mock.da.SearchCursor.return_value = cursor_mock
+
+ def test_valid_selection_returns_true_when_oids_match(self):
+ self._make_cursor_mock([(1,), (2,), (3,)])
+
+ result = DuplicateTest._valid_selection(MagicMock(), [1, 2, 3])
+
+ assert result is True
+
+ def test_valid_selection_returns_false_when_oids_do_not_match(self):
+ self._make_cursor_mock([(1,), (2,)])
+
+ result = DuplicateTest._valid_selection(MagicMock(), [1, 2, 3])
+
+ assert result is False
+
+ def test_valid_selection_returns_false_when_selection_is_empty(self):
+ self._make_cursor_mock([])
+
+ result = DuplicateTest._valid_selection(MagicMock(), [1, 2, 3])
+
+ assert result is False
+
+ def test_valid_selection_returns_true_for_single_oid_match(self):
+ self._make_cursor_mock([(42,)])
+
+ result = DuplicateTest._valid_selection(MagicMock(), [42])
+
+ assert result is True
+
+
+#: This test class was generated by Copilot while creating tests for the try_fix method. The tests pass, but I have not yet verified that they are actually sane.
+class TestSweep:
+ def setup_method(self):
+ arcpy_mock.reset_mock()
+ arcpy_mock.EnvManager.return_value.__enter__ = MagicMock(return_value=None)
+ arcpy_mock.EnvManager.return_value.__exit__ = MagicMock(return_value=False)
+
+ def _make_field(self, name: str) -> MagicMock:
+ field = MagicMock()
+ field.name = name
+
+ return field
+
+ def test_sweep_table_no_duplicates_returns_empty_issues(self):
+ sweeper = DuplicateTest("workspace", "my_table")
+ description = {
+ "dataType": "Table",
+ "hasGlobalID": False,
+ "hasOID": True,
+ "OIDFieldName": "OBJECTID",
+ "fields": [self._make_field("Name"), self._make_field("OBJECTID")],
+ "shapeFieldName": "Shape",
+ "globalIDFieldName": "GlobalID",
+ }
+ arcpy_mock.da.Describe.return_value = description
+
+ rows = [("Alice", 1), ("Bob", 2)]
+ cursor_mock = MagicMock()
+ cursor_mock.__enter__ = MagicMock(return_value=iter(rows))
+ cursor_mock.__exit__ = MagicMock(return_value=False)
+ arcpy_mock.da.SearchCursor.return_value = cursor_mock
+
+ # import xxhash #: This is unused; not sure if manually inserting the mock into sys.modules works; this was a copilot-created test.
+
+ real_hashes = {}
+
+ def fake_xxh64(data):
+ hasher = MagicMock()
+ hasher.hexdigest.return_value = str(hash(data))
+ real_hashes[data] = hasher.hexdigest.return_value
+
+ return hasher
+
+ sys.modules["xxhash"].xxh64.side_effect = fake_xxh64
+
+ report = sweeper.sweep()
+
+ assert report["issues"] == []
+ assert sweeper.is_table is True
+
+ def test_sweep_table_with_duplicates_reports_oids(self):
+ sweeper = DuplicateTest("workspace", "my_table")
+ description = {
+ "dataType": "Table",
+ "hasGlobalID": False,
+ "hasOID": True,
+ "OIDFieldName": "OBJECTID",
+ "fields": [self._make_field("Name"), self._make_field("OBJECTID")],
+ "shapeFieldName": "Shape",
+ "globalIDFieldName": "GlobalID",
+ }
+ arcpy_mock.da.Describe.return_value = description
+
+ rows = [("Alice", 1), ("Alice", 2)]
+ cursor_mock = MagicMock()
+ cursor_mock.__enter__ = MagicMock(return_value=iter(rows))
+ cursor_mock.__exit__ = MagicMock(return_value=False)
+ arcpy_mock.da.SearchCursor.return_value = cursor_mock
+
+ digest_counter = {"count": 0}
+
+ def fake_xxh64(data):
+ hasher = MagicMock()
+ digest_counter["count"] += 1
+ #: Return the same hash for both rows to simulate duplicate
+ hasher.hexdigest.return_value = "same_hash"
+
+ return hasher
+
+ sys.modules["xxhash"].xxh64.side_effect = fake_xxh64
+
+ report = sweeper.sweep()
+
+ assert "2" in report["issues"]
+ assert 2 in sweeper.oids_with_issues
+
+ def test_sweep_feature_class_skips_none_shape(self):
+ sweeper = DuplicateTest("workspace", "my_fc")
+ description = {
+ "dataType": "FeatureClass",
+ "hasGlobalID": False,
+ "hasOID": True,
+ "OIDFieldName": "OBJECTID",
+ "shapeFieldName": "Shape",
+ "globalIDFieldName": "GlobalID",
+ "fields": [self._make_field("Name"), self._make_field("OBJECTID"), self._make_field("Shape")],
+ }
+ arcpy_mock.da.Describe.return_value = description
+
+ rows = [("Alice", 1, None)]
+ cursor_mock = MagicMock()
+ cursor_mock.__enter__ = MagicMock(return_value=iter(rows))
+ cursor_mock.__exit__ = MagicMock(return_value=False)
+ arcpy_mock.da.SearchCursor.return_value = cursor_mock
+
+ report = sweeper.sweep()
+
+ assert report["issues"] == []
+ assert sweeper.is_table is False
+
+ def test_sweep_feature_class_with_duplicates_reports_oids(self):
+ sweeper = DuplicateTest("workspace", "my_fc")
+ description = {
+ "dataType": "FeatureClass",
+ "hasGlobalID": False,
+ "hasOID": True,
+ "OIDFieldName": "OBJECTID",
+ "shapeFieldName": "Shape",
+ "globalIDFieldName": "GlobalID",
+ "fields": [self._make_field("Name"), self._make_field("OBJECTID"), self._make_field("Shape")],
+ }
+ arcpy_mock.da.Describe.return_value = description
+
+ rows = [
+ ("Alice", 1, "POINT (1.123456 2.123456)"),
+ ("Alice", 2, "POINT (1.123456 2.123456)"),
+ ]
+ cursor_mock = MagicMock()
+ cursor_mock.__enter__ = MagicMock(return_value=iter(rows))
+ cursor_mock.__exit__ = MagicMock(return_value=False)
+ arcpy_mock.da.SearchCursor.return_value = cursor_mock
+
+ def fake_xxh64(data):
+ hasher = MagicMock()
+ hasher.hexdigest.return_value = "same_hash"
+
+ return hasher
+
+ sys.modules["xxhash"].xxh64.side_effect = fake_xxh64
+
+ report = sweeper.sweep()
+
+ assert "2" in report["issues"]
+ assert 2 in sweeper.oids_with_issues
+
+ def test_sweep_skips_global_id_field(self):
+ sweeper = DuplicateTest("workspace", "my_table")
+ description = {
+ "dataType": "Table",
+ "hasGlobalID": True,
+ "globalIDFieldName": "GlobalID",
+ "hasOID": True,
+ "OIDFieldName": "OBJECTID",
+ "shapeFieldName": "Shape",
+ "fields": [self._make_field("Name"), self._make_field("GlobalID"), self._make_field("OBJECTID")],
+ }
+ arcpy_mock.da.Describe.return_value = description
+
+ rows = [("Alice", 1)]
+ cursor_mock = MagicMock()
+ cursor_mock.__enter__ = MagicMock(return_value=iter(rows))
+ cursor_mock.__exit__ = MagicMock(return_value=False)
+ arcpy_mock.da.SearchCursor.return_value = cursor_mock
+
+ def fake_xxh64(data):
+ hasher = MagicMock()
+ hasher.hexdigest.return_value = "unique_hash"
+
+ return hasher
+
+ sys.modules["xxhash"].xxh64.side_effect = fake_xxh64
+
+ sweeper.sweep()
+
+ call_args = arcpy_mock.da.SearchCursor.call_args
+ fields_used = call_args[0][1]
+ assert "GlobalID" not in fields_used
+
+ def test_sweep_returns_correct_report_structure(self):
+ sweeper = DuplicateTest("workspace", "my_table")
+ description = {
+ "dataType": "Table",
+ "hasGlobalID": False,
+ "hasOID": True,
+ "OIDFieldName": "OBJECTID",
+ "shapeFieldName": "Shape",
+ "globalIDFieldName": "GlobalID",
+ "fields": [self._make_field("Name"), self._make_field("OBJECTID")],
+ }
+ arcpy_mock.da.Describe.return_value = description
+
+ rows = []
+ cursor_mock = MagicMock()
+ cursor_mock.__enter__ = MagicMock(return_value=iter(rows))
+ cursor_mock.__exit__ = MagicMock(return_value=False)
+ arcpy_mock.da.SearchCursor.return_value = cursor_mock
+
+ report = sweeper.sweep()
+
+ assert "title" in report
+ assert "feature_class" in report
+ assert "issues" in report
+ assert report["title"] == "Duplicate Test"
+ assert report["feature_class"] == "my_table"