agrc · jacobdadams · Mar 23, 2026 · Mar 20, 2026 · Mar 20, 2026 · Mar 20, 2026
diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md
@@ -0,0 +1,62 @@
+# GitHub Copilot Instructions
+
+## General Guidelines
+
+- Prefer to return early from functions to reduce nesting and improve code readability.
+- Always follow good security practices
+- Use f-strings for string formatting
+- Use logging instead of print statements
+- Follow the DRY principle (Don't Repeat Yourself)
+- Use pathlib for all file operations
+- Follow defensive programming practices
+  - Validate all function inputs
+  - Handle exceptions gracefully
+- Use retry logic for any network calls
+- Add a blank line before return statements in functions
+- Use list/dict/set comprehensions where appropriate
+- Avoid using wildcard imports (e.g., from module import *)
+- Use context managers for file operations (e.g., with open(...) as f:)
+- Prefer using built-in functions and libraries over custom implementations when possible
+- Prefer smaller, focused functions that do one thing well rather than large, monolithic functions
+
+## Commit Message Format
+
+All commits must follow the [Conventional Commits](https://www.conventionalcommits.org/) format using the Angular preset.
+
+For detailed guidelines on commit types, scopes, and formatting rules, see the [release-composite-action README](https://github.com/agrc/release-composite-action/blob/main/README.md#commits).
+
+## Code Style and Conventions
+
+### Python Style
+- Line length: 120 characters (configured in ruff)
+- Indentation: 4 spaces for Python files
+- Use type hints for all new work
+- Follow PEP 8 conventions
+- Follow ruff style guide and linting rules
+- Use pylint disable comments sparingly and only when necessary (e.g., `# pylint: disable=invalid-name`)
+
+### Documentation
+- Use docstrings for all classes and public methods
+- Follow NumPy/SciPy docstring format with sections:
+  - Brief description
+  - `Attributes` for class attributes
+  - `Parameters` for method parameters
+  - `Returns` for return values
+  - `Methods` for public methods in class docstrings
+
+## Testing Guidelines
+
+- Unit tests are required and must pass before a PR is submitted
+- Mock external services
+- Test both success and failure paths
+- Verify warning messages for invalid configurations
+- Code coverage should be maintained at a high level (tracked via codecov)
+- Test names should be descriptive and follow the pattern `test_<method_name>_<expected_behavior>`
+
+## Code Quality
+
+- Run `ruff` for linting before committing
+- Maintain test coverage (tracked via codecov)
+- Follow existing patterns in the codebase
+- Keep methods focused and single-purpose
+- Use static methods when methods don't need instance state
diff --git a/.vscode/launch.json b/.vscode/launch.json
@@ -1,20 +1,24 @@
 {
-  // Use IntelliSense to learn about possible attributes.
-  // Hover to view descriptions of existing attributes.
-  // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
-  "version": "0.2.0",
-  "configurations": [
-    {
-      "name": "Python Debugger: Module",
-      "type": "debugpy",
-      "request": "launch",
-      "module": "sweeper",
-      "args": [
-        "sweep",
-        "--workspace",
-        "C:\\temp\\locators.gdb"
-      ],
-      "cwd": "${workspaceFolder}"
-    }
-  ]
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+
+        {
+            "name": "Python Debugger: Module",
+            "type": "debugpy",
+            "request": "launch",
+            "module": "sweeper.__main__",
+            "args": [
+                "sweep",
+                "duplicates",
+                "--workspace=C:\\temp\\sweeper_test\\Pre1978.gdb",
+                "--table-name=DAQPre1978LeadInHomes",
+                "--try-fix",
+                "--verbose",
+                "--save-report=C:\\temp\\sweeper_test\\"
+            ]
+        }
+    ]
 }
diff --git a/src/sweeper/sweepers/duplicates.py b/src/sweeper/sweepers/duplicates.py
@@ -2,6 +2,8 @@
 # * coding: utf8 *
 import logging
 import re
+import typing
+from typing import Generator
 
 import arcpy
 from xxhash import xxh64
@@ -99,30 +101,94 @@ def try_fix(self):
         if len(self.oids_with_issues) == 0:
             return report
 
-        sql = f'"OBJECTID" IN ({",".join([str(oid) for oid in self.oids_with_issues])})'
+        chunk_size = 1000
+
+        lists_of_oids = list(self._chunk_oid_list(self.oids_with_issues, chunk_size))
         temp_feature_layer = "temp_layer"
 
         log.info(f"Workspace is:   {self.workspace}")
-        #: Delete duplicate rows using different arcpy tools for tables and feature classes
+        log.info(
+            f"Attempting to delete a total of {len(self.oids_with_issues)} duplicate records in {len(lists_of_oids)} batch(es) of {chunk_size} records each"
+        )
+
+        successful_deletes = 0
+        failed_deletes = 0
+        failed_batches = 0
+
         with arcpy.EnvManager(workspace=self.workspace):
-            if self.is_table:
-                duplicate_features = arcpy.management.MakeTableView(self.table_name, temp_feature_layer, sql)
-            else:
-                duplicate_features = arcpy.management.MakeFeatureLayer(self.table_name, temp_feature_layer, sql)
-
-            try:
-                log.info(f"attempting to delete {len(self.oids_with_issues)} duplicate records")
-                if self.is_table:
-                    arcpy.management.DeleteRows(duplicate_features)
-                else:
-                    arcpy.management.DeleteFeatures(duplicate_features)
-            except Exception as error:
-                error_message = f"unable to delete features {error}"
-                report["issues"].append(error_message)
-            finally:
-                if arcpy.Exists(temp_feature_layer):
-                    arcpy.management.Delete(temp_feature_layer)
-
-            report["fixes"].append(f"{len(self.oids_with_issues)} records deleted successfully")
+            all_features = self._make_feature_layer(temp_feature_layer)
+
+            for index, list_of_oids in enumerate(lists_of_oids, start=1):
+                sql = f'"OBJECTID" IN ({",".join([str(oid) for oid in list_of_oids])})'
+
+                try:
+                    log.info(f"Batch {index}: attempting to delete {len(list_of_oids)} duplicate records")
+                    arcpy.management.SelectLayerByAttribute(all_features, "NEW_SELECTION", sql)
+                    if not self._valid_selection(all_features, list_of_oids):
+                        raise RuntimeError(
+                            f"Invalid selection for batch {index}. The OIDs in the selection do not match the expected OIDs."
+                        )
+                    self._delete_features_or_rows(all_features)
+                    successful_deletes += len(list_of_oids)
+                except Exception as error:
+                    error_message = f"unable to delete features in batch {index}: {error}"
+                    log.info(error_message)
+                    report["issues"].append(error_message)
+                    failed_deletes += len(list_of_oids)
+                    failed_batches += 1
+                finally:
+                    arcpy.management.SelectLayerByAttribute(all_features, "CLEAR_SELECTION")
+
+            if arcpy.Exists(temp_feature_layer):
+                arcpy.management.Delete(temp_feature_layer)
+            if successful_deletes > 0:
+                report["fixes"].append(f"{successful_deletes} records deleted successfully")
+            if failed_deletes > 0:
+                report["issues"].append(
+                    f"{failed_batches} batch(es) with {failed_deletes} total records had errors deleting."
+                )
 
         return report
+
+    @staticmethod
+    def _chunk_oid_list(lst: list, chunk_size: int) -> Generator[list, None, None]:
+        """Breaks a list into chunks of chunk_size
+
+        Args:
+            lst (list): Input List
+            chunk_size (int): The desired size per chunk
+
+        Yields:
+            Generator[list, None, None]: The next chunk_size sized chunk
+        """
+        if len(lst) <= chunk_size:
+            yield lst
+            return
+        for i in range(0, len(lst), chunk_size):
+            yield lst[i : i + chunk_size]
+
+    def _make_feature_layer(self, temp_layer_name: str) -> typing.Any:
+        """Single method to handle table or layer creation based on is_table parameter"""
+        #: arcpy's typing gets really convoluted, so we're just using typing.Any.
+
+        if self.is_table:
+            return arcpy.management.MakeTableView(self.table_name, temp_layer_name)
+        else:
+            return arcpy.management.MakeFeatureLayer(self.table_name, temp_layer_name)
+
+    def _delete_features_or_rows(self, feature_layer: typing.Any):
+        """Single method to handle deleting features or rows based on is_table parameter"""
+        if self.is_table:
+            arcpy.management.DeleteRows(feature_layer)
+        else:
+            arcpy.management.DeleteFeatures(feature_layer)
+
+    @staticmethod
+    def _valid_selection(feature_layer_or_table: typing.Any, list_of_oids: list) -> bool:
+        """Makes sure the selection is valid by comparing the OBJECTIDs in the selection to the list of OBJECTIDs used to create the where clause. This is a safeguard to prevent deleting the entire table if something goes wrong with the where clause."""
+        with arcpy.da.SearchCursor(feature_layer_or_table, ["OID@"]) as cursor:
+            selected_oids = {str(row[0]) for row in cursor}
+        if set([str(oid) for oid in list_of_oids]) != selected_oids:
+            log.info("Selected OIDs do not match expected OIDs.")
+            return False
+        return True