Skip to content
62 changes: 62 additions & 0 deletions .github/copilot-instructions.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# GitHub Copilot Instructions

## General Guidelines

- Prefer to return early from functions to reduce nesting and improve code readability.
- Always follow good security practices
- Use f-strings for string formatting
- Use logging instead of print statements
- Follow the DRY principle (Don't Repeat Yourself)
- Use pathlib for all file operations
- Follow defensive programming practices
- Validate all function inputs
- Handle exceptions gracefully
- Use retry logic for any network calls
- Add a blank line before return statements in functions
- Use list/dict/set comprehensions where appropriate
- Avoid using wildcard imports (e.g., from module import *)
- Use context managers for file operations (e.g., with open(...) as f:)
- Prefer using built-in functions and libraries over custom implementations when possible
- Prefer smaller, focused functions that do one thing well rather than large, monolithic functions

## Commit Message Format

All commits must follow the [Conventional Commits](https://www.conventionalcommits.org/) format using the Angular preset.

For detailed guidelines on commit types, scopes, and formatting rules, see the [release-composite-action README](https://github.com/agrc/release-composite-action/blob/main/README.md#commits).

## Code Style and Conventions

### Python Style
- Line length: 120 characters (configured in ruff)
- Indentation: 4 spaces for Python files
- Use type hints for all new work
- Follow PEP 8 conventions
- Follow ruff style guide and linting rules
- Use pylint disable comments sparingly and only when necessary (e.g., `# pylint: disable=invalid-name`)

### Documentation
- Use docstrings for all classes and public methods
- Follow NumPy/SciPy docstring format with sections:
  - Brief description
  - `Attributes` for class attributes
  - `Parameters` for method parameters
  - `Returns` for return values
  - `Methods` for public methods in class docstrings

## Testing Guidelines

- Unit tests are required, and they must pass before a PR is submitted
- Mock external services
- Test both success and failure paths
- Verify warning messages for invalid configurations
- Code coverage should be maintained at a high level (tracked via codecov)
- Test names should be descriptive and follow the pattern `test_<method_name>_<expected_behavior>`

## Code Quality

- Run `ruff` for linting before committing
- Maintain test coverage (tracked via codecov)
- Follow existing patterns in the codebase
- Keep methods focused and single-purpose
- Use static methods when methods don't need instance state
40 changes: 22 additions & 18 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
@@ -1,20 +1,24 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Python Debugger: Module",
"type": "debugpy",
"request": "launch",
"module": "sweeper",
"args": [
"sweep",
"--workspace",
"C:\\temp\\locators.gdb"
],
"cwd": "${workspaceFolder}"
}
]
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [

{
"name": "Python Debugger: Module",
"type": "debugpy",
"request": "launch",
"module": "sweeper.__main__",
"args": [
"sweep",
"duplicates",
"--workspace=C:\\temp\\sweeper_test\\Pre1978.gdb",
"--table-name=DAQPre1978LeadInHomes",
"--try-fix",
"--verbose",
"--save-report=C:\\temp\\sweeper_test\\"
]
}
]
}
108 changes: 87 additions & 21 deletions src/sweeper/sweepers/duplicates.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
# * coding: utf8 *
import logging
import re
import typing
from typing import Generator

import arcpy
from xxhash import xxh64
Expand Down Expand Up @@ -99,30 +101,94 @@ def try_fix(self):
if len(self.oids_with_issues) == 0:
return report

sql = f'"OBJECTID" IN ({",".join([str(oid) for oid in self.oids_with_issues])})'
chunk_size = 1000

lists_of_oids = list(self._chunk_oid_list(self.oids_with_issues, chunk_size))
temp_feature_layer = "temp_layer"

log.info(f"Workspace is: {self.workspace}")
#: Delete duplicate rows using different arcpy tools for tables and feature classes
log.info(
f"Attempting to delete a total of {len(self.oids_with_issues)} duplicate records in {len(lists_of_oids)} batch(es) of {chunk_size} records each"
)

successful_deletes = 0
failed_deletes = 0
failed_batches = 0

with arcpy.EnvManager(workspace=self.workspace):
if self.is_table:
duplicate_features = arcpy.management.MakeTableView(self.table_name, temp_feature_layer, sql)
else:
duplicate_features = arcpy.management.MakeFeatureLayer(self.table_name, temp_feature_layer, sql)

try:
log.info(f"attempting to delete {len(self.oids_with_issues)} duplicate records")
if self.is_table:
arcpy.management.DeleteRows(duplicate_features)
else:
arcpy.management.DeleteFeatures(duplicate_features)
except Exception as error:
error_message = f"unable to delete features {error}"
report["issues"].append(error_message)
finally:
if arcpy.Exists(temp_feature_layer):
arcpy.management.Delete(temp_feature_layer)

report["fixes"].append(f"{len(self.oids_with_issues)} records deleted successfully")
all_features = self._make_feature_layer(temp_feature_layer)

for index, list_of_oids in enumerate(lists_of_oids, start=1):
sql = f'"OBJECTID" IN ({",".join([str(oid) for oid in list_of_oids])})'

try:
log.info(f"Batch {index}: attempting to delete {len(list_of_oids)} duplicate records")
arcpy.management.SelectLayerByAttribute(all_features, "NEW_SELECTION", sql)
if not self._valid_selection(all_features, list_of_oids):
raise RuntimeError(
f"Invalid selection for batch {index}. The OIDs in the selection do not match the expected OIDs."
)
self._delete_features_or_rows(all_features)
successful_deletes += len(list_of_oids)
except Exception as error:
error_message = f"unable to delete features in batch {index}: {error}"
log.info(error_message)
report["issues"].append(error_message)
failed_deletes += len(list_of_oids)
failed_batches += 1
finally:
arcpy.management.SelectLayerByAttribute(all_features, "CLEAR_SELECTION")

if arcpy.Exists(temp_feature_layer):
arcpy.management.Delete(temp_feature_layer)
if successful_deletes > 0:
report["fixes"].append(f"{successful_deletes} records deleted successfully")
if failed_deletes > 0:
report["issues"].append(
f"{failed_batches} batch(es) with {failed_deletes} total records had errors deleting."
)

return report

@staticmethod
def _chunk_oid_list(lst: list, chunk_size: int) -> Generator[list, None, None]:
    """Break a list into consecutive chunks of at most ``chunk_size`` items.

    Parameters
    ----------
    lst : list
        The list to split. It is not modified.
    chunk_size : int
        Maximum number of items per chunk. Must be at least 1.

    Yields
    ------
    list
        The next chunk. When the whole list fits in a single chunk (an empty
        list included), ``lst`` itself is yielded exactly once. Otherwise each
        chunk is a new slice of ``lst`` and only the last one may be shorter
        than ``chunk_size``.

    Raises
    ------
    ValueError
        If ``chunk_size`` is less than 1. Because this is a generator, the
        error is raised when iteration starts, not when the method is called.
    """
    #: a negative step makes range() empty, which would silently drop every item;
    #: a zero step fails with an opaque range() error. Reject both up front.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    if len(lst) <= chunk_size:
        yield lst

        return

    for start in range(0, len(lst), chunk_size):
        yield lst[start : start + chunk_size]

def _make_feature_layer(self, temp_layer_name: str) -> typing.Any:
    """Create the temporary table view or feature layer for ``self.table_name``.

    Parameters
    ----------
    temp_layer_name : str
        Name passed to the arcpy tool for the view or layer that is created.

    Returns
    -------
    typing.Any
        The result of ``MakeTableView`` when ``self.is_table`` is truthy,
        otherwise the result of ``MakeFeatureLayer``.
    """
    #: arcpy's own typing is convoluted, so the result is simply annotated as typing.Any
    make_tool = arcpy.management.MakeTableView if self.is_table else arcpy.management.MakeFeatureLayer

    return make_tool(self.table_name, temp_layer_name)

def _delete_features_or_rows(self, feature_layer: typing.Any):
    """Delete from the given view or layer with the tool that matches ``self.is_table``.

    Parameters
    ----------
    feature_layer : typing.Any
        The table view or feature layer handed to the arcpy delete tool.
    """
    #: tables are handled by DeleteRows, everything else by DeleteFeatures
    delete_tool = arcpy.management.DeleteRows if self.is_table else arcpy.management.DeleteFeatures
    delete_tool(feature_layer)

@staticmethod
def _valid_selection(feature_layer_or_table: typing.Any, list_of_oids: list) -> bool:
    """Check that the rows read from the layer or view are exactly the expected OIDs.

    This is a safeguard against deleting the entire table if something goes
    wrong with the where clause that was built from ``list_of_oids``.

    Parameters
    ----------
    feature_layer_or_table : typing.Any
        The feature layer or table view to read ``OID@`` values from.
    list_of_oids : list
        The OIDs that were used to build the where clause.

    Returns
    -------
    bool
        ``True`` when the set of OIDs read through the cursor equals the set of
        expected OIDs (both compared as strings), ``False`` otherwise.
    """
    with arcpy.da.SearchCursor(feature_layer_or_table, ["OID@"]) as cursor:
        found_oids = {str(row[0]) for row in cursor}

    #: compare as strings so the element type of list_of_oids does not matter
    expected_oids = {str(oid) for oid in list_of_oids}
    if expected_oids == found_oids:
        return True

    log.info("Selected OIDs do not match expected OIDs.")

    return False
Loading
Loading