diff --git a/setup.py b/setup.py index fcabd837..4d3048ea 100644 --- a/setup.py +++ b/setup.py @@ -46,10 +46,8 @@ "pygsheets==2.0.*", "pysftp==0.2.9", "setuptools==80.*", + "paramiko>=3.0,<4.0", "SQLAlchemy>=1.4,<2.1", - # Temporary pin to override pysftp's dependency resolution. - # TODO: Migrate away from pysftp to use paramiko directly. See https://github.com/agrc/palletjack/issues/123 - "paramiko<4.0.0", ], extras_require={ "tests": [ diff --git a/src/palletjack/extract.py b/src/palletjack/extract.py index db4cef1f..1658ac03 100644 --- a/src/palletjack/extract.py +++ b/src/palletjack/extract.py @@ -11,6 +11,7 @@ import re import time import warnings +from contextlib import contextmanager from datetime import datetime, timedelta from io import BytesIO from pathlib import Path @@ -20,7 +21,7 @@ import arcgis import geopandas as gpd import pandas as pd -import pysftp +import paramiko import requests import sqlalchemy import ujson @@ -133,7 +134,7 @@ def __init__(self, out_dir): self._class_logger.debug("Initializing GoogleDriveDownloader") self._class_logger.debug("Output directory: %s", out_dir) self.out_dir = Path(out_dir) - regex_pattern = "(\/|=)([-\w]{25,})" # pylint:disable=anomalous-backslash-in-string + regex_pattern = r"(/|=)([-\w]{25,})" self._class_logger.debug("Regex pattern: %s", regex_pattern) self.regex = re.compile(regex_pattern) @@ -191,7 +192,7 @@ def _get_filename_from_response(response): """ content = response.headers["Content-Disposition"] - all_filenames = re.findall("filename\*?=([^;]+)", content, flags=re.IGNORECASE) # pylint:disable=anomalous-backslash-in-string + all_filenames = re.findall(r"filename\*?=([^;]+)", content, flags=re.IGNORECASE) if all_filenames: #: Remove spurious whitespace and "s return all_filenames[0].strip().strip('"') @@ -389,78 +390,114 @@ def download_attachments_from_dataframe_using_api( class SFTPLoader: """Loads data from an SFTP share into a pandas DataFrame""" - def __init__(self, host, username, password, knownhosts_file, download_dir): + def __init__(self, host, username, password, download_dir): """ Args: host (str): The SFTP host to connect to username (str): SFTP username password (str): SFTP password - knownhosts_file (str): Path to a known_hosts file for pysftp.CnOpts. Can be generated via ssh-keyscan. download_dir (str or Path): Directory to save downloaded files """ self.host = host self.username = username self.password = password - self.knownhosts_file = knownhosts_file self.download_dir = download_dir self._class_logger = logging.getLogger(__name__).getChild(self.__class__.__name__) - def download_sftp_folder_contents(self, sftp_folder="upload"): - """Download all files in sftp_folder to the SFTPLoader's download_dir + @contextmanager + def _sftp_connection(self): + """Context manager for SFTP connections that ensures proper cleanup. + + Yields: + paramiko.SFTPClient: An active SFTP client + """ + transport = None + sftp = None + try: + transport = paramiko.Transport((self.host, 22)) + transport.connect(username=self.username, password=self.password) + sftp = paramiko.SFTPClient.from_transport(transport) + yield sftp + finally: + if sftp: + sftp.close() + if transport: + transport.close() + + def download_sftp_folder_contents(self, remote_directory, overwrite_existing_files=True): + """Download all files in remote_directory to the SFTPLoader's download_dir Args: - sftp_folder (str, optional): Path of remote folder, relative to sftp home directory. Defaults to 'upload'. + remote_directory (str, optional): Absolute path to remote_directory on the SFTP server + overwrite_existing_files (bool, optional): If False, raise an error if a file already exists + in download_dir. Defaults to True. + + Raises: + FileExistsError: If overwrite_existing_files is False and a file already exists + FileNotFoundError: If the remote directory or file is not found + ValueError: If no files were downloaded """ - self._class_logger.info("Downloading files from `%s:%s` to `%s`", self.host, sftp_folder, self.download_dir) - starting_file_count = len(list(self.download_dir.iterdir())) + if not remote_directory.endswith("/"): + remote_directory += "/" + + self._class_logger.info( + "Downloading files from `%s:%s` to `%s`", self.host, remote_directory, self.download_dir + ) self._class_logger.debug("SFTP Username: %s", self.username) - connection_opts = pysftp.CnOpts(knownhosts=self.knownhosts_file) - with pysftp.Connection( - self.host, username=self.username, password=self.password, cnopts=connection_opts - ) as sftp: + + with self._sftp_connection() as sftp: try: - sftp.get_d(sftp_folder, self.download_dir, preserve_mtime=True) + file_list = sftp.listdir(remote_directory) except FileNotFoundError as error: - raise FileNotFoundError(f"Folder `{sftp_folder}` not found on SFTP server") from error - downloaded_file_count = len(list(self.download_dir.iterdir())) - starting_file_count - if not downloaded_file_count: + raise FileNotFoundError(f"Directory `{remote_directory}` not found on SFTP server") from error + + if not file_list: + raise ValueError("No files to download from remote directory") + + downloaded_files = [] + with self._sftp_connection() as sftp: + for file_name in file_list: + outfile_path = self.download_dir / file_name + + # Check if file exists and overwrite is disabled + if not overwrite_existing_files and outfile_path.exists(): + raise FileExistsError(f"File `{outfile_path}` already exists and overwrite_existing_files is False") + + try: + sftp.get(f"{remote_directory}{file_name}", outfile_path.as_posix()) + downloaded_files.append(file_name) + except FileNotFoundError as error: + raise FileNotFoundError(f"File `{remote_directory}{file_name}` not found on SFTP server") from error + + if not downloaded_files: raise ValueError("No files downloaded") - return downloaded_file_count + return len(downloaded_files) - def download_sftp_single_file(self, filename, sftp_folder="upload"): - """Download filename into SFTPLoader's download_dir + def download_sftp_single_file(self, remote_file): + """Download remote_file into SFTPLoader's download_dir Args: - filename (str): Filename to download; used as output filename as well. - sftp_folder (str, optional): Path of remote folder, relative to sftp home directory. Defaults to 'upload'. - + remote_file (str): Remote file to download; see paramiko.SFTPClient.get() for path details Raises: - FileNotFoundError: Will warn if pysftp can't find the file or folder on the sftp server + FileNotFoundError: If paramiko can't find remote_file on the sftp server Returns: Path: Downloaded file's path """ - outfile = Path(self.download_dir, filename) - - self._class_logger.info("Downloading %s from `%s:%s` to `%s`", filename, self.host, sftp_folder, outfile) + self._class_logger.info("Downloading %s from `%s` to `%s`", remote_file, self.host, self.download_dir) self._class_logger.debug("SFTP Username: %s", self.username) - connection_opts = pysftp.CnOpts(knownhosts=self.knownhosts_file) - try: - with pysftp.Connection( - self.host, - username=self.username, - password=self.password, - cnopts=connection_opts, - default_path=sftp_folder, - ) as sftp: - sftp.get(filename, localpath=outfile, preserve_mtime=True) - except FileNotFoundError as error: - raise FileNotFoundError(f"File `{filename}` or folder `{sftp_folder}`` not found on SFTP server") from error - - return outfile + + with self._sftp_connection() as sftp: + outfile_path = self.download_dir / Path(remote_file).name + try: + sftp.get(remote_file, outfile_path.as_posix()) + except FileNotFoundError as error: + raise FileNotFoundError(f"File `{remote_file}` not found on SFTP server") from error + + return outfile_path def read_csv_into_dataframe(self, filename, column_types=None): """Read filename into a dataframe with optional column names and types diff --git a/src/palletjack/load.py b/src/palletjack/load.py index 90304e82..59508ae3 100644 --- a/src/palletjack/load.py +++ b/src/palletjack/load.py @@ -425,18 +425,19 @@ def _upload_gdb(self, zipped_gdb_path: Path) -> Item: raise RuntimeError(f"Error deleting existing gdb item with title {title}") from error try: - gdb_item = utils.retry( - self.gis.content.add, + root_folder = self.gis.content.folders.get() + add_job = utils.retry( + root_folder.add, item_properties={ "type": item_type, "title": title, "snippet": "Temporary gdb upload from palletjack", }, - data=zipped_gdb_path, + file=str(zipped_gdb_path), ) except Exception as error: raise RuntimeError(f"Error uploading {zipped_gdb_path} to AGOL") from error - return gdb_item + return add_job.result() def _cleanup(self, gdb_item: Item | None = None, zipped_gdb_path: Path | None = None) -> None: """Remove the zipped gdb from disk and the gdb item from AGOL @@ -573,7 +574,7 @@ def _create_attachment_action_df(self, attachment_eval_df, attachment_path_field ) #: Overwrite if different names, add if no existing name, do nothing if names are the same - attachment_eval_df["operation"] = np.nan + attachment_eval_df["operation"] = pd.Series([np.nan] * len(attachment_eval_df), dtype="object", index=attachment_eval_df.index) attachment_eval_df.loc[attachment_eval_df["NAME"] != attachment_eval_df["new_filename"], "operation"] = ( "overwrite" ) @@ -747,7 +748,8 @@ def build_attachments_dataframe(input_dataframe, join_column, attachment_column, pd.DataFrame: Dataframe with join key, attachment name, and full attachment paths """ - input_dataframe[attachment_column].replace("", np.nan, inplace=True) #: pandas doesn't see empty strings as NAs + #: pandas doesn't see empty strings as NAs + input_dataframe[attachment_column] = input_dataframe[attachment_column].replace("", np.nan) attachments_dataframe = ( input_dataframe[[join_column, attachment_column]].copy().dropna(subset=[attachment_column]) ) diff --git a/src/palletjack/utils.py b/src/palletjack/utils.py index 8faa7ad3..a12d2c5f 100644 --- a/src/palletjack/utils.py +++ b/src/palletjack/utils.py @@ -483,6 +483,7 @@ def check_fields(cls, live_data_properties, new_dataframe, fields, add_oid): field_checker.check_field_length(fields) # field_checker.check_srs_wgs84() field_checker.check_nullable_ints_shapely() + field_checker.check_for_np_inf() def __init__(self, live_data_properties, new_dataframe): """ @@ -797,6 +798,25 @@ def check_nullable_ints_shapely(self): f"{', '.join(columns_with_nulls)}" ) + def check_for_np_inf(self): + """Raise a warning if any of the new dataframe's fields contain np.inf or -np.inf values. + + Raises: + UserWarning: If any of the new dataframe's fields contain np.inf or -np.inf values. + """ + + columns_with_inf = [] + non_spatial_columns = [col for col in self.new_dataframe.columns if self.new_dataframe[col].dtype != "geometry"] + for column in non_spatial_columns: + if np.isinf(self.new_dataframe[column]).any(): + columns_with_inf.append(column) + + if columns_with_inf: + warnings.warn( + "The following columns have np.inf or -np.inf values, which may cause empty feature services: " + f"{', '.join(columns_with_inf)}" + ) + def get_null_geometries(feature_layer_properties): """Generate placeholder geometries near 0, 0 with type based on provided feature layer properties dictionary. diff --git a/tests/test_extract.py b/tests/test_extract.py index 06095422..98b8859c 100644 --- a/tests/test_extract.py +++ b/tests/test_extract.py @@ -8,6 +8,7 @@ import geopandas as gpd import numpy as np import pandas as pd +import paramiko import pytest import requests import requests_mock @@ -525,51 +526,377 @@ def test_download_attachments_from_dataframe_using_api_handles_multiple_rows(sel class TestSFTPLoader: def test_download_sftp_folder_contents_uses_right_credentials(self, mocker): sftploader_mock = mocker.Mock() - sftploader_mock.knownhosts_file = "knownhosts_file" sftploader_mock.host = "sftp_host" sftploader_mock.username = "username" sftploader_mock.password = "password" - download_dir_mock = mocker.Mock() - download_dir_mock.iterdir.side_effect = [[], ["file_a", "file_b"]] - sftploader_mock.download_dir = download_dir_mock + sftploader_mock.download_dir = Path("local/dir") - connection_mock = mocker.MagicMock() - context_manager_mock = mocker.MagicMock() - context_manager_mock.return_value.__enter__.return_value = connection_mock - mocker.patch("pysftp.Connection", new=context_manager_mock) + # Mock the context manager helper method + sftp_instance = mocker.MagicMock() + sftp_instance.listdir.return_value = ["file_a", "file_b"] + sftploader_mock._sftp_connection = mocker.MagicMock(return_value=mocker.MagicMock(__enter__=mocker.Mock(return_value=sftp_instance), __exit__=mocker.Mock(return_value=False))) - cnopts_mock = mocker.Mock() - cnopts_mock.side_effect = lambda knownhosts: knownhosts - mocker.patch("pysftp.CnOpts", new=cnopts_mock) + extract.SFTPLoader.download_sftp_folder_contents(sftploader_mock, "remote/dir") - extract.SFTPLoader.download_sftp_folder_contents(sftploader_mock) + # Verify the connection method was called (connection is abstracted behind the helper) + assert sftploader_mock._sftp_connection.call_count >= 1 - context_manager_mock.assert_called_with( - "sftp_host", username="username", password="password", cnopts="knownhosts_file" - ) + def test_download_sftp_folder_contents_creates_two_separate_connections_with_proper_calls(self, mocker): + sftploader_mock = mocker.Mock() + sftploader_mock.host = "sftp_host" + sftploader_mock.username = "username" + sftploader_mock.password = "password" + + sftploader_mock.download_dir = Path("local/dir") + + transport_mock = mocker.patch("palletjack.extract.paramiko.Transport", autospec=True) + sftp_client_mock = mocker.patch("palletjack.extract.paramiko.SFTPClient", autospec=True) + + sftp_instance_1 = mocker.MagicMock(name="sftp1") + sftp_instance_1.listdir.return_value = [Path("file_a"), Path("file_b")] + sftp_instance_2 = mocker.MagicMock(name="sftp2") + + # Mock the context manager helper method to return different instances + context_manager_1 = mocker.MagicMock(__enter__=mocker.Mock(return_value=sftp_instance_1), __exit__=mocker.Mock(return_value=False)) + context_manager_2 = mocker.MagicMock(__enter__=mocker.Mock(return_value=sftp_instance_2), __exit__=mocker.Mock(return_value=False)) + sftploader_mock._sftp_connection = mocker.Mock(side_effect=[context_manager_1, context_manager_2]) + + extract.SFTPLoader.download_sftp_folder_contents(sftploader_mock, "remote/dir/") + + assert sftploader_mock._sftp_connection.call_count == 2 + sftp_instance_1.listdir.assert_called_once_with("remote/dir/") + sftp_instance_2.get.assert_any_call("remote/dir/file_a", Path("local/dir/file_a").as_posix()) + sftp_instance_2.get.assert_any_call("remote/dir/file_b", Path("local/dir/file_b").as_posix()) + + def test_download_sftp_folder_contents_adds_trailing_slash_if_missing(self, mocker): + sftploader_mock = mocker.Mock() + sftploader_mock.host = "sftp_host" + sftploader_mock.username = "username" + sftploader_mock.password = "password" + sftploader_mock.download_dir = Path("local/dir") + + transport_mock = mocker.patch("palletjack.extract.paramiko.Transport", autospec=True) + sftp_client_mock = mocker.patch("palletjack.extract.paramiko.SFTPClient", autospec=True) + + sftp_instance = mocker.MagicMock() + sftp_instance.listdir.return_value = ["file_a", "file_b"] + sftploader_mock._sftp_connection = mocker.MagicMock(return_value=mocker.MagicMock(__enter__=mocker.Mock(return_value=sftp_instance), __exit__=mocker.Mock(return_value=False))) + + extract.SFTPLoader.download_sftp_folder_contents(sftploader_mock, "remote/dir") + + sftp_instance.listdir.assert_called_once_with("remote/dir/") + + def test_download_sftp_folder_contents_doesnt_add_trailing_slash_if_present(self, mocker): + sftploader_mock = mocker.Mock() + sftploader_mock.host = "sftp_host" + sftploader_mock.username = "username" + sftploader_mock.password = "password" + sftploader_mock.download_dir = Path("local/dir") + + transport_mock = mocker.patch("palletjack.extract.paramiko.Transport", autospec=True) + sftp_client_mock = mocker.patch("palletjack.extract.paramiko.SFTPClient", autospec=True) + + sftp_instance = mocker.MagicMock() + sftp_instance.listdir.return_value = ["file_a", "file_b"] + sftploader_mock._sftp_connection = mocker.MagicMock(return_value=mocker.MagicMock(__enter__=mocker.Mock(return_value=sftp_instance), __exit__=mocker.Mock(return_value=False))) + + extract.SFTPLoader.download_sftp_folder_contents(sftploader_mock, "remote/dir/") + + sftp_instance.listdir.assert_called_once_with("remote/dir/") + + def test_download_sftp_folder_contents_raises_error_on_no_files_downloaded(self, mocker): + sftploader_mock = mocker.Mock() + sftploader_mock.download_dir = Path("local/dir") + + mocker.patch("palletjack.extract.paramiko.Transport", autospec=True) + mocker.patch("palletjack.extract.paramiko.SFTPClient", autospec=True) + + sftp_instance = mocker.MagicMock() + sftp_instance.listdir.return_value = [] + sftploader_mock._sftp_connection = mocker.MagicMock(return_value=mocker.MagicMock(__enter__=mocker.Mock(return_value=sftp_instance), __exit__=mocker.Mock(return_value=False))) + + with pytest.raises(ValueError, match="No files to download from remote directory"): + extract.SFTPLoader.download_sftp_folder_contents(sftploader_mock, "remote/dir/") + + def test_download_sftp_folder_contents_raises_on_remote_dir_not_found(self, mocker): + sftploader_mock = mocker.Mock() + sftploader_mock.host = "sftp_host" + sftploader_mock.username = "username" + sftploader_mock.password = "password" + + sftploader_mock.download_dir = Path("local/dir") + + transport_mock = mocker.patch("palletjack.extract.paramiko.Transport", autospec=True) + sftp_client_mock = mocker.patch("palletjack.extract.paramiko.SFTPClient", autospec=True) + + sftp_instance = mocker.MagicMock() + sftp_instance.listdir.side_effect = FileNotFoundError("No such directory") + sftploader_mock._sftp_connection = mocker.MagicMock(return_value=mocker.MagicMock(__enter__=mocker.Mock(return_value=sftp_instance), __exit__=mocker.Mock(return_value=False))) + + with pytest.raises(FileNotFoundError, match="Directory `remote/dir/` not found on SFTP server"): + extract.SFTPLoader.download_sftp_folder_contents(sftploader_mock, "remote/dir/") + + def test_download_sftp_folder_contents_raises_on_file_not_found_during_get(self, mocker): + sftploader_mock = mocker.Mock() + sftploader_mock.host = "sftp_host" + sftploader_mock.username = "username" + sftploader_mock.password = "password" + sftploader_mock.download_dir = Path("local/dir") + + transport_mock = mocker.patch("palletjack.extract.paramiko.Transport", autospec=True) + sftp_client_mock = mocker.patch("palletjack.extract.paramiko.SFTPClient", autospec=True) + + sftp_instance_1 = mocker.MagicMock() + sftp_instance_1.listdir.return_value = [Path("file_a"), Path("file_b")] + sftp_instance_2 = mocker.MagicMock() + sftp_instance_2.get.side_effect = FileNotFoundError("No such file") + + context_manager_1 = mocker.MagicMock(__enter__=mocker.Mock(return_value=sftp_instance_1), __exit__=mocker.Mock(return_value=False)) + context_manager_2 = mocker.MagicMock(__enter__=mocker.Mock(return_value=sftp_instance_2), __exit__=mocker.Mock(return_value=False)) + sftploader_mock._sftp_connection = mocker.Mock(side_effect=[context_manager_1, context_manager_2]) + + with pytest.raises(FileNotFoundError, match="File `remote/dir/file_a` not found on SFTP server"): + extract.SFTPLoader.download_sftp_folder_contents(sftploader_mock, "remote/dir/") def test_download_sftp_single_file_uses_right_credentials(self, mocker): sftploader_mock = mocker.Mock() - sftploader_mock.knownhosts_file = "knownhosts_file" sftploader_mock.host = "sftp_host" sftploader_mock.username = "username" sftploader_mock.password = "password" - sftploader_mock.download_dir = "download_dir" + sftploader_mock.download_dir = Path("local/dir") - connection_mock = mocker.MagicMock() - context_manager_mock = mocker.MagicMock() - context_manager_mock.return_value.__enter__.return_value = connection_mock - mocker.patch("pysftp.Connection", new=context_manager_mock) + sftp_instance = mocker.MagicMock() + sftploader_mock._sftp_connection = mocker.MagicMock(return_value=mocker.MagicMock(__enter__=mocker.Mock(return_value=sftp_instance), __exit__=mocker.Mock(return_value=False))) - cnopts_mock = mocker.Mock() - cnopts_mock.side_effect = lambda knownhosts: knownhosts - mocker.patch("pysftp.CnOpts", new=cnopts_mock) + extract.SFTPLoader.download_sftp_single_file(sftploader_mock, "remote/file.txt") - extract.SFTPLoader.download_sftp_single_file(sftploader_mock, "filename", "upload") + # Verify the connection method was called + sftploader_mock._sftp_connection.assert_called_once() - context_manager_mock.assert_called_with( - "sftp_host", username="username", password="password", cnopts="knownhosts_file", default_path="upload" - ) + def test_download_sftp_single_file_calls_get_with_proper_args(self, mocker): + sftploader_mock = mocker.Mock() + sftploader_mock.host = "sftp_host" + sftploader_mock.username = "username" + sftploader_mock.password = "password" + sftploader_mock.download_dir = Path("local/dir") + + transport_mock = mocker.patch("palletjack.extract.paramiko.Transport", autospec=True) + sftp_client_mock = mocker.patch("palletjack.extract.paramiko.SFTPClient", autospec=True) + + sftp_instance = mocker.MagicMock() + sftploader_mock._sftp_connection = mocker.MagicMock(return_value=mocker.MagicMock(__enter__=mocker.Mock(return_value=sftp_instance), __exit__=mocker.Mock(return_value=False))) + + extract.SFTPLoader.download_sftp_single_file(sftploader_mock, "remote/file.txt") + + sftp_instance.get.assert_called_once_with("remote/file.txt", Path("local/dir/file.txt").as_posix()) + + def test_download_sftp_single_file_raises_on_file_not_found_during_get(self, mocker): + sftploader_mock = mocker.Mock() + sftploader_mock.host = "sftp_host" + sftploader_mock.username = "username" + sftploader_mock.password = "password" + sftploader_mock.download_dir = Path("local/dir") + + transport_mock = mocker.patch("palletjack.extract.paramiko.Transport", autospec=True) + sftp_client_mock = mocker.patch("palletjack.extract.paramiko.SFTPClient", autospec=True) + + sftp_instance = mocker.MagicMock() + sftp_instance.get.side_effect = FileNotFoundError("No such file") + sftploader_mock._sftp_connection = mocker.MagicMock(return_value=mocker.MagicMock(__enter__=mocker.Mock(return_value=sftp_instance), __exit__=mocker.Mock(return_value=False))) + + with pytest.raises(FileNotFoundError, match="File `remote/file.txt` not found on SFTP server"): + extract.SFTPLoader.download_sftp_single_file(sftploader_mock, "remote/file.txt") + + def test_download_sftp_folder_contents_raises_on_file_exists_when_overwrite_disabled(self, mocker): + """Test that FileExistsError is raised when overwrite_existing_files=False and file exists""" + loader = extract.SFTPLoader("test_host", "test_user", "test_pass", Path("/tmp/test")) + + # Mock Path to return a path that exists + mock_path = mocker.MagicMock(spec=Path) + mock_path.exists.return_value = True + mock_path.as_posix.return_value = "/tmp/test/existing_file.txt" + + # Patch Path.__truediv__ to return our mock path + mocker.patch.object(Path, "__truediv__", return_value=mock_path) + + sftp_instance = mocker.MagicMock() + sftp_instance.listdir.return_value = ["existing_file.txt"] + + transport_mock = mocker.MagicMock() + mocker.patch("palletjack.extract.paramiko.Transport", return_value=transport_mock) + sftp_class_mock = mocker.patch("palletjack.extract.paramiko.SFTPClient") + sftp_class_mock.from_transport.return_value = sftp_instance + + with pytest.raises(FileExistsError, match="already exists and overwrite_existing_files is False"): + loader.download_sftp_folder_contents("/remote/dir/", overwrite_existing_files=False) + + def test_download_sftp_folder_contents_overwrites_when_overwrite_enabled(self, mocker): + """Test that files are overwritten when overwrite_existing_files=True (default)""" + loader = extract.SFTPLoader("test_host", "test_user", "test_pass", Path("/tmp/test")) + + sftp_instance = mocker.MagicMock() + sftp_instance.listdir.return_value = ["file.txt"] + + transport_mock = mocker.MagicMock() + mocker.patch("palletjack.extract.paramiko.Transport", return_value=transport_mock) + sftp_class_mock = mocker.patch("palletjack.extract.paramiko.SFTPClient") + sftp_class_mock.from_transport.return_value = sftp_instance + + # Should not raise FileExistsError even if file exists + result = loader.download_sftp_folder_contents("/remote/dir/", overwrite_existing_files=True) + assert result == 1 + + def test_sftp_connection_context_manager_success(self, mocker): + """Test successful SFTP connection with proper cleanup""" + loader = extract.SFTPLoader("test_host", "test_user", "test_pass", Path("/tmp")) + + transport_mock = mocker.MagicMock() + sftp_mock = mocker.MagicMock() + + transport_class_mock = mocker.patch("palletjack.extract.paramiko.Transport", return_value=transport_mock) + sftp_class_mock = mocker.patch("palletjack.extract.paramiko.SFTPClient") + sftp_class_mock.from_transport.return_value = sftp_mock + + with loader._sftp_connection() as sftp: + assert sftp == sftp_mock + + # Verify connection was established + transport_class_mock.assert_called_once_with(("test_host", 22)) + transport_mock.connect.assert_called_once_with(username="test_user", password="test_pass") + sftp_class_mock.from_transport.assert_called_once_with(transport_mock) + + # Verify cleanup happened + sftp_mock.close.assert_called_once() + transport_mock.close.assert_called_once() + + def test_sftp_connection_context_manager_connection_failure(self, mocker): + """Test that cleanup happens when transport.connect fails""" + loader = extract.SFTPLoader("test_host", "test_user", "test_pass", Path("/tmp")) + + transport_mock = mocker.MagicMock() + transport_mock.connect.side_effect = Exception("Connection failed") + + mocker.patch("palletjack.extract.paramiko.Transport", return_value=transport_mock) + + with pytest.raises(Exception, match="Connection failed"): + with loader._sftp_connection() as sftp: + pass + + # Verify cleanup happened even though connection failed + # sftp was never created, so it shouldn't be closed + transport_mock.close.assert_called_once() + + def test_sftp_connection_context_manager_auth_failure(self, mocker): + """Test that cleanup happens when authentication fails""" + loader = extract.SFTPLoader("test_host", "bad_user", "bad_pass", Path("/tmp")) + + transport_mock = mocker.MagicMock() + transport_mock.connect.side_effect = paramiko.AuthenticationException("Authentication failed") + + mocker.patch("palletjack.extract.paramiko.Transport", return_value=transport_mock) + + with pytest.raises(paramiko.AuthenticationException, match="Authentication failed"): + with loader._sftp_connection() as sftp: + pass + + # Verify cleanup happened + transport_mock.close.assert_called_once() + + def test_sftp_connection_context_manager_sftp_client_failure(self, mocker): + """Test that cleanup happens when SFTPClient.from_transport fails""" + loader = extract.SFTPLoader("test_host", "test_user", "test_pass", Path("/tmp")) + + transport_mock = mocker.MagicMock() + sftp_class_mock = mocker.patch("palletjack.extract.paramiko.SFTPClient") + sftp_class_mock.from_transport.side_effect = Exception("SFTP client creation failed") + + mocker.patch("palletjack.extract.paramiko.Transport", return_value=transport_mock) + + with pytest.raises(Exception, match="SFTP client creation failed"): + with loader._sftp_connection() as sftp: + pass + + # Verify cleanup happened - transport should be closed even though sftp wasn't created + transport_mock.close.assert_called_once() + + def test_sftp_connection_context_manager_exception_during_operation(self, mocker): + """Test that cleanup happens when exception occurs during SFTP operations""" + loader = extract.SFTPLoader("test_host", "test_user", "test_pass", Path("/tmp")) + + transport_mock = mocker.MagicMock() + sftp_mock = mocker.MagicMock() + + mocker.patch("palletjack.extract.paramiko.Transport", return_value=transport_mock) + sftp_class_mock = mocker.patch("palletjack.extract.paramiko.SFTPClient") + sftp_class_mock.from_transport.return_value = sftp_mock + + with pytest.raises(RuntimeError, match="Operation failed"): + with loader._sftp_connection() as sftp: + # Simulate an exception during file operations + raise RuntimeError("Operation failed") + + # Verify cleanup happened despite the exception + sftp_mock.close.assert_called_once() + transport_mock.close.assert_called_once() + + def test_download_sftp_folder_contents_with_real_context_manager(self, mocker): + """Test download_sftp_folder_contents using real context manager (not mocked)""" + loader = extract.SFTPLoader("test_host", "test_user", "test_pass", Path("/tmp/test")) + + test_files = ["file1.txt", "file2.txt"] + + transport_mock = mocker.MagicMock() + sftp_mock_1 = mocker.MagicMock() + sftp_mock_1.listdir.return_value = test_files + sftp_mock_2 = mocker.MagicMock() + + mocker.patch("palletjack.extract.paramiko.Transport", return_value=transport_mock) + sftp_class_mock = mocker.patch("palletjack.extract.paramiko.SFTPClient") + sftp_class_mock.from_transport.side_effect = [sftp_mock_1, sftp_mock_2] + + result = loader.download_sftp_folder_contents("/remote/dir") + + # Verify both connections were made + assert transport_mock.connect.call_count == 2 + assert sftp_class_mock.from_transport.call_count == 2 + + # Verify both connections were cleaned up + assert sftp_mock_1.close.call_count == 1 + assert sftp_mock_2.close.call_count == 1 + assert transport_mock.close.call_count == 2 + + # Verify operations + sftp_mock_1.listdir.assert_called_once_with("/remote/dir/") + sftp_mock_2.get.assert_any_call("/remote/dir/file1.txt", (Path("/tmp/test") / "file1.txt").as_posix()) + sftp_mock_2.get.assert_any_call("/remote/dir/file2.txt", (Path("/tmp/test") / "file2.txt").as_posix()) + + assert result == 2 + + def test_download_sftp_single_file_with_real_context_manager(self, mocker): + """Test download_sftp_single_file using real context manager (not mocked)""" + loader = extract.SFTPLoader("test_host", "test_user", "test_pass", Path("/tmp/test")) + + transport_mock = mocker.MagicMock() + sftp_mock = mocker.MagicMock() + + mocker.patch("palletjack.extract.paramiko.Transport", return_value=transport_mock) + sftp_class_mock = mocker.patch("palletjack.extract.paramiko.SFTPClient") + sftp_class_mock.from_transport.return_value = sftp_mock + + result = loader.download_sftp_single_file("/remote/file.txt") + + # Verify connection was made + transport_mock.connect.assert_called_once_with(username="test_user", password="test_pass") + sftp_class_mock.from_transport.assert_called_once_with(transport_mock) + + # Verify cleanup happened + sftp_mock.close.assert_called_once() + transport_mock.close.assert_called_once() + + # Verify operation - use as_posix() for cross-platform compatibility + sftp_mock.get.assert_called_once_with("/remote/file.txt", (Path("/tmp/test") / "file.txt").as_posix()) + + assert result == Path("/tmp/test/file.txt") def test_read_csv_into_dataframe_with_column_names(self, mocker): pd_mock = mocker.Mock() diff --git a/tests/test_load.py b/tests/test_load.py index 86c051b7..f3e014b4 100644 --- a/tests/test_load.py +++ b/tests/test_load.py @@ -1363,17 +1363,18 @@ def test__upload_gdb_calls_add_default_name(self, mocker): "title": "palletjack Temporary gdb upload", "snippet": "Temporary gdb upload from palletjack", }, - "data": gdb_path, + "file": str(gdb_path), } mocker.patch("palletjack.load.arcgis") gis_mock = mocker.Mock() gis_mock.content.search.return_value = [] + root_folder_mock = gis_mock.content.folders.get.return_value updater = load.ServiceUpdater(gis_mock, "abc", service_type="table") foo = updater._upload_gdb(gdb_path) - gis_mock.content.add.assert_called_once_with(**expected_call_kwargs) + root_folder_mock.add.assert_called_once_with(**expected_call_kwargs) def test__upload_gdb_calls_add_custom_name(self, mocker): gdb_path = Path("/foo/bar/upload.gdb") @@ -1383,44 +1384,46 @@ def test__upload_gdb_calls_add_custom_name(self, mocker): "title": "foo Temporary gdb upload", "snippet": "Temporary gdb upload from palletjack", }, - "data": gdb_path, + "file": str(gdb_path), } mocker.patch("palletjack.load.arcgis") gis_mock = mocker.Mock() gis_mock.content.search.return_value = [] + root_folder_mock = gis_mock.content.folders.get.return_value updater = load.ServiceUpdater(gis_mock, "abc", gdb_item_prefix="foo") foo = updater._upload_gdb(gdb_path) - gis_mock.content.add.assert_called_once_with(**expected_call_kwargs) + root_folder_mock.add.assert_called_once_with(**expected_call_kwargs) def test__upload_gdb_raises_on_agol_error(self, mocker): gdb_path = Path("/foo/bar/upload.gdb") updater_mock = mocker.Mock() updater_mock.gis.content.search.return_value = [] - updater_mock.gis.content.add.side_effect = [Exception("foo")] * 4 + root_folder_mock = updater_mock.gis.content.folders.get.return_value + root_folder_mock.add.side_effect = [Exception("foo")] * 4 mocker.patch("palletjack.utils.sleep") with pytest.raises(RuntimeError) as exc_info: load.ServiceUpdater._upload_gdb(updater_mock, gdb_path) assert exc_info.value.args[0] == f"Error uploading {gdb_path} to AGOL" - assert updater_mock.gis.content.add.call_count == 4 #: retries + assert root_folder_mock.add.call_count == 4 #: retries def test__upload_gdb_deletes_existing_item(self, mocker): gdb_path = Path("/foo/bar/upload.gdb") updater_mock = mocker.Mock() delete_function = mocker.Mock() updater_mock.gis.content.search.return_value = [mocker.Mock(id="1234", delete=delete_function)] - updater_mock.gis.content.add.return_value = mocker.Mock() + root_folder_mock = updater_mock.gis.content.folders.get.return_value updater_mock.gdb_item_prefix = "palletjack" updater_mock.gis.users.me.username = "test_user" load.ServiceUpdater._upload_gdb(updater_mock, gdb_path) delete_function.assert_called_once() - updater_mock.gis.content.add.assert_called_once() + root_folder_mock.add.assert_called_once() def test__upload_gdb_handles_missing_gdb_after_finding_gdb_and_continues(self, mocker, caplog): caplog.set_level(logging.DEBUG) @@ -1428,7 +1431,7 @@ def test__upload_gdb_handles_missing_gdb_after_finding_gdb_and_continues(self, m updater_mock = mocker.Mock() delete_function = mocker.Mock() updater_mock.gis.content.search.return_value = [mocker.Mock(id="1234", delete=delete_function)] - updater_mock.gis.content.add.return_value = mocker.Mock() + root_folder_mock = updater_mock.gis.content.folders.get.return_value delete_function.side_effect = [Exception("Item does not exist or is inaccessible.")] * 4 mocker.patch("palletjack.utils.sleep") @@ -1441,14 +1444,13 @@ def test__upload_gdb_handles_missing_gdb_after_finding_gdb_and_continues(self, m == "Can't find or delete existing gdb, attempting to continue" ) assert delete_function.call_count == 4 #: retries - updater_mock.gis.content.add.assert_called_once() + root_folder_mock.add.assert_called_once() def test__upload_gdb_raises_delete_error(self, mocker): gdb_path = Path("/foo/bar/upload.gdb") updater_mock = mocker.Mock() delete_function = mocker.Mock() updater_mock.gis.content.search.return_value = [mocker.Mock(id="1234", delete=delete_function)] - updater_mock.gis.content.add.return_value = mocker.Mock() delete_function.side_effect = [RuntimeError("foo")] * 4 mocker.patch("palletjack.utils.sleep") updater_mock.gdb_item_prefix = "palletjack" diff --git a/tests/test_utils.py b/tests/test_utils.py index d72d0864..c144afb5 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -4,6 +4,7 @@ from collections import namedtuple from pathlib import Path +import geopandas as gpd import numpy as np import pandas as pd import pyogrio @@ -1598,6 +1599,130 @@ def test_check_srs_wgs84_handles_srs_with_key_of_0(self, mocker): palletjack.utils.FieldChecker.check_srs_wgs84(checker_mock) +class TestNpInfChecker: + def test_check_for_np_inf_warns_on_inf_in_dataframe(self, mocker): + new_df = pd.DataFrame( + { + "a": [1, 2, np.inf], + "b": [4.1, 5.1, 6.1], + } + ) + properties_mock = mocker.Mock() + properties_mock.fields = [{"name": "foo"}, {"name": "bar"}] + + checker = palletjack.utils.FieldChecker(properties_mock, new_df) + + with pytest.warns( + UserWarning, + match=re.escape( + "The following columns have np.inf or -np.inf values, which may cause empty feature services: a" + ), + ): + checker.check_for_np_inf() + + def test_check_for_np_inf_doesnt_raise_on_normal_dataframe(self, mocker): + new_df = pd.DataFrame( + { + "a": [1, 2, 3], + "b": [4.1, 5.1, 6.1], + } + ) + properties_mock = mocker.Mock() + properties_mock.fields = [{"name": "foo"}, {"name": "bar"}] + + checker = palletjack.utils.FieldChecker(properties_mock, new_df) + + #: Should not raise + checker.check_for_np_inf() + + def test_check_for_np_inf_warns_on_neg_inf_in_dataframe(self, mocker): + new_df = pd.DataFrame( + { + "a": [1, -np.inf, 3], + "b": [4.1, 5.1, 6.1], + } + ) + + properties_mock = mocker.Mock() + properties_mock.fields = [{"name": "foo"}, {"name": "bar"}] + + checker = palletjack.utils.FieldChecker(properties_mock, new_df) + + with pytest.warns( + UserWarning, + match=re.escape( + "The following columns have np.inf or -np.inf values, which may cause empty feature services: a" + ), + ): + checker.check_for_np_inf() + + def test_check_for_np_inf_doesnt_warn_on_nan(self, mocker): + new_df = pd.DataFrame( + { + "a": [1, 2, np.nan], + "b": [4.1, 5.1, 6.1], + } + ) + + properties_mock = mocker.Mock() + properties_mock.fields = [{"name": "foo"}, {"name": "bar"}] + + checker = palletjack.utils.FieldChecker(properties_mock, new_df) + + #: Should not raise + checker.check_for_np_inf() + + def test_check_for_np_inf_handles_spatially_enabled_dataframe(self, mocker): + new_sedf = pd.DataFrame.spatial.from_xy( + pd.DataFrame( + { + "a": [1, 2, np.inf], + "b": [4.1, 5.1, 6.1], + "x": [0, 0, 0], + "y": [0, 0, 0], + } + ), + x_column="x", + y_column="y", + ) + + properties_mock = mocker.Mock() + properties_mock.fields = [{"name": "foo"}, {"name": "bar"}] + + checker = palletjack.utils.FieldChecker(properties_mock, new_sedf) + + with pytest.warns( + UserWarning, + match=re.escape( + "The following columns have np.inf or -np.inf values, which may cause empty feature services: a" + ), + ): + checker.check_for_np_inf() + + def test_check_for_np_inf_handles_geodataframe(self, mocker): + new_gdf = gpd.GeoDataFrame( + { + "a": [1, 2, np.inf], + "b": [4.1, 5.1, 6.1], + "geometry": gpd.points_from_xy([0, 0, 0], [0, 0, 0]), + }, + crs="EPSG:4326", + ) + + properties_mock = mocker.Mock() + properties_mock.fields = [{"name": "foo"}, {"name": "bar"}] + + checker = palletjack.utils.FieldChecker(properties_mock, new_gdf) + + with pytest.warns( + UserWarning, + match=re.escape( + "The following columns have np.inf or -np.inf values, which may cause empty feature services: a" + ), + ): + checker.check_for_np_inf() + + class TestNullGeometryGenerators: def test_get_null_geometries_point(self, mocker): properties_mock = mocker.Mock()