diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst index c51989ca54730..843bf1af493ae 100644 --- a/doc/source/reference/extensions.rst +++ b/doc/source/reference/extensions.rst @@ -65,6 +65,7 @@ objects. api.extensions.ExtensionArray.repeat api.extensions.ExtensionArray.searchsorted api.extensions.ExtensionArray.shift + api.extensions.ExtensionArray.sort api.extensions.ExtensionArray.take api.extensions.ExtensionArray.unique api.extensions.ExtensionArray.dtype diff --git a/doc/source/whatsnew/v3.1.0.rst b/doc/source/whatsnew/v3.1.0.rst index 5a6b1546e4b2b..7205636000dae 100644 --- a/doc/source/whatsnew/v3.1.0.rst +++ b/doc/source/whatsnew/v3.1.0.rst @@ -34,6 +34,7 @@ Other enhancements - :meth:`ExtensionArray.map` now calls :meth:`ExtensionArray._cast_pointwise_result` to retain the dtype backend, e.g. Arrow-backed arrays now preserve their Arrow dtype through ``map`` (:issue:`57189`, :issue:`62164`) - :func:`read_csv` now supports ``dtype="complex64"`` and ``dtype="complex128"`` with the C engine, enabling round-tripping of complex-number columns written by :meth:`DataFrame.to_csv` (:issue:`9379`) - Added :meth:`ExtensionArray.count` (:issue:`64450`) +- Added :meth:`ExtensionArray.sort` for in-place sorting of :class:`ExtensionArray` (:issue:`64977`) - Added :meth:`Index.replace` method to support value replacement functionality similar to :meth:`Series.replace` (:issue:`19495`) - Display formatting for float sequences in DataFrame cells now respects the ``display.precision`` option (:issue:`60503`). 
- Improved the precision of float parsing in :func:`read_csv` (:issue:`64395`) diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 182d0b41df7a2..3a16817854aa4 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -62,6 +62,7 @@ ScalarIndexer, SequenceIndexer, Shape, + SortKind, TakeIndexer, npt, ) @@ -231,6 +232,18 @@ def unique(self) -> Self: new_data = unique(self._ndarray) return self._from_backing_data(new_data) + def sort( + self, + *, + ascending: bool = True, + kind: SortKind = "quicksort", + na_position: str = "last", + ) -> None: + sort_indices = self.argsort( + ascending=ascending, kind=kind, na_position=na_position + ) + self._ndarray[:] = self._ndarray[sort_indices] + @classmethod def _concat_same_type( cls, diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 4703a347ba0c9..fe528417bae70 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1391,6 +1391,19 @@ def argsort( np_result = result.to_numpy() return np_result.astype(np.intp, copy=False) + def sort( + self, + *, + ascending: bool = True, + kind: SortKind = "quicksort", + na_position: str = "last", + ) -> None: + sort_indices = self.argsort( + ascending=ascending, kind=kind, na_position=na_position + ) + sorted_array = self.take(sort_indices) + self._pa_array = sorted_array._pa_array + def _argmin_max(self, skipna: bool, method: str) -> int: if self._pa_array.length() in (0, self._pa_array.null_count) or ( self._hasna and not skipna diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 838527b8d6378..d3dd174bcec6f 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1069,6 +1069,48 @@ def argsort( mask=np.asarray(self.isna()), ) + def sort( + self, + *, + ascending: bool = True, + kind: SortKind = "quicksort", + na_position: str = "last", + ) -> None: + """ + Sort the array in-place. 
+ + Parameters + ---------- + ascending : bool, default True + Whether to sort in ascending order. + kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort' + Sorting algorithm. + na_position : {'first', 'last'}, default 'last' + If 'first', put NaN values at the beginning. + If 'last', put NaN values at the end. + + Returns + ------- + None + + See Also + -------- + ExtensionArray.argsort : Return the indices that would sort this array. + + Examples + -------- + >>> arr = pd.array([3, 1, 2, 5, 4]) + >>> arr.sort() + >>> arr + <IntegerArray> + [1, 2, 3, 4, 5] + Length: 5, dtype: Int64 + """ + sort_indices = self.argsort( + ascending=ascending, kind=kind, na_position=na_position + ) + self[:] = self.take(sort_indices) + def argmin(self, skipna: bool = True) -> int: """ Return the index of minimum value. diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index cb508a6f3d429..a6a17c8f5c56a 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -127,6 +127,7 @@ def tocsc(self, /) -> csc_array | csc_matrix: ... 
Scalar, ScalarIndexer, SequenceIndexer, + SortKind, npt, ) @@ -614,6 +615,15 @@ def __setitem__(self, key, value) -> None: msg = "SparseArray does not support item assignment via setitem" raise TypeError(msg) + def sort( + self, + *, + ascending: bool = True, + kind: SortKind = "quicksort", + na_position: str = "last", + ) -> None: + raise NotImplementedError("SparseArray does not support in-place sort") + @classmethod def _from_sequence( cls, scalars, *, dtype: Dtype | None = None, copy: bool = False diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 3dd98f1f3a123..c2ec72a7f471a 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -650,3 +650,12 @@ def test_numpy_random_permute(dtype, box): result = rng.permutation(arr) assert isinstance(result, np.ndarray) assert sorted(result.tolist()) == ["a", "bb", "ccc"] + + +def test_sort_unique_result(dtype): + # https://github.com/pandas-dev/pandas/issues/64977 + arr = pd.array(["Bob", "Alice", "Bob"], dtype=dtype) + unique_names = arr.unique() + unique_names.sort() + expected = pd.array(["Alice", "Bob"], dtype=dtype) + tm.assert_extension_array_equal(unique_names, expected) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 46a2f048fed3f..caeac087a4aa1 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -292,6 +292,32 @@ def test_sort_values_frame(self, data_for_sorting, ascending): ) tm.assert_frame_equal(result, expected) + def test_sort_inplace(self, data_for_sorting): + arr = data_for_sorting.copy() + result = arr.sort() + assert result is None + expected = data_for_sorting.take([2, 0, 1]) + tm.assert_extension_array_equal(arr, expected) + + def test_sort_inplace_descending(self, data_for_sorting): + arr = data_for_sorting.copy() + arr.sort(ascending=False) + if pd.Series(data_for_sorting).nunique() == 2: + 
expected = data_for_sorting.take([0, 1, 2]) + else: + expected = data_for_sorting.take([1, 0, 2]) + tm.assert_extension_array_equal(arr, expected) + + @pytest.mark.parametrize("na_position", ["first", "last"]) + def test_sort_inplace_na_position(self, data_missing_for_sorting, na_position): + arr = data_missing_for_sorting.copy() + arr.sort(na_position=na_position) + if na_position == "last": + expected = data_missing_for_sorting.take([2, 0, 1]) + else: + expected = data_missing_for_sorting.take([1, 2, 0]) + tm.assert_extension_array_equal(arr, expected) + @pytest.mark.parametrize("ascending", [True, False]) def test_rank(self, data_for_sorting, ascending): ser = pd.Series(data_for_sorting) diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index 9adb72877b333..0b48cbb402d14 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -143,6 +143,9 @@ def __setitem__(self, key, value) -> None: if isinstance(key, numbers.Integral): self.data[key] = value else: + if isinstance(key, slice): + key = range(*key.indices(len(self))) + if not isinstance(value, (type(self), abc.Sequence)): # broadcast value value = itertools.cycle([value]) diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index aa4372d80d85d..5eaa2ca44f341 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -396,12 +396,6 @@ def test_setitem_integer_with_missing_raises(self, data, idx, box_in_series): def test_setitem_scalar_key_sequence_raise(self, data): super().test_setitem_scalar_key_sequence_raise(data) - def test_setitem_with_expansion_dataframe_column(self, data, full_indexer, request): - if "full_slice" in request.node.name: - mark = pytest.mark.xfail(reason="slice is not iterable") - request.applymarker(mark) - super().test_setitem_with_expansion_dataframe_column(data, full_indexer) - @pytest.mark.xfail(reason="slice is not 
iterable") def test_setitem_frame_2d_values(self, data): super().test_setitem_frame_2d_values(data) @@ -413,10 +407,12 @@ def test_setitem_frame_2d_values(self, data): def test_setitem_mask_broadcast(self, data, setter): super().test_setitem_mask_broadcast(data, setter) - @pytest.mark.xfail( - reason="cannot set using a slice indexer with a different length" - ) - def test_setitem_slice(self, data, box_in_series): + def test_setitem_slice(self, data, box_in_series, request): + if box_in_series: + mark = pytest.mark.xfail( + reason="cannot set using a slice indexer with a different length" + ) + request.applymarker(mark) super().test_setitem_slice(data, box_in_series) @pytest.mark.xfail(reason="slice object is not iterable") @@ -427,10 +423,6 @@ def test_setitem_loc_iloc_slice(self, data): def test_setitem_slice_mismatch_length_raises(self, data): super().test_setitem_slice_mismatch_length_raises(data) - @pytest.mark.xfail(reason="slice object is not iterable") - def test_setitem_slice_array(self, data): - super().test_setitem_slice_array(data) - @pytest.mark.xfail(reason="Fail to raise") def test_setitem_invalid(self, data, invalid_scalar): super().test_setitem_invalid(data, invalid_scalar) diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 6a460b3ef1496..ff09156e3a927 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -330,6 +330,19 @@ def test_searchsorted(self, performance_warning, data_for_sorting, as_series): with tm.assert_produces_warning(performance_warning, check_stacklevel=False): super().test_searchsorted(data_for_sorting, as_series) + def test_sort_inplace(self, data_for_sorting): + with pytest.raises(NotImplementedError): + data_for_sorting.sort() + + def test_sort_inplace_descending(self, data_for_sorting): + with pytest.raises(NotImplementedError): + data_for_sorting.sort(ascending=False) + + @pytest.mark.parametrize("na_position", ["first", "last"]) + def 
test_sort_inplace_na_position(self, data_missing_for_sorting, na_position): + with pytest.raises(NotImplementedError): + data_missing_for_sorting.sort(na_position=na_position) + def test_shift_0_periods(self, data): # GH#33856 shifting with periods=0 should return a copy, not same obj result = data.shift(0)