diff --git a/cpp/cmake_modules/FindArrowSkyhook.cmake b/cpp/cmake_modules/FindArrowSkyhook.cmake new file mode 100644 index 000000000000..a078f11af5ac --- /dev/null +++ b/cpp/cmake_modules/FindArrowSkyhook.cmake @@ -0,0 +1,88 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# - Find Arrow Skyhook (arrow/dataset/api.h, libarrow_dataset.a, libarrow_dataset.so) +# +# This module requires Arrow from which it uses +# arrow_find_package() +# +# This module defines +# ARROW_SKYHOOK_FOUND, whether Arrow Skyhook has been found +# ARROW_SKYHOOK_IMPORT_LIB, +# path to libarrow_dataset's import library (Windows only) +# ARROW_SKYHOOK_INCLUDE_DIR, directory containing headers +# ARROW_SKYHOOK_LIB_DIR, directory containing Arrow Skyhook libraries +# ARROW_SKYHOOK_SHARED_LIB, path to libarrow_dataset's shared library +# ARROW_SKYHOOK_STATIC_LIB, path to libarrow_dataset.a + +if(DEFINED ARROW_SKYHOOK_FOUND) + return() +endif() + +set(find_package_arguments) +if(${CMAKE_FIND_PACKAGE_NAME}_FIND_VERSION) + list(APPEND find_package_arguments "${${CMAKE_FIND_PACKAGE_NAME}_FIND_VERSION}") +endif() +if(${CMAKE_FIND_PACKAGE_NAME}_FIND_REQUIRED) + list(APPEND find_package_arguments REQUIRED) +endif() +if(${CMAKE_FIND_PACKAGE_NAME}_FIND_QUIETLY) + list(APPEND find_package_arguments QUIET) +endif() +find_package(Arrow ${find_package_arguments}) + +if(ARROW_FOUND) + arrow_find_package(ARROW_SKYHOOK + "${ARROW_HOME}" + arrow_skyhook + skyhook/client/file_skyhook.h + ArrowSkyhook + arrow-skyhook) + if(NOT ARROW_SKYHOOK_VERSION) + set(ARROW_SKYHOOK_VERSION "${ARROW_VERSION}") + endif() +endif() + +if("${ARROW_SKYHOOK_VERSION}" VERSION_EQUAL "${ARROW_VERSION}") + set(ARROW_SKYHOOK_VERSION_MATCH TRUE) +else() + set(ARROW_SKYHOOK_VERSION_MATCH FALSE) +endif() + +mark_as_advanced(ARROW_SKYHOOK_IMPORT_LIB + ARROW_SKYHOOK_INCLUDE_DIR + ARROW_SKYHOOK_LIBS + ARROW_SKYHOOK_LIB_DIR + ARROW_SKYHOOK_SHARED_IMP_LIB + ARROW_SKYHOOK_SHARED_LIB + ARROW_SKYHOOK_STATIC_LIB + ARROW_SKYHOOK_VERSION + ARROW_SKYHOOK_VERSION_MATCH) + +find_package_handle_standard_args( + ArrowSkyhook + REQUIRED_VARS ARROW_SKYHOOK_INCLUDE_DIR ARROW_SKYHOOK_LIB_DIR + ARROW_SKYHOOK_VERSION_MATCH + VERSION_VAR ARROW_SKYHOOK_VERSION) +set(ARROW_SKYHOOK_FOUND ${ArrowSkyhook_FOUND}) + +if(ArrowSkyhook_FOUND AND NOT ArrowSkyhook_FIND_QUIETLY) + message(STATUS "Found the Arrow Skyhook by ${ARROW_SKYHOOK_FIND_APPROACH}") + message(STATUS "Found the Arrow Skyhook shared library: ${ARROW_SKYHOOK_SHARED_LIB}") + message(STATUS "Found the Arrow Skyhook import library: ${ARROW_SKYHOOK_IMPORT_LIB}") + message(STATUS "Found the Arrow Skyhook static library: ${ARROW_SKYHOOK_STATIC_LIB}") +endif() diff --git a/cpp/src/skyhook/ArrowSkyhookConfig.cmake.in b/cpp/src/skyhook/ArrowSkyhookConfig.cmake.in new file mode 100644 index 000000000000..5bedc3988814 --- /dev/null +++ b/cpp/src/skyhook/ArrowSkyhookConfig.cmake.in @@ -0,0 +1,38 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# This config sets the following variables in your project:: +# +# ArrowDataset_FOUND - true if Arrow Dataset found on the system +# +# This config sets the following targets in your project:: +# +# arrow_dataset_shared - for linked as shared library if shared library is built +# arrow_dataset_static - for linked as static library if static library is built + +@PACKAGE_INIT@ + +include(CMakeFindDependencyMacro) +find_dependency(Arrow) +find_dependency(ArrowDataset) +find_dependency(Parquet) + +# Load targets only once. If we load targets multiple times, CMake reports +# already existent target error. +if(NOT (TARGET arrow_dataset_shared OR TARGET arrow_dataset_static)) + include("${CMAKE_CURRENT_LIST_DIR}/ArrowDatasetTargets.cmake") +endif() diff --git a/cpp/src/skyhook/CMakeLists.txt b/cpp/src/skyhook/CMakeLists.txt index 992c46741325..6cffda38b35a 100644 --- a/cpp/src/skyhook/CMakeLists.txt +++ b/cpp/src/skyhook/CMakeLists.txt @@ -19,7 +19,7 @@ add_subdirectory(client) # define the targets to build -add_custom_target(arrow_skyhook_client) +add_custom_target(arrow_skyhook) add_custom_target(cls_skyhook) # define the dependencies @@ -28,19 +28,23 @@ set(ARROW_SKYHOOK_LINK_STATIC arrow_dataset_static librados::rados) set(ARROW_SKYHOOK_LINK_SHARED arrow_dataset_shared librados::rados) # define the client and cls sources -set(ARROW_SKYHOOK_CLIENT_SOURCES client/file_skyhook.cc protocol/rados_protocol.cc +set(ARROW_SKYHOOK_SOURCES client/file_skyhook.cc protocol/rados_protocol.cc protocol/skyhook_protocol.cc) set(ARROW_SKYHOOK_CLS_SOURCES cls/cls_skyhook.cc protocol/rados_protocol.cc protocol/skyhook_protocol.cc) # define the client library -add_arrow_lib(arrow_skyhook_client +add_arrow_lib(arrow_skyhook + CMAKE_PACKAGE_NAME + ArrowSkyhook PKG_CONFIG_NAME - skyhook + arrow-skyhook + # PKG_CONFIG_NAME + # skyhook SOURCES - ${ARROW_SKYHOOK_CLIENT_SOURCES} + ${ARROW_SKYHOOK_SOURCES} OUTPUTS - ARROW_SKYHOOK_CLIENT_LIBRARIES + ARROW_SKYHOOK_LIBRARIES SHARED_LINK_LIBS ${ARROW_SKYHOOK_LINK_SHARED} STATIC_LINK_LIBS @@ -58,15 +62,15 @@ add_arrow_lib(cls_skyhook ${ARROW_SKYHOOK_LINK_STATIC}) # finish building the project -add_dependencies(arrow_skyhook_client ${ARROW_SKYHOOK_CLIENT_LIBRARIES}) +add_dependencies(arrow_skyhook ${ARROW_SKYHOOK_LIBRARIES}) add_dependencies(cls_skyhook ${ARROW_SKYHOOK_CLS_LIBRARIES}) # define the test builds if(ARROW_TEST_LINKAGE STREQUAL "static") - set(ARROW_SKYHOOK_TEST_LINK_LIBS arrow_skyhook_client_static arrow_dataset_static + set(ARROW_SKYHOOK_TEST_LINK_LIBS arrow_skyhook_static arrow_dataset_static ${ARROW_TEST_STATIC_LINK_LIBS}) else() - set(ARROW_SKYHOOK_TEST_LINK_LIBS arrow_skyhook_client_shared arrow_dataset_shared + set(ARROW_SKYHOOK_TEST_LINK_LIBS arrow_skyhook_shared arrow_dataset_shared ${ARROW_TEST_SHARED_LINK_LIBS}) endif() diff --git a/cpp/src/skyhook/skyhook.pc.in b/cpp/src/skyhook/arrow-skyhook.pc.in similarity index 96% rename from cpp/src/skyhook/skyhook.pc.in rename to cpp/src/skyhook/arrow-skyhook.pc.in index a3a4da5ee9c9..14ec168e8724 100644 --- a/cpp/src/skyhook/skyhook.pc.in +++ b/cpp/src/skyhook/arrow-skyhook.pc.in @@ -23,4 +23,4 @@ Name: Skyhook Description: Skyhook is a plugin for offloading computations into Ceph. Version: @SKYHOOK_VERSION@ Requires: arrow_dataset -Libs: -L${libdir} -larrow_skyhook_client \ No newline at end of file +Libs: -L${libdir} -larrow_skyhook \ No newline at end of file diff --git a/cpp/src/skyhook/client/file_skyhook.h b/cpp/src/skyhook/client/file_skyhook.h index bec41fbdfa61..00018b4cde00 100644 --- a/cpp/src/skyhook/client/file_skyhook.h +++ b/cpp/src/skyhook/client/file_skyhook.h @@ -41,6 +41,8 @@ struct RadosConnCtx { std::string ceph_cluster_name; std::string ceph_cls_name; + RadosConnCtx() = default; + RadosConnCtx(std::string ceph_config_path, std::string ceph_data_pool, std::string ceph_user_name, std::string ceph_cluster_name, std::string ceph_cls_name) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index b4b713ba9914..68bbcc4ca8ff 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -72,6 +72,7 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") option(PYARROW_BUILD_DATASET "Build the PyArrow Dataset integration" OFF) option(PYARROW_BUILD_GANDIVA "Build the PyArrow Gandiva integration" OFF) option(PYARROW_BUILD_PARQUET "Build the PyArrow Parquet integration" OFF) + option(PYARROW_BUILD_SKYHOOK "Build the PyArrow Skyhook integration" OFF) option(PYARROW_PARQUET_USE_SHARED "Rely on parquet shared libraries where relevant" ON) option(PYARROW_BUILD_PARQUET_ENCRYPTION "Build the PyArrow Parquet encryption integration" OFF) @@ -511,6 +512,23 @@ if(PYARROW_BUILD_ORC) endif() endif() +if (PYARROW_BUILD_SKYHOOK) + # Skyhook + find_package(ArrowSkyhook REQUIRED) + include_directories(SYSTEM ${ARROW_SKYHOOK_INCLUDE_DIR}) + if(PYARROW_BUILD_DATASET) + if(PYARROW_BUNDLE_ARROW_CPP) + file(COPY ${ARROW_SKYHOOK_INCLUDE_DIR}/skyhook + DESTINATION ${BUILD_OUTPUT_ROOT_DIRECTORY}/include) + bundle_arrow_lib(ARROW_SKYHOOK_SHARED_LIB SO_VERSION ${ARROW_SO_VERSION}) + if(MSVC) + bundle_arrow_import_lib(ARROW_SKYHOOK_IMPORT_LIB) + endif() + endif() + set(CYTHON_EXTENSIONS ${CYTHON_EXTENSIONS} _dataset_skyhook) + endif() +endif() + # Flight if(PYARROW_BUILD_FLIGHT) # Arrow Flight @@ -632,6 +650,9 @@ if(PYARROW_BUILD_DATASET) if(PYARROW_BUILD_PARQUET) target_link_libraries(_dataset_parquet PRIVATE ${DATASET_LINK_LIBS}) endif() + if (PYARROW_BUILD_SKYHOOK) + target_link_libraries(_dataset_skyhook PRIVATE ${DATASET_LINK_LIBS}) + endif() endif() if(PYARROW_BUILD_GANDIVA) diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index f3c2220a55d7..fd6724aa175f 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -86,6 +86,24 @@ def _get_parquet_classes(): _dataset_pq = None +def _get_skyhook_fileformat(): + """ + Import SkyhookFileFormat on first usage (to avoid circular import issue + when `pyarrow._dataset_skyhook` would be imported first) + """ + global _skyhook_fileformat + global _skyhook_imported + if not _skyhook_imported: + try: + from pyarrow._dataset_skyhook import SkyhookFileFormat + _skyhook_fileformat = SkyhookFileFormat + except ImportError as e: + _skyhook_fileformat = None + finally: + _skyhook_imported = True + return _skyhook_fileformat + + def _get_parquet_symbol(name): """ Get a symbol from pyarrow.parquet if the latter is importable, otherwise @@ -681,6 +699,7 @@ cdef class FileFormat(_Weakrefable): 'csv': CsvFileFormat, 'parquet': _get_parquet_symbol('ParquetFileFormat'), 'orc': _get_orc_fileformat(), + 'skyhook': _get_skyhook_fileformat(), } class_ = classes.get(type_name, None) diff --git a/python/pyarrow/_dataset_skyhook.pyx b/python/pyarrow/_dataset_skyhook.pyx new file mode 100644 index 000000000000..7f5715762af4 --- /dev/null +++ b/python/pyarrow/_dataset_skyhook.pyx @@ -0,0 +1,68 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# cython: language_level = 3 + +from pyarrow._dataset cimport FileFormat +from pyarrow.lib cimport * +from pyarrow.lib import frombytes, tobytes +from pyarrow.includes.libarrow_dataset_skyhook cimport * + + +cdef class SkyhookFileFormat(FileFormat): + """ + A FileFormat implementation that offloads the fragment + scan operations to the Ceph OSDs. + Parameters + --------- + file_format: The underlying file format to use. + ceph_config_path: The path to the Ceph config file. + data_pool: Name of the CephFS data pool. + user_name: The username accessing the Ceph cluster. + cluster_name: Name of the cluster. + """ + cdef: + CSkyhookFileFormat* skyhook_format + + def __init__( + self, + file_format="parquet", + ceph_config_path="/etc/ceph/ceph.conf", + ceph_data_pool="cephfs_data", + ceph_user_name="client.admin", + ceph_cluster_name="ceph", + ceph_cls_name="arrow" + ): + cdef: + CRadosConnCtx ctx + + ctx.ceph_config_path = ceph_config_path + ctx.ceph_data_pool = ceph_data_pool + ctx.ceph_user_name = ceph_user_name + ctx.ceph_cluster_name = ceph_cluster_name + ctx.ceph_cls_name = ceph_cls_name + + self.init(shared_ptr[CFileFormat]( + new CSkyhookFileFormat( + make_shared[CRadosConnCtx](ctx), + tobytes(file_format) + ) + )) + + cdef void init(self, const shared_ptr[CFileFormat]& sp): + FileFormat.init(self, sp) + self.skyhook_format = sp.get() diff --git a/python/pyarrow/dataset.py b/python/pyarrow/dataset.py index 437fa6caa0ef..49aaf21b220d 100644 --- a/python/pyarrow/dataset.py +++ b/python/pyarrow/dataset.py @@ -51,6 +51,18 @@ from pyarrow.compute import Expression, scalar, field # noqa +_skyhook_available = False +_skyhook_msg = ( + "The pyarrow installation is not built with support for the Skyhook file " + "format." +) + +try: + from pyarrow._dataset_skyhook import SkyhookFileFormat + _skyhook_available = True +except ImportError: + pass + _orc_available = False _orc_msg = ( "The pyarrow installation is not built with support for the ORC file " @@ -92,6 +104,9 @@ def __getattr__(name): if name == "ParquetFileFormat" and not _parquet_available: raise ImportError(_parquet_msg) + if name == "SkyhookFileFormat" and not _skyhook_available: + raise ImportError(_skyhook_msg) + raise AttributeError( "module 'pyarrow.dataset' has no attribute '{0}'".format(name) ) @@ -263,6 +278,10 @@ def _ensure_format(obj): if not _orc_available: raise ValueError(_orc_msg) return OrcFileFormat() + elif obj == "skyhook": + if not _skyhook_available: + raise ValueError(_skyhook_msg) + return SkyhookFileFormat() else: raise ValueError("format '{}' is not supported".format(obj)) diff --git a/python/pyarrow/includes/libarrow_dataset_skyhook.pxd b/python/pyarrow/includes/libarrow_dataset_skyhook.pxd new file mode 100644 index 000000000000..6033e5a1e2b3 --- /dev/null +++ b/python/pyarrow/includes/libarrow_dataset_skyhook.pxd @@ -0,0 +1,46 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# cython: language_level = 3 + +from pyarrow.lib cimport * +from pyarrow.lib import frombytes, tobytes +from pyarrow.includes.common cimport * +from pyarrow.includes.libarrow cimport * +from pyarrow.includes.libarrow_dataset cimport * + +from pyarrow.lib cimport _Weakrefable + +cdef extern from "skyhook/client/file_skyhook.h" \ + namespace "skyhook" nogil: + cdef struct CRadosConnCtx "skyhook::RadosConnCtx": + CRadosConnCtx() + c_string ceph_config_path + c_string ceph_data_pool + c_string ceph_user_name + c_string ceph_cluster_name + c_string ceph_cls_name + +cdef extern from "skyhook/client/file_skyhook.h" \ + namespace "skyhook" nogil: + cdef cppclass CSkyhookFileFormat \ + "skyhook::SkyhookFileFormat"( + CFileFormat): + CSkyhookFileFormat( + shared_ptr[CRadosConnCtx] ctx, + c_string file_format + ) \ No newline at end of file diff --git a/python/setup.py b/python/setup.py index 5bf95caba95f..3469bb1e0c08 100755 --- a/python/setup.py +++ b/python/setup.py @@ -26,6 +26,7 @@ import shlex import shutil import sys +from numpy import append if sys.version_info >= (3, 10): import sysconfig @@ -120,6 +121,7 @@ def run(self): 'build pyarrow with TensorFlow support'), ('with-orc', None, 'build the ORC extension'), ('with-gandiva', None, 'build the Gandiva extension'), + ('with-skyhook', None, 'build the Skyhook bindings'), ('generate-coverage', None, 'enable Cython code coverage'), ('bundle-boost', None, @@ -166,6 +168,8 @@ def initialize_options(self): os.environ.get('PYARROW_WITH_DATASET', '0')) self.with_parquet = strtobool( os.environ.get('PYARROW_WITH_PARQUET', '0')) + self.with_skyhook = strtobool( + os.environ.get('PYARROW_WITH_SKYHOOK', '0')) self.with_static_parquet = strtobool( os.environ.get('PYARROW_WITH_STATIC_PARQUET', '0')) self.with_parquet_encryption = strtobool( @@ -204,6 +208,7 @@ def initialize_options(self): '_dataset', '_dataset_orc', '_dataset_parquet', + '_dataset_skyhook', '_feather', '_parquet', '_parquet_encryption', @@ -261,6 +266,7 @@ def append_cmake_bool(value, varname): append_cmake_bool(self.with_cuda, 'PYARROW_BUILD_CUDA') append_cmake_bool(self.with_flight, 'PYARROW_BUILD_FLIGHT') append_cmake_bool(self.with_gandiva, 'PYARROW_BUILD_GANDIVA') + append_cmake_bool(self.with_skyhook, 'PYARROW_BUILD_SKYHOOK') append_cmake_bool(self.with_dataset, 'PYARROW_BUILD_DATASET') append_cmake_bool(self.with_orc, 'PYARROW_BUILD_ORC') append_cmake_bool(self.with_parquet, 'PYARROW_BUILD_PARQUET') @@ -394,6 +400,8 @@ def _bundle_arrow_cpp(self, build_prefix, build_lib): move_shared_libs(build_prefix, build_lib, "plasma") if self.with_gandiva: move_shared_libs(build_prefix, build_lib, "gandiva") + if self.with_skyhook: + move_shared_libs(build_prefix, build_lib, "arrow_skyhook") if self.with_parquet and not self.with_static_parquet: move_shared_libs(build_prefix, build_lib, "parquet") if not self.with_static_boost and self.bundle_boost: @@ -443,6 +451,10 @@ def _failure_permitted(self, name): self.with_parquet and self.with_dataset ): return True + if name == '_dataset_skyhook' and not ( + self.with_skyhook and self.with_dataset + ): + return True if name == '_cuda' and not self.with_cuda: return True if name == 'gandiva' and not self.with_gandiva: