Skip to content

Commit adde3d9

Browse files
ArrayRecord Team and copybara-github
authored and committed
Update ArrayRecord Reader to allow reading from GCS directly, since Riegeli provides a GCS reader (riegeli/gcs)
PiperOrigin-RevId: 792768387
1 parent d0ed18b commit adde3d9

3 files changed

Lines changed: 31 additions & 10 deletions

File tree

MODULE.bazel

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -21,16 +21,17 @@ module(
2121
repo_name = "com_google_array_record",
2222
)
2323

24-
bazel_dep(name = "rules_proto", version = "7.0.2")
25-
bazel_dep(name = "rules_python", version = "1.0.0")
24+
bazel_dep(name = "rules_proto", version = "7.1.0")
25+
bazel_dep(name = "rules_python", version = "1.4.1")
2626
bazel_dep(name = "platforms", version = "0.0.11")
2727
bazel_dep(name = "protobuf", version = "31.1")
2828
bazel_dep(name = "googletest", version = "1.15.2")
29-
bazel_dep(name = "abseil-cpp", version = "20250127.0")
29+
bazel_dep(name = "abseil-cpp", version = "20250127.1")
3030
bazel_dep(name = "abseil-py", version = "2.1.0")
3131
bazel_dep(name = "eigen", version = "3.4.0.bcr.3")
32-
bazel_dep(name = "riegeli", version = "0.0.0-20241218-3385e3c")
32+
bazel_dep(name = "riegeli", version = "0.0.0-20250717-5b2e77e")
3333
bazel_dep(name = "pybind11_bazel", version = "2.12.0")
34+
bazel_dep(name = "google_cloud_cpp", version = "3.0.0-rc0")
3435

3536
SUPPORTED_PYTHON_VERSIONS = [
3637
"3.10",

python/BUILD

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,8 @@ pybind_extension(
2020
"@riegeli//riegeli/base:initializer",
2121
"@riegeli//riegeli/bytes:fd_reader",
2222
"@riegeli//riegeli/bytes:fd_writer",
23+
"@riegeli//riegeli/gcs:gcs_object",
24+
"@riegeli//riegeli/gcs:gcs_reader",
2325
],
2426
)
2527

python/array_record_module.cc

Lines changed: 24 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -22,18 +22,22 @@ limitations under the License.
2222
#include <vector>
2323

2424
#include "absl/status/status.h"
25+
#include "absl/strings/match.h"
2526
#include "absl/strings/str_format.h"
2627
#include "absl/strings/string_view.h"
2728
#include "cpp/array_record_reader.h"
2829
#include "cpp/array_record_writer.h"
2930
#include "cpp/thread_pool.h"
31+
#include "third_party/cloud_cpp/google/cloud/storage/client.h"
3032
#include "pybind11/gil.h"
3133
#include "pybind11/pybind11.h"
3234
#include "pybind11/pytypes.h"
3335
#include "pybind11/stl.h"
3436
#include "riegeli/base/maker.h"
3537
#include "riegeli/bytes/fd_reader.h"
3638
#include "riegeli/bytes/fd_writer.h"
39+
#include "riegeli/gcs/gcs_object.h"
40+
#include "riegeli/gcs/gcs_reader.h"
3741

3842
namespace py = pybind11;
3943

@@ -50,10 +54,13 @@ PYBIND11_MODULE(array_record_module, m) {
5054
throw py::value_error(
5155
std::string(status_or_option.status().message()));
5256
}
57+
riegeli::FdWriterBase::Options file_writer_options;
58+
file_writer_options.set_buffer_size(size_t{16} << 20);
5359
// Release the GIL because IO is time consuming.
5460
py::gil_scoped_release scoped_release;
5561
return new array_record::ArrayRecordWriter(
56-
riegeli::Maker<riegeli::FdWriter>(path),
62+
riegeli::Maker<riegeli::FdWriter>(
63+
path, std::move(file_writer_options)),
5764
status_or_option.value());
5865
}),
5966
py::arg("path"), py::arg("options") = "")
@@ -84,18 +91,29 @@ PYBIND11_MODULE(array_record_module, m) {
8491
std::string(status_or_option.status().message()));
8592
}
8693
riegeli::FdReaderBase::Options file_reader_options;
94+
riegeli::GcsReader::Options gcs_reader_options;
8795
if (kwargs.contains("file_reader_buffer_size")) {
8896
auto file_reader_buffer_size =
8997
kwargs["file_reader_buffer_size"].cast<int64_t>();
9098
file_reader_options.set_buffer_size(file_reader_buffer_size);
99+
gcs_reader_options.set_buffer_size(file_reader_buffer_size);
91100
}
92101
// Release the GIL because IO is time consuming.
93102
py::gil_scoped_release scoped_release;
94-
return new array_record::ArrayRecordReader(
95-
riegeli::Maker<riegeli::FdReader>(
96-
path, std::move(file_reader_options)),
97-
status_or_option.value(),
98-
array_record::ArrayRecordGlobalPool());
103+
if (absl::StartsWith(path, "gs://")) {
104+
return new array_record::ArrayRecordReader(
105+
riegeli::Maker<riegeli::GcsReader>(
106+
google::cloud::storage::Client(),
107+
riegeli::GcsObject(path), std::move(gcs_reader_options)),
108+
status_or_option.value(),
109+
array_record::ArrayRecordGlobalPool());
110+
} else {
111+
return new array_record::ArrayRecordReader(
112+
riegeli::Maker<riegeli::FdReader>(
113+
path, std::move(file_reader_options)),
114+
status_or_option.value(),
115+
array_record::ArrayRecordGlobalPool());
116+
}
99117
}),
100118
py::arg("path"), py::arg("options") = "", R"(
101119
ArrayRecordReader for fast sequential or random access.

0 commit comments

Comments
 (0)