Skip to content

Commit 40b98ed

Browse files
ArrayRecord Team and copybara-github
authored and committed
Update ArrayRecord Reader to allow reading from GCS directly through Riegeli's GcsReader
PiperOrigin-RevId: 792768387
1 parent d0ed18b commit 40b98ed

3 files changed

Lines changed: 33 additions & 10 deletions

File tree

MODULE.bazel

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,15 +21,15 @@ module(
2121
repo_name = "com_google_array_record",
2222
)
2323

24-
bazel_dep(name = "rules_proto", version = "7.0.2")
25-
bazel_dep(name = "rules_python", version = "1.0.0")
24+
bazel_dep(name = "rules_proto", version = "7.1.0")
25+
bazel_dep(name = "rules_python", version = "1.4.1")
2626
bazel_dep(name = "platforms", version = "0.0.11")
2727
bazel_dep(name = "protobuf", version = "31.1")
2828
bazel_dep(name = "googletest", version = "1.15.2")
29-
bazel_dep(name = "abseil-cpp", version = "20250127.0")
29+
bazel_dep(name = "abseil-cpp", version = "20250127.1")
3030
bazel_dep(name = "abseil-py", version = "2.1.0")
3131
bazel_dep(name = "eigen", version = "3.4.0.bcr.3")
32-
bazel_dep(name = "riegeli", version = "0.0.0-20241218-3385e3c")
32+
bazel_dep(name = "riegeli", version = "0.0.0-20250717-5b2e77e")
3333
bazel_dep(name = "pybind11_bazel", version = "2.12.0")
3434

3535
SUPPORTED_PYTHON_VERSIONS = [

python/BUILD

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,12 @@ pybind_extension(
1717
"//cpp:array_record_reader",
1818
"//cpp:array_record_writer",
1919
"//cpp:thread_pool",
20+
"//third_party/cloud_cpp/google/cloud/storage:storage_client",
2021
"@riegeli//riegeli/base:initializer",
2122
"@riegeli//riegeli/bytes:fd_reader",
2223
"@riegeli//riegeli/bytes:fd_writer",
24+
"@riegeli//riegeli/gcs:gcs_object",
25+
"@riegeli//riegeli/gcs:gcs_reader",
2326
],
2427
)
2528

python/array_record_module.cc

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,19 +21,25 @@ limitations under the License.
2121
#include <utility>
2222
#include <vector>
2323

24+
#include "file/base/options.pb.h"
2425
#include "absl/status/status.h"
26+
#include "absl/strings/match.h"
2527
#include "absl/strings/str_format.h"
2628
#include "absl/strings/string_view.h"
2729
#include "cpp/array_record_reader.h"
2830
#include "cpp/array_record_writer.h"
2931
#include "cpp/thread_pool.h"
32+
#include "third_party/cloud_cpp/google/cloud/storage/client.h"
3033
#include "pybind11/gil.h"
3134
#include "pybind11/pybind11.h"
3235
#include "pybind11/pytypes.h"
3336
#include "pybind11/stl.h"
3437
#include "riegeli/base/maker.h"
3538
#include "riegeli/bytes/fd_reader.h"
3639
#include "riegeli/bytes/fd_writer.h"
40+
#include "riegeli/gcs/gcs_object.h"
41+
#include "riegeli/gcs/gcs_reader.h"
42+
3743

3844
namespace py = pybind11;
3945

@@ -50,10 +56,13 @@ PYBIND11_MODULE(array_record_module, m) {
5056
throw py::value_error(
5157
std::string(status_or_option.status().message()));
5258
}
59+
riegeli::FdWriterBase::Options file_writer_options;
60+
file_writer_options.set_buffer_size(size_t{16} << 20);
5361
// Release the GIL because IO is time consuming.
5462
py::gil_scoped_release scoped_release;
5563
return new array_record::ArrayRecordWriter(
56-
riegeli::Maker<riegeli::FdWriter>(path),
64+
riegeli::Maker<riegeli::FdWriter>(
65+
path, std::move(file_writer_options)),
5766
status_or_option.value());
5867
}),
5968
py::arg("path"), py::arg("options") = "")
@@ -84,18 +93,29 @@ PYBIND11_MODULE(array_record_module, m) {
8493
std::string(status_or_option.status().message()));
8594
}
8695
riegeli::FdReaderBase::Options file_reader_options;
96+
riegeli::GcsReader::Options gcs_reader_options;
8797
if (kwargs.contains("file_reader_buffer_size")) {
8898
auto file_reader_buffer_size =
8999
kwargs["file_reader_buffer_size"].cast<int64_t>();
90100
file_reader_options.set_buffer_size(file_reader_buffer_size);
101+
gcs_reader_options.set_buffer_size(file_reader_buffer_size);
91102
}
92103
// Release the GIL because IO is time consuming.
93104
py::gil_scoped_release scoped_release;
94-
return new array_record::ArrayRecordReader(
95-
riegeli::Maker<riegeli::FdReader>(
96-
path, std::move(file_reader_options)),
97-
status_or_option.value(),
98-
array_record::ArrayRecordGlobalPool());
105+
if (absl::StartsWith(path, "gs://")) {
106+
return new array_record::ArrayRecordReader(
107+
riegeli::Maker<riegeli::GcsReader>(
108+
google::cloud::storage::Client(),
109+
riegeli::GcsObject(path), std::move(gcs_reader_options)),
110+
status_or_option.value(),
111+
array_record::ArrayRecordGlobalPool());
112+
} else {
113+
return new array_record::ArrayRecordReader(
114+
riegeli::Maker<riegeli::FdReader>(
115+
path, std::move(file_reader_options)),
116+
status_or_option.value(),
117+
array_record::ArrayRecordGlobalPool());
118+
}
99119
}),
100120
py::arg("path"), py::arg("options") = "", R"(
101121
ArrayRecordReader for fast sequential or random access.

0 commit comments

Comments
 (0)