From 8a45bc1be4017a6aef28818703e055b36d2accfe Mon Sep 17 00:00:00 2001 From: Patrick Walsh Date: Tue, 12 Aug 2025 16:52:25 +0000 Subject: [PATCH 1/3] update _proto.py to have both including_default_value_fields and always_print_fields_with_no_presence so both versions of protobuf are supported --- pbspark/_proto.py | 40 +++++++++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/pbspark/_proto.py b/pbspark/_proto.py index 0c3c1c3..ae40aed 100644 --- a/pbspark/_proto.py +++ b/pbspark/_proto.py @@ -203,6 +203,7 @@ def message_to_dict( self, message: Message, including_default_value_fields: bool = False, + always_print_fields_with_no_presence: bool = False, preserving_proto_field_name: bool = False, use_integers_for_enums: bool = False, descriptor_pool: t.Optional[DescriptorPool] = None, @@ -212,7 +213,7 @@ def message_to_dict( Args: message: The protocol buffers message instance to serialize. - including_default_value_fields: If True, singular primitive fields, + always_print_fields_with_no_presence: If True, singular primitive fields, repeated fields, and map fields will always be serialized. If False, only serialize non-empty fields. Singular message fields and oneof fields are not affected by this option. @@ -224,14 +225,23 @@ def message_to_dict( default. float_precision: If set, use this to specify float field valid digits. 
""" - printer = _Printer( + printer_kwargs = dict( custom_serializers=self._custom_serializers, - including_default_value_fields=including_default_value_fields, preserving_proto_field_name=preserving_proto_field_name, use_integers_for_enums=use_integers_for_enums, descriptor_pool=descriptor_pool, float_precision=float_precision, ) + # protobuf versions >=3.20.0,<5.26.1 + if including_default_value_fields: + printer_kwargs.update({"including_default_value_fields": including_default_value_fields}) + # protobuf version >=5.26.1 + elif always_print_fields_with_no_presence: + printer_kwargs.update({"always_print_fields_with_no_presence": always_print_fields_with_no_presence}) + + printer = _Printer( + **printer_kwargs + ) return printer._MessageToJsonObject(message=message) def parse_dict( @@ -315,6 +325,7 @@ def get_decoder( self, message_type: t.Type[Message], including_default_value_fields: bool = False, + always_print_fields_with_no_presence: bool = False, preserving_proto_field_name: bool = False, use_integers_for_enums: bool = False, float_precision: t.Optional[int] = None, @@ -326,7 +337,7 @@ def get_decoder( Args: message_type: The message type for decoding. - including_default_value_fields: If True, singular primitive fields, + always_print_fields_with_no_presence: If True, singular primitive fields, repeated fields, and map fields will always be serialized. If False, only serialize non-empty fields. Singular message fields and oneof fields are not affected by this option. 
@@ -343,6 +354,7 @@ def decoder(s: bytes) -> dict: return self.message_to_dict( message_type.FromString(s), including_default_value_fields=including_default_value_fields, + always_print_fields_with_no_presence=always_print_fields_with_no_presence, preserving_proto_field_name=preserving_proto_field_name, use_integers_for_enums=use_integers_for_enums, float_precision=float_precision, @@ -354,6 +366,7 @@ def get_decoder_udf( self, message_type: t.Type[Message], including_default_value_fields: bool = False, + always_print_fields_with_no_presence: bool = False, preserving_proto_field_name: bool = False, use_integers_for_enums: bool = False, float_precision: t.Optional[int] = None, @@ -365,7 +378,7 @@ def get_decoder_udf( Args: message_type: The message type for decoding. - including_default_value_fields: If True, singular primitive fields, + always_print_fields_with_no_presence: If True, singular primitive fields, repeated fields, and map fields will always be serialized. If False, only serialize non-empty fields. Singular message fields and oneof fields are not affected by this option. @@ -379,6 +392,7 @@ def get_decoder_udf( self.get_decoder( message_type=message_type, including_default_value_fields=including_default_value_fields, + always_print_fields_with_no_presence=always_print_fields_with_no_presence, preserving_proto_field_name=preserving_proto_field_name, use_integers_for_enums=use_integers_for_enums, float_precision=float_precision, @@ -395,6 +409,7 @@ def from_protobuf( data: t.Union[Column, str], message_type: t.Type[Message], including_default_value_fields: bool = False, + always_print_fields_with_no_presence: bool = False, preserving_proto_field_name: bool = False, use_integers_for_enums: bool = False, float_precision: t.Optional[int] = None, @@ -406,7 +421,7 @@ def from_protobuf( Args: message_type: The message type for decoding. 
- including_default_value_fields: If True, singular primitive fields, + always_print_fields_with_no_presence: If True, singular primitive fields, repeated fields, and map fields will always be serialized. If False, only serialize non-empty fields. Singular message fields and oneof fields are not affected by this option. @@ -420,6 +435,7 @@ def from_protobuf( protobuf_decoder_udf = self.get_decoder_udf( message_type=message_type, including_default_value_fields=including_default_value_fields, + always_print_fields_with_no_presence=always_print_fields_with_no_presence, preserving_proto_field_name=preserving_proto_field_name, use_integers_for_enums=use_integers_for_enums, float_precision=float_precision, @@ -517,6 +533,7 @@ def df_from_protobuf( df: DataFrame, message_type: t.Type[Message], including_default_value_fields: bool = False, + always_print_fields_with_no_presence: bool = False, preserving_proto_field_name: bool = False, use_integers_for_enums: bool = False, float_precision: t.Optional[int] = None, @@ -527,7 +544,7 @@ def df_from_protobuf( Args: df: A pyspark dataframe with encoded protobuf in the column at index 0. message_type: The message type for decoding. - including_default_value_fields: If True, singular primitive fields, + always_print_fields_with_no_presence: If True, singular primitive fields, repeated fields, and map fields will always be serialized. If False, only serialize non-empty fields. Singular message fields and oneof fields are not affected by this option. 
@@ -545,6 +562,7 @@ def df_from_protobuf( data=df.columns[0], message_type=message_type, including_default_value_fields=including_default_value_fields, + always_print_fields_with_no_presence=always_print_fields_with_no_presence, preserving_proto_field_name=preserving_proto_field_name, use_integers_for_enums=use_integers_for_enums, float_precision=float_precision, @@ -599,6 +617,7 @@ def from_protobuf( data: t.Union[Column, str], message_type: t.Type[Message], including_default_value_fields: bool = False, + always_print_fields_with_no_presence: bool = False, preserving_proto_field_name: bool = False, use_integers_for_enums: bool = False, float_precision: t.Optional[int] = None, @@ -609,7 +628,7 @@ def from_protobuf( Args: data: A pyspark column. message_type: The message type for decoding. - including_default_value_fields: If True, singular primitive fields, + always_print_fields_with_no_presence: If True, singular primitive fields, repeated fields, and map fields will always be serialized. If False, only serialize non-empty fields. Singular message fields and oneof fields are not affected by this option. @@ -625,6 +644,7 @@ def from_protobuf( data=data, message_type=message_type, including_default_value_fields=including_default_value_fields, + always_print_fields_with_no_presence=always_print_fields_with_no_presence, preserving_proto_field_name=preserving_proto_field_name, use_integers_for_enums=use_integers_for_enums, float_precision=float_precision, @@ -665,6 +685,7 @@ def df_from_protobuf( df: DataFrame, message_type: t.Type[Message], including_default_value_fields: bool = False, + always_print_fields_with_no_presence: bool = False, preserving_proto_field_name: bool = False, use_integers_for_enums: bool = False, float_precision: t.Optional[int] = None, @@ -676,7 +697,7 @@ def df_from_protobuf( Args: df: A pyspark dataframe with encoded protobuf in the column at index 0. message_type: The message type for decoding. 
- including_default_value_fields: If True, singular primitive fields, + always_print_fields_with_no_presence: If True, singular primitive fields, repeated fields, and map fields will always be serialized. If False, only serialize non-empty fields. Singular message fields and oneof fields are not affected by this option. @@ -695,6 +716,7 @@ def df_from_protobuf( df=df, message_type=message_type, including_default_value_fields=including_default_value_fields, + always_print_fields_with_no_presence=always_print_fields_with_no_presence, preserving_proto_field_name=preserving_proto_field_name, use_integers_for_enums=use_integers_for_enums, float_precision=float_precision, From fcd047dcd83037d737d682da1a7548055a6cfb1c Mon Sep 17 00:00:00 2001 From: Patrick Walsh Date: Tue, 12 Aug 2025 16:53:52 +0000 Subject: [PATCH 2/3] fix comments --- pbspark/_proto.py | 56 +++++++++++++++++++++++------------------------ 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/pbspark/_proto.py b/pbspark/_proto.py index ae40aed..d41147e 100644 --- a/pbspark/_proto.py +++ b/pbspark/_proto.py @@ -213,10 +213,10 @@ def message_to_dict( Args: message: The protocol buffers message instance to serialize. - always_print_fields_with_no_presence: If True, singular primitive fields, - repeated fields, and map fields will always be serialized. If - False, only serialize non-empty fields. Singular message fields - and oneof fields are not affected by this option. + including_default_value_fields and always_print_fields_with_no_presence: If True, + singular primitive fields, repeated fields, and map fields will always + be serialized. If False, only serialize non-empty fields. Singular + message fields and oneof fields are not affected by this option. preserving_proto_field_name: If True, use the original proto field names as defined in the .proto file. If False, convert the field names to lowerCamelCase. 
@@ -337,10 +337,10 @@ def get_decoder( Args: message_type: The message type for decoding. - always_print_fields_with_no_presence: If True, singular primitive fields, - repeated fields, and map fields will always be serialized. If - False, only serialize non-empty fields. Singular message fields - and oneof fields are not affected by this option. + including_default_value_fields and always_print_fields_with_no_presence: If True, + singular primitive fields, repeated fields, and map fields will always + be serialized. If False, only serialize non-empty fields. Singular + message fields and oneof fields are not affected by this option. preserving_proto_field_name: If True, use the original proto field names as defined in the .proto file. If False, convert the field names to lowerCamelCase. @@ -378,10 +378,10 @@ def get_decoder_udf( Args: message_type: The message type for decoding. - always_print_fields_with_no_presence: If True, singular primitive fields, - repeated fields, and map fields will always be serialized. If - False, only serialize non-empty fields. Singular message fields - and oneof fields are not affected by this option. + including_default_value_fields and always_print_fields_with_no_presence: If True, + singular primitive fields, repeated fields, and map fields will always + be serialized. If False, only serialize non-empty fields. Singular + message fields and oneof fields are not affected by this option. preserving_proto_field_name: If True, use the original proto field names as defined in the .proto file. If False, convert the field names to lowerCamelCase. @@ -421,10 +421,10 @@ def from_protobuf( Args: message_type: The message type for decoding. - always_print_fields_with_no_presence: If True, singular primitive fields, - repeated fields, and map fields will always be serialized. If - False, only serialize non-empty fields. Singular message fields - and oneof fields are not affected by this option. 
+ including_default_value_fields and always_print_fields_with_no_presence: If True, + singular primitive fields, repeated fields, and map fields will always + be serialized. If False, only serialize non-empty fields. Singular + message fields and oneof fields are not affected by this option. preserving_proto_field_name: If True, use the original proto field names as defined in the .proto file. If False, convert the field names to lowerCamelCase. @@ -544,10 +544,10 @@ def df_from_protobuf( Args: df: A pyspark dataframe with encoded protobuf in the column at index 0. message_type: The message type for decoding. - always_print_fields_with_no_presence: If True, singular primitive fields, - repeated fields, and map fields will always be serialized. If - False, only serialize non-empty fields. Singular message fields - and oneof fields are not affected by this option. + including_default_value_fields and always_print_fields_with_no_presence: If True, + singular primitive fields, repeated fields, and map fields will always + be serialized. If False, only serialize non-empty fields. Singular + message fields and oneof fields are not affected by this option. preserving_proto_field_name: If True, use the original proto field names as defined in the .proto file. If False, convert the field names to lowerCamelCase. @@ -628,10 +628,10 @@ def from_protobuf( Args: data: A pyspark column. message_type: The message type for decoding. - always_print_fields_with_no_presence: If True, singular primitive fields, - repeated fields, and map fields will always be serialized. If - False, only serialize non-empty fields. Singular message fields - and oneof fields are not affected by this option. + including_default_value_fields and always_print_fields_with_no_presence: If True, + singular primitive fields, repeated fields, and map fields will always + be serialized. If False, only serialize non-empty fields. Singular + message fields and oneof fields are not affected by this option. 
preserving_proto_field_name: If True, use the original proto field names as defined in the .proto file. If False, convert the field names to lowerCamelCase. @@ -697,10 +697,10 @@ def df_from_protobuf( Args: df: A pyspark dataframe with encoded protobuf in the column at index 0. message_type: The message type for decoding. - always_print_fields_with_no_presence: If True, singular primitive fields, - repeated fields, and map fields will always be serialized. If - False, only serialize non-empty fields. Singular message fields - and oneof fields are not affected by this option. + including_default_value_fields and always_print_fields_with_no_presence: If True, + singular primitive fields, repeated fields, and map fields will always + be serialized. If False, only serialize non-empty fields. Singular + message fields and oneof fields are not affected by this option. preserving_proto_field_name: If True, use the original proto field names as defined in the .proto file. If False, convert the field names to lowerCamelCase. 
From d070f1fe529dc30b8a87856f45805b04eb307106 Mon Sep 17 00:00:00 2001 From: Patrick Walsh Date: Tue, 19 Aug 2025 12:13:37 +0000 Subject: [PATCH 3/3] update unit tests to pass with update to always print fields with no presence test --- tests/test_proto.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/test_proto.py b/tests/test_proto.py index 4e68565..1cb0de0 100644 --- a/tests/test_proto.py +++ b/tests/test_proto.py @@ -101,7 +101,7 @@ def expanded(request): @pytest.fixture(params=[True, False]) -def including_default_value_fields(request): +def always_print_fields_with_no_presence(request): return request.param @@ -386,7 +386,7 @@ def test_df_to_from_protobuf(example, spark, expanded): assert df.collect() == df_encoded.collect() -def test_including_default_value_fields(spark, including_default_value_fields): +def test_always_print_fields_with_no_presence(spark, always_print_fields_with_no_presence): example = ExampleMessage(string="asdf") data = [{"value": example.SerializeToString()}] @@ -396,15 +396,14 @@ def test_including_default_value_fields(spark, including_default_value_fields): df=df, message_type=ExampleMessage, expanded=True, - including_default_value_fields=including_default_value_fields, + always_print_fields_with_no_presence=always_print_fields_with_no_presence, ) data = df_decoded.collect() - if including_default_value_fields: + if always_print_fields_with_no_presence: assert data[0].asDict(True)["int32"] == 0 else: assert data[0].asDict(True)["int32"] is None - def test_use_integers_for_enums(spark, use_integers_for_enums): example = ExampleMessage(enum=ExampleMessage.SomeEnum.first) data = [{"value": example.SerializeToString()}]