Skip to content

Commit 2ac0d72

Browse files
committed
Fix Protobuf msgidx serialization and added use.deprecated.format
1 parent f44d6ce commit 2ac0d72

File tree

6 files changed

+221
-54
lines changed

6 files changed

+221
-54
lines changed

CHANGELOG.md

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,9 @@
44

55
v1.8.2 is a maintenance release with the following fixes and enhancements:
66

7+
- **IMPORTANT**: Added mandatory `use.deprecated.format` to
8+
`ProtobufSerializer` and `ProtobufDeserializer`.
9+
See **Upgrade considerations** below for more information.
710
- **Python 2.7 binary wheels are no longer provided.**
811
Users still on Python 2.7 will need to build confluent-kafka from source
912
and install librdkafka separately, see [README.md](README.md#Prerequisites)
@@ -25,6 +28,41 @@ for a complete list of changes, enhancements, fixes and upgrade considerations.
2528
**Note**: There were no v1.8.0 and v1.8.1 releases.
2629

2730

31+
## Upgrade considerations
32+
33+
### Protobuf serialization format changes
34+
35+
Prior to this version the confluent-kafka-python client had a bug where
36+
nested protobuf schemas indexes were incorrectly serialized, causing
37+
incompatibility with other Schema-Registry protobuf consumers and producers.
38+
39+
This has now been fixed, but since the old defect serialization and the new
40+
correct serialization are mutually incompatible the user of
41+
confluent-kafka-python will need to make an explicit choice which
42+
serialization format to use during a transitory phase while old producers and
43+
consumers are upgraded.
44+
45+
The `ProtobufSerializer` and `ProtobufDeserializer` constructors now
46+
both take a (for the time being) configuration dictionary that requires
47+
the `use.deprecated.format` configuration property to be explicitly set.
48+
49+
Producers should be upgraded first and as long as there are old (<=v1.7.0)
50+
Python consumers reading from topics being produced to, the new (>=v1.8.2)
51+
Python producer must be configured with `use.deprecated.format` set to `True`.
52+
53+
When all existing messages in the topic have been consumed by older consumers
54+
the consumers should be upgraded and both new producers and the new consumers
55+
must set `use.deprecated.format` to `False`.
56+
57+
58+
The requirement to explicitly set `use.deprecated.format` will be removed
59+
in a future version and the setting will then default to `False` (new format).
60+
61+
62+
63+
64+
65+
2866
## v1.7.0
2967

3068
v1.7.0 is a maintenance release with the following fixes and enhancements:

examples/protobuf_consumer.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,8 @@
4141
def main(args):
4242
topic = args.topic
4343

44-
protobuf_deserializer = ProtobufDeserializer(user_pb2.User)
44+
protobuf_deserializer = ProtobufDeserializer(user_pb2.User,
45+
{'use.deprecated.format': False})
4546
string_deserializer = StringDeserializer('utf_8')
4647

4748
consumer_conf = {'bootstrap.servers': args.bootstrap_servers,
@@ -62,7 +63,8 @@ def main(args):
6263

6364
user = msg.value()
6465
if user is not None:
65-
print("User record {}: name: {}\n"
66+
print("User record {}:\n"
67+
"\tname: {}\n"
6668
"\tfavorite_number: {}\n"
6769
"\tfavorite_color: {}\n"
6870
.format(msg.key(), user.name,

examples/protobuf_producer.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,8 @@ def main(args):
7474
schema_registry_client = SchemaRegistryClient(schema_registry_conf)
7575

7676
protobuf_serializer = ProtobufSerializer(user_pb2.User,
77-
schema_registry_client)
77+
schema_registry_client,
78+
{'use.deprecated.format': True})
7879

7980
producer_conf = {'bootstrap.servers': args.bootstrap_servers,
8081
'key.serializer': StringSerializer('utf_8'),
@@ -93,9 +94,9 @@ def main(args):
9394
user = user_pb2.User(name=user_name,
9495
favorite_color=user_favorite_color,
9596
favorite_number=user_favorite_number)
96-
producer.produce(topic=topic, key=str(uuid4()), value=user,
97+
producer.produce(topic=topic, partition=0, key=str(uuid4()), value=user,
9798
on_delivery=delivery_report)
98-
except KeyboardInterrupt:
99+
except (KeyboardInterrupt, EOFError):
99100
break
100101
except ValueError:
101102
print("Invalid input, discarding record...")

src/confluent_kafka/schema_registry/protobuf.py

Lines changed: 147 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
import sys
2020
import base64
2121
import struct
22+
import warnings
2223
from collections import deque
2324

2425
from google.protobuf.message import DecodeError
@@ -110,11 +111,6 @@ def _create_msg_index(msg_desc):
110111
if not found:
111112
raise ValueError("MessageDescriptor not found in file")
112113

113-
# The root element at the 0 position does not need a length prefix.
114-
if len(msg_idx) == 1 and msg_idx[0] == 0:
115-
return [0]
116-
117-
msg_idx.appendleft(len(msg_idx))
118114
return list(msg_idx)
119115

120116

@@ -169,6 +165,17 @@ class ProtobufSerializer(object):
169165
| | | Schema Registry subject names for Schema References |
170166
| | | Defaults to reference_subject_name_strategy |
171167
+-------------------------------------+----------+------------------------------------------------------+
168+
| ``use.deprecated.format`` | bool | Specifies whether the Protobuf serializer should |
169+
| | | serialize message indexes without zig-zag encoding. |
170+
| | | This option must be explicitly configured as older |
171+
| | | and newer Protobuf producers are incompatible. |
172+
| | | If the consumers of the topic being produced to are |
173+
| | | using confluent-kafka-python <1.8 then this property |
174+
| | | must be set to True until all old consumers have |
175+
| | | have been upgraded. |
176+
| | | Warning: This configuration property will be removed |
177+
| | | in a future version of the client. |
178+
+-------------------------------------+----------+------------------------------------------------------+
172179
173180
Schemas are registered to namespaces known as Subjects which define how a
174181
schema may evolve over time. By default the subject name is formed by
@@ -208,17 +215,27 @@ class ProtobufSerializer(object):
208215
__slots__ = ['_auto_register', '_use_latest_version', '_skip_known_types',
209216
'_registry', '_known_subjects',
210217
'_msg_class', '_msg_index', '_schema', '_schema_id',
211-
'_ref_reference_subject_func', '_subject_name_func']
218+
'_ref_reference_subject_func', '_subject_name_func',
219+
'_use_deprecated_format']
212220
# default configuration
213221
_default_conf = {
214222
'auto.register.schemas': True,
215223
'use.latest.version': False,
216224
'skip.known.types': False,
217225
'subject.name.strategy': topic_subject_name_strategy,
218-
'reference.subject.name.strategy': reference_subject_name_strategy
226+
'reference.subject.name.strategy': reference_subject_name_strategy,
227+
'use.deprecated.format': False,
219228
}
220229

221230
def __init__(self, msg_type, schema_registry_client, conf=None):
231+
232+
if conf is None or 'use.deprecated.format' not in conf:
233+
raise RuntimeError(
234+
"ProtobufSerializer: the 'use.deprecated.format' configuration "
235+
"property must be explicitly set due to backward incompatibility "
236+
"with older confluent-kafka-python Protobuf producers and consumers. "
237+
"See the release notes for more details")
238+
222239
# handle configuration
223240
conf_copy = self._default_conf.copy()
224241
if conf is not None:
@@ -238,6 +255,19 @@ def __init__(self, msg_type, schema_registry_client, conf=None):
238255
if not isinstance(self._skip_known_types, bool):
239256
raise ValueError("skip.known.types must be a boolean value")
240257

258+
self._use_deprecated_format = conf_copy.pop('use.deprecated.format')
259+
if not isinstance(self._use_deprecated_format, bool):
260+
raise ValueError("use.deprecated.format must be a boolean value")
261+
if not self._use_deprecated_format:
262+
warnings.warn("ProtobufSerializer: the 'use.deprecated.format' "
263+
"configuration property, and the ability to use the "
264+
"old incorrect Protobuf serializer heading format "
265+
"introduced in confluent-kafka-python v1.4.0, "
266+
"will be removed in an upcoming release in 2021 Q2. "
267+
"Please migrate your Python Protobuf producers and "
268+
"consumers to 'use.deprecated.format':True as "
269+
"soon as possible")
270+
241271
self._subject_name_func = conf_copy.pop('subject.name.strategy')
242272
if not callable(self._subject_name_func):
243273
raise ValueError("subject.name.strategy must be callable")
@@ -263,20 +293,46 @@ def __init__(self, msg_type, schema_registry_client, conf=None):
263293
schema_type='PROTOBUF')
264294

265295
@staticmethod
266-
def _encode_uvarints(buf, ints):
296+
def _write_varint(buf, val, zigzag=True):
297+
"""
298+
Writes val to buf, either using zigzag or uvarint encoding.
299+
300+
Args:
301+
buf (BytesIO): buffer to write to.
302+
val (int): integer to be encoded.
303+
zigzag (bool): whether to encode in zigzag or uvarint encoding
304+
"""
305+
306+
if zigzag:
307+
val = (val << 1) ^ (val >> 63)
308+
309+
while (val & ~0x7f) != 0:
310+
buf.write(_bytes((val & 0x7f) | 0x80))
311+
val >>= 7
312+
buf.write(_bytes(val))
313+
314+
@staticmethod
315+
def _encode_varints(buf, ints, zigzag=True):
267316
"""
268317
Encodes each int as a uvarint onto buf
269318
270319
Args:
271320
buf (BytesIO): buffer to write to.
272321
ints ([int]): ints to be encoded.
322+
zigzag (bool): whether to encode in zigzag or uvarint encoding
273323
274324
"""
325+
326+
assert len(ints) > 0
327+
# The root element at the 0 position does not need a length prefix.
328+
if ints == [0]:
329+
buf.write(_bytes(0x00))
330+
return
331+
332+
ProtobufSerializer._write_varint(buf, len(ints), zigzag=zigzag)
333+
275334
for value in ints:
276-
while (value & ~0x7f) != 0:
277-
buf.write(_bytes((value & 0x7f) | 0x80))
278-
value >>= 7
279-
buf.write(_bytes(value))
335+
ProtobufSerializer._write_varint(buf, value, zigzag=zigzag)
280336

281337
def _resolve_dependencies(self, ctx, file_desc):
282338
"""
@@ -361,7 +417,8 @@ def __call__(self, message_type, ctx):
361417
# (big endian)
362418
fo.write(struct.pack('>bI', _MAGIC_BYTE, self._schema_id))
363419
# write the record index to the buffer
364-
self._encode_uvarints(fo, self._msg_index)
420+
self._encode_varints(fo, self._msg_index,
421+
zigzag=not self._use_deprecated_format)
365422
# write the record itself
366423
fo.write(message_type.SerializeToString())
367424
return fo.getvalue()
@@ -374,28 +431,82 @@ class ProtobufDeserializer(object):
374431
375432
Args:
376433
message_type (GeneratedProtocolMessageType): Protobuf Message type.
434+
conf (dict): Configuration dictionary.
435+
436+
ProtobufDeserializer configuration properties:
437+
438+
+-------------------------------------+----------+------------------------------------------------------+
439+
| Property Name | Type | Description |
440+
+-------------------------------------+----------+------------------------------------------------------+
441+
| ``use.deprecated.format`` | bool | Specifies whether the Protobuf deserializer should |
442+
| | | deserialize message indexes without zig-zag encoding.|
443+
| | | This option must be explicitly configured as older |
444+
| | | and newer Protobuf producers are incompatible. |
445+
| | | If Protobuf messages in the topic to consume were |
446+
| | | produced with confluent-kafka-python <1.8 then this |
447+
| | | property must be set to True until all old messages |
448+
| | | have been processed and producers have been upgraded.|
449+
| | | Warning: This configuration property will be removed |
450+
| | | in a future version of the client. |
451+
+-------------------------------------+----------+------------------------------------------------------+
452+
377453
378454
See Also:
379455
`Protobuf API reference <https://googleapis.dev/python/protobuf/latest/google/protobuf.html>`_
380456
381457
"""
382-
__slots__ = ['_msg_class', '_msg_index']
458+
__slots__ = ['_msg_class', '_msg_index', '_use_deprecated_format']
459+
460+
# default configuration
461+
_default_conf = {
462+
'use.deprecated.format': False,
463+
}
464+
465+
def __init__(self, message_type, conf=None):
466+
467+
# Require use.deprecated.format to be explicitly configured
468+
# during a transitionary period since old/new format are
469+
# incompatible.
470+
if conf is None or 'use.deprecated.format' not in conf:
471+
raise RuntimeError(
472+
"ProtobufDeserializer: the 'use.deprecated.format' configuration "
473+
"property must be explicitly set due to backward incompatibility "
474+
"with older confluent-kafka-python Protobuf producers and consumers. "
475+
"See the release notes for more details")
476+
477+
# handle configuration
478+
conf_copy = self._default_conf.copy()
479+
if conf is not None:
480+
conf_copy.update(conf)
481+
482+
self._use_deprecated_format = conf_copy.pop('use.deprecated.format')
483+
if not isinstance(self._use_deprecated_format, bool):
484+
raise ValueError("use.deprecated.format must be a boolean value")
485+
if not self._use_deprecated_format:
486+
warnings.warn("ProtobufDeserializer: the 'use.deprecated.format' "
487+
"configuration property, and the ability to use the "
488+
"old incorrect Protobuf serializer heading format "
489+
"introduced in confluent-kafka-python v1.4.0, "
490+
"will be removed in an upcoming release in 2022 Q2. "
491+
"Please migrate your Python Protobuf producers and "
492+
"consumers to 'use.deprecated.format':True as "
493+
"soon as possible")
383494

384-
def __init__(self, message_type):
385495
descriptor = message_type.DESCRIPTOR
386496
self._msg_index = _create_msg_index(descriptor)
387497
self._msg_class = MessageFactory().GetPrototype(descriptor)
388498

389499
@staticmethod
390-
def _decode_uvarint(buf):
500+
def _decode_varint(buf, zigzag=True):
391501
"""
392-
Decodes a single uvarint from a buffer.
502+
Decodes a single varint from a buffer.
393503
394504
Args:
395505
buf (BytesIO): buffer to read from
506+
zigzag (bool): decode as zigzag or uvarint
396507
397508
Returns:
398-
int: decoded uvarint
509+
int: decoded varint
399510
400511
Raises:
401512
EOFError: if buffer is empty
@@ -410,7 +521,12 @@ def _decode_uvarint(buf):
410521
value |= (i & 0x7f) << shift
411522
shift += 7
412523
if not (i & 0x80):
413-
return value
524+
break
525+
526+
if zigzag:
527+
value = (value >> 1) ^ -(value & 1)
528+
529+
return value
414530

415531
except EOFError:
416532
raise EOFError("Unexpected EOF while reading index")
@@ -432,7 +548,7 @@ def _read_byte(buf):
432548
return ord(i)
433549

434550
@staticmethod
435-
def _decode_index(buf):
551+
def _decode_index(buf, zigzag=True):
436552
"""
437553
Extracts message index from Schema Registry Protobuf formatted bytes.
438554
@@ -443,10 +559,17 @@ def _decode_index(buf):
443559
int: Protobuf Message index.
444560
445561
"""
446-
size = ProtobufDeserializer._decode_uvarint(buf)
447-
msg_index = [size]
562+
size = ProtobufDeserializer._decode_varint(buf, zigzag=zigzag)
563+
if size < 0 or size > 100000:
564+
raise DecodeError("Invalid Protobuf msgidx array length")
565+
566+
if size == 0:
567+
return [0]
568+
569+
msg_index = []
448570
for _ in range(size):
449-
msg_index.append(ProtobufDeserializer._decode_uvarint(buf))
571+
msg_index.append(ProtobufDeserializer._decode_varint(buf,
572+
zigzag=zigzag))
450573

451574
return msg_index
452575

@@ -486,7 +609,7 @@ def __call__(self, value, ctx):
486609

487610
# Protobuf Messages are self-describing; no need to query schema
488611
# Move the reader cursor past the index
489-
_ = ProtobufDeserializer._decode_index(payload)
612+
_ = self._decode_index(payload, zigzag=not self._use_deprecated_format)
490613
msg = self._msg_class()
491614
try:
492615
msg.ParseFromString(payload.read())

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy