from .constants import ImageAttributes as IA
from .constants import LzwFilterParameters as LZW
from .constants import StreamAttributes as SA
- from .errors import DependencyError, PdfReadError, PdfStreamError
+ from .errors import DependencyError, LimitReachedError, PdfReadError, PdfStreamError
from .generic import (
    ArrayObject,
    DictionaryObject,
    is_null_or_none,
)

+ ZLIB_MAX_OUTPUT_LENGTH = 75_000_000
+
+
+ def _decompress_with_limit(data: bytes) -> bytes:
+     decompressor = zlib.decompressobj()
+     result = decompressor.decompress(data, max_length=ZLIB_MAX_OUTPUT_LENGTH)
+     if decompressor.unconsumed_tail:
+         raise LimitReachedError(
+             f"Limit reached while decompressing. {len(decompressor.unconsumed_tail)} bytes remaining."
+         )
+     return result
+

def decompress(data: bytes) -> bytes:
    """
@@ -78,6 +90,12 @@ def decompress(data: bytes) -> bytes:
    If the decompression fails due to a zlib error, it falls back
    to using a decompression object with a larger window size.

+     Please note that the output length is limited to avoid memory
+     issues. If you need to process larger content streams, consider
+     adapting ``pypdf.filters.ZLIB_MAX_OUTPUT_LENGTH``. If you only
+     deal with trusted inputs or want to disable the limit entirely,
+     set the value to ``0``.
+
    Args:
        data: The input data to be decompressed.

@@ -86,38 +104,43 @@ def decompress(data: bytes) -> bytes:

    """
    try:
-         return zlib.decompress(data)
+         return _decompress_with_limit(data)
    except zlib.error:
-         try:
-             # For larger files, use decompression object to enable buffered reading
-             return zlib.decompressobj().decompress(data)
-         except zlib.error:
-             # First quick approach for known issue with faulty added bytes to the
-             # tail of the encoded stream from early Adobe Distiller or Pitstop versions
-             # with CR char as the default line separator (assumed by reverse engeneering)
-             # that breaks the decoding process in the end.
-             #
-             # Try first to cut off some of the tail byte by byte, however limited to not
-             # iterate through too many loops and kill the performance for large streams,
-             # to then allow the final fallback to run. Added this intermediate attempt,
-             # because starting from the head of the stream byte by byte kills completely
-             # the performace for large streams (e.g. 6 MB) with the tail-byte-issue
-             # and takes ages. This solution is really fast:
-             max_tail_cut_off_bytes: int = 8
-             for i in range(1, min(max_tail_cut_off_bytes + 1, len(data))):
-                 try:
-                     return zlib.decompressobj().decompress(data[:-i])
-                 except zlib.error:
-                     pass
-             # If still failing, then try with increased window size
-             d = zlib.decompressobj(zlib.MAX_WBITS | 32)
-             result_str = b""
-             for b in [data[i : i + 1] for i in range(len(data))]:
-                 try:
-                     result_str += d.decompress(b)
-                 except zlib.error:
-                     pass
-             return result_str
+         # First quick approach: There are known issues with faulty bytes added to
+         # the tail of the encoded stream by early Adobe Distiller or Pitstop
+         # versions with the CR char as the default line separator (assumed by
+         # reverse engineering) that break the decoding process at the very end.
+         #
+         # First, try to cut off some of the tail byte by byte, limited to a few
+         # iterations so as not to kill the performance for large streams, and
+         # then let the final fallback run. This intermediate attempt exists
+         # because starting from the head of the stream byte by byte completely
+         # kills the performance for large streams (e.g., 6 MB) with the
+         # tail-byte issue and takes ages. This solution is really fast:
+         max_tail_cut_off_bytes: int = 8
+         for i in range(1, min(max_tail_cut_off_bytes + 1, len(data))):
+             try:
+                 return _decompress_with_limit(data[:-i])
+             except zlib.error:
+                 pass
+
+         # If still failing, then try with an increased window size.
+         decompressor = zlib.decompressobj(zlib.MAX_WBITS | 32)
+         result_str = b""
+         remaining_limit = ZLIB_MAX_OUTPUT_LENGTH
+         data_single_bytes = [data[i : i + 1] for i in range(len(data))]
+         for index, b in enumerate(data_single_bytes):
+             try:
+                 decompressed = decompressor.decompress(b, max_length=max(remaining_limit, 0))
+                 result_str += decompressed
+                 remaining_limit -= len(decompressed)
+                 if ZLIB_MAX_OUTPUT_LENGTH and remaining_limit <= 0:
+                     raise LimitReachedError(
+                         f"Limit reached while decompressing. {len(data_single_bytes) - index} bytes remaining."
+                     )
+             except zlib.error:
+                 pass
+         return result_str


class FlateDecode:
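For context, the new helper builds on the standard library's `zlib.decompressobj` contract: `decompress(data, max_length=n)` emits at most `n` bytes of output and parks the compressed input it could not process in `unconsumed_tail`. A minimal, self-contained sketch of that behavior (the payload and limit here are made up for illustration):

```python
import zlib

# A payload that inflates to far more bytes than we want to allow.
payload = zlib.compress(b"A" * 10_000)

decompressor = zlib.decompressobj()
# Emit at most 1_000 bytes of output; max_length=0 would mean "no limit".
chunk = decompressor.decompress(payload, max_length=1_000)

assert len(chunk) <= 1_000
# Compressed input that could not be processed within the limit is kept here;
# a non-empty tail is exactly what _decompress_with_limit treats as "limit hit".
assert decompressor.unconsumed_tail
```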
@@ -732,7 +755,7 @@ def decode_stream_data(stream: Any) -> bytes:
    if not isinstance(decode_parms, (list, tuple)):
        decode_parms = (decode_parms,)
    data: bytes = stream._data
-     # If there is not data to decode we should not try to decode the data.
+     # If there is no data to decode, we should not try to decode it.
    if not data:
        return data
    for filter_name, params in zip(filters, decode_parms):
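As a usage sketch for downstream code: the docstring above documents `pypdf.filters.ZLIB_MAX_OUTPUT_LENGTH` as the tunable, and the `.errors` import suggests `LimitReachedError` lives in `pypdf.errors` (assumed here); the file name is a placeholder.

```python
import pypdf.filters
from pypdf import PdfReader
from pypdf.errors import LimitReachedError  # import path assumed from .errors above

# Raise the cap for large trusted documents; per the docstring, 0 disables it.
pypdf.filters.ZLIB_MAX_OUTPUT_LENGTH = 200_000_000

reader = PdfReader("example.pdf")  # placeholder file
try:
    text = reader.pages[0].extract_text()
except LimitReachedError as exc:
    print(f"Decompression limit reached: {exc}")
```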