Commit bb3a690

SEC: Limit decompressed size for FlateDecode filter (#3430)
Closes #3429.
1 parent: 979af6d
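For context, the attack this commit mitigates is a classic decompression bomb: a tiny FlateDecode payload that inflates to an enormous output. A standalone illustration using plain zlib, independent of pypdf:

import zlib

# Ten megabytes of zeros compress to roughly 10 KB -- a ratio near 1000:1.
# A crafted PDF stream can exploit this to exhaust memory on decode.
bomb = zlib.compress(b"\x00" * 10_000_000)
print(len(bomb))                   # ~10 KB of compressed input...
print(len(zlib.decompress(bomb)))  # ...inflates to 10,000,000 bytes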

6 files changed: +119 -42 lines changed

pypdf/_reader.py

Lines changed: 8 additions & 2 deletions
@@ -971,15 +971,21 @@ def _read_pdf15_xref_stream(
         if cast(str, xrefstream["/Type"]) != "/XRef":
             raise PdfReadError(f"Unexpected type {xrefstream['/Type']!r}")
         self.cache_indirect_object(generation, idnum, xrefstream)
-        stream_data = BytesIO(xrefstream.get_data())
+
         # Index pairs specify the subsections in the dictionary.
         # If none, create one subsection that spans everything.
-        idx_pairs = xrefstream.get("/Index", [0, xrefstream.get("/Size")])
+        if "/Size" not in xrefstream:
+            # According to table 17 of the PDF 2.0 specification, this key is required.
+            raise PdfReadError(f"Size missing from XRef stream {xrefstream!r}!")
+        idx_pairs = xrefstream.get("/Index", [0, xrefstream["/Size"]])
+
         entry_sizes = cast(dict[Any, Any], xrefstream.get("/W"))
         assert len(entry_sizes) >= 3
         if self.strict and len(entry_sizes) > 3:
             raise PdfReadError(f"Too many entry sizes: {entry_sizes}")

+        stream_data = BytesIO(xrefstream.get_data())
+
         def get_entry(i: int) -> Union[int, tuple[int, ...]]:
             # Reads the correct number of bytes for each entry. See the
             # discussion of the W parameter in PDF spec table 17.
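Why the explicit check matters: with the old dict-style default, a missing /Size slipped through as None, and by that point the xref stream had already been decompressed. A standalone sketch of the two behaviors (a plain dict standing in for pypdf's DictionaryObject):

xrefstream = {"/Type": "/XRef"}  # hypothetical xref stream dictionary lacking /Size

# Old behavior: no error yet, just a malformed subsection spec.
idx_pairs = xrefstream.get("/Index", [0, xrefstream.get("/Size")])
print(idx_pairs)  # [0, None]

# New behavior: fail fast -- and note the commit also moves get_data(),
# i.e. the actual decompression, to after this validation.
if "/Size" not in xrefstream:
    raise ValueError("Size missing from XRef stream!")  # pypdf raises PdfReadError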

pypdf/errors.py

Lines changed: 4 additions & 0 deletions
@@ -64,3 +64,7 @@ class EmptyImageDataError(PyPdfError):


 STREAM_TRUNCATED_PREMATURELY = "Stream has ended unexpectedly"
+
+
+class LimitReachedError(PyPdfError):
+    """Raised when a limit is reached."""

pypdf/filters.py

Lines changed: 56 additions & 33 deletions
@@ -59,7 +59,7 @@
 from .constants import ImageAttributes as IA
 from .constants import LzwFilterParameters as LZW
 from .constants import StreamAttributes as SA
-from .errors import DependencyError, PdfReadError, PdfStreamError
+from .errors import DependencyError, LimitReachedError, PdfReadError, PdfStreamError
 from .generic import (
     ArrayObject,
     DictionaryObject,
@@ -69,6 +69,18 @@
     is_null_or_none,
 )

+ZLIB_MAX_OUTPUT_LENGTH = 75_000_000
+
+
+def _decompress_with_limit(data: bytes) -> bytes:
+    decompressor = zlib.decompressobj()
+    result = decompressor.decompress(data, max_length=ZLIB_MAX_OUTPUT_LENGTH)
+    if decompressor.unconsumed_tail:
+        raise LimitReachedError(
+            f"Limit reached while decompressing. {len(decompressor.unconsumed_tail)} bytes remaining."
+        )
+    return result
+

 def decompress(data: bytes) -> bytes:
     """
@@ -78,6 +90,12 @@ def decompress(data: bytes) -> bytes:
     If the decompression fails due to a zlib error, it falls back
     to using a decompression object with a larger window size.

+    Please note that the output length is limited to avoid memory
+    issues. If you need to process larger content streams, consider
+    adapting ``pypdf.filters.ZLIB_MAX_OUTPUT_LENGTH``. In case you
+    are only dealing with trusted inputs and/or want to disable these
+    limits, set the value to `0`.
+
     Args:
         data: The input data to be decompressed.

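Following that docstring, raising or disabling the cap is a plain module attribute assignment; _decompress_with_limit reads the global at call time, which is also why the tests below can swap it out with mock.patch. The values here are arbitrary examples:

import pypdf.filters

pypdf.filters.ZLIB_MAX_OUTPUT_LENGTH = 500_000_000  # raise the cap to 500 MB
pypdf.filters.ZLIB_MAX_OUTPUT_LENGTH = 0            # or disable it (trusted inputs only)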
@@ -86,38 +104,43 @@ def decompress(data: bytes) -> bytes:

     """
     try:
-        return zlib.decompress(data)
+        return _decompress_with_limit(data)
     except zlib.error:
-        try:
-            # For larger files, use decompression object to enable buffered reading
-            return zlib.decompressobj().decompress(data)
-        except zlib.error:
-            # First quick approach for known issue with faulty added bytes to the
-            # tail of the encoded stream from early Adobe Distiller or Pitstop versions
-            # with CR char as the default line separator (assumed by reverse engeneering)
-            # that breaks the decoding process in the end.
-            #
-            # Try first to cut off some of the tail byte by byte, however limited to not
-            # iterate through too many loops and kill the performance for large streams,
-            # to then allow the final fallback to run. Added this intermediate attempt,
-            # because starting from the head of the stream byte by byte kills completely
-            # the performace for large streams (e.g. 6 MB) with the tail-byte-issue
-            # and takes ages. This solution is really fast:
-            max_tail_cut_off_bytes: int = 8
-            for i in range(1, min(max_tail_cut_off_bytes + 1, len(data))):
-                try:
-                    return zlib.decompressobj().decompress(data[:-i])
-                except zlib.error:
-                    pass
-            # If still failing, then try with increased window size
-            d = zlib.decompressobj(zlib.MAX_WBITS | 32)
-            result_str = b""
-            for b in [data[i : i + 1] for i in range(len(data))]:
-                try:
-                    result_str += d.decompress(b)
-                except zlib.error:
-                    pass
-            return result_str
+        # First quick approach: There are known issues with faulty added bytes to the
+        # tail of the encoded stream from early Adobe Distiller or Pitstop versions
+        # with CR char as the default line separator (assumed by reverse engineering)
+        # that breaks the decoding process in the end.
+        #
+        # Try first to cut off some of the tail byte by byte, but limited to not
+        # iterate through too many loops and kill the performance for large streams,
+        # to then allow the final fallback to run. Added this intermediate attempt,
+        # because starting from the head of the stream byte by byte kills completely
+        # the performance for large streams (e.g., 6 MB) with the tail-byte-issue
+        # and takes ages. This solution is really fast:
+        max_tail_cut_off_bytes: int = 8
+        for i in range(1, min(max_tail_cut_off_bytes + 1, len(data))):
+            try:
+                return _decompress_with_limit(data[:-i])
+            except zlib.error:
+                pass
+
+        # If still failing, then try with increased window size.
+        decompressor = zlib.decompressobj(zlib.MAX_WBITS | 32)
+        result_str = b""
+        remaining_limit = ZLIB_MAX_OUTPUT_LENGTH
+        data_single_bytes = [data[i : i + 1] for i in range(len(data))]
+        for index, b in enumerate(data_single_bytes):
+            try:
+                decompressed = decompressor.decompress(b, max_length=remaining_limit)
+                result_str += decompressed
+                remaining_limit -= len(decompressed)
+                if remaining_limit <= 0:
+                    raise LimitReachedError(
+                        f"Limit reached while decompressing. {len(data_single_bytes) - index} bytes remaining."
+                    )
+            except zlib.error:
+                pass
+        return result_str


 class FlateDecode:
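The byte-wise fallback keeps the same overall budget, shrinking it by whatever each single-byte feed produces. A standalone illustration of that bookkeeping pattern, outside pypdf:

import zlib

payload = zlib.compress(b"abc" * 1_000)
d = zlib.decompressobj(zlib.MAX_WBITS | 32)  # auto-detect zlib/gzip headers
out, budget = b"", 75_000_000

# Feed one byte at a time, charging each chunk of output against the budget.
for i in range(len(payload)):
    chunk = d.decompress(payload[i : i + 1], max_length=budget)
    out += chunk
    budget -= len(chunk)

print(len(out))  # 3000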
@@ -732,7 +755,7 @@ def decode_stream_data(stream: Any) -> bytes:
     if not isinstance(decode_parms, (list, tuple)):
         decode_parms = (decode_parms,)
     data: bytes = stream._data
-    # If there is not data to decode we should not try to decode the data.
+    # If there is no data to decode, we should not try to decode it.
     if not data:
         return data
     for filter_name, params in zip(filters, decode_parms):

tests/example_files.yaml

Lines changed: 2 additions & 0 deletions
@@ -118,3 +118,5 @@
   url: https://github.com/user-attachments/files/20923310/large_lzw_example_encoded.dat.txt
 - local_filename: issue-3419.pdf
   url: https://github.com/user-attachments/files/21578875/layout-parser-paper-with-empty-pages.pdf
+- local_filename: issue-3429.pdf
+  url: https://github.com/user-attachments/files/21711469/bomb.pdf

tests/test_filters.py

Lines changed: 34 additions & 7 deletions
@@ -3,6 +3,7 @@
 import shutil
 import string
 import subprocess
+import zlib
 from io import BytesIO
 from itertools import product as cartesian_product
 from pathlib import Path
@@ -13,7 +14,7 @@
 from PIL import Image, ImageOps

 from pypdf import PdfReader
-from pypdf.errors import DependencyError, DeprecationError, PdfReadError, PdfStreamError
+from pypdf.errors import DependencyError, DeprecationError, LimitReachedError, PdfReadError, PdfStreamError
 from pypdf.filters import (
     ASCII85Decode,
     ASCIIHexDecode,
@@ -23,6 +24,7 @@
     FlateDecode,
     JBIG2Decode,
     RunLengthDecode,
+    decompress,
 )
 from pypdf.generic import (
     ArrayObject,
@@ -675,12 +677,15 @@ def test_ccitt_fax_decode__black_is_1():
 def test_flate_decode__image_is_none_due_to_size_limit(caplog):
     url = "https://github.com/user-attachments/files/19464256/file.pdf"
     name = "issue3220.pdf"
-    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
-    images = reader.pages[0].images
-    assert len(images) == 1
-    image = images[0]
-    assert image.name == "Im0.png"
-    assert image.image is None
+
+    with mock.patch("pypdf.filters.ZLIB_MAX_OUTPUT_LENGTH", 0):
+        reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
+        images = reader.pages[0].images
+        assert len(images) == 1
+        image = images[0]
+        assert image.name == "Im0.png"
+        assert image.image is None
+
     assert (
         "Failed loading image: Image size (180000000 pixels) exceeds limit of "
         "178956970 pixels, could be decompression bomb DOS attack."
@@ -845,3 +850,25 @@ def test_rle_decode_exception_with_corrupted_stream():
     )
     with pytest.raises(PdfStreamError, match="Early EOD in RunLengthDecode"):
         RunLengthDecode.decode(data)
+
+
+def test_decompress():
+    data = string.printable.encode("utf-8") + string.printable[::-1].encode("utf-8")
+    compressed = FlateDecode.encode(data)
+
+    # Decompress regularly.
+    decompressed = decompress(compressed)
+    assert decompressed == data
+
+    # Decompress byte-wise.
+    with mock.patch("pypdf.filters._decompress_with_limit", side_effect=zlib.error):
+        decompressed = decompress(compressed)
+    assert decompressed == data
+
+    # Decompress byte-wise with very low output limit.
+    with mock.patch("pypdf.filters._decompress_with_limit", side_effect=zlib.error), \
+            mock.patch("pypdf.filters.ZLIB_MAX_OUTPUT_LENGTH", len(compressed) - 13), \
+            pytest.raises(
+                LimitReachedError, match=r"^Limit reached while decompressing\. 12 bytes remaining\."
+            ):
+        decompress(compressed)

tests/test_reader.py

Lines changed: 15 additions & 0 deletions
@@ -1842,3 +1842,18 @@ def test_trailer_cannot_be_read():
     with pytest.raises(PdfReadError, match=r"^Trailer cannot be read: Unexpected type '/Invalid'$"):
         reader = PdfReader(BytesIO(data))
     list(reader.pages)
+
+
+@pytest.mark.enable_socket
+def test_read_pdf15_xref_stream():
+    data = get_data_from_url(name="issue-3429.pdf")
+
+    with pytest.raises(PdfReadError, match=r"^Trailer cannot be read: Size missing from XRef stream {"):
+        PdfReader(BytesIO(data))
+
+    data_modified = data.replace(b"/XRef/", b"/XRef/Size/2/")
+    with pytest.raises(
+        PdfReadError,
+        match=r"^Trailer cannot be read: Limit reached while decompressing\. 1545392 bytes remaining\.$"
+    ):
+        PdfReader(BytesIO(data_modified))
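Taken together, the reader-facing effect is that both failure modes of the crafted file surface as fast, bounded errors from PdfReader rather than runaway memory use. A hedged end-to-end sketch (hypothetical file name):

from io import BytesIO
from pypdf import PdfReader
from pypdf.errors import PdfReadError

with open("bomb.pdf", "rb") as fh:  # hypothetical crafted input
    data = fh.read()

try:
    PdfReader(BytesIO(data))
except PdfReadError as exc:
    # e.g. "Trailer cannot be read: Size missing from XRef stream {...}"
    # or   "Trailer cannot be read: Limit reached while decompressing. ..."
    print(exc)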
