Skip to content

Commit 93a14d5

Browse files
authored
Python: feature: support gpt-image-1 (#12621)
This pull request was created in response to the issue #12500 (comment). The current AzureTextToImage implementation only works with DALL-E 3. For gpt-image-1, the response format has changed from a URL to base64 only, which the current code does not support. Additionally, gpt-image-1 introduces a new image editing feature that also needs to be supported. Since breaking changes are required, new methods have been added. Minimal code that reproduces the problem ```python service = AzureTextToImage( service_id=service_id, deployment_name="image-1", endpoint=AZURE_OPENAI_IMAGE_ENDPOINT, api_key=AZURE_OPENAI_IMAGE_API_KEY, ) settings = service.get_prompt_execution_settings_class()(service_id="image1") settings.prompt = "sky" settings.size = ImageSize(width=1024, height=1024) settings.quality = "low" r = await service.generate_image(settings=settings) ``` Example of use with newly added methods ```python from semantic_kernel.connectors.ai.open_ai import AzureTextToImage service = AzureTextToImage( service_id="image1", deployment_name="gpt-image-1", endpoint=AZURE_OPENAI_IMAGE_ENDPOINT, api_key=AZURE_OPENAI_IMAGE_API_KEY, api_version="2025-04-01-preview", ) settings = service.get_prompt_execution_settings_class()(service_id="image1") settings.n = 3 images_b64 = await service.generate_images("A cute cat wearing a whimsical striped hat", settings=settings) ``` ```python from semantic_kernel.connectors.ai.open_ai import AzureTextToImage service = AzureTextToImage( service_id="image1", deployment_name="gpt-image-1", endpoint=AZURE_OPENAI_IMAGE_ENDPOINT, api_key=AZURE_OPENAI_IMAGE_API_KEY, api_version="2025-04-01-preview", ) file_paths = ["./new_images/img_1.png", "./new_images/img_2.png"] settings = service.get_prompt_execution_settings_class()(service_id="image1") settings.n = 2 results = await service.edit_image( prompt="Make the cat wear a wizard hat", image_paths=file_paths, settings=settings, ) ``` ## Problems Identified 1. Assumption of URL-based responses. 
The current implementation assumes a response format that includes an image url, which is not the case for gpt-image-1. See: https://github.com/microsoft/semantic-kernel/blob/8d1b3fd55155bb65e9671383366d2672c4582fb0/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_text_to_image_base.py#L69
1 parent c15c885 commit 93a14d5

File tree

5 files changed

+373
-11
lines changed

5 files changed

+373
-11
lines changed

python/semantic_kernel/connectors/ai/open_ai/prompt_execution_settings/open_ai_text_to_image_execution_settings.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# Copyright (c) Microsoft. All rights reserved.
22

33
import logging
4-
from typing import Any
4+
from typing import Any, Literal
55

66
from pydantic import Field, model_validator
77

@@ -40,6 +40,10 @@ class OpenAITextToImageExecutionSettings(PromptExecutionSettings):
4040
size: ImageSize | None = None
4141
quality: str | None = None
4242
style: str | None = None
43+
output_compression: int | None = None
44+
background: Literal["transparent", "opaque", "auto"] | None = None
45+
n: int | None = Field(default=1, ge=1, le=10)
46+
moderation: Literal["auto", "low"] | None = None
4347

4448
@model_validator(mode="before")
4549
@classmethod

python/semantic_kernel/connectors/ai/open_ai/services/open_ai_handler.py

Lines changed: 41 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from typing import Any, Union
66

77
from openai import AsyncOpenAI, AsyncStream, BadRequestError, _legacy_response
8+
from openai._types import NOT_GIVEN, FileTypes, NotGiven
89
from openai.lib._parsing._completions import type_to_response_format_param
910
from openai.types import Completion, CreateEmbeddingResponse
1011
from openai.types.audio import Transcription
@@ -122,12 +123,41 @@ async def _send_embedding_request(self, settings: OpenAIEmbeddingPromptExecution
122123
async def _send_text_to_image_request(self, settings: OpenAITextToImageExecutionSettings) -> ImagesResponse:
123124
"""Send a request to the OpenAI text to image endpoint."""
124125
try:
125-
return await self.client.images.generate(
126+
response: ImagesResponse = await self.client.images.generate(
126127
**settings.prepare_settings_dict(),
127128
)
129+
self.store_usage(response)
130+
return response
128131
except Exception as ex:
129132
raise ServiceResponseException(f"Failed to generate image: {ex}") from ex
130133

134+
async def _send_image_edit_request(
135+
self,
136+
image: list[FileTypes],
137+
settings: OpenAITextToImageExecutionSettings,
138+
mask: FileTypes | NotGiven = NOT_GIVEN,
139+
) -> ImagesResponse:
140+
"""Send a request to the OpenAI image edit endpoint.
141+
142+
Args:
143+
image: List of image files to edit. Accepts file paths or bytes.
144+
settings: Image edit execution settings.
145+
mask: Optional mask image. Accepts file path or bytes.
146+
147+
Returns:
148+
ImagesResponse: The response from the image edit API.
149+
"""
150+
try:
151+
response: ImagesResponse = await self.client.images.edit(
152+
image=image,
153+
mask=mask,
154+
**settings.prepare_settings_dict(),
155+
)
156+
self.store_usage(response)
157+
return response
158+
except Exception as ex:
159+
raise ServiceResponseException(f"Failed to edit image: {ex}") from ex
160+
131161
async def _send_audio_to_text_request(self, settings: OpenAIAudioToTextExecutionSettings) -> Transcription:
132162
"""Send a request to the OpenAI audio to text endpoint."""
133163
if not settings.filename:
@@ -187,12 +217,19 @@ def store_usage(
187217
| Completion
188218
| AsyncStream[ChatCompletionChunk]
189219
| AsyncStream[Completion]
190-
| CreateEmbeddingResponse,
220+
| CreateEmbeddingResponse
221+
| ImagesResponse,
191222
):
192223
"""Store the usage information from the response."""
193-
if not isinstance(response, AsyncStream) and response.usage:
224+
if isinstance(response, ImagesResponse) and hasattr(response, "usage") and response.usage:
225+
logger.info(f"OpenAI image usage: {response.usage}")
226+
self.prompt_tokens += response.usage.input_tokens
227+
self.total_tokens += response.usage.total_tokens
228+
self.completion_tokens += response.usage.output_tokens
229+
return
230+
if not isinstance(response, AsyncStream) and not isinstance(response, ImagesResponse) and response.usage:
194231
logger.info(f"OpenAI usage: {response.usage}")
195232
self.prompt_tokens += response.usage.prompt_tokens
196233
self.total_tokens += response.usage.total_tokens
197234
if hasattr(response.usage, "completion_tokens"):
198-
self.completion_tokens += response.usage.completion_tokens
235+
self.completion_tokens += response.usage.completion_tokens # type: ignore

python/semantic_kernel/connectors/ai/open_ai/services/open_ai_text_to_image_base.py

Lines changed: 175 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
# Copyright (c) Microsoft. All rights reserved.
22

3-
from typing import Any
3+
from pathlib import Path
4+
from typing import IO, Any
45
from warnings import warn
56

7+
from openai._types import NOT_GIVEN, FileTypes, NotGiven
68
from openai.types.images_response import ImagesResponse
79

810
from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_text_to_image_execution_settings import (
@@ -38,6 +40,7 @@ async def generate_image(
3840
Returns:
3941
bytes | str: Image bytes or image URL.
4042
"""
43+
warn("generate_image is deprecated. Use generate_images.", DeprecationWarning, stacklevel=2)
4144
if not settings:
4245
settings = OpenAITextToImageExecutionSettings(**kwargs)
4346
if not isinstance(settings, OpenAITextToImageExecutionSettings):
@@ -70,6 +73,177 @@ async def generate_image(
7073

7174
return response.data[0].url
7275

76+
async def generate_images(
77+
self,
78+
prompt: str,
79+
settings: PromptExecutionSettings | None = None,
80+
**kwargs: Any,
81+
) -> list[str]:
82+
"""Generate one or more images from text. Returns URLs or base64-encoded images.
83+
84+
Args:
85+
prompt: Description of the image(s) to generate.
86+
settings: Execution settings for the prompt.
87+
kwargs: Additional arguments, check the openai images.generate documentation for the supported arguments.
88+
89+
Returns:
90+
list[str]: Image URLs or base64-encoded images.
91+
92+
Example:
93+
Generate images and save them as PNG files:
94+
95+
```python
96+
from semantic_kernel.connectors.ai.open_ai import AzureTextToImage
97+
import base64, os
98+
99+
service = AzureTextToImage(
100+
service_id="image1",
101+
deployment_name="gpt-image-1",
102+
endpoint="https://your-endpoint.cognitiveservices.azure.com",
103+
api_key="your-api-key",
104+
api_version="2025-04-01-preview",
105+
)
106+
settings = service.get_prompt_execution_settings_class()(service_id="image1")
107+
settings.n = 3
108+
images_b64 = await service.generate_images("A cute cat wearing a whimsical striped hat", settings=settings)
109+
```
110+
"""
111+
if not settings:
112+
settings = OpenAITextToImageExecutionSettings(**kwargs)
113+
if not isinstance(settings, OpenAITextToImageExecutionSettings):
114+
settings = OpenAITextToImageExecutionSettings.from_prompt_execution_settings(settings)
115+
if prompt:
116+
settings.prompt = prompt
117+
118+
if not settings.prompt:
119+
raise ServiceInvalidRequestError("Prompt is required.")
120+
121+
if not settings.ai_model_id:
122+
settings.ai_model_id = self.ai_model_id
123+
124+
response = await self._send_request(settings)
125+
126+
assert isinstance(response, ImagesResponse) # nosec
127+
if not response.data or not isinstance(response.data, list) or len(response.data) == 0:
128+
raise ServiceResponseException("Failed to generate image.")
129+
130+
results: list[str] = []
131+
for image in response.data:
132+
url: str | None = getattr(image, "url", None)
133+
b64_json: str | None = getattr(image, "b64_json", None)
134+
if url:
135+
results.append(url)
136+
elif b64_json:
137+
results.append(b64_json)
138+
else:
139+
continue
140+
141+
if len(results) == 0:
142+
raise ServiceResponseException("No valid image data found in response.")
143+
return results
144+
145+
async def edit_image(
146+
self,
147+
prompt: str,
148+
image_paths: list[str] | None = None,
149+
image_files: list[IO[bytes]] | None = None,
150+
mask_path: str | None = None,
151+
mask_file: IO[bytes] | None = None,
152+
settings: PromptExecutionSettings | None = None,
153+
**kwargs: Any,
154+
) -> list[str]:
155+
"""Edit images using the OpenAI image edit API.
156+
157+
Args:
158+
prompt: Instructional prompt for image editing.
159+
image_paths: List of image file paths to edit.
160+
image_files: List of file-like objects (opened in binary mode) to edit.
161+
mask_path: Optional mask image file path.
162+
mask_file: Optional mask image file-like object (opened in binary mode).
163+
settings: Optional execution settings. If not provided, will be constructed from kwargs.
164+
kwargs: Additional API parameters.
165+
166+
Returns:
167+
list[str]: List of edited image URLs or base64-encoded strings.
168+
169+
Example:
170+
Edit images from file path and save results:
171+
172+
```python
173+
from semantic_kernel.connectors.ai.open_ai import AzureTextToImage
174+
import base64, os
175+
176+
service = AzureTextToImage(
177+
service_id="image1",
178+
deployment_name="gpt-image-1",
179+
endpoint="https://your-endpoint.cognitiveservices.azure.com",
180+
api_key="your-api-key",
181+
api_version="2025-04-01-preview",
182+
)
183+
file_paths = ["./new_images/img_1.png", "./new_images/img_2.png"]
184+
settings = service.get_prompt_execution_settings_class()(service_id="image1")
185+
settings.n = 2
186+
results = await service.edit_image(
187+
prompt="Make the cat wear a wizard hat",
188+
image_paths=file_paths,
189+
settings=settings,
190+
)
191+
```
192+
193+
Edit images from file object:
194+
195+
```python
196+
with open("./new_images/img_1.png", "rb") as f:
197+
results = await service.edit_image(
198+
prompt="Make the cat wear a wizard hat",
199+
image_files=[f],
200+
)
201+
```
202+
"""
203+
if not settings:
204+
settings = OpenAITextToImageExecutionSettings(**kwargs)
205+
if not isinstance(settings, OpenAITextToImageExecutionSettings):
206+
settings = OpenAITextToImageExecutionSettings.from_prompt_execution_settings(settings)
207+
settings.prompt = prompt
208+
209+
if not settings.prompt:
210+
raise ServiceInvalidRequestError("Prompt is required.")
211+
if (image_paths is None and image_files is None) or (image_paths is not None and image_files is not None):
212+
raise ServiceInvalidRequestError("Provide either 'image_paths' or 'image_files', and only one.")
213+
214+
images: list[FileTypes] = []
215+
if image_paths is not None:
216+
images = [Path(p) for p in image_paths]
217+
elif image_files is not None:
218+
images = list(image_files)
219+
220+
mask: FileTypes | NotGiven = NOT_GIVEN
221+
if mask_path is not None:
222+
mask = Path(mask_path)
223+
elif mask_file is not None:
224+
mask = mask_file
225+
226+
response: ImagesResponse = await self._send_image_edit_request(
227+
image=images,
228+
mask=mask,
229+
settings=settings,
230+
)
231+
232+
if not response or not response.data or not isinstance(response.data, list):
233+
raise ServiceResponseException("Failed to edit image.")
234+
235+
results: list[str] = []
236+
for img in response.data:
237+
b64_json: str | None = getattr(img, "b64_json", None)
238+
url: str | None = getattr(img, "url", None)
239+
if b64_json:
240+
results.append(b64_json)
241+
elif url:
242+
results.append(url)
243+
if not results:
244+
raise ServiceResponseException("No valid image data found in response.")
245+
return results
246+
73247
def get_prompt_execution_settings_class(self) -> type[PromptExecutionSettings]:
74248
"""Get the request settings class."""
75249
return OpenAITextToImageExecutionSettings

python/tests/unit/connectors/ai/open_ai/services/test_azure_text_to_image.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,15 +81,19 @@ def test_azure_text_to_image_init_with_from_dict(azure_openai_unit_test_env) ->
8181
@patch.object(AsyncImages, "generate", return_value=AsyncMock(spec=ImagesResponse))
8282
async def test_azure_text_to_image_calls_with_parameters(mock_generate, azure_openai_unit_test_env) -> None:
8383
mock_generate.return_value.data = [Image(url="abc")]
84+
mock_generate.return_value.usage = None
8485

8586
prompt = "A painting of a vase with flowers"
8687
width = 512
8788

88-
azure_text_to_image = AzureTextToImage()
89+
azure_text_to_image = AzureTextToImage(
90+
deployment_name=azure_openai_unit_test_env["AZURE_OPENAI_TEXT_TO_IMAGE_DEPLOYMENT_NAME"]
91+
)
8992
await azure_text_to_image.generate_image(prompt, width=width, height=width)
9093

9194
mock_generate.assert_awaited_once_with(
9295
prompt=prompt,
9396
model=azure_openai_unit_test_env["AZURE_OPENAI_TEXT_TO_IMAGE_DEPLOYMENT_NAME"],
9497
size=f"{width}x{width}",
98+
n=1,
9599
)

0 commit comments

Comments
 (0)