"""Embeddings Components Derived from NVEModel/Embeddings"""
import base64
from io import BytesIO
from typing import Any, List, Optional

import requests
from langchain_core.callbacks.manager import CallbackManagerForLLMRun
from langchain_core.language_models import LLM
from langchain_core.pydantic_v1 import Field
from langchain_core.runnables import Runnable, RunnableLambda
from PIL import Image

from langchain_nvidia_ai_endpoints._common import _NVIDIAClient

"""
## Image Generation Models

Due to the similarity of the underlying API, a selection of **Image Generation
Models** can be supported using the LLM interface. One example is the
[**Stable Diffusion XL**](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/ai-foundation/models/sdxl/api)
model, which expects a prompt and some other arguments as input and produces an
image (passed back as a b64-encoded string).

```
from langchain_nvidia_ai_endpoints import ImageGenNVIDIA

ImageGenNVIDIA.get_available_models()  ## Only shows models supported by ImageGenNVIDIA

sdxl = ImageGenNVIDIA(model="sdxl")

out = sdxl.invoke("A picture of a cat, aesthetic")
out[:100] + "..."
```

To see the image, we can either convert it manually or use the built-in
`ImageParser` conversion utility:

```
from langchain_nvidia_ai_endpoints.image_gen import ImageParser

# from io import BytesIO
# from PIL import Image
# import base64

# Image.open(BytesIO(base64.decodebytes(bytes(out[10:], "utf-8"))))
ImageParser().invoke(out)  ## Runnable that does it all for you
```

In addition to the prompt, we can do a bit of hyperparameter tweaking and add a
negative prompt describing elements we'd like to avoid. We can also chain in the
`ImageParser` runnable automatically by calling the `.as_pil()` method:

```
sdxl = ImageGenNVIDIA(
    guidance_scale=7.5,  ## example value; higher values follow the prompt more closely
    negative_prompt="ugly, bad eyes, low-res, crooked nose, smudged, painted",
)

sdxl.as_pil().invoke("A picture of a big green dog, futuristic cyberpunk")
```

Note that under the hood, `as_pil` returns a composition of the `ImageGenNVIDIA`
model with the `ImageParser` output parser. As a result, the combined runnable no
longer exposes the model's attributes directly; you can still reference the model
half of the pipeline via the sequence's `.first` attribute, as sketched below.
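
A minimal sketch (assuming the `sdxl` instance from above):

```
pipeline = sdxl.as_pil()  ## a RunnableSequence: ImageGenNVIDIA | ImageParser
pipeline.first            ## the ImageGenNVIDIA model half of the pipeline
pipeline.last             ## the ImageParser output parser half
```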

## Example of image generation with OpenAI and DALL-E

```
from langchain_nvidia_ai_endpoints import ImageGenNVIDIA
from getpass import getpass
import os

if not os.environ.get("OPENAI_API_KEY", "").startswith("sk-"):
    os.environ["OPENAI_API_KEY"] = getpass("Enter your OPENAI Key: ")

llm = ImageGenNVIDIA().mode("openai")
llm.available_models
```

Some of the NVIDIA-endpoint hyperparameters above are not accepted by DALL-E, so
we attach a `payload_fn` to the client to drop them from the request body before
it is sent:

```
from langchain_nvidia_ai_endpoints import ImageGenNVIDIA

dalle = ImageGenNVIDIA().mode("openai", model="dall-e-3")

def payload_fn(d):
    if d:
        drop_keys = ["guidance_scale", "seed", "negative_prompt", "sampler"]
        d = {k: v for k, v in d.items() if k not in drop_keys}
    return d

dalle.client.payload_fn = payload_fn

dalle.as_pil().invoke("City skyline, neon green tint, aesthetic futuristic realistic")
print(dalle.client.last_inputs['json'])
dalle.client.last_response.json()
```
"""


def _get_pil_from_response(data: str) -> Image.Image:
    ## The model output is prefixed with its payload format: either a remote
    ## URL or an inline base64-encoded image.
    if data.startswith("url: "):
        body = requests.get(data[5:], stream=True).raw
    elif data.startswith("b64_json: "):
        body = BytesIO(base64.decodebytes(bytes(data[10:], "utf-8")))
    else:
        raise ValueError(f"Invalid response format: {str(data)[:100]}")
    return Image.open(body)


def ImageParser() -> RunnableLambda[str, Image.Image]:
    """Runnable that converts an ImageGenNVIDIA output string into a PIL image."""
    return RunnableLambda(_get_pil_from_response)


class ImageGenNVIDIA(_NVIDIAClient, LLM):
    """NVIDIA's AI Foundation model for image generation."""

    _default_model: str = "sdxl"
    infer_endpoint: str = Field("{base_url}/images/generations")
    model: str = Field(_default_model, description="Name of the model to invoke")
    negative_prompt: Optional[str] = Field(
        description="Prompt elements the model should avoid generating"
    )
    sampler: Optional[str] = Field(
        description="Sampling strategy for the diffusion process"
    )
    guidance_scale: Optional[float] = Field(description="The scale of guidance")
    seed: Optional[int] = Field(description="The seed for deterministic results")

    @property
    def _llm_type(self) -> str:
        """Return type of NVIDIA AI Foundation Model Interface."""
        return "nvidia-image-gen-model"

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> str:
        """Run the Image Gen Model on the given prompt and input."""
        payload = {
            "prompt": prompt,
            "negative_prompt": kwargs.get("negative_prompt", self.negative_prompt),
            "sampler": kwargs.get("sampler", self.sampler),
            "guidance_scale": kwargs.get("guidance_scale", self.guidance_scale),
            "seed": kwargs.get("seed", self.seed),
        }
        if self.get_binding_model():
            payload["model"] = self.get_binding_model()
        response = self.client.get_req(
            model_name=self.model, payload=payload, endpoint="infer"
        )
        response.raise_for_status()
        out_dict = response.json()
        if "data" in out_dict:
            out_dict = out_dict.get("data")[0]
        if "url" in out_dict:
            output = "url: {}".format(out_dict.get("url"))
        elif "b64_json" in out_dict:
            output = "b64_json: {}".format(out_dict.get("b64_json"))
        else:
            output = str(out_dict)
        return output

    def as_pil(self, **kwargs: Any) -> Runnable:
        """Returns a model that outputs a PIL image by default."""
        return self | ImageParser(**kwargs)