Extension to PR #26605 ldm inpainting sample (#26904)

This PR adds and fixes the following points in the ldm_inpainting sample, on top of the original PR #26605 by @Abdurrahheem:

DONE:

1. Added functionality to load models from a YAML configuration file, with automatic downloading when the models are not found locally (a sketch of the lookup flow follows this list).
2. Updated the script usage instructions to reflect the correct command format.
3. Improved user interaction by adding instructions to the image window for the inpainting controls.
4. Introduced a new models.yml configuration section for downloading the inpainting model weights, including the model SHA1 checksums.
5. Fixed the input types and names used in the ONNX graph generation.
6. Added links to the ONNX graphs in models.yml.
7. Added support for findModel and standardized the sample usage to match other dnn samples.
8. Fixed an issue in download_models.py with downloading models from dl.opencv.org.
9. Fixed an issue in common.py that printed duplicated positional arguments for samples using multiple models.
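
A minimal sketch of the lookup flow from point 1, assuming findModel-style behavior; the helper below is illustrative, not the actual common.py code:

```python
import hashlib
import os

def find_model(filename, sha1,
               cache_dir=os.environ.get("OPENCV_DOWNLOAD_CACHE_DIR", ".")):
    # Hypothetical helper, for illustration only: the real lookup lives in
    # samples/dnn/common.py (findModel/findFile).
    path = os.path.join(cache_dir, filename)
    if not os.path.isfile(path):
        raise FileNotFoundError(
            filename + ": run `python download_models.py ldm_inpainting` first")
    with open(path, "rb") as f:  # verify against the models.yml checksum
        if hashlib.sha1(f.read()).hexdigest() != sha1:
            raise ValueError(filename + ": SHA1 mismatch, re-download the model")
    return path

# e.g. with the encoder entry from models.yml:
# find_model("InpaintEncoder.onnx", "eb663262304473d81d6ae627d7117892dac56b5e")
```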

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake

---------

Co-authored-by: Abdurrahheem <abduragim.shtanchaev@xperience.ai>
Gursimar Singh 2025-02-11 20:28:39 +05:30 committed by GitHub
parent 48a7d8efbd
commit b605fc13d8
4 changed files with 211 additions and 84 deletions

samples/dnn/common.py

@@ -62,16 +62,17 @@ def add_argument(zoo, parser, name, help, required=False, default=None, type=Non
def add_preproc_args(zoo, parser, sample, alias=None, prefix=""):
aliases = []
if os.path.isfile(zoo):
if os.path.isfile(zoo) and prefix == "":
fs = cv.FileStorage(zoo, cv.FILE_STORAGE_READ)
root = fs.root()
for name in root.keys():
model = root.getNode(name)
if model.getNode('sample').string() == sample:
aliases.append(name)
if len(aliases):
parser.add_argument(prefix+'alias', nargs='?', choices=aliases,
help='An alias name of model to extract preprocessing parameters from models.yml file.')
parser.add_argument(prefix+'alias', nargs='?', choices=aliases,
help='An alias name of model to extract preprocessing parameters from models.yml file.')
add_argument(zoo, parser, prefix+'model',
help='Path to a binary file of model that contains trained weights. '
'It could be a file with extensions .caffemodel (Caffe), '
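
The guard above fixes point 9: multi-model samples call add_preproc_args once per model prefix, and before the patch every call re-registered the positional alias, so argparse printed it several times in the usage string. A standalone sketch of the fixed calling pattern (plain argparse, not the real common.py helpers):

```python
import argparse

parser = argparse.ArgumentParser()
for prefix in ("", "encoder_", "decoder_", "diffusor_"):
    if prefix == "":  # the patched guard: register the positional exactly once
        parser.add_argument("alias", nargs="?", choices=["ldm_inpainting"],
                            help="An alias name of model from models.yml.")
    parser.add_argument("--" + prefix + "model",
                        help="Path to the " + prefix + "weights file.")

args = parser.parse_args(["ldm_inpainting", "--encoder_model=InpaintEncoder.onnx"])
print(args.alias, args.encoder_model)  # -> ldm_inpainting InpaintEncoder.onnx
```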

samples/dnn/download_models.py

@@ -14,7 +14,7 @@ import requests
import shutil
from pathlib import Path
from datetime import datetime
from urllib.request import urlopen
from urllib.request import Request, urlopen
import xml.etree.ElementTree as ET
__all__ = ["downloadFile"]
@@ -188,9 +188,11 @@ class URLLoader(Loader):
self.url = url
def download(self, filepath):
r = urlopen(self.url, timeout=60)
self.printRequest(r)
self.save(filepath, r)
headers = {'User-Agent': 'Wget/1.20.3'}
req = Request(self.url, headers=headers)
with urlopen(req, timeout=60) as r:
self.printRequest(r)
self.save(filepath, r)
return os.path.getsize(filepath)
def printRequest(self, r):
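
This change fixes point 8: dl.opencv.org appears to reject requests carrying urllib's default User-Agent, so the loader now sends a Wget-style header and closes the connection via a context manager. A standalone sketch of the same idea, using only the standard library:

```python
from urllib.request import Request, urlopen

url = "https://dl.opencv.org/models/ldm_inpainting/InpaintEncoder.onnx"
req = Request(url, headers={"User-Agent": "Wget/1.20.3"})
with urlopen(req, timeout=60) as r, open("InpaintEncoder.onnx", "wb") as f:
    while True:
        chunk = r.read(1 << 20)  # stream in 1 MiB chunks instead of reading all at once
        if not chunk:
            break
        f.write(chunk)
```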

samples/dnn/ldm_inpainting.py

@@ -4,8 +4,10 @@ import argparse
from tqdm import tqdm
from functools import partial
from copy import deepcopy
import os
from common import *
## let use write description of the script and general information how to use it
## General information on how to use the sample
'''
This sample demonstrates experimental image inpainting using a Latent Diffusion Model (LDM).
@@ -13,14 +15,12 @@ Most of the script is based on the code from the official repository of the LDM
Current limitations of the script:
- Slow diffusion sampling
- Not exact reproduction of the results from the original repository (due to issues related deviation in covolution operation.
- Not exact reproduction of the results from the original repository (due to issues related to deviation in the convolution operation.
See issue for more details: https://github.com/opencv/opencv/pull/25973)
Steps for running the script:
The LDM inpainting model was converted to an ONNX graph using the following steps:
1. First, generate the ONNX graph of the Latent Diffusion Model.
Generate the using this [repo](https://github.com/Abdurrahheem/latent-diffusion/tree/ash/export2onnx) and follow instructions below
Generate the ONNX model using this [repo](https://github.com/Abdurrahheem/latent-diffusion/tree/ash/export2onnx) and follow the instructions below:
- git clone https://github.com/Abdurrahheem/latent-diffusion.git
- cd latent-diffusion
@@ -29,58 +29,108 @@ Steps for running the script:
- wget -O models/ldm/inpainting_big/last.ckpt https://heibox.uni-heidelberg.de/f/4d9ac7ea40c64582b7c9/?dl=1
- python -m scripts.inpaint.py --indir data/inpainting_examples/ --outdir outputs/inpainting_results --export=True
2. Build opencv (preferebly with CUDA support enabled
2. Build OpenCV
3. Run the script
- cd opencv/samples/dnn
- python ldm_inpainting.py -e=<path-to-InpaintEncoder.onnx file> -d=<path-to-InpaintDecoder.onnx file> -df=<path-to-LatenDiffusion.onnx file> -i=<path-to-image>
- Download models using `python download_models.py ldm_inpainting`
- python ldm_inpainting.py
- For more options, use python ldm_inpainting.py -h
Right after the last command you will be promted with image. You can click on left mouse botton and starting selection a region you would like to be inpainted (delited).
Once you finish marking the region, click on left mouse botton again and press esc botton on your keyboard. The inpainting proccess will start.
After running the code you will be prompted with an image. Click the left mouse button and start selecting the region you would like to be inpainted (deleted).
Once you finish marking the region, press the space bar on your keyboard to start the inpainting process (press Esc to quit).
Note: If you are running it on CPU it might take a long time.
Also make sure to have abount 15GB of RAM to make proccess faster (other wise swapping will ckick in and everything will be slower)
Also make sure to have about 15 GB of free RAM to keep the process fast (otherwise swapping will kick in and everything will be slower)
'''
def get_args_parser():
backends = ("default", "openvino", "opencv", "vkcom", "cuda")
targets = ("cpu", "opencl", "opencl_fp16", "ncs2_vpu", "hddl_vpu", "vulkan", "cuda", "cuda_fp16")
backends = (cv.dnn.DNN_BACKEND_DEFAULT, cv.dnn.DNN_BACKEND_OPENCV, cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_BACKEND_CUDA)
targets = (cv.dnn.DNN_TARGET_CPU, cv.dnn.DNN_TARGET_OPENCL, cv.dnn.DNN_TARGET_MYRIAD, cv.dnn.DNN_TARGET_HDDL, cv.dnn.DNN_TARGET_CUDA)
parser = argparse.ArgumentParser(add_help=False)
parser.add_argument('--zoo', default=os.path.join(os.path.dirname(os.path.abspath(__file__)), 'models.yml'),
help='An optional path to file with preprocessing parameters.')
parser.add_argument('--input', '-i', default="rubberwhale1.png", help='Path to image file.', required=False)
parser.add_argument('--samples', '-s', type=int, help='Number of times to sample the model.', default=50)
parser.add_argument('--mask', '-m', type=str, help='Path to mask image. If not provided, interactive mask creation will be used.', default=None)
parser = argparse.ArgumentParser(description='Use this script to run inpainting using Latent Diffusion Model',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--encoder', '-e', type=str, help='Path to encoder network.', default=None)
parser.add_argument('--decoder', '-d', type=str, help='Path to decoder network.', default=None)
parser.add_argument('--diffusor', '-df', type=str, help='Path to diffusion network.', default=None)
parser.add_argument('--image', '-i', type=str, help='Path to input image.', default=None)
parser.add_argument('--samples', '-s', type=int, help='Number of times to sample the model.', default=50)
parser.add_argument('--backend', choices=backends, default=cv.dnn.DNN_BACKEND_DEFAULT, type=int,
help="Choose one of computation backends: "
"%d: automatically (by default), "
"%d: OpenCV implementation, "
"%d: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
"%d: CUDA, " % backends)
parser.add_argument('--target', choices=targets, default=cv.dnn.DNN_TARGET_CPU, type=int,
help='Choose one of target computation devices: '
'%d: CPU target (by default), '
'%d: OpenCL, '
'%d: NCS2 VPU, '
'%d: HDDL VPU, '
'%d: CUDA ' % targets)
parser.add_argument('--backend', default="default", type=str, choices=backends,
help="Choose one of computation backends: "
"default: automatically (by default), "
"openvino: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
"opencv: OpenCV implementation, "
"vkcom: VKCOM, "
"cuda: CUDA, "
"webnn: WebNN")
parser.add_argument('--target', default="cpu", type=str, choices=targets,
help="Choose one of target computation devices: "
"cpu: CPU target (by default), "
"opencl: OpenCL, "
"opencl_fp16: OpenCL fp16 (half-float precision), "
"ncs2_vpu: NCS2 VPU, "
"hddl_vpu: HDDL VPU, "
"vulkan: Vulkan, "
"cuda: CUDA, "
"cuda_fp16: CUDA fp16 (half-float preprocess)")
args, _ = parser.parse_known_args()
add_preproc_args(args.zoo, parser, 'ldm_inpainting', prefix="", alias="ldm_inpainting")
add_preproc_args(args.zoo, parser, 'ldm_inpainting', prefix="encoder_", alias="ldm_inpainting")
add_preproc_args(args.zoo, parser, 'ldm_inpainting', prefix="decoder_", alias="ldm_inpainting")
add_preproc_args(args.zoo, parser, 'ldm_inpainting', prefix="diffusor_", alias="ldm_inpainting")
parser = argparse.ArgumentParser(parents=[parser],
description='Diffusion based image inpainting using OpenCV.',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
return parser.parse_args()
def make_batch(image, mask):
image = image.astype(np.float32)/255.0
image = image[np.newaxis, ...].transpose(0,3,1,2)
stdSize = 0.7
stdWeight = 2
stdImgSize = 512
imgWidth = None
fontSize = 1.5
fontThickness = 1
mask = mask.astype(np.float32)/255.0
mask = mask[np.newaxis, np.newaxis, ...]
mask[mask < 0.5] = 0
mask[mask >= 0.5] = 1
def keyboard_shortcuts():
print('''
Keyboard Shortcuts:
Press 'i' to increase brush size.
Press 'd' to decrease brush size.
Press 'r' to reset mask.
Press ' ' (space bar) after selecting area to be inpainted.
Press ESC to terminate the program.
'''
)
masked_image = (1-mask)*image
def help():
print(
'''
Use this script for image inpainting using OpenCV.
First, download the required models, i.e. ldm_inpainting, using `python download_models.py ldm_inpainting` (if not already done). Set the environment variable OPENCV_DOWNLOAD_CACHE_DIR to specify where the models should be downloaded. Also, point OPENCV_SAMPLES_DATA_PATH to opencv/samples/data.
To run:
Example: python ldm_inpainting.py
'''
)
def make_batch_blob(image, mask):
blob_image = cv.dnn.blobFromImage(image, scalefactor=args.scale, size=(args.width, args.height), mean=args.mean, swapRB=args.rgb, crop=False)
blob_mask = cv.dnn.blobFromImage(mask, scalefactor=args.scale, size=(args.width, args.height), mean=args.mean, swapRB=False, crop=False)
blob_mask = (blob_mask >= 0.5).astype(np.float32)
masked_image = (1 - blob_mask) * blob_image
batch = {
"image": blob_image,
"mask": blob_mask,
"masked_image": masked_image
}
batch = {"image": image, "mask": mask, "masked_image": masked_image}
for k in batch:
batch[k] = batch[k]*2.0-1.0
batch[k] = batch[k]*2.0 - 1.0
return batch
def noise_like(shape, repeat=False):
@@ -300,11 +350,20 @@ class DDIMInpainter(object):
self.conditioning_key = conditioning_key
self.register_schedule(linear_start=linear_start, linear_end=linear_end)
self.encoder = cv.dnn.readNet(args.encoder)
self.decoder = cv.dnn.readNet(args.decoder)
self.diffusor = cv.dnn.readNet(args.diffusor)
# Initialize models using provided paths or download if necessary
encoder_path = findModel(args.encoder_model, args.encoder_sha1)
decoder_path = findModel(args.decoder_model, args.decoder_sha1)
diffusor_path = findModel(args.diffusor_model, args.diffusor_sha1)
engine = cv.dnn.ENGINE_AUTO
if args.backend != "default" or args.target != "cpu":
engine = cv.dnn.ENGINE_CLASSIC
self.encoder = cv.dnn.readNet(encoder_path, "", "", engine)
self.diffusor = cv.dnn.readNet(diffusor_path, "", "", engine)
self.decoder = cv.dnn.readNet(decoder_path, "", "", engine)
self.sampler = DDIMSampler(self, ddpm_num_timesteps=self.num_timesteps)
self.set_backend(backend=args.backend, target=args.target)
self.set_backend(backend=get_backend_id(args.backend), target=get_target_id(args.target))
def set_backend(self, backend=cv.dnn.DNN_BACKEND_DEFAULT, target=cv.dnn.DNN_TARGET_CPU):
self.encoder.setPreferableBackend(backend)
@@ -317,15 +376,15 @@ class DDIMInpainter(object):
self.diffusor.setPreferableTarget(target)
def apply_diffusor(self, x, timestep, cond):
x = np.concatenate([x, cond], axis=1)
x = cv.Mat(x.astype(np.float32))
timestep = cv.Mat(timestep.astype(np.int64))
names = ["xc", "t"]
names = ["xc, t", "timesteps"]
self.diffusor.setInputsNames(names)
self.diffusor.setInput(x, names[0])
self.diffusor.setInput(timestep, names[1])
output = self.diffusor.forward()
return output
def register_buffer(self, name, attr):
@@ -372,7 +431,6 @@ class DDIMInpainter(object):
betas * np.sqrt(alphas_cumprod_prev) / (1. - alphas_cumprod)))
self.register_buffer('posterior_mean_coef2', to_numpy(
(1. - alphas_cumprod_prev) * np.sqrt(alphas) / (1. - alphas_cumprod)))
if self.parameterization == "eps":
lvlb_weights = self.betas ** 2 / (
2 * self.posterior_variance * to_numpy(alphas) * (1 - self.alphas_cumprod))
@@ -386,7 +444,6 @@ class DDIMInpainter(object):
assert not np.isnan(self.lvlb_weights).all()
def apply_model(self, x_noisy, t, cond, return_ids=False):
if isinstance(cond, dict):
# hybrid case, cond is expected to be a dict
pass
@@ -397,12 +454,15 @@ class DDIMInpainter(object):
cond = {key: cond}
x_recon = self.apply_diffusor(x_noisy, t, cond['c_concat'])
if isinstance(x_recon, tuple) and not return_ids:
return x_recon[0]
else:
return x_recon
def inpaint(self, image : np.ndarray, mask : np.ndarray, S : int = 50) -> np.ndarray:
inpainted = self(image, mask, S)
return np.squeeze(inpainted)
def __call__(self, image : np.ndarray, mask : np.ndarray, S : int = 50) -> np.ndarray:
# Encode the image and mask
@@ -413,7 +473,6 @@ class DDIMInpainter(object):
c = np.concatenate([c, cc], axis=1)
shape = (c.shape[1] - 1,) + c.shape[2:]
# Sample from the model
samples_ddim, _ = self.sampler.sample(
S=S,
@@ -437,53 +496,98 @@ class DDIMInpainter(object):
return inpainted
def create_mask(img, radius=20):
def create_mask(img):
drawing = False # True if the mouse is pressed
counter = 0
brush_size = 20
# Mouse callback function
def draw_circle(event, x, y, flags, param):
nonlocal drawing, counter, radius
nonlocal drawing, brush_size
if event == cv.EVENT_LBUTTONDOWN:
drawing = True if counter % 2 == 0 else False
counter += 1
cv.circle(img, (x, y), radius, (255, 255, 255), -1)
cv.circle(mask, (x, y), radius, 255, -1)
drawing = True
elif event == cv.EVENT_MOUSEMOVE:
if drawing:
cv.circle(img, (x, y), radius, (255, 255, 255), -1)
cv.circle(mask, (x, y), radius, 255, -1)
cv.circle(mask, (x, y), brush_size, (255), thickness=-1)
elif event == cv.EVENT_LBUTTONUP:
drawing = False
# Create window with instructions
window_name = 'Draw Mask'
cv.namedWindow(window_name)
cv.setMouseCallback(window_name, draw_circle)
label = "Press 'i' to increase, 'd' to decrease brush size. And 'r' to reset mask. "
labelSize, _ = cv.getTextSize(label, cv.FONT_HERSHEY_SIMPLEX, fontSize, fontThickness)
alpha = 0.5
temp_image = img.copy()
overlay = img.copy()
cv.rectangle(overlay, (0, 0), (labelSize[0]+10, labelSize[1]+int(30*fontSize)), (255, 255, 255), cv.FILLED)
cv.addWeighted(overlay, alpha, temp_image, 1 - alpha, 0, temp_image)
cv.putText(temp_image, "Draw the mask on the image. Press space bar when done.", (10, int(25*fontSize)), cv.FONT_HERSHEY_SIMPLEX, fontSize, (0, 0, 0), fontThickness)
cv.putText(temp_image, label, (10, int(50*fontSize)), cv.FONT_HERSHEY_SIMPLEX, fontSize, (0, 0, 0), fontThickness)
mask = np.zeros((img.shape[0], img.shape[1]), np.uint8)
cv.namedWindow('image')
cv.setMouseCallback('image', draw_circle)
display_img = temp_image.copy()
while True:
cv.imshow('image', img)
if cv.waitKey(1) & 0xFF == 27: # Press 'ESC' to exit
display_img[mask > 0] = [255, 255, 255]
cv.imshow(window_name, display_img)
# Create a copy of the image to show instructions
key = cv.waitKey(30) & 0xFF
if key == ord('i'): # Increase brush size
brush_size += 1
print(f"Brush size increased to {brush_size}")
elif key == ord('d'): # Decrease brush size
brush_size = max(1, brush_size - 1)
print(f"Brush size decreased to {brush_size}")
elif key == ord('r'): # clear the mask
mask = np.zeros((img.shape[0], img.shape[1]), dtype=np.uint8)
display_img = temp_image.copy()
print(f"Mask cleared")
elif key == ord(' '): # Press space bar to finish drawing
break
elif key == 27:
exit()
cv.destroyAllWindows()
return mask
def prepare_input(args, image):
if args.mask:
mask = cv.imread(args.mask, cv.IMREAD_GRAYSCALE)
if mask is None:
raise ValueError(f"Could not read mask file: {args.mask}")
if mask.shape[:2] != image.shape[:2]:
mask = cv.resize(mask, (image.shape[1], image.shape[0]), interpolation=cv.INTER_NEAREST)
else:
mask = create_mask(deepcopy(image))
batch = make_batch_blob(image, mask)
return batch
def main(args):
global imgWidth, fontSize, fontThickness
keyboard_shortcuts()
image = cv.imread(args.image)
mask = create_mask(deepcopy(image))
image = cv.cvtColor(image, cv.COLOR_BGR2RGB)
image = cv.imread(findFile(args.input))
imgWidth = min(image.shape[:2])
fontSize = min(1.5, (stdSize*imgWidth)/stdImgSize)
fontThickness = max(1,(stdWeight*imgWidth)//stdImgSize)
aspect_ratio = image.shape[0]/image.shape[1]
height = int(args.width*aspect_ratio)
batch = make_batch(image, mask)
image, mask, masked_image = batch["image"], batch["mask"], batch["masked_image"]
batch = prepare_input(args, image)
model = DDIMInpainter(args)
result = model(masked_image, mask, S=args.samples)
result = np.squeeze(result)
# save the result in the directore of args.image
cv.imwrite(args.image.replace(".png", "_inpainted.png"), result[..., ::-1])
result = model.inpaint(batch["masked_image"], batch["mask"], S=args.samples)
result = result.astype(np.uint8)
result = cv.resize(result, (args.width, height))
result = cv.cvtColor(result, cv.COLOR_RGB2BGR)
cv.imshow("Inpainted Image", result)
cv.waitKey(0)
cv.destroyAllWindows()
if __name__ == '__main__':
args = parser.parse_args()
args = get_args_parser()
main(args)
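
Point 5 (fixed input types and names) is visible in apply_diffusor above: the re-exported diffusion graph takes a float32 "xc" tensor (noisy latents concatenated with the conditioning) and an int64 "timesteps" tensor. A hedged standalone sketch of that two-input forward pass; the tensor shapes are assumptions for illustration, not taken from the ONNX graph:

```python
import numpy as np
import cv2 as cv

diffusor = cv.dnn.readNet("LatentDiffusion.onnx")  # as fetched by download_models.py

xc = np.random.rand(1, 7, 128, 128).astype(np.float32)  # assumed latent+conditioning shape
t = np.array([981], dtype=np.int64)                     # current diffusion timestep

diffusor.setInputsNames(["xc", "timesteps"])
diffusor.setInput(cv.Mat(xc), "xc")
diffusor.setInput(cv.Mat(t), "timesteps")
eps = diffusor.forward()  # predicted noise used by the DDIM update
print(eps.shape)
```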

samples/dnn/models.yml

@@ -457,3 +457,23 @@ lama:
height: 512
rgb: false
sample: "inpainting"
ldm_inpainting:
encoder_load_info:
encoder_url: "https://dl.opencv.org/models/ldm_inpainting/InpaintEncoder.onnx"
encoder_sha1: "eb663262304473d81d6ae627d7117892dac56b5e"
encoder_model: "InpaintEncoder.onnx"
decoder_load_info:
decoder_url: "https://dl.opencv.org/models/ldm_inpainting/InpaintDecoder.onnx"
decoder_sha1: "af258c100e3a3b0970493b6375c8775beaffc9d1"
decoder_model: "InpaintDecoder.onnx"
diffusor_load_info:
diffusor_url: "https://dl.opencv.org/models/ldm_inpainting/LatentDiffusion.onnx"
diffusor_sha1: "2c6f8a505d9a93195510c854d8f023fab27ce70e"
diffusor_model: "LatentDiffusion.onnx"
mean: [0, 0, 0]
scale: 0.00392
width: 512
height: 512
rgb: true
sample: "ldm_inpainting"