diff --git a/samples/dnn/common.py b/samples/dnn/common.py
index 5e211a30ca..acd7a56087 100644
--- a/samples/dnn/common.py
+++ b/samples/dnn/common.py
@@ -62,16 +62,17 @@ def add_argument(zoo, parser, name, help, required=False, default=None, type=Non
 
 def add_preproc_args(zoo, parser, sample, alias=None, prefix=""):
     aliases = []
-    if os.path.isfile(zoo):
+    if os.path.isfile(zoo) and prefix == "":
        fs = cv.FileStorage(zoo, cv.FILE_STORAGE_READ)
        root = fs.root()
        for name in root.keys():
            model = root.getNode(name)
            if model.getNode('sample').string() == sample:
                aliases.append(name)
+       if len(aliases):
+           parser.add_argument(prefix+'alias', nargs='?', choices=aliases,
+                               help='An alias name of model to extract preprocessing parameters from models.yml file.')
 
-    parser.add_argument(prefix+'alias', nargs='?', choices=aliases,
-                        help='An alias name of model to extract preprocessing parameters from models.yml file.')
     add_argument(zoo, parser, prefix+'model',
                  help='Path to a binary file of model contains trained weights. '
                       'It could be a file with extensions .caffemodel (Caffe), '
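Note on the common.py change: add_preproc_args is now called once per sub-model with an argument prefix, so the `prefix == ""` guard keeps the positional alias from being registered several times, and the alias is only added when at least one matching entry exists in models.yml. A minimal sketch of the resulting parser layout (hypothetical and standalone; the real sample builds the prefixed options through add_preproc_args):

    import argparse

    # One positional alias plus prefixed options per sub-model (illustrative).
    parser = argparse.ArgumentParser()
    parser.add_argument('alias', nargs='?', choices=['ldm_inpainting'],
                        help='Alias used to look up parameters in models.yml.')
    for prefix in ('encoder_', 'decoder_', 'diffusor_'):
        parser.add_argument('--' + prefix + 'model',
                            help='Path to the ' + prefix.rstrip('_') + ' ONNX file.')

    args = parser.parse_args(['ldm_inpainting', '--encoder_model', 'InpaintEncoder.onnx'])
    print(args.alias, args.encoder_model)   # ldm_inpainting InpaintEncoder.onnx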
diff --git a/samples/dnn/download_models.py b/samples/dnn/download_models.py
index 57243f9e04..7dde31a7b5 100644
--- a/samples/dnn/download_models.py
+++ b/samples/dnn/download_models.py
@@ -14,7 +14,7 @@ import requests
 import shutil
 from pathlib import Path
 from datetime import datetime
-from urllib.request import urlopen
+from urllib.request import Request, urlopen
 import xml.etree.ElementTree as ET
 
 __all__ = ["downloadFile"]
@@ -188,9 +188,11 @@ class URLLoader(Loader):
         self.url = url
 
     def download(self, filepath):
-        r = urlopen(self.url, timeout=60)
-        self.printRequest(r)
-        self.save(filepath, r)
+        headers = {'User-Agent': 'Wget/1.20.3'}
+        req = Request(self.url, headers=headers)
+        with urlopen(req, timeout=60) as r:
+            self.printRequest(r)
+            self.save(filepath, r)
         return os.path.getsize(filepath)
 
     def printRequest(self, r):
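The download fix does two things: it sends an explicit User-Agent (some hosts reject urllib's default `Python-urllib/3.x` identifier with HTTP 403), and it wraps the response in a context manager so the connection is closed even if saving fails. The same pattern in isolation, with the URL taken from the models.yml entry below (the real code streams in chunks and verifies a checksum):

    from urllib.request import Request, urlopen

    req = Request('https://dl.opencv.org/models/ldm_inpainting/InpaintEncoder.onnx',
                  headers={'User-Agent': 'Wget/1.20.3'})  # mimic wget to avoid 403s
    with urlopen(req, timeout=60) as r:                   # socket closed on exit
        with open('InpaintEncoder.onnx', 'wb') as f:
            f.write(r.read())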
diff --git a/samples/dnn/ldm_inpainting.py b/samples/dnn/ldm_inpainting.py
index f998bc716d..426697a66c 100644
--- a/samples/dnn/ldm_inpainting.py
+++ b/samples/dnn/ldm_inpainting.py
@@ -4,8 +4,10 @@ import argparse
 from tqdm import tqdm
 from functools import partial
 from copy import deepcopy
+import os
+from common import *
 
-## let use write description of the script and general information how to use it
+## General information on how to use the sample
 '''
 This sample proposes experimental inpainting sample using Latent Diffusion Model (LDM) for inpainting.
@@ -13,14 +15,12 @@ Most of the script is based on the code from the official repository of the LDM
 
 Current limitations of the script:
     - Slow diffusion sampling
-    - Not exact reproduction of the results from the original repository (due to issues related deviation in covolution operation.
+    - Not exact reproduction of the results from the original repository (due to issues related to deviation in the convolution operation.
      See issue for more details: https://github.com/opencv/opencv/pull/25973)
 
-Steps for running the script:
+The LDM inpainting model was converted to an ONNX graph using the following steps:
 
-1. Firstly generate ONNX graph of the Latent Diffusion Model.
-   - Generate the using this [repo](https://github.com/Abdurrahheem/latent-diffusion/tree/ash/export2onnx) and follow instructions below
+   Generate the ONNX model using this [repo](https://github.com/Abdurrahheem/latent-diffusion/tree/ash/export2onnx) and follow the instructions below
 
  - git clone https://github.com/Abdurrahheem/latent-diffusion.git
  - cd latent-diffusion
@@ -29,58 +29,108 @@ Steps for running the script:
  - wget -O models/ldm/inpainting_big/last.ckpt https://heibox.uni-heidelberg.de/f/4d9ac7ea40c64582b7c9/?dl=1
  - python -m scripts.inpaint.py --indir data/inpainting_examples/ --outdir outputs/inpainting_results --export=True
 
-2. Build opencv (preferebly with CUDA support enabled
+2. Build OpenCV
 3. Run the script
 
  - cd opencv/samples/dnn
- - python ldm_inpainting.py -e=<path to encoder> -d=<path to decoder> -df=<path to diffusor> -i=<path to input image>
+ - Download models using `python download_models.py ldm_inpainting`
+ - python ldm_inpainting.py
+ - For more options, use python ldm_inpainting.py -h
 
-Right after the last command you will be promted with image. You can click on left mouse botton and starting selection a region you would like to be inpainted (delited).
-Once you finish marking the region, click on left mouse botton again and press esc botton on your keyboard. The inpainting proccess will start.
+After running the code you will be prompted with an image. You can click the left mouse button and start selecting a region you would like to be inpainted (deleted).
+Once you finish marking the region, click the left mouse button again and press the Esc key on your keyboard. The inpainting process will start.
 
 Note: If you are running it on CPU it might take a large chank of time.
-Also make sure to have abount 15GB of RAM to make proccess faster (other wise swapping will ckick in and everything will be slower)
+Also make sure to have about 15GB of RAM to make the process faster (otherwise swapping will kick in and everything will be slower)
 '''
 
+def get_args_parser():
+    backends = ("default", "openvino", "opencv", "vkcom", "cuda")
+    targets = ("cpu", "opencl", "opencl_fp16", "ncs2_vpu", "hddl_vpu", "vulkan", "cuda", "cuda_fp16")
 
-backends = (cv.dnn.DNN_BACKEND_DEFAULT, cv.dnn.DNN_BACKEND_OPENCV, cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_BACKEND_CUDA)
-targets = (cv.dnn.DNN_TARGET_CPU, cv.dnn.DNN_TARGET_OPENCL, cv.dnn.DNN_TARGET_MYRIAD, cv.dnn.DNN_TARGET_HDDL, cv.dnn.DNN_TARGET_CUDA)
+    parser = argparse.ArgumentParser(add_help=False)
+    parser.add_argument('--zoo', default=os.path.join(os.path.dirname(os.path.abspath(__file__)), 'models.yml'),
+                        help='An optional path to a file with preprocessing parameters.')
+    parser.add_argument('--input', '-i', default="rubberwhale1.png", help='Path to image file.', required=False)
+    parser.add_argument('--samples', '-s', type=int, help='Number of times to sample the model.', default=50)
+    parser.add_argument('--mask', '-m', type=str, help='Path to mask image. If not provided, interactive mask creation will be used.', default=None)
 
-parser = argparse.ArgumentParser(description='Use this script to run inpainting using Latent Diffusion Model',
-                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-parser.add_argument('--encoder', '-e', type=str, help='Path to encoder network.', default=None)
-parser.add_argument('--decoder', '-d', type=str, help='Path to decoder network.', default=None)
-parser.add_argument('--diffusor', '-df', type=str, help='Path to diffusion network.', default=None)
-parser.add_argument('--image', '-i', type=str, help='Path to input image.', default=None)
-parser.add_argument('--samples', '-s', type=int, help='Number of times to sample the model.', default=50)
-parser.add_argument('--backend', choices=backends, default=cv.dnn.DNN_BACKEND_DEFAULT, type=int,
-                    help="Choose one of computation backends: "
-                         "%d: automatically (by default), "
-                         "%d: OpenCV implementation, "
-                         "%d: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
-                         "%d: CUDA, " % backends)
-parser.add_argument('--target', choices=targets, default=cv.dnn.DNN_TARGET_CPU, type=int,
-                    help='Choose one of target computation devices: '
-                         '%d: CPU target (by default), '
-                         '%d: OpenCL, '
-                         '%d: NCS2 VPU, '
-                         '%d: HDDL VPU, '
-                         '%d: CUDA ' % targets)
+    parser.add_argument('--backend', default="default", type=str, choices=backends,
+                        help="Choose one of computation backends: "
+                             "default: automatically (by default), "
+                             "openvino: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
+                             "opencv: OpenCV implementation, "
+                             "vkcom: VKCOM, "
+                             "cuda: CUDA")
+    parser.add_argument('--target', default="cpu", type=str, choices=targets,
+                        help="Choose one of target computation devices: "
+                             "cpu: CPU target (by default), "
+                             "opencl: OpenCL, "
+                             "opencl_fp16: OpenCL fp16 (half-float precision), "
+                             "ncs2_vpu: NCS2 VPU, "
+                             "hddl_vpu: HDDL VPU, "
+                             "vulkan: Vulkan, "
+                             "cuda: CUDA, "
+                             "cuda_fp16: CUDA fp16 (half-float precision)")
+    args, _ = parser.parse_known_args()
+    add_preproc_args(args.zoo, parser, 'ldm_inpainting', prefix="", alias="ldm_inpainting")
+    add_preproc_args(args.zoo, parser, 'ldm_inpainting', prefix="encoder_", alias="ldm_inpainting")
+    add_preproc_args(args.zoo, parser, 'ldm_inpainting', prefix="decoder_", alias="ldm_inpainting")
+    add_preproc_args(args.zoo, parser, 'ldm_inpainting', prefix="diffusor_", alias="ldm_inpainting")
+    parser = argparse.ArgumentParser(parents=[parser],
+                                     description='Diffusion based image inpainting using OpenCV.',
+                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    return parser.parse_args()
 
-def make_batch(image, mask):
-    image = image.astype(np.float32)/255.0
-    image = image[np.newaxis, ...].transpose(0,3,1,2)
+stdSize = 0.7
+stdWeight = 2
+stdImgSize = 512
+imgWidth = None
+fontSize = 1.5
+fontThickness = 1
 
-    mask = mask.astype(np.float32)/255.0
-    mask = mask[np.newaxis, np.newaxis, ...]
-    mask[mask < 0.5] = 0
-    mask[mask >= 0.5] = 1
+def keyboard_shortcuts():
+    print('''
+    Keyboard Shortcuts:
+        Press 'i' to increase brush size.
+        Press 'd' to decrease brush size.
+        Press 'r' to reset mask.
+        Press ' ' (space bar) after selecting area to be inpainted.
+        Press ESC to terminate the program.
+    ''')
 
-    masked_image = (1-mask)*image
+def help():
+    print('''
+    Use this script for image inpainting using OpenCV.
+
+    Firstly, download the required models, i.e. ldm_inpainting, using `download_models.py ldm_inpainting` (if not already done).
+    Set environment variable OPENCV_DOWNLOAD_CACHE_DIR to specify where models should be downloaded. Also, point OPENCV_SAMPLES_DATA_PATH to opencv/samples/data.
+
+    To run:
+        Example: python ldm_inpainting.py
+    ''')
+
+def make_batch_blob(image, mask):
+    blob_image = cv.dnn.blobFromImage(image, scalefactor=args.scale, size=(args.width, args.height), mean=args.mean, swapRB=args.rgb, crop=False)
+    blob_mask = cv.dnn.blobFromImage(mask, scalefactor=args.scale, size=(args.width, args.height), mean=args.mean, swapRB=False, crop=False)
+    blob_mask = (blob_mask >= 0.5).astype(np.float32)
+    masked_image = (1 - blob_mask) * blob_image
+
+    batch = {
+        "image": blob_image,
+        "mask": blob_mask,
+        "masked_image": masked_image
+    }
 
-    batch = {"image": image, "mask": mask, "masked_image": masked_image}
     for k in batch:
-        batch[k] = batch[k]*2.0-1.0
+        batch[k] = batch[k]*2.0 - 1.0
+
     return batch
 
 def noise_like(shape, repeat=False):
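make_batch_blob relies on blobFromImage with scale 0.00392 (approximately 1/255, taken from the models.yml entry below) to bring pixel values into [0, 1]; the trailing `*2.0 - 1.0` then maps image, mask, and masked image into the [-1, 1] range the diffusion model expects. A quick check of that mapping:

    import numpy as np

    blob = np.array([0.0, 0.5, 1.0], dtype=np.float32)  # values after 1/255 scaling
    print(blob * 2.0 - 1.0)                             # [-1.  0.  1.]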
@@ -300,11 +350,20 @@ class DDIMInpainter(object):
         self.conditioning_key = conditioning_key
         self.register_schedule(linear_start=linear_start, linear_end=linear_end)
 
-        self.encoder = cv.dnn.readNet(args.encoder)
-        self.decoder = cv.dnn.readNet(args.decoder)
-        self.diffusor = cv.dnn.readNet(args.diffusor)
+        # Initialize models using provided paths or download if necessary
+        encoder_path = findModel(args.encoder_model, args.encoder_sha1)
+        decoder_path = findModel(args.decoder_model, args.decoder_sha1)
+        diffusor_path = findModel(args.diffusor_model, args.diffusor_sha1)
+
+        engine = cv.dnn.ENGINE_AUTO
+        if args.backend != "default" or args.target != "cpu":
+            engine = cv.dnn.ENGINE_CLASSIC
+
+        self.encoder = cv.dnn.readNet(encoder_path, "", "", engine)
+        self.diffusor = cv.dnn.readNet(diffusor_path, "", "", engine)
+        self.decoder = cv.dnn.readNet(decoder_path, "", "", engine)
         self.sampler = DDIMSampler(self, ddpm_num_timesteps=self.num_timesteps)
-        self.set_backend(backend=args.backend, target=args.target)
+        self.set_backend(backend=get_backend_id(args.backend), target=get_target_id(args.target))
 
     def set_backend(self, backend=cv.dnn.DNN_BACKEND_DEFAULT, target=cv.dnn.DNN_TARGET_CPU):
         self.encoder.setPreferableBackend(backend)
@@ -317,15 +376,15 @@ class DDIMInpainter(object):
         self.diffusor.setPreferableTarget(target)
 
     def apply_diffusor(self, x, timestep, cond):
-
         x = np.concatenate([x, cond], axis=1)
         x = cv.Mat(x.astype(np.float32))
         timestep = cv.Mat(timestep.astype(np.int64))
 
-        names = ["xc", "t"]
+        names = ["xc, t", "timesteps"]
         self.diffusor.setInputsNames(names)
         self.diffusor.setInput(x, names[0])
         self.diffusor.setInput(timestep, names[1])
         output = self.diffusor.forward()
+
         return output
 
     def register_buffer(self, name, attr):
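get_backend_id and get_target_id come from the samples' common module and translate the string choices into cv.dnn constants; the constructor also falls back from ENGINE_AUTO to ENGINE_CLASSIC whenever a non-default backend or target is requested, presumably because the new engine does not yet cover those paths. A sketch of the assumed shape of the helpers (illustrative subset only; the actual tables live in samples/dnn/common.py):

    import cv2 as cv

    def get_backend_id(name):
        return {'default': cv.dnn.DNN_BACKEND_DEFAULT,
                'opencv': cv.dnn.DNN_BACKEND_OPENCV,
                'cuda': cv.dnn.DNN_BACKEND_CUDA}[name]

    def get_target_id(name):
        return {'cpu': cv.dnn.DNN_TARGET_CPU,
                'opencl': cv.dnn.DNN_TARGET_OPENCL,
                'cuda': cv.dnn.DNN_TARGET_CUDA,
                'cuda_fp16': cv.dnn.DNN_TARGET_CUDA_FP16}[name]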
@@ -372,7 +431,6 @@ class DDIMInpainter(object):
                 betas * np.sqrt(alphas_cumprod_prev) / (1. - alphas_cumprod)))
         self.register_buffer('posterior_mean_coef2', to_numpy(
             (1. - alphas_cumprod_prev) * np.sqrt(alphas) / (1. - alphas_cumprod)))
-
         if self.parameterization == "eps":
             lvlb_weights = self.betas ** 2 / (
                 2 * self.posterior_variance * to_numpy(alphas) * (1 - self.alphas_cumprod))
@@ -386,7 +444,6 @@ class DDIMInpainter(object):
         assert not np.isnan(self.lvlb_weights).all()
 
     def apply_model(self, x_noisy, t, cond, return_ids=False):
-
         if isinstance(cond, dict):
             # hybrid case, cond is exptected to be a dict
             pass
@@ -397,12 +454,15 @@ class DDIMInpainter(object):
             cond = {key: cond}
 
         x_recon = self.apply_diffusor(x_noisy, t, cond['c_concat'])
-
         if isinstance(x_recon, tuple) and not return_ids:
             return x_recon[0]
         else:
             return x_recon
 
+    def inpaint(self, image : np.ndarray, mask : np.ndarray, S : int = 50) -> np.ndarray:
+        inpainted = self(image, mask, S)
+        return np.squeeze(inpainted)
+
     def __call__(self, image : np.ndarray, mask : np.ndarray, S : int = 50) -> np.ndarray:
 
         # Encode the image and mask
@@ -413,7 +473,6 @@ class DDIMInpainter(object):
         c = np.concatenate([c, cc], axis=1)
 
         shape = (c.shape[1] - 1,) + c.shape[2:]
-
         # Sample from the model
         samples_ddim, _ = self.sampler.sample(
             S=S,
@@ -437,53 +496,98 @@ class DDIMInpainter(object):
 
         return inpainted
 
-def create_mask(img, radius=20):
+def create_mask(img):
     drawing = False  # True if the mouse is pressed
-    counter = 0
+    brush_size = 20
 
     # Mouse callback function
     def draw_circle(event, x, y, flags, param):
-        nonlocal drawing, counter, radius
+        nonlocal drawing, brush_size
         if event == cv.EVENT_LBUTTONDOWN:
-            drawing = True if counter % 2 == 0 else False
-            counter += 1
-            cv.circle(img, (x, y), radius, (255, 255, 255), -1)
-            cv.circle(mask, (x, y), radius, 255, -1)
-
+            drawing = True
         elif event == cv.EVENT_MOUSEMOVE:
             if drawing:
-                cv.circle(img, (x, y), radius, (255, 255, 255), -1)
-                cv.circle(mask, (x, y), radius, 255, -1)
+                cv.circle(mask, (x, y), brush_size, (255), thickness=-1)
+        elif event == cv.EVENT_LBUTTONUP:
+            drawing = False
+
+    # Create window with instructions
+    window_name = 'Draw Mask'
+    cv.namedWindow(window_name)
+    cv.setMouseCallback(window_name, draw_circle)
+    label = "Press 'i' to increase, 'd' to decrease brush size. And 'r' to reset mask."
+    labelSize, _ = cv.getTextSize(label, cv.FONT_HERSHEY_SIMPLEX, fontSize, fontThickness)
+    alpha = 0.5
+    temp_image = img.copy()
+    overlay = img.copy()
+    cv.rectangle(overlay, (0, 0), (labelSize[0]+10, labelSize[1]+int(30*fontSize)), (255, 255, 255), cv.FILLED)
+    cv.addWeighted(overlay, alpha, temp_image, 1 - alpha, 0, temp_image)
+    cv.putText(temp_image, "Draw the mask on the image. Press space bar when done.", (10, int(25*fontSize)), cv.FONT_HERSHEY_SIMPLEX, fontSize, (0, 0, 0), fontThickness)
+    cv.putText(temp_image, label, (10, int(50*fontSize)), cv.FONT_HERSHEY_SIMPLEX, fontSize, (0, 0, 0), fontThickness)
 
     mask = np.zeros((img.shape[0], img.shape[1]), np.uint8)
-    cv.namedWindow('image')
-    cv.setMouseCallback('image', draw_circle)
+    display_img = temp_image.copy()
 
     while True:
-        cv.imshow('image', img)
-        if cv.waitKey(1) & 0xFF == 27:  # Press 'ESC' to exit
+        display_img[mask > 0] = [255, 255, 255]
+        cv.imshow(window_name, display_img)
+        key = cv.waitKey(30) & 0xFF
+        if key == ord('i'):  # Increase brush size
+            brush_size += 1
+            print(f"Brush size increased to {brush_size}")
+        elif key == ord('d'):  # Decrease brush size
+            brush_size = max(1, brush_size - 1)
+            print(f"Brush size decreased to {brush_size}")
+        elif key == ord('r'):  # Clear the mask
+            mask = np.zeros((img.shape[0], img.shape[1]), dtype=np.uint8)
+            display_img = temp_image.copy()
+            print("Mask cleared")
+        elif key == ord(' '):  # Press space bar to finish drawing
             break
+        elif key == 27:
+            exit()
 
     cv.destroyAllWindows()
     return mask
+def prepare_input(args, image):
+    if args.mask:
+        mask = cv.imread(args.mask, cv.IMREAD_GRAYSCALE)
+        if mask is None:
+            raise ValueError(f"Could not read mask file: {args.mask}")
+        if mask.shape[:2] != image.shape[:2]:
+            mask = cv.resize(mask, (image.shape[1], image.shape[0]), interpolation=cv.INTER_NEAREST)
+    else:
+        mask = create_mask(deepcopy(image))
+
+    batch = make_batch_blob(image, mask)
+    return batch
 
 def main(args):
+    global imgWidth, fontSize, fontThickness
+    keyboard_shortcuts()
-    image = cv.imread(args.image)
-    mask = create_mask(deepcopy(image))
-    image = cv.cvtColor(image, cv.COLOR_BGR2RGB)
+    image = cv.imread(findFile(args.input))
+    imgWidth = min(image.shape[:2])
+    fontSize = min(1.5, (stdSize*imgWidth)/stdImgSize)
+    fontThickness = max(1, (stdWeight*imgWidth)//stdImgSize)
+    aspect_ratio = image.shape[0]/image.shape[1]
+    height = int(args.width*aspect_ratio)
 
-    batch = make_batch(image, mask)
-    image, mask, masked_image = batch["image"], batch["mask"], batch["masked_image"]
+    batch = prepare_input(args, image)
     model = DDIMInpainter(args)
-    result = model(masked_image, mask, S=args.samples)
-    result = np.squeeze(result)
 
-    # save the result in the directore of args.image
-    cv.imwrite(args.image.replace(".png", "_inpainted.png"), result[..., ::-1])
+    result = model.inpaint(batch["masked_image"], batch["mask"], S=args.samples)
+    result = result.astype(np.uint8)
+    result = cv.resize(result, (args.width, height))
+    result = cv.cvtColor(result, cv.COLOR_RGB2BGR)
+    cv.imshow("Inpainted Image", result)
+    cv.waitKey(0)
+    cv.destroyAllWindows()
 
 if __name__ == '__main__':
-    args = parser.parse_args()
+    args = get_args_parser()
     main(args)
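The new models.yml entry below is what add_preproc_args binds to the parser: the prefixed keys (encoder_model, encoder_sha1, and so on) become args.encoder_model, args.encoder_sha1, etc., which DDIMInpainter resolves through findModel. The entry can be read directly with the same FileStorage API that common.py uses (sketch; assumes models.yml is in the working directory):

    import cv2 as cv

    fs = cv.FileStorage('models.yml', cv.FILE_STORAGE_READ)
    node = fs.root().getNode('ldm_inpainting')
    print(node.getNode('sample').string())   # 'ldm_inpainting'
    print(node.getNode('width').real())      # 512.0
    print(node.getNode('scale').real())      # 0.00392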
"https://dl.opencv.org/models/ldm_inpainting/LatentDiffusion.onnx" + diffusor_sha1: "2c6f8a505d9a93195510c854d8f023fab27ce70e" + diffusor_model: "LatentDiffusion.onnx" + mean: [0, 0, 0] + scale: 0.00392 + width: 512 + height: 512 + rgb: true + sample: "ldm_inpainting"