From 800bf842a5e83ed5760eed881bbce96deb7b59f5 Mon Sep 17 00:00:00 2001 From: David Lee <47388918+Pizzawookiee@users.noreply.github.com> Date: Sat, 2 May 2026 15:23:42 -0400 Subject: [PATCH 1/9] Add VideoLatentCompositeMasked and RGBMaskToLatentMask nodes --- comfy_extras/nodes_mask.py | 162 +++++++++++++++++++++++++++++++++++++ 1 file changed, 162 insertions(+) diff --git a/comfy_extras/nodes_mask.py b/comfy_extras/nodes_mask.py index 8ca947718e0f..14db5c630757 100644 --- a/comfy_extras/nodes_mask.py +++ b/comfy_extras/nodes_mask.py @@ -46,6 +46,110 @@ def composite(destination, source, x, y, mask = None, multiplier = 8, resize_sou destination[..., top:bottom, left:right] = source_portion + destination_portion return destination +def video_latent_composite(destination, source, x, y, mask=None, multiplier=8, resize_source=False): + # destination/source shape: [B, C, F, H, W] + source = source.to(destination.device) + + # 1. Spatial Resizing for Source + if resize_source: + # size=(Frames, Height, Width). We keep source's F, but match destination's H, W + target_size = (source.shape[2], destination.shape[3], destination.shape[4]) + source = torch.nn.functional.interpolate( + source, + size=target_size, + mode="trilinear", + align_corners=False + ) + + # 2. Coordinate Scaling + x_latent = x // multiplier + y_latent = y // multiplier + + # 3. Mask Processing (Input: [F, H, W]) + if mask is None: + mask = torch.ones_like(source) + else: + mask = mask.to(destination.device, copy=True) + + # Convert [F, H, W] -> [1, 1, F, H, W] + # This allows it to broadcast across any Batch or Channel in 'source' + mask = mask.unsqueeze(0).unsqueeze(0) + + # Resize mask spatially, preserving its frame count + # size=(mask_frames, source_height, source_width) + mask_target_size = (mask.shape[2], source.shape[3], source.shape[4]) + mask = torch.nn.functional.interpolate( + mask, + size=mask_target_size, + mode="trilinear", + align_corners=False + ) + + # 4. Dimension Calculations for Spatial Slicing + dst_h, dst_w = destination.shape[3], destination.shape[4] + src_h, src_w = source.shape[3], source.shape[4] + + # Calculate visible overlap region + visible_h = max(0, min(y_latent + src_h, dst_h) - max(0, y_latent)) + visible_w = max(0, min(x_latent + src_w, dst_w) - max(0, x_latent)) + + if visible_h <= 0 or visible_w <= 0: + return destination + + # Determine slicing offsets + src_top = max(0, -y_latent) + src_left = max(0, -x_latent) + dst_top = max(0, y_latent) + dst_left = max(0, x_latent) + + # 5. Slicing and Blending + # destination/source/mask are now all 5D: [B, C, F, H, W] + # We slice only the H and W dimensions (indices 3 and 4) + m = mask[:, :, :, src_top:src_top+visible_h, src_left:src_left+visible_w] + s = source[:, :, :, src_top:src_top+visible_h, src_left:src_left+visible_w] + d = destination[:, :, :, dst_top:dst_top+visible_h, dst_left:dst_left+visible_w] + + # Combine using the mask + destination[:, :, :, dst_top:dst_top+visible_h, dst_left:dst_left+visible_w] = (m * s) + ((1.0 - m) * d) + + return destination + +def convert_rgb_mask_to_latent_mask( + mask: torch.Tensor, + k: int, + spatial_downsample_h: int, + spatial_downsample_w: int +) -> torch.Tensor: + """ + Converts [T, H, W] mask to [T_latent, H_latent, W_latent]. + Handles non-square spatial downsampling. + """ + # 1. Temporal Sampling + # Select first frame and every k-th frame thereafter + mask0 = mask[0:1] + mask1 = mask[1::k] + sampled = torch.cat([mask0, mask1], dim=0) # [T_latent, H, W] + + # 2. 
Prepare for Spatial Interpolation + # Shape: [Batch=1, Channels=1, Depth=T_latent, Height=H, Width=W] + sampled = sampled.unsqueeze(0).unsqueeze(0) + + # 3. Calculate New Spatial Dimensions + h_latent = sampled.shape[-2] // spatial_downsample_h + w_latent = sampled.shape[-1] // spatial_downsample_w + + # 4. Interpolate + # We maintain the temporal count (sampled.shape[2]) + # but resize H and W independently + pooled = torch.nn.functional.interpolate( + sampled, + size=(sampled.shape[2], h_latent, w_latent), + mode="nearest" + ) + + # 5. Return to [T_latent, H_latent, W_latent] + return pooled.squeeze(0).squeeze(0) + class LatentCompositeMasked(IO.ComfyNode): @classmethod def define_schema(cls): @@ -74,6 +178,40 @@ def execute(cls, destination, source, x, y, resize_source, mask = None) -> IO.No composite = execute # TODO: remove +class VideoLatentCompositeMasked(IO.ComfyNode): + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="VideoLatentCompositeMasked", + search_aliases=["overlay latent", "layer latent", "paste latent", "inpaint latent"], + category="latent", + inputs=[ + IO.Latent.Input("destination"), + IO.Latent.Input("source"), + IO.Int.Input("x", default=0, min=0, max=nodes.MAX_RESOLUTION, step=8), + IO.Int.Input("y", default=0, min=0, max=nodes.MAX_RESOLUTION, step=8), + IO.Boolean.Input("resize_source", default=False), + IO.Mask.Input("mask", optional=True), + ], + outputs=[IO.Latent.Output()], + ) + + @classmethod + def execute(cls, destination, source, x, y, resize_source, mask=None) -> IO.NodeOutput: + output = destination.copy() + # Ensure we work on a copy of the samples to remain non-destructive + dst_samples = destination["samples"].clone() + src_samples = source["samples"] + + output["samples"] = video_latent_composite( + dst_samples, + src_samples, + x, y, + mask, + multiplier=8, + resize_source=resize_source + ) + return IO.NodeOutput(output) class ImageCompositeMasked(IO.ComfyNode): @classmethod @@ -398,6 +536,28 @@ def execute(cls, mask, value) -> IO.NodeOutput: image_to_mask = execute # TODO: remove +class RGBMaskToLatentMask(IO.ComfyNode): + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="RGBMasktoLatentMask", + search_aliases=["rgb mask to latent mask", "rgb mask", "latent mask"], + description="Helpful for applying masks to video latents if the VAE uses spatial downsampling.", + category="latent", + inputs=[ + IO.Mask.Input("mask", optional=False), + IO.Vae.Input("vae", optional=False), + ], + outputs=[IO.Mask.Output()], + ) + + @classmethod + def execute(cls, mask, vae) -> IO.NodeOutput: + # Ensure we work on a copy of the mask to remain non-destructive + mask_copy = mask.clone() + downscale_ratio = vae.downscale_ratio + k = (mask.shape[0] - 1) // (downscale_ratio[0](mask.shape[0]) - 1) if (downscale_ratio[0](mask.shape[0]) - 1) > 1 else 1 + return IO.NodeOutput(convert_rgb_mask_to_latent_mask(mask_copy, k, spatial_downsample_h = downscale_ratio[1], spatial_downsample_w = downscale_ratio[2])) # Mask Preview - original implement from # https://github.com/cubiq/ComfyUI_essentials/blob/9d9f4bedfc9f0321c19faf71855e228c93bd0dc9/mask.py#L81 @@ -428,6 +588,7 @@ class MaskExtension(ComfyExtension): async def get_node_list(self) -> list[type[IO.ComfyNode]]: return [ LatentCompositeMasked, + VideoLatentCompositeMasked, ImageCompositeMasked, MaskToImage, ImageToMask, @@ -439,6 +600,7 @@ async def get_node_list(self) -> list[type[IO.ComfyNode]]: FeatherMask, GrowMask, ThresholdMask, + RGBMaskToLatentMask, MaskPreview, ] From 
c3cd2a4e756af90b7de42b89441fa0e2fe3d7082 Mon Sep 17 00:00:00 2001 From: David Lee <47388918+Pizzawookiee@users.noreply.github.com> Date: Sat, 2 May 2026 15:28:35 -0400 Subject: [PATCH 2/9] Add TimeToMoveKSamplerAdvanced --- comfy_extras/nodes_custom_sampler.py | 123 +++++++++++++++++++++++++++ 1 file changed, 123 insertions(+) diff --git a/comfy_extras/nodes_custom_sampler.py b/comfy_extras/nodes_custom_sampler.py index 1e957c09beac..111c225a1561 100644 --- a/comfy_extras/nodes_custom_sampler.py +++ b/comfy_extras/nodes_custom_sampler.py @@ -11,6 +11,87 @@ from comfy_api.latest import ComfyExtension, io import re +def time_to_move_sample(model, noise, steps, cfg, sampler_name, scheduler, positive, negative, latent_image, reference_latent_image, reference_latent_mask, denoise=1.0, start_step=None, time_to_move_last_step=None, last_step=None, force_full_denoise=False, noise_mask=None, sigmas=None, callback=None, disable_pbar=False, seed=None): + sampler = comfy.samplers.KSampler(model, steps=steps, device=model.load_device, sampler=sampler_name, scheduler=scheduler, denoise=denoise, model_options=model.model_options) + + sigmas = sampler.sigmas + + if last_step == None: + last_step = steps + + if time_to_move_last_step == None: + time_to_move_last_step = last_step + + if time_to_move_last_step > last_step: + time_to_move_last_step = last_step + + if start_step == None: + start_step = 0 + + #during each step, composite the reference latent back onto the partially sampled latent using the reference latent mask + + for i in range (min(last_step, len(sigmas) - 1) - start_step): + if i > 1: + #don't add new noise to samples after first loop iteration + noise = torch.zeros(latent_image.size(), dtype=latent_image.dtype, layout=latent_image.layout, device="cpu") + + if i < last_step - 1: + temp_force_full_denoise = False + else: + temp_force_full_denoise = force_full_denoise + + temp_start = start_step + i + + samples = sampler.sample(noise, positive, negative, cfg=cfg, latent_image=latent_image, start_step=temp_start, last_step=temp_start + 1, force_full_denoise=temp_force_full_denoise, denoise_mask=noise_mask, sigmas=sigmas, callback=callback, disable_pbar=disable_pbar, seed=seed) + + #add noise to the reference latent image (referenced from AddNoise node) + + if temp_start < time_to_move_last_step: + + model_sampling = model.get_model_object("model_sampling") + process_latent_out = model.get_model_object("process_latent_out") + process_latent_in = model.get_model_object("process_latent_in") + + scale = sigmas[temp_start + 1] + + if torch.count_nonzero(reference_latent_image) > 0: #Don't shift the empty latent image. 
+ reference_latent_image = process_latent_in(reference_latent_image) + noisy = model_sampling.noise_scaling(scale, noise, reference_latent_image) + noisy = process_latent_out(noisy) + noisy = torch.nan_to_num(noisy, nan=0.0, posinf=0.0, neginf=0.0) + + samples = video_latent_composite(samples, noisy, 0, 0, reference_latent_mask, multiplier=8, resize_source=True) + + samples = samples.to(device=comfy.model_management.intermediate_device(), dtype=comfy.model_management.intermediate_dtype()) + return samples + + +def time_to_move_common_ksampler(model, seed, steps, cfg, sampler_name, scheduler, positive, negative, latent, reference_latent, reference_latent_mask, denoise=1.0, disable_noise=False, start_step=None, time_to_move_last_step = None, last_step=None, force_full_denoise=False): + latent_image = latent["samples"] + latent_image = comfy.sample.fix_empty_latent_channels(model, latent_image, latent.get("downscale_ratio_spacial", None)) + + reference_latent_image = reference_latent["samples"] + reference_latent_image = comfy.sample.fix_empty_latent_channels(model, reference_latent_image, reference_latent.get("downscale_ratio_spacial", None)) + + if disable_noise: + noise = torch.zeros(latent_image.size(), dtype=latent_image.dtype, layout=latent_image.layout, device="cpu") + else: + batch_inds = latent["batch_index"] if "batch_index" in latent else None + noise = comfy.sample.prepare_noise(latent_image, seed, batch_inds) + + noise_mask = None + if "noise_mask" in latent: + noise_mask = latent["noise_mask"] + + callback = latent_preview.prepare_callback(model, steps) + disable_pbar = not comfy.utils.PROGRESS_BAR_ENABLED + samples = time_to_move_sample(model, noise, steps, cfg, sampler_name, scheduler, positive, negative, latent_image, reference_latent_image, reference_latent_mask, + denoise=denoise, start_step=start_step, time_to_move_last_step = time_to_move_last_step, last_step=last_step, + force_full_denoise=force_full_denoise, noise_mask=noise_mask, callback=callback, disable_pbar=disable_pbar, seed=seed) + out = latent.copy() + out.pop("downscale_ratio_spacial", None) + out["samples"] = samples + return (out, ) class BasicScheduler(io.ComfyNode): @classmethod @@ -979,6 +1060,47 @@ def execute(cls, noise, guider, sampler, sigmas, latent_image) -> io.NodeOutput: sample = execute +class TimeToMoveKSamplerAdvanced(io.ComfyNode): + @classmethod + + def define_schema(cls): + return io.Schema( + node_id="TimeToMoveKSamplerAdvanced", + category="sampling/time_to_move", + inputs=[ + io.Model.Input("model"), + io.Combo.Input("add_noise", options=["enable", "disable"], advanced=True), + io.Int.Input("noise_seed", default=0, min=0, max=0xffffffffffffffff, control_after_generate=True), + io.Int.Input("steps", default=20, min=1, max=10000), + io.Float.Input("cfg", default=8.0, min=0.0, max=100.0, step=0.1, round=0.01), + io.Combo.Input("sampler_name", options = comfy.samplers.KSampler.SAMPLERS), + io.Combo.Input("scheduler", options = comfy.samplers.KSampler.SCHEDULERS), + io.Conditioning.Input("positive"), + io.Conditioning.Input("negative"), + io.Latent.Input("latent_image", tooltip = "Generally should be the same as reference_latent_image."), + io.Latent.Input("reference_latent_image"), + io.Mask.Input("reference_latent_mask", tooltip = "Make sure mask is the same length as the latents rather than the original video."), + io.Int.Input("start_at_step", default = 0, min = 0, max = 10000, advanced = True, tooltip = "Generally should set at a step greater than 0."), + 
io.Int.Input("time_to_move_end_at_step", default = 0, min = 0, max = 10000, advanced = True, tooltip = "Generally should set at a step greater than 0 and less than total number of steps."), + io.Int.Input("end_at_step", default = 10000, min = 0, max = 10000, advanced = True, tooltip = "Use just like typical end_at_step with normal KSamplerAdvanced"), + io.Combo.Input("return_with_leftover_noise", options=["disable", "enable"], advanced = True), + ], + outputs=[ + io.Latent.Output(display_name="latent"), + ] + ) + + @classmethod + def execute(cls, model, add_noise, noise_seed, steps, cfg, sampler_name, scheduler, positive, negative, latent_image, reference_latent_image, reference_latent_mask, start_at_step, time_to_move_end_at_step, end_at_step, return_with_leftover_noise, denoise=1.0) -> io.NodeOutput: + force_full_denoise = True + if return_with_leftover_noise == "enable": + force_full_denoise = False + disable_noise = False + if add_noise == "disable": + disable_noise = True + + return time_to_move_common_ksampler(model, noise_seed, steps, cfg, sampler_name, scheduler, positive, negative, latent_image, reference_latent_image, reference_latent_mask, denoise=denoise, disable_noise=disable_noise, start_step=start_at_step, time_to_move_last_step = time_to_move_end_at_step, last_step=end_at_step, force_full_denoise=force_full_denoise) + class AddNoise(io.ComfyNode): @classmethod def define_schema(cls): @@ -1087,6 +1209,7 @@ async def get_node_list(self) -> list[type[io.ComfyNode]]: DisableNoise, AddNoise, SamplerCustomAdvanced, + TimeToMoveKSamplerAdvanced, ManualSigmas, ] From 0b7d56070d812b8ec38cdea65cd0e8a36a0c4f71 Mon Sep 17 00:00:00 2001 From: David Lee <47388918+Pizzawookiee@users.noreply.github.com> Date: Sat, 2 May 2026 18:28:54 -0400 Subject: [PATCH 3/9] Fix tensor on wrong device error; slight change to logic --- comfy_extras/nodes_custom_sampler.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/comfy_extras/nodes_custom_sampler.py b/comfy_extras/nodes_custom_sampler.py index 111c225a1561..31ebede993b8 100644 --- a/comfy_extras/nodes_custom_sampler.py +++ b/comfy_extras/nodes_custom_sampler.py @@ -31,7 +31,7 @@ def time_to_move_sample(model, noise, steps, cfg, sampler_name, scheduler, posit #during each step, composite the reference latent back onto the partially sampled latent using the reference latent mask for i in range (min(last_step, len(sigmas) - 1) - start_step): - if i > 1: + if i > 0: #don't add new noise to samples after first loop iteration noise = torch.zeros(latent_image.size(), dtype=latent_image.dtype, layout=latent_image.layout, device="cpu") @@ -52,13 +52,13 @@ def time_to_move_sample(model, noise, steps, cfg, sampler_name, scheduler, posit process_latent_out = model.get_model_object("process_latent_out") process_latent_in = model.get_model_object("process_latent_in") - scale = sigmas[temp_start + 1] + scale = sigmas[temp_start + 1].to(noise.device) if torch.count_nonzero(reference_latent_image) > 0: #Don't shift the empty latent image. 
reference_latent_image = process_latent_in(reference_latent_image) noisy = model_sampling.noise_scaling(scale, noise, reference_latent_image) noisy = process_latent_out(noisy) - noisy = torch.nan_to_num(noisy, nan=0.0, posinf=0.0, neginf=0.0) + noisy = torch.nan_to_num(noisy, nan=0.0, posinf=0.0, neginf=0.0).to(samples.device) samples = video_latent_composite(samples, noisy, 0, 0, reference_latent_mask, multiplier=8, resize_source=True) From b3a066559ba5f37f64145e27c3ecbe105b9bb9a2 Mon Sep 17 00:00:00 2001 From: David Lee <47388918+Pizzawookiee@users.noreply.github.com> Date: Sat, 2 May 2026 18:37:50 -0400 Subject: [PATCH 4/9] Fix new composite function not being defined --- comfy_extras/nodes_custom_sampler.py | 68 ++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/comfy_extras/nodes_custom_sampler.py b/comfy_extras/nodes_custom_sampler.py index 31ebede993b8..4eb0eec17179 100644 --- a/comfy_extras/nodes_custom_sampler.py +++ b/comfy_extras/nodes_custom_sampler.py @@ -11,6 +11,74 @@ from comfy_api.latest import ComfyExtension, io import re +def video_latent_composite(destination, source, x, y, mask=None, multiplier=8, resize_source=False): + # destination/source shape: [B, C, F, H, W] + source = source.to(destination.device) + + # 1. Spatial Resizing for Source + if resize_source: + # size=(Frames, Height, Width). We keep source's F, but match destination's H, W + target_size = (source.shape[2], destination.shape[3], destination.shape[4]) + source = torch.nn.functional.interpolate( + source, + size=target_size, + mode="trilinear", + align_corners=False + ) + + # 2. Coordinate Scaling + x_latent = x // multiplier + y_latent = y // multiplier + + # 3. Mask Processing (Input: [F, H, W]) + if mask is None: + mask = torch.ones_like(source) + else: + mask = mask.to(destination.device, copy=True) + + # Convert [F, H, W] -> [1, 1, F, H, W] + # This allows it to broadcast across any Batch or Channel in 'source' + mask = mask.unsqueeze(0).unsqueeze(0) + + # Resize mask spatially, preserving its frame count + # size=(mask_frames, source_height, source_width) + mask_target_size = (mask.shape[2], source.shape[3], source.shape[4]) + mask = torch.nn.functional.interpolate( + mask, + size=mask_target_size, + mode="trilinear", + align_corners=False + ) + + # 4. Dimension Calculations for Spatial Slicing + dst_h, dst_w = destination.shape[3], destination.shape[4] + src_h, src_w = source.shape[3], source.shape[4] + + # Calculate visible overlap region + visible_h = max(0, min(y_latent + src_h, dst_h) - max(0, y_latent)) + visible_w = max(0, min(x_latent + src_w, dst_w) - max(0, x_latent)) + + if visible_h <= 0 or visible_w <= 0: + return destination + + # Determine slicing offsets + src_top = max(0, -y_latent) + src_left = max(0, -x_latent) + dst_top = max(0, y_latent) + dst_left = max(0, x_latent) + + # 5. 
Slicing and Blending + # destination/source/mask are now all 5D: [B, C, F, H, W] + # We slice only the H and W dimensions (indices 3 and 4) + m = mask[:, :, :, src_top:src_top+visible_h, src_left:src_left+visible_w] + s = source[:, :, :, src_top:src_top+visible_h, src_left:src_left+visible_w] + d = destination[:, :, :, dst_top:dst_top+visible_h, dst_left:dst_left+visible_w] + + # Combine using the mask + destination[:, :, :, dst_top:dst_top+visible_h, dst_left:dst_left+visible_w] = (m * s) + ((1.0 - m) * d) + + return destination + def time_to_move_sample(model, noise, steps, cfg, sampler_name, scheduler, positive, negative, latent_image, reference_latent_image, reference_latent_mask, denoise=1.0, start_step=None, time_to_move_last_step=None, last_step=None, force_full_denoise=False, noise_mask=None, sigmas=None, callback=None, disable_pbar=False, seed=None): sampler = comfy.samplers.KSampler(model, steps=steps, device=model.load_device, sampler=sampler_name, scheduler=scheduler, denoise=denoise, model_options=model.model_options) From f3aebfa2b0dcb5523df51c9afa80322d6bf06acc Mon Sep 17 00:00:00 2001 From: David Lee <47388918+Pizzawookiee@users.noreply.github.com> Date: Sat, 2 May 2026 19:51:45 -0400 Subject: [PATCH 5/9] Remove VideoLatentCompositeMasked node --- comfy_extras/nodes_mask.py | 106 +------------------------------------ 1 file changed, 1 insertion(+), 105 deletions(-) diff --git a/comfy_extras/nodes_mask.py b/comfy_extras/nodes_mask.py index 14db5c630757..5d83aff0c868 100644 --- a/comfy_extras/nodes_mask.py +++ b/comfy_extras/nodes_mask.py @@ -46,74 +46,6 @@ def composite(destination, source, x, y, mask = None, multiplier = 8, resize_sou destination[..., top:bottom, left:right] = source_portion + destination_portion return destination -def video_latent_composite(destination, source, x, y, mask=None, multiplier=8, resize_source=False): - # destination/source shape: [B, C, F, H, W] - source = source.to(destination.device) - - # 1. Spatial Resizing for Source - if resize_source: - # size=(Frames, Height, Width). We keep source's F, but match destination's H, W - target_size = (source.shape[2], destination.shape[3], destination.shape[4]) - source = torch.nn.functional.interpolate( - source, - size=target_size, - mode="trilinear", - align_corners=False - ) - - # 2. Coordinate Scaling - x_latent = x // multiplier - y_latent = y // multiplier - - # 3. Mask Processing (Input: [F, H, W]) - if mask is None: - mask = torch.ones_like(source) - else: - mask = mask.to(destination.device, copy=True) - - # Convert [F, H, W] -> [1, 1, F, H, W] - # This allows it to broadcast across any Batch or Channel in 'source' - mask = mask.unsqueeze(0).unsqueeze(0) - - # Resize mask spatially, preserving its frame count - # size=(mask_frames, source_height, source_width) - mask_target_size = (mask.shape[2], source.shape[3], source.shape[4]) - mask = torch.nn.functional.interpolate( - mask, - size=mask_target_size, - mode="trilinear", - align_corners=False - ) - - # 4. 
Dimension Calculations for Spatial Slicing - dst_h, dst_w = destination.shape[3], destination.shape[4] - src_h, src_w = source.shape[3], source.shape[4] - - # Calculate visible overlap region - visible_h = max(0, min(y_latent + src_h, dst_h) - max(0, y_latent)) - visible_w = max(0, min(x_latent + src_w, dst_w) - max(0, x_latent)) - - if visible_h <= 0 or visible_w <= 0: - return destination - - # Determine slicing offsets - src_top = max(0, -y_latent) - src_left = max(0, -x_latent) - dst_top = max(0, y_latent) - dst_left = max(0, x_latent) - - # 5. Slicing and Blending - # destination/source/mask are now all 5D: [B, C, F, H, W] - # We slice only the H and W dimensions (indices 3 and 4) - m = mask[:, :, :, src_top:src_top+visible_h, src_left:src_left+visible_w] - s = source[:, :, :, src_top:src_top+visible_h, src_left:src_left+visible_w] - d = destination[:, :, :, dst_top:dst_top+visible_h, dst_left:dst_left+visible_w] - - # Combine using the mask - destination[:, :, :, dst_top:dst_top+visible_h, dst_left:dst_left+visible_w] = (m * s) + ((1.0 - m) * d) - - return destination - def convert_rgb_mask_to_latent_mask( mask: torch.Tensor, k: int, @@ -177,42 +109,7 @@ def execute(cls, destination, source, x, y, resize_source, mask = None) -> IO.No return IO.NodeOutput(output) composite = execute # TODO: remove - -class VideoLatentCompositeMasked(IO.ComfyNode): - @classmethod - def define_schema(cls): - return IO.Schema( - node_id="VideoLatentCompositeMasked", - search_aliases=["overlay latent", "layer latent", "paste latent", "inpaint latent"], - category="latent", - inputs=[ - IO.Latent.Input("destination"), - IO.Latent.Input("source"), - IO.Int.Input("x", default=0, min=0, max=nodes.MAX_RESOLUTION, step=8), - IO.Int.Input("y", default=0, min=0, max=nodes.MAX_RESOLUTION, step=8), - IO.Boolean.Input("resize_source", default=False), - IO.Mask.Input("mask", optional=True), - ], - outputs=[IO.Latent.Output()], - ) - - @classmethod - def execute(cls, destination, source, x, y, resize_source, mask=None) -> IO.NodeOutput: - output = destination.copy() - # Ensure we work on a copy of the samples to remain non-destructive - dst_samples = destination["samples"].clone() - src_samples = source["samples"] - - output["samples"] = video_latent_composite( - dst_samples, - src_samples, - x, y, - mask, - multiplier=8, - resize_source=resize_source - ) - return IO.NodeOutput(output) - + class ImageCompositeMasked(IO.ComfyNode): @classmethod def define_schema(cls): @@ -588,7 +485,6 @@ class MaskExtension(ComfyExtension): async def get_node_list(self) -> list[type[IO.ComfyNode]]: return [ LatentCompositeMasked, - VideoLatentCompositeMasked, ImageCompositeMasked, MaskToImage, ImageToMask, From ae54d7a9872ff40be42af68ad828e0f301317ecb Mon Sep 17 00:00:00 2001 From: David Lee <47388918+Pizzawookiee@users.noreply.github.com> Date: Mon, 4 May 2026 13:50:39 -0400 Subject: [PATCH 6/9] Fix compositing error, change input arguments --- comfy_extras/nodes_custom_sampler.py | 95 +++++++++++----------------- 1 file changed, 37 insertions(+), 58 deletions(-) diff --git a/comfy_extras/nodes_custom_sampler.py b/comfy_extras/nodes_custom_sampler.py index 4eb0eec17179..f90545b9a177 100644 --- a/comfy_extras/nodes_custom_sampler.py +++ b/comfy_extras/nodes_custom_sampler.py @@ -15,9 +15,7 @@ def video_latent_composite(destination, source, x, y, mask=None, multiplier=8, r # destination/source shape: [B, C, F, H, W] source = source.to(destination.device) - # 1. 
Spatial Resizing for Source if resize_source: - # size=(Frames, Height, Width). We keep source's F, but match destination's H, W target_size = (source.shape[2], destination.shape[3], destination.shape[4]) source = torch.nn.functional.interpolate( source, @@ -26,22 +24,14 @@ def video_latent_composite(destination, source, x, y, mask=None, multiplier=8, r align_corners=False ) - # 2. Coordinate Scaling x_latent = x // multiplier y_latent = y // multiplier - # 3. Mask Processing (Input: [F, H, W]) if mask is None: mask = torch.ones_like(source) else: mask = mask.to(destination.device, copy=True) - - # Convert [F, H, W] -> [1, 1, F, H, W] - # This allows it to broadcast across any Batch or Channel in 'source' mask = mask.unsqueeze(0).unsqueeze(0) - - # Resize mask spatially, preserving its frame count - # size=(mask_frames, source_height, source_width) mask_target_size = (mask.shape[2], source.shape[3], source.shape[4]) mask = torch.nn.functional.interpolate( mask, @@ -50,97 +40,87 @@ def video_latent_composite(destination, source, x, y, mask=None, multiplier=8, r align_corners=False ) - # 4. Dimension Calculations for Spatial Slicing dst_h, dst_w = destination.shape[3], destination.shape[4] src_h, src_w = source.shape[3], source.shape[4] - # Calculate visible overlap region visible_h = max(0, min(y_latent + src_h, dst_h) - max(0, y_latent)) visible_w = max(0, min(x_latent + src_w, dst_w) - max(0, x_latent)) if visible_h <= 0 or visible_w <= 0: return destination - # Determine slicing offsets src_top = max(0, -y_latent) src_left = max(0, -x_latent) dst_top = max(0, y_latent) dst_left = max(0, x_latent) - # 5. Slicing and Blending - # destination/source/mask are now all 5D: [B, C, F, H, W] - # We slice only the H and W dimensions (indices 3 and 4) m = mask[:, :, :, src_top:src_top+visible_h, src_left:src_left+visible_w] s = source[:, :, :, src_top:src_top+visible_h, src_left:src_left+visible_w] d = destination[:, :, :, dst_top:dst_top+visible_h, dst_left:dst_left+visible_w] - # Combine using the mask destination[:, :, :, dst_top:dst_top+visible_h, dst_left:dst_left+visible_w] = (m * s) + ((1.0 - m) * d) return destination -def time_to_move_sample(model, noise, steps, cfg, sampler_name, scheduler, positive, negative, latent_image, reference_latent_image, reference_latent_mask, denoise=1.0, start_step=None, time_to_move_last_step=None, last_step=None, force_full_denoise=False, noise_mask=None, sigmas=None, callback=None, disable_pbar=False, seed=None): - sampler = comfy.samplers.KSampler(model, steps=steps, device=model.load_device, sampler=sampler_name, scheduler=scheduler, denoise=denoise, model_options=model.model_options) +def time_to_move_sample(model, noise, steps, cfg, sampler_name, scheduler, positive, negative, latent_image, latent_mask, denoise=1.0, start_step=None, time_to_move_last_step=None, last_step=None, force_full_denoise=False, noise_mask=None, sigmas=None, callback=None, disable_pbar=False, seed=None): - sigmas = sampler.sigmas + sampler = comfy.samplers.KSampler(model, steps=steps, device=model.load_device, sampler=sampler_name, scheduler=scheduler, denoise=denoise, model_options=model.model_options) + model_sampling = model.get_model_object("model_sampling") + process_latent_out = model.get_model_object("process_latent_out") + process_latent_in = model.get_model_object("process_latent_in") + + reference_latent_image = latent_image.clone() + + reference_sigmas = sampler.sigmas + reference_noise = noise.clone() - if last_step == None: + if last_step == None or last_step > 
steps: last_step = steps - if time_to_move_last_step == None: - time_to_move_last_step = last_step - - if time_to_move_last_step > last_step: + if time_to_move_last_step == None or time_to_move_last_step > last_step: time_to_move_last_step = last_step if start_step == None: start_step = 0 - #during each step, composite the reference latent back onto the partially sampled latent using the reference latent mask - - for i in range (min(last_step, len(sigmas) - 1) - start_step): + for i in range (min(last_step, steps) - start_step): if i > 0: - #don't add new noise to samples after first loop iteration - noise = torch.zeros(latent_image.size(), dtype=latent_image.dtype, layout=latent_image.layout, device="cpu") + #don't add new noise to samples after first step taken + noise = torch.zeros(latent_image.size(), dtype=latent_image.dtype, layout=latent_image.layout, device="cpu") - if i < last_step - 1: + temp_start = start_step + i + + if temp_start < last_step - 1: temp_force_full_denoise = False else: temp_force_full_denoise = force_full_denoise - temp_start = start_step + i - samples = sampler.sample(noise, positive, negative, cfg=cfg, latent_image=latent_image, start_step=temp_start, last_step=temp_start + 1, force_full_denoise=temp_force_full_denoise, denoise_mask=noise_mask, sigmas=sigmas, callback=callback, disable_pbar=disable_pbar, seed=seed) - #add noise to the reference latent image (referenced from AddNoise node) - if temp_start < time_to_move_last_step: - - model_sampling = model.get_model_object("model_sampling") - process_latent_out = model.get_model_object("process_latent_out") - process_latent_in = model.get_model_object("process_latent_in") - - scale = sigmas[temp_start + 1].to(noise.device) - + scale = reference_sigmas[temp_start + 1].to(noise.device) + if torch.count_nonzero(reference_latent_image) > 0: #Don't shift the empty latent image. 
- reference_latent_image = process_latent_in(reference_latent_image) - noisy = model_sampling.noise_scaling(scale, noise, reference_latent_image) - noisy = process_latent_out(noisy) - noisy = torch.nan_to_num(noisy, nan=0.0, posinf=0.0, neginf=0.0).to(samples.device) + noisy = model_sampling.noise_scaling(scale, reference_noise, process_latent_in(reference_latent_image)) + noisy = model_sampling.inverse_noise_scaling(scale, noisy) + noisy = process_latent_out(noisy) + else: + noisy = reference_latent_image + + noisy.to(samples.device) + + samples = video_latent_composite(samples, noisy, 0, 0, latent_mask, multiplier=1, resize_source=True) - samples = video_latent_composite(samples, noisy, 0, 0, reference_latent_mask, multiplier=8, resize_source=True) + latent_image = samples samples = samples.to(device=comfy.model_management.intermediate_device(), dtype=comfy.model_management.intermediate_dtype()) return samples -def time_to_move_common_ksampler(model, seed, steps, cfg, sampler_name, scheduler, positive, negative, latent, reference_latent, reference_latent_mask, denoise=1.0, disable_noise=False, start_step=None, time_to_move_last_step = None, last_step=None, force_full_denoise=False): +def time_to_move_common_ksampler(model, seed, steps, cfg, sampler_name, scheduler, positive, negative, latent, latent_mask, denoise=1.0, disable_noise=False, start_step=None, time_to_move_last_step = None, last_step=None, force_full_denoise=False): latent_image = latent["samples"] latent_image = comfy.sample.fix_empty_latent_channels(model, latent_image, latent.get("downscale_ratio_spacial", None)) - reference_latent_image = reference_latent["samples"] - reference_latent_image = comfy.sample.fix_empty_latent_channels(model, reference_latent_image, reference_latent.get("downscale_ratio_spacial", None)) - if disable_noise: noise = torch.zeros(latent_image.size(), dtype=latent_image.dtype, layout=latent_image.layout, device="cpu") else: @@ -153,7 +133,7 @@ def time_to_move_common_ksampler(model, seed, steps, cfg, sampler_name, schedule callback = latent_preview.prepare_callback(model, steps) disable_pbar = not comfy.utils.PROGRESS_BAR_ENABLED - samples = time_to_move_sample(model, noise, steps, cfg, sampler_name, scheduler, positive, negative, latent_image, reference_latent_image, reference_latent_mask, + samples = time_to_move_sample(model, noise, steps, cfg, sampler_name, scheduler, positive, negative, latent_image, latent_mask, denoise=denoise, start_step=start_step, time_to_move_last_step = time_to_move_last_step, last_step=last_step, force_full_denoise=force_full_denoise, noise_mask=noise_mask, callback=callback, disable_pbar=disable_pbar, seed=seed) out = latent.copy() @@ -1127,7 +1107,7 @@ def execute(cls, noise, guider, sampler, sigmas, latent_image) -> io.NodeOutput: return io.NodeOutput(out, out_denoised) sample = execute - + class TimeToMoveKSamplerAdvanced(io.ComfyNode): @classmethod @@ -1145,9 +1125,8 @@ def define_schema(cls): io.Combo.Input("scheduler", options = comfy.samplers.KSampler.SCHEDULERS), io.Conditioning.Input("positive"), io.Conditioning.Input("negative"), - io.Latent.Input("latent_image", tooltip = "Generally should be the same as reference_latent_image."), - io.Latent.Input("reference_latent_image"), - io.Mask.Input("reference_latent_mask", tooltip = "Make sure mask is the same length as the latents rather than the original video."), + io.Latent.Input("latent_image"), + io.Mask.Input("latent_mask", tooltip = "Make sure mask is the same length as the latents rather than the original 
video."), io.Int.Input("start_at_step", default = 0, min = 0, max = 10000, advanced = True, tooltip = "Generally should set at a step greater than 0."), io.Int.Input("time_to_move_end_at_step", default = 0, min = 0, max = 10000, advanced = True, tooltip = "Generally should set at a step greater than 0 and less than total number of steps."), io.Int.Input("end_at_step", default = 10000, min = 0, max = 10000, advanced = True, tooltip = "Use just like typical end_at_step with normal KSamplerAdvanced"), @@ -1159,7 +1138,7 @@ def define_schema(cls): ) @classmethod - def execute(cls, model, add_noise, noise_seed, steps, cfg, sampler_name, scheduler, positive, negative, latent_image, reference_latent_image, reference_latent_mask, start_at_step, time_to_move_end_at_step, end_at_step, return_with_leftover_noise, denoise=1.0) -> io.NodeOutput: + def execute(cls, model, add_noise, noise_seed, steps, cfg, sampler_name, scheduler, positive, negative, latent_image, latent_mask, start_at_step, time_to_move_end_at_step, end_at_step, return_with_leftover_noise, denoise=1.0) -> io.NodeOutput: force_full_denoise = True if return_with_leftover_noise == "enable": force_full_denoise = False @@ -1167,7 +1146,7 @@ def execute(cls, model, add_noise, noise_seed, steps, cfg, sampler_name, schedul if add_noise == "disable": disable_noise = True - return time_to_move_common_ksampler(model, noise_seed, steps, cfg, sampler_name, scheduler, positive, negative, latent_image, reference_latent_image, reference_latent_mask, denoise=denoise, disable_noise=disable_noise, start_step=start_at_step, time_to_move_last_step = time_to_move_end_at_step, last_step=end_at_step, force_full_denoise=force_full_denoise) + return time_to_move_common_ksampler(model, noise_seed, steps, cfg, sampler_name, scheduler, positive, negative, latent_image, latent_mask, denoise=denoise, disable_noise=disable_noise, start_step=start_at_step, time_to_move_last_step = time_to_move_end_at_step, last_step=end_at_step, force_full_denoise=force_full_denoise) class AddNoise(io.ComfyNode): @classmethod From 8dd41ef82e9cf254a43a5afd1fba84901f774f76 Mon Sep 17 00:00:00 2001 From: David Lee <47388918+Pizzawookiee@users.noreply.github.com> Date: Mon, 4 May 2026 15:26:21 -0400 Subject: [PATCH 7/9] Handle case where start_step is greater than last_step --- comfy_extras/nodes_custom_sampler.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/comfy_extras/nodes_custom_sampler.py b/comfy_extras/nodes_custom_sampler.py index f90545b9a177..d73ceffbf130 100644 --- a/comfy_extras/nodes_custom_sampler.py +++ b/comfy_extras/nodes_custom_sampler.py @@ -83,7 +83,14 @@ def time_to_move_sample(model, noise, steps, cfg, sampler_name, scheduler, posit if start_step == None: start_step = 0 - for i in range (min(last_step, steps) - start_step): + total_iterations = min(last_step, steps) - start_step + if total_iterations <= 0: + return latent_image.to( + device=comfy.model_management.intermediate_device(), + dtype=comfy.model_management.intermediate_dtype(), + ) + + for i in range(total_iterations): if i > 0: #don't add new noise to samples after first step taken noise = torch.zeros(latent_image.size(), dtype=latent_image.dtype, layout=latent_image.layout, device="cpu") From d56a093800e3d2ecf89ff5d0d835c3706a2b5c38 Mon Sep 17 00:00:00 2001 From: David Lee <47388918+Pizzawookiee@users.noreply.github.com> Date: Mon, 4 May 2026 15:30:51 -0400 Subject: [PATCH 8/9] Update description of RGBMaskToLatentMask Clarifies that node is intended to work with causal Video 
VAEs. --- comfy_extras/nodes_mask.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy_extras/nodes_mask.py b/comfy_extras/nodes_mask.py index 5d83aff0c868..3704ce4683e6 100644 --- a/comfy_extras/nodes_mask.py +++ b/comfy_extras/nodes_mask.py @@ -439,7 +439,7 @@ def define_schema(cls): return IO.Schema( node_id="RGBMasktoLatentMask", search_aliases=["rgb mask to latent mask", "rgb mask", "latent mask"], - description="Helpful for applying masks to video latents if the VAE uses spatial downsampling.", + description="Converts an RGB mask to a latent-space mask for use with causal Video VAEs (e.g., Wan).", category="latent", inputs=[ IO.Mask.Input("mask", optional=False), From de97192962118cd91b251521887e2c6820bb3891 Mon Sep 17 00:00:00 2001 From: David Lee <47388918+Pizzawookiee@users.noreply.github.com> Date: Mon, 4 May 2026 15:46:35 -0400 Subject: [PATCH 9/9] Update RGBMaskToLatentMask to raise more detailed error if non-causal VAE inputted --- comfy_extras/nodes_mask.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/comfy_extras/nodes_mask.py b/comfy_extras/nodes_mask.py index 3704ce4683e6..e9c2f3982428 100644 --- a/comfy_extras/nodes_mask.py +++ b/comfy_extras/nodes_mask.py @@ -453,6 +453,8 @@ def execute(cls, mask, vae) -> IO.NodeOutput: # Ensure we work on a copy of the mask to remain non-destructive mask_copy = mask.clone() downscale_ratio = vae.downscale_ratio + if not isinstance(downscale_ratio, tuple) or len(downscale_ratio) < 3: + raise ValueError("RGBMaskToLatentMask requires a causal Video VAE (e.g., Wan). The provided VAE does not have a compatible downscale_ratio.") k = (mask.shape[0] - 1) // (downscale_ratio[0](mask.shape[0]) - 1) if (downscale_ratio[0](mask.shape[0]) - 1) > 1 else 1 return IO.NodeOutput(convert_rgb_mask_to_latent_mask(mask_copy, k, spatial_downsample_h = downscale_ratio[1], spatial_downsample_w = downscale_ratio[2]))
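
Note on the temporal/spatial math in RGBMaskToLatentMask: the standalone sketch below traces how the sampling factor k and the spatial downscale map an RGB-space mask [T, H, W] onto latent dimensions [T_latent, H_latent, W_latent], mirroring convert_rgb_mask_to_latent_mask. It is not part of the patch; the downscale_ratio tuple (a temporal callable plus two spatial ints) and the 81-frame input are assumptions chosen to resemble a Wan-style causal video VAE, whereas the node reads the real value from vae.downscale_ratio.

# Standalone sketch (assumptions noted above), using only the logic shown in the patch.
import torch

downscale_ratio = (lambda t: (t - 1) // 4 + 1, 8, 8)   # assumed Wan-style (temporal callable, spatial_h, spatial_w)

mask = torch.ones(81, 480, 832)                         # [T, H, W] mask in pixel space
t_latent = downscale_ratio[0](mask.shape[0])            # 21 latent frames for 81 input frames
k = (mask.shape[0] - 1) // (t_latent - 1) if (t_latent - 1) > 1 else 1   # k = 4

# Temporal sampling: keep frame 0, then every k-th frame after it -> 21 frames
sampled = torch.cat([mask[0:1], mask[1::k]], dim=0)

# Spatial downsampling: nearest-neighbor resize of H and W, frame count unchanged
latent_mask = torch.nn.functional.interpolate(
    sampled.unsqueeze(0).unsqueeze(0),                  # [1, 1, T_latent, H, W]
    size=(sampled.shape[0],
          mask.shape[1] // downscale_ratio[1],
          mask.shape[2] // downscale_ratio[2]),
    mode="nearest",
).squeeze(0).squeeze(0)

print(sampled.shape)      # torch.Size([21, 480, 832])
print(latent_mask.shape)  # torch.Size([21, 60, 104])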