@yiyixuxu commented Nov 8, 2025

wan2.1 text-to-video

from diffusers import ModularPipeline
from diffusers.utils import export_to_video
import torch

model_id = "Wan-AI/Wan2.1-T2V-1.3B-Diffusers"
# model_id = "Wan-AI/Wan2.1-T2V-14B-Diffusers"
pipeline = ModularPipeline.from_pretrained(model_id)
pipeline.load_components(torch_dtype={"default": torch.bfloat16, "vae": torch.float32})
pipeline.to("cuda:0")

print(pipeline)

prompt = "A cat and a dog baking a cake together in a kitchen. The cat is carefully measuring flour, while the dog is stirring the batter with a wooden spoon. The kitchen is cozy, with sunlight streaming through the window."
negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards"

video = pipeline(
    prompt=prompt, 
    negative_prompt=negative_prompt, 
    num_inference_steps=30,
    generator=torch.Generator().manual_seed(0)
    ).videos[0]
export_to_video(video, "yiyi_test_3_output.mp4", fps=16)
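
The call above uses the block defaults for resolution and clip length. Assuming the Wan blocks expose the same num_frames, height, and width inputs as the standard WanPipeline (print(pipeline) shows the actual input list), they can be overridden per call. A minimal sketch under that assumption:

# a sketch: num_frames/height/width are assumed to match the standard
# WanPipeline inputs; Wan expects num_frames of the form 4k + 1 (81 is the
# usual default)
video = pipeline(
    prompt=prompt,
    negative_prompt=negative_prompt,
    height=480,
    width=832,
    num_frames=81,
    num_inference_steps=30,
    generator=torch.Generator().manual_seed(0),
).videos[0]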

wan2.1 image-to-video

from diffusers import ModularPipeline
from diffusers.utils import export_to_video, load_image
import torch

# model_id = "Wan-AI/Wan2.1-I2V-14B-720P-Diffusers"
model_id = "Wan-AI/Wan2.1-I2V-14B-480P-Diffusers"
pipeline = ModularPipeline.from_pretrained(model_id)
print(pipeline)
print(pipeline.blocks.get_execution_blocks("image"))

pipeline.load_components(torch_dtype={"default": torch.bfloat16, "vae": torch.float32})
pipeline.to("cuda:0")

image = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/astronaut.jpg"
)

prompt = (
    "An astronaut hatching from an egg, on the surface of the moon, the darkness and depth of space realised in "
    "the background. High quality, ultrarealistic detail and breath-taking movie-like camera shot."
)
negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards"


video = pipeline(
    prompt=prompt, 
    negative_prompt=negative_prompt, 
    image=image,
    num_inference_steps=30,
    generator=torch.Generator().manual_seed(0)
    ).videos[0]

print(f"video generated with frame size: {video[0].size}")
export_to_video(video, "yiyi_test_3_1_output.mp4", fps=16)
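
get_execution_blocks("image") prints only the blocks that actually run when image is passed: image is a trigger input that routes execution through the image-conditioned branch. A minimal sketch for listing the top-level sub-blocks by name (assuming sub_blocks behaves like a dict, as in the modular settings example at the end):

# a sketch: enumerate the top-level sub-blocks to see where the
# image-conditioning steps sit
for name, block in pipeline.blocks.sub_blocks.items():
    print(name, "->", type(block).__name__)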

wan2.1 first-last-frame-to-video (FLF2V)

from diffusers import ModularPipeline
from diffusers.utils import export_to_video, load_image
import torch

model_id = "Wan-AI/Wan2.1-FLF2V-14B-720P-diffusers"
pipeline = ModularPipeline.from_pretrained(model_id)
print(pipeline)
print(pipeline.blocks.get_execution_blocks("last_image", "image"))

pipeline.load_components(torch_dtype={"default": torch.bfloat16, "image_encoder": torch.float32, "vae": torch.float32})
pipeline.to("cuda:1")

first_frame = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flf2v_input_first_frame.png")
last_frame = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flf2v_input_last_frame.png")


prompt = "CG animation style, a small blue bird takes off from the ground, flapping its wings. The bird's feathers are delicate, with a unique pattern on its chest. The background shows a blue sky with white clouds under bright sunshine. The camera follows the bird upward, capturing its flight and the vastness of the sky from a close-up, low-angle perspective."

video = pipeline(
    prompt=prompt, 
    image=first_frame,
    last_image=last_frame,
    num_inference_steps=30,
    generator=torch.Generator().manual_seed(0)
    ).videos[0]

print(f"video generated with frame size: {video[0].size}")
export_to_video(video, "yiyi_test_3_2_output.mp4", fps=16)
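
FLF2V conditions on both endpoints, so the first and last frames should share a resolution. A defensive sketch (not in the original) using plain PIL:

# load_image returns a PIL image; resize the last frame to match the first
# so both conditioning frames share a resolution
if first_frame.size != last_frame.size:
    last_frame = last_frame.resize(first_frame.size)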

wan2.2 text-to-video

from diffusers import ModularPipeline, ComponentsManager
from diffusers.utils import export_to_video
import torch

model_id = "Wan-AI/Wan2.2-T2V-A14B-Diffusers"
components = ComponentsManager()

pipeline = ModularPipeline.from_pretrained(model_id, components_manager=components)
pipeline.load_components(torch_dtype={"default": torch.bfloat16, "vae": torch.float32})

components.enable_auto_cpu_offload(device="cuda:1")

prompt = "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage."
negative_prompt = "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走"

video = pipeline(
    prompt=prompt, 
    negative_prompt=negative_prompt, 
    height=720,
    width=1280,
    num_inference_steps=30,
    generator=torch.Generator().manual_seed(0)
    ).videos[0]
export_to_video(video, "yiyi_test_3_3_output.mp4", fps=16)
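
The A14B checkpoint ships two transformers, so ComponentsManager with enable_auto_cpu_offload is what keeps this runnable on one GPU: it tracks every loaded model and shuttles each between CPU and the target device on demand. Assuming the manager also reuses components that are already registered (an assumption, not verified here), the same instance can be shared with a second pipeline, e.g. the wan2.2 image-to-video pipeline from the next section:

# a sketch, assuming ComponentsManager reuses already-registered components
# instead of loading duplicates
i2v_pipeline = ModularPipeline.from_pretrained(
    "Wan-AI/Wan2.2-I2V-A14B-Diffusers", components_manager=components
)
i2v_pipeline.load_components(torch_dtype={"default": torch.bfloat16, "vae": torch.float32})
print(components)  # inspect what is registered and where each model lives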

wan2.2 image-to-video

from diffusers import ModularPipeline, ComponentsManager
from diffusers.utils import export_to_video, load_image
import torch

model_id = "Wan-AI/Wan2.2-I2V-A14B-Diffusers"
components = ComponentsManager()

pipeline = ModularPipeline.from_pretrained(model_id, components_manager=components)
print(pipeline)
pipeline.load_components(torch_dtype={"default": torch.bfloat16, "vae": torch.float32})
components.enable_auto_cpu_offload(device="cuda:1")

image = load_image(
    "https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/wan_i2v_input.JPG"
)
prompt = "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside."
negative_prompt = "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走"
generator = torch.Generator(device="cuda:1").manual_seed(0)

video = pipeline(
    prompt=prompt, 
    negative_prompt=negative_prompt, 
    image=image,
    height=480,
    width=832,
    num_inference_steps=40,
    generator=generator,
    output="videos"
    )[0]
export_to_video(video, "yiyi_test_3_4_output.mp4", fps=16)
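
Note the different return style here: output="videos" asks the pipeline for the videos output directly, so the call returns the list itself rather than the full state object. The equivalent state-based call, using the same .videos attribute as the wan2.1 examples:

# equivalent to the call above, but returning the full pipeline state and
# reading the videos attribute off it
state = pipeline(
    prompt=prompt,
    negative_prompt=negative_prompt,
    image=image,
    height=480,
    width=832,
    num_inference_steps=40,
    generator=generator,
)
video = state.videos[0]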

modular settings example

# modular settings
from diffusers import ModularPipeline
from diffusers.utils import export_to_video, load_image
import torch

model_id = "Wan-AI/Wan2.1-I2V-14B-480P-Diffusers"
blocks = ModularPipeline.from_pretrained(model_id).blocks
print(blocks)

text_encoder_node = blocks.sub_blocks["text_encoder"].init_pipeline(model_id)
image_encoder_node = blocks.sub_blocks["image_encoder"].init_pipeline(model_id)
vae_encoder_node = blocks.sub_blocks["vae_image_encoder"].init_pipeline(model_id)
denoise_node = blocks.sub_blocks["denoise"].init_pipeline(model_id)
decoder_node = blocks.sub_blocks["decode"].init_pipeline(model_id)


image = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/astronaut.jpg"
)

text_encoder_node.load_components(torch_dtype=torch.bfloat16)
text_encoder_node.to("cuda:0")

prompt = (
    "An astronaut hatching from an egg, on the surface of the moon, the darkness and depth of space realised in "
    "the background. High quality, ultrarealistic detail and breath-taking movie-like camera shot."
)
negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards"

text_embeddings = text_encoder_node(prompt=prompt, negative_prompt=negative_prompt).get_by_kwargs("denoiser_input_fields")
text_encoder_node.to("cpu")

image_encoder_node.load_components(torch_dtype=torch.float32)
image_encoder_node.to("cuda:0")

image_embeds = image_encoder_node(image=image).image_embeds
image_encoder_node.to("cpu")

vae_encoder_node.load_components(torch_dtype=torch.float32)
vae_encoder_node.to("cuda:0")

first_frame_latents = vae_encoder_node(image=image).first_frame_latents
vae_encoder_node.to("cpu")


denoise_node.load_components(torch_dtype=torch.bfloat16)
denoise_node.to("cuda:0")

latents = denoise_node(
    **text_embeddings, 
    first_frame_latents=first_frame_latents,
    image_embeds=image_embeds,
    num_inference_steps=30, 
    generator=torch.Generator().manual_seed(0)
    ).latents

denoise_node.to("cpu")
# reuse the VAE already loaded by the vae_encoder_node instead of loading a second copy
decoder_node.update_components(vae=vae_encoder_node.components["vae"])
decoder_node.to("cuda:0")

video = decoder_node(latents=latents).videos[0]
export_to_video(video, "yiyi_test_3_6_output.mp4", fps=16)
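
Since each node keeps its own components between calls, the expensive text/image encodes only need to run once; you can sweep seeds by rerunning just the denoise and decode nodes against the cached embeddings. A sketch (output filenames are made up):

# a sketch: reuse the cached embeddings and first-frame latents, rerunning
# only denoise + decode per seed (keep both nodes on the GPU only if they
# fit together; otherwise move them between devices as in the example above)
denoise_node.to("cuda:0")
for seed in (1, 2, 3):
    latents = denoise_node(
        **text_embeddings,
        first_frame_latents=first_frame_latents,
        image_embeds=image_embeds,
        num_inference_steps=30,
        generator=torch.Generator().manual_seed(seed),
    ).latents
    video = decoder_node(latents=latents).videos[0]
    export_to_video(video, f"yiyi_test_seed{seed}_output.mp4", fps=16)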

@HuggingFaceDocBuilderDev commented

The docs for this PR live here. All of your documentation changes will be reflected on that endpoint. The docs are available until 30 days after the last update.
