
Commit c535da1

Authored by MrShahzebKhoso, merveenoyan, sergiopaniego, Vaibhavs10, and pcuenca
Add video-to-video task (#1772)
- Added about.md with a definition and summary of the task.
- Added data.ts with task metadata:
  - Datasets (VIRESET, LongV-EVAL, SeedVR_VideoDemos)
  - Models (e.g., Lucy-Edit-Dev, SeedVR2, Shape-for-Motion, etc.)
  - Demo Spaces (lucy-edit-dev, SeedVR2-3B)
  - Task summary describing scope and applications.

Co-authored-by: Merve Noyan <merve@huggingface.co>
Co-authored-by: Sergio Paniego Blanco <sergiopaniegoblanco@gmail.com>
Co-authored-by: vb <vaibhavs10@gmail.com>
Co-authored-by: Pedro Cuenca <pedro@huggingface.co>
1 parent d7f0d9b

File tree

- packages/tasks/src/tasks/index.ts
- packages/tasks/src/tasks/video-to-video/about.md
- packages/tasks/src/tasks/video-to-video/data.ts

3 files changed: +155 −1 lines changed

packages/tasks/src/tasks/index.ts

Lines changed: 2 additions & 1 deletion
```diff
@@ -46,6 +46,7 @@ import imageTo3D from "./image-to-3d/data.js";
 import textTo3D from "./text-to-3d/data.js";
 import keypointDetection from "./keypoint-detection/data.js";
 import videoTextToText from "./video-text-to-text/data.js";
+import videoToVideo from "./video-to-video/data.js";
 
 export type * from "./audio-classification/inference.js";
 export type * from "./automatic-speech-recognition/inference.js";
@@ -250,7 +251,7 @@ export const TASKS_DATA: Record<PipelineType, TaskData | undefined> = {
 	translation: getData("translation", translation),
 	"unconditional-image-generation": getData("unconditional-image-generation", unconditionalImageGeneration),
 	"video-text-to-text": getData("video-text-to-text", videoTextToText),
-	"video-to-video": getData("video-to-video", placeholder),
+	"video-to-video": getData("video-to-video", videoToVideo),
 	"visual-question-answering": getData("visual-question-answering", visualQuestionAnswering),
 	"voice-activity-detection": undefined,
 	"zero-shot-classification": getData("zero-shot-classification", zeroShotClassification),
```
packages/tasks/src/tasks/video-to-video/about.md

Lines changed: 86 additions & 0 deletions
## Use Cases

### Video Style Transfer

Apply artistic or cinematic styles to a video while preserving motion and structure. For example, convert real footage into anime, painting, or film-like visuals.

### Frame Interpolation

Generate intermediate frames to make videos smoother or convert 30 FPS videos to 60 FPS. This improves motion flow and enables realistic slow-motion playback.
### Video Super-Resolution

Enhance low-resolution videos into high-definition outputs with preserved detail and sharpness. Ideal for restoring old footage or improving video quality.
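For contrast with learned super-resolution, here is a minimal classical baseline that upscales each frame independently with Lanczos resampling; models like SeedVR2 recover detail that plain resampling cannot and keep it consistent across frames. The input path, output path, and scale factor are assumptions for illustration.

```python
# Classical per-frame upscaling baseline using Lanczos resampling.
# Learned video SR models reconstruct lost detail and enforce temporal
# consistency; plain resampling only smooths the existing pixels.
from PIL import Image

from diffusers.utils import export_to_video, load_video

frames = load_video("input.mp4")  # placeholder path
scale = 2  # 2x upscale, e.g. 480p -> 960p

upscaled = [
    frame.resize((frame.width * scale, frame.height * scale), Image.LANCZOS)
    for frame in frames
]

export_to_video(upscaled, "output_2x.mp4", fps=24)
```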
### Motion Transfer

Transfer the motion from a source video to another subject while maintaining identity and environment. This enables realistic animation or gesture replication.

### Video Editing & Synthesis

Add, remove, or modify objects in videos while keeping lighting and motion consistent. Perfect for visual effects, object replacement, and content-aware editing.

### Temporal Modification

Change a video’s time of day or environmental conditions, such as day to night or summer to winter. These models preserve motion dynamics and lighting continuity.

### Virtual Try-on

Simulate clothing changes or outfit fitting in videos while keeping the person’s motion and identity intact. Useful for digital fashion and e-commerce applications.
## Inference

Below is an example demonstrating how to use [Lucy-Edit-Dev](https://huggingface.co/decart-ai/Lucy-Edit-Dev) to perform video costume editing, changing a character’s clothing while maintaining identity and motion consistency. Lucy-Edit-Dev is trained on paired video edits, captioned videos, and extended image–text datasets.

```python
# pip install torch diffusers
from typing import List

import torch
from PIL import Image

from diffusers import AutoencoderKLWan, LucyEditPipeline
from diffusers.utils import export_to_video, load_video

url = "https://d2drjpuinn46lb.cloudfront.net/painter_original_edit.mp4"
prompt = "Change the apron and blouse to a classic clown costume: satin polka-dot jumpsuit in bright primary colors, ruffled white collar, oversized pom-pom buttons, white gloves, oversized red shoes, red foam nose; soft window light from left, eye-level medium shot, natural folds and fabric highlights."
negative_prompt = ""
num_frames = 81
height = 480
width = 832

# Trim the clip to `num_frames` frames and resize each one to the target resolution.
def convert_video(video: List[Image.Image]) -> List[Image.Image]:
    video = video[:num_frames]
    return [frame.resize((width, height)) for frame in video]

video = load_video(url, convert_method=convert_video)

model_id = "decart-ai/Lucy-Edit-Dev"
vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
pipe = LucyEditPipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16)
pipe.to("cuda")

output = pipe(
    prompt=prompt,
    video=video,
    negative_prompt=negative_prompt,
    height=height,
    width=width,
    num_frames=num_frames,
    guidance_scale=5.0,
).frames[0]

export_to_video(output, "output.mp4", fps=24)
```
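For reference, the settings above edit 81 frames at 832×480 and export them at 24 FPS, which works out to roughly 3.4 seconds (81 / 24 ≈ 3.375 s) of video per run.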
For more inference examples, check out the model cards on Hugging Face, where you can try the provided example code.

## Useful Resources

You can read more about the datasets, model architectures, and open-source implementations in the following repositories:

- [Lumen](https://github.com/Kunbyte-AI/Lumen) - Official implementation of Lumen for text-guided video editing.
- [VIRES](https://github.com/suimuc/VIRES) - Implementation for sketch- and text-guided video instance repainting.
- [ECCV2022-RIFE](https://github.com/hzwer/ECCV2022-RIFE) - Real-time video frame interpolation via intermediate flow estimation.
- [StableVSR](https://github.com/claudiom4sir/StableVSR) - Super-resolution method to enhance perceptual video quality.
packages/tasks/src/tasks/video-to-video/data.ts

Lines changed: 67 additions & 0 deletions
```ts
import type { TaskDataCustom } from "../index.js";

const taskData: TaskDataCustom = {
	datasets: [
		{
			description: "Dataset with detailed annotations for training and benchmarking video instance editing.",
			id: "suimu/VIRESET",
		},
		{
			description: "Dataset to evaluate models on long video generation and understanding.",
			id: "zhangsh2001/LongV-EVAL",
		},
		{
			description: "Collection of 104 demo videos from the SeedVR/SeedVR2 series showcasing model outputs.",
			id: "Iceclear/SeedVR_VideoDemos",
		},
	],
	demo: {
		inputs: [
			{
				filename: "input.gif",
				type: "img",
			},
		],
		outputs: [
			{
				filename: "output.gif",
				type: "img",
			},
		],
	},
	metrics: [],
	models: [
		{
			description: "Model for editing outfits, characters, and scenery in videos.",
			id: "decart-ai/Lucy-Edit-Dev",
		},
		{
			description: "Framework that uses 3D mesh proxies for precise, consistent video editing.",
			id: "LeoLau/Shape-for-Motion",
		},
		{
			description: "Model for generating physics-aware videos from input videos and control conditions.",
			id: "nvidia/Cosmos-Transfer2.5-2B",
		},
		{
			description: "A model to upscale input videos, designed for seamless use with ComfyUI.",
			id: "numz/SeedVR2_comfyUI",
		},
	],
	spaces: [
		{
			description: "Interactive demo space for Lucy-Edit-Dev video editing.",
			id: "decart-ai/lucy-edit-dev",
		},
		{
			description: "Demo space for SeedVR2-3B showcasing video upscaling and restoration.",
			id: "ByteDance-Seed/SeedVR2-3B",
		},
	],
	summary:
		"Video-to-video models take one or more videos as input and generate new videos as output. They can enhance quality, interpolate frames, modify styles, or create new motion dynamics, enabling creative applications, video production, and research.",
	widgetModels: [],
	youtubeId: "",
};

export default taskData;
```
