feat: add sample for vl reasoning

ikesnowy · ikesnowy · commit 01beef6f7946 · 2025-11-10T23:42:36.000+08:00
diff --git a/sample/Cnblogs.DashScope.Sample/Multimodal/ImageInputSample.cs b/sample/Cnblogs.DashScope.Sample/Multimodal/ImageInputSample.cs
@@ -0,0 +1,79 @@
+﻿using System.Text;
+using Cnblogs.DashScope.Core;
+
+namespace Cnblogs.DashScope.Sample.Multimodal;
+
+/// <summary>Streams a thinking-enabled multimodal chat about two images, printing the reply and token usage.</summary>
+public class ImageInputSample : ISample
+{
+    /// <inheritdoc />
+    public string Description => "Chat with image input";
+
+    /// <inheritdoc />
+    public async Task RunAsync(IDashScopeClient client)
+    {
+        var messages = new List<MultimodalMessage>();
+        messages.Add(
+            MultimodalMessage.User(
+            [
+                MultimodalMessageContent.ImageContent(
+                    "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241022/emyrja/dog_and_girl.jpeg"),
+                MultimodalMessageContent.ImageContent("https://dashscope.oss-cn-beijing.aliyuncs.com/images/tiger.png"),
+                MultimodalMessageContent.TextContent("这些图展现了什么内容？")
+            ]));
+        var completion = client.GetMultimodalGenerationStreamAsync(
+            new ModelRequest<MultimodalInput, IMultimodalParameters>()
+            {
+                Model = "qwen3-vl-plus",
+                Input = new MultimodalInput() { Messages = messages },
+                Parameters = new MultimodalParameters()
+                {
+                    IncrementalOutput = true,
+                    EnableThinking = true,
+                    VlHighResolutionImages = true
+                }
+            });
+        var reply = new StringBuilder();
+        var reasoning = false;
+        MultimodalTokenUsage? usage = null;
+        await foreach (var chunk in completion)
+        {
+            // Record usage before any `continue` so reasoning-only and empty-content chunks still update it.
+            usage = chunk.Usage ?? usage;
+            var choice = chunk.Output.Choices[0];
+            if (string.IsNullOrEmpty(choice.Message.ReasoningContent) == false)
+            {
+                // Print the label once, on the first reasoning chunk.
+                if (reasoning == false)
+                {
+                    Console.Write("Reasoning > ");
+                    reasoning = true;
+                }
+
+                Console.Write(choice.Message.ReasoningContent);
+                continue;
+            }
+
+            if (reasoning)
+            {
+                reasoning = false;
+                Console.WriteLine();
+                Console.Write("Assistant > ");
+            }
+
+            if (choice.Message.Content.Count > 0)
+            {
+                Console.Write(choice.Message.Content[0].Text);
+                reply.Append(choice.Message.Content[0].Text);
+            }
+        }
+
+        Console.WriteLine();
+        messages.Add(MultimodalMessage.Assistant([MultimodalMessageContent.TextContent(reply.ToString())]));
+        if (usage != null)
+        {
+            Console.WriteLine(
+                $"Usage: in({usage.InputTokens})/out({usage.OutputTokens})/image({usage.ImageTokens})/reasoning({usage.OutputTokensDetails?.ReasoningTokens})/total({usage.TotalTokens})");
+        }
+    }
+}
diff --git a/src/Cnblogs.DashScope.Core/IMultimodalParameters.cs b/src/Cnblogs.DashScope.Core/IMultimodalParameters.cs
@@ -9,7 +9,8 @@ public interface IMultimodalParameters
         IIncrementalOutputParameter,
         IPenaltyParameter,
         IMaxTokenParameter,
-        IStopTokenParameter
+        IStopTokenParameter,
+        IThinkingParameter
 {
     /// <summary>
     /// Allow higher resolution for inputs. When setting to <c>true</c>, increases the maximum input token from 1280 to 16384. Defaults to <c>false</c>.
diff --git a/src/Cnblogs.DashScope.Core/ITextGenerationParameters.cs b/src/Cnblogs.DashScope.Core/ITextGenerationParameters.cs
@@ -9,7 +9,8 @@ public interface ITextGenerationParameters
         IProbabilityParameter,
         IPenaltyParameter,
         IMaxTokenParameter,
-        IStopTokenParameter
+        IStopTokenParameter,
+        IThinkingParameter
 {
     /// <summary>
     /// The format of the result, must be <c>text</c> or <c>message</c>.
@@ -50,16 +51,6 @@ public interface ITextGenerationParameters
     /// </summary>
     TextGenerationSearchOptions? SearchOptions { get; set; }
 
-    /// <summary>
-    /// Thinking option. Valid for supported models.(e.g. qwen3)
-    /// </summary>
-    bool? EnableThinking { get; }
-
-    /// <summary>
-    /// Maximum length of thinking content. Valid for supported models.(e.g. qwen3)
-    /// </summary>
-    int? ThinkingBudget { get; set; }
-
     /// <summary>
     /// Include log possibilities in response.
     /// </summary>
diff --git a/src/Cnblogs.DashScope.Core/IThinkingParameter.cs b/src/Cnblogs.DashScope.Core/IThinkingParameter.cs
@@ -0,0 +1,17 @@
+﻿namespace Cnblogs.DashScope.Core;
+
+/// <summary>
+/// Thinking-related request parameters, shared by <see cref="ITextGenerationParameters"/> and <see cref="IMultimodalParameters"/>.
+/// </summary>
+public interface IThinkingParameter
+{
+    /// <summary>
+    /// Thinking option. Only valid for models that support it (e.g. qwen3).
+    /// </summary>
+    bool? EnableThinking { get; }
+
+    /// <summary>
+    /// Maximum length of the thinking content. Only valid for models that support it (e.g. qwen3).
+    /// </summary>
+    int? ThinkingBudget { get; set; }
+}
diff --git a/src/Cnblogs.DashScope.Core/MultimodalInputTokenDetails.cs b/src/Cnblogs.DashScope.Core/MultimodalInputTokenDetails.cs
@@ -0,0 +1,8 @@
+﻿namespace Cnblogs.DashScope.Core;
+
+/// <summary>
+/// Breakdown of input token usage for multimodal requests, exposed through <see cref="MultimodalTokenUsage.InputTokensDetails"/>.
+/// </summary>
+/// <param name="ImageTokens">Token count of the image input (nullable).</param>
+/// <param name="TextTokens">Token count of the text input (nullable).</param>
+public record MultimodalInputTokenDetails(int? ImageTokens, int? TextTokens);
diff --git a/src/Cnblogs.DashScope.Core/MultimodalOutputTokenDetails.cs b/src/Cnblogs.DashScope.Core/MultimodalOutputTokenDetails.cs
@@ -0,0 +1,8 @@
+﻿namespace Cnblogs.DashScope.Core;
+
+/// <summary>
+/// Breakdown of output token usage for multimodal responses, exposed through <see cref="MultimodalTokenUsage.OutputTokensDetails"/>.
+/// </summary>
+/// <param name="ReasoningTokens">Token count of the reasoning output (nullable).</param>
+/// <param name="TextTokens">Token count of the text output (nullable).</param>
+public record MultimodalOutputTokenDetails(int? ReasoningTokens, int? TextTokens);
diff --git a/src/Cnblogs.DashScope.Core/MultimodalParameters.cs b/src/Cnblogs.DashScope.Core/MultimodalParameters.cs
@@ -37,4 +37,10 @@ public class MultimodalParameters : IMultimodalParameters
 
     /// <inheritdoc />
     public TextGenerationStop? Stop { get; set; }
+
+    /// <inheritdoc />
+    public bool? EnableThinking { get; set; }
+
+    /// <inheritdoc />
+    public int? ThinkingBudget { get; set; }
 }
diff --git a/src/Cnblogs.DashScope.Core/MultimodalTokenUsage.cs b/src/Cnblogs.DashScope.Core/MultimodalTokenUsage.cs
@@ -29,4 +29,24 @@ public class MultimodalTokenUsage
     /// The token usage of input video.
     /// </summary>
     public int? VideoTokens { get; set; }
+
+    /// <summary>
+    /// Count of cached tokens.
+    /// </summary>
+    public int? CachedTokens { get; set; }
+
+    /// <summary>
+    /// Count of total tokens.
+    /// </summary>
+    public int? TotalTokens { get; set; }
+
+    /// <summary>
+    /// The details of input token usage.
+    /// </summary>
+    public MultimodalInputTokenDetails? InputTokensDetails { get; set; }
+
+    /// <summary>
+    /// The details of output token usage.
+    /// </summary>
+    public MultimodalOutputTokenDetails? OutputTokensDetails { get; set; }
 }