From 7b925bc3ce0198e8893b4f7d044302969688bd1b Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 29 Oct 2025 22:50:07 +0000 Subject: [PATCH 1/7] Initial plan From ae6a023c6c8b29420e059dd5fa23c0ce2c5e2508 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 29 Oct 2025 23:00:45 +0000 Subject: [PATCH 2/7] Add Microsoft.ML.Tokenizers documentation and working examples Co-authored-by: gewarren <24882762+gewarren@users.noreply.github.com> --- .../csharp/TokenizersExamples/BpeExample.cs | 42 +++++++++ .../csharp/TokenizersExamples/LlamaExample.cs | 64 +++++++++++++ .../csharp/TokenizersExamples/Program.cs | 19 ++++ .../TokenizersExamples/TiktokenExample.cs | 61 +++++++++++++ .../TokenizersExamples.csproj | 15 ++++ docs/ai/how-to/use-tokenizers.md | 89 +++++++++++++++++++ docs/ai/toc.yml | 4 + 7 files changed, 294 insertions(+) create mode 100644 docs/ai/how-to/snippets/use-tokenizers/csharp/TokenizersExamples/BpeExample.cs create mode 100644 docs/ai/how-to/snippets/use-tokenizers/csharp/TokenizersExamples/LlamaExample.cs create mode 100644 docs/ai/how-to/snippets/use-tokenizers/csharp/TokenizersExamples/Program.cs create mode 100644 docs/ai/how-to/snippets/use-tokenizers/csharp/TokenizersExamples/TiktokenExample.cs create mode 100644 docs/ai/how-to/snippets/use-tokenizers/csharp/TokenizersExamples/TokenizersExamples.csproj create mode 100644 docs/ai/how-to/use-tokenizers.md diff --git a/docs/ai/how-to/snippets/use-tokenizers/csharp/TokenizersExamples/BpeExample.cs b/docs/ai/how-to/snippets/use-tokenizers/csharp/TokenizersExamples/BpeExample.cs new file mode 100644 index 0000000000000..1f638bdffd498 --- /dev/null +++ b/docs/ai/how-to/snippets/use-tokenizers/csharp/TokenizersExamples/BpeExample.cs @@ -0,0 +1,42 @@ +using System; +using System.Collections.Generic; +using Microsoft.ML.Tokenizers; + +internal class BpeExample +{ + public static void Run() + 
{ + BasicUsage(); + } + + private static void BasicUsage() + { + // + // Create a BPE tokenizer using Tiktoken + // BPE (Byte Pair Encoding) is the underlying algorithm used by many tokenizers + Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o"); + + string text = "Hello, how are you doing today?"; + + // Encode text to token IDs + IReadOnlyList ids = tokenizer.EncodeToIds(text); + Console.WriteLine($"Token IDs: {string.Join(", ", ids)}"); + + // Count tokens + int tokenCount = tokenizer.CountTokens(text); + Console.WriteLine($"Token count: {tokenCount}"); + + // Get detailed token information + IReadOnlyList tokens = tokenizer.EncodeToTokens(text, out string? normalizedString); + Console.WriteLine("Tokens:"); + foreach (var token in tokens) + { + Console.WriteLine($" ID: {token.Id}, Value: '{token.Value}'"); + } + + // Decode tokens back to text + string? decoded = tokenizer.Decode(ids); + Console.WriteLine($"Decoded: {decoded}"); + // + } +} diff --git a/docs/ai/how-to/snippets/use-tokenizers/csharp/TokenizersExamples/LlamaExample.cs b/docs/ai/how-to/snippets/use-tokenizers/csharp/TokenizersExamples/LlamaExample.cs new file mode 100644 index 0000000000000..bd86a4e24f9d9 --- /dev/null +++ b/docs/ai/how-to/snippets/use-tokenizers/csharp/TokenizersExamples/LlamaExample.cs @@ -0,0 +1,64 @@ +using System; +using System.Collections.Generic; +using System.IO; +using System.Net.Http; +using System.Threading.Tasks; +using Microsoft.ML.Tokenizers; + +internal class LlamaExample +{ + public static async Task RunAsync() + { + await BasicUsageAsync(); + AdvancedOptions(); + } + + private static async Task BasicUsageAsync() + { + // + // Open a stream to the remote Llama tokenizer model data file + using HttpClient httpClient = new(); + const string modelUrl = @"https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer.model"; + using Stream remoteStream = await httpClient.GetStreamAsync(modelUrl); + + // Create the Llama tokenizer using 
the remote stream + Tokenizer llamaTokenizer = LlamaTokenizer.Create(remoteStream); + + string input = "Hello, world!"; + + // Encode text to token IDs + IReadOnlyList ids = llamaTokenizer.EncodeToIds(input); + Console.WriteLine($"Token IDs: {string.Join(", ", ids)}"); + // Output: Token IDs: 1, 15043, 29892, 3186, 29991 + + // Count the tokens + Console.WriteLine($"Tokens: {llamaTokenizer.CountTokens(input)}"); + // Output: Tokens: 5 + + // Decode token IDs back to text + string? decoded = llamaTokenizer.Decode(ids); + Console.WriteLine($"Decoded: {decoded}"); + // Output: Decoded: Hello, world! + // + } + + private static void AdvancedOptions() + { + // For demonstration purposes, we'll use a mock tokenizer + // In real scenarios, you would initialize this properly + Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o"); + + // + ReadOnlySpan textSpan = "Hello World".AsSpan(); + + // Bypass normalization during encoding + IReadOnlyList ids = tokenizer.EncodeToIds(textSpan, considerNormalization: false); + + // Bypass pretokenization during encoding + ids = tokenizer.EncodeToIds(textSpan, considerPreTokenization: false); + + // Bypass both normalization and pretokenization + ids = tokenizer.EncodeToIds(textSpan, considerNormalization: false, considerPreTokenization: false); + // + } +} diff --git a/docs/ai/how-to/snippets/use-tokenizers/csharp/TokenizersExamples/Program.cs b/docs/ai/how-to/snippets/use-tokenizers/csharp/TokenizersExamples/Program.cs new file mode 100644 index 0000000000000..71d1e86a1c5a5 --- /dev/null +++ b/docs/ai/how-to/snippets/use-tokenizers/csharp/TokenizersExamples/Program.cs @@ -0,0 +1,19 @@ +using System; +using System.Threading.Tasks; + +// Run examples +Console.WriteLine("=== Tiktoken Examples ==="); +TiktokenExample.Run(); + +Console.WriteLine("\n=== Llama Examples ==="); +try +{ + await LlamaExample.RunAsync(); +} +catch (Exception ex) +{ + Console.WriteLine($"Note: Llama example requires network access to download model 
files: {ex.Message}"); +} + +Console.WriteLine("\n=== BPE Examples ==="); +BpeExample.Run(); diff --git a/docs/ai/how-to/snippets/use-tokenizers/csharp/TokenizersExamples/TiktokenExample.cs b/docs/ai/how-to/snippets/use-tokenizers/csharp/TokenizersExamples/TiktokenExample.cs new file mode 100644 index 0000000000000..b347617f62ffe --- /dev/null +++ b/docs/ai/how-to/snippets/use-tokenizers/csharp/TokenizersExamples/TiktokenExample.cs @@ -0,0 +1,61 @@ +using System; +using System.Collections.Generic; +using Microsoft.ML.Tokenizers; + +internal class TiktokenExample +{ + public static void Run() + { + BasicUsage(); + TrimText(); + } + + private static void BasicUsage() + { + // + // Initialize the tokenizer for the gpt-4o model + Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o"); + + string source = "Text tokenization is the process of splitting a string into a list of tokens."; + + // Count the tokens in the text + Console.WriteLine($"Tokens: {tokenizer.CountTokens(source)}"); + // Output: Tokens: 16 + + // Encode text to token IDs + IReadOnlyList ids = tokenizer.EncodeToIds(source); + Console.WriteLine($"Token IDs: {string.Join(", ", ids)}"); + // Output: Token IDs: 1199, 4037, 2065, 374, 279, 1920, 315, 45473, 264, 925, 1139, 264, 1160, 315, 11460, 13 + + // Decode token IDs back to text + string? decoded = tokenizer.Decode(ids); + Console.WriteLine($"Decoded: {decoded}"); + // Output: Decoded: Text tokenization is the process of splitting a string into a list of tokens. + // + } + + private static void TrimText() + { + // + Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o"); + + string source = "Text tokenization is the process of splitting a string into a list of tokens."; + + // Get the last 5 tokens from the text + var trimIndex = tokenizer.GetIndexByTokenCountFromEnd(source, 5, out string? 
processedText, out _); + if (processedText is not null) + { + Console.WriteLine($"Last 5 tokens: {processedText.Substring(trimIndex)}"); + // Output: Last 5 tokens: a list of tokens. + } + + // Get the first 5 tokens from the text + trimIndex = tokenizer.GetIndexByTokenCount(source, 5, out processedText, out _); + if (processedText is not null) + { + Console.WriteLine($"First 5 tokens: {processedText.Substring(0, trimIndex)}"); + // Output: First 5 tokens: Text tokenization is the + } + // + } +} diff --git a/docs/ai/how-to/snippets/use-tokenizers/csharp/TokenizersExamples/TokenizersExamples.csproj b/docs/ai/how-to/snippets/use-tokenizers/csharp/TokenizersExamples/TokenizersExamples.csproj new file mode 100644 index 0000000000000..f707b4c353c59 --- /dev/null +++ b/docs/ai/how-to/snippets/use-tokenizers/csharp/TokenizersExamples/TokenizersExamples.csproj @@ -0,0 +1,15 @@ + + + + Exe + net9.0 + enable + enable + + + + + + + + diff --git a/docs/ai/how-to/use-tokenizers.md b/docs/ai/how-to/use-tokenizers.md new file mode 100644 index 0000000000000..3b3207fc42983 --- /dev/null +++ b/docs/ai/how-to/use-tokenizers.md @@ -0,0 +1,89 @@ +--- +title: Use Microsoft.ML.Tokenizers for text tokenization +description: Learn how to use the Microsoft.ML.Tokenizers library to tokenize text for AI models, manage token counts, and work with various tokenization algorithms. +ms.topic: how-to +ms.date: 10/29/2025 +ai-usage: ai-assisted +#customer intent: As a .NET developer, I want to use the Microsoft.ML.Tokenizers library to tokenize text so I can work with AI models, manage costs, and handle token limits effectively. +--- +# Use Microsoft.ML.Tokenizers for text tokenization + +The [Microsoft.ML.Tokenizers](https://www.nuget.org/packages/Microsoft.ML.Tokenizers) library provides a comprehensive set of tools for tokenizing text in .NET applications. 
Tokenization is essential when working with large language models (LLMs), as it allows you to manage token counts, estimate costs, and preprocess text for AI models. + +This article shows you how to use the library's key features and work with different tokenizer models. + +## Prerequisites + +- [.NET 8.0 SDK](https://dotnet.microsoft.com/download/dotnet/8.0) or later + +## Install the package + +Install the Microsoft.ML.Tokenizers NuGet package: + +```dotnetcli +dotnet add package Microsoft.ML.Tokenizers +``` + +## Key features + +The Microsoft.ML.Tokenizers library provides: + +- **Extensible tokenizer architecture**: Allows specialization of Normalizer, PreTokenizer, Model/Encoder, and Decoder components. +- **Multiple tokenization algorithms**: Supports BPE (Byte Pair Encoding), Tiktoken, Llama, CodeGen, and more. +- **Token counting and estimation**: Helps manage costs and context limits when working with AI services. +- **Flexible encoding options**: Provides methods to encode text to token IDs, count tokens, and decode tokens back to text. + +## Use Tiktoken tokenizer + +The Tiktoken tokenizer is commonly used with OpenAI models like GPT-4. The following example shows how to initialize a Tiktoken tokenizer and perform common operations: + +:::code language="csharp" source="./snippets/use-tokenizers/csharp/TokenizersExamples/TiktokenExample.cs" id="TiktokenBasic"::: + +The tokenizer instance should be cached and reused throughout your application for better performance. + +### Manage token limits + +When working with LLMs, you often need to manage text within token limits. The following example shows how to trim text to a specific token count: + +:::code language="csharp" source="./snippets/use-tokenizers/csharp/TokenizersExamples/TiktokenExample.cs" id="TiktokenTrim"::: + +## Use Llama tokenizer + +The Llama tokenizer is designed for the Llama family of models. 
It requires a tokenizer model file, which you can download from model repositories like Hugging Face: + +:::code language="csharp" source="./snippets/use-tokenizers/csharp/TokenizersExamples/LlamaExample.cs" id="LlamaBasic"::: + +### Advanced encoding options + +The tokenizer supports advanced encoding options, such as controlling normalization and pretokenization: + +:::code language="csharp" source="./snippets/use-tokenizers/csharp/TokenizersExamples/LlamaExample.cs" id="LlamaAdvanced"::: + +## Use BPE tokenizer + +Byte Pair Encoding (BPE) is the underlying algorithm used by many tokenizers, including Tiktoken. The following example demonstrates BPE tokenization: + +:::code language="csharp" source="./snippets/use-tokenizers/csharp/TokenizersExamples/BpeExample.cs" id="BpeBasic"::: + +The library also provides specialized tokenizers like `BpeTokenizer` and `EnglishRobertaTokenizer` that you can configure with custom vocabularies for specific models. + +## Common tokenizer operations + +All tokenizers in the library implement the `Tokenizer` base class, which provides a consistent API: + +- **`EncodeToIds`**: Converts text to a list of token IDs +- **`Decode`**: Converts token IDs back to text +- **`CountTokens`**: Returns the number of tokens in a text string +- **`EncodeToTokens`**: Returns detailed token information including values and IDs +- **`GetIndexByTokenCount`**: Finds the character index for a specific token count from the start +- **`GetIndexByTokenCountFromEnd`**: Finds the character index for a specific token count from the end + +## Migration from other libraries + +If you're currently using `DeepDev.TokenizerLib` or `SharpToken`, consider migrating to Microsoft.ML.Tokenizers. The library has been enhanced to cover scenarios from those libraries and provides better performance and support. For migration guidance, see the [migration guide](https://github.com/dotnet/machinelearning/blob/main/docs/code/microsoft-ml-tokenizers-migration-guide.md). 
+ +## Related content + +- [Understanding tokens](../conceptual/understanding-tokens.md) +- [Microsoft.ML.Tokenizers API reference](/dotnet/api/microsoft.ml.tokenizers) +- [Microsoft.ML.Tokenizers NuGet package](https://www.nuget.org/packages/Microsoft.ML.Tokenizers) diff --git a/docs/ai/toc.yml b/docs/ai/toc.yml index bded8c4e59964..747d78b31a3f6 100644 --- a/docs/ai/toc.yml +++ b/docs/ai/toc.yml @@ -74,6 +74,10 @@ items: href: quickstarts/text-to-image.md - name: Generate images using OpenAI.Images.ImageClient href: quickstarts/generate-images.md +- name: Tokenization + items: + - name: Use Microsoft.ML.Tokenizers + href: how-to/use-tokenizers.md - name: Security and content safety items: - name: Authentication for Azure-hosted apps and services From aa30753b8e9383136fd27d39c58b6b6883bbb240 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 29 Oct 2025 23:03:08 +0000 Subject: [PATCH 3/7] Add data package requirement and cross-references Co-authored-by: gewarren <24882762+gewarren@users.noreply.github.com> --- docs/ai/conceptual/understanding-tokens.md | 1 + docs/ai/how-to/use-tokenizers.md | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/docs/ai/conceptual/understanding-tokens.md b/docs/ai/conceptual/understanding-tokens.md index 219d1d26473d1..682f4e9272258 100644 --- a/docs/ai/conceptual/understanding-tokens.md +++ b/docs/ai/conceptual/understanding-tokens.md @@ -103,6 +103,7 @@ Generative AI services might also be limited regarding the maximum number of tok ## Related content +- [Use Microsoft.ML.Tokenizers for text tokenization](../how-to/use-tokenizers.md) - [How generative AI and LLMs work](how-genai-and-llms-work.md) - [Understand embeddings](embeddings.md) - [Work with vector databases](vector-databases.md) diff --git a/docs/ai/how-to/use-tokenizers.md b/docs/ai/how-to/use-tokenizers.md index 3b3207fc42983..7217bb3aa056e 100644 --- a/docs/ai/how-to/use-tokenizers.md +++ 
b/docs/ai/how-to/use-tokenizers.md @@ -24,6 +24,12 @@ Install the Microsoft.ML.Tokenizers NuGet package: dotnet add package Microsoft.ML.Tokenizers ``` +For Tiktoken models (like GPT-4o), you also need to install the corresponding data package: + +```dotnetcli +dotnet add package Microsoft.ML.Tokenizers.Data.O200kBase +``` + ## Key features The Microsoft.ML.Tokenizers library provides: From 693150c4546c8b059d8d101c1729b66fc4aba68d Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 29 Oct 2025 23:05:03 +0000 Subject: [PATCH 4/7] Add .gitignore to exclude build artifacts Co-authored-by: gewarren <24882762+gewarren@users.noreply.github.com> --- .../use-tokenizers/csharp/TokenizersExamples/.gitignore | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 docs/ai/how-to/snippets/use-tokenizers/csharp/TokenizersExamples/.gitignore diff --git a/docs/ai/how-to/snippets/use-tokenizers/csharp/TokenizersExamples/.gitignore b/docs/ai/how-to/snippets/use-tokenizers/csharp/TokenizersExamples/.gitignore new file mode 100644 index 0000000000000..a299c76326b42 --- /dev/null +++ b/docs/ai/how-to/snippets/use-tokenizers/csharp/TokenizersExamples/.gitignore @@ -0,0 +1,3 @@ +## Build artifacts +bin/ +obj/ From a5e763a70fb21f5b30056ef096d025c7b2abcfe9 Mon Sep 17 00:00:00 2001 From: Genevieve Warren <24882762+gewarren@users.noreply.github.com> Date: Wed, 29 Oct 2025 17:01:28 -0700 Subject: [PATCH 5/7] human edits --- docs/ai/how-to/use-tokenizers.md | 35 +++++++++++++++----------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/docs/ai/how-to/use-tokenizers.md b/docs/ai/how-to/use-tokenizers.md index 7217bb3aa056e..e93f143ab9c5c 100644 --- a/docs/ai/how-to/use-tokenizers.md +++ b/docs/ai/how-to/use-tokenizers.md @@ -4,17 +4,16 @@ description: Learn how to use the Microsoft.ML.Tokenizers library to tokenize te ms.topic: how-to ms.date: 10/29/2025 ai-usage: ai-assisted -#customer intent: As a .NET 
developer, I want to use the Microsoft.ML.Tokenizers library to tokenize text so I can work with AI models, manage costs, and handle token limits effectively. --- # Use Microsoft.ML.Tokenizers for text tokenization -The [Microsoft.ML.Tokenizers](https://www.nuget.org/packages/Microsoft.ML.Tokenizers) library provides a comprehensive set of tools for tokenizing text in .NET applications. Tokenization is essential when working with large language models (LLMs), as it allows you to manage token counts, estimate costs, and preprocess text for AI models. +The [Microsoft.ML.Tokenizers](https://www.nuget.org/packages/Microsoft.ML.Tokenizers) library provides a comprehensive set of tools for tokenizing text in .NET applications. Tokenization is essential when you work with large language models (LLMs), as it allows you to manage token counts, estimate costs, and preprocess text for AI models. This article shows you how to use the library's key features and work with different tokenizer models. ## Prerequisites -- [.NET 8.0 SDK](https://dotnet.microsoft.com/download/dotnet/8.0) or later +- [.NET 8 SDK](https://dotnet.microsoft.com/download/dotnet/8.0) or later ## Install the package @@ -35,7 +34,7 @@ dotnet add package Microsoft.ML.Tokenizers.Data.O200kBase The Microsoft.ML.Tokenizers library provides: - **Extensible tokenizer architecture**: Allows specialization of Normalizer, PreTokenizer, Model/Encoder, and Decoder components. -- **Multiple tokenization algorithms**: Supports BPE (Byte Pair Encoding), Tiktoken, Llama, CodeGen, and more. +- **Multiple tokenization algorithms**: Supports BPE (byte-pair encoding), Tiktoken, Llama, CodeGen, and more. - **Token counting and estimation**: Helps manage costs and context limits when working with AI services. - **Flexible encoding options**: Provides methods to encode text to token IDs, count tokens, and decode tokens back to text. @@ -45,11 +44,9 @@ The Tiktoken tokenizer is commonly used with OpenAI models like GPT-4. 
The follo :::code language="csharp" source="./snippets/use-tokenizers/csharp/TokenizersExamples/TiktokenExample.cs" id="TiktokenBasic"::: -The tokenizer instance should be cached and reused throughout your application for better performance. +For better performance, you should cache and reuse the tokenizer instance throughout your app. -### Manage token limits - -When working with LLMs, you often need to manage text within token limits. The following example shows how to trim text to a specific token count: +When you work with LLMs, you often need to manage text within token limits. The following example shows how to trim text to a specific token count: :::code language="csharp" source="./snippets/use-tokenizers/csharp/TokenizersExamples/TiktokenExample.cs" id="TiktokenTrim"::: @@ -59,30 +56,30 @@ The Llama tokenizer is designed for the Llama family of models. It requires a to :::code language="csharp" source="./snippets/use-tokenizers/csharp/TokenizersExamples/LlamaExample.cs" id="LlamaBasic"::: -### Advanced encoding options - The tokenizer supports advanced encoding options, such as controlling normalization and pretokenization: :::code language="csharp" source="./snippets/use-tokenizers/csharp/TokenizersExamples/LlamaExample.cs" id="LlamaAdvanced"::: ## Use BPE tokenizer -Byte Pair Encoding (BPE) is the underlying algorithm used by many tokenizers, including Tiktoken. The following example demonstrates BPE tokenization: +Byte-pair encoding (BPE) is the underlying algorithm used by many tokenizers, including Tiktoken. The following example demonstrates BPE tokenization: :::code language="csharp" source="./snippets/use-tokenizers/csharp/TokenizersExamples/BpeExample.cs" id="BpeBasic"::: -The library also provides specialized tokenizers like `BpeTokenizer` and `EnglishRobertaTokenizer` that you can configure with custom vocabularies for specific models. 
+The library also provides specialized tokenizers like <xref:Microsoft.ML.Tokenizers.BpeTokenizer> and <xref:Microsoft.ML.Tokenizers.EnglishRobertaTokenizer> that you can configure with custom vocabularies for specific models. ## Common tokenizer operations -All tokenizers in the library implement the `Tokenizer` base class, which provides a consistent API: +All tokenizers in the library implement the <xref:Microsoft.ML.Tokenizers.Tokenizer> base class. The following table shows the available methods. -- **`EncodeToIds`**: Converts text to a list of token IDs -- **`Decode`**: Converts token IDs back to text -- **`CountTokens`**: Returns the number of tokens in a text string -- **`EncodeToTokens`**: Returns detailed token information including values and IDs -- **`GetIndexByTokenCount`**: Finds the character index for a specific token count from the start -- **`GetIndexByTokenCountFromEnd`**: Finds the character index for a specific token count from the end +| Method | Description | +|-------------------------------------------------------|--------------------------------------| +| <xref:Microsoft.ML.Tokenizers.Tokenizer.EncodeToIds*> | Converts text to a list of token IDs | +| <xref:Microsoft.ML.Tokenizers.Tokenizer.Decode*> | Converts token IDs back to text | +| <xref:Microsoft.ML.Tokenizers.Tokenizer.CountTokens*> | Returns the number of tokens in a text string | +| <xref:Microsoft.ML.Tokenizers.Tokenizer.EncodeToTokens*> | Returns detailed token information including values and IDs | +| <xref:Microsoft.ML.Tokenizers.Tokenizer.GetIndexByTokenCount*> | Finds the character index for a specific token count from the start | +| <xref:Microsoft.ML.Tokenizers.Tokenizer.GetIndexByTokenCountFromEnd*> | Finds the character index for a specific token count from the end | ## Migration from other libraries From 9be411235e714c4c2225d5b1cce75b71b33757c9 Mon Sep 17 00:00:00 2001 From: Genevieve Warren <24882762+gewarren@users.noreply.github.com> Date: Wed, 29 Oct 2025 17:23:44 -0700 Subject: [PATCH 6/7] code edits --- .../csharp/TokenizersExamples/BpeExample.cs | 13 +++++------ .../csharp/TokenizersExamples/LlamaExample.cs | 22 +++++++++---------- .../TokenizersExamples/TiktokenExample.cs | 12 +++++----- docs/ai/how-to/use-tokenizers.md | 6 +++-- 4 files changed, 27 insertions(+), 26 deletions(-) diff --git a/docs/ai/how-to/snippets/use-tokenizers/csharp/TokenizersExamples/BpeExample.cs 
b/docs/ai/how-to/snippets/use-tokenizers/csharp/TokenizersExamples/BpeExample.cs index 1f638bdffd498..0fb73d38e6795 100644 --- a/docs/ai/how-to/snippets/use-tokenizers/csharp/TokenizersExamples/BpeExample.cs +++ b/docs/ai/how-to/snippets/use-tokenizers/csharp/TokenizersExamples/BpeExample.cs @@ -12,29 +12,28 @@ public static void Run() private static void BasicUsage() { // - // Create a BPE tokenizer using Tiktoken - // BPE (Byte Pair Encoding) is the underlying algorithm used by many tokenizers + // Create a BPE tokenizer using Tiktoken. Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o"); string text = "Hello, how are you doing today?"; - // Encode text to token IDs + // Encode text to token IDs. IReadOnlyList ids = tokenizer.EncodeToIds(text); Console.WriteLine($"Token IDs: {string.Join(", ", ids)}"); - // Count tokens + // Count tokens. int tokenCount = tokenizer.CountTokens(text); Console.WriteLine($"Token count: {tokenCount}"); - // Get detailed token information + // Get detailed token information. IReadOnlyList tokens = tokenizer.EncodeToTokens(text, out string? normalizedString); Console.WriteLine("Tokens:"); - foreach (var token in tokens) + foreach (EncodedToken token in tokens) { Console.WriteLine($" ID: {token.Id}, Value: '{token.Value}'"); } - // Decode tokens back to text + // Decode tokens back to text. string? 
decoded = tokenizer.Decode(ids); Console.WriteLine($"Decoded: {decoded}"); // diff --git a/docs/ai/how-to/snippets/use-tokenizers/csharp/TokenizersExamples/LlamaExample.cs b/docs/ai/how-to/snippets/use-tokenizers/csharp/TokenizersExamples/LlamaExample.cs index bd86a4e24f9d9..33930e980132d 100644 --- a/docs/ai/how-to/snippets/use-tokenizers/csharp/TokenizersExamples/LlamaExample.cs +++ b/docs/ai/how-to/snippets/use-tokenizers/csharp/TokenizersExamples/LlamaExample.cs @@ -16,26 +16,26 @@ public static async Task RunAsync() private static async Task BasicUsageAsync() { // - // Open a stream to the remote Llama tokenizer model data file + // Open a stream to the remote Llama tokenizer model data file. using HttpClient httpClient = new(); const string modelUrl = @"https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer.model"; using Stream remoteStream = await httpClient.GetStreamAsync(modelUrl); - // Create the Llama tokenizer using the remote stream + // Create the Llama tokenizer using the remote stream. Tokenizer llamaTokenizer = LlamaTokenizer.Create(remoteStream); string input = "Hello, world!"; - // Encode text to token IDs + // Encode text to token IDs. IReadOnlyList ids = llamaTokenizer.EncodeToIds(input); Console.WriteLine($"Token IDs: {string.Join(", ", ids)}"); // Output: Token IDs: 1, 15043, 29892, 3186, 29991 - // Count the tokens + // Count the tokens. Console.WriteLine($"Tokens: {llamaTokenizer.CountTokens(input)}"); // Output: Tokens: 5 - // Decode token IDs back to text + // Decode token IDs back to text. string? decoded = llamaTokenizer.Decode(ids); Console.WriteLine($"Decoded: {decoded}"); // Output: Decoded: Hello, world! @@ -44,20 +44,20 @@ private static async Task BasicUsageAsync() private static void AdvancedOptions() { - // For demonstration purposes, we'll use a mock tokenizer - // In real scenarios, you would initialize this properly + // For demonstration purposes, we'll use a mock tokenizer. 
+ // In real scenarios, you would initialize this properly. Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o"); - + // ReadOnlySpan textSpan = "Hello World".AsSpan(); - // Bypass normalization during encoding + // Bypass normalization during encoding. IReadOnlyList ids = tokenizer.EncodeToIds(textSpan, considerNormalization: false); - // Bypass pretokenization during encoding + // Bypass pretokenization during encoding. ids = tokenizer.EncodeToIds(textSpan, considerPreTokenization: false); - // Bypass both normalization and pretokenization + // Bypass both normalization and pretokenization. ids = tokenizer.EncodeToIds(textSpan, considerNormalization: false, considerPreTokenization: false); // } diff --git a/docs/ai/how-to/snippets/use-tokenizers/csharp/TokenizersExamples/TiktokenExample.cs b/docs/ai/how-to/snippets/use-tokenizers/csharp/TokenizersExamples/TiktokenExample.cs index b347617f62ffe..fc6f90fff43d8 100644 --- a/docs/ai/how-to/snippets/use-tokenizers/csharp/TokenizersExamples/TiktokenExample.cs +++ b/docs/ai/how-to/snippets/use-tokenizers/csharp/TokenizersExamples/TiktokenExample.cs @@ -13,21 +13,21 @@ public static void Run() private static void BasicUsage() { // - // Initialize the tokenizer for the gpt-4o model + // Initialize the tokenizer for the gpt-4o model. Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o"); string source = "Text tokenization is the process of splitting a string into a list of tokens."; - // Count the tokens in the text + // Count the tokens in the text. Console.WriteLine($"Tokens: {tokenizer.CountTokens(source)}"); // Output: Tokens: 16 - // Encode text to token IDs + // Encode text to token IDs. IReadOnlyList ids = tokenizer.EncodeToIds(source); Console.WriteLine($"Token IDs: {string.Join(", ", ids)}"); // Output: Token IDs: 1199, 4037, 2065, 374, 279, 1920, 315, 45473, 264, 925, 1139, 264, 1160, 315, 11460, 13 - // Decode token IDs back to text + // Decode token IDs back to text. string? 
decoded = tokenizer.Decode(ids); Console.WriteLine($"Decoded: {decoded}"); // Output: Decoded: Text tokenization is the process of splitting a string into a list of tokens. @@ -41,7 +41,7 @@ private static void TrimText() string source = "Text tokenization is the process of splitting a string into a list of tokens."; - // Get the last 5 tokens from the text + // Get the last 5 tokens from the text. var trimIndex = tokenizer.GetIndexByTokenCountFromEnd(source, 5, out string? processedText, out _); if (processedText is not null) { @@ -49,7 +49,7 @@ private static void TrimText() // Output: Last 5 tokens: a list of tokens. } - // Get the first 5 tokens from the text + // Get the first 5 tokens from the text. trimIndex = tokenizer.GetIndexByTokenCount(source, 5, out processedText, out _); if (processedText is not null) { diff --git a/docs/ai/how-to/use-tokenizers.md b/docs/ai/how-to/use-tokenizers.md index e93f143ab9c5c..fd87b5c8695a8 100644 --- a/docs/ai/how-to/use-tokenizers.md +++ b/docs/ai/how-to/use-tokenizers.md @@ -62,12 +62,14 @@ The tokenizer supports advanced encoding options, such as controlling normalizat ## Use BPE tokenizer -Byte-pair encoding (BPE) is the underlying algorithm used by many tokenizers, including Tiktoken. The following example demonstrates BPE tokenization: +*Byte-pair encoding* (BPE) is the underlying algorithm used by many tokenizers, including Tiktoken. BPE was initially developed as an algorithm to compress texts, and then used by OpenAI for tokenization when it pretrained the GPT model. The following example demonstrates BPE tokenization: :::code language="csharp" source="./snippets/use-tokenizers/csharp/TokenizersExamples/BpeExample.cs" id="BpeBasic"::: The library also provides specialized tokenizers like and that you can configure with custom vocabularies for specific models. +For more information about BPE, see [Byte-pair encoding tokenization](https://huggingface.co/learn/llm-course/chapter6/5). 
+ ## Common tokenizer operations All tokenizers in the library implement the base class. The following table shows the available methods. @@ -81,7 +83,7 @@ All tokenizers in the library implement the | Finds the character index for a specific token count from the start | | | Finds the character index for a specific token count from the end | -## Migration from other libraries +## Migrate from other libraries If you're currently using `DeepDev.TokenizerLib` or `SharpToken`, consider migrating to Microsoft.ML.Tokenizers. The library has been enhanced to cover scenarios from those libraries and provides better performance and support. For migration guidance, see the [migration guide](https://github.com/dotnet/machinelearning/blob/main/docs/code/microsoft-ml-tokenizers-migration-guide.md). From cd5140617d227480a06e540c889671445206149f Mon Sep 17 00:00:00 2001 From: Genevieve Warren <24882762+gewarren@users.noreply.github.com> Date: Wed, 29 Oct 2025 18:16:29 -0700 Subject: [PATCH 7/7] Update docs/ai/how-to/use-tokenizers.md Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- docs/ai/how-to/use-tokenizers.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/ai/how-to/use-tokenizers.md b/docs/ai/how-to/use-tokenizers.md index fd87b5c8695a8..24d540c26aa3a 100644 --- a/docs/ai/how-to/use-tokenizers.md +++ b/docs/ai/how-to/use-tokenizers.md @@ -76,12 +76,12 @@ All tokenizers in the library implement the | Converts text to a list of token IDs | -| | Converts token IDs back to text | -| | Returns the number of tokens in a text string | -| | Returns detailed token information including values and IDs | -| | Finds the character index for a specific token count from the start | -| | Finds the character index for a specific token count from the end | +| | Converts text to a list of token IDs. | +| | Converts token IDs back to text. | +| | Returns the number of tokens in a text string. 
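| +| <xref:Microsoft.ML.Tokenizers.Tokenizer.EncodeToTokens*> | Returns detailed token information including values and IDs. | +| <xref:Microsoft.ML.Tokenizers.Tokenizer.GetIndexByTokenCount*> | Finds the character index for a specific token count from the start. | +| <xref:Microsoft.ML.Tokenizers.Tokenizer.GetIndexByTokenCountFromEnd*> | Finds the character index for a specific token count from the end. | ## Migrate from other libraries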