Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions BotSharp.sln
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "BotSharp.Plugin.ExcelHandle
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "BotSharp.Plugin.ImageHandler", "src\Plugins\BotSharp.Plugin.ImageHandler\BotSharp.Plugin.ImageHandler.csproj", "{242F2D93-FCCE-4982-8075-F3052ECCA92C}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "BotSharp.Plugin.FuzzySharp", "src\Plugins\BotSharp.Plugin.FuzzySharp\BotSharp.Plugin.FuzzySharp.csproj", "{E7C243B9-E751-B3B4-8F16-95C76CA90D31}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Expand Down Expand Up @@ -629,6 +631,14 @@ Global
{242F2D93-FCCE-4982-8075-F3052ECCA92C}.Release|Any CPU.Build.0 = Release|Any CPU
{242F2D93-FCCE-4982-8075-F3052ECCA92C}.Release|x64.ActiveCfg = Release|Any CPU
{242F2D93-FCCE-4982-8075-F3052ECCA92C}.Release|x64.Build.0 = Release|Any CPU
{E7C243B9-E751-B3B4-8F16-95C76CA90D31}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{E7C243B9-E751-B3B4-8F16-95C76CA90D31}.Debug|Any CPU.Build.0 = Debug|Any CPU
{E7C243B9-E751-B3B4-8F16-95C76CA90D31}.Debug|x64.ActiveCfg = Debug|Any CPU
{E7C243B9-E751-B3B4-8F16-95C76CA90D31}.Debug|x64.Build.0 = Debug|Any CPU
{E7C243B9-E751-B3B4-8F16-95C76CA90D31}.Release|Any CPU.ActiveCfg = Release|Any CPU
{E7C243B9-E751-B3B4-8F16-95C76CA90D31}.Release|Any CPU.Build.0 = Release|Any CPU
{E7C243B9-E751-B3B4-8F16-95C76CA90D31}.Release|x64.ActiveCfg = Release|Any CPU
{E7C243B9-E751-B3B4-8F16-95C76CA90D31}.Release|x64.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
Expand Down Expand Up @@ -701,6 +711,7 @@ Global
{0428DEAA-E4FE-4259-A6D8-6EDD1A9D0702} = {51AFE054-AE99-497D-A593-69BAEFB5106F}
{FC63C875-E880-D8BB-B8B5-978AB7B62983} = {51AFE054-AE99-497D-A593-69BAEFB5106F}
{242F2D93-FCCE-4982-8075-F3052ECCA92C} = {51AFE054-AE99-497D-A593-69BAEFB5106F}
{E7C243B9-E751-B3B4-8F16-95C76CA90D31} = {51AFE054-AE99-497D-A593-69BAEFB5106F}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {A9969D89-C98B-40A5-A12B-FC87E55B3A19}
Expand Down
3 changes: 3 additions & 0 deletions Directory.Packages.props
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
<ManagePackageVersionsCentrally>true</ManagePackageVersionsCentrally>
</PropertyGroup>
<ItemGroup>
<PackageVersion Include="CsvHelper" Version="33.1.0" />
<PackageVersion Include="FuzzySharp" Version="2.0.2" />
<PackageVersion Include="EntityFramework" Version="6.4.4" />
<PackageVersion Include="Google_GenerativeAI" Version="3.4.0" />
<PackageVersion Include="Google_GenerativeAI.Live" Version="3.4.0" />
Expand All @@ -18,6 +20,7 @@
<PackageVersion Include="Microsoft.Extensions.Logging" Version="9.0.0" />
<PackageVersion Include="Microsoft.Extensions.Caching.Memory" Version="8.0.1" />
<PackageVersion Include="Newtonsoft.Json" Version="13.0.3" />
<PackageVersion Include="SharpFuzz" Version="2.2.0" />
<PackageVersion Include="SharpHook" Version="5.3.9" />
<PackageVersion Include="SixLabors.ImageSharp" Version="3.1.11" />
<PackageVersion Include="System.ClientModel" Version="1.3.0" />
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@

namespace BotSharp.Abstraction.FuzzSharp.Arguments
{
/// <summary>
/// Request payload for a text analysis call: the text to inspect, where the
/// vocabulary data lives, and the tuning knobs for matching.
/// </summary>
public class TextAnalysisRequest
{
    /// <summary>
    /// The text that should be analyzed. Marked as required for validation.
    /// </summary>
    [Required, JsonPropertyName("text")]
    public string Text { get; set; } = "";

    /// <summary>
    /// Folder holding the vocabulary CSV files (all .csv files are read from the folder
    /// itself or from its 'output' subfolder).
    /// </summary>
    [JsonPropertyName("vocabulary_folder_name")]
    public string? VocabularyFolderName { get; set; }

    /// <summary>
    /// CSV file that contains the domain term mapping.
    /// </summary>
    [JsonPropertyName("domain_term_mapping_file")]
    public string? DomainTermMappingFile { get; set; }

    /// <summary>
    /// Minimum score a suggestion must reach, in the range 0.0-1.0. Defaults to 0.80.
    /// </summary>
    [Range(0.0, 1.0), JsonPropertyName("cutoff")]
    public double Cutoff { get; set; } = 0.8;

    /// <summary>
    /// Maximum number of candidates per domain, in the range 1-20. Defaults to 5.
    /// </summary>
    [Range(1, 20), JsonPropertyName("topk")]
    public int TopK { get; set; } = 5;

    /// <summary>
    /// Largest n-gram size to consider, in the range 1-10. Defaults to 5.
    /// </summary>
    [Range(1, 10), JsonPropertyName("max_ngram")]
    public int MaxNgram { get; set; } = 5;

    /// <summary>
    /// Whether the response should carry the tokens field. Defaults to false.
    /// </summary>
    [JsonPropertyName("include_tokens")]
    public bool IncludeTokens { get; set; }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
using BotSharp.Abstraction.FuzzSharp.Models;

namespace BotSharp.Abstraction.FuzzSharp
{
/// <summary>
/// Contract for a component that takes a token list plus vocabulary data and
/// reports n-gram matches as flagged items.
/// </summary>
public interface INgramProcessor
{
/// <summary>
/// Process tokens and generate all possible n-gram match results
/// </summary>
/// <param name="tokens">List of tokens to process</param>
/// <param name="vocabulary">Vocabulary (domain type -> vocabulary set)</param>
/// <param name="domainTermMapping">
/// Domain term mapping; each value carries a DbPath and a CanonicalForm.
/// NOTE(review): the key is presumably the term as it appears in the input text - confirm against the loader.
/// </param>
/// <param name="lookup">Lookup table (lowercase vocabulary -> (canonical form, domain type list))</param>
/// <param name="maxNgram">Maximum n-gram length</param>
/// <param name="cutoff">
/// Minimum confidence threshold for fuzzy matching.
/// NOTE(review): presumably on a 0.0-1.0 scale like the request's cutoff field - confirm.
/// </param>
/// <param name="topK">
/// Maximum number of matches to return.
/// NOTE(review): unclear from this signature whether the limit applies per n-gram, per domain or overall - confirm with the implementation.
/// </param>
/// <returns>List of flagged items</returns>
List<FlaggedItem> ProcessNgrams(
List<string> tokens,
Dictionary<string, HashSet<string>> vocabulary,
Dictionary<string, (string DbPath, string CanonicalForm)> domainTermMapping,
Dictionary<string, (string CanonicalForm, List<string> DomainTypes)> lookup,
int maxNgram,
double cutoff,
int topK);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
using BotSharp.Abstraction.FuzzSharp.Models;

namespace BotSharp.Abstraction.FuzzSharp
{
/// <summary>
/// Result processor interface.
/// Responsible for post-processing match results, including deduplication and sorting.
/// </summary>
public interface IResultProcessor
{
/// <summary>
/// Process a list of flagged items, removing overlapping duplicates and sorting.
/// </summary>
/// <remarks>
/// NOTE(review): the contract does not state the sort key, how overlaps are resolved,
/// or whether the input list is modified in place - confirm with the implementation.
/// </remarks>
/// <param name="flagged">List of flagged items to process</param>
/// <returns>Processed list of flagged items (deduplicated and sorted)</returns>
List<FlaggedItem> ProcessResults(List<FlaggedItem> flagged);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
using BotSharp.Abstraction.FuzzSharp.Arguments;
using BotSharp.Abstraction.FuzzSharp.Models;

namespace BotSharp.Abstraction.FuzzSharp
{
/// <summary>
/// Entry point for text analysis: accepts a request and asynchronously produces a response.
/// </summary>
public interface ITextAnalysisService
{
/// <summary>
/// Analyze text for typos and entities using domain-specific vocabulary
/// </summary>
/// <param name="request">The analysis request (text, vocabulary sources and matching parameters)</param>
/// <returns>A task that yields the analysis response for the request</returns>
Task<TextAnalysisResponse> AnalyzeTextAsync(TextAnalysisRequest request);
}
}
40 changes: 40 additions & 0 deletions src/Infrastructure/BotSharp.Abstraction/FuzzSharp/ITokenMatcher.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
namespace BotSharp.Abstraction.FuzzSharp
{
/// <summary>
/// A single matching strategy that can be tried against a content span.
/// Each matcher exposes a priority that is meant to order the matchers.
/// </summary>
public interface ITokenMatcher
{
/// <summary>
/// Try to match a content span and return a match result
/// </summary>
/// <param name="context">The matching context containing all necessary information</param>
/// <returns>Match result if found, null otherwise</returns>
MatchResult? TryMatch(MatchContext context);

/// <summary>
/// Priority of this matcher (higher priority matchers are tried first)
/// </summary>
/// <remarks>
/// The ordering itself is performed by whoever consumes the matchers; that code is not part of this interface.
/// </remarks>
int Priority { get; }
}

/// <summary>
/// Context information for token matching
/// </summary>
/// <param name="ContentSpan">The content span to match.</param>
/// <param name="ContentLow">NOTE(review): presumably the lowercased form of the content span - confirm with the code that builds the context.</param>
/// <param name="StartIndex">Start index of the span. NOTE(review): presumably a token index rather than a character offset - confirm.</param>
/// <param name="NgramLength">Length of the n-gram the span was built from.</param>
/// <param name="Vocabulary">Vocabulary sets keyed by string. NOTE(review): presumably keyed by domain type - confirm.</param>
/// <param name="DomainTermMapping">Domain term mapping; each value carries a DbPath and a CanonicalForm.</param>
/// <param name="Lookup">Lookup table; each value carries a CanonicalForm and a list of DomainTypes.</param>
/// <param name="Cutoff">Cutoff threshold passed to the matcher. NOTE(review): presumably a minimum confidence on a 0.0-1.0 scale - confirm.</param>
/// <param name="TopK">Top-K limit passed to the matcher.</param>
public record MatchContext(
string ContentSpan,
string ContentLow,
int StartIndex,
int NgramLength,
Dictionary<string, HashSet<string>> Vocabulary,
Dictionary<string, (string DbPath, string CanonicalForm)> DomainTermMapping,
Dictionary<string, (string CanonicalForm, List<string> DomainTypes)> Lookup,
double Cutoff,
int TopK);

/// <summary>
/// Result of a token match
/// </summary>
/// <param name="CanonicalForm">Canonical form associated with the match.</param>
/// <param name="DomainTypes">Domain types associated with the match.</param>
/// <param name="MatchType">Kind of match as a string. NOTE(review): presumably one of 'domain_term_mapping', 'exact_match' or 'typo_correction' - confirm.</param>
/// <param name="Confidence">Confidence of the match. NOTE(review): presumably on a 0.0-1.0 scale where 1.0 is an exact match - confirm.</param>
public record MatchResult(
string CanonicalForm,
List<string> DomainTypes,
string MatchType,
double Confidence);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@

namespace BotSharp.Abstraction.FuzzSharp
{
/// <summary>
/// Loads the data used for matching: the vocabulary sets and the domain term mapping.
/// </summary>
public interface IVocabularyService
{
/// <summary>
/// Asynchronously load the vocabulary from a folder.
/// </summary>
/// <param name="folderPath">
/// Folder to load from; may be null.
/// NOTE(review): behavior for a null or missing folder is not visible here - confirm with the implementation.
/// </param>
/// <returns>
/// A task yielding a dictionary of string sets.
/// NOTE(review): presumably keyed by domain type - confirm.
/// </returns>
Task<Dictionary<string, HashSet<string>>> LoadVocabularyAsync(string? folderPath);

/// <summary>
/// Asynchronously load the domain term mapping from a file.
/// </summary>
/// <param name="filePath">
/// File to load from; may be null.
/// NOTE(review): behavior for a null or missing file is not visible here - confirm with the implementation.
/// </param>
/// <returns>A task yielding a dictionary whose values carry a DbPath and a CanonicalForm.</returns>
Task<Dictionary<string, (string DbPath, string CanonicalForm)>> LoadDomainTermMappingAsync(string? filePath);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@

namespace BotSharp.Abstraction.FuzzSharp.Models
{
/// <summary>
/// One match reported by the analysis: which token was flagged, how it matched,
/// and what canonical form or correction is proposed for it.
/// </summary>
public class FlaggedItem
{
    /// <summary>
    /// Index of the token within the original text.
    /// </summary>
    [JsonPropertyName("index")]
    public int Index { get; set; }

    /// <summary>
    /// The token text as it appeared in the input.
    /// </summary>
    [JsonPropertyName("token")]
    public string Token { get; set; } = "";

    /// <summary>
    /// Domain types in which the token was found
    /// (e.g., ['client_Profile.Name', 'data_ServiceType.Name']).
    /// </summary>
    [JsonPropertyName("domain_types")]
    public List<string> DomainTypes { get; set; } = new List<string>();

    /// <summary>
    /// Kind of match:
    /// 'domain_term_mapping' (business abbreviations, confidence=1.0),
    /// 'exact_match' (vocabulary matches, confidence=1.0), or
    /// 'typo_correction' (spelling corrections, confidence less than 1.0).
    /// </summary>
    [JsonPropertyName("match_type")]
    public string MatchType { get; set; } = "";

    /// <summary>
    /// The canonical form, or the correction being suggested.
    /// </summary>
    [JsonPropertyName("canonical_form")]
    public string CanonicalForm { get; set; } = "";

    /// <summary>
    /// Confidence score between 0.0 and 1.0; 1.0 denotes an exact match.
    /// </summary>
    [JsonPropertyName("confidence")]
    public double Confidence { get; set; }

    /// <summary>
    /// Number of tokens covered by this match (n-gram length).
    /// Internal bookkeeping only; excluded from the JSON output.
    /// </summary>
    [JsonIgnore]
    public int NgramLength { get; set; }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@

namespace BotSharp.Abstraction.FuzzSharp.Models
{
/// <summary>
/// Outcome of a text analysis call: the input text, the optional token list,
/// the flagged items and how long processing took.
/// </summary>
public class TextAnalysisResponse
{
    /// <summary>
    /// The text that was analyzed, unchanged.
    /// </summary>
    [JsonPropertyName("original")]
    public string Original { get; set; } = "";

    /// <summary>
    /// The tokenized text. Only populated when include_tokens=true;
    /// a null value is left out of the JSON output.
    /// </summary>
    [JsonPropertyName("tokens"), JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)]
    public List<string>? Tokens { get; set; }

    /// <summary>
    /// The flagged items. Filter on the 'match_type' field:
    /// 'domain_term_mapping', 'exact_match', or 'typo_correction'.
    /// </summary>
    [JsonPropertyName("flagged")]
    public List<FlaggedItem> Flagged { get; set; } = new List<FlaggedItem>();

    /// <summary>
    /// Time spent processing, in milliseconds.
    /// </summary>
    [JsonPropertyName("processing_time_ms")]
    public double ProcessingTimeMs { get; set; }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
<Project Sdk="Microsoft.NET.Sdk">

<!-- Build settings. The $(...) values below are not defined in this file; they are
     presumably supplied by shared build properties elsewhere in the repository (TODO confirm). -->
<PropertyGroup>
<TargetFramework>$(TargetFramework)</TargetFramework>
<Nullable>enable</Nullable>
<LangVersion>$(LangVersion)</LangVersion>
<VersionPrefix>$(BotSharpVersion)</VersionPrefix>
<GeneratePackageOnBuild>$(GeneratePackageOnBuild)</GeneratePackageOnBuild>
<GenerateDocumentationFile>$(GenerateDocumentationFile)</GenerateDocumentationFile>
<!-- Build output goes to the "packages" folder under the solution directory. -->
<OutputPath>$(SolutionDir)packages</OutputPath>
</PropertyGroup>

<!-- NuGet dependencies. No Version attribute is given here, so the versions must be
     resolved elsewhere (central package management via Directory.Packages.props). -->
<ItemGroup>
<PackageReference Include="CsvHelper" />
<PackageReference Include="FuzzySharp" />
</ItemGroup>

<!-- Project dependencies. -->
<ItemGroup>
<ProjectReference Include="..\..\Infrastructure\BotSharp.Abstraction\BotSharp.Abstraction.csproj" />
</ItemGroup>
</Project>
21 changes: 21 additions & 0 deletions src/Plugins/BotSharp.Plugin.FuzzySharp/Constants/MatchReason.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@

namespace BotSharp.Plugin.FuzzySharp.Constants
{
/// <summary>
/// String identifiers for the different ways a token can match.
/// </summary>
public static class MatchReason
{
    /// <summary>
    /// The token was resolved through a domain term mapping
    /// (e.g., HVAC -> Air Conditioning/Heating).
    /// </summary>
    public const string DomainTermMapping = "domain_term_mapping";

    /// <summary>
    /// The token is an exact match for a vocabulary entry.
    /// </summary>
    public const string ExactMatch = "exact_match";

    /// <summary>
    /// The token looks like a typo and a correction was suggested for it.
    /// </summary>
    public const string TypoCorrection = "typo_correction";
}
}
30 changes: 30 additions & 0 deletions src/Plugins/BotSharp.Plugin.FuzzySharp/Constants/TextConstants.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@

namespace BotSharp.Plugin.FuzzySharp.Constants
{
/// <summary>
/// Character tables used when splitting text into tokens.
/// </summary>
public static class TextConstants
{
    /// <summary>
    /// Characters that are set apart during tokenization by surrounding them with spaces:
    /// parentheses, brackets, braces, punctuation marks and special symbols.
    /// With these in place "(IH)" is split into "(", "IH", ")".
    /// </summary>
    public static readonly char[] SeparatorChars = new char[]
    {
        // Grouping characters
        '(', ')',
        '[', ']',
        '{', '}',

        // Punctuation
        ',', '.', ';', ':', '!', '?',

        // Symbols
        '=', '@', '#', '$', '%', '^', '&', '*',
        '+', '-', '\\', '|', '<', '>', '~', '`'
    };

    /// <summary>
    /// Whitespace characters that delimit tokens during tokenization:
    /// space, tab, newline, and carriage return.
    /// </summary>
    public static readonly char[] TokenSeparators = new char[] { ' ', '\t', '\n', '\r' };
}
}
Loading
Loading