diff --git a/LLama.KernelMemory/LLamaSharp.KernelMemory.csproj b/LLama.KernelMemory/LLamaSharp.KernelMemory.csproj
index cad114341..f62622a47 100644
--- a/LLama.KernelMemory/LLamaSharp.KernelMemory.csproj
+++ b/LLama.KernelMemory/LLamaSharp.KernelMemory.csproj
@@ -27,7 +27,7 @@
-
+
diff --git a/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs b/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs
index 42ba6dbc5..a608c6571 100644
--- a/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs
+++ b/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs
@@ -1,6 +1,5 @@
 using LLama;
 using LLama.Common;
-using LLama.Native;
 using Microsoft.KernelMemory;
 using Microsoft.KernelMemory.AI;
 
@@ -112,5 +111,24 @@ public async Task<Embedding> GenerateEmbeddingAsync(string text, CancellationTok
         /// <inheritdoc/>
         public int CountTokens(string text) => _embedder.Context.Tokenize(text, special: true).Length;
+
+        /// <summary>
+        /// Get the list of tokens for the input text
+        /// </summary>
+        /// <param name="text">Input string to be tokenized</param>
+        /// <returns>Read-only list of tokens for the input text</returns>
+        /// <remarks>
+        /// It throws if text is null, and includes an empty stop token because addBos is left true to be consistent with the CountTokens implementation.
+        /// </remarks>
+        public IReadOnlyList<string> GetTokens(string text)
+        {
+            /* see relevant unit tests for important implementation notes regarding unicode */
+            var context = _embedder.Context;
+            var numericTokens = context.Tokenize(text, special: true);
+            var decoder = new StreamingTokenDecoder(context);
+            return numericTokens
+                .Select(x => { decoder.Add(x); return decoder.Read(); })
+                .ToList();
+        }
     }
 }
diff --git a/LLama.KernelMemory/LlamaSharpTextGenerator.cs b/LLama.KernelMemory/LlamaSharpTextGenerator.cs
index 43b9bed8b..e13e634b3 100644
--- a/LLama.KernelMemory/LlamaSharpTextGenerator.cs
+++ b/LLama.KernelMemory/LlamaSharpTextGenerator.cs
@@ -106,5 +106,23 @@ private static InferenceParams OptionsToParams(TextGenerationOptions options, In
         /// <inheritdoc/>
         public int CountTokens(string text) => _context.Tokenize(text, special: true).Length;
+
+        /// <summary>
+        /// Get the list of tokens for the input text
+        /// </summary>
+        /// <param name="text">Input string to be tokenized</param>
+        /// <returns>Read-only list of tokens for the input text</returns>
+        /// <remarks>
+        /// It throws if text is null, and includes an empty stop token because addBos is left true to be consistent with the CountTokens implementation.
+        /// </remarks>
+        public IReadOnlyList<string> GetTokens(string text)
+        {
+            /* see relevant unit tests for important implementation notes regarding unicode */
+            var numericTokens = _context.Tokenize(text, special: true);
+            var decoder = new StreamingTokenDecoder(_context);
+            return numericTokens
+                .Select(x => { decoder.Add(x); return decoder.Read(); })
+                .ToList();
+        }
     }
 }
diff --git a/LLama.Unittest/KernelMemory/ITextTokenizerTests.cs b/LLama.Unittest/KernelMemory/ITextTokenizerTests.cs
new file mode 100644
index 000000000..4000525cc
--- /dev/null
+++ b/LLama.Unittest/KernelMemory/ITextTokenizerTests.cs
@@ -0,0 +1,117 @@
+using LLama.Common;
+using LLamaSharp.KernelMemory;
+using Microsoft.KernelMemory.AI;
+using Xunit.Abstractions;
+
+namespace LLama.Unittest.KernelMemory
+{
+
+    public abstract class ITextTokenizerTests
+    {
+        private readonly ITestOutputHelper _testOutputHelper;
+
+#pragma warning disable KMEXP00 // Type is for evaluation purposes only and is subject to change or removal in future updates. Suppress this diagnostic to proceed.
+        protected ITextTokenizer? _generator;
+#pragma warning restore KMEXP00 // Type is for evaluation purposes only and is subject to change or removal in future updates. Suppress this diagnostic to proceed.
+
+        protected InferenceParams _infParams;
+        protected LLamaSharpConfig _lsConfig;
+
+        public ITextTokenizerTests(ITestOutputHelper testOutputHelper)
+        {
+            _testOutputHelper = testOutputHelper;
+
+            _infParams = new() { AntiPrompts = ["\n\n"] };
+            _lsConfig = new(Constants.GenerativeModelPath) { DefaultInferenceParams = _infParams };
+
+            testOutputHelper.WriteLine($"Using model {Path.GetFileName(_lsConfig.ModelPath)}");
+        }
+
+
+        [Theory]
+        [InlineData("The quick brown fox jumps over the lazy dog")]
+        [InlineData("Well, here're some special characters!!!")]
+        [InlineData("...___---")]
+        [InlineData("15 + 6 = 21 && 68 * 75 = 5100")]
+        [InlineData(" \n \r\n \t ")]
+        public void GetTokens_ShouldReturnListOfTokensForInputString(string? text)
+        {
+            var tokens = _generator!.GetTokens(text);
+            var tokensCount = _generator.CountTokens(text);
+
+            var expected = " " + text; // the placement of the space corresponding to BOS will vary by model tokenizer
+            var actual = string.Join("", tokens);
+
+            _testOutputHelper.WriteLine($"Tokens for '{text}':");
+            _testOutputHelper.WriteLine(string.Join("", tokens.Select(x => $"({x})")));
+
+            Assert.Equal(expected, actual);
+            Assert.Equal(tokensCount, tokens.Count);
+        }
+
+        /* This is exactly the same test as the non-unicode cases. However, there are reasons why this
+         * should be made a special case and may deviate in the future:
+         *
+         * As of now there appears to be no final word as to how characters that consist of more than one
+         * numeric token should correspond to textual tokens, and results vary according to different
+         * models' tokenizers. For example, given a character 'Z' that corresponds to the numeric tokens {1,2,3},
+         * some (llama-2) will pad the length of the total number of tokens by returning spaces as tokens
+         * (i.e. ' ', ' ', 'Z') while others (GPT4Tokenizer) will pad with the character itself (i.e. 'Z','Z','Z').
+         *
+         * This is very evident when tokenizing ideograms and emojis, but can arise with various unicode characters
+         * as well. See the pull request for more relevant discussion: https://github.com/SciSharp/LLamaSharp/pull/862
+         *
+         * Currently the method will remain consistent with the output of ITextTokenizer.CountTokens, meaning
+         * any redundant tokens will not be omitted as long as they are counted by CountTokens.
+         *
+         * StreamingTokenDecoder, while sufficiently useful for this task, was not designed with producing
+         * output for one numeric token at a time in mind, so ITextTokenizer.GetTokens should not be considered
+         * an example of proper use.
+         *
+         * Note: if this message is removed, also remove references to it in LLamaSharpTextEmbeddingGenerator.GetTokens
+         * and LLamaSharpTextGenerator.GetTokens
+         */
+        [Theory]
+        [InlineData("And a little bit of unicode για να κρατήσουμε τα πράγματα ενδιαφέροντα")]
+        [InlineData("猫坐在垫子上 😀🤨🤐😏")]
+        public void GetTokens_Unicode_ShouldReturnListOfTokensForInputString(string? text)
+        {
+            var tokens = _generator!.GetTokens(text);
+            var tokensCount = _generator.CountTokens(text);
+
+            var expected = " " + text; // the placement of the space corresponding to BOS will vary by model tokenizer
+            var actual = string.Join("", tokens);
+
+            _testOutputHelper.WriteLine($"Tokens for '{text}':");
+            _testOutputHelper.WriteLine(string.Join("", tokens.Select(x => $"({x})")));
+
+            Assert.Equal(expected, actual);
+            Assert.Equal(tokensCount, tokens.Count);
+        }
+
+        [Fact]
+        public void GetToken_ShouldThrowForNull()
+        {
+            string? text = null;
+
+            Assert.Throws<ArgumentNullException>(() => { _generator!.GetTokens(text!); });
+        }
+
+        [Fact]
+        public void GetToken_EmptyStringYieldsOneEmptyToken()
+        {
+            var text = "";
+            var expected = "";
+
+            var tokens = _generator!.GetTokens(text);
+            var tokensCount = _generator.CountTokens(text);
+            var actual = tokens.Single();
+
+            _testOutputHelper.WriteLine($"Tokens for '{text}':");
+            _testOutputHelper.WriteLine(string.Join("", tokens.Select(x => $"({x})")));
+
+            Assert.Equal(expected, actual);
+            Assert.Equal(tokensCount, tokens.Count);
+        }
+    }
+}
diff --git a/LLama.Unittest/KernelMemory/LLamaSharpTextEmbeddingGeneratorTests.cs b/LLama.Unittest/KernelMemory/LLamaSharpTextEmbeddingGeneratorTests.cs
new file mode 100644
index 000000000..91161b72c
--- /dev/null
+++ b/LLama.Unittest/KernelMemory/LLamaSharpTextEmbeddingGeneratorTests.cs
@@ -0,0 +1,30 @@
+using LLama.Common;
+using LLamaSharp.KernelMemory;
+using Microsoft.KernelMemory.AI;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Text.RegularExpressions;
+using System.Threading.Tasks;
+using Xunit.Abstractions;
+
+namespace LLama.Unittest.KernelMemory
+{
+    public class LLamaSharpTextEmbeddingGeneratorTests : ITextTokenizerTests, IDisposable
+    {
+        private readonly LLamaSharpTextEmbeddingGenerator _embeddingGenerator;
+
+        public LLamaSharpTextEmbeddingGeneratorTests(ITestOutputHelper testOutputHelper) : base(testOutputHelper)
+        {
+            _embeddingGenerator = new LLamaSharpTextEmbeddingGenerator(_lsConfig);
+
+            _generator = _embeddingGenerator;
+        }
+
+        public void Dispose()
+        {
+            _embeddingGenerator.Dispose();
+        }
+    }
+}
diff --git a/LLama.Unittest/KernelMemory/LlamaSharpTextGeneratorTests.cs b/LLama.Unittest/KernelMemory/LlamaSharpTextGeneratorTests.cs
new file mode 100644
index 000000000..02001f8cf
--- /dev/null
+++ b/LLama.Unittest/KernelMemory/LlamaSharpTextGeneratorTests.cs
@@ -0,0 +1,34 @@
+using LLama.Common;
+using LLamaSharp.KernelMemory;
+using Microsoft.KernelMemory.AI;
+using System;
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.Linq;
+using System.Reflection.Emit;
+using System.Text;
+using System.Text.RegularExpressions;
+using System.Threading.Tasks;
+using Xunit.Abstractions;
+using Xunit.Sdk;
+using static System.Net.Mime.MediaTypeNames;
+
+namespace LLama.Unittest.KernelMemory
+{
+    public class LlamaSharpTextGeneratorTests : ITextTokenizerTests, IDisposable
+    {
+        private readonly LlamaSharpTextGenerator _textGenerator;
+
+        public LlamaSharpTextGeneratorTests(ITestOutputHelper testOutputHelper) : base(testOutputHelper)
+        {
+            _textGenerator = new LlamaSharpTextGenerator(_lsConfig);
+
+            _generator = _textGenerator;
+        }
+
+        public void Dispose()
+        {
+            _textGenerator.Dispose();
+        }
+    }
+}
diff --git a/LLama.Unittest/LLama.Unittest.csproj b/LLama.Unittest/LLama.Unittest.csproj
index 4ddbd1974..95d4cbc5e 100644
--- a/LLama.Unittest/LLama.Unittest.csproj
+++ b/LLama.Unittest/LLama.Unittest.csproj
@@ -1,4 +1,4 @@
-
+
     <TargetFramework>net8.0</TargetFramework>
@@ -29,31 +29,16 @@
-
+
-
+
-
+
-
+
@@ -63,14 +48,11 @@
+
-
-
-
-
     <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
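
For reference, a minimal usage sketch of the new `GetTokens` member next to the existing `CountTokens`, mirroring the contract the tests above assert. This is not part of the patch; the model path is a placeholder, and the surrounding setup is an assumption:

```csharp
using System;
using LLamaSharp.KernelMemory;

// A minimal sketch, assuming a local GGUF model at a placeholder path.
var config = new LLamaSharpConfig("path/to/model.gguf");
using var generator = new LlamaSharpTextGenerator(config);

var text = "The quick brown fox";
var tokens = generator.GetTokens(text); // textual tokens, one per numeric token

// GetTokens stays consistent with CountTokens: one entry per counted token.
Console.WriteLine(tokens.Count == generator.CountTokens(text)); // True

// Concatenating the textual tokens reproduces the input, modulo the leading
// space some model tokenizers emit alongside the BOS token (see tests above).
Console.WriteLine(string.Join("", tokens));
```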