diff --git a/LLama.KernelMemory/LLamaSharp.KernelMemory.csproj b/LLama.KernelMemory/LLamaSharp.KernelMemory.csproj
index cad114341..f62622a47 100644
--- a/LLama.KernelMemory/LLamaSharp.KernelMemory.csproj
+++ b/LLama.KernelMemory/LLamaSharp.KernelMemory.csproj
@@ -27,7 +27,7 @@
-
+
diff --git a/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs b/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs
index 42ba6dbc5..a608c6571 100644
--- a/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs
+++ b/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs
@@ -1,6 +1,5 @@
using LLama;
using LLama.Common;
-using LLama.Native;
using Microsoft.KernelMemory;
using Microsoft.KernelMemory.AI;
@@ -112,5 +111,24 @@ public async Task GenerateEmbeddingAsync(string text, CancellationTok
///
public int CountTokens(string text) => _embedder.Context.Tokenize(text, special: true).Length;
+
+ /// <summary>
+ /// Get the list of tokens for the input text
+ /// </summary>
+ /// <param name="text">Input string to be tokenized</param>
+ /// <returns>Read-only list of tokens for the input text</returns>
+ /// <remarks>
+ /// Throws if text is null. Includes an empty token corresponding to BOS, because addBos is left true to stay consistent with the CountTokens implementation.
+ /// </remarks>
+ public IReadOnlyList<string> GetTokens(string text)
+ {
+ /* see relevant unit tests for important implementation notes regarding unicode */
+ var context = _embedder.Context;
+ var numericTokens = context.Tokenize(text, special: true);
+ var decoder = new StreamingTokenDecoder(context);
+ return numericTokens
+ .Select(x => { decoder.Add(x); return decoder.Read(); })
+ .ToList();
+ }
}
}
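As a hedged illustration of the contract this method (and its twin in LlamaSharpTextGenerator below) is meant to satisfy — not part of the patch; the config path and variable names are assumptions for the example — GetTokens should line up with CountTokens, and concatenating the textual tokens should reproduce the input apart from a BOS-related leading space:

using System;
using LLamaSharp.KernelMemory;

// Sketch only: exercises GetTokens/CountTokens on the embedding generator changed above.
var config = new LLamaSharpConfig("path/to/model.gguf"); // placeholder model path
using var generator = new LLamaSharpTextEmbeddingGenerator(config);

var text = "The quick brown fox jumps over the lazy dog";
var tokens = generator.GetTokens(text);   // textual tokens, one per numeric token
var count = generator.CountTokens(text);  // numeric token count

// Both calls tokenize with special: true, so the counts should match and the
// concatenation should round-trip the input (modulo the leading BOS-related space).
Console.WriteLine($"{count} tokens: {string.Concat(tokens)}");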
diff --git a/LLama.KernelMemory/LlamaSharpTextGenerator.cs b/LLama.KernelMemory/LlamaSharpTextGenerator.cs
index 43b9bed8b..e13e634b3 100644
--- a/LLama.KernelMemory/LlamaSharpTextGenerator.cs
+++ b/LLama.KernelMemory/LlamaSharpTextGenerator.cs
@@ -106,5 +106,23 @@ private static InferenceParams OptionsToParams(TextGenerationOptions options, In
///
public int CountTokens(string text) => _context.Tokenize(text, special: true).Length;
+
+ /// <summary>
+ /// Get the list of tokens for the input text
+ /// </summary>
+ /// <param name="text">Input string to be tokenized</param>
+ /// <returns>Read-only list of tokens for the input text</returns>
+ /// <remarks>
+ /// Throws if text is null. Includes an empty token corresponding to BOS, because addBos is left true to stay consistent with the CountTokens implementation.
+ /// </remarks>
+ public IReadOnlyList<string> GetTokens(string text)
+ {
+ /* see relevant unit tests for important implementation notes regarding unicode */
+ var numericTokens = _context.Tokenize(text, special: true);
+ var decoder = new StreamingTokenDecoder(_context);
+ return numericTokens
+ .Select(x => { decoder.Add(x); return decoder.Read(); })
+ .ToList();
+ }
}
}
diff --git a/LLama.Unittest/KernelMemory/ITextTokenizerTests.cs b/LLama.Unittest/KernelMemory/ITextTokenizerTests.cs
new file mode 100644
index 000000000..4000525cc
--- /dev/null
+++ b/LLama.Unittest/KernelMemory/ITextTokenizerTests.cs
@@ -0,0 +1,117 @@
+using LLama.Common;
+using LLamaSharp.KernelMemory;
+using Microsoft.KernelMemory.AI;
+using Xunit.Abstractions;
+
+namespace LLama.Unittest.KernelMemory
+{
+
+ public abstract class ITextTokenizerTests
+ {
+ private readonly ITestOutputHelper _testOutputHelper;
+
+#pragma warning disable KMEXP00 // Type is for evaluation purposes only and is subject to change or removal in future updates. Suppress this diagnostic to proceed.
+ protected ITextTokenizer? _generator;
+#pragma warning restore KMEXP00 // Type is for evaluation purposes only and is subject to change or removal in future updates. Suppress this diagnostic to proceed.
+
+ protected InferenceParams _infParams;
+ protected LLamaSharpConfig _lsConfig;
+
+ public ITextTokenizerTests(ITestOutputHelper testOutputHelper)
+ {
+ _testOutputHelper = testOutputHelper;
+
+ _infParams = new() { AntiPrompts = ["\n\n"] };
+ _lsConfig = new(Constants.GenerativeModelPath) { DefaultInferenceParams = _infParams };
+
+ testOutputHelper.WriteLine($"Using model {Path.GetFileName(_lsConfig.ModelPath)}");
+ }
+
+
+ [Theory]
+ [InlineData("The quick brown fox jumps over the lazy dog")]
+ [InlineData("Well, here're some special characters!!!")]
+ [InlineData("...___---")]
+ [InlineData("15 + 6 = 21 && 68 * 75 = 5100")]
+ [InlineData(" \n \r\n \t ")]
+ public void GetTokens_ShouldReturnListOfTokensForInputString(string? text)
+ {
+ var tokens = _generator!.GetTokens(text);
+ var tokensCount = _generator.CountTokens(text);
+
+ var expected = " " + text; // the placement of the space corresponding to BOS will vary by model tokenizer
+ var actual = string.Join("", tokens);
+
+ _testOutputHelper.WriteLine($"Tokens for '{text}':");
+ _testOutputHelper.WriteLine(string.Join("", tokens.Select(x => $"({x})")));
+
+ Assert.Equal(expected, actual);
+ Assert.Equal(tokensCount, tokens.Count);
+ }
+
+ /* This is exactly the same test as the non-unicode cases. However, there are reasons why this
+ * should be made a special case and may deviate in the future:
+ *
+ * As of now there appears to be no final word as to how characters that consist of more than one
+ * numeric token should correspond to textual tokens, and results vary according to different
+ * models' tokenizers. For example, given a character 'Z' that corresponds to the numeric tokens {1,2,3},
+ * some tokenizers (llama-2) pad out to the full token count by returning spaces as tokens
+ * (i.e. ' ', ' ', 'Z'), while others (GPT4Tokenizer) pad with the character itself (i.e. 'Z', 'Z', 'Z').
+ *
+ * This is very evident when tokenizing ideograms and emojis, but can arise with various unicode characters
+ * as well. See pull request for more relevant discussion https://github.com/SciSharp/LLamaSharp/pull/862
+ *
+ * Currently the method will remain consistent with the output of ITextTokenizer.CountTokens, meaning
+ * any redundant tokens will not be omitted as long as they are counted by CountTokens.
+ *
+ * StreamingTokenDecoder, while sufficiently useful for this task, was not designed with producing
+ * output for one numeric token at a time in mind, so ITextTokenizer.GetTokens should not be considered
+ * an example of proper use.
+ *
+ * Note: if this message is removed, also remove references to it in LLamaSharpTextEmbeddingGenerator.GetTokens
+ * and LLamaSharpTextGenerator.GetTokens
+ */
+ [Theory]
+ [InlineData("And a little bit of unicode για να κρατήσουμε τα πράγματα ενδιαφέροντα")]
+ [InlineData("猫坐在垫子上 😀🤨🤐😏")]
+ public void GetTokens_Unicode_ShouldReturnListOfTokensForInputString(string? text)
+ {
+ var tokens = _generator!.GetTokens(text);
+ var tokensCount = _generator.CountTokens(text);
+
+ var expected = " " + text; // the placement of the space corresponding to BOS will vary by model tokenizer
+ var actual = string.Join("", tokens);
+
+ _testOutputHelper.WriteLine($"Tokens for '{text}':");
+ _testOutputHelper.WriteLine(string.Join("", tokens.Select(x => $"({x})")));
+
+ Assert.Equal(expected, actual);
+ Assert.Equal(tokensCount, tokens.Count);
+ }
+
+ [Fact]
+ public void GetToken_ShouldThrowForNull()
+ {
+ string? text = null;
+
+ Assert.Throws<ArgumentNullException>(() => { _generator!.GetTokens(text!); });
+ }
+
+ [Fact]
+ public void GetToken_EmptyStringYieldsOneEmptyToken()
+ {
+ var text = "";
+ var expected = "";
+
+ var tokens = _generator!.GetTokens(text);
+ var tokensCount = _generator.CountTokens(text);
+ var actual = tokens.Single();
+
+ _testOutputHelper.WriteLine($"Tokens for '{text}':");
+ _testOutputHelper.WriteLine(string.Join("", tokens.Select(x => $"({x})")));
+
+ Assert.Equal(expected, actual);
+ Assert.Equal(tokensCount, tokens.Count);
+ }
+ }
+}
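The comment block in the unicode test above describes per-token decoding behavior that is easiest to see in isolation. A minimal sketch of what GetTokens does internally with StreamingTokenDecoder (the model path is a placeholder and the exact per-token output is tokenizer-dependent, as the comment notes):

using System;
using LLama;
using LLama.Common;

// Sketch only: decode an emoji one numeric token at a time.
var modelParams = new ModelParams("path/to/model.gguf"); // placeholder model path
using var weights = LLamaWeights.LoadFromFile(modelParams);
using var context = weights.CreateContext(modelParams);

var decoder = new StreamingTokenDecoder(context);
foreach (var token in context.Tokenize("😀", special: true))
{
    decoder.Add(token);
    // Intermediate reads may be empty strings, spaces, or repeated characters
    // depending on the model's tokenizer; only the concatenation of all reads
    // is expected to round-trip the input (plus any BOS-related leading space).
    Console.WriteLine($"'{decoder.Read()}'");
}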
diff --git a/LLama.Unittest/KernelMemory/LLamaSharpTextEmbeddingGeneratorTests.cs b/LLama.Unittest/KernelMemory/LLamaSharpTextEmbeddingGeneratorTests.cs
new file mode 100644
index 000000000..91161b72c
--- /dev/null
+++ b/LLama.Unittest/KernelMemory/LLamaSharpTextEmbeddingGeneratorTests.cs
@@ -0,0 +1,30 @@
+using LLama.Common;
+using LLamaSharp.KernelMemory;
+using Microsoft.KernelMemory.AI;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Text.RegularExpressions;
+using System.Threading.Tasks;
+using Xunit.Abstractions;
+
+namespace LLama.Unittest.KernelMemory
+{
+ public class LLamaSharpTextEmbeddingGeneratorTests : ITextTokenizerTests, IDisposable
+ {
+ private readonly LLamaSharpTextEmbeddingGenerator _embeddingGenerator;
+
+ public LLamaSharpTextEmbeddingGeneratorTests(ITestOutputHelper testOutputHelper) : base(testOutputHelper)
+ {
+ _embeddingGenerator = new LLamaSharpTextEmbeddingGenerator(_lsConfig);
+
+ _generator = _embeddingGenerator;
+ }
+
+ public void Dispose()
+ {
+ _embeddingGenerator.Dispose();
+ }
+ }
+}
diff --git a/LLama.Unittest/KernelMemory/LlamaSharpTextGeneratorTests.cs b/LLama.Unittest/KernelMemory/LlamaSharpTextGeneratorTests.cs
new file mode 100644
index 000000000..02001f8cf
--- /dev/null
+++ b/LLama.Unittest/KernelMemory/LlamaSharpTextGeneratorTests.cs
@@ -0,0 +1,34 @@
+using LLama.Common;
+using LLamaSharp.KernelMemory;
+using Microsoft.KernelMemory.AI;
+using System;
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.Linq;
+using System.Reflection.Emit;
+using System.Text;
+using System.Text.RegularExpressions;
+using System.Threading.Tasks;
+using Xunit.Abstractions;
+using Xunit.Sdk;
+using static System.Net.Mime.MediaTypeNames;
+
+namespace LLama.Unittest.KernelMemory
+{
+ public class LlamaSharpTextGeneratorTests : ITextTokenizerTests, IDisposable
+ {
+ private readonly LlamaSharpTextGenerator _textGenerator;
+
+ public LlamaSharpTextGeneratorTests(ITestOutputHelper testOutputHelper) : base(testOutputHelper)
+ {
+ _textGenerator = new LlamaSharpTextGenerator(_lsConfig);
+
+ _generator = _textGenerator;
+ }
+
+ public void Dispose()
+ {
+ _textGenerator.Dispose();
+ }
+ }
+}
diff --git a/LLama.Unittest/LLama.Unittest.csproj b/LLama.Unittest/LLama.Unittest.csproj
index 4ddbd1974..95d4cbc5e 100644
--- a/LLama.Unittest/LLama.Unittest.csproj
+++ b/LLama.Unittest/LLama.Unittest.csproj
@@ -1,4 +1,4 @@
-
+
net8.0
@@ -29,31 +29,16 @@
-
+
-
+
-
+
-
+
@@ -63,14 +48,11 @@
+
-
-
-
-
PreserveNewest