diff --git a/dotnet/src/SemanticKernel.Core/Text/TextChunker.cs b/dotnet/src/SemanticKernel.Core/Text/TextChunker.cs
index d8f4a32b4e3c..c51873290f70 100644
--- a/dotnet/src/SemanticKernel.Core/Text/TextChunker.cs
+++ b/dotnet/src/SemanticKernel.Core/Text/TextChunker.cs
@@ -195,15 +195,12 @@ private static List ProcessParagraphs(List paragraphs, int adjus
                 var lastParagraphTokens = lastParagraph.Split(s_spaceChar, StringSplitOptions.RemoveEmptyEntries);
                 var secondLastParagraphTokens = secondLastParagraph.Split(s_spaceChar, StringSplitOptions.RemoveEmptyEntries);
 
-                var lastParagraphTokensCount = lastParagraphTokens.Length;
-                var secondLastParagraphTokensCount = secondLastParagraphTokens.Length;
+                // Build the merged text up front so the limit check measures exactly the string that would be stored.
+                var mergedParagraph = $"{string.Join(" ", secondLastParagraphTokens)} {string.Join(" ", lastParagraphTokens)}";
 
-                if (lastParagraphTokensCount + secondLastParagraphTokensCount <= adjustedMaxTokensPerParagraph)
+                if (GetTokenCount(mergedParagraph, tokenCounter) <= adjustedMaxTokensPerParagraph)
                 {
-                    var newSecondLastParagraph = string.Join(" ", secondLastParagraphTokens);
-                    var newLastParagraph = string.Join(" ", lastParagraphTokens);
-
-                    paragraphs[paragraphs.Count - 2] = $"{newSecondLastParagraph} {newLastParagraph}";
+                    paragraphs[paragraphs.Count - 2] = mergedParagraph;
                     paragraphs.RemoveAt(paragraphs.Count - 1);
                 }
             }
diff --git a/dotnet/src/SemanticKernel.UnitTests/Text/TextChunkerTests.cs b/dotnet/src/SemanticKernel.UnitTests/Text/TextChunkerTests.cs
index a31f077eef66..fd53dedf1504 100644
--- a/dotnet/src/SemanticKernel.UnitTests/Text/TextChunkerTests.cs
+++ b/dotnet/src/SemanticKernel.UnitTests/Text/TextChunkerTests.cs
@@ -558,6 +558,18 @@ public void CanSplitTextParagraphsWithCustomTokenCounter()
         Assert.Equal(expected, result);
     }
 
+    [Fact]
+    public void SplitTextParagraphsDoesNotMergeShortLastParagraphPastTokenLimit()
+    {
+        var input = new[] { "123456789", "x" };
+
+        // With one token per character, the merged text "123456789 x" is 11 tokens, over the limit of 10.
+        var result = TextChunker.SplitPlainTextParagraphs(input, 10, tokenCounter: text => text.Length);
+
+        Assert.Equal(["123456789", "x"], result);
+        Assert.All(result, paragraph => Assert.True(paragraph.Length <= 10, $"Paragraph exceeded token limit: {paragraph}"));
+    }
+
     [Fact]
     public void CanSplitTextParagraphsWithOverlapAndCustomTokenCounter()
     {