Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add ability to skip spellcheck for some terms and leave them unchanged #138

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 88 additions & 0 deletions SymSpell.Test/SymSpellLookupCompoundTests.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
using NUnit.Framework;
using System.Text.RegularExpressions;

namespace symspell.Test
{
    /// <summary>
    /// Exercises <c>LookupCompound</c> on multi-word input, both with and without
    /// the optional predicate that marks terms to be left unchanged.
    /// </summary>
    [TestFixture]
    public class SymSpellLookupCompoundTests
    {
        // Input in which "quarter" is typed as the two fragments "quar" and "ter".
        private const string SplitQuarterInput = "in te dhird quar ter oflast";

        // Input containing the non-dictionary token "24/7".
        private const string OfficeHoursInput = "visit our offices 24/7";

        private SymSpell _symSpell;

        /// <summary>Builds the small dictionary shared by every test in this fixture.</summary>
        [OneTimeSetUp]
        public void Init()
        {
            _symSpell = new SymSpell();
            _symSpell.CreateDictionaryEntry("in", 5);

            // Registration order is kept as-is; "last" is deliberately listed three times
            // because the original setup registers it three times.
            // NOTE(review): presumably repeated registrations affect the stored count - confirm whether intended.
            string[] wordsWithCountTen =
            {
                "the", "third", "quarter", "of", "last", "visit",
                "our", "offices", "last", "last", "a"
            };
            foreach (string word in wordsWithCountTen)
            {
                _symSpell.CreateDictionaryEntry(word, 10);
            }
        }

        /// <summary>Misspelled and merged dictionary words are corrected.</summary>
        [Test]
        public void SuggestWordsInDictionary_ReturnsCorrectedText()
        {
            var suggestions = _symSpell.LookupCompound("in te dhird qarter oflast");

            Assert.AreEqual(1, suggestions.Count);
            Assert.AreEqual("in the third quarter of last", suggestions[0].term);
        }

        /// <summary>A word without any suggestion ("jear") is passed through as typed.</summary>
        [Test]
        public void NoSuggestForWord_ReturnsUnchanged()
        {
            var suggestions = _symSpell.LookupCompound("in te dhird qarter oflast jear", 1);

            Assert.AreEqual(1, suggestions.Count);
            Assert.AreEqual("in the third quarter of last jear", suggestions[0].term);
        }

        /// <summary>A word typed as two fragments is merged back into one word.</summary>
        [Test]
        public void SplittedWord_ReturnsCorrectedWord()
        {
            var suggestions = _symSpell.LookupCompound(SplitQuarterInput);

            Assert.AreEqual(1, suggestions.Count);
            Assert.AreEqual("in the third quarter of last", suggestions[0].term);
        }

        /// <summary>Without a skip predicate, digit tokens are replaced by dictionary words.</summary>
        [Test]
        public void DigitsWithoutSkipFunction_Replaced()
        {
            var suggestions = _symSpell.LookupCompound(OfficeHoursInput);

            Assert.AreEqual(1, suggestions.Count);
            Assert.AreEqual("visit our offices of a", suggestions[0].term);
        }

        /// <summary>With a digits-only skip predicate, digit tokens are kept in the output.</summary>
        [TestCase("visit our offices 24/7", "visit our offices 24 7")]
        [TestCase("th rd", "third")]
        [TestCase("th 3 rd", "the 3 of")]
        public void SkipDigitWords_ReturnsDigits(string source, string expected)
        {
            var digitsOnly = new Regex("^\\d+$", RegexOptions.Compiled);

            var suggestions = _symSpell.LookupCompound(source, 2, term => digitsOnly.IsMatch(term));

            Assert.AreEqual(1, suggestions.Count);
            Assert.AreEqual(expected, suggestions[0].term);
        }

        /// <summary>Skipping the first fragment ("quar") leaves it in the output as typed.</summary>
        [Test]
        public void SplittedWordAndFirstPartSkiped_ReturnsSplitted()
        {
            var suggestions = _symSpell.LookupCompound(SplitQuarterInput, 2, word => word == "quar");

            Assert.AreEqual(1, suggestions.Count);
            Assert.AreEqual("in the third quar the of last", suggestions[0].term);
        }

        /// <summary>Skipping the second fragment ("ter") leaves it in the output as typed.</summary>
        [Test]
        public void SplittedWordAndSecondPartSkiped_ReturnsSplitted()
        {
            var suggestions = _symSpell.LookupCompound(SplitQuarterInput, 2, word => word == "ter");

            Assert.AreEqual(1, suggestions.Count);
            Assert.AreEqual("in the third our ter of last", suggestions[0].term);
        }
    }
}
49 changes: 28 additions & 21 deletions SymSpell/SymSpell.cs
Original file line number Diff line number Diff line change
Expand Up @@ -848,44 +848,50 @@ public List<SuggestItem> LookupCompound(string input)

/// <summary>Find suggested spellings for a multi-word input string (supports word splitting/merging).</summary>
/// <param name="input">The string being spell checked.</param>
/// <param name="maxEditDistance">The maximum edit distance between input and suggested words.</param>
/// <param name="editDistanceMax">The maximum edit distance between input and suggested words.</param>
/// <param name="skipSpellcheck">The function to check if a term should remain unchanged.</param>
/// <returns>A List of SuggestItem object representing suggested correct spellings for the input string.</returns>
public List<SuggestItem> LookupCompound(string input, int editDistanceMax)
public List<SuggestItem> LookupCompound(string input, int editDistanceMax, Func<string,bool> skipSpellcheck = null)
{
//parse input string into single terms
string[] termList1 = ParseWords(input);

List<SuggestItem> suggestions = new List<SuggestItem>(); //suggestions for a single term
List<SuggestItem> suggestionParts = new List<SuggestItem>(); //1 line with separate parts
var distanceComparer = new EditDistance(this.distanceAlgorithm);
var termsToSkip = new HashSet<string>();

//translate every term to its best suggestion, otherwise it remains unchanged
bool lastCombi = false;
for (int i = 0; i < termList1.Length; i++)
{
// if skipSpellcheck returns true for term, leave it unchanged
if (skipSpellcheck != null && skipSpellcheck(termList1[i]))
{
termsToSkip.Add(termList1[i]);
suggestionParts.Add(CreateTermSuggestItem(termList1[i], editDistanceMax));
goto nextTerm;
}

suggestions = Lookup(termList1[i], Verbosity.Top, editDistanceMax);

//combi check, always before split
if ((i > 0) && !lastCombi)
if ((i > 0) && !lastCombi && !termsToSkip.Contains(termList1[i - 1]))
{
List<SuggestItem> suggestionsCombi = Lookup(termList1[i - 1] + termList1[i], Verbosity.Top, editDistanceMax);

if (suggestionsCombi.Count > 0)
{
SuggestItem best1 = suggestionParts[suggestionParts.Count - 1];
SuggestItem best2 = new SuggestItem();
SuggestItem best2;
if (suggestions.Count > 0)
{
best2 = suggestions[0];
}
else
{
//unknown word
best2.term = termList1[i];
//estimated edit distance
best2.distance = editDistanceMax + 1;
//estimated word occurrence probability P=10 / (N * 10^word length l)
best2.count = (long)((double)10 / Math.Pow((double)10, (double)best2.term.Length)); // 0;
best2 = CreateTermSuggestItem(termList1[i], editDistanceMax);
}

//distance1=edit distance between 2 split terms and their best corrections : as comparative value for the combination
Expand Down Expand Up @@ -988,22 +994,12 @@ public List<SuggestItem> LookupCompound(string input, int editDistanceMax)
}
else
{
SuggestItem si = new SuggestItem();
si.term = termList1[i];
//estimated word occurrence probability P=10 / (N * 10^word length l)
si.count = (long)((double)10 / Math.Pow((double)10, (double)si.term.Length));
si.distance = editDistanceMax + 1;
suggestionParts.Add(si);
suggestionParts.Add(CreateTermSuggestItem(termList1[i], editDistanceMax));
}
}
else
{
SuggestItem si = new SuggestItem();
si.term = termList1[i];
//estimated word occurrence probability P=10 / (N * 10^word length l)
si.count = (long)((double)10 / Math.Pow((double)10, (double)si.term.Length));
si.distance = editDistanceMax + 1;
suggestionParts.Add(si);
suggestionParts.Add(CreateTermSuggestItem(termList1[i], editDistanceMax));
}
}
nextTerm:;
Expand All @@ -1024,6 +1020,17 @@ public List<SuggestItem> LookupCompound(string input, int editDistanceMax)
return suggestionsLine;
}

/// <summary>Builds the suggestion used when a term is kept exactly as typed.</summary>
/// <param name="term">The input term to carry over unchanged.</param>
/// <param name="editDistanceMax">The maximum edit distance of the current lookup.</param>
/// <returns>
/// A SuggestItem whose term is <paramref name="term"/>, whose distance is
/// <paramref name="editDistanceMax"/> + 1, and whose count is the estimate described below.
/// </returns>
private static SuggestItem CreateTermSuggestItem(string term, int editDistanceMax)
{
    //estimated word occurrence probability P=10 / (N * 10^word length l)
    double estimatedCount = 10d / Math.Pow(10d, term.Length);

    SuggestItem item = new SuggestItem();
    item.term = term;
    //the cast truncates, so any term of two or more characters gets a count of 0
    item.count = (long)estimatedCount;
    //one past the allowed maximum
    item.distance = editDistanceMax + 1;
    return item;
}

//######

//WordSegmentation divides a string into words by inserting missing spaces at the appropriate positions
Expand Down