[TextExtractor] Add space between CJK words and non-CJK (#20926)

* [TextExtractor] add space between CJK words and non-CJK
This commit is contained in:
Weijie Zhao 2022-10-20 23:21:46 +08:00 committed by GitHub
parent d4083abee2
commit d17ac2bf79
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -10,6 +10,7 @@ using System.Globalization;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using System.Windows;
using System.Windows.Input;
@ -146,11 +147,25 @@ internal class ImageMethods
}
else
{
var cjkRegex = new Regex(@"\p{IsCJKUnifiedIdeographs}");
foreach (OcrLine ocrLine in ocrResult.Lines)
{
bool isBeginning = true;
bool isCJKPrev = false;
foreach (OcrWord ocrWord in ocrLine.Words)
{
bool isCJK = cjkRegex.IsMatch(ocrWord.Text);
// Separate adjacent words with a space, except when both the previous and the current word contain CJK ideographs.
if (!isBeginning && (!isCJK || !isCJKPrev))
{
_ = text.Append(' ');
}
_ = text.Append(ocrWord.Text);
isCJKPrev = isCJK;
isBeginning = false;
}
text.Append(Environment.NewLine);