ml: use unicode substr instead of std::string::substr.
The indices returned by tclib are Unicode codepoint offsets, not byte
offsets, so use a Unicode-aware substring instead of std::string::substr.
BUG=none
TEST=on workstation, it works properly for case "350°F" now.
Change-Id: If686b28d2628dfcba5f1c0af74256f4e9fe59826
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/platform2/+/2230729
Reviewed-by: Andrew Moylan <amoylan@chromium.org>
Tested-by: Honglin Yu <honglinyu@chromium.org>
Commit-Queue: Honglin Yu <honglinyu@chromium.org>
diff --git a/ml/text_classifier_impl.cc b/ml/text_classifier_impl.cc
index 6d8a62b..d6bbcc7 100644
--- a/ml/text_classifier_impl.cc
+++ b/ml/text_classifier_impl.cc
@@ -8,6 +8,7 @@
#include <vector>
#include <base/logging.h>
+#include <utils/utf8/unicodetext.h>
#include "ml/mojom/text_classifier.mojom.h"
#include "ml/request_metrics.h"
@@ -111,9 +112,11 @@
// For the other types, just encode the substring into string_value.
// TODO(honglinyu): add data extraction for more types when needed
// and available.
- entity_data->set_string_value(request->text.substr(
- annotated_result.span.first,
- annotated_result.span.second - annotated_result.span.first));
+ // Note that the indices returned by the annotator are Unicode codepoints.
+ entity_data->set_string_value(
+ libtextclassifier3::UTF8ToUnicodeText(request->text, false)
+ .UTF8Substring(annotated_result.span.first,
+ annotated_result.span.second));
}
// Second, create the entity.