ml: use unicode substr instead of std::string::substr.

The indices returned by tclib are unicode codepoint offsets, not byte
offsets, so we should use a unicode-aware substring instead of
std::string::substr.

BUG=none
TEST=on a workstation, the "350°F" case now works properly.

Change-Id: If686b28d2628dfcba5f1c0af74256f4e9fe59826
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/platform2/+/2230729
Reviewed-by: Andrew Moylan <amoylan@chromium.org>
Tested-by: Honglin Yu <honglinyu@chromium.org>
Commit-Queue: Honglin Yu <honglinyu@chromium.org>
diff --git a/ml/text_classifier_impl.cc b/ml/text_classifier_impl.cc
index 6d8a62b..d6bbcc7 100644
--- a/ml/text_classifier_impl.cc
+++ b/ml/text_classifier_impl.cc
@@ -8,6 +8,7 @@
 #include <vector>
 
 #include <base/logging.h>
+#include <utils/utf8/unicodetext.h>
 
 #include "ml/mojom/text_classifier.mojom.h"
 #include "ml/request_metrics.h"
@@ -111,9 +112,11 @@
         // For the other types, just encode the substring into string_value.
         // TODO(honglinyu): add data extraction for more types when needed
         // and available.
-        entity_data->set_string_value(request->text.substr(
-            annotated_result.span.first,
-            annotated_result.span.second - annotated_result.span.first));
+        // Note: the indices returned by the annotator are unicode codepoints.
+        entity_data->set_string_value(
+            libtextclassifier3::UTF8ToUnicodeText(request->text, false)
+                .UTF8Substring(annotated_result.span.first,
+                               annotated_result.span.second));
       }
 
       // Second, create the entity.