ml: use unicode substr instead of std::string::substr. The indices returned by tclib are unicode codepoints rather than byte offsets, so we should use unicode substr instead of std::string::substr. BUG=none TEST=on workstation, it works properly for case "350°F" now. Change-Id: If686b28d2628dfcba5f1c0af74256f4e9fe59826 Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/platform2/+/2230729 Reviewed-by: Andrew Moylan <amoylan@chromium.org> Tested-by: Honglin Yu <honglinyu@chromium.org> Commit-Queue: Honglin Yu <honglinyu@chromium.org>

commit: 568fc9a024b4edec5917da89d632b6fa3f13cf04 [log] [tgz]
author: Honglin Yu <honglinyu@chromium.org> Fri Jun 05 11:57:21 2020 +1000
committer: Commit Bot <commit-bot@chromium.org> Thu Jun 11 07:43:07 2020 +0000
tree: 09424ee945c85abec88a096975e7673d7072f400
parent: 86eb942f652f33242ea8b3f4c844b6ad4370a03d [diff] [blame]
diff --git a/ml/text_classifier_impl.cc b/ml/text_classifier_impl.cc
index 6d8a62b..d6bbcc7 100644
--- a/ml/text_classifier_impl.cc
+++ b/ml/text_classifier_impl.cc

@@ -8,6 +8,7 @@
 #include <vector>
 
 #include <base/logging.h>
+#include <utils/utf8/unicodetext.h>
 
 #include "ml/mojom/text_classifier.mojom.h"
 #include "ml/request_metrics.h"
@@ -111,9 +112,11 @@
         // For the other types, just encode the substring into string_value.
         // TODO(honglinyu): add data extraction for more types when needed
         // and available.
-        entity_data->set_string_value(request->text.substr(
-            annotated_result.span.first,
-            annotated_result.span.second - annotated_result.span.first));
+        // Note that the annotator returns indices in unicode codepoints.
+        entity_data->set_string_value(
+            libtextclassifier3::UTF8ToUnicodeText(request->text, false)
+                .UTF8Substring(annotated_result.span.first,
+                               annotated_result.span.second));
       }
 
       // Second, create the entity.
commit	568fc9a024b4edec5917da89d632b6fa3f13cf04	[log] [tgz]
author	Honglin Yu <honglinyu@chromium.org>	Fri Jun 05 11:57:21 2020 +1000
committer	Commit Bot <commit-bot@chromium.org>	Thu Jun 11 07:43:07 2020 +0000
tree	09424ee945c85abec88a096975e7673d7072f400
parent	86eb942f652f33242ea8b3f4c844b6ad4370a03d [diff] [blame]