blob: 31b6d1346e8cd5cbc88410f0de78bd6db55e14cd [file] [log] [blame]
// Copyright 2018 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4
5#include "ml/metrics.h"
6
7#include <algorithm>
8#include <string>
9#include <vector>
10
11#include <base/bind.h>
12#include <base/files/file_path.h>
Qijiang Fanfea991e2020-05-11 15:13:52 +090013#include <base/system/sys_info.h>
Andrew Moylan40ee4fc2018-08-24 15:46:09 +100014#include <base/time/time.h>
15
Honglin Yu7b6c1192020-09-16 10:07:17 +100016#include "ml/process.h"
Honglin Yu21616692021-05-14 11:20:22 +100017#include "ml/request_metrics.h"
Honglin Yu1cd25072019-07-09 11:54:14 +100018#include "ml/util.h"
19
Andrew Moylan40ee4fc2018-08-24 15:46:09 +100020namespace ml {
21
22namespace {
23
24// UMA metric names:
25constexpr char kCpuUsageMetricName[] =
26 "MachineLearningService.CpuUsageMilliPercent";
27constexpr char kMojoConnectionEventMetricName[] =
28 "MachineLearningService.MojoConnectionEvent";
Honglin Yu1cd25072019-07-09 11:54:14 +100029constexpr char kTotalMemoryMetricName[] =
30 "MachineLearningService.TotalMemoryKb";
31constexpr char kPeakTotalMemoryMetricName[] =
32 "MachineLearningService.PeakTotalMemoryKb";
Honglin Yu21616692021-05-14 11:20:22 +100033constexpr char kNumWorkerProcessMetricName[] =
34 "MachineLearningService.NumWorkerProcess";
Andrew Moylan40ee4fc2018-08-24 15:46:09 +100035
36// UMA histogram ranges:
37constexpr int kCpuUsageMinMilliPercent = 1; // 0.001%
38constexpr int kCpuUsageMaxMilliPercent = 100000; // 100%
39constexpr int kCpuUsageBuckets = 25;
40constexpr int kMemoryUsageMinKb = 10; // 10 KB
41constexpr int kMemoryUsageMaxKb = 100000000; // 100 GB
42constexpr int kMemoryUsageBuckets = 100;
Honglin Yu21616692021-05-14 11:20:22 +100043constexpr int kNumWorkerProcessMin = 0;
44constexpr int kNumWorkerProcessMax = 1000;
45constexpr int kNumWorkerProcessBuckets = 100;
Andrew Moylan40ee4fc2018-08-24 15:46:09 +100046
47// chromeos_metrics::CumulativeMetrics constants:
48constexpr char kCumulativeMetricsBackingDir[] = "/var/lib/ml_service/metrics";
Tom Hughes1d1c1922020-08-27 16:16:53 -070049constexpr char kPeakTotalMemoryCumulativeStatName[] = "peak_total_memory_kb";
Andrew Moylan40ee4fc2018-08-24 15:46:09 +100050
51constexpr base::TimeDelta kCumulativeMetricsUpdatePeriod =
52 base::TimeDelta::FromMinutes(5);
53constexpr base::TimeDelta kCumulativeMetricsReportPeriod =
54 base::TimeDelta::FromDays(1);
55
56void RecordCumulativeMetrics(
57 MetricsLibrary* const metrics_library,
58 chromeos_metrics::CumulativeMetrics* const cumulative_metrics) {
59 metrics_library->SendToUMA(
Honglin Yu1cd25072019-07-09 11:54:14 +100060 kPeakTotalMemoryMetricName,
61 cumulative_metrics->Get(kPeakTotalMemoryCumulativeStatName),
Andrew Moylan40ee4fc2018-08-24 15:46:09 +100062 kMemoryUsageMinKb, kMemoryUsageMaxKb, kMemoryUsageBuckets);
63}
64
Honglin Yu7b6c1192020-09-16 10:07:17 +100065// Returns true if getting the RAM of control process succeeds. Otherwise
66// returns false in which case the value of `total_mem_usage` should be
67// ignored.
68// Here we ignore the return status of getting worker processes's RAM usage
69// because there may be a case that the worker process has disappeared but it
70// has not been removed from Process::GetWorkerPidInfoMap(). We do not want this
71// to block the overall metric report. In the future, we may implement some
72// dedicated metrics to report such cases.
73bool GetControlAndWorkerProcessMemoryUsage(size_t* total_mem_usage) {
74 DCHECK(total_mem_usage != nullptr);
75 *total_mem_usage = 0;
76 MemoryUsage usage;
77 // Collect RAM usage for worker processes.
78 // Do not crash if `GetProcessMemoryUsage` fails for worker processes because
79 // maybe some worker process terminates before it is unregistered.
80 for (const auto& pid_info : Process::GetInstance()->GetWorkerPidInfoMap()) {
81 if (GetProcessMemoryUsage(&usage, pid_info.first)) {
82 *total_mem_usage += usage.VmRSSKb + usage.VmSwapKb;
Honglin Yu21616692021-05-14 11:20:22 +100083 } else {
84 RecordProcessErrorEvent(ProcessError::kGetWorkerProcessMemoryUsageFailed);
Honglin Yu7b6c1192020-09-16 10:07:17 +100085 }
86 }
87 // Collect RAM usage for control processes.
88 if (GetProcessMemoryUsage(&usage)) {
89 *total_mem_usage += usage.VmRSSKb + usage.VmSwapKb;
90 return true;
91 } else {
92 return false;
93 }
94}
95
Andrew Moylan40ee4fc2018-08-24 15:46:09 +100096} // namespace
97
98Metrics::Metrics()
Joel Kitching21beaba2019-03-11 15:47:30 +080099 : process_metrics_(base::ProcessMetrics::CreateCurrentProcessMetrics()) {}
Andrew Moylan40ee4fc2018-08-24 15:46:09 +1000100
101void Metrics::StartCollectingProcessMetrics() {
102 if (cumulative_metrics_) {
103 LOG(WARNING) << "Multiple calls to StartCollectingProcessMetrics";
104 return;
105 }
106
Andrew Moylan79b34a42020-07-08 11:13:11 +1000107 // Baseline the CPU usage counter in `process_metrics_` to be zero as of now.
Qijiang Fan7cdb3662019-10-21 16:39:20 +0900108 process_metrics_->GetPlatformIndependentCPUUsage();
Andrew Moylan40ee4fc2018-08-24 15:46:09 +1000109
110 cumulative_metrics_ = std::make_unique<chromeos_metrics::CumulativeMetrics>(
111 base::FilePath(kCumulativeMetricsBackingDir),
Honglin Yu1cd25072019-07-09 11:54:14 +1000112 std::vector<std::string>{kPeakTotalMemoryCumulativeStatName},
Andrew Moylan40ee4fc2018-08-24 15:46:09 +1000113 kCumulativeMetricsUpdatePeriod,
114 base::Bind(&Metrics::UpdateAndRecordMetrics, base::Unretained(this),
115 true /*record_current_metrics*/),
116 kCumulativeMetricsReportPeriod,
117 base::Bind(&RecordCumulativeMetrics,
118 base::Unretained(&metrics_library_)));
119}
120
121void Metrics::UpdateCumulativeMetricsNow() {
122 if (!cumulative_metrics_) {
123 return;
124 }
125 UpdateAndRecordMetrics(false /*record_current_metrics*/,
126 cumulative_metrics_.get());
127}
128
129void Metrics::UpdateAndRecordMetrics(
130 const bool record_current_metrics,
131 chromeos_metrics::CumulativeMetrics* const cumulative_metrics) {
Honglin Yu1cd25072019-07-09 11:54:14 +1000132 size_t usage = 0;
Honglin Yu7b6c1192020-09-16 10:07:17 +1000133 if (!GetControlAndWorkerProcessMemoryUsage(&usage)) {
Honglin Yu1cd25072019-07-09 11:54:14 +1000134 LOG(DFATAL) << "Getting process memory usage failed";
135 return;
136 }
Andrew Moylan40ee4fc2018-08-24 15:46:09 +1000137
138 // Update max memory stats.
Honglin Yu1cd25072019-07-09 11:54:14 +1000139 cumulative_metrics->Max(kPeakTotalMemoryCumulativeStatName,
140 static_cast<int64_t>(usage));
Andrew Moylan40ee4fc2018-08-24 15:46:09 +1000141
142 if (record_current_metrics) {
143 // Record CPU usage (units = milli-percent i.e. 0.001%):
Honglin Yu7b6c1192020-09-16 10:07:17 +1000144 // First get the CPU usage of the control process.
145 auto cpu_usage = process_metrics_->GetPlatformIndependentCPUUsage();
146 // Then get the CPU usages of the worker processes.
147 for (const auto& pid_info : Process::GetInstance()->GetWorkerPidInfoMap()) {
148 cpu_usage +=
149 pid_info.second.process_metrics->GetPlatformIndependentCPUUsage();
150 }
151
Tom Hughes1d1c1922020-08-27 16:16:53 -0700152 const int cpu_usage_milli_percent = static_cast<int>(
Honglin Yu7b6c1192020-09-16 10:07:17 +1000153 1000. * cpu_usage / base::SysInfo::NumberOfProcessors());
Andrew Moylan40ee4fc2018-08-24 15:46:09 +1000154 metrics_library_.SendToUMA(kCpuUsageMetricName, cpu_usage_milli_percent,
155 kCpuUsageMinMilliPercent,
156 kCpuUsageMaxMilliPercent, kCpuUsageBuckets);
157 // Record memory usage:
Tom Hughes1d1c1922020-08-27 16:16:53 -0700158 metrics_library_.SendToUMA(kTotalMemoryMetricName, usage, kMemoryUsageMinKb,
159 kMemoryUsageMaxKb, kMemoryUsageBuckets);
Honglin Yu21616692021-05-14 11:20:22 +1000160
161 // Record how many worker processes.
162 metrics_library_.SendToUMA(
163 kNumWorkerProcessMetricName,
164 Process::GetInstance()->GetWorkerPidInfoMap().size(),
165 kNumWorkerProcessMin, kNumWorkerProcessMax, kNumWorkerProcessBuckets);
Andrew Moylan40ee4fc2018-08-24 15:46:09 +1000166 }
167}
168
169void Metrics::RecordMojoConnectionEvent(const MojoConnectionEvent event) {
Honglin Yuca0caf82020-01-23 12:26:03 +1100170 metrics_library_.SendEnumToUMA(
171 kMojoConnectionEventMetricName, static_cast<int>(event),
172 static_cast<int>(MojoConnectionEvent::kMaxValue) + 1);
Andrew Moylan40ee4fc2018-08-24 15:46:09 +1000173}
174
175} // namespace ml