Andrew Moylan | 40ee4fc | 2018-08-24 15:46:09 +1000 | [diff] [blame] | 1 | // Copyright 2018 The Chromium OS Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style license that can be |
| 3 | // found in the LICENSE file. |
| 4 | |
| 5 | #include "ml/metrics.h" |
| 6 | |
| 7 | #include <algorithm> |
| 8 | #include <string> |
| 9 | #include <vector> |
| 10 | |
| 11 | #include <base/bind.h> |
| 12 | #include <base/files/file_path.h> |
Qijiang Fan | fea991e | 2020-05-11 15:13:52 +0900 | [diff] [blame] | 13 | #include <base/system/sys_info.h> |
Andrew Moylan | 40ee4fc | 2018-08-24 15:46:09 +1000 | [diff] [blame] | 14 | #include <base/time/time.h> |
| 15 | |
Honglin Yu | 7b6c119 | 2020-09-16 10:07:17 +1000 | [diff] [blame] | 16 | #include "ml/process.h" |
Honglin Yu | 2161669 | 2021-05-14 11:20:22 +1000 | [diff] [blame] | 17 | #include "ml/request_metrics.h" |
Honglin Yu | 1cd2507 | 2019-07-09 11:54:14 +1000 | [diff] [blame] | 18 | #include "ml/util.h" |
| 19 | |
Andrew Moylan | 40ee4fc | 2018-08-24 15:46:09 +1000 | [diff] [blame] | 20 | namespace ml { |
| 21 | |
| 22 | namespace { |
| 23 | |
// Names of the UMA histograms reported from this file.
constexpr char kCpuUsageMetricName[] =
    "MachineLearningService.CpuUsageMilliPercent";
constexpr char kMojoConnectionEventMetricName[] =
    "MachineLearningService.MojoConnectionEvent";
constexpr char kTotalMemoryMetricName[] =
    "MachineLearningService.TotalMemoryKb";
constexpr char kPeakTotalMemoryMetricName[] =
    "MachineLearningService.PeakTotalMemoryKb";
constexpr char kNumWorkerProcessMetricName[] =
    "MachineLearningService.NumWorkerProcess";

// Histogram ranges (minimum, maximum, bucket count) for the metrics above.
// CPU usage is expressed in milli-percent, i.e. units of 0.001%.
constexpr int kCpuUsageMinMilliPercent = 1;        // 0.001%
constexpr int kCpuUsageMaxMilliPercent = 100'000;  // 100%
constexpr int kCpuUsageBuckets = 25;
// Memory usage is expressed in KB.
constexpr int kMemoryUsageMinKb = 10;           // 10 KB
constexpr int kMemoryUsageMaxKb = 100'000'000;  // 100 GB
constexpr int kMemoryUsageBuckets = 100;
// Count of worker processes.
constexpr int kNumWorkerProcessMin = 0;
constexpr int kNumWorkerProcessMax = 1'000;
constexpr int kNumWorkerProcessBuckets = 100;

// chromeos_metrics::CumulativeMetrics constants: the backing directory and the
// name of the single cumulative stat tracked there.
constexpr char kCumulativeMetricsBackingDir[] = "/var/lib/ml_service/metrics";
constexpr char kPeakTotalMemoryCumulativeStatName[] = "peak_total_memory_kb";
Andrew Moylan | 40ee4fc | 2018-08-24 15:46:09 +1000 | [diff] [blame] | 50 | |
| 51 | constexpr base::TimeDelta kCumulativeMetricsUpdatePeriod = |
| 52 | base::TimeDelta::FromMinutes(5); |
| 53 | constexpr base::TimeDelta kCumulativeMetricsReportPeriod = |
| 54 | base::TimeDelta::FromDays(1); |
| 55 | |
| 56 | void RecordCumulativeMetrics( |
| 57 | MetricsLibrary* const metrics_library, |
| 58 | chromeos_metrics::CumulativeMetrics* const cumulative_metrics) { |
| 59 | metrics_library->SendToUMA( |
Honglin Yu | 1cd2507 | 2019-07-09 11:54:14 +1000 | [diff] [blame] | 60 | kPeakTotalMemoryMetricName, |
| 61 | cumulative_metrics->Get(kPeakTotalMemoryCumulativeStatName), |
Andrew Moylan | 40ee4fc | 2018-08-24 15:46:09 +1000 | [diff] [blame] | 62 | kMemoryUsageMinKb, kMemoryUsageMaxKb, kMemoryUsageBuckets); |
| 63 | } |
| 64 | |
Honglin Yu | 7b6c119 | 2020-09-16 10:07:17 +1000 | [diff] [blame] | 65 | // Returns true if getting the RAM of control process succeeds. Otherwise |
| 66 | // returns false in which case the value of `total_mem_usage` should be |
| 67 | // ignored. |
| 68 | // Here we ignore the return status of getting worker processes's RAM usage |
| 69 | // because there may be a case that the worker process has disappeared but it |
| 70 | // has not been removed from Process::GetWorkerPidInfoMap(). We do not want this |
| 71 | // to block the overall metric report. In the future, we may implement some |
| 72 | // dedicated metrics to report such cases. |
| 73 | bool GetControlAndWorkerProcessMemoryUsage(size_t* total_mem_usage) { |
| 74 | DCHECK(total_mem_usage != nullptr); |
| 75 | *total_mem_usage = 0; |
| 76 | MemoryUsage usage; |
| 77 | // Collect RAM usage for worker processes. |
| 78 | // Do not crash if `GetProcessMemoryUsage` fails for worker processes because |
| 79 | // maybe some worker process terminates before it is unregistered. |
| 80 | for (const auto& pid_info : Process::GetInstance()->GetWorkerPidInfoMap()) { |
| 81 | if (GetProcessMemoryUsage(&usage, pid_info.first)) { |
| 82 | *total_mem_usage += usage.VmRSSKb + usage.VmSwapKb; |
Honglin Yu | 2161669 | 2021-05-14 11:20:22 +1000 | [diff] [blame] | 83 | } else { |
| 84 | RecordProcessErrorEvent(ProcessError::kGetWorkerProcessMemoryUsageFailed); |
Honglin Yu | 7b6c119 | 2020-09-16 10:07:17 +1000 | [diff] [blame] | 85 | } |
| 86 | } |
| 87 | // Collect RAM usage for control processes. |
| 88 | if (GetProcessMemoryUsage(&usage)) { |
| 89 | *total_mem_usage += usage.VmRSSKb + usage.VmSwapKb; |
| 90 | return true; |
| 91 | } else { |
| 92 | return false; |
| 93 | } |
| 94 | } |
| 95 | |
Andrew Moylan | 40ee4fc | 2018-08-24 15:46:09 +1000 | [diff] [blame] | 96 | } // namespace |
| 97 | |
| 98 | Metrics::Metrics() |
Joel Kitching | 21beaba | 2019-03-11 15:47:30 +0800 | [diff] [blame] | 99 | : process_metrics_(base::ProcessMetrics::CreateCurrentProcessMetrics()) {} |
Andrew Moylan | 40ee4fc | 2018-08-24 15:46:09 +1000 | [diff] [blame] | 100 | |
| 101 | void Metrics::StartCollectingProcessMetrics() { |
| 102 | if (cumulative_metrics_) { |
| 103 | LOG(WARNING) << "Multiple calls to StartCollectingProcessMetrics"; |
| 104 | return; |
| 105 | } |
| 106 | |
Andrew Moylan | 79b34a4 | 2020-07-08 11:13:11 +1000 | [diff] [blame] | 107 | // Baseline the CPU usage counter in `process_metrics_` to be zero as of now. |
Qijiang Fan | 7cdb366 | 2019-10-21 16:39:20 +0900 | [diff] [blame] | 108 | process_metrics_->GetPlatformIndependentCPUUsage(); |
Andrew Moylan | 40ee4fc | 2018-08-24 15:46:09 +1000 | [diff] [blame] | 109 | |
| 110 | cumulative_metrics_ = std::make_unique<chromeos_metrics::CumulativeMetrics>( |
| 111 | base::FilePath(kCumulativeMetricsBackingDir), |
Honglin Yu | 1cd2507 | 2019-07-09 11:54:14 +1000 | [diff] [blame] | 112 | std::vector<std::string>{kPeakTotalMemoryCumulativeStatName}, |
Andrew Moylan | 40ee4fc | 2018-08-24 15:46:09 +1000 | [diff] [blame] | 113 | kCumulativeMetricsUpdatePeriod, |
| 114 | base::Bind(&Metrics::UpdateAndRecordMetrics, base::Unretained(this), |
| 115 | true /*record_current_metrics*/), |
| 116 | kCumulativeMetricsReportPeriod, |
| 117 | base::Bind(&RecordCumulativeMetrics, |
| 118 | base::Unretained(&metrics_library_))); |
| 119 | } |
| 120 | |
| 121 | void Metrics::UpdateCumulativeMetricsNow() { |
| 122 | if (!cumulative_metrics_) { |
| 123 | return; |
| 124 | } |
| 125 | UpdateAndRecordMetrics(false /*record_current_metrics*/, |
| 126 | cumulative_metrics_.get()); |
| 127 | } |
| 128 | |
| 129 | void Metrics::UpdateAndRecordMetrics( |
| 130 | const bool record_current_metrics, |
| 131 | chromeos_metrics::CumulativeMetrics* const cumulative_metrics) { |
Honglin Yu | 1cd2507 | 2019-07-09 11:54:14 +1000 | [diff] [blame] | 132 | size_t usage = 0; |
Honglin Yu | 7b6c119 | 2020-09-16 10:07:17 +1000 | [diff] [blame] | 133 | if (!GetControlAndWorkerProcessMemoryUsage(&usage)) { |
Honglin Yu | 1cd2507 | 2019-07-09 11:54:14 +1000 | [diff] [blame] | 134 | LOG(DFATAL) << "Getting process memory usage failed"; |
| 135 | return; |
| 136 | } |
Andrew Moylan | 40ee4fc | 2018-08-24 15:46:09 +1000 | [diff] [blame] | 137 | |
| 138 | // Update max memory stats. |
Honglin Yu | 1cd2507 | 2019-07-09 11:54:14 +1000 | [diff] [blame] | 139 | cumulative_metrics->Max(kPeakTotalMemoryCumulativeStatName, |
| 140 | static_cast<int64_t>(usage)); |
Andrew Moylan | 40ee4fc | 2018-08-24 15:46:09 +1000 | [diff] [blame] | 141 | |
| 142 | if (record_current_metrics) { |
| 143 | // Record CPU usage (units = milli-percent i.e. 0.001%): |
Honglin Yu | 7b6c119 | 2020-09-16 10:07:17 +1000 | [diff] [blame] | 144 | // First get the CPU usage of the control process. |
| 145 | auto cpu_usage = process_metrics_->GetPlatformIndependentCPUUsage(); |
| 146 | // Then get the CPU usages of the worker processes. |
| 147 | for (const auto& pid_info : Process::GetInstance()->GetWorkerPidInfoMap()) { |
| 148 | cpu_usage += |
| 149 | pid_info.second.process_metrics->GetPlatformIndependentCPUUsage(); |
| 150 | } |
| 151 | |
Tom Hughes | 1d1c192 | 2020-08-27 16:16:53 -0700 | [diff] [blame] | 152 | const int cpu_usage_milli_percent = static_cast<int>( |
Honglin Yu | 7b6c119 | 2020-09-16 10:07:17 +1000 | [diff] [blame] | 153 | 1000. * cpu_usage / base::SysInfo::NumberOfProcessors()); |
Andrew Moylan | 40ee4fc | 2018-08-24 15:46:09 +1000 | [diff] [blame] | 154 | metrics_library_.SendToUMA(kCpuUsageMetricName, cpu_usage_milli_percent, |
| 155 | kCpuUsageMinMilliPercent, |
| 156 | kCpuUsageMaxMilliPercent, kCpuUsageBuckets); |
| 157 | // Record memory usage: |
Tom Hughes | 1d1c192 | 2020-08-27 16:16:53 -0700 | [diff] [blame] | 158 | metrics_library_.SendToUMA(kTotalMemoryMetricName, usage, kMemoryUsageMinKb, |
| 159 | kMemoryUsageMaxKb, kMemoryUsageBuckets); |
Honglin Yu | 2161669 | 2021-05-14 11:20:22 +1000 | [diff] [blame] | 160 | |
| 161 | // Record how many worker processes. |
| 162 | metrics_library_.SendToUMA( |
| 163 | kNumWorkerProcessMetricName, |
| 164 | Process::GetInstance()->GetWorkerPidInfoMap().size(), |
| 165 | kNumWorkerProcessMin, kNumWorkerProcessMax, kNumWorkerProcessBuckets); |
Andrew Moylan | 40ee4fc | 2018-08-24 15:46:09 +1000 | [diff] [blame] | 166 | } |
| 167 | } |
| 168 | |
| 169 | void Metrics::RecordMojoConnectionEvent(const MojoConnectionEvent event) { |
Honglin Yu | ca0caf8 | 2020-01-23 12:26:03 +1100 | [diff] [blame] | 170 | metrics_library_.SendEnumToUMA( |
| 171 | kMojoConnectionEventMetricName, static_cast<int>(event), |
| 172 | static_cast<int>(MojoConnectionEvent::kMaxValue) + 1); |
Andrew Moylan | 40ee4fc | 2018-08-24 15:46:09 +1000 | [diff] [blame] | 173 | } |
| 174 | |
| 175 | } // namespace ml |