# Copyright 2017 The ChromiumOS Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
4
5"""Process metrics."""
6
7from __future__ import absolute_import
Allen Li51bb6122017-06-21 12:04:13 -07008
Allen Li3992c662018-01-05 15:26:36 -08009from functools import partial
Chris McDonald59650c32021-07-20 15:29:28 -060010import logging
Allen Li3992c662018-01-05 15:26:36 -080011
Mike Frysingercb56b642019-08-25 15:33:08 -040012import psutil # pylint: disable=import-error
Allen Li51bb6122017-06-21 12:04:13 -070013
Allen Lia9c6e802017-07-11 15:42:47 -070014from chromite.lib import metrics
Allen Li51bb6122017-06-21 12:04:13 -070015
Chris McDonald59650c32021-07-20 15:29:28 -060016
logger = logging.getLogger(__name__)

# Gauges: overwritten with a point-in-time value on every flush.
_count_metric = metrics.GaugeMetric(
    "proc/count",
    description="Number of processes currently running.",
)
_thread_count_metric = metrics.GaugeMetric(
    "proc/thread_count",
    description="Number of threads currently running.",
)
_cpu_percent_metric = metrics.GaugeMetric(
    "proc/cpu_percent",
    description="CPU usage percent of processes.",
)

# Cumulative: incremented on every flush by the CPU time spent since the
# previous run, once per CPU mode.
_cpu_times_metric = metrics.CumulativeMetric(
    "proc/cpu_times",
    description="Accumulated CPU time in each specific mode of processes.",
)
Allen Li51bb6122017-06-21 12:04:13 -070032
33
def collect_proc_info():
    """Take one snapshot of per-process metrics and report it.

    A fresh collector is created on every call; it walks the processes
    yielded by psutil, aggregates them and flushes the resulting values.
    """
    _ProcessMetricsCollector().collect()
Allen Li6bb74d52017-06-22 14:44:53 -070037
38
class _ProcessMetricsCollector(object):
    """Sorts running processes into named buckets and reports each bucket."""

    # Per-process CPU times recorded by the previous run, keyed by PID. They
    # are kept on the class (not the instance) so they outlive individual
    # collectors; the next run needs them to calculate the delta before
    # aggregating.
    old_cpu_times = {}

    def __init__(self):
        def by_name(name):
            """Make a bucket labelled with the process name it matches."""
            return _ProcessMetric(
                name, test_func=partial(_is_process_name, name)
            )

        # Order matters: a process is counted by the first bucket that
        # accepts it.
        self._metrics = [
            by_name("adb"),
            _ProcessMetric("autoserv", test_func=_is_parent_autoserv),
            by_name("bbagent"),
            _ProcessMetric(
                "cache-downloader",
                test_func=partial(_is_process_name, "downloader"),
            ),
            by_name("cipd"),
            by_name("cloudtail"),
            by_name("common-tls"),
            by_name("curl"),
            by_name("dnsmasq"),
            by_name("drone-agent"),
            by_name("fleet-tlw"),
            by_name("getty"),
            _ProcessMetric(
                "gs_offloader",
                test_func=partial(_is_process_name, "gs_offloader.py"),
            ),
            _ProcessMetric("gsutil", test_func=_is_gsutil),
            by_name("java"),
            _ProcessMetric("k8s_system", test_func=_is_k8s_system),
            by_name("labservice"),
            by_name("lxc-attach"),
            by_name("lxc-start"),
            _ProcessMetric(
                "podman-pull", test_func=partial(_is_podman, "pull")
            ),
            _ProcessMetric("podman-run", test_func=partial(_is_podman, "run")),
            by_name("phosphorus"),
            _ProcessMetric("recipe", test_func=_is_recipe),
            by_name("sshd"),
            _ProcessMetric("swarming_bot", test_func=_is_swarming_bot),
            _ProcessMetric(
                "swarming_sub_task", test_func=_is_swarming_sub_task
            ),
            _ProcessMetric(
                "sysmon",
                test_func=partial(_is_python_module, "chromite.scripts.sysmon"),
            ),
            _ProcessMetric("tko_proxy", test_func=_is_tko_proxy),
        ]
        # Catch-all bucket for processes no other bucket accepted.
        self._other_metric = _ProcessMetric("other")

    def collect(self):
        """Scan all processes, report the buckets, remember the CPU times."""
        snapshot = {}
        for proc in psutil.process_iter():
            snapshot[proc.pid] = proc.cpu_times()
            self._collect_proc(proc)
        self._flush()
        # Only replaced after flushing so that every add() above computed
        # its delta against the previous run's values.
        _ProcessMetricsCollector.old_cpu_times = snapshot

    def _collect_proc(self, proc):
        """Count proc in the first matching bucket, or in "other"."""
        if not any(bucket.add(proc) for bucket in self._metrics):
            self._other_metric.add(proc)

    def _flush(self):
        """Send the values of every bucket, the catch-all one last."""
        for bucket in [*self._metrics, self._other_metric]:
            bucket.flush()
Allen Li6bb74d52017-06-22 14:44:53 -0700134
135
class _ProcessMetric(object):
    """Class for gathering process metrics.

    Values are accumulated over every process accepted by add() and are
    reported, then reset, by flush().
    """

    def __init__(self, process_name, test_func=lambda proc: True):
        """Initialize instance.

        Args:
            process_name: Value of the "process_name" field, used to identify
                the metric stream.
            test_func: Function called for each process. If it returns True,
                the process is counted. The default test is to count every
                process.
        """
        self._fields = {
            "process_name": process_name,
        }
        self._test_func = test_func
        self._count = 0
        self._thread_count = 0
        self._cpu_percent = 0
        self._cpu_times = _CPUTimes()

    def add(self, proc):
        """Do metric collection for the given process.

        Args:
            proc: The process to inspect; it must provide pid, num_threads(),
                cpu_percent() and cpu_times(), plus whatever the test
                function reads.

        Returns:
            True if the process was collected, False if the test function
            rejected it.
        """
        if not self._test_func(proc):
            return False
        self._count += 1
        self._thread_count += proc.num_threads()
        self._cpu_percent += proc.cpu_percent()

        current = _CPUTimes(proc.cpu_times())
        # A PID that was not seen on the previous run yields None here, in
        # which case the subtraction leaves the current values untouched.
        previous = _ProcessMetricsCollector.old_cpu_times.get(proc.pid)
        delta = current - previous
        if any(t < 0 for t in delta.asdict().values()):
            # The counters went backwards, so this PID now belongs to a
            # different process than on the previous run (PID reuse). Treat
            # it like any other new process; a cumulative metric must never
            # be fed a negative increment.
            delta = current
        self._cpu_times += delta

        return True

    def flush(self):
        """Finish collection and send metrics.

        Every accumulator is reset after being reported so the instance can
        be reused for another collection round.
        """
        _count_metric.set(self._count, fields=self._fields)
        self._count = 0

        _thread_count_metric.set(self._thread_count, fields=self._fields)
        self._thread_count = 0

        _cpu_percent_metric.set(
            int(round(self._cpu_percent)), fields=self._fields
        )
        self._cpu_percent = 0

        # One cumulative stream per CPU mode, told apart by the "mode" field.
        for mode, t in self._cpu_times.asdict().items():
            _cpu_times_metric.increment_by(
                t, fields={**self._fields, "mode": mode}
            )
        self._cpu_times = _CPUTimes()
192
193
class _CPUTimes(object):
    """A container for CPU times metrics.

    Holds the system, user, iowait, children_system and children_user
    counters. Instances can be built from, subtracted by and incremented by
    any object exposing those attributes, e.g. another _CPUTimes or the value
    returned by psutil's Process.cpu_times().
    """

    def __init__(self, v=None):
        """Copy the counters from v, or start from zero when v is falsy.

        Args:
            v: Object providing the counters as attributes, or None.
        """
        self.system = v.system if v else 0
        self.user = v.user if v else 0
        # psutil only reports iowait on Linux; default to 0 elsewhere rather
        # than failing with AttributeError.
        self.iowait = getattr(v, "iowait", 0) if v else 0
        self.children_system = v.children_system if v else 0
        self.children_user = v.children_user if v else 0

    def __repr__(self):
        return "%s(%r)" % (type(self).__name__, self.asdict())

    def __sub__(self, rhs):
        """Return a new _CPUTimes holding self - rhs, field by field.

        A falsy rhs (e.g. None for a process with no previous sample) is
        treated as all zeros.
        """
        if not rhs:
            # Return a copy so the result never aliases the operand.
            return _CPUTimes(self)

        # Normalize raw psutil values so a missing iowait reads as 0.
        other = rhs if isinstance(rhs, _CPUTimes) else _CPUTimes(rhs)
        r = _CPUTimes()
        r.system = self.system - other.system
        r.user = self.user - other.user
        r.iowait = self.iowait - other.iowait
        r.children_system = self.children_system - other.children_system
        r.children_user = self.children_user - other.children_user
        return r

    def __iadd__(self, rhs):
        """Add rhs to self in place, field by field; a falsy rhs is a no-op."""
        if not rhs:
            return self

        # Normalize raw psutil values so a missing iowait reads as 0.
        other = rhs if isinstance(rhs, _CPUTimes) else _CPUTimes(rhs)
        self.system += other.system
        self.user += other.user
        self.iowait += other.iowait
        self.children_system += other.children_system
        self.children_user += other.children_user
        return self

    def asdict(self):
        """Return the counters as a dict keyed by CPU mode name."""
        return {
            "system": self.system,
            "user": self.user,
            "iowait": self.iowait,
            "children_system": self.children_system,
            "children_user": self.children_user,
        }
235
Allen Li51bb6122017-06-21 12:04:13 -0700236
def _is_parent_autoserv(proc):
    """Return whether proc is a parent (not forked) autoserv process.

    An autoserv process counts as a parent when the process that spawned it
    is not itself autoserv, or when it has no known parent at all.
    """
    if not _is_autoserv(proc):
        return False
    parent = proc.parent()
    # psutil returns None when no parent process is known; without this
    # check the name lookup on None would raise AttributeError.
    return parent is None or not _is_autoserv(parent)
Allen Li51bb6122017-06-21 12:04:13 -0700240
241
def _is_autoserv(proc):
    """Return whether proc is an autoserv process."""
    # Matching is done on the process name only, so it relies on the autoserv
    # script being run directly: the script should be named autoserv exactly
    # and start with a shebang that is /usr/bin/python, NOT /bin/env.
    return _is_process_name(name="autoserv", proc=proc)
Allen Li51bb6122017-06-21 12:04:13 -0700248
249
def _is_python_module(module, proc):
    """Return whether proc is a process running a Python module.

    Args:
        module: Dotted module name expected right after the "-m" flag.
        proc: Process providing cmdline().

    Returns:
        True if the command line is '<...python> -m <module> ...', False
        otherwise (including when the command line is empty).
    """
    cmdline = proc.cmdline()
    # bool() guarantees a real boolean; a bare `and` chain would return the
    # empty list itself when the command line is empty.
    return bool(
        cmdline
        and cmdline[0].endswith("python")
        and cmdline[1:3] == ["-m", module]
    )
Allen Li3992c662018-01-05 15:26:36 -0800258
259
def _is_process_name(name, proc):
    """Return whether the name reported by proc equals name exactly."""
    return name == proc.name()
Congbin Guo17542e02022-06-29 13:48:15 -0700263
264
def _is_recipe(proc):
    """Return whether proc is a recipe process.

    A recipe process looks like
    '/home/.../bin/python -u -s
    /home/.../kitchen-checkout/recipe_engine/recipe_engine/main.py ...',
    i.e. a python interpreter whose fourth argument is the recipe engine
    entry point.
    """
    argv = proc.cmdline()
    if len(argv) < 4:
        return False
    interpreter, script = argv[0], argv[3]
    return interpreter.endswith("/python") and script.endswith(
        "/recipe_engine/main.py"
    )
278
279
def _is_swarming_bot(proc):
    """Return whether proc is a Swarming bot.

    The command line of a Swarming bot has exactly three elements, like
    '/usr/bin/python3.8 <bot-zip-path> start_bot'.
    """
    argv = proc.cmdline()
    if len(argv) != 3:
        return False
    # Only the file name of the interpreter matters, not its directory.
    interpreter = argv[0].rpartition("/")[2]
    return interpreter.startswith("python") and argv[2] == "start_bot"
Congbin Guo17542e02022-06-29 13:48:15 -0700291
292
def _is_swarming_sub_task(proc):
    """Return whether proc is a Swarming bot sub task.

    A sub task is a python interpreter whose third argument is a
    'swarming_bot.*' file, followed by at least one more argument, e.g.:
        /usr/bin/python3.8 -u /.../swarming_bot.2.zip run_isolated ...
    """
    argv = proc.cmdline()
    if len(argv) < 4:
        return False
    # Compare file names only; the directories they live in are irrelevant.
    interpreter = argv[0].rpartition("/")[2]
    bot_archive = argv[2].rpartition("/")[2]
    return interpreter.startswith("python") and bot_archive.startswith(
        "swarming_bot."
    )
305
306
def _is_gsutil(proc):
    """Return whether proc is gsutil.

    That is, a command line whose first element is exactly 'python' and whose
    second element ends with 'gsutil'.
    """
    argv = proc.cmdline()
    if len(argv) < 2:
        return False
    return argv[0] == "python" and argv[1].endswith("gsutil")
Congbin Guo522cd982022-10-06 11:47:28 -0700315
316
def _is_k8s_system(proc):
    """Return whether proc is named like a k8s system process.

    Both 'kubelet' and 'kube-proxy' qualify.
    """
    return proc.name() in {"kubelet", "kube-proxy"}
320
321
def _is_tko_proxy(proc):
    """Return whether proc is a tko proxy.

    A tko proxy process has exactly four command line elements, like
    '/opt/cloud_sql_proxy -dir=<...>
    -instances=google.com:chromeos-lab:us-central1:tko
    -credential_file=<...>'.
    """
    argv = proc.cmdline()
    if len(argv) != 4:
        return False
    # Only the file name of the binary matters, not its directory.
    binary = argv[0].rpartition("/")[2]
    return (
        binary == "cloud_sql_proxy"
        and argv[2] == "-instances=google.com:chromeos-lab:us-central1:tko"
    )
Congbin Guoa8432502023-01-23 20:31:01 -0800336
337
def _is_podman(subcmd, proc):
    """Return whether proc is a podman process running the given subcommand.

    A podman pull process is like
    'podman pull image:tag'
    A podman run process is like
    'podman run --option ... image:tag'
    """
    argv = proc.cmdline()
    if proc.name() != "podman":
        return False
    # The subcommand is expected right after the program name.
    return len(argv) > 1 and argv[1] == subcmd
Congbin Guoba85d0b2023-01-27 18:32:21 -0800347 return proc.name() == "podman" and len(cmdline) > 1 and cmdline[1] == subcmd