blob: 34d231c63ed5f125483f728ff74166f62107c67a [file] [log] [blame]
# Copyright 2017 The ChromiumOS Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
4
5"""Process metrics."""
6
7from __future__ import absolute_import
Allen Li51bb6122017-06-21 12:04:13 -07008
Allen Li3992c662018-01-05 15:26:36 -08009from functools import partial
Chris McDonald59650c32021-07-20 15:29:28 -060010import logging
Allen Li3992c662018-01-05 15:26:36 -080011
Mike Frysingercb56b642019-08-25 15:33:08 -040012import psutil # pylint: disable=import-error
Allen Li51bb6122017-06-21 12:04:13 -070013
Allen Lia9c6e802017-07-11 15:42:47 -070014from chromite.lib import metrics
Allen Li51bb6122017-06-21 12:04:13 -070015
Chris McDonald59650c32021-07-20 15:29:28 -060016
logger = logging.getLogger(__name__)

# Gauges: a value that is set afresh on every collection.
_count_metric = metrics.GaugeMetric(
    "proc/count",
    description="Number of processes currently running.",
)
_thread_count_metric = metrics.GaugeMetric(
    "proc/thread_count",
    description="Number of threads currently running.",
)
_cpu_percent_metric = metrics.GaugeMetric(
    "proc/cpu_percent",
    description="CPU usage percent of processes.",
)

# Accumulating metrics: each collection adds the delta since the last one.
_cpu_times_metric = metrics.CumulativeMetric(
    "proc/cpu_times",
    description="Accumulated CPU time in each specific mode of processes.",
)

# Read side of the per-process I/O counters.
_read_count_metric = metrics.CounterMetric(
    "proc/read/count",
    description="Accumulated read operation count of processes.",
)
_read_bytes_metric = metrics.CounterMetric(
    "proc/read/bytes",
    description="Accumulated read bytes of processes.",
)
_read_chars_metric = metrics.CounterMetric(
    "proc/read/chars",
    description="Accumulated buffered read bytes of processes.",
)

# Write side of the per-process I/O counters.
_write_count_metric = metrics.CounterMetric(
    "proc/write/count",
    description="Accumulated write operation count of processes.",
)
_write_bytes_metric = metrics.CounterMetric(
    "proc/write/bytes",
    description="Accumulated write bytes of processes.",
)
_write_chars_metric = metrics.CounterMetric(
    "proc/write/chars",
    description="Accumulated buffered write bytes of processes.",
)
Allen Li51bb6122017-06-21 12:04:13 -070054
55
def collect_proc_info():
    """Collect and send metrics for all currently running processes.

    A new collector is created on every call and asked to run one
    collection pass.
    """
    _ProcessMetricsCollector().collect()
Allen Li6bb74d52017-06-22 14:44:53 -070059
60
class _ProcessMetricsCollector(object):
    """Class for collecting process metrics."""

    # Per-process samples from the previous run, keyed by PID.  They live on
    # the class rather than the instance so that the next run (which may use
    # a different collector instance) can calculate the delta against them
    # and aggregate the result.
    old_cpu_times = {}
    old_io_counters = {}

    def __init__(self):
        """Build the ordered list of metric buckets.

        Order matters: a process is counted by the first bucket whose test
        function accepts it, and by the "other" bucket if none does.
        """

        def by_name(stream, process_name=None):
            # Bucket matching on the exact process name; the metric stream
            # is named after the process unless a different name is given.
            return _ProcessMetric(
                stream,
                test_func=partial(_is_process_name, process_name or stream),
            )

        self._metrics = [
            by_name("adb"),
            _ProcessMetric("autoserv", test_func=_is_parent_autoserv),
            by_name("bbagent"),
            by_name("cache-downloader", "downloader"),
            by_name("cipd"),
            by_name("cloudtail"),
            by_name("common-tls"),
            by_name("curl"),
            by_name("dnsmasq"),
            by_name("drone-agent"),
            by_name("fleet-tlw"),
            by_name("getty"),
            by_name("gs_offloader", "gs_offloader.py"),
            _ProcessMetric("gsutil", test_func=_is_gsutil),
            by_name("java"),
            _ProcessMetric("k8s_system", test_func=_is_k8s_system),
            by_name("labservice"),
            by_name("lxc-attach"),
            by_name("lxc-start"),
            _ProcessMetric(
                "podman-pull", test_func=partial(_is_podman, "pull")
            ),
            _ProcessMetric("podman-run", test_func=partial(_is_podman, "run")),
            by_name("phosphorus"),
            _ProcessMetric("recipe", test_func=_is_recipe),
            by_name("sshd"),
            _ProcessMetric("swarming_bot", test_func=_is_swarming_bot),
            _ProcessMetric(
                "swarming_sub_task", test_func=_is_swarming_sub_task
            ),
            _ProcessMetric(
                "sysmon",
                test_func=partial(_is_python_module, "chromite.scripts.sysmon"),
            ),
            _ProcessMetric("tko_proxy", test_func=_is_tko_proxy),
        ]
        self._other_metric = _ProcessMetric("other")

    def collect(self):
        """Sample every running process, send metrics, and store baselines.

        A process can exit between being listed and being queried, in which
        case psutil raises NoSuchProcess.  Such a process is skipped so that
        a single short-lived process does not abort the whole collection.
        Other psutil errors (e.g. AccessDenied) still propagate.
        """
        new_cpu_times = {}
        new_io_counters = {}
        for proc in psutil.process_iter():
            try:
                cpu_times = proc.cpu_times()
                io_counters = proc.io_counters()
                self._collect_proc(proc)
            except psutil.NoSuchProcess:
                # Whatever was already accumulated for this process stays in
                # the aggregate; it just gets no baseline for the next run.
                logger.debug(
                    "Process %s exited during collection; skipping.",
                    proc.pid,
                )
                continue
            new_cpu_times[proc.pid] = cpu_times
            new_io_counters[proc.pid] = io_counters
        self._flush()
        # Only replace the baselines once this run's metrics have been sent.
        _ProcessMetricsCollector.old_cpu_times = new_cpu_times
        _ProcessMetricsCollector.old_io_counters = new_io_counters

    def _collect_proc(self, proc):
        """Count proc in the first matching bucket, or in "other"."""
        for metric in self._metrics:
            if metric.add(proc):
                break
        else:
            # No specific bucket claimed the process.
            self._other_metric.add(proc)

    def _flush(self):
        """Send and reset every bucket, including "other"."""
        for metric in self._metrics:
            metric.flush()
        self._other_metric.flush()
Allen Li6bb74d52017-06-22 14:44:53 -0700160
161
class _ProcessMetric(object):
    """Aggregates metrics over the processes accepted by one test function."""

    def __init__(self, process_name, test_func=lambda proc: True):
        """Initialize instance.

        Args:
            process_name: Identifies the metric stream; sent as the
                "process_name" field with every metric.
            test_func: Called with each process.  The process is counted
                when it returns True.  The default counts every process.
        """
        self._fields = {"process_name": process_name}
        self._test_func = test_func
        # Running totals for the current collection pass.
        self._count = 0
        self._thread_count = 0
        self._cpu_percent = 0
        self._cpu_times = _CPUTimes()
        self._io_counters = _IOCounters()

    def add(self, proc):
        """Do metric collection for the given process.

        Returns:
            True if the process was collected, False if the test function
            rejected it.
        """
        accepted = self._test_func(proc)
        if not accepted:
            return False

        self._count += 1
        self._thread_count += proc.num_threads()
        self._cpu_percent += proc.cpu_percent()

        # CPU times and I/O counters are reported as the change since the
        # previous run; a PID without a previous sample contributes its
        # full current value.
        cpu_now = _CPUTimes(proc.cpu_times())
        cpu_before = _ProcessMetricsCollector.old_cpu_times.get(proc.pid)
        self._cpu_times += cpu_now - cpu_before

        io_now = _IOCounters(proc.io_counters())
        io_before = _ProcessMetricsCollector.old_io_counters.get(proc.pid)
        self._io_counters += io_now - io_before

        return True

    def flush(self):
        """Finish collection and send metrics."""
        fields = self._fields

        _count_metric.set(self._count, fields=fields)
        self._count = 0

        _thread_count_metric.set(self._thread_count, fields=fields)
        self._thread_count = 0

        _cpu_percent_metric.set(int(round(self._cpu_percent)), fields=fields)
        self._cpu_percent = 0

        # One increment per CPU mode, tagged with an extra "mode" field.
        for mode, seconds in self._cpu_times.asdict().items():
            _cpu_times_metric.increment_by(
                seconds, fields={**fields, "mode": mode}
            )
        self._cpu_times = _CPUTimes()

        io = self._io_counters
        for metric, amount in (
            (_read_count_metric, io.read_count),
            (_read_bytes_metric, io.read_bytes),
            (_read_chars_metric, io.read_chars),
            (_write_count_metric, io.write_count),
            (_write_bytes_metric, io.write_bytes),
            (_write_chars_metric, io.write_chars),
        ):
            metric.increment_by(amount, fields=fields)
        self._io_counters = _IOCounters()
243
Congbin Guo18b4ed72023-02-11 19:16:16 -0800244
245class _CPUTimes(object):
246 """A container for CPU times metrics."""
247
248 def __init__(self, v=None):
249 self.system = v.system if v else 0
250 self.user = v.user if v else 0
251 self.iowait = v.iowait if v else 0
252 self.children_system = v.children_system if v else 0
253 self.children_user = v.children_user if v else 0
254
255 def __sub__(self, rhs):
256 if not rhs:
257 return self
258
259 r = _CPUTimes()
260 r.system = self.system - rhs.system
261 r.user = self.user - rhs.user
262 r.iowait = self.iowait - rhs.iowait
263 r.children_system = self.children_system - rhs.children_system
264 r.children_user = self.children_user - rhs.children_user
265 return r
266
267 def __iadd__(self, rhs):
268 if not rhs:
269 return self
270
271 self.system += rhs.system
272 self.user += rhs.user
273 self.iowait += rhs.iowait
274 self.children_system += rhs.children_system
275 self.children_user += rhs.children_user
276 return self
277
278 def asdict(self):
279 return {
280 "system": self.system,
281 "user": self.user,
282 "iowait": self.iowait,
283 "children_system": self.children_system,
284 "children_user": self.children_user,
285 }
286
Allen Li51bb6122017-06-21 12:04:13 -0700287
def _is_parent_autoserv(proc):
    """Return whether proc is a parent (not forked) autoserv process.

    An autoserv process counts as a parent when its own parent process is
    not autoserv, or when it has no known parent at all.
    """
    if not _is_autoserv(proc):
        return False
    parent = proc.parent()
    # psutil returns None when the parent is unknown; there is then no
    # autoserv parent, so avoid calling into _is_autoserv with None.
    return parent is None or not _is_autoserv(parent)
Allen Li51bb6122017-06-21 12:04:13 -0700291
292
293def _is_autoserv(proc):
Alex Klein1699fab2022-09-08 08:46:06 -0600294 """Return whether proc is an autoserv process."""
295 # This relies on the autoserv script being run directly. The script should
296 # be named autoserv exactly and start with a shebang that is /usr/bin/python,
297 # NOT /bin/env
298 return _is_process_name("autoserv", proc)
Allen Li51bb6122017-06-21 12:04:13 -0700299
300
Allen Li3992c662018-01-05 15:26:36 -0800301def _is_python_module(module, proc):
Alex Klein1699fab2022-09-08 08:46:06 -0600302 """Return whether proc is a process running a Python module."""
303 cmdline = proc.cmdline()
304 return (
305 cmdline
306 and cmdline[0].endswith("python")
307 and cmdline[1:3] == ["-m", module]
308 )
Allen Li3992c662018-01-05 15:26:36 -0800309
310
Prathmesh Prabhu0b795f02018-05-07 13:12:37 -0700311def _is_process_name(name, proc):
Alex Klein1699fab2022-09-08 08:46:06 -0600312 """Return whether process proc is named name."""
313 return proc.name() == name
Congbin Guo17542e02022-06-29 13:48:15 -0700314
315
Congbin Guo4ccf0632023-02-12 00:01:14 -0800316def _is_recipe(proc):
317 """Return whether proc is a recipe process.
318
319 An example proc is like
320 '/home/.../bin/python -u -s
321 /home/.../kitchen-checkout/recipe_engine/recipe_engine/main.py ...'.
322 """
323 cmdline = proc.cmdline()
324 return (
325 len(cmdline) >= 4
326 and cmdline[0].endswith("/python")
327 and cmdline[3].endswith("/recipe_engine/main.py")
328 )
329
330
Congbin Guo17542e02022-06-29 13:48:15 -0700331def _is_swarming_bot(proc):
Alex Klein1699fab2022-09-08 08:46:06 -0600332 """Return whether proc is a Swarming bot.
Congbin Guo17542e02022-06-29 13:48:15 -0700333
Alex Klein1699fab2022-09-08 08:46:06 -0600334 A swarming bot process is like '/usr/bin/python3.8 <bot-zip-path> start_bot'.
335 """
336 cmdline = proc.cmdline()
337 return (
338 len(cmdline) == 3
339 and cmdline[0].split("/")[-1].startswith("python")
340 and cmdline[2] == "start_bot"
341 )
Congbin Guo17542e02022-06-29 13:48:15 -0700342
343
Congbin Guo4ccf0632023-02-12 00:01:14 -0800344def _is_swarming_sub_task(proc):
345 """Return whether proc is a Swarming bot sub task.
346
347 An example Swarming sub task:
348 /usr/bin/python3.8 -u /.../swarming_bot.2.zip run_isolated ...
349 """
350 cmdline = proc.cmdline()
351 return (
352 len(cmdline) >= 4
353 and cmdline[0].split("/")[-1].startswith("python")
354 and cmdline[2].split("/")[-1].startswith("swarming_bot.")
355 )
356
357
Congbin Guo17542e02022-06-29 13:48:15 -0700358def _is_gsutil(proc):
Alex Klein1699fab2022-09-08 08:46:06 -0600359 """Return whether proc is gsutil."""
360 cmdline = proc.cmdline()
361 return (
362 len(cmdline) >= 2
363 and cmdline[0] == "python"
364 and cmdline[1].endswith("gsutil")
365 )
Congbin Guo522cd982022-10-06 11:47:28 -0700366
367
Congbin Guo4ccf0632023-02-12 00:01:14 -0800368def _is_k8s_system(proc):
369 """Return whether proc is a k8s system process."""
370 return proc.name() in ("kubelet", "kube-proxy")
371
372
Congbin Guo522cd982022-10-06 11:47:28 -0700373def _is_tko_proxy(proc):
374 """Return whether proc is a tko proxy.
375
376 A tk proxy process is like
377 '/opt/cloud_sql_proxy -dir=<...>
378 -instances=google.com:chromeos-lab:us-central1:tko
379 -credential_file=<...>'.
380 """
381 cmdline = proc.cmdline()
382 return (
383 len(cmdline) == 4
Congbin Guofcb436b2023-01-23 20:36:01 -0800384 and cmdline[0].split("/")[-1] == "cloud_sql_proxy"
385 and cmdline[2] == "-instances=google.com:chromeos-lab:us-central1:tko"
Congbin Guo522cd982022-10-06 11:47:28 -0700386 )
Congbin Guoa8432502023-01-23 20:31:01 -0800387
388
389def _is_podman(subcmd, proc):
390 """Return whiter proc is a podman process.
391
392 A podman pull process is like
393 'podman pull image:tag'
394 A podman run process is like
395 'podman run --option ... image:tag'
396 """
397 cmdline = proc.cmdline()
Congbin Guoba85d0b2023-01-27 18:32:21 -0800398 return proc.name() == "podman" and len(cmdline) > 1 and cmdline[1] == subcmd
Congbin Guocf2750c2023-02-11 21:25:16 -0800399
400
401class _CPUTimes(object):
402 """A container for CPU times metrics."""
403
404 def __init__(self, v=None):
405 self.system = v.system if v else 0
406 self.user = v.user if v else 0
407 self.iowait = v.iowait if v else 0
408 self.children_system = v.children_system if v else 0
409 self.children_user = v.children_user if v else 0
410
411 def __sub__(self, rhs):
412 if not rhs:
413 return self
414
415 r = _CPUTimes()
416 r.system = self.system - rhs.system
417 r.user = self.user - rhs.user
418 r.iowait = self.iowait - rhs.iowait
419 r.children_system = self.children_system - rhs.children_system
420 r.children_user = self.children_user - rhs.children_user
421 return r
422
423 def __iadd__(self, rhs):
424 if not rhs:
425 return self
426
427 self.system += rhs.system
428 self.user += rhs.user
429 self.iowait += rhs.iowait
430 self.children_system += rhs.children_system
431 self.children_user += rhs.children_user
432 return self
433
434 def asdict(self):
435 return {
436 "system": self.system,
437 "user": self.user,
438 "iowait": self.iowait,
439 "children_system": self.children_system,
440 "children_user": self.children_user,
441 }
442
443
444class _IOCounters(object):
445 """A container for I/O counter metrics."""
446
447 def __init__(self, v=None):
448 self.read_count = v.read_count if v else 0
449 self.read_bytes = v.read_bytes if v else 0
450 self.read_chars = v.read_chars if v else 0
451 self.write_count = v.write_count if v else 0
452 self.write_bytes = v.write_bytes if v else 0
453 self.write_chars = v.write_chars if v else 0
454
455 def __sub__(self, rhs):
456 if not rhs:
457 return self
458
459 r = _IOCounters()
460 r.read_count = self.read_count - rhs.read_count
461 r.read_bytes = self.read_bytes - rhs.read_bytes
462 r.read_chars = self.read_chars - rhs.read_chars
463 r.write_count = self.write_count - rhs.write_count
464 r.write_bytes = self.write_bytes - rhs.write_bytes
465 r.write_chars = self.write_chars - rhs.write_chars
466 return r
467
468 def __iadd__(self, rhs):
469 if not rhs:
470 return self
471
472 self.read_count += rhs.read_count
473 self.read_bytes += rhs.read_bytes
474 self.write_count += rhs.write_count
475 self.write_bytes += rhs.write_bytes
476 return self