blob: 32e12d08a6c88fe5dfa66f74fa914b5656c27138 [file] [log] [blame]
Mike Frysingerf1ba7ad2022-09-12 05:42:57 -04001# Copyright 2017 The ChromiumOS Authors
Allen Li51bb6122017-06-21 12:04:13 -07002# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5"""Process metrics."""
6
7from __future__ import absolute_import
Allen Li51bb6122017-06-21 12:04:13 -07008
Allen Li3992c662018-01-05 15:26:36 -08009from functools import partial
Chris McDonald59650c32021-07-20 15:29:28 -060010import logging
Allen Li3992c662018-01-05 15:26:36 -080011
Mike Frysingercb56b642019-08-25 15:33:08 -040012import psutil # pylint: disable=import-error
Allen Li51bb6122017-06-21 12:04:13 -070013
Allen Lia9c6e802017-07-11 15:42:47 -070014from chromite.lib import metrics
Allen Li51bb6122017-06-21 12:04:13 -070015
Chris McDonald59650c32021-07-20 15:29:28 -060016
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Gauges reported by _ProcessMetric.flush(); every data point carries a
# "process_name" field naming the process group it belongs to.
_count_metric = metrics.GaugeMetric(
    "proc/count",
    description="Number of processes currently running.",
)
_thread_count_metric = metrics.GaugeMetric(
    "proc/thread_count",
    description="Number of threads currently running.",
)
_cpu_percent_metric = metrics.GaugeMetric(
    "proc/cpu_percent",
    description="CPU usage percent of processes.",
)
Allen Li51bb6122017-06-21 12:04:13 -070028
29
def collect_proc_info():
    """Collect and report process metrics for the running processes.

    A fresh collector is created on every call, so no counters carry over
    from one invocation to the next.
    """
    _ProcessMetricsCollector().collect()
Allen Li6bb74d52017-06-22 14:44:53 -070033
34
35class _ProcessMetricsCollector(object):
Alex Klein1699fab2022-09-08 08:46:06 -060036 """Class for collecting process metrics."""
Allen Li6bb74d52017-06-22 14:44:53 -070037
Alex Klein1699fab2022-09-08 08:46:06 -060038 def __init__(self):
39 self._metrics = [
Congbin Guo4ccf0632023-02-12 00:01:14 -080040 _ProcessMetric("adb", test_func=partial(_is_process_name, "adb")),
Alex Klein1699fab2022-09-08 08:46:06 -060041 _ProcessMetric("autoserv", test_func=_is_parent_autoserv),
Congbin Guo522cd982022-10-06 11:47:28 -070042 _ProcessMetric(
Congbin Guo4ccf0632023-02-12 00:01:14 -080043 "bbagent", test_func=partial(_is_process_name, "bbagent")
44 ),
45 _ProcessMetric(
Congbin Guo3cdc11e2022-10-11 16:02:32 -070046 "cache-downloader",
Congbin Guofcb436b2023-01-23 20:36:01 -080047 test_func=partial(_is_process_name, "downloader"),
Congbin Guo3cdc11e2022-10-11 16:02:32 -070048 ),
Congbin Guoa8432502023-01-23 20:31:01 -080049 _ProcessMetric("cipd", test_func=partial(_is_process_name, "cipd")),
Congbin Guo3cdc11e2022-10-11 16:02:32 -070050 _ProcessMetric(
Congbin Guo4ccf0632023-02-12 00:01:14 -080051 "cloudtail", test_func=partial(_is_process_name, "cloudtail")
52 ),
53 _ProcessMetric(
Congbin Guo522cd982022-10-06 11:47:28 -070054 "common-tls", test_func=partial(_is_process_name, "common-tls")
55 ),
Alex Klein1699fab2022-09-08 08:46:06 -060056 _ProcessMetric("curl", test_func=partial(_is_process_name, "curl")),
57 _ProcessMetric(
Congbin Guo522cd982022-10-06 11:47:28 -070058 "dnsmasq", test_func=partial(_is_process_name, "dnsmasq")
Alex Klein1699fab2022-09-08 08:46:06 -060059 ),
60 _ProcessMetric(
Congbin Guo522cd982022-10-06 11:47:28 -070061 "drone-agent",
Congbin Guofcb436b2023-01-23 20:36:01 -080062 test_func=partial(_is_process_name, "drone-agent"),
Congbin Guo522cd982022-10-06 11:47:28 -070063 ),
64 _ProcessMetric(
65 "fleet-tlw", test_func=partial(_is_process_name, "fleet-tlw")
66 ),
67 _ProcessMetric(
68 "getty", test_func=partial(_is_process_name, "getty")
Alex Klein1699fab2022-09-08 08:46:06 -060069 ),
70 _ProcessMetric(
71 "gs_offloader",
72 test_func=partial(_is_process_name, "gs_offloader.py"),
73 ),
74 _ProcessMetric("gsutil", test_func=_is_gsutil),
75 _ProcessMetric("java", test_func=partial(_is_process_name, "java")),
Congbin Guo4ccf0632023-02-12 00:01:14 -080076 _ProcessMetric("k8s_system", test_func=_is_k8s_system),
Alex Klein1699fab2022-09-08 08:46:06 -060077 _ProcessMetric(
Congbin Guo522cd982022-10-06 11:47:28 -070078 "labservice", test_func=partial(_is_process_name, "labservice")
79 ),
80 _ProcessMetric(
Alex Klein1699fab2022-09-08 08:46:06 -060081 "lxc-attach", test_func=partial(_is_process_name, "lxc-attach")
82 ),
83 _ProcessMetric(
84 "lxc-start", test_func=partial(_is_process_name, "lxc-start")
85 ),
Congbin Guoa8432502023-01-23 20:31:01 -080086 _ProcessMetric(
87 "podman-pull", test_func=partial(_is_podman, "pull")
88 ),
89 _ProcessMetric("podman-run", test_func=partial(_is_podman, "run")),
Congbin Guo4ccf0632023-02-12 00:01:14 -080090 _ProcessMetric(
91 "phosphorus", test_func=partial(_is_process_name, "phosphorus")
92 ),
93 _ProcessMetric("recipe", test_func=_is_recipe),
Alex Klein1699fab2022-09-08 08:46:06 -060094 _ProcessMetric("sshd", test_func=partial(_is_process_name, "sshd")),
95 _ProcessMetric("swarming_bot", test_func=_is_swarming_bot),
96 _ProcessMetric(
Congbin Guo4ccf0632023-02-12 00:01:14 -080097 "swarming_sub_task", test_func=_is_swarming_sub_task
98 ),
99 _ProcessMetric(
Alex Klein1699fab2022-09-08 08:46:06 -0600100 "sysmon",
101 test_func=partial(_is_python_module, "chromite.scripts.sysmon"),
102 ),
Congbin Guo522cd982022-10-06 11:47:28 -0700103 _ProcessMetric("tko_proxy", test_func=_is_tko_proxy),
Alex Klein1699fab2022-09-08 08:46:06 -0600104 ]
105 self._other_metric = _ProcessMetric("other")
Allen Li6bb74d52017-06-22 14:44:53 -0700106
Alex Klein1699fab2022-09-08 08:46:06 -0600107 def collect(self):
108 for proc in psutil.process_iter():
109 self._collect_proc(proc)
110 self._flush()
Allen Li6bb74d52017-06-22 14:44:53 -0700111
Alex Klein1699fab2022-09-08 08:46:06 -0600112 def _collect_proc(self, proc):
113 for metric in self._metrics:
114 if metric.add(proc):
115 break
116 else:
117 self._other_metric.add(proc)
Allen Li6bb74d52017-06-22 14:44:53 -0700118
Alex Klein1699fab2022-09-08 08:46:06 -0600119 def _flush(self):
120 for metric in self._metrics:
121 metric.flush()
122 self._other_metric.flush()
Allen Li6bb74d52017-06-22 14:44:53 -0700123
124
125class _ProcessMetric(object):
Alex Klein1699fab2022-09-08 08:46:06 -0600126 """Class for gathering process metrics."""
Allen Li6bb74d52017-06-22 14:44:53 -0700127
Alex Klein1699fab2022-09-08 08:46:06 -0600128 def __init__(self, process_name, test_func=lambda proc: True):
129 """Initialize instance.
Allen Li6bb74d52017-06-22 14:44:53 -0700130
Alex Klein1699fab2022-09-08 08:46:06 -0600131 process_name is used to identify the metric stream.
Allen Li6bb74d52017-06-22 14:44:53 -0700132
Alex Klein1699fab2022-09-08 08:46:06 -0600133 test_func is a function called
134 for each process. If it returns True, the process is counted. The
135 default test is to count every process.
136 """
137 self._fields = {
138 "process_name": process_name,
139 }
140 self._test_func = test_func
141 self._count = 0
Congbin Guo16b64d52023-02-10 17:50:30 -0800142 self._thread_count = 0
Alex Klein1699fab2022-09-08 08:46:06 -0600143 self._cpu_percent = 0
Allen Li6bb74d52017-06-22 14:44:53 -0700144
Alex Klein1699fab2022-09-08 08:46:06 -0600145 def add(self, proc):
146 """Do metric collection for the given process.
Allen Li6bb74d52017-06-22 14:44:53 -0700147
Alex Klein1699fab2022-09-08 08:46:06 -0600148 Returns True if the process was collected.
149 """
150 if not self._test_func(proc):
151 return False
152 self._count += 1
Congbin Guo16b64d52023-02-10 17:50:30 -0800153 self._thread_count += proc.num_threads()
Alex Klein1699fab2022-09-08 08:46:06 -0600154 self._cpu_percent += proc.cpu_percent()
155 return True
Allen Li6bb74d52017-06-22 14:44:53 -0700156
Alex Klein1699fab2022-09-08 08:46:06 -0600157 def flush(self):
158 """Finish collection and send metrics."""
159 _count_metric.set(self._count, fields=self._fields)
160 self._count = 0
Congbin Guo16b64d52023-02-10 17:50:30 -0800161
162 _thread_count_metric.set(self._thread_count, fields=self._fields)
163 self._thread_count = 0
164
Alex Klein1699fab2022-09-08 08:46:06 -0600165 _cpu_percent_metric.set(
166 int(round(self._cpu_percent)), fields=self._fields
167 )
168 self._cpu_percent = 0
Allen Li51bb6122017-06-21 12:04:13 -0700169
170
def _is_parent_autoserv(proc):
    """Return whether proc is a parent (not forked) autoserv process.

    An autoserv process counts as a parent when its own parent process is
    not autoserv. psutil's parent() returns None when no parent is known;
    such a process cannot be a forked child, so it is treated as a parent.
    """
    if not _is_autoserv(proc):
        return False
    parent = proc.parent()
    return parent is None or not _is_autoserv(parent)
Allen Li51bb6122017-06-21 12:04:13 -0700174
175
176def _is_autoserv(proc):
Alex Klein1699fab2022-09-08 08:46:06 -0600177 """Return whether proc is an autoserv process."""
178 # This relies on the autoserv script being run directly. The script should
179 # be named autoserv exactly and start with a shebang that is /usr/bin/python,
180 # NOT /bin/env
181 return _is_process_name("autoserv", proc)
Allen Li51bb6122017-06-21 12:04:13 -0700182
183
Allen Li3992c662018-01-05 15:26:36 -0800184def _is_python_module(module, proc):
Alex Klein1699fab2022-09-08 08:46:06 -0600185 """Return whether proc is a process running a Python module."""
186 cmdline = proc.cmdline()
187 return (
188 cmdline
189 and cmdline[0].endswith("python")
190 and cmdline[1:3] == ["-m", module]
191 )
Allen Li3992c662018-01-05 15:26:36 -0800192
193
Prathmesh Prabhu0b795f02018-05-07 13:12:37 -0700194def _is_process_name(name, proc):
Alex Klein1699fab2022-09-08 08:46:06 -0600195 """Return whether process proc is named name."""
196 return proc.name() == name
Congbin Guo17542e02022-06-29 13:48:15 -0700197
198
Congbin Guo4ccf0632023-02-12 00:01:14 -0800199def _is_recipe(proc):
200 """Return whether proc is a recipe process.
201
202 An example proc is like
203 '/home/.../bin/python -u -s
204 /home/.../kitchen-checkout/recipe_engine/recipe_engine/main.py ...'.
205 """
206 cmdline = proc.cmdline()
207 return (
208 len(cmdline) >= 4
209 and cmdline[0].endswith("/python")
210 and cmdline[3].endswith("/recipe_engine/main.py")
211 )
212
213
Congbin Guo17542e02022-06-29 13:48:15 -0700214def _is_swarming_bot(proc):
Alex Klein1699fab2022-09-08 08:46:06 -0600215 """Return whether proc is a Swarming bot.
Congbin Guo17542e02022-06-29 13:48:15 -0700216
Alex Klein1699fab2022-09-08 08:46:06 -0600217 A swarming bot process is like '/usr/bin/python3.8 <bot-zip-path> start_bot'.
218 """
219 cmdline = proc.cmdline()
220 return (
221 len(cmdline) == 3
222 and cmdline[0].split("/")[-1].startswith("python")
223 and cmdline[2] == "start_bot"
224 )
Congbin Guo17542e02022-06-29 13:48:15 -0700225
226
Congbin Guo4ccf0632023-02-12 00:01:14 -0800227def _is_swarming_sub_task(proc):
228 """Return whether proc is a Swarming bot sub task.
229
230 An example Swarming sub task:
231 /usr/bin/python3.8 -u /.../swarming_bot.2.zip run_isolated ...
232 """
233 cmdline = proc.cmdline()
234 return (
235 len(cmdline) >= 4
236 and cmdline[0].split("/")[-1].startswith("python")
237 and cmdline[2].split("/")[-1].startswith("swarming_bot.")
238 )
239
240
Congbin Guo17542e02022-06-29 13:48:15 -0700241def _is_gsutil(proc):
Alex Klein1699fab2022-09-08 08:46:06 -0600242 """Return whether proc is gsutil."""
243 cmdline = proc.cmdline()
244 return (
245 len(cmdline) >= 2
246 and cmdline[0] == "python"
247 and cmdline[1].endswith("gsutil")
248 )
Congbin Guo522cd982022-10-06 11:47:28 -0700249
250
Congbin Guo4ccf0632023-02-12 00:01:14 -0800251def _is_k8s_system(proc):
252 """Return whether proc is a k8s system process."""
253 return proc.name() in ("kubelet", "kube-proxy")
254
255
Congbin Guo522cd982022-10-06 11:47:28 -0700256def _is_tko_proxy(proc):
257 """Return whether proc is a tko proxy.
258
259 A tk proxy process is like
260 '/opt/cloud_sql_proxy -dir=<...>
261 -instances=google.com:chromeos-lab:us-central1:tko
262 -credential_file=<...>'.
263 """
264 cmdline = proc.cmdline()
265 return (
266 len(cmdline) == 4
Congbin Guofcb436b2023-01-23 20:36:01 -0800267 and cmdline[0].split("/")[-1] == "cloud_sql_proxy"
268 and cmdline[2] == "-instances=google.com:chromeos-lab:us-central1:tko"
Congbin Guo522cd982022-10-06 11:47:28 -0700269 )
Congbin Guoa8432502023-01-23 20:31:01 -0800270
271
272def _is_podman(subcmd, proc):
273 """Return whiter proc is a podman process.
274
275 A podman pull process is like
276 'podman pull image:tag'
277 A podman run process is like
278 'podman run --option ... image:tag'
279 """
280 cmdline = proc.cmdline()
Congbin Guoba85d0b2023-01-27 18:32:21 -0800281 return proc.name() == "podman" and len(cmdline) > 1 and cmdline[1] == subcmd