blob: 1d755370cf971b53b521d0c5fcb4355c9d1e488c [file] [log] [blame]
Allen Liec5beb32016-09-08 15:31:41 -07001# Copyright 2016 The Chromium OS Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
Allen Liec5beb32016-09-08 15:31:41 -07005"""System metrics."""
6
Allen Li13bdf0c2017-03-02 15:18:16 -08007from __future__ import absolute_import
Allen Liec5beb32016-09-08 15:31:41 -07008from __future__ import print_function
Allen Li13bdf0c2017-03-02 15:18:16 -08009from __future__ import unicode_literals
Allen Liec5beb32016-09-08 15:31:41 -070010
11import errno
12import os
Allen Liec5beb32016-09-08 15:31:41 -070013import time
14
15import psutil
16
17from chromite.lib import cros_logging as logging
18from infra_libs import ts_mon
19
Allen Li79317bb2016-12-16 18:25:07 -080020logger = logging.getLogger(__name__)
21
Allen Liec5beb32016-09-08 15:31:41 -070022
# CPU metrics: the core count, and a float metric that collect_cpu_info()
# sets once per CPU mode.
_cpu_count_metric = ts_mon.GaugeMetric(
    'dev/cpu/count', description='Number of CPU cores.')
_cpu_time_metric = ts_mon.FloatMetric(
    'dev/cpu/time',
    description='percentage of time spent by the CPU in different states.')
Allen Liec5beb32016-09-08 15:31:41 -070030
# Disk capacity metrics, in bytes.
_disk_free_metric = ts_mon.GaugeMetric(
    'dev/disk/free',
    units=ts_mon.MetricsDataUnits.BYTES,
    description='Available bytes on disk partition.')
_disk_total_metric = ts_mon.GaugeMetric(
    'dev/disk/total',
    units=ts_mon.MetricsDataUnits.BYTES,
    description='Total bytes on disk partition.')
Allen Liec5beb32016-09-08 15:31:41 -070039
# Inode metrics for a disk partition.
_inodes_free_metric = ts_mon.GaugeMetric(
    'dev/inodes/free',
    description='Number of available inodes on disk partition (unix only).')
_inodes_total_metric = ts_mon.GaugeMetric(
    'dev/inodes/total',
    description='Number of possible inodes on disk partition (unix only)')
Allen Liec5beb32016-09-08 15:31:41 -070048
# Memory metrics, in bytes.
_mem_free_metric = ts_mon.GaugeMetric(
    'dev/mem/free',
    units=ts_mon.MetricsDataUnits.BYTES,
    description=('Amount of memory available to a process (in Bytes). '
                 'Buffers are considered free memory.'))

_mem_total_metric = ts_mon.GaugeMetric(
    'dev/mem/total',
    units=ts_mon.MetricsDataUnits.BYTES,
    description='Total physical memory in Bytes.')
Allen Liec5beb32016-09-08 15:31:41 -070060
# Boot timestamp as reported by psutil, captured once at import time.  It is
# the start time of the cumulative disk I/O counters below and the reference
# point for collect_uptime().
_BOOT_TIME = psutil.boot_time()

# Cumulative disk I/O counters, in bytes.
_disk_read_metric = ts_mon.CounterMetric(
    'dev/disk/read',
    start_time=_BOOT_TIME,
    units=ts_mon.MetricsDataUnits.BYTES,
    description='Number of Bytes read on disk.')
_disk_write_metric = ts_mon.CounterMetric(
    'dev/disk/write',
    start_time=_BOOT_TIME,
    units=ts_mon.MetricsDataUnits.BYTES,
    description='Number of Bytes written on disk.')
Allen Liec5beb32016-09-08 15:31:41 -070071
# Seconds elapsed since boot; set by collect_uptime().
_uptime_metric = ts_mon.GaugeMetric(
    'dev/uptime',
    units=ts_mon.MetricsDataUnits.SECONDS,
    description='Machine uptime, in seconds.')
Allen Liec5beb32016-09-08 15:31:41 -070076
# Process-count gauges (all processes, autoserv, sysmon), set by
# collect_proc_info().
_proc_count_metric = ts_mon.GaugeMetric(
    'dev/proc/count', description='Number of processes currently running.')
_autoserv_proc_count_metric = ts_mon.GaugeMetric(
    'dev/proc/autoserv_count',
    description='Number of autoserv processes currently running.')
_sysmon_proc_count_metric = ts_mon.GaugeMetric(
    'dev/proc/sysmon_count',
    description='Number of sysmon processes currently running.')

# Load average; collect_load_avg() sets it with a 'minutes' field of 1, 5
# and 15.
_load_average_metric = ts_mon.FloatMetric(
    'dev/proc/load_average',
    description='Number of processes currently in the system run queue.')
Allen Liec5beb32016-09-08 15:31:41 -070090
# The ts_mon pipeline timestamps metric points using backend clocks.  The
# value of this metric is the time according to the machine's local clock, so
# comparing a point's timestamp with its value can reveal anomalies such as
# clock drift, unusually high metrics pipeline delay, or completely wrong
# clocks.
#
# It is important to gather this metric right before the flush.
_unix_time_metric = ts_mon.GaugeMetric(
    'dev/unix_time',
    description=('Number of milliseconds since epoch based on '
                 'local machine clock.'))
Allen Liec5beb32016-09-08 15:31:41 -0700101
# String metrics describing the operating system and the Python userland.
_os_name_metric = ts_mon.StringMetric(
    'proc/os/name', description='OS name on the machine')

_os_version_metric = ts_mon.StringMetric(
    'proc/os/version', description='OS version on the machine')

_os_arch_metric = ts_mon.StringMetric(
    'proc/os/arch', description='OS architecture on this machine')

_python_arch_metric = ts_mon.StringMetric(
    'proc/python/arch',
    description='python userland architecture on this machine')
Allen Liec5beb32016-09-08 15:31:41 -0700118
119
def collect_uptime():
    """Set the uptime metric to the whole seconds elapsed since boot."""
    seconds_since_boot = time.time() - _BOOT_TIME
    _uptime_metric.set(int(seconds_since_boot))
Allen Liec5beb32016-09-08 15:31:41 -0700122
123
def collect_cpu_info():
    """Report the CPU count and the user/system/idle time percentages."""
    core_count = psutil.cpu_count()
    _cpu_count_metric.set(core_count)

    cpu_percentages = psutil.cpu_times_percent()
    # One data point per CPU mode, distinguished by the 'mode' field.
    for state in ('user', 'system', 'idle'):
        percentage = getattr(cpu_percentages, state)
        _cpu_time_metric.set(percentage, {'mode': state})
Allen Liec5beb32016-09-08 15:31:41 -0700130
131
def collect_disk_info(mountpoints=None):
    """Collect per-mountpoint disk metrics and per-disk I/O counters.

    Args:
      mountpoints: Iterable of mountpoint paths to report on.  When None,
        the mountpoint of every partition returned by
        psutil.disk_partitions() is used.
    """
    if mountpoints is None:
        mountpoints = [disk.mountpoint for disk in psutil.disk_partitions()]
    for mountpoint in mountpoints:
        # _collect_disk_info_single() reports the inode metrics itself, and
        # only on POSIX hosts.  Calling _collect_fs_inode_info() here as
        # well would collect them twice and would run os.statvfs() without
        # that platform guard.
        _collect_disk_info_single(mountpoint)
    _collect_disk_io_info()
Allen Liec5beb32016-09-08 15:31:41 -0700139
Allen Liec5beb32016-09-08 15:31:41 -0700140
def _collect_disk_info_single(mountpoint):
    """Report free/total bytes, and on POSIX inode counts, for a mountpoint.

    Args:
      mountpoint: Path of the mountpoint to query.

    Raises:
      OSError: If psutil.disk_usage() fails with any errno other than ENOENT.
    """
    labels = {'path': mountpoint}

    try:
        disk_usage = psutil.disk_usage(mountpoint)
    except OSError as err:
        # ENOENT happens on Windows when querying a removable drive that
        # doesn't have any media inserted right now; skip the usage metrics
        # in that case.  Anything else is unexpected.
        if err.errno != errno.ENOENT:
            raise
    else:
        _disk_free_metric.set(disk_usage.free, fields=labels)
        _disk_total_metric.set(disk_usage.total, fields=labels)

    # inode counts are only available on Unix.
    if os.name == 'posix':
        _collect_fs_inode_info(mountpoint)
Allen Lia6b02252016-10-26 14:40:51 -0700160
161
def _collect_fs_inode_info(mountpoint):
    """Report the inode counts of the filesystem at the given mountpoint.

    Args:
      mountpoint: Path passed to os.statvfs().
    """
    vfs_stats = os.statvfs(mountpoint)
    labels = {'path': mountpoint}
    _inodes_free_metric.set(vfs_stats.f_favail, fields=labels)
    _inodes_total_metric.set(vfs_stats.f_files, fields=labels)
167
168
def _collect_disk_io_info():
    """Report the cumulative bytes read and written for every disk.

    Raises:
      RuntimeError: If psutil.disk_io_counters() fails for any reason other
        than not finding a physical disk.
    """
    try:
        counters_by_disk = psutil.disk_io_counters(perdisk=True)
    except RuntimeError as ex:
        if "couldn't find any physical disk" not in str(ex):
            raise
        # Disk performance counters aren't enabled on Windows.
        return

    # dict.items() works on both Python 2 and 3; iteritems() is Python 2
    # only.
    for disk, counters in counters_by_disk.items():
        fields = {'disk': disk}
        _disk_read_metric.set(counters.read_bytes, fields=fields)
        _disk_write_metric.set(counters.write_bytes, fields=fields)
Allen Liec5beb32016-09-08 15:31:41 -0700183
184
def collect_mem_info():
    """Report the available and total memory from psutil.virtual_memory()."""
    # mem.used is deliberately not reported: because of virtual memory it is
    # not a useful number.
    memory = psutil.virtual_memory()
    available_bytes, total_bytes = memory.available, memory.total
    _mem_free_metric.set(available_bytes)
    _mem_total_metric.set(total_bytes)
Allen Liec5beb32016-09-08 15:31:41 -0700191
192
def collect_proc_info():
    """Report the number of running processes, autoserv parents and sysmons.

    Every process yielded by psutil.process_iter() counts toward the total.
    Each process is additionally classified as a parent autoserv process or,
    failing that, as a sysmon process.
    """
    autoserv_count = 0
    sysmon_count = 0
    total = 0
    for proc in psutil.process_iter():
        total += 1
        try:
            if _is_parent_autoserv(proc):
                autoserv_count += 1
            elif _is_sysmon(proc):
                sysmon_count += 1
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            # The process exited, or its details became unreadable, between
            # being listed and being inspected.  It cannot be classified,
            # but one such process must not abort the whole collection.
            continue
    logger.debug('autoserv_count: %s', autoserv_count)
    logger.debug('sysmon_count: %s', sysmon_count)
    _autoserv_proc_count_metric.set(autoserv_count)
    _sysmon_proc_count_metric.set(sysmon_count)
    _proc_count_metric.set(total)
208
209
def _is_parent_autoserv(proc):
    """Return whether proc is a parent (not forked) autoserv process.

    Args:
      proc: Object exposing name() and parent(), e.g. a psutil.Process.

    Returns:
      True if proc is an autoserv process whose parent is not one (or that
      has no parent at all), False otherwise.
    """
    if not _is_autoserv(proc):
        return False
    parent = proc.parent()
    # parent() may return None; an autoserv process without a parent cannot
    # be a fork of another autoserv, so it counts as a parent itself.
    if parent is None:
        return True
    return not _is_autoserv(parent)


def _is_autoserv(proc):
    """Return whether proc is an autoserv process."""
    # This relies on the autoserv script being run directly.  The script
    # should be named autoserv exactly and start with a shebang that is
    # /usr/bin/python, NOT /bin/env
    return proc.name() == 'autoserv'
Allen Liec5beb32016-09-08 15:31:41 -0700221
Allen Lia6b02252016-10-26 14:40:51 -0700222
def _is_sysmon(proc):
    """Return whether proc is a sysmon process.

    A process qualifies when its command line starts with exactly
    `python -m chromite.scripts.sysmon`.
    """
    expected_prefix = ['python', '-m', 'chromite.scripts.sysmon']
    leading_args = proc.cmdline()[:len(expected_prefix)]
    return leading_args == expected_prefix
Allen Li0937a522016-11-23 13:34:48 -0800226
227
def collect_load_avg():
    """Report the 1, 5 and 15 minute load averages from os.getloadavg().

    Nothing is reported when os.getloadavg() raises OSError.
    """
    try:
        one_min, five_min, fifteen_min = os.getloadavg()
    except OSError:
        return

    readings = ((1, one_min), (5, five_min), (15, fifteen_min))
    for minutes, average in readings:
        _load_average_metric.set(average, fields={'minutes': minutes})
Allen Liec5beb32016-09-08 15:31:41 -0700237
238
def collect_unix_time():
    """Set the unix-time metric to the local clock in whole milliseconds."""
    now_ms = int(time.time() * 1000)
    _unix_time_metric.set(now_ms)