blob: a124662bfa3ae242d514ef4740770bf4dbe271cb [file] [log] [blame]
Allen Liec5beb32016-09-08 15:31:41 -07001# Copyright 2016 The Chromium OS Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5# Copyright (c) 2015 The Chromium Authors. All rights reserved.
6# Use of this source code is governed by a BSD-style license that can be
7# found in the LICENSE file.
8
9"""System metrics."""
10
11from __future__ import print_function
12
13import errno
14import os
Allen Liec5beb32016-09-08 15:31:41 -070015import time
16
17import psutil
18
19from chromite.lib import cros_logging as logging
20from infra_libs import ts_mon
21
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


# CPU metrics.
_cpu_count_metric = ts_mon.GaugeMetric(
    'dev/cpu/count', description='Number of CPU cores.')
_cpu_time_metric = ts_mon.FloatMetric(
    'dev/cpu/time',
    description=('percentage of time spent by the CPU in different '
                 'states.'))

# Disk space metrics, in bytes.
_disk_free_metric = ts_mon.GaugeMetric(
    'dev/disk/free',
    units=ts_mon.MetricsDataUnits.BYTES,
    description='Available bytes on disk partition.')
_disk_total_metric = ts_mon.GaugeMetric(
    'dev/disk/total',
    units=ts_mon.MetricsDataUnits.BYTES,
    description='Total bytes on disk partition.')

# Inode metrics.
_inodes_free_metric = ts_mon.GaugeMetric(
    'dev/inodes/free',
    description=('Number of available inodes on disk partition '
                 '(unix only).'))
_inodes_total_metric = ts_mon.GaugeMetric(
    'dev/inodes/total',
    description=('Number of possible inodes on disk partition '
                 '(unix only)'))

# Memory metrics, in bytes.
_mem_free_metric = ts_mon.GaugeMetric(
    'dev/mem/free',
    units=ts_mon.MetricsDataUnits.BYTES,
    description=('Amount of memory available to a process (in Bytes). '
                 'Buffers are considered free memory.'))

_mem_total_metric = ts_mon.GaugeMetric(
    'dev/mem/total',
    units=ts_mon.MetricsDataUnits.BYTES,
    description='Total physical memory in Bytes.')
Allen Liec5beb32016-09-08 15:31:41 -070062
# Boot time as reported by psutil, read once when the module is imported.  It
# is the start time of the cumulative disk counters below and the reference
# point for the uptime gauge.
_BOOT_TIME = psutil.boot_time()

# Cumulative disk I/O counters, in bytes.
_disk_read_metric = ts_mon.CounterMetric(
    'dev/disk/read',
    description='Number of Bytes read on disk.',
    units=ts_mon.MetricsDataUnits.BYTES,
    start_time=_BOOT_TIME)
_disk_write_metric = ts_mon.CounterMetric(
    'dev/disk/write',
    description='Number of Bytes written on disk.',
    units=ts_mon.MetricsDataUnits.BYTES,
    start_time=_BOOT_TIME)

_uptime_metric = ts_mon.GaugeMetric(
    'dev/uptime',
    units=ts_mon.MetricsDataUnits.SECONDS,
    description='Machine uptime, in seconds.')

# Process count and load metrics.
_proc_count_metric = ts_mon.GaugeMetric(
    'dev/proc/count', description='Number of processes currently running.')
_autoserv_proc_count_metric = ts_mon.GaugeMetric(
    'dev/proc/autoserv_count',
    description='Number of autoserv processes currently running.')
_sysmon_proc_count_metric = ts_mon.GaugeMetric(
    'dev/proc/sysmon_count',
    description='Number of sysmon processes currently running.')
_load_average_metric = ts_mon.FloatMetric(
    'dev/proc/load_average',
    description=('Number of processes currently in the system run '
                 'queue.'))
Allen Liec5beb32016-09-08 15:31:41 -070092
# The ts_mon pipeline assigns timestamps to metric points using backend
# clocks.  Comparing a point's timestamp with its value (the time according
# to this machine's local clock) can reveal anomalies such as clock drift,
# unusually high metrics pipeline delay or a completely wrong clock.
#
# It is important to gather this metric right before the flush.
_unix_time_metric = ts_mon.GaugeMetric(
    'dev/unix_time',
    description=('Number of milliseconds since epoch based on local '
                 'machine clock.'))

# Static descriptions of the operating system and Python runtime.
_os_name_metric = ts_mon.StringMetric(
    'proc/os/name', description='OS name on the machine')

_os_version_metric = ts_mon.StringMetric(
    'proc/os/version', description='OS version on the machine')

_os_arch_metric = ts_mon.StringMetric(
    'proc/os/arch', description='OS architecture on this machine')

_python_arch_metric = ts_mon.StringMetric(
    'proc/python/arch',
    description='python userland architecture on this machine')
Allen Liec5beb32016-09-08 15:31:41 -0700120
121
def collect_uptime():
  """Set the uptime gauge to the whole seconds elapsed since _BOOT_TIME."""
  seconds_since_boot = time.time() - _BOOT_TIME
  # int() truncates the fractional part of the elapsed time.
  _uptime_metric.set(int(seconds_since_boot))
Allen Liec5beb32016-09-08 15:31:41 -0700124
125
def collect_cpu_info():
  """Report the CPU count and the CPU time percentages by mode.

  The percentages come from psutil.cpu_times_percent(); only the 'user',
  'system' and 'idle' modes are reported, each tagged with a 'mode' field.
  """
  _cpu_count_metric.set(psutil.cpu_count())

  cpu_percent = psutil.cpu_times_percent()
  for state in ('user', 'system', 'idle'):
    share = getattr(cpu_percent, state)
    _cpu_time_metric.set(share, {'mode': state})
Allen Liec5beb32016-09-08 15:31:41 -0700132
133
def collect_disk_info(mountpoints=None):
  """Report disk usage, inode and disk I/O metrics.

  Args:
    mountpoints: Iterable of mount point paths to report on.  When None,
      the mount point of every partition returned by
      psutil.disk_partitions() is used.
  """
  if mountpoints is None:
    mountpoints = [disk.mountpoint for disk in psutil.disk_partitions()]
  for mountpoint in mountpoints:
    # _collect_disk_info_single() already reports inode counts, guarded by
    # an os.name == 'posix' check.  Calling _collect_fs_inode_info() again
    # here would repeat the statvfs() call for every mount point and would
    # bypass that guard on platforms without os.statvfs().
    _collect_disk_info_single(mountpoint)
  _collect_disk_io_info()
Allen Liec5beb32016-09-08 15:31:41 -0700141
Allen Liec5beb32016-09-08 15:31:41 -0700142
def _collect_disk_info_single(mountpoint):
  """Report free and total bytes for one mount point, plus inode counts.

  Args:
    mountpoint: Path of the mount point to inspect.

  Raises:
    OSError: psutil.disk_usage() failed with an errno other than ENOENT.
  """
  try:
    disk_usage = psutil.disk_usage(mountpoint)
  except OSError as err:
    # ENOENT happens on Windows when querying a removable drive that
    # doesn't have any media inserted right now; anything else is a real
    # failure and propagates.
    if err.errno != errno.ENOENT:
      raise
  else:
    path_fields = {'path': mountpoint}
    _disk_free_metric.set(disk_usage.free, fields=path_fields)
    _disk_total_metric.set(disk_usage.total, fields=path_fields)

  # inode counts are only available on Unix.
  if os.name == 'posix':
    _collect_fs_inode_info(mountpoint)
Allen Lia6b02252016-10-26 14:40:51 -0700162
163
def _collect_fs_inode_info(mountpoint):
  """Report inode counts for the filesystem mounted at mountpoint.

  Args:
    mountpoint: Path passed to os.statvfs(); also used as the 'path' field
      of both metrics.
  """
  vfs_stats = os.statvfs(mountpoint)
  path_fields = {'path': mountpoint}
  # f_favail feeds the "free" gauge and f_files the "total" gauge.
  _inodes_free_metric.set(vfs_stats.f_favail, fields=path_fields)
  _inodes_total_metric.set(vfs_stats.f_files, fields=path_fields)
169
170
def _collect_disk_io_info():
  """Report the bytes read and written for each disk.

  The counters come from psutil.disk_io_counters(perdisk=True); each value
  is tagged with a 'disk' field holding the key psutil uses for that disk.

  Raises:
    RuntimeError: psutil raised a RuntimeError other than the
      "couldn't find any physical disk" one.
  """
  try:
    per_disk_counters = psutil.disk_io_counters(perdisk=True)
  except RuntimeError as ex:
    if "couldn't find any physical disk" in str(ex):
      # Disk performance counters aren't enabled on Windows.
      return
    raise

  # items() exists on both Python 2 and Python 3 dicts; iteritems() was
  # removed in Python 3 and would raise AttributeError there.
  for disk, counters in per_disk_counters.items():
    fields = {'disk': disk}
    _disk_read_metric.set(counters.read_bytes, fields=fields)
    _disk_write_metric.set(counters.write_bytes, fields=fields)
Allen Liec5beb32016-09-08 15:31:41 -0700185
186
def collect_mem_info():
  """Report available and total memory from psutil.virtual_memory()."""
  # Used memory is deliberately not reported: because of virtual memory it
  # is not a useful number.
  memory = psutil.virtual_memory()
  available_bytes = memory.available
  _mem_free_metric.set(available_bytes)
  total_bytes = memory.total
  _mem_total_metric.set(total_bytes)
Allen Liec5beb32016-09-08 15:31:41 -0700193
194
def collect_proc_info():
  """Report how many processes, autoserv parents and sysmons are running.

  Every process yielded by psutil.process_iter() is classified as a parent
  autoserv process, a sysmon process, or neither.  The three counts are
  written to their gauges once the scan has finished.
  """
  autoserv_count = 0
  sysmon_count = 0
  total = 0
  for proc in psutil.process_iter():
    try:
      if _is_parent_autoserv(proc):
        autoserv_count += 1
      elif _is_sysmon(proc):
        sysmon_count += 1
    except psutil.NoSuchProcess:
      # The process exited between being listed and being inspected.  It is
      # no longer running, so leave it out of every count instead of letting
      # the error abort the whole collection.
      continue
    total += 1
  logger.debug('autoserv_count: %s', autoserv_count)
  logger.debug('sysmon_count: %s', sysmon_count)
  _autoserv_proc_count_metric.set(autoserv_count)
  _sysmon_proc_count_metric.set(sysmon_count)
  _proc_count_metric.set(total)
210
211
def _is_parent_autoserv(proc):
  """Return whether proc is a parent (not forked) autoserv process.

  Args:
    proc: Process object providing name() and parent(), such as a
      psutil.Process.

  Returns:
    True if proc is an autoserv process whose parent is not one.
  """
  if not _is_autoserv(proc):
    return False
  parent = proc.parent()
  # parent() returns None when no parent process is known.  Such a process
  # cannot be a fork of another autoserv, so it counts as a parent itself.
  return parent is None or not _is_autoserv(parent)
215
216
def _is_autoserv(proc):
  """Return whether proc is an autoserv process.

  Args:
    proc: Process object providing name().
  """
  # Matching on the process name only works when the autoserv script is run
  # directly: it should be named exactly "autoserv" and start with a
  # /usr/bin/python shebang, NOT /bin/env.
  process_name = proc.name()
  return process_name == 'autoserv'
Allen Liec5beb32016-09-08 15:31:41 -0700223
Allen Lia6b02252016-10-26 14:40:51 -0700224
def _is_sysmon(proc):
  """Return whether proc is a sysmon process.

  Args:
    proc: Process object providing cmdline().
  """
  # Only the leading arguments are compared; anything after them is ignored.
  sysmon_prefix = ['python', '-m', 'chromite.scripts.sysmon']
  leading_args = proc.cmdline()[:len(sysmon_prefix)]
  return leading_args == sysmon_prefix
Allen Li0937a522016-11-23 13:34:48 -0800228
229
def collect_load_avg():
  """Report the 1, 5 and 15 minute load averages from os.getloadavg().

  Each value is tagged with a 'minutes' field.  Nothing is reported when
  os.getloadavg() raises OSError.
  """
  try:
    one_min, five_min, fifteen_min = os.getloadavg()
  except OSError:
    # Best effort: no load average to report, so leave the metric untouched.
    return

  for window, load in ((1, one_min), (5, five_min), (15, fifteen_min)):
    _load_average_metric.set(load, fields={'minutes': window})
Allen Liec5beb32016-09-08 15:31:41 -0700239
240
def collect_unix_time():
  """Set the unix time gauge to the local clock, in whole milliseconds."""
  # time.time() is in seconds; scale to milliseconds, then truncate.
  now_ms = int(time.time() * 1000)
  _unix_time_metric.set(now_ms)