blob: cb0a05d4d4845cfab19dd597447a6c36382c4da6 [file] [log] [blame]
Allen Liec5beb32016-09-08 15:31:41 -07001# Copyright 2016 The Chromium OS Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5# Copyright (c) 2015 The Chromium Authors. All rights reserved.
6# Use of this source code is governed by a BSD-style license that can be
7# found in the LICENSE file.
8
9"""System metrics."""
10
11from __future__ import print_function
12
13import errno
14import os
Allen Liec5beb32016-09-08 15:31:41 -070015import time
16
17import psutil
18
19from chromite.lib import cros_logging as logging
20from infra_libs import ts_mon
21
Allen Li79317bb2016-12-16 18:25:07 -080022logger = logging.getLogger(__name__)
23
Allen Liec5beb32016-09-08 15:31:41 -070024
# CPU metrics, set by collect_cpu_info(): the core count, and the CPU time
# percentages reported by psutil, one stream per 'mode' field value.
_cpu_count_metric = ts_mon.GaugeMetric(
    'dev/cpu/count',
    description='Number of CPU cores.')
_cpu_time_metric = ts_mon.FloatMetric(
    'dev/cpu/time',
    description='percentage of time spent by the CPU '
    'in different states.')
Allen Liec5beb32016-09-08 15:31:41 -070032
# Per-mountpoint disk usage gauges, set by _collect_disk_info_single() with
# a 'path' field naming the mountpoint.
_disk_free_metric = ts_mon.GaugeMetric(
    'dev/disk/free',
    description='Available bytes on disk partition.',
    units=ts_mon.MetricsDataUnits.BYTES)
_disk_total_metric = ts_mon.GaugeMetric(
    'dev/disk/total',
    description='Total bytes on disk partition.',
    units=ts_mon.MetricsDataUnits.BYTES)

# Per-mountpoint inode gauges, set by _collect_fs_inode_info() from
# os.statvfs() with a 'path' field naming the mountpoint.
_inodes_free_metric = ts_mon.GaugeMetric(
    'dev/inodes/free',
    description='Number of available inodes on '
    'disk partition (unix only).')
_inodes_total_metric = ts_mon.GaugeMetric(
    'dev/inodes/total',
    description='Number of possible inodes on '
    'disk partition (unix only)')
Allen Liec5beb32016-09-08 15:31:41 -070050
# Memory gauges, set by collect_mem_info() from psutil.virtual_memory():
# 'free' carries the 'available' figure, 'total' the 'total' figure.
_mem_free_metric = ts_mon.GaugeMetric(
    'dev/mem/free',
    description='Amount of memory available to a '
    'process (in Bytes). Buffers are considered '
    'free memory.',
    units=ts_mon.MetricsDataUnits.BYTES)

_mem_total_metric = ts_mon.GaugeMetric(
    'dev/mem/total',
    description='Total physical memory in Bytes.',
    units=ts_mon.MetricsDataUnits.BYTES)
Allen Liec5beb32016-09-08 15:31:41 -070062
# Boot time as returned by psutil.boot_time(), captured once at import.
# It is the start_time of the cumulative counters in this module and the
# reference point subtracted from time.time() in collect_uptime().
BOOT_TIME = psutil.boot_time()

# Per-interface network counters, set by _collect_net_io_counters() with an
# 'interface' field naming the NIC.
_net_up_metric = ts_mon.CounterMetric(
    'dev/net/bytes/up', start_time=BOOT_TIME,
    description='Number of bytes sent on interface.',
    units=ts_mon.MetricsDataUnits.BYTES)
_net_down_metric = ts_mon.CounterMetric(
    'dev/net/bytes/down', start_time=BOOT_TIME,
    description='Number of Bytes received on '
    'interface.',
    units=ts_mon.MetricsDataUnits.BYTES)
_net_err_up_metric = ts_mon.CounterMetric(
    'dev/net/err/up', start_time=BOOT_TIME,
    description='Total number of errors when '
    'sending (per interface).')
_net_err_down_metric = ts_mon.CounterMetric(
    'dev/net/err/down', start_time=BOOT_TIME,
    description='Total number of errors when '
    'receiving (per interface).')
_net_drop_up_metric = ts_mon.CounterMetric(
    'dev/net/drop/up', start_time=BOOT_TIME,
    description='Total number of outgoing '
    'packets that have been dropped.')
_net_drop_down_metric = ts_mon.CounterMetric(
    'dev/net/drop/down', start_time=BOOT_TIME,
    description='Total number of incoming '
    'packets that have been dropped.')
89
# Per-interface link properties, set by _collect_net_if_stats() from
# psutil.net_if_stats() with an 'interface' field naming the NIC.
_net_if_isup_metric = ts_mon.BooleanMetric(
    'dev/net/isup',
    description='Whether interface is up or down.')
_net_if_duplex_metric = ts_mon.GaugeMetric(
    'dev/net/duplex',
    description='Whether interface supports full or half duplex.')
_net_if_speed_metric = ts_mon.GaugeMetric(
    'dev/net/speed',
    description='Network interface speed in Mb.')
_net_if_mtu_metric = ts_mon.GaugeMetric(
    'dev/net/mtu',
    description='Network interface MTU in B.')
102
# Per-disk IO byte counters, set by _collect_disk_io_info() with a 'disk'
# field naming the device.
_disk_read_metric = ts_mon.CounterMetric(
    'dev/disk/read', start_time=BOOT_TIME,
    description='Number of Bytes read on disk.',
    units=ts_mon.MetricsDataUnits.BYTES)
_disk_write_metric = ts_mon.CounterMetric(
    'dev/disk/write', start_time=BOOT_TIME,
    description='Number of Bytes written on disk.',
    units=ts_mon.MetricsDataUnits.BYTES)
Allen Liec5beb32016-09-08 15:31:41 -0700111
# Set by collect_uptime().
_uptime_metric = ts_mon.GaugeMetric(
    'dev/uptime',
    description='Machine uptime, in seconds.',
    units=ts_mon.MetricsDataUnits.SECONDS)

# Process counts, all set by collect_proc_info().
_proc_count_metric = ts_mon.GaugeMetric(
    'dev/proc/count',
    description='Number of processes currently running.')
_autoserv_proc_count_metric = ts_mon.GaugeMetric(
    'dev/proc/autoserv_count',
    description='Number of autoserv processes currently running.')
_sysmon_proc_count_metric = ts_mon.GaugeMetric(
    'dev/proc/sysmon_count',
    description='Number of sysmon processes currently running.')
# Set by collect_load_avg(), one stream per 'minutes' field (1, 5, 15).
_load_average_metric = ts_mon.FloatMetric(
    'dev/proc/load_average',
    description='Number of processes currently '
    'in the system run queue.')
Allen Liec5beb32016-09-08 15:31:41 -0700130
# ts_mon pipeline uses backend clocks when assigning timestamps to metric
# points. By comparing point timestamp to the point value (i.e. time by
# machine's local clock), we can potentially detect some anomalies (clock
# drift, unusually high metrics pipeline delay, completely wrong clocks, etc).
#
# It is important to gather this metric right before the flush.
#
# Set by collect_unix_time().
_unix_time_metric = ts_mon.GaugeMetric(
    'dev/unix_time',
    description='Number of milliseconds since epoch'
    ' based on local machine clock.')
Allen Liec5beb32016-09-08 15:31:41 -0700141
# Descriptive string metrics about the OS and the Python runtime.
# NOTE(review): no collector in this part of the file sets these four
# metrics -- confirm where (or whether) they are populated.
_os_name_metric = ts_mon.StringMetric(
    'proc/os/name',
    description='OS name on the machine')

_os_version_metric = ts_mon.StringMetric(
    'proc/os/version',
    description='OS version on the machine')

_os_arch_metric = ts_mon.StringMetric(
    'proc/os/arch',
    description='OS architecture on this machine')

_python_arch_metric = ts_mon.StringMetric(
    'proc/python/arch',
    description='python userland '
    'architecture on this machine')
Allen Liec5beb32016-09-08 15:31:41 -0700158
159
def collect_uptime():
    """Report machine uptime in whole seconds.

    Uptime is the current time minus the module-level BOOT_TIME, converted
    to an int before being stored in the uptime gauge.
    """
    now = time.time()
    uptime_seconds = int(now - BOOT_TIME)
    _uptime_metric.set(uptime_seconds)
Allen Liec5beb32016-09-08 15:31:41 -0700162
163
def collect_cpu_info():
    """Report the CPU count and the user/system/idle CPU time percentages."""
    core_count = psutil.cpu_count()
    _cpu_count_metric.set(core_count)

    cpu_percentages = psutil.cpu_times_percent()
    # One point per CPU state, distinguished by the 'mode' field.
    for mode_name in ('user', 'system', 'idle'):
        percentage = getattr(cpu_percentages, mode_name)
        _cpu_time_metric.set(percentage, {'mode': mode_name})
Allen Liec5beb32016-09-08 15:31:41 -0700170
171
def collect_disk_info(mountpoints=None):
    """Collect disk usage, inode and disk IO metrics.

    Args:
      mountpoints: Iterable of mountpoint paths to report on. When None,
        the mountpoint of every partition returned by
        psutil.disk_partitions() is used.
    """
    if mountpoints is None:
        mountpoints = [disk.mountpoint for disk in psutil.disk_partitions()]
    for mountpoint in mountpoints:
        # _collect_disk_info_single() already reports inode counts behind
        # its own POSIX check. Collecting them here as well would stat every
        # filesystem twice and would call os.statvfs() unguarded, which is
        # not available on non-Unix platforms.
        _collect_disk_info_single(mountpoint)
    # IO counters cover every disk at once, so gather them a single time.
    _collect_disk_io_info()
Allen Liec5beb32016-09-08 15:31:41 -0700179
Allen Liec5beb32016-09-08 15:31:41 -0700180
def _collect_disk_info_single(mountpoint):
    """Report free/total bytes (and, on POSIX, inodes) for one mountpoint.

    Nothing is reported when psutil.disk_usage() fails with ENOENT; any
    other OSError is re-raised.

    Args:
      mountpoint: Path of the mountpoint to inspect.
    """
    try:
        usage = psutil.disk_usage(mountpoint)
    except OSError as ex:
        if ex.errno != errno.ENOENT:
            raise
        # This happens on Windows when querying a removable drive that
        # doesn't have any media inserted right now.
        return

    labels = {'path': mountpoint}
    _disk_free_metric.set(usage.free, fields=labels)
    _disk_total_metric.set(usage.total, fields=labels)

    # inode counts are only available on Unix.
    if os.name == 'posix':
        _collect_fs_inode_info(mountpoint)
Allen Lia6b02252016-10-26 14:40:51 -0700200
201
def _collect_fs_inode_info(mountpoint):
    """Report free and total inode counts for the filesystem at mountpoint.

    The figures come from os.statvfs(): f_favail for the free gauge and
    f_files for the total gauge.

    Args:
      mountpoint: Path of the mountpoint to inspect.
    """
    vfs_stats = os.statvfs(mountpoint)
    labels = {'path': mountpoint}
    _inodes_free_metric.set(vfs_stats.f_favail, fields=labels)
    _inodes_total_metric.set(vfs_stats.f_files, fields=labels)
207
208
def _collect_disk_io_info():
    """Report cumulative bytes read and written for each disk.

    Nothing is reported when psutil signals that it couldn't find any
    physical disk.

    Raises:
      RuntimeError: psutil failed for any other reason.
    """
    try:
        # items() instead of the Python 2-only iteritems(), so this also
        # runs on Python 3.
        disk_counters = psutil.disk_io_counters(perdisk=True).items()
    except RuntimeError as ex:
        if "couldn't find any physical disk" not in str(ex):
            raise
        # Disk performance counters aren't enabled on Windows.
        return

    for disk, counters in disk_counters:
        fields = {'disk': disk}
        _disk_read_metric.set(counters.read_bytes, fields=fields)
        _disk_write_metric.set(counters.write_bytes, fields=fields)
Allen Liec5beb32016-09-08 15:31:41 -0700223
224
def collect_mem_info():
    """Report available and total memory from psutil.virtual_memory()."""
    # mem.used is deliberately left out: because of virtual memory it is
    # not a useful figure.
    memory_stats = psutil.virtual_memory()
    available_bytes = memory_stats.available
    _mem_free_metric.set(available_bytes)
    total_bytes = memory_stats.total
    _mem_total_metric.set(total_bytes)
Allen Liec5beb32016-09-08 15:31:41 -0700231
232
def collect_net_info():
    """Collect network metrics."""
    # IO counters first, then interface stats.
    for collector in (_collect_net_io_counters, _collect_net_if_stats):
        collector()
Allen Lie04a8052017-03-02 14:30:48 -0800237
238
# (metric, attribute name) pairs consumed by _collect_net_io_counters():
# each attribute is read off a per-NIC counters object from
# psutil.net_io_counters(pernic=True) and stored in the paired metric.
_net_io_metrics = (
    (_net_up_metric, 'bytes_sent'),
    (_net_down_metric, 'bytes_recv'),
    (_net_err_up_metric, 'errout'),
    (_net_err_down_metric, 'errin'),
    (_net_drop_up_metric, 'dropout'),
    (_net_drop_down_metric, 'dropin'),
)
247
248
def _collect_net_io_counters():
    """Collect metrics for network IO counters.

    Every pair in _net_io_metrics is reported for each interface that
    _is_virtual_netif() does not classify as virtual. A counter that went
    backwards is logged as a warning rather than raised.
    """
    nics = psutil.net_io_counters(pernic=True)
    # items() instead of the Python 2-only iteritems(), so this also runs
    # on Python 3.
    for nic, counters in nics.items():
        if _is_virtual_netif(nic):
            continue
        fields = {'interface': nic}
        for metric, counter_name in _net_io_metrics:
            try:
                metric.set(getattr(counters, counter_name), fields=fields)
            except ts_mon.MonitoringDecreasingValueError as ex:
                # This normally shouldn't happen, but might if the network
                # driver module is reloaded, so log an error and continue
                # instead of raising an exception.
                logger.warning(str(ex))
Allen Liec5beb32016-09-08 15:31:41 -0700264
265
# (metric, attribute name) pairs consumed by _collect_net_if_stats(): each
# attribute is read off a per-NIC stats object from psutil.net_if_stats()
# and stored in the paired metric.
_net_if_metrics = (
    (_net_if_isup_metric, 'isup'),
    (_net_if_duplex_metric, 'duplex'),
    (_net_if_speed_metric, 'speed'),
    (_net_if_mtu_metric, 'mtu'),
)
272
273
def _collect_net_if_stats():
    """Collect metrics for network interface stats.

    Every pair in _net_if_metrics is reported for each interface that
    _is_virtual_netif() does not classify as virtual.
    """
    # items() instead of the Python 2-only iteritems(), so this also runs
    # on Python 3.
    for nic, stats in psutil.net_if_stats().items():
        if _is_virtual_netif(nic):
            continue
        fields = {'interface': nic}
        for metric, counter_name in _net_if_metrics:
            metric.set(getattr(stats, counter_name), fields=fields)
282
283
Allen Lic18ca9d2017-03-02 14:24:57 -0800284def _is_virtual_netif(nic):
285 """Return whether the network interface is virtual."""
286 # TODO(ayatane): Use a different way of identifying virtual interfaces
287 return nic.startswith('veth')
288
289
def collect_proc_info():
    """Report the total, parent autoserv and sysmon process counts.

    A process that exits while it is being inspected is left out of every
    count instead of aborting the whole collection.
    """
    autoserv_count = 0
    sysmon_count = 0
    total = 0
    for proc in psutil.process_iter():
        try:
            is_autoserv = _is_parent_autoserv(proc)
            is_sysmon = not is_autoserv and _is_sysmon(proc)
        except psutil.NoSuchProcess:
            # The process went away between being listed and being queried.
            continue
        if is_autoserv:
            autoserv_count += 1
        elif is_sysmon:
            sysmon_count += 1
        total += 1
    logger.debug('autoserv_count: %s', autoserv_count)
    logger.debug('sysmon_count: %s', sysmon_count)
    _autoserv_proc_count_metric.set(autoserv_count)
    _sysmon_proc_count_metric.set(sysmon_count)
    _proc_count_metric.set(total)
305
306
def _is_parent_autoserv(proc):
    """Return whether proc is a parent (not forked) autoserv process.

    An autoserv process whose parent cannot be determined is treated as a
    parent autoserv process.

    Args:
      proc: Process object providing name() and parent().
    """
    if not _is_autoserv(proc):
        return False
    parent = proc.parent()
    # parent() may yield None when no parent is known; passing that on to
    # _is_autoserv() would fail on None.name().
    if parent is None:
        return True
    return not _is_autoserv(parent)
310
311
312def _is_autoserv(proc):
313 """Return whether proc is an autoserv process."""
314 # This relies on the autoserv script being run directly. The script should
315 # be named autoserv exactly and start with a shebang that is /usr/bin/python,
316 # NOT /bin/env
317 return proc.name() == 'autoserv'
Allen Liec5beb32016-09-08 15:31:41 -0700318
Allen Lia6b02252016-10-26 14:40:51 -0700319
Allen Li0937a522016-11-23 13:34:48 -0800320def _is_sysmon(proc):
321 """Return whether proc is a sysmon process."""
Allen Lidfdfbda2016-12-16 17:41:16 -0800322 return proc.cmdline()[:3] == ['python', '-m', 'chromite.scripts.sysmon']
Allen Li0937a522016-11-23 13:34:48 -0800323
324
def collect_load_avg():
    """Report the 1, 5 and 15 minute load averages.

    Nothing is reported when os.getloadavg() raises OSError.
    """
    try:
        one_min, five_min, fifteen_min = os.getloadavg()
    except OSError:
        return

    # One point per averaging window, distinguished by the 'minutes' field.
    windows = ((1, one_min), (5, five_min), (15, fifteen_min))
    for minutes, average in windows:
        _load_average_metric.set(average, fields={'minutes': minutes})
Allen Liec5beb32016-09-08 15:31:41 -0700334
335
def collect_unix_time():
    """Report the local clock as integer milliseconds since the epoch."""
    now_millis = int(time.time() * 1000)
    _unix_time_metric.set(now_millis)