blob: 57575049709e4547db130f8cdf381464b83466a4 [file] [log] [blame]
Allen Liec5beb32016-09-08 15:31:41 -07001# Copyright 2016 The Chromium OS Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5# Copyright (c) 2015 The Chromium Authors. All rights reserved.
6# Use of this source code is governed by a BSD-style license that can be
7# found in the LICENSE file.
8
9"""System metrics."""
10
11from __future__ import print_function
12
13import errno
14import os
Allen Liec5beb32016-09-08 15:31:41 -070015import time
16
17import psutil
18
19from chromite.lib import cros_logging as logging
20from infra_libs import ts_mon
21
22
# CPU metrics: the core count, plus a percentage value that get_cpu_info()
# reports once per CPU mode (the mode is attached as a metric field).
_cpu_count_metric = ts_mon.GaugeMetric(
    'dev/cpu/count', description='Number of CPU cores.')
_cpu_time_metric = ts_mon.FloatMetric(
    'dev/cpu/time',
    description='percentage of time spent by the CPU in different states.')
Allen Liec5beb32016-09-08 15:31:41 -070030
# Disk capacity gauges, in bytes.  Reported per mount point (field 'path').
_disk_free_metric = ts_mon.GaugeMetric(
    'dev/disk/free',
    description='Available bytes on disk partition.',
    units=ts_mon.MetricsDataUnits.BYTES)
_disk_total_metric = ts_mon.GaugeMetric(
    'dev/disk/total',
    description='Total bytes on disk partition.',
    units=ts_mon.MetricsDataUnits.BYTES)

# Inode gauges.  Reported per mount point (field 'path').
_inodes_free_metric = ts_mon.GaugeMetric(
    'dev/inodes/free',
    description='Number of available inodes on disk partition (unix only).')
_inodes_total_metric = ts_mon.GaugeMetric(
    'dev/inodes/total',
    description='Number of possible inodes on disk partition (unix only)')
Allen Liec5beb32016-09-08 15:31:41 -070048
# Memory gauges, in bytes.
_mem_free_metric = ts_mon.GaugeMetric(
    'dev/mem/free',
    description=('Amount of memory available to a process (in Bytes). '
                 'Buffers are considered free memory.'),
    units=ts_mon.MetricsDataUnits.BYTES)

_mem_total_metric = ts_mon.GaugeMetric(
    'dev/mem/total',
    description='Total physical memory in Bytes.',
    units=ts_mon.MetricsDataUnits.BYTES)
Allen Liec5beb32016-09-08 15:31:41 -070060
# Boot time as reported by psutil, captured once at import.  It is the start
# time of every cumulative counter below and the reference for the uptime
# gauge.
BOOT_TIME = psutil.boot_time()

# Network counters.  Reported per interface (field 'interface').
_net_up_metric = ts_mon.CounterMetric(
    'dev/net/bytes/up',
    start_time=BOOT_TIME,
    description='Number of bytes sent on interface.',
    units=ts_mon.MetricsDataUnits.BYTES)
_net_down_metric = ts_mon.CounterMetric(
    'dev/net/bytes/down',
    start_time=BOOT_TIME,
    description='Number of Bytes received on interface.',
    units=ts_mon.MetricsDataUnits.BYTES)
_net_err_up_metric = ts_mon.CounterMetric(
    'dev/net/err/up',
    start_time=BOOT_TIME,
    description='Total number of errors when sending (per interface).')
_net_err_down_metric = ts_mon.CounterMetric(
    'dev/net/err/down',
    start_time=BOOT_TIME,
    description='Total number of errors when receiving (per interface).')
_net_drop_up_metric = ts_mon.CounterMetric(
    'dev/net/drop/up',
    start_time=BOOT_TIME,
    description='Total number of outgoing packets that have been dropped.')
_net_drop_down_metric = ts_mon.CounterMetric(
    'dev/net/drop/down',
    start_time=BOOT_TIME,
    description='Total number of incoming packets that have been dropped.')
87
# Disk I/O counters, in bytes.  Reported per disk (field 'disk').
_disk_read_metric = ts_mon.CounterMetric(
    'dev/disk/read',
    start_time=BOOT_TIME,
    description='Number of Bytes read on disk.',
    units=ts_mon.MetricsDataUnits.BYTES)
_disk_write_metric = ts_mon.CounterMetric(
    'dev/disk/write',
    start_time=BOOT_TIME,
    description='Number of Bytes written on disk.',
    units=ts_mon.MetricsDataUnits.BYTES)

# Whole seconds elapsed since BOOT_TIME.
_uptime_metric = ts_mon.GaugeMetric(
    'dev/uptime',
    description='Machine uptime, in seconds.',
    units=ts_mon.MetricsDataUnits.SECONDS)
Allen Liec5beb32016-09-08 15:31:41 -0700101
# Process-count gauges: every process, parent autoserv processes, and sysmon
# processes.
_proc_count_metric = ts_mon.GaugeMetric(
    'dev/proc/count',
    description='Number of processes currently running.')
_autoserv_proc_count_metric = ts_mon.GaugeMetric(
    'dev/proc/autoserv_count',
    description='Number of autoserv processes currently running.')
_sysmon_proc_count_metric = ts_mon.GaugeMetric(
    'dev/proc/sysmon_count',
    description='Number of sysmon processes currently running.')

# Load average; reported separately for each averaging window (field
# 'minutes').
_load_average_metric = ts_mon.FloatMetric(
    'dev/proc/load_average',
    description='Number of processes currently in the system run queue.')
Allen Liec5beb32016-09-08 15:31:41 -0700115
# The ts_mon pipeline timestamps metric points using backend clocks.
# Comparing a point's timestamp with its value (the time according to this
# machine's local clock) can reveal anomalies such as clock drift, an
# unusually high metrics pipeline delay, or a completely wrong clock.
#
# It is important to gather this metric right before the flush.
_unix_time_metric = ts_mon.GaugeMetric(
    'dev/unix_time',
    description=('Number of milliseconds since epoch based on local '
                 'machine clock.'))
Allen Liec5beb32016-09-08 15:31:41 -0700126
# String metrics describing the operating system and the Python runtime.
_os_name_metric = ts_mon.StringMetric(
    'proc/os/name', description='OS name on the machine')

_os_version_metric = ts_mon.StringMetric(
    'proc/os/version', description='OS version on the machine')

_os_arch_metric = ts_mon.StringMetric(
    'proc/os/arch', description='OS architecture on this machine')

_python_arch_metric = ts_mon.StringMetric(
    'proc/python/arch',
    description='python userland architecture on this machine')
Allen Liec5beb32016-09-08 15:31:41 -0700143
144
def get_uptime():
  """Report the whole seconds elapsed since BOOT_TIME as the uptime gauge."""
  seconds_since_boot = time.time() - BOOT_TIME
  _uptime_metric.set(int(seconds_since_boot))
Allen Liec5beb32016-09-08 15:31:41 -0700147
148
def get_cpu_info():
  """Report the CPU count and the user/system/idle CPU time percentages."""
  _cpu_count_metric.set(psutil.cpu_count())

  percentages = psutil.cpu_times_percent()
  # One data point per CPU mode, distinguished by the 'mode' field.
  for state in ('user', 'system', 'idle'):
    share = getattr(percentages, state)
    _cpu_time_metric.set(share, {'mode': state})
Allen Liec5beb32016-09-08 15:31:41 -0700155
156
def get_disk_info(mountpoints=None):
  """Report disk usage, inode and disk I/O metrics.

  Args:
    mountpoints: Iterable of mount point paths to report on.  When None,
        the mount point of every partition returned by
        psutil.disk_partitions() is used.
  """
  if mountpoints is None:
    mountpoints = [disk.mountpoint for disk in psutil.disk_partitions()]
  for mountpoint in mountpoints:
    # _get_disk_info_single() reports the inode counts itself, guarded so
    # that it only happens on POSIX and only when the mount point could be
    # queried.  Calling _get_fs_inode_info() here as well would report every
    # value twice and bypass both guards.
    _get_disk_info_single(mountpoint)
  # I/O counters are per disk rather than per mount point: collect them once.
  _get_disk_io_info()
Allen Liec5beb32016-09-08 15:31:41 -0700164
Allen Liec5beb32016-09-08 15:31:41 -0700165
def _get_disk_info_single(mountpoint):
  """Report free/total bytes (and, on POSIX, inodes) for one mount point.

  Args:
    mountpoint: Path of the mount point to query.

  Raises:
    OSError: If querying disk usage fails for any reason other than ENOENT.
  """
  try:
    usage = psutil.disk_usage(mountpoint)
  except OSError as ex:
    if ex.errno != errno.ENOENT:
      raise
    # This happens on Windows when querying a removable drive that
    # doesn't have any media inserted right now.
    return

  labels = {'path': mountpoint}
  _disk_free_metric.set(usage.free, fields=labels)
  _disk_total_metric.set(usage.total, fields=labels)

  # inode counts are only available on Unix.
  if os.name == 'posix':
    _get_fs_inode_info(mountpoint)
185
186
def _get_fs_inode_info(mountpoint):
  """Report the free and total inode counts of one mount point.

  Args:
    mountpoint: Path of the mount point to pass to os.statvfs().
  """
  vfs = os.statvfs(mountpoint)
  labels = {'path': mountpoint}
  _inodes_free_metric.set(vfs.f_favail, fields=labels)
  _inodes_total_metric.set(vfs.f_files, fields=labels)
192
193
def _get_disk_io_info():
  """Report cumulative bytes read and written for every disk.

  Raises:
    RuntimeError: If psutil fails for a reason other than being unable to
        find any physical disk.
  """
  try:
    per_disk = psutil.disk_io_counters(perdisk=True)
  except RuntimeError as ex:
    if "couldn't find any physical disk" not in str(ex):
      raise
    # Disk performance counters aren't enabled on Windows.
    return

  if not per_disk:
    # Nothing to report (no disks were found).
    return

  # items() rather than iteritems(): the latter exists only on Python 2.
  for disk, counters in per_disk.items():
    fields = {'disk': disk}
    _disk_read_metric.set(counters.read_bytes, fields=fields)
    _disk_write_metric.set(counters.write_bytes, fields=fields)
Allen Liec5beb32016-09-08 15:31:41 -0700208
209
def get_mem_info():
  """Report the available and total memory gauges."""
  # mem.used is deliberately not reported: because of virtual memory it is
  # not a useful figure.
  memory = psutil.virtual_memory()
  _mem_free_metric.set(memory.available)
  _mem_total_metric.set(memory.total)
Allen Liec5beb32016-09-08 15:31:41 -0700216
217
def get_net_info():
  """Report byte, error and drop counters for every network interface.

  Interfaces whose name starts with 'veth' are skipped.  A counter that went
  backwards is logged as a warning instead of aborting the collection.
  """
  # (metric, name of the matching attribute on psutil's per-NIC counters).
  metric_counter_names = (
      (_net_up_metric, 'bytes_sent'),
      (_net_down_metric, 'bytes_recv'),
      (_net_err_up_metric, 'errout'),
      (_net_err_down_metric, 'errin'),
      (_net_drop_up_metric, 'dropout'),
      (_net_drop_down_metric, 'dropin'),
  )

  nics = psutil.net_io_counters(pernic=True)
  # items() rather than iteritems(): the latter exists only on Python 2.
  for nic, counters in nics.items():
    # TODO(ayatane): Use a different way of identifying virtual interfaces
    if nic.startswith('veth'):
      # Skip virtual interfaces
      continue
    fields = {'interface': nic}
    for metric, counter_name in metric_counter_names:
      try:
        metric.set(getattr(counters, counter_name), fields=fields)
      except ts_mon.MonitoringDecreasingValueError as ex:
        # This normally shouldn't happen, but might if the network
        # driver module is reloaded, so log an error and continue
        # instead of raising an exception.
        logging.warning(str(ex))
Allen Liec5beb32016-09-08 15:31:41 -0700243
244
def get_proc_info():
  """Report the total, parent-autoserv and sysmon process counts.

  Every process yielded by psutil.process_iter() counts towards the total.
  A process that exits, or whose details cannot be read, while it is being
  classified still counts towards the total but towards neither the autoserv
  nor the sysmon count.
  """
  autoserv_count = 0
  sysmon_count = 0
  total = 0
  for proc in psutil.process_iter():
    total += 1
    try:
      if _is_parent_autoserv(proc):
        autoserv_count += 1
      elif _is_sysmon(proc):
        sysmon_count += 1
    except (psutil.NoSuchProcess, psutil.AccessDenied):
      # The process list is only a snapshot: a process may be gone (or be
      # unreadable) by the time its name/cmdline/parent is queried.  Skip
      # classifying it rather than abort the whole collection.
      continue
  logging.debug('autoserv_count: %s', autoserv_count)
  logging.debug('sysmon_count: %s', sysmon_count)
  _autoserv_proc_count_metric.set(autoserv_count)
  _sysmon_proc_count_metric.set(sysmon_count)
  _proc_count_metric.set(total)
260
261
def _is_parent_autoserv(proc):
  """Return whether proc is a parent (not forked) autoserv process.

  Args:
    proc: Process object providing name() and parent().

  Returns:
    True if proc is an autoserv process and its parent is not one (or it has
    no parent at all); False otherwise.
  """
  if not _is_autoserv(proc):
    return False
  parent = proc.parent()
  # parent() yields None when the process has no (known) parent; such an
  # autoserv process cannot be a fork of another autoserv.
  return parent is None or not _is_autoserv(parent)


def _is_autoserv(proc):
  """Return whether proc is an autoserv process.

  Args:
    proc: Process object providing name().
  """
  # This relies on the autoserv script being run directly. The script should
  # be named autoserv exactly and start with a shebang that is /usr/bin/python,
  # NOT /bin/env
  return proc.name() == 'autoserv'
Allen Liec5beb32016-09-08 15:31:41 -0700273
Allen Lia6b02252016-10-26 14:40:51 -0700274
def _is_sysmon(proc):
  """Return whether proc is a sysmon process.

  Args:
    proc: Process object providing cmdline().

  Returns:
    True if the command line starts with exactly the arguments used to run
    the sysmon module; False otherwise.
  """
  expected_prefix = ['python', '-m', 'chromite.scripts.sysmon']
  argv = proc.cmdline()
  return argv[:len(expected_prefix)] == expected_prefix
Allen Li0937a522016-11-23 13:34:48 -0800278
279
def get_load_avg():
  """Report the 1, 5 and 15 minute load averages.

  Nothing is reported when the load average cannot be obtained.
  """
  try:
    avg1, avg5, avg15 = os.getloadavg()
  except (OSError, AttributeError):
    # OSError: the load average was unobtainable.  AttributeError:
    # os.getloadavg() is only provided on Unix, so it is missing elsewhere.
    return

  # One data point per averaging window, distinguished by 'minutes'.
  for minutes, value in ((1, avg1), (5, avg5), (15, avg15)):
    _load_average_metric.set(value, fields={'minutes': minutes})
Allen Liec5beb32016-09-08 15:31:41 -0700289
290
def get_unix_time():
  """Report the local clock as whole milliseconds since the epoch."""
  now_ms = time.time() * 1000
  _unix_time_metric.set(int(now_ms))