blob: ff8f1a5373ed4a2379709d819256bded9d81f96a [file] [log] [blame]
Allen Liec5beb32016-09-08 15:31:41 -07001# Copyright 2016 The Chromium OS Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5# Copyright (c) 2015 The Chromium Authors. All rights reserved.
6# Use of this source code is governed by a BSD-style license that can be
7# found in the LICENSE file.
8
9"""System metrics."""
10
11from __future__ import print_function
12
Allen Lia6b02252016-10-26 14:40:51 -070013import collections
Allen Liec5beb32016-09-08 15:31:41 -070014import errno
15import os
16import platform
17import sys
18import time
19
20import psutil
21
22from chromite.lib import cros_logging as logging
23from infra_libs import ts_mon
24
25
# CPU metrics: number of cores and per-mode CPU time percentages.
_cpu_count_metric = ts_mon.GaugeMetric(
    'dev/cpu/count',
    description='Number of CPU cores.',
)
_cpu_time_metric = ts_mon.FloatMetric(
    'dev/cpu/time',
    description=('percentage of time spent by the CPU in different '
                 'states.'),
)
Allen Liec5beb32016-09-08 15:31:41 -070033
# Per-partition capacity gauges, keyed by a 'path' field at set() time.
_disk_free_metric = ts_mon.GaugeMetric(
    'dev/disk/free',
    description='Available bytes on disk partition.',
    units=ts_mon.MetricsDataUnits.BYTES,
)
_disk_total_metric = ts_mon.GaugeMetric(
    'dev/disk/total',
    description='Total bytes on disk partition.',
    units=ts_mon.MetricsDataUnits.BYTES,
)

# Per-partition inode gauges (only reported on Unix).
_inodes_free_metric = ts_mon.GaugeMetric(
    'dev/inodes/free',
    description=('Number of available inodes on disk partition '
                 '(unix only).'),
)
_inodes_total_metric = ts_mon.GaugeMetric(
    'dev/inodes/total',
    description=('Number of possible inodes on disk partition '
                 '(unix only)'),
)
Allen Liec5beb32016-09-08 15:31:41 -070051
# Memory gauges.
_mem_free_metric = ts_mon.GaugeMetric(
    'dev/mem/free',
    description=('Amount of memory available to a process (in Bytes). '
                 'Buffers are considered free memory.'),
    units=ts_mon.MetricsDataUnits.BYTES,
)

_mem_total_metric = ts_mon.GaugeMetric(
    'dev/mem/total',
    description='Total physical memory in Bytes.',
    units=ts_mon.MetricsDataUnits.BYTES,
)
Allen Liec5beb32016-09-08 15:31:41 -070063
# Boot time as reported by psutil; used as the start time of every
# cumulative counter below and as the reference point for uptime.
BOOT_TIME = psutil.boot_time()

# Per-interface network counters, keyed by an 'interface' field at set() time.
_net_up_metric = ts_mon.CounterMetric(
    'dev/net/bytes/up',
    start_time=BOOT_TIME,
    description='Number of bytes sent on interface.',
    units=ts_mon.MetricsDataUnits.BYTES,
)
_net_down_metric = ts_mon.CounterMetric(
    'dev/net/bytes/down',
    start_time=BOOT_TIME,
    description='Number of Bytes received on interface.',
    units=ts_mon.MetricsDataUnits.BYTES,
)
_net_err_up_metric = ts_mon.CounterMetric(
    'dev/net/err/up',
    start_time=BOOT_TIME,
    description='Total number of errors when sending (per interface).',
)
_net_err_down_metric = ts_mon.CounterMetric(
    'dev/net/err/down',
    start_time=BOOT_TIME,
    description='Total number of errors when receiving (per interface).',
)
_net_drop_up_metric = ts_mon.CounterMetric(
    'dev/net/drop/up',
    start_time=BOOT_TIME,
    description='Total number of outgoing packets that have been dropped.',
)
_net_drop_down_metric = ts_mon.CounterMetric(
    'dev/net/drop/down',
    start_time=BOOT_TIME,
    description='Total number of incoming packets that have been dropped.',
)
90
# Per-disk I/O counters, keyed by a 'disk' field at set() time.
_disk_read_metric = ts_mon.CounterMetric(
    'dev/disk/read',
    start_time=BOOT_TIME,
    description='Number of Bytes read on disk.',
    units=ts_mon.MetricsDataUnits.BYTES,
)
_disk_write_metric = ts_mon.CounterMetric(
    'dev/disk/write',
    start_time=BOOT_TIME,
    description='Number of Bytes written on disk.',
    units=ts_mon.MetricsDataUnits.BYTES,
)
Allen Liec5beb32016-09-08 15:31:41 -070099
# Machine uptime gauge.
_uptime_metric = ts_mon.GaugeMetric(
    'dev/uptime',
    description='Machine uptime, in seconds.',
    units=ts_mon.MetricsDataUnits.SECONDS,
)

# Process-count gauges and system load average.
_proc_count_metric = ts_mon.GaugeMetric(
    'dev/proc/count',
    description='Number of processes currently running.',
)
_autoserv_proc_count_metric = ts_mon.GaugeMetric(
    'dev/proc/autoserv_count',
    description='Number of autoserv processes currently running.',
)
_sysmon_proc_count_metric = ts_mon.GaugeMetric(
    'dev/proc/sysmon_count',
    description='Number of sysmon processes currently running.',
)
_load_average_metric = ts_mon.FloatMetric(
    'dev/proc/load_average',
    description='Number of processes currently in the system run queue.',
)
Allen Liec5beb32016-09-08 15:31:41 -0700118
# The ts_mon pipeline assigns timestamps to metric points using backend
# clocks. Comparing a point's timestamp with its value (the time according
# to this machine's local clock) can reveal anomalies such as clock drift,
# unusually high metrics pipeline delay, or completely wrong clocks.
#
# It is important to gather this metric right before the flush.
_unix_time_metric = ts_mon.GaugeMetric(
    'dev/unix_time',
    description=('Number of milliseconds since epoch based on local '
                 'machine clock.'),
)
Allen Liec5beb32016-09-08 15:31:41 -0700129
# String metrics describing the operating system and the Python runtime.
_os_name_metric = ts_mon.StringMetric(
    'proc/os/name',
    description='OS name on the machine',
)

_os_version_metric = ts_mon.StringMetric(
    'proc/os/version',
    description='OS version on the machine',
)

_os_arch_metric = ts_mon.StringMetric(
    'proc/os/arch',
    description='OS architecture on this machine',
)

_python_arch_metric = ts_mon.StringMetric(
    'proc/python/arch',
    description='python userland architecture on this machine',
)
Allen Liec5beb32016-09-08 15:31:41 -0700146
147
def get_uptime():
    """Report whole seconds elapsed since BOOT_TIME as the uptime metric."""
    seconds_since_boot = time.time() - BOOT_TIME
    _uptime_metric.set(int(seconds_since_boot))
Allen Liec5beb32016-09-08 15:31:41 -0700150
151
def get_cpu_info():
    """Report the CPU core count and user/system/idle time percentages."""
    core_count = psutil.cpu_count()
    _cpu_count_metric.set(core_count)

    percentages = psutil.cpu_times_percent()
    # One point per CPU mode, distinguished by the 'mode' field.
    for mode in ('user', 'system', 'idle'):
        mode_fields = {'mode': mode}
        _cpu_time_metric.set(getattr(percentages, mode), mode_fields)
Allen Liec5beb32016-09-08 15:31:41 -0700158
159
def get_disk_info(mountpoints=None):
    """Report disk capacity, inode and I/O metrics.

    Args:
        mountpoints: Iterable of mount point paths to report on. When None,
            every partition returned by psutil.disk_partitions() is used.
    """
    if mountpoints is None:
        mountpoints = [disk.mountpoint for disk in psutil.disk_partitions()]
    for mountpoint in mountpoints:
        # _get_disk_info_single() also gathers inode statistics, guarded by
        # an os.name == 'posix' check. Calling _get_fs_inode_info() here as
        # well would collect them twice on POSIX and would reach for
        # os.statvfs() on platforms that do not provide it.
        _get_disk_info_single(mountpoint)
    # I/O counters are per physical disk, not per mount point, so they are
    # gathered once after the partition loop.
    _get_disk_io_info()
Allen Liec5beb32016-09-08 15:31:41 -0700167
Allen Liec5beb32016-09-08 15:31:41 -0700168
def _get_disk_info_single(mountpoint):
    """Report free/total bytes (and inode counts on POSIX) for a mount point.

    Args:
        mountpoint: Path of the mounted partition to inspect.

    Raises:
        OSError: If psutil.disk_usage() fails for any reason other than
            ENOENT.
    """
    fields = {'path': mountpoint}

    try:
        usage = psutil.disk_usage(mountpoint)
    except OSError as ex:
        # ENOENT happens on Windows when querying a removable drive that
        # doesn't have any media inserted right now; anything else is a
        # real error.
        if ex.errno != errno.ENOENT:
            raise
        usage = None

    if usage is not None:
        _disk_free_metric.set(usage.free, fields=fields)
        _disk_total_metric.set(usage.total, fields=fields)

    # inode counts are only available on Unix.
    if os.name == 'posix':
        _get_fs_inode_info(mountpoint)
188
189
def _get_fs_inode_info(mountpoint):
    """Report free and total inode counts for a mount point.

    Args:
        mountpoint: Path of the mounted filesystem to query with
            os.statvfs().
    """
    vfs = os.statvfs(mountpoint)
    path_fields = {'path': mountpoint}
    _inodes_free_metric.set(vfs.f_favail, fields=path_fields)
    _inodes_total_metric.set(vfs.f_files, fields=path_fields)
195
196
def _get_disk_io_info():
    """Report cumulative bytes read and written for every disk.

    Raises:
        RuntimeError: If psutil fails for a reason other than being unable
            to find any physical disk.
    """
    try:
        per_disk = psutil.disk_io_counters(perdisk=True)
    except RuntimeError as ex:
        # Disk performance counters aren't enabled on Windows; anything
        # else is unexpected and is propagated.
        if "couldn't find any physical disk" not in str(ex):
            raise
        return

    # dict.items() exists on both Python 2 and Python 3, whereas
    # iteritems() is Python 2 only.
    for disk, counters in per_disk.items():
        fields = {'disk': disk}
        _disk_read_metric.set(counters.read_bytes, fields=fields)
        _disk_write_metric.set(counters.write_bytes, fields=fields)
Allen Liec5beb32016-09-08 15:31:41 -0700211
212
def get_mem_info():
    """Report available and total memory from psutil.virtual_memory()."""
    # mem.used is deliberately not reported: because of virtual memory it
    # is not a useful number.
    memory = psutil.virtual_memory()
    _mem_total_metric.set(memory.total)
    _mem_free_metric.set(memory.available)
Allen Liec5beb32016-09-08 15:31:41 -0700219
220
def get_net_info():
    """Report per-interface byte, error and drop counters.

    Interfaces whose name starts with 'veth' are skipped. A counter that
    goes backwards is logged as a warning and otherwise ignored.
    """
    # Pairs of (metric to update, attribute name on the psutil counters).
    metric_counter_names = [
        (_net_up_metric, 'bytes_sent'),
        (_net_down_metric, 'bytes_recv'),
        (_net_err_up_metric, 'errout'),
        (_net_err_down_metric, 'errin'),
        (_net_drop_up_metric, 'dropout'),
        (_net_drop_down_metric, 'dropin'),
    ]

    nics = psutil.net_io_counters(pernic=True)
    # dict.items() exists on both Python 2 and Python 3, whereas
    # iteritems() is Python 2 only.
    for nic, counters in nics.items():
        # TODO(ayatane): Use a different way of identifying virtual interfaces
        if nic.startswith('veth'):
            # Skip virtual interfaces
            continue
        fields = {'interface': nic}
        for metric, counter_name in metric_counter_names:
            try:
                metric.set(getattr(counters, counter_name), fields=fields)
            except ts_mon.MonitoringDecreasingValueError as ex:
                # This normally shouldn't happen, but might if the network
                # driver module is reloaded, so log a warning and continue
                # instead of raising an exception.
                logging.warning(str(ex))
Allen Liec5beb32016-09-08 15:31:41 -0700246
247
def get_os_info():
    """Report OS name/version, machine architecture and Python bitness."""
    name, version = _get_os_info()
    _os_name_metric.set(name)
    _os_version_metric.set(version)
    _os_arch_metric.set(platform.machine())
    _python_arch_metric.set(_get_python_arch())
Allen Liec5beb32016-09-08 15:31:41 -0700254
Allen Liec5beb32016-09-08 15:31:41 -0700255
# Lightweight (name, version) record returned by _get_os_info().
OSInfo = collections.namedtuple('OSInfo', ['name', 'version'])
257
258
def _get_os_info():
    """Get OS name and version.

    Returns:
        OSInfo instance with lower-cased name and version. Both are empty
        strings when the platform is not Windows, Linux or a recognisable
        Mac.
    """
    system = platform.system().lower()
    if 'windows' in system:
        # release will be something like '7', 'vista', or 'xp'
        name, version = 'windows', platform.release()
    elif 'linux' in system:
        # will return something like ('Ubuntu', '14.04', 'trusty')
        # NOTE(review): platform.dist() no longer exists in Python 3.8+.
        name, version, _ = platform.dist()
    else:
        # On mac platform.system() reports 'darwin'; a non-empty mac
        # version is what identifies a valid mac.
        version = _get_mac_version()
        if version:
            name = 'mac'
        else:
            # not a mac, unable to find platform information, reset
            name, version = '', ''

    return OSInfo(name=name.lower(), version=version.lower())
Allen Liec5beb32016-09-08 15:31:41 -0700288
def _get_mac_version():
    """Get Mac system version.

    Returns:
        Version string, which is empty if not a valid Mac system.
    """
    # platform.mac_ver() is only populated on mac systems: its first
    # element is '10.11.5' or similar on a valid mac and '' elsewhere.
    return platform.mac_ver()[0]
300
301
def _get_python_arch():
    """Return '64' or '32' depending on whether sys.maxsize exceeds 2**32."""
    return '64' if sys.maxsize > 2**32 else '32'
Allen Liec5beb32016-09-08 15:31:41 -0700307
308
def get_proc_info():
    """Report the total, parent-autoserv and sysmon process counts.

    Processes may exit or be inaccessible while they are being inspected.
    Such processes no longer abort the whole collection: a process that has
    gone away is left out of every count, and one that cannot be inspected
    is counted only toward the total.
    """
    autoserv_count = 0
    sysmon_count = 0
    total = 0
    for proc in psutil.process_iter():
        try:
            if _is_parent_autoserv(proc):
                autoserv_count += 1
            elif _is_sysmon(proc):
                sysmon_count += 1
        except psutil.NoSuchProcess:
            # The process exited after being enumerated, so it is not
            # running any more.
            continue
        except psutil.AccessDenied:
            # It cannot be classified, but it is still a running process.
            pass
        total += 1
    logging.debug('autoserv_count: %s', autoserv_count)
    _autoserv_proc_count_metric.set(autoserv_count)
    _sysmon_proc_count_metric.set(sysmon_count)
    _proc_count_metric.set(total)
323
324
def _is_parent_autoserv(proc):
    """Return whether proc is a parent (not forked) autoserv process.

    Args:
        proc: Process object exposing name() and parent().

    Returns:
        True if proc is an autoserv process whose parent is either unknown
        or not itself an autoserv process.
    """
    if not _is_autoserv(proc):
        return False
    parent = proc.parent()
    # parent() can return None when no parent is known; with no autoserv
    # parent, proc counts as a top-level autoserv.
    return parent is None or not _is_autoserv(parent)
328
329
def _is_autoserv(proc):
    """Return whether proc is an autoserv process."""
    # Matching on the process name only works when the autoserv script is
    # run directly: it should be named exactly "autoserv" and start with a
    # shebang that is /usr/bin/python, NOT /bin/env.
    process_name = proc.name()
    return process_name == 'autoserv'
Allen Liec5beb32016-09-08 15:31:41 -0700336
Allen Lia6b02252016-10-26 14:40:51 -0700337
def _is_sysmon(proc):
    """Return whether proc is a sysmon process."""
    # This check is fragile because of the virtualenv bootstrap of sysmon.
    # When sysmon is invoked through Upstart the process tree is:
    #
    #   init -> sudo -> python2 -> python
    #
    # and when it is started without Upstart:
    #
    #   init -> (shell) -> python2 -> python
    #
    # The extra python2 comes from the virtualenv wrapper script, which
    # should exec instead of wasting a process. That the wrapper is named
    # "python2" while the real process is named "python" is basically luck.
    #
    # TODO(ayatane): Once the chromite virtualenv wrapper uses exec, clean
    # this up.
    if proc.name() != 'python':
        return False
    # 'sysmon' contains no space, so looking at each argument separately is
    # the same as searching the space-joined command line.
    return any('sysmon' in arg for arg in proc.cmdline())
356
357
def get_load_avg():
    """Report the 1, 5 and 15 minute load averages.

    Nothing is reported when os.getloadavg() raises OSError.
    """
    try:
        load_averages = os.getloadavg()
    except OSError:
        return

    # os.getloadavg() yields the averages in 1, 5, 15 minute order.
    for minutes, average in zip((1, 5, 15), load_averages):
        _load_average_metric.set(average, fields={'minutes': minutes})
Allen Liec5beb32016-09-08 15:31:41 -0700367
368
def get_unix_time():
    """Report the local clock as whole milliseconds since the epoch."""
    now_ms = time.time() * 1000
    _unix_time_metric.set(int(now_ms))