blob: e62e32c43c01e49d1390c0ea2188a7e5bce5d44b [file] [log] [blame]
# Copyright 2016 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

# Copyright (c) 2015 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
8
9"""System metrics."""
10
11from __future__ import print_function
12
13import errno
14import os
15import platform
16import sys
17import time
18
19import psutil
20
21from chromite.lib import cros_logging as logging
22from infra_libs import ts_mon
23
24
# CPU metrics: core count and per-mode time percentages.
cpu_count = ts_mon.GaugeMetric(
    'dev/cpu/count',
    description='Number of CPU cores.')
cpu_time = ts_mon.FloatMetric(
    'dev/cpu/time',
    description='percentage of time spent by the CPU'
                ' in different states.')

# Disk capacity metrics, both expressed in bytes.
disk_free = ts_mon.GaugeMetric(
    'dev/disk/free',
    units=ts_mon.MetricsDataUnits.BYTES,
    description='Available bytes on disk partition.')
disk_total = ts_mon.GaugeMetric(
    'dev/disk/total',
    units=ts_mon.MetricsDataUnits.BYTES,
    description='Total bytes on disk partition.')
38
# The inode metrics are declared only on POSIX systems; other platforms
# never define these two names.
if os.name == 'posix':  # pragma: no cover
    inodes_free = ts_mon.GaugeMetric(
        'dev/inodes/free',
        description='Number of available inodes on '
                    'disk partition (unix only).',
    )
    inodes_total = ts_mon.GaugeMetric(
        'dev/inodes/total',
        description='Number of possible inodes on '
                    'disk partition (unix only)',
    )
49
# Memory metrics, both expressed in bytes.
mem_free = ts_mon.GaugeMetric(
    'dev/mem/free',
    units=ts_mon.MetricsDataUnits.BYTES,
    description='Amount of memory available to a '
                'process (in Bytes). Buffers are considered '
                'free memory.')

mem_total = ts_mon.GaugeMetric(
    'dev/mem/total',
    units=ts_mon.MetricsDataUnits.BYTES,
    description='Total physical memory in Bytes.')
59
# Every cumulative counter below uses the machine boot time as its start
# time.
START_TIME = psutil.boot_time()

# Network traffic counters.
net_up = ts_mon.CounterMetric(
    'dev/net/bytes/up',
    start_time=START_TIME,
    units=ts_mon.MetricsDataUnits.BYTES,
    description='Number of bytes sent on interface.')
net_down = ts_mon.CounterMetric(
    'dev/net/bytes/down',
    start_time=START_TIME,
    units=ts_mon.MetricsDataUnits.BYTES,
    description='Number of Bytes received on '
                'interface.')

# Network error counters.
net_err_up = ts_mon.CounterMetric(
    'dev/net/err/up',
    start_time=START_TIME,
    description='Total number of errors when '
                'sending (per interface).')
net_err_down = ts_mon.CounterMetric(
    'dev/net/err/down',
    start_time=START_TIME,
    description='Total number of errors when '
                'receiving (per interface).')

# Dropped-packet counters.
net_drop_up = ts_mon.CounterMetric(
    'dev/net/drop/up',
    start_time=START_TIME,
    description='Total number of outgoing '
                'packets that have been dropped.')
net_drop_down = ts_mon.CounterMetric(
    'dev/net/drop/down',
    start_time=START_TIME,
    description='Total number of incoming '
                'packets that have been dropped.')

# Disk I/O counters, expressed in bytes.
disk_read = ts_mon.CounterMetric(
    'dev/disk/read',
    start_time=START_TIME,
    units=ts_mon.MetricsDataUnits.BYTES,
    description='Number of Bytes read on disk.')
disk_write = ts_mon.CounterMetric(
    'dev/disk/write',
    start_time=START_TIME,
    units=ts_mon.MetricsDataUnits.BYTES,
    description='Number of Bytes written on '
                'disk.')
89
# Machine uptime, expressed in seconds.
uptime = ts_mon.GaugeMetric(
    'dev/uptime',
    units=ts_mon.MetricsDataUnits.SECONDS,
    description='Machine uptime, in seconds.')

# Process count and system load average.
proc_count = ts_mon.GaugeMetric(
    'dev/proc/count',
    description='Number of processes currently '
                'running.')
load_average = ts_mon.FloatMetric(
    'dev/proc/load_average',
    description='Number of processes currently '
                'in the system run queue.')
100
# The tsmon pipeline stamps each metric point using backend clocks.  The
# value of this metric is the time according to the machine's local clock,
# so comparing a point's timestamp with its value can reveal anomalies such
# as clock drift, an unusually high metrics pipeline delay, or a completely
# wrong clock.
#
# It is important to gather this metric right before the flush.
unix_time = ts_mon.GaugeMetric(
    'dev/unix_time',
    description='Number of milliseconds since epoch '
                'based on local machine clock.',
)
111
# String metrics describing the operating system and the Python build.
os_name = ts_mon.StringMetric(
    'proc/os/name',
    description='OS name on the machine ')

os_version = ts_mon.StringMetric(
    'proc/os/version',
    description='OS version on the machine ')

os_arch = ts_mon.StringMetric(
    'proc/os/arch',
    description='OS architecture on this machine')

python_arch = ts_mon.StringMetric(
    'proc/python/arch',
    description='python userland '
                'architecture on this machine')
124
125
def get_uptime():
    """Set the uptime metric to the whole seconds elapsed since START_TIME."""
    seconds_since_boot = time.time() - START_TIME
    uptime.set(int(seconds_since_boot))
128
129
def get_cpu_info():
    """Report the CPU core count and the user/system/idle time percentages."""
    cpu_count.set(psutil.cpu_count())

    percentages = psutil.cpu_times_percent()
    # One point per CPU mode, distinguished by the 'mode' field.
    cpu_time.set(percentages.user, {'mode': 'user'})
    cpu_time.set(percentages.system, {'mode': 'system'})
    cpu_time.set(percentages.idle, {'mode': 'idle'})
136
137
def get_disk_info(mountpoints=None):
    """Report disk capacity, inode and I/O metrics.

    Args:
        mountpoints: Iterable of mountpoint paths to report on.  When None,
            the mountpoint of every partition returned by
            psutil.disk_partitions() is used.

    Raises:
        OSError: If querying a mountpoint fails with anything other than
            ENOENT.
        RuntimeError: If reading the per-disk I/O counters fails for a reason
            other than psutil finding no physical disk.
    """
    if mountpoints is None:
        mountpoints = [disk.mountpoint for disk in psutil.disk_partitions()]

    for mountpoint in mountpoints:
        fields = {'path': mountpoint}

        try:
            usage = psutil.disk_usage(mountpoint)
        except OSError as ex:
            if ex.errno == errno.ENOENT:
                # This happens on Windows when querying a removable drive that
                # doesn't have any media inserted right now.
                continue
            raise  # pragma: no cover

        disk_free.set(usage.free, fields=fields)
        disk_total.set(usage.total, fields=fields)

        # inode counts are only available on Unix.
        if os.name == 'posix':  # pragma: no cover
            stats = os.statvfs(mountpoint)
            inodes_free.set(stats.f_favail, fields=fields)
            inodes_total.set(stats.f_files, fields=fields)

    try:
        # .items() rather than .iteritems(): the latter exists only on
        # Python 2 dicts, while .items() works on both Python 2 and 3.
        io_counters = psutil.disk_io_counters(perdisk=True)
        for disk, counters in io_counters.items():
            fields = {'disk': disk}
            disk_read.set(counters.read_bytes, fields=fields)
            disk_write.set(counters.write_bytes, fields=fields)
    except RuntimeError as ex:
        # Disk performance counters aren't enabled on Windows; any other
        # RuntimeError is unexpected and is propagated.
        if "couldn't find any physical disk" not in str(ex):
            raise
174
175
def get_mem_info():
    """Report the available and total memory from psutil.virtual_memory()."""
    # mem.used is deliberately not reported: because of virtual memory it
    # is not a useful number.
    memory = psutil.virtual_memory()
    mem_total.set(memory.total)
    mem_free.set(memory.available)
181 mem_total.set(mem.total)
182
183
def get_net_info():
    """Report per-interface byte, error and dropped-packet counters.

    Interfaces whose name starts with 'veth' are skipped.  A counter that
    went backwards is logged as an error instead of aborting the collection.
    """
    # Pairs each metric with the psutil counter attribute that feeds it.
    metric_counter_names = (
        (net_up, 'bytes_sent'),
        (net_down, 'bytes_recv'),
        (net_err_up, 'errout'),
        (net_err_down, 'errin'),
        (net_drop_up, 'dropout'),
        (net_drop_down, 'dropin'),
    )

    nics = psutil.net_io_counters(pernic=True)
    # .items() rather than .iteritems(): the latter exists only on Python 2
    # dicts, while .items() works on both Python 2 and 3.
    for nic, counters in nics.items():
        # TODO(ayatane): Use a different way of identifying virtual interfaces
        if nic.startswith('veth'):
            # Skip virtual interfaces
            continue
        fields = {'interface': nic}
        for metric, counter_name in metric_counter_names:
            try:
                metric.set(getattr(counters, counter_name), fields=fields)
            except ts_mon.MonitoringDecreasingValueError as ex:
                # This normally shouldn't happen, but might if the network
                # driver module is reloaded, so log an error and continue
                # instead of raising an exception.
                logging.error(str(ex))
209
210
def _get_linux_dist_info():
    """Return a (name, version) pair of strings for the Linux distribution.

    platform.dist() is used when the running interpreter still provides it;
    it was removed in Python 3.8.  Otherwise
    platform.freedesktop_os_release() (Python 3.10+) is consulted.  When
    neither source is usable, two empty strings are returned.
    """
    legacy_dist = getattr(platform, 'dist', None)
    if legacy_dist is not None:
        # will return something like ('Ubuntu', '14.04', 'trusty')
        dist_info_data = legacy_dist()
        return dist_info_data[0], dist_info_data[1]

    read_os_release = getattr(platform, 'freedesktop_os_release', None)
    if read_os_release is None:
        return '', ''
    try:
        release_data = read_os_release()
    except OSError:
        # No readable os-release file on this machine.
        return '', ''
    return release_data.get('ID', ''), release_data.get('VERSION_ID', '')


def get_os_info():
    """Report OS name, OS version, machine architecture and Python bitness.

    The OS name and version are lower-cased before being reported.  Both are
    reported as empty strings when the platform is neither Windows, Linux nor
    a Mac with a detectable version.
    """
    os_version_data = ''

    os_name_data = platform.system().lower()
    if 'windows' in os_name_data:
        os_name_data = 'windows'
        # os_release will be something like '7', 'vista', or 'xp'
        os_version_data = platform.release()

    elif 'linux' in os_name_data:
        os_name_data, os_version_data = _get_linux_dist_info()

    # on mac platform.system() reports 'darwin'
    else:
        # this tuple is only populated on mac systems
        mac_ver_data = platform.mac_ver()
        # [0] will be '10.11.5' or similar on a valid mac or will be '' on a
        # non-mac
        os_version_data = mac_ver_data[0]
        if os_version_data:
            # we found a valid mac
            os_name_data = 'mac'
        else:
            # not a mac, unable to find platform information, reset
            os_name_data = ''
            os_version_data = ''

    # normalize to lower case
    os_name_data = os_name_data.lower()
    os_version_data = os_version_data.lower()

    # sys.maxsize exceeds 2**32 only on a 64-bit Python build.
    python_arch_data = '64' if sys.maxsize > 2**32 else '32'

    # construct metrics
    os_name.set(os_name_data)
    os_version.set(os_version_data)
    os_arch.set(platform.machine())
    python_arch.set(python_arch_data)
255
256
def clear_os_info():
    """Reset the four metrics that get_os_info() populates."""
    for os_metric in (os_name, os_version, os_arch, python_arch):
        os_metric.reset()
261 python_arch.reset()
262
263
def get_proc_info():
    """Report the number of process ids and, on POSIX, the load averages."""
    proc_count.set(len(psutil.pids()))

    if os.name == 'posix':  # pragma: no cover
        try:
            averages = os.getloadavg()
        except OSError:  # pragma: no cover
            # The load average could not be obtained; report nothing for it.
            return
        # os.getloadavg() yields the 1, 5 and 15 minute averages, in order.
        for minutes, average in zip((1, 5, 15), averages):
            load_average.set(average, fields={'minutes': minutes})
277
278
def get_unix_time():
    """Set the unix_time metric to the local clock in whole milliseconds."""
    millis_since_epoch = int(time.time() * 1000)
    unix_time.set(millis_since_epoch)