blob: d978fdbacc53bb697fa6cf9c00236ed25b9e8c36 [file] [log] [blame]
Congbin Guo3afae6c2019-08-13 16:29:42 -07001# -*- coding: utf-8 -*-
2# Copyright 2019 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5"""A cherrypy application to check devserver health status."""
6
7from __future__ import absolute_import
8from __future__ import division
9from __future__ import print_function
10
11import json
12import os
13import subprocess
14import threading
15import time
16
Amin Hassanid4e35392019-10-03 11:02:44 -070017import cherrypy # pylint: disable=import-error
Congbin Guo3afae6c2019-08-13 16:29:42 -070018
19import cros_update_progress
Congbin Guo3afae6c2019-08-13 16:29:42 -070020
Achuith Bhandarkar662fb722019-10-31 16:12:49 -070021import setup_chromite # pylint: disable=unused-import
22from chromite.lib.xbuddy import cherrypy_log_util
Congbin Guo3afae6c2019-08-13 16:29:42 -070023
try:
  import psutil
except (ImportError, OSError):
  # psutil is optional; both failure modes below leave it disabled.
  #
  # ImportError: ignored for backwards compatibility, so "cros flash" can
  # still update DUTs with builds that do not have psutil installed. During
  # cros flash the local devserver code is copied over to the DUT, and the
  # devserver runs inside the DUT to stage the build.
  #
  # OSError: psutil may not work properly in a builder, failing with errors
  # like the following. The error is ignored because load information of the
  # devserver is not used in a builder.
  #   OSError: [Errno 2] No such file or directory: '/dev/pts/0'
  psutil = None
37
38
def _Log(message, *args):
  """Log a message for this module under the 'HEALTHCHECKER' tag.

  Args:
    message: The message handed to cherrypy_log_util.LogWithTag.
    *args: Extra positional arguments forwarded unchanged to LogWithTag.

  Returns:
    Whatever cherrypy_log_util.LogWithTag returns.
  """
  tag = 'HEALTHCHECKER'
  return cherrypy_log_util.LogWithTag(tag, message, *args)
42
# Number of seconds between the collection of disk and network IO counters.
STATS_INTERVAL = 10.0
# Number of bytes in one (decimal) gigabyte.
_1G = 10 ** 9
46
47
def require_psutil():
  """Build a decorator for functions that require psutil to run.

  Returns:
    A decorator. The function it produces calls the wrapped function and
    returns its result when the module-level `psutil` is truthy; otherwise it
    logs that the call was skipped and returns None.
  """
  # Imported locally so the decorator does not rely on a module-level import.
  import functools

  def deco_require_psutil(func):
    """Wrap func so that it only runs when psutil is available.

    Args:
      func: function to be called.

    Returns:
      The wrapping function, which keeps func's name and docstring.
    """
    # functools.wraps keeps func.__name__ / __doc__ on the wrapper, so
    # decorated functions stay identifiable in logs and tracebacks.
    @functools.wraps(func)
    def func_require_psutil(*args, **kwargs):
      """Call func if psutil is installed, otherwise skip the call.

      Args:
        *args: arguments for function to be called.
        **kwargs: keyword arguments for function to be called.

      Returns:
        The return value of func, or None when the call is skipped.
      """
      if not psutil:
        _Log('Python module psutil is not installed. Function call %s is '
             'skipped.' % func)
        return None
      return func(*args, **kwargs)
    return func_require_psutil
  return deco_require_psutil
72
73
def _get_process_count(process_cmd_pattern):
  """Get the count of processes that match the given command pattern.

  Runs `pgrep -fc <pattern>` and parses the number it prints on stdout.

  Args:
    process_cmd_pattern: The regex pattern of process command to match.

  Returns:
    The count of processes that match the given command pattern, or 0 when
    pgrep cannot be run or its output is not a number.
  """
  try:
    # Use Popen instead of check_output since the latter cannot run with old
    # python version (less than 2.7)
    proc = subprocess.Popen(
        ['pgrep', '-fc', process_cmd_pattern],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    cmd_output, cmd_error = proc.communicate()
  except (OSError, subprocess.CalledProcessError) as e:
    # Popen raises OSError (not CalledProcessError) when pgrep is missing or
    # cannot be executed. CalledProcessError stays in the tuple so anything
    # the original handler would have caught is still caught.
    _Log('Failed to run pgrep to get process count: %s' % e)
    return 0

  if cmd_error:
    _Log('Error happened when getting process count: %s' % cmd_error)

  try:
    return int(cmd_output)
  except ValueError:
    # pgrep printed nothing parseable on stdout (any stderr text was already
    # logged above), so report no matching processes instead of raising.
    return 0
98
99
def get_config():
  """Return the cherrypy config dictionary for this application.

  Returns:
    A dict keyed by path; the root path '/' carries the tool settings.
  """
  root_settings = {
      # Turn cherrypy's trailing_slash tool off, so a trailing slash is not
      # added automatically (/check_health is left as is rather than being
      # turned into /check_health/).
      'tools.trailing_slash.on': False,
  }
  return {'/': root_settings}
109
110
class Root(object):
  """Cherrypy Root class of the application.

  Reports the devserver health status as JSON. When psutil is available, a
  daemon thread also keeps cached disk and network IO rates up to date.
  """

  def __init__(self, devserver, static_dir):
    """Store the collaborators and start collecting IO stats.

    Args:
      devserver: Object whose staging_thread_count attribute is reported by
        index().
      static_dir: Path handed to os.statvfs() to measure free disk space.
    """
    self._static_dir = static_dir
    self._devserver = devserver

    # Cache of disk IO stats, a thread refreshes the stats every
    # STATS_INTERVAL seconds. No lock is used for these variables as the only
    # thread that writes to them is _refresh_io_stats.
    self.disk_read_bytes_per_sec = 0
    self.disk_write_bytes_per_sec = 0
    # Cache of network IO stats.
    self.network_sent_bytes_per_sec = 0
    self.network_recv_bytes_per_sec = 0
    self._start_io_stat_thread()

  @require_psutil()
  def _get_io_stats(self):
    """Get the IO stats as a dictionary.

    Returns:
      A dictionary holding the cached disk and network byte rates, their
      totals, and the CPU percentage read from psutil at call time.
    """
    return {'disk_read_bytes_per_second': self.disk_read_bytes_per_sec,
            'disk_write_bytes_per_second': self.disk_write_bytes_per_sec,
            'disk_total_bytes_per_second': (self.disk_read_bytes_per_sec +
                                            self.disk_write_bytes_per_sec),
            'network_sent_bytes_per_second': self.network_sent_bytes_per_sec,
            'network_recv_bytes_per_second': self.network_recv_bytes_per_sec,
            'network_total_bytes_per_second': (self.network_sent_bytes_per_sec +
                                               self.network_recv_bytes_per_sec),
            'cpu_percent': psutil.cpu_percent(), }

  @require_psutil()
  def _refresh_io_stats(self):
    """A call running in a thread to update IO stats periodically.

    Loops forever: sleeps STATS_INTERVAL seconds, reads the psutil counters
    and stores the per-second deltas since the previous reading.
    """
    prev_disk_io_counters = psutil.disk_io_counters()
    prev_network_io_counters = psutil.net_io_counters()
    prev_read_time = time.time()
    while True:
      time.sleep(STATS_INTERVAL)
      # Divide by the measured elapsed time rather than STATS_INTERVAL, since
      # the sleep may take longer than requested.
      now = time.time()
      interval = now - prev_read_time
      prev_read_time = now
      # Disk IO is for all disks.
      disk_io_counters = psutil.disk_io_counters()
      network_io_counters = psutil.net_io_counters()

      self.disk_read_bytes_per_sec = (
          disk_io_counters.read_bytes -
          prev_disk_io_counters.read_bytes) / interval
      self.disk_write_bytes_per_sec = (
          disk_io_counters.write_bytes -
          prev_disk_io_counters.write_bytes) / interval
      prev_disk_io_counters = disk_io_counters

      self.network_sent_bytes_per_sec = (
          network_io_counters.bytes_sent -
          prev_network_io_counters.bytes_sent) / interval
      self.network_recv_bytes_per_sec = (
          network_io_counters.bytes_recv -
          prev_network_io_counters.bytes_recv) / interval
      prev_network_io_counters = network_io_counters

  @require_psutil()
  def _start_io_stat_thread(self):
    """Start the thread to collect IO stats."""
    thread = threading.Thread(target=self._refresh_io_stats)
    # Daemon thread: the endless refresh loop must not keep the process alive.
    thread.daemon = True
    thread.start()

  @cherrypy.expose
  def index(self):
    """Collect the health status of devserver to see if it's ready for staging.

    Returns:
      A JSON dictionary containing all or some of the following fields:
      free_disk (float): free disk space in GB
      staging_thread_count (int): number of devserver threads currently staging
                                  an image
      apache_client_count (int): count of Apache processes.
      telemetry_test_count (int): count of telemetry tests.
      gsutil_count (int): count of gsutil processes.
      au_process_count (int): number of entries returned by
                              cros_update_progress.GetAllRunningAUProcess().
      The fields of _get_io_stats() are merged in when it returns a value.
    """
    # Get free disk space.
    stat = os.statvfs(self._static_dir)
    # f_bavail is counted in units of the fragment size (f_frsize), not the
    # preferred IO block size (f_bsize). Fall back to f_bsize in case the
    # filesystem reports a fragment size of 0.
    block_size = stat.f_frsize or stat.f_bsize
    free_disk = block_size * stat.f_bavail / _1G
    apache_client_count = _get_process_count('bin/apache2? -k start')
    telemetry_test_count = _get_process_count('python.*telemetry')
    gsutil_count = _get_process_count('gsutil')
    au_process_count = len(cros_update_progress.GetAllRunningAUProcess())

    health_data = {
        'free_disk': free_disk,
        'staging_thread_count': self._devserver.staging_thread_count,
        'apache_client_count': apache_client_count,
        'telemetry_test_count': telemetry_test_count,
        'gsutil_count': gsutil_count,
        'au_process_count': au_process_count,
    }
    # _get_io_stats() yields None when it is skipped, hence the `or {}`.
    health_data.update(self._get_io_stats() or {})

    return json.dumps(health_data)