Congbin Guo | 3afae6c | 2019-08-13 16:29:42 -0700 | [diff] [blame] | 1 | # -*- coding: utf-8 -*- |
| 2 | # Copyright 2019 The Chromium OS Authors. All rights reserved. |
| 3 | # Use of this source code is governed by a BSD-style license that can be |
| 4 | # found in the LICENSE file. |
| 5 | """A cherrypy application to check devserver health status.""" |
| 6 | |
| 7 | from __future__ import absolute_import |
| 8 | from __future__ import division |
| 9 | from __future__ import print_function |
| 10 | |
| 11 | import json |
| 12 | import os |
| 13 | import subprocess |
| 14 | import threading |
| 15 | import time |
| 16 | |
| 17 | import cherrypy |
| 18 | |
| 19 | import cros_update_progress |
| 20 | import log_util |
| 21 | |
| 22 | |
def _Log(message, *args):
  """Log a message under this module's 'HEALTHCHECKER' tag.

  Args:
    message: Message text handed to log_util.LogWithTag.
    *args: Extra positional arguments forwarded unchanged after the message.

  Returns:
    Whatever log_util.LogWithTag returns.
  """
  tag = 'HEALTHCHECKER'
  return log_util.LogWithTag(tag, message, *args)
| 26 | |
| 27 | |
# psutil is optional. Start from None and let a successful import rebind the
# name, so the rest of the module can simply test `if psutil:`.
psutil = None
try:
  import psutil
except ImportError:
  # Tolerated for backwards compatibility, so "cros flash" can still update
  # DUTs with a build that has no psutil installed: during cros flash the local
  # devserver code is copied over to the DUT and runs there to stage the build.
  _Log('Python module psutil is not installed, devserver load data will not be '
       'collected')
except OSError as e:
  # psutil may not work properly on a builder, failing with errors such as:
  #   OSError: [Errno 2] No such file or directory: '/dev/pts/0'
  # Devserver load information is not used on builders, so carry on without it.
  _Log('psutil is failed to be imported, error: %s. devserver load data will '
       'not be collected.', e)
| 45 | |
| 46 | |
# Seconds to wait between two successive reads of the disk and network IO
# counters.
STATS_INTERVAL = 10.0
# Divisor used to express a byte count in (decimal) gigabytes.
_1G = 10 ** 9
| 50 | |
| 51 | |
def require_psutil():
  """Build a decorator that skips the wrapped function when psutil is absent.

  Must be applied with a call, i.e. `@require_psutil()`.

  Returns:
    A decorator. The function it produces forwards all arguments to the
    original function and returns its result when the module-level `psutil`
    is truthy; otherwise it logs that the call was skipped and returns None.
  """
  # Imported locally so this helper does not rely on the module import list.
  import functools

  def deco_require_psutil(func):
    """Wrap `func` so that it only runs when psutil is available.

    Args:
      func: function to be called.

    Returns:
      The guarded wrapper, carrying the name and docstring of `func`.
    """
    @functools.wraps(func)
    def func_require_psutil(*args, **kwargs):
      """Call the wrapped function, or skip it if psutil is unavailable.

      Args:
        *args: arguments for function to be called.
        **kwargs: keyword arguments for function to be called.

      Returns:
        The wrapped function's result, or None when the call is skipped.
      """
      # psutil is looked up at call time, not at decoration time.
      if not psutil:
        _Log('Python module psutil is not installed. Function call %s is '
             'skipped.', func)
        return None
      return func(*args, **kwargs)
    return func_require_psutil
  return deco_require_psutil
| 76 | |
| 77 | |
def _get_process_count(process_cmd_pattern):
  """Get the count of processes that match the given command pattern.

  Runs `pgrep -fc <pattern>` and parses the count it prints on stdout.

  Args:
    process_cmd_pattern: The regex pattern of process command to match.

  Returns:
    The count of processes that match the given command pattern, or 0 when
    pgrep cannot be run or does not print a parsable count.
  """
  try:
    # Use Popen instead of check_output since the latter cannot run with old
    # python version (less than 2.7)
    proc = subprocess.Popen(
        ['pgrep', '-fc', process_cmd_pattern],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    cmd_output, cmd_error = proc.communicate()
  except OSError as e:
    # Popen raises OSError when the pgrep executable cannot be launched.
    _Log('Failed to run pgrep when getting process count: %s', e)
    return 0

  if cmd_error:
    _Log('Error happened when getting process count: %s', cmd_error)

  try:
    return int(cmd_output)
  except ValueError:
    # stdout held no usable number, e.g. it was empty.
    _Log('Unexpected pgrep output when getting process count: %r', cmd_output)
    return 0
| 102 | |
| 103 | |
def get_config():
  """Return the cherrypy configuration dictionary for this application.

  Returns:
    A dict keyed by path; the root path '/' maps to its tool settings.
  """
  root_settings = {
      # The trailing_slash tool is switched off, so cherrypy does not
      # automatically redirect to add a trailing slash, i.e.
      # /check_health -> /check_health/.
      'tools.trailing_slash.on': False,
  }
  return {'/': root_settings}
| 113 | |
| 114 | |
class Root(object):
  """Cherrypy Root class of the application.

  Serves a JSON health report from index(). When psutil is available, a
  daemon thread keeps cached disk and network throughput figures up to date.
  """

  def __init__(self, devserver, static_dir):
    """Store the collaborators and start the IO stats thread.

    Args:
      devserver: Object whose `staging_thread_count` attribute is reported
        by index().
      static_dir: Path passed to os.statvfs() to measure free disk space.
    """
    self._static_dir = static_dir
    self._devserver = devserver

    # Cache of disk IO stats; a thread refreshes them every STATS_INTERVAL
    # seconds. No lock is used for these variables because the only thread
    # that writes to them is the one running _refresh_io_stats.
    self.disk_read_bytes_per_sec = 0
    self.disk_write_bytes_per_sec = 0
    # Cache of network IO stats.
    self.network_sent_bytes_per_sec = 0
    self.network_recv_bytes_per_sec = 0
    self._start_io_stat_thread()

  @require_psutil()
  def _get_io_stats(self):
    """Get the IO stats as a dictionary.

    Returns:
      A dictionary holding the cached disk and network rates in bytes per
      second, their read+write and sent+recv totals, and the value of
      psutil.cpu_percent().
    """
    disk_read = self.disk_read_bytes_per_sec
    disk_write = self.disk_write_bytes_per_sec
    net_sent = self.network_sent_bytes_per_sec
    net_recv = self.network_recv_bytes_per_sec
    return {
        'disk_read_bytes_per_second': disk_read,
        'disk_write_bytes_per_second': disk_write,
        'disk_total_bytes_per_second': disk_read + disk_write,
        'network_sent_bytes_per_second': net_sent,
        'network_recv_bytes_per_second': net_recv,
        'network_total_bytes_per_second': net_sent + net_recv,
        'cpu_percent': psutil.cpu_percent(),
    }

  @require_psutil()
  def _refresh_io_stats(self):
    """A call running in a thread to update IO stats periodically.

    Loops forever: sleeps STATS_INTERVAL seconds, re-reads the psutil disk
    and network counters, and stores each counter's delta divided by the
    measured elapsed time on the instance.
    """
    prev_disk_io_counters = psutil.disk_io_counters()
    prev_network_io_counters = psutil.net_io_counters()
    prev_read_time = time.time()
    while True:
      time.sleep(STATS_INTERVAL)
      now = time.time()
      # Divide by the measured gap rather than the nominal STATS_INTERVAL.
      interval = now - prev_read_time
      prev_read_time = now
      # Disk IO is for all disks.
      disk_io_counters = psutil.disk_io_counters()
      network_io_counters = psutil.net_io_counters()

      self.disk_read_bytes_per_sec = (
          disk_io_counters.read_bytes -
          prev_disk_io_counters.read_bytes) / interval
      self.disk_write_bytes_per_sec = (
          disk_io_counters.write_bytes -
          prev_disk_io_counters.write_bytes) / interval
      prev_disk_io_counters = disk_io_counters

      self.network_sent_bytes_per_sec = (
          network_io_counters.bytes_sent -
          prev_network_io_counters.bytes_sent) / interval
      self.network_recv_bytes_per_sec = (
          network_io_counters.bytes_recv -
          prev_network_io_counters.bytes_recv) / interval
      prev_network_io_counters = network_io_counters

  @require_psutil()
  def _start_io_stat_thread(self):
    """Start the thread to collect IO stats."""
    thread = threading.Thread(target=self._refresh_io_stats)
    # Daemon thread: the endless refresh loop must not block interpreter exit.
    thread.daemon = True
    thread.start()

  @cherrypy.expose
  def index(self):
    """Collect the health status of devserver to see if it's ready for staging.

    Returns:
      A JSON dictionary containing all or some of the following fields:
      free_disk (float): free disk space in GB
      staging_thread_count (int): number of devserver threads currently staging
                                  an image
      apache_client_count (int): count of Apache processes.
      telemetry_test_count (int): count of telemetry tests.
      gsutil_count (int): count of gsutil processes.
      au_process_count (int): number of entries returned by
                              cros_update_progress.GetAllRunningAUProcess().
      The fields produced by _get_io_stats() are merged in as well; they are
      absent when psutil is not available.
    """
    # Get free disk space. f_bavail is counted in units of f_frsize (the
    # fragment size); f_bsize is only the preferred I/O block size and may
    # differ from it.
    stat = os.statvfs(self._static_dir)
    free_disk = stat.f_frsize * stat.f_bavail / _1G
    apache_client_count = _get_process_count('bin/apache2? -k start')
    telemetry_test_count = _get_process_count('python.*telemetry')
    gsutil_count = _get_process_count('gsutil')
    au_process_count = len(cros_update_progress.GetAllRunningAUProcess())

    health_data = {
        'free_disk': free_disk,
        'staging_thread_count': self._devserver.staging_thread_count,
        'apache_client_count': apache_client_count,
        'telemetry_test_count': telemetry_test_count,
        'gsutil_count': gsutil_count,
        'au_process_count': au_process_count,
    }
    # _get_io_stats() returns None when psutil is missing; add nothing then.
    health_data.update(self._get_io_stats() or {})

    return json.dumps(health_data)