blob: c4ed59674fa560bf0150a34dfb4d7aa57d1d06da [file] [log] [blame]
Yunlian Jiang00cc30e2013-03-28 13:23:57 -07001# Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
Ahmad Sharif4467f002012-12-20 12:09:49 -08004"""The experiment setting module."""
5
Yunlian Jiang742ed2c2015-12-10 10:05:59 -08006from __future__ import print_function
7
Ahmad Sharif0dcbc4b2012-02-02 16:37:18 -08008import os
9import time
Ahmad Sharif4467f002012-12-20 12:09:49 -080010
cmticee5bc63b2015-05-27 16:59:37 -070011import afe_lock_machine
Han Shenba649282015-08-05 17:19:55 -070012from threading import Lock
cmticee5bc63b2015-05-27 16:59:37 -070013
Yunlian Jiang0d1a9f32015-12-09 10:47:11 -080014from cros_utils import logger
15from cros_utils import misc
Ahmad Sharif4467f002012-12-20 12:09:49 -080016
Han Shene0662972015-09-18 16:53:34 -070017import benchmark_run
Han Shen738e6de2015-12-07 13:22:25 -080018from machine_manager import BadChecksum
Ahmad Sharif0dcbc4b2012-02-02 16:37:18 -080019from machine_manager import MachineManager
Ahmad Sharif4467f002012-12-20 12:09:49 -080020from machine_manager import MockMachineManager
Ahmad Sharif4467f002012-12-20 12:09:49 -080021import test_flag
Ahmad Sharif0dcbc4b2012-02-02 16:37:18 -080022
Luis Lozanof2a3ef42015-12-15 13:49:30 -080023
class Experiment(object):
  """Class representing an Experiment to be run.

  An experiment is the cross product of labels, benchmarks and iterations:
  one benchmark run object is created for every combination. The runs are
  either started directly as threads by Run() or, when a scheduler has been
  installed with set_schedv2(), driven by that scheduler.
  """

  def __init__(self, name, remote, working_directory, chromeos_root,
               cache_conditions, labels, benchmarks, experiment_file, email_to,
               acquire_timeout, log_dir, log_level, share_cache,
               results_directory, locks_directory, cwp_dso):
    """Validate the settings, register machines and create benchmark runs.

    Args:
      name: Experiment name; also used to build the default results
        directory name.
      remote: Machine names to run on. The instance attribute is replaced by
        the names of the machines the machine manager actually holds.
      working_directory: Base directory of the default results directory.
      chromeos_root: ChromeOS root handed to the machine manager. If empty,
        the first non-empty label.chromeos_root is used instead.
      cache_conditions: Passed through to every benchmark run.
      labels: Labels to run. Each label's remote list is filtered down to the
        machines held by the machine manager.
      benchmarks: Benchmarks to run; each one supplies name and iterations.
      experiment_file: Stored as-is.
      email_to: Stored as-is.
      acquire_timeout: Passed to the machine manager.
      log_dir: Directory handed to the loggers.
      log_level: Passed to the machine manager and to every benchmark run.
      share_cache: Passed through to every benchmark run.
      results_directory: Where results go. If empty, defaults to
        <working_directory>/<name>_results.
      locks_directory: If not blank the file locking mechanism is used,
        otherwise the AFE server locking mechanism.
      cwp_dso: Stored as-is.

    Raises:
      RuntimeError: If remote, benchmarks or labels is empty, if no
        chromeos_root can be determined, or if the machine manager ends up
        with no machines.
    """
    self.name = name
    self.working_directory = working_directory
    self.remote = remote
    # NOTE: this keeps the value passed in; the fallback root looked up from
    # the labels further down is only handed to the machine manager.
    self.chromeos_root = chromeos_root
    self.cache_conditions = cache_conditions
    self.experiment_file = experiment_file
    self.email_to = email_to
    if results_directory:
      self.results_directory = misc.CanonicalizePath(results_directory)
    else:
      self.results_directory = os.path.join(self.working_directory,
                                            self.name + '_results')
    self.log_dir = log_dir
    self.log_level = log_level
    self.labels = labels
    self.benchmarks = benchmarks
    # Counters of finished runs; num_run_complete only counts runs that were
    # not served from the cache.
    self.num_complete = 0
    self.num_run_complete = 0
    self.share_cache = share_cache
    self.active_threads = []
    # If locks_directory (self.locks_dir) is not blank, we will use the file
    # locking mechanism; if it is blank then we will use the AFE server
    # locking mechanism.
    self.locks_dir = locks_directory
    self.locked_machines = []
    self.cwp_dso = cwp_dso

    if not remote:
      raise RuntimeError('No remote hosts specified')
    if not self.benchmarks:
      raise RuntimeError('No benchmarks specified')
    if not self.labels:
      raise RuntimeError('No labels specified')

    # We need one chromeos_root to run the benchmarks in, but it doesn't
    # matter where it is, unless the ABIs are different.
    if not chromeos_root:
      for label in self.labels:
        if label.chromeos_root:
          chromeos_root = label.chromeos_root
          break
    if not chromeos_root:
      raise RuntimeError('No chromeos_root given and could not determine '
                         'one from the image path.')

    # In test mode the mock machine manager stands in for the real one.
    if test_flag.GetTestMode():
      machine_manager_fn = MockMachineManager
    else:
      machine_manager_fn = MachineManager
    self.machine_manager = machine_manager_fn(chromeos_root, acquire_timeout,
                                              log_level, locks_directory)
    self.l = logger.GetLogger(log_dir)

    for machine in self.remote:
      # machine_manager.AddMachine only adds reachable machines.
      self.machine_manager.AddMachine(machine)
    # Now machine_manager._all_machines contains a list of reachable
    # machines. This is a subset of self.remote. We make both lists the same.
    self.remote = [m.name for m in self.machine_manager.GetAllMachines()]
    if not self.remote:
      raise RuntimeError('No machine available for running experiment.')

    for label in labels:
      # We filter out label remotes that are not reachable (not in
      # self.remote). So each label.remote is a sublist of experiment.remote.
      label.remote = [r for r in label.remote if r in self.remote]
      try:
        self.machine_manager.ComputeCommonCheckSum(label)
      except BadChecksum:
        # Force same image on all machines, then we do checksum again. No
        # bailout if checksums still do not match.
        self.machine_manager.ForceSameImageToAllMachines(label)
        self.machine_manager.ComputeCommonCheckSum(label)

      self.machine_manager.ComputeCommonCheckSumString(label)

    self.start_time = None
    self.benchmark_runs = self._GenerateBenchmarkRuns()

    self._schedv2 = None
    # Guards the completion counters in BenchmarkRunFinished().
    self._internal_counter_lock = Lock()

  def set_schedv2(self, schedv2):
    """Install the scheduler that Run/Terminate/IsComplete delegate to."""
    self._schedv2 = schedv2

  def schedv2(self):
    """Return the installed scheduler, or None if there is none."""
    return self._schedv2

  def _GenerateBenchmarkRuns(self):
    """Generate benchmark runs from labels and benchmark definitions.

    Returns:
      A list with one benchmark_run.BenchmarkRun per (label, benchmark,
      iteration) combination; iterations are numbered starting at 1.
    """
    benchmark_runs = []
    for label in self.labels:
      for benchmark in self.benchmarks:
        # range() instead of xrange() so this works on Python 2 and 3.
        for iteration in range(1, benchmark.iterations + 1):
          benchmark_run_name = '%s: %s (%s)' % (label.name, benchmark.name,
                                                iteration)
          full_name = '%s_%s_%s' % (label.name, benchmark.name, iteration)
          # Every run gets its own logger, named after the run.
          logger_to_use = logger.Logger(self.log_dir, 'run.%s' % (full_name),
                                        True)
          benchmark_runs.append(
              benchmark_run.BenchmarkRun(benchmark_run_name, benchmark, label,
                                         iteration, self.cache_conditions,
                                         self.machine_manager, logger_to_use,
                                         self.log_level, self.share_cache))

    return benchmark_runs

  def Build(self):
    """Do nothing; placeholder."""
    pass

  def Terminate(self):
    """Stop the experiment.

    Delegates to the scheduler when one is installed; otherwise terminates
    every benchmark run that is still alive and logs an error for each.
    """
    if self._schedv2 is not None:
      self._schedv2.terminate()
      return

    for t in self.benchmark_runs:
      if t.is_alive():
        self.l.LogError("Terminating run: '%s'." % t.name)
        t.Terminate()

  def IsComplete(self):
    """Report whether all runs have finished, reaping finished threads.

    With a scheduler installed its answer is returned. Otherwise every
    thread in self.active_threads that has finished is removed from the list
    and added to the completion counters.

    Returns:
      True if there was nothing left to wait for when the call started.
      False if there were still active threads at that point, even when all
      of them were reaped during this call; the next call then returns True.
    """
    if self._schedv2 is not None:
      return self._schedv2.is_complete()

    if not self.active_threads:
      return True

    # Iterate over a snapshot: removing from the list being iterated would
    # skip the thread following each removed one.
    for t in list(self.active_threads):
      if t.is_alive():
        t.join(0)
      if not t.is_alive():
        self.num_complete += 1
        if not t.cache_hit:
          self.num_run_complete += 1
        self.active_threads.remove(t)
    return False

  def BenchmarkRunFinished(self, br):
    """Update internal counters after br finishes.

    Note this is only used by schedv2 and is called by multiple threads.
    Never throw any exception here.

    Args:
      br: The finished benchmark run; only its cache_hit attribute is read.
    """

    assert self._schedv2 is not None
    with self._internal_counter_lock:
      self.num_complete += 1
      if not br.cache_hit:
        self.num_run_complete += 1

  def Run(self):
    """Record the start time and start the experiment.

    Delegates to the scheduler when one is installed; otherwise starts every
    benchmark run as a daemon thread and tracks it in self.active_threads.
    """
    self.start_time = time.time()
    if self._schedv2 is not None:
      self._schedv2.run_sched()
      return

    self.active_threads = []
    for run in self.benchmark_runs:
      # Set threads to daemon so program exits when ctrl-c is pressed.
      run.daemon = True
      run.start()
      self.active_threads.append(run)

  def SetCacheConditions(self, cache_conditions):
    """Forward cache_conditions to every benchmark run."""
    for run in self.benchmark_runs:
      run.SetCacheConditions(cache_conditions)

  def Cleanup(self):
    """Make sure all machines are unlocked."""
    if self.locks_dir:
      # We are using the file locks mechanism, so call machine_manager.Cleanup
      # to unlock everything.
      self.machine_manager.Cleanup()
      return

    # AFE server locking: nothing to undo in test mode.
    if test_flag.GetTestMode():
      return

    all_machines = self.locked_machines
    if not all_machines:
      return

    # If we locked any machines earlier, make sure we unlock them now.
    lock_mgr = afe_lock_machine.AFELockManager(
        all_machines, '', self.labels[0].chromeos_root, None)
    machine_states = lock_mgr.GetMachineStates('unlock')
    # items() instead of iteritems() so this works on Python 2 and 3.
    for k, state in machine_states.items():
      if state['locked']:
        lock_mgr.UpdateLockInAFE(False, k)