mbligh321b1f52008-04-09 16:23:43 +00001"""This class defines the Remote host class, mixing in the SiteHost class
2if it is available."""
3
jadmanskidef0c3c2009-03-25 20:07:10 +00004import os, time, pickle, logging
mblighf2c33762008-10-18 14:42:34 +00005from autotest_lib.client.common_lib import error
jadmanskida0aeff2009-02-18 18:53:05 +00006from autotest_lib.server import utils, profiler
mblighf2c33762008-10-18 14:42:34 +00007from autotest_lib.server.hosts import base_classes, bootloader
mbligh321b1f52008-04-09 16:23:43 +00008
9
jadmanski1c5e3a12008-08-15 23:08:20 +000010class RemoteHost(base_classes.Host):
    """
    This class represents a remote machine on which you can run
    programs.

    It may be accessed through a network, a serial line, ...
    It is not the machine autoserv is running on.

    Implementation details:
    This is an abstract class; leaf subclasses must implement the methods
    listed here and in parent classes which have no implementation. They
    may reimplement methods which already have an implementation. You
    must not instantiate this class but should instantiate one of those
    leaf subclasses.
    """

    DEFAULT_REBOOT_TIMEOUT = base_classes.Host.DEFAULT_REBOOT_TIMEOUT
    LAST_BOOT_TAG = object()
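    # A minimal usage sketch: host objects are normally built by the
    # create_host factory mentioned in job_start() below (exposed as
    # hosts.create_host in this tree), which returns a concrete leaf
    # subclass rather than RemoteHost itself. The hostname is made up.
    #
    #   host = hosts.create_host('client.example.com')
    #   host.run('uname -r')
    #   host.reboot()
    #   host.close()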

    def _initialize(self, hostname, autodir=None, *args, **dargs):
        super(RemoteHost, self)._initialize(*args, **dargs)

        self.hostname = hostname
        self.autodir = autodir
        self.tmp_dirs = []


    def close(self):
        super(RemoteHost, self).close()
        self.stop_loggers()

        if hasattr(self, 'tmp_dirs'):
            for dir in self.tmp_dirs:
                try:
                    self.run('rm -rf "%s"' % (utils.sh_escape(dir)))
                except error.AutoservRunError:
                    pass


    def job_start(self):
        """
        Abstract method, called the first time a remote host object
        is created for a specific host after a job starts.

        This method depends on the create_host factory being used to
        construct your host object. If you directly construct host objects
        you will need to call this method yourself (and enforce the
        single-call rule).
        """
        pass


    def get_autodir(self):
        return self.autodir


    def set_autodir(self, autodir):
        """
        This method is called to make the host object aware of where
        autotest is installed. Called in server/autotest.py after a
        successful install.
        """
        self.autodir = autodir


    def sysrq_reboot(self):
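        # Magic SysRq 'b' reboots the machine immediately, without syncing
        # or unmounting filesystems; the command is backgrounded so the
        # remote shell can return before the connection drops.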
        self.run('echo b > /proc/sysrq-trigger &')


    def reboot(self, timeout=DEFAULT_REBOOT_TIMEOUT, label=LAST_BOOT_TAG,
               kernel_args=None, wait=True, fastsync=False,
               reboot_cmd=None, **dargs):
        """
        Reboot the remote host.

        Args:
                timeout - How long to wait for the reboot.
                label - The label we should boot into. If it's LAST_BOOT_TAG,
                        we'll boot into whichever kernel was .boot'ed last
                        (or the default kernel if we haven't .boot'ed in this
                        job). If it's None, we'll boot into the default
                        kernel. If it's something else, we'll boot into that.
                wait - Should we wait to see if the machine comes back up.
                fastsync - Don't wait for the sync to complete, just start one
                        and move on. This is for cases where rebooting promptly
                        is more important than data integrity and/or the
                        machine may have disks that cause sync to never return.
                reboot_cmd - Reboot command to execute.
        """
        if self.job:
            if label == self.LAST_BOOT_TAG:
                label = self.job.last_boot_tag
            else:
                self.job.last_boot_tag = label

        self.reboot_setup(label=label, kernel_args=kernel_args, **dargs)

        if label or kernel_args:
            self.bootloader.install_boottool()
            if not label:
                default = int(self.bootloader.get_default())
                label = self.bootloader.get_titles()[default]
            self.bootloader.boot_once(label)
            if kernel_args:
                self.bootloader.add_args(label, kernel_args)

        # define a function for the reboot and run it in a group
        print "Reboot: initiating reboot"
        def reboot():
            self.record("GOOD", None, "reboot.start")
            try:
                # sync before starting the reboot, so that a long sync during
                # shutdown isn't timed out by wait_down's short timeout
                if not fastsync:
                    self.run('sync; sync', timeout=timeout, ignore_status=True)

                if reboot_cmd:
                    self.run(reboot_cmd)
                else:
                    # Try several methods of rebooting in increasing harshness.
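                    # The chain below runs in a detached subshell with its
                    # I/O redirected, so this run() call returns right away
                    # instead of waiting for the host to go down.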
                    self.run('(('
                             ' sync &'
                             ' sleep 5; reboot &'
                             ' sleep 60; reboot -f &'
                             ' sleep 10; reboot -nf &'
                             ' sleep 10; telinit 6 &'
                             ') </dev/null >/dev/null 2>&1 &)')
            except error.AutoservRunError:
                self.record("ABORT", None, "reboot.start",
                            "reboot command failed")
                raise
            if wait:
                self.wait_for_restart(timeout, **dargs)

        # if this is a full reboot-and-wait, run the reboot inside a group
        if wait:
            self.log_reboot(reboot)
        else:
            reboot()
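
    # Example calls (a hypothetical sketch; the label and kernel arguments
    # are made up and must match entries the bootloader actually knows about):
    #
    #   host.reboot()                                  # default kernel, wait
    #   host.reboot(label='autotest-2.6.18', kernel_args='console=ttyS0')
    #   host.reboot(wait=False, fastsync=True)         # fire and forget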


    def reboot_followup(self, *args, **dargs):
        super(RemoteHost, self).reboot_followup(*args, **dargs)
        if self.job:
            self.job.profilers.handle_reboot(self)


    def wait_for_restart(self, timeout=DEFAULT_REBOOT_TIMEOUT, **dargs):
        """
        Wait for the host to come back from a reboot. This wraps the
        generic wait_for_restart implementation in a reboot group.
        """
        def reboot_func():
            super(RemoteHost, self).wait_for_restart(timeout=timeout, **dargs)
        self.log_reboot(reboot_func)


    def cleanup(self):
        super(RemoteHost, self).cleanup()
        self.reboot()


    def get_tmp_dir(self, parent='/tmp'):
        """
        Return the pathname of a directory on the host suitable
        for temporary file storage.

        The directory and its content will be deleted automatically
        on the destruction of the Host object that was used to obtain
        it.
        """
        self.run("mkdir -p %s" % parent)
        template = os.path.join(parent, 'autoserv-XXXXXX')
        dir_name = self.run("mktemp -d %s" % template).stdout.rstrip()
        self.tmp_dirs.append(dir_name)
        return dir_name


    def delete_tmp_dir(self, tmpdir):
        """
        Delete the given temporary directory on the remote machine.
        """
        self.run('rm -rf "%s"' % utils.sh_escape(tmpdir), ignore_status=True)
        self.tmp_dirs.remove(tmpdir)
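    # Typical use of the two helpers above (a sketch; 'host' is a concrete
    # RemoteHost subclass and the paths are made up):
    #
    #   scratch = host.get_tmp_dir()          # e.g. /tmp/autoserv-abc123
    #   host.run('tar xzf /tmp/tests.tar.gz -C %s' % scratch)
    #   host.delete_tmp_dir(scratch)          # optional; close() also cleans up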


    def ping(self):
        """
        Ping the remote system, and return whether it's available.
        """
        fpingcmd = "%s -q %s" % ('/usr/bin/fping', self.hostname)
        rc = utils.system(fpingcmd, ignore_status = 1)
        return (rc == 0)


    def check_uptime(self):
        """
        Check that uptime is available and return the current uptime so
        that callers can verify it is monotonically increasing.
        """
        if not self.ping():
            raise error.AutoservHostError('Client is not pingable')
        result = self.run("/bin/cat /proc/uptime", 30)
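        # the first field of /proc/uptime is seconds since boot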
        return result.stdout.strip().split()[0]


    def get_crashinfo(self, test_start_time):
        logging.info("Collecting crash information...")
        super(RemoteHost, self).get_crashinfo(test_start_time)

        # wait for four hours, to see if the machine comes back up
        current_time = time.strftime("%b %d %H:%M:%S", time.localtime())
        logging.info("Waiting four hours for %s to come up (%s)",
                     self.hostname, current_time)
        if not self.wait_up(timeout=4*60*60):
            logging.warning("%s down, unable to collect crash info",
                            self.hostname)
            return
        else:
            logging.info("%s is back up, collecting crash info", self.hostname)

        # find a directory to put the crashinfo into
        try:
            self.job.resultsdir
        except AttributeError:
            self.job.resultsdir = None

        if self.job.resultsdir:
            infodir = self.job.resultdir
        else:
            infodir = os.path.abspath(os.getcwd())
        infodir = os.path.join(infodir, "crashinfo.%s" % self.hostname)
        if not os.path.exists(infodir):
            os.mkdir(infodir)

        # collect various log files
        log_files = ["/var/log/messages", "/var/log/monitor-ssh-reboots"]
        for log in log_files:
            logging.info("Collecting %s...", log)
            try:
                self.get_file(log, infodir)
            except Exception:
                logging.warning("Collection of %s failed", log)

        # collect dmesg
        logging.info("Collecting dmesg (saved to crashinfo/dmesg)...")
        devnull = open("/dev/null", "w")
        try:
            try:
                result = self.run("dmesg", stdout_tee=devnull).stdout
                file(os.path.join(infodir, "dmesg"), "w").write(result)
            except Exception, e:
                logging.warning("Collection of dmesg failed:\n%s", e)
        finally:
            devnull.close()

        # collect any profiler data we can find
        logging.info("Collecting any server-side profiler data lying around...")
        try:
            cmd = "ls %s" % profiler.PROFILER_TMPDIR
            profiler_dirs = [path for path in self.run(cmd).stdout.split()
                             if path.startswith("autoserv-")]
            for profiler_dir in profiler_dirs:
                remote_path = profiler.get_profiler_results_dir(profiler_dir)
                remote_exists = self.run("ls %s" % remote_path,
                                         ignore_status=True).exit_status == 0
                if not remote_exists:
                    continue
                local_path = os.path.join(infodir, "profiler." + profiler_dir)
                os.mkdir(local_path)
                self.get_file(remote_path + "/", local_path)
        except Exception, e:
            logging.warning("Collection of profiler data failed with:\n%s", e)


        # collect any uncollected logs we see (for this host)
        if not self.job.uncollected_log_file:
            self.job.uncollected_log_file = ''
        if self.job and os.path.exists(self.job.uncollected_log_file):
            try:
                logs = pickle.load(open(self.job.uncollected_log_file))
                for hostname, remote_path, local_path in logs:
                    if hostname == self.hostname:
                        logging.info("Retrieving logs from %s:%s into %s",
                                     hostname, remote_path, local_path)
                        self.get_file(remote_path + "/", local_path + "/")
            except Exception, e:
                logging.warning("Error while trying to collect stranded "
                                "Autotest client logs: %s", e)


    def are_wait_up_processes_up(self):
        """
        Checks if any HOSTS waitup processes are running yet on the
        remote host.

        Returns True if any of the waitup processes are running, False
        otherwise.
        """
        processes = self.get_wait_up_processes()
        if len(processes) == 0:
            return True # wait up processes aren't being used
        for procname in processes:
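            # use 'ps -e' where available, falling back to plain 'ps' on
            # systems that don't support the -e flag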
            exit_status = self.run("{ ps -e || ps; } | grep '%s'" % procname,
                                   ignore_status=True).exit_status
            if exit_status == 0:
                return True
        return False
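
    # Polling sketch (hypothetical; assumes the job's HOSTS configuration
    # supplies wait-up processes via get_wait_up_processes):
    #
    #   while not host.are_wait_up_processes_up():
    #       time.sleep(1)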