blob: 2f037a9f9c4f1096990e0dd9daccd5c28f5f7e11 [file] [log] [blame]
mbligh321b1f52008-04-09 16:23:43 +00001"""This class defines the Remote host class, mixing in the SiteHost class
2if it is available."""
3
jadmanskidef0c3c2009-03-25 20:07:10 +00004import os, time, pickle, logging
mblighf2c33762008-10-18 14:42:34 +00005from autotest_lib.client.common_lib import error
jadmanskida0aeff2009-02-18 18:53:05 +00006from autotest_lib.server import utils, profiler
mblighf2c33762008-10-18 14:42:34 +00007from autotest_lib.server.hosts import base_classes, bootloader
mbligh321b1f52008-04-09 16:23:43 +00008
9
class RemoteHost(base_classes.Host):
    """
    This class represents a remote machine on which you can run
    programs.

    It may be accessed through a network, a serial line, ...
    It is not the machine autoserv is running on.

    Implementation details:
    This is an abstract class, leaf subclasses must implement the methods
    listed here and in parent classes which have no implementation. They
    may reimplement methods which already have an implementation. You
    must not instantiate this class but should instantiate one of those
    leaf subclasses.
    """

    # default number of seconds to wait for a machine to return from reboot
    DEFAULT_REBOOT_TIMEOUT = base_classes.Host.DEFAULT_REBOOT_TIMEOUT
    # sentinel label meaning "boot whichever kernel was .boot'ed last"
    LAST_BOOT_TAG = object()
28
jadmanskif6562912008-10-21 17:59:01 +000029 def _initialize(self, hostname, autodir=None, *args, **dargs):
30 super(RemoteHost, self)._initialize(*args, **dargs)
mbligh321b1f52008-04-09 16:23:43 +000031
jadmanski1c5e3a12008-08-15 23:08:20 +000032 self.hostname = hostname
mblighf2c33762008-10-18 14:42:34 +000033 self.autodir = autodir
34 self.tmp_dirs = []
jadmanskia2db9412008-08-22 21:47:24 +000035
36
jadmanski53aaf382008-11-17 16:22:31 +000037 def close(self):
38 super(RemoteHost, self).close()
mblighf2c33762008-10-18 14:42:34 +000039 self.stop_loggers()
40
41 if hasattr(self, 'tmp_dirs'):
42 for dir in self.tmp_dirs:
43 try:
44 self.run('rm -rf "%s"' % (utils.sh_escape(dir)))
45 except error.AutoservRunError:
46 pass
47
48
jadmanskid60321a2008-10-28 20:32:05 +000049 def job_start(self):
50 """
51 Abstract method, called the first time a remote host object
52 is created for a specific host after a job starts.
53
54 This method depends on the create_host factory being used to
55 construct your host object. If you directly construct host objects
56 you will need to call this method yourself (and enforce the
57 single-call rule).
58 """
59 pass
60
61
mblighf2c33762008-10-18 14:42:34 +000062 def get_autodir(self):
63 return self.autodir
64
65
66 def set_autodir(self, autodir):
jadmanskid60321a2008-10-28 20:32:05 +000067 """
mblighf2c33762008-10-18 14:42:34 +000068 This method is called to make the host object aware of the
69 where autotest is installed. Called in server/autotest.py
70 after a successful install
jadmanskid60321a2008-10-28 20:32:05 +000071 """
mblighf2c33762008-10-18 14:42:34 +000072 self.autodir = autodir
73
74
75 def sysrq_reboot(self):
76 self.run('echo b > /proc/sysrq-trigger &')
77
78
79 def reboot(self, timeout=DEFAULT_REBOOT_TIMEOUT, label=LAST_BOOT_TAG,
mbligh2b949772009-02-26 00:59:36 +000080 kernel_args=None, wait=True, fastsync=False, **dargs):
mblighf2c33762008-10-18 14:42:34 +000081 """
82 Reboot the remote host.
83
84 Args:
85 timeout - How long to wait for the reboot.
86 label - The label we should boot into. If None, we will
87 boot into the default kernel. If it's LAST_BOOT_TAG,
88 we'll boot into whichever kernel was .boot'ed last
89 (or the default kernel if we haven't .boot'ed in this
90 job). If it's None, we'll boot into the default kernel.
91 If it's something else, we'll boot into that.
92 wait - Should we wait to see if the machine comes back up.
mbligh2b949772009-02-26 00:59:36 +000093 fastsync - Don't wait for the sync to complete, just start one
94 and move on. This is for cases where rebooting prompty
95 is more important than data integrity and/or the
96 machine may have disks that cause sync to never return.
mblighf2c33762008-10-18 14:42:34 +000097 """
98 if self.job:
99 if label == self.LAST_BOOT_TAG:
100 label = self.job.last_boot_tag
101 else:
102 self.job.last_boot_tag = label
103
104 self.reboot_setup(label=label, kernel_args=kernel_args, **dargs)
105
106 if label or kernel_args:
107 self.bootloader.install_boottool()
108 if not label:
109 default = int(self.bootloader.get_default())
110 label = self.bootloader.get_titles()[default]
111 self.bootloader.boot_once(label)
112 if kernel_args:
113 self.bootloader.add_args(label, kernel_args)
114
115 # define a function for the reboot and run it in a group
116 print "Reboot: initiating reboot"
117 def reboot():
118 self.record("GOOD", None, "reboot.start")
119 try:
jadmanskid544a352009-01-14 23:36:28 +0000120 # sync before starting the reboot, so that a long sync during
121 # shutdown isn't timed out by wait_down's short timeout
mbligh2b949772009-02-26 00:59:36 +0000122 if not fastsync:
123 self.run('sync; sync', timeout=timeout, ignore_status=True)
jadmanskid544a352009-01-14 23:36:28 +0000124
jadmanski0e1881e2009-01-14 23:33:12 +0000125 # Try several methods of rebooting in increasing harshness.
mbligh2b949772009-02-26 00:59:36 +0000126 self.run('(('
127 ' sync &'
jadmanski0e1881e2009-01-14 23:33:12 +0000128 ' sleep 5; reboot &'
129 ' sleep 60; reboot -f &'
130 ' sleep 10; reboot -nf &'
131 ' sleep 10; telinit 6 &'
mbligh2b949772009-02-26 00:59:36 +0000132 ') </dev/null >/dev/null 2>&1 &)')
mblighf2c33762008-10-18 14:42:34 +0000133 except error.AutoservRunError:
134 self.record("ABORT", None, "reboot.start",
135 "reboot command failed")
136 raise
137 if wait:
jadmanskid778ae42009-01-07 15:07:36 +0000138 self.wait_for_restart(timeout, **dargs)
mblighf2c33762008-10-18 14:42:34 +0000139
140 # if this is a full reboot-and-wait, run the reboot inside a group
141 if wait:
142 self.log_reboot(reboot)
143 else:
144 reboot()
145
146
jadmanski4f909252008-12-01 20:47:10 +0000147 def reboot_followup(self, *args, **dargs):
148 super(RemoteHost, self).reboot_followup(*args, **dargs)
149 if self.job:
150 self.job.profilers.handle_reboot(self)
151
152
jadmanskid778ae42009-01-07 15:07:36 +0000153 def wait_for_restart(self, timeout=DEFAULT_REBOOT_TIMEOUT, **dargs):
jadmanskid60321a2008-10-28 20:32:05 +0000154 """
155 Wait for the host to come back from a reboot. This wraps the
156 generic wait_for_restart implementation in a reboot group.
157 """
mblighf2c33762008-10-18 14:42:34 +0000158 def reboot_func():
jadmanskid778ae42009-01-07 15:07:36 +0000159 super(RemoteHost, self).wait_for_restart(timeout=timeout, **dargs)
mblighf2c33762008-10-18 14:42:34 +0000160 self.log_reboot(reboot_func)
161
162
mbligh1264b512008-11-05 22:21:49 +0000163 def cleanup(self):
164 super(RemoteHost, self).cleanup()
165 self.reboot()
166
167
mblighe48bcfb2008-11-11 17:09:44 +0000168 def get_tmp_dir(self, parent='/tmp'):
mblighf2c33762008-10-18 14:42:34 +0000169 """
170 Return the pathname of a directory on the host suitable
171 for temporary file storage.
172
173 The directory and its content will be deleted automatically
174 on the destruction of the Host object that was used to obtain
175 it.
176 """
jadmanski9f7dd112008-11-17 16:40:05 +0000177 self.run("mkdir -p %s" % parent)
mblighe48bcfb2008-11-11 17:09:44 +0000178 template = os.path.join(parent, 'autoserv-XXXXXX')
jadmanski9f7dd112008-11-17 16:40:05 +0000179 dir_name = self.run("mktemp -d %s" % template).stdout.rstrip()
mblighf2c33762008-10-18 14:42:34 +0000180 self.tmp_dirs.append(dir_name)
181 return dir_name
182
183
jadmanskiea455662009-03-25 22:25:39 +0000184 def delete_tmp_dir(self, tmpdir):
185 """
186 Delete the given temporary directory on the remote machine.
187 """
188 self.run('rm -rf "%s"' % utils.sh_escape(tmpdir), ignore_status=True)
189 self.tmp_dirs.remove(tmpdir)
190
191
mblighf2c33762008-10-18 14:42:34 +0000192 def ping(self):
193 """
194 Ping the remote system, and return whether it's available
195 """
196 fpingcmd = "%s -q %s" % ('/usr/bin/fping', self.hostname)
197 rc = utils.system(fpingcmd, ignore_status = 1)
198 return (rc == 0)
199
200
201 def check_uptime(self):
202 """
203 Check that uptime is available and monotonically increasing.
204 """
205 if not self.ping():
206 raise error.AutoservHostError('Client is not pingable')
207 result = self.run("/bin/cat /proc/uptime", 30)
208 return result.stdout.strip().split()[0]
209
210
211 def get_crashinfo(self, test_start_time):
jadmanskidef0c3c2009-03-25 20:07:10 +0000212 logging.info("Collecting crash information...")
mblighf2c33762008-10-18 14:42:34 +0000213 super(RemoteHost, self).get_crashinfo(test_start_time)
214
215 # wait for four hours, to see if the machine comes back up
216 current_time = time.strftime("%b %d %H:%M:%S", time.localtime())
jadmanskidef0c3c2009-03-25 20:07:10 +0000217 logging.info("Waiting four hours for %s to come up (%s)",
218 self.hostname, current_time)
mblighf2c33762008-10-18 14:42:34 +0000219 if not self.wait_up(timeout=4*60*60):
jadmanskidef0c3c2009-03-25 20:07:10 +0000220 logging.warning("%s down, unable to collect crash info",
221 self.hostname)
mblighf2c33762008-10-18 14:42:34 +0000222 return
223 else:
jadmanskidef0c3c2009-03-25 20:07:10 +0000224 logging.info("%s is back up, collecting crash info", self.hostname)
mblighf2c33762008-10-18 14:42:34 +0000225
226 # find a directory to put the crashinfo into
mbligh210bae62009-04-01 18:33:13 +0000227 try:
228 self.job.resultsdir
229 except AttributeError:
230 self.job.resultsdir = None
231
232 if self.job.resultsdir:
mblighf2c33762008-10-18 14:42:34 +0000233 infodir = self.job.resultdir
234 else:
235 infodir = os.path.abspath(os.getcwd())
236 infodir = os.path.join(infodir, "crashinfo.%s" % self.hostname)
237 if not os.path.exists(infodir):
238 os.mkdir(infodir)
239
240 # collect various log files
241 log_files = ["/var/log/messages", "/var/log/monitor-ssh-reboots"]
242 for log in log_files:
jadmanskidef0c3c2009-03-25 20:07:10 +0000243 logging.info("Collecting %s...", log)
mblighf2c33762008-10-18 14:42:34 +0000244 try:
245 self.get_file(log, infodir)
mbligha2c940d2009-01-30 22:35:19 +0000246 except Exception:
jadmanskidef0c3c2009-03-25 20:07:10 +0000247 logging.warning("Collection of %s failed", log)
mblighf2c33762008-10-18 14:42:34 +0000248
249 # collect dmesg
jadmanskidef0c3c2009-03-25 20:07:10 +0000250 logging.info("Collecting dmesg (saved to crashinfo/dmesg)...")
mbligh78a013a2009-01-13 19:34:28 +0000251 devnull = open("/dev/null", "w")
mblighf2c33762008-10-18 14:42:34 +0000252 try:
mbligh78a013a2009-01-13 19:34:28 +0000253 try:
254 result = self.run("dmesg", stdout_tee=devnull).stdout
255 file(os.path.join(infodir, "dmesg"), "w").write(result)
256 except Exception, e:
jadmanskidef0c3c2009-03-25 20:07:10 +0000257 logging.warning("Collection of dmesg failed:\n%s", e)
mbligh78a013a2009-01-13 19:34:28 +0000258 finally:
259 devnull.close()
mblighf2c33762008-10-18 14:42:34 +0000260
jadmanskida0aeff2009-02-18 18:53:05 +0000261 # collect any profiler data we can find
jadmanskidef0c3c2009-03-25 20:07:10 +0000262 logging.info("Collecting any server-side profiler data lying around...")
jadmanskida0aeff2009-02-18 18:53:05 +0000263 try:
264 cmd = "ls %s" % profiler.PROFILER_TMPDIR
265 profiler_dirs = [path for path in self.run(cmd).stdout.split()
266 if path.startswith("autoserv-")]
267 for profiler_dir in profiler_dirs:
268 remote_path = profiler.get_profiler_results_dir(profiler_dir)
269 remote_exists = self.run("ls %s" % remote_path,
270 ignore_status=True).exit_status == 0
271 if not remote_exists:
272 continue
273 local_path = os.path.join(infodir, "profiler." + profiler_dir)
274 os.mkdir(local_path)
275 self.get_file(remote_path + "/", local_path)
276 except Exception, e:
jadmanskidef0c3c2009-03-25 20:07:10 +0000277 logging.warning("Collection of profiler data failed with:\n%s", e)
278
279
280 # collect any uncollected logs we see (for this host)
mbligh210bae62009-04-01 18:33:13 +0000281 if not self.job.uncollected_log_file:
282 self.job.uncollected_log_file = ''
jadmanskidef0c3c2009-03-25 20:07:10 +0000283 if self.job and os.path.exists(self.job.uncollected_log_file):
284 try:
285 logs = pickle.load(open(self.job.uncollected_log_file))
286 for hostname, remote_path, local_path in logs:
287 if hostname == self.hostname:
288 logging.info("Retrieving logs from %s:%s into %s",
289 hostname, remote_path, local_path)
290 self.get_file(remote_path + "/", local_path + "/")
291 except Exception, e:
292 logging.warning("Error while trying to collect stranded "
293 "Autotest client logs: %s", e)
jadmanskida0aeff2009-02-18 18:53:05 +0000294
mblighf2c33762008-10-18 14:42:34 +0000295
jadmanskica7da372008-10-21 16:26:52 +0000296 def are_wait_up_processes_up(self):
mblighf2c33762008-10-18 14:42:34 +0000297 """
298 Checks if any HOSTS waitup processes are running yet on the
299 remote host.
300
301 Returns True if any the waitup processes are running, False
302 otherwise.
303 """
304 processes = self.get_wait_up_processes()
305 if len(processes) == 0:
306 return True # wait up processes aren't being used
307 for procname in processes:
308 exit_status = self.run("{ ps -e || ps; } | grep '%s'" % procname,
309 ignore_status=True).exit_status
310 if exit_status == 0:
311 return True
312 return False