"""This module defines the RemoteHost class, mixing in the SiteHost class
if it is available."""

import os, time
from autotest_lib.client.common_lib import error
from autotest_lib.server import utils, profiler
from autotest_lib.server.hosts import base_classes, bootloader


class RemoteHost(base_classes.Host):
    """
    This class represents a remote machine on which you can run
    programs.

    It may be accessed through a network, a serial line, ...
    It is not the machine autoserv is running on.

    Implementation details:
    This is an abstract class; leaf subclasses must implement the methods
    listed here and in parent classes which have no implementation. They
    may reimplement methods which already have an implementation. You
    must not instantiate this class, but should instantiate one of those
    leaf subclasses.
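
    For illustration only (the factory's import path is an assumption,
    not something defined in this module), callers normally obtain a
    concrete subclass through the create_host factory mentioned in
    job_start() below:

        from autotest_lib.server import hosts
        host = hosts.create_host('device.example.com')  # hypothetical hostname
        host.run('uname -r')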
    """

    DEFAULT_REBOOT_TIMEOUT = base_classes.Host.DEFAULT_REBOOT_TIMEOUT
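    # Unique sentinel used as the default 'label' for reboot(); it means
    # "boot whichever kernel was .boot'ed last in this job" rather than a
    # literal bootloader entry name.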
    LAST_BOOT_TAG = object()

    def _initialize(self, hostname, autodir=None, *args, **dargs):
        super(RemoteHost, self)._initialize(*args, **dargs)

        self.hostname = hostname
        self.autodir = autodir
        self.tmp_dirs = []


    def close(self):
        super(RemoteHost, self).close()
        self.stop_loggers()

        if hasattr(self, 'tmp_dirs'):
            for dir in self.tmp_dirs:
                try:
                    self.run('rm -rf "%s"' % (utils.sh_escape(dir)))
                except error.AutoservRunError:
                    pass


    def job_start(self):
        """
        Abstract method, called the first time a remote host object
        is created for a specific host after a job starts.

        This method depends on the create_host factory being used to
        construct your host object. If you directly construct host objects
        you will need to call this method yourself (and enforce the
        single-call rule).
        """
        pass


    def get_autodir(self):
        return self.autodir


    def set_autodir(self, autodir):
        """
        This method is called to make the host object aware of where
        autotest is installed. Called in server/autotest.py after a
        successful install.
        """
        self.autodir = autodir


    def sysrq_reboot(self):
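        # 'echo b > /proc/sysrq-trigger' asks the kernel to reboot
        # immediately, without syncing or cleanly unmounting filesystems.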
        self.run('echo b > /proc/sysrq-trigger &')


    def reboot(self, timeout=DEFAULT_REBOOT_TIMEOUT, label=LAST_BOOT_TAG,
               kernel_args=None, wait=True, fastsync=False, **dargs):
        """
        Reboot the remote host.

        Args:
                timeout - How long to wait for the reboot.
                label - The label we should boot into. If it's
                        LAST_BOOT_TAG, we'll boot into whichever kernel
                        was .boot'ed last (or the default kernel if we
                        haven't .boot'ed in this job). If it's None, we'll
                        boot into the default kernel. If it's something
                        else, we'll boot into that.
                wait - Should we wait to see if the machine comes back up.
                fastsync - Don't wait for the sync to complete, just start one
                        and move on. This is for cases where rebooting promptly
                        is more important than data integrity and/or the
                        machine may have disks that cause sync to never return.
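
        Illustrative call (the label and kernel arguments shown are
        hypothetical, not values defined by this module):

            host.reboot(label='autotest', kernel_args='console=ttyS0',
                        wait=True)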
        """
        if self.job:
            if label == self.LAST_BOOT_TAG:
                label = self.job.last_boot_tag
            else:
                self.job.last_boot_tag = label

        self.reboot_setup(label=label, kernel_args=kernel_args, **dargs)

        if label or kernel_args:
            self.bootloader.install_boottool()
            if not label:
                default = int(self.bootloader.get_default())
                label = self.bootloader.get_titles()[default]
            self.bootloader.boot_once(label)
            if kernel_args:
                self.bootloader.add_args(label, kernel_args)

        # define a function for the reboot and run it in a group
        print "Reboot: initiating reboot"
        def reboot():
            self.record("GOOD", None, "reboot.start")
            try:
                # sync before starting the reboot, so that a long sync during
                # shutdown isn't timed out by wait_down's short timeout
                if not fastsync:
                    self.run('sync; sync', timeout=timeout, ignore_status=True)

                # Try several methods of rebooting in increasing harshness.
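                # The chain below is detached from the ssh session so it
                # keeps running after the connection drops: a clean 'reboot'
                # after ~5s, 'reboot -f' a minute later, then 'reboot -nf'
                # and 'telinit 6' as last resorts if the gentler commands
                # hang.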
                self.run('(('
                         ' sync &'
                         ' sleep 5; reboot &'
                         ' sleep 60; reboot -f &'
                         ' sleep 10; reboot -nf &'
                         ' sleep 10; telinit 6 &'
                         ') </dev/null >/dev/null 2>&1 &)')
            except error.AutoservRunError:
                self.record("ABORT", None, "reboot.start",
                            "reboot command failed")
                raise
            if wait:
                self.wait_for_restart(timeout, **dargs)

        # if this is a full reboot-and-wait, run the reboot inside a group
        if wait:
            self.log_reboot(reboot)
        else:
            reboot()


    def reboot_followup(self, *args, **dargs):
        super(RemoteHost, self).reboot_followup(*args, **dargs)
        if self.job:
            self.job.profilers.handle_reboot(self)


    def wait_for_restart(self, timeout=DEFAULT_REBOOT_TIMEOUT, **dargs):
        """
        Wait for the host to come back from a reboot. This wraps the
        generic wait_for_restart implementation in a reboot group.
        """
        def reboot_func():
            super(RemoteHost, self).wait_for_restart(timeout=timeout, **dargs)
        self.log_reboot(reboot_func)


    def cleanup(self):
        super(RemoteHost, self).cleanup()
        self.reboot()


    def get_tmp_dir(self, parent='/tmp'):
        """
        Return the pathname of a directory on the host suitable
        for temporary file storage.

        The directory and its contents will be deleted automatically
        when the Host object that was used to obtain it is closed or
        destroyed.
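
        Illustrative usage (the file name is hypothetical; send_file is
        assumed to be provided by the concrete host class):

            tmpdir = host.get_tmp_dir()
            host.send_file('/local/path/some_config', tmpdir)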
        """
        self.run("mkdir -p %s" % parent)
        template = os.path.join(parent, 'autoserv-XXXXXX')
        dir_name = self.run("mktemp -d %s" % template).stdout.rstrip()
        self.tmp_dirs.append(dir_name)
        return dir_name


    def ping(self):
        """
        Ping the remote system, and return whether it's available.
        """
        fpingcmd = "%s -q %s" % ('/usr/bin/fping', self.hostname)
        rc = utils.system(fpingcmd, ignore_status=1)
        return (rc == 0)


    def check_uptime(self):
        """
        Check that uptime is available; return the current uptime (in
        seconds, as a string) so callers can verify that it increases
        monotonically across checks.
        """
        if not self.ping():
            raise error.AutoservHostError('Client is not pingable')
        result = self.run("/bin/cat /proc/uptime", 30)
        return result.stdout.strip().split()[0]


    def get_crashinfo(self, test_start_time):
        print "Collecting crash information..."
        super(RemoteHost, self).get_crashinfo(test_start_time)

        # wait for four hours, to see if the machine comes back up
        current_time = time.strftime("%b %d %H:%M:%S", time.localtime())
        print "Waiting four hours for %s to come up (%s)" % (self.hostname,
                                                             current_time)
        if not self.wait_up(timeout=4*60*60):
            print "%s down, unable to collect crash info" % self.hostname
            return
        else:
            print "%s is back up, collecting crash info" % self.hostname

        # find a directory to put the crashinfo into
        if self.job:
            infodir = self.job.resultdir
        else:
            infodir = os.path.abspath(os.getcwd())
        infodir = os.path.join(infodir, "crashinfo.%s" % self.hostname)
        if not os.path.exists(infodir):
            os.mkdir(infodir)

        # collect various log files
        log_files = ["/var/log/messages", "/var/log/monitor-ssh-reboots"]
        for log in log_files:
            print "Collecting %s..." % log
            try:
                self.get_file(log, infodir)
            except Exception:
                print "Collection of %s failed. Non-fatal, continuing." % log

        # collect dmesg
        print "Collecting dmesg (saved to crashinfo/dmesg)..."
        devnull = open("/dev/null", "w")
        try:
            try:
                result = self.run("dmesg", stdout_tee=devnull).stdout
                file(os.path.join(infodir, "dmesg"), "w").write(result)
            except Exception, e:
                print "crashinfo collection of dmesg failed with:\n%s" % e
        finally:
            devnull.close()

        # collect any profiler data we can find
        print "Collecting any server-side profiler data lying around..."
        try:
            cmd = "ls %s" % profiler.PROFILER_TMPDIR
            profiler_dirs = [path for path in self.run(cmd).stdout.split()
                             if path.startswith("autoserv-")]
            for profiler_dir in profiler_dirs:
                remote_path = profiler.get_profiler_results_dir(profiler_dir)
                remote_exists = self.run("ls %s" % remote_path,
                                         ignore_status=True).exit_status == 0
                if not remote_exists:
                    continue
                local_path = os.path.join(infodir, "profiler." + profiler_dir)
                os.mkdir(local_path)
                self.get_file(remote_path + "/", local_path)
        except Exception, e:
            print "crashinfo collection of profiler data failed with:\n%s" % e


    def are_wait_up_processes_up(self):
        """
        Checks if any HOSTS waitup processes are running yet on the
        remote host.

        Returns True if any of the waitup processes are running, False
        otherwise.
        """
        processes = self.get_wait_up_processes()
        if len(processes) == 0:
            return True # wait up processes aren't being used
        for procname in processes:
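            # 'ps -e' lists every process, but some minimal ps
            # implementations don't support -e, so fall back to plain 'ps'.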
            exit_status = self.run("{ ps -e || ps; } | grep '%s'" % procname,
                                   ignore_status=True).exit_status
            if exit_status == 0:
                return True
        return False