blob: 58c79bec369f5605c0c65f05abca23c5ccf9fdb1 [file] [log] [blame]
Dylan Reid837c74a2016-01-22 17:25:21 -08001/* Copyright 2016 The Chromium OS Authors. All rights reserved.
2 * Use of this source code is governed by a BSD-style license that can be
3 * found in the LICENSE file.
4 */
5
6#define _GNU_SOURCE /* For asprintf */
7
8#include <errno.h>
9#include <fcntl.h>
10#include <malloc.h>
11#include <signal.h>
12#include <stdio.h>
13#include <stdlib.h>
14#include <string.h>
15#include <sys/mount.h>
16#include <sys/stat.h>
17#include <sys/types.h>
Dylan Reid2bd9ea92016-04-07 20:57:47 -070018#include <sys/wait.h>
Dylan Reid837c74a2016-01-22 17:25:21 -080019#include <unistd.h>
20
21#include "container_cgroup.h"
22#include "libcontainer.h"
23#include "libminijail.h"
24
/*
 * One filesystem to mount for the container.
 *
 * The string members are heap copies made by container_config_add_mount()
 * and released by container_config_destroy().
 */
struct container_mount {
	char *name;		/* Label supplied by the caller. */
	char *source;		/* What to mount. */
	char *destination;	/* Mount point, appended to the container root. */
	char *type;		/* Filesystem type handed to the mount call. */
	char *data;		/* Mount data string; may be NULL. */
	int flags;		/* Mount flags (MS_*). */
	int uid;		/* Owner applied when the destination is created. */
	int gid;		/* Group applied when the destination is created. */
	int mode;		/* Mode applied when the destination is created. */
	int mount_in_ns;	/* True if mount should happen in new vfs ns. */
	int create;		/* True if the target should be created if it
				 * doesn't exist. */
};
38
/*
 * One device node to expose to the container.
 *
 * `path` is a heap copy made by container_config_add_device() and released
 * by container_config_destroy().
 */
struct container_device {
	char type;		/* 'c' or 'b' for char or block. */
	char *path;		/* Node path, appended to the container root. */
	int fs_permissions;	/* Permission bits for the created node. */
	int major;		/* Device major number. */
	int minor;		/* Device minor number; the node is only created
				 * when this is >= 0. */
	int uid;		/* Owner of the created node. */
	int gid;		/* Group of the created node. */
	int read_allowed;	/* Passed to the device cgroup. */
	int write_allowed;	/* Passed to the device cgroup. */
	int modify_allowed;	/* Passed to the device cgroup. */
};
51
/*
 * Structure that configures how the container is run.
 *
 * rootfs - Path to the root of the container's filesystem.
 * program_argv - NULL-terminated vector holding the program to run and its
 *                args, e.g. "/sbin/init".
 * num_args - Number of args in program_argv (terminator not counted).
 * uid_map - Mapping of UIDs in the container, e.g. "0 100000 1024"
 * gid_map - Mapping of GIDs in the container, e.g. "0 100000 1024"
 * alt_syscall_table - Syscall table to use or NULL if none.
 * mounts - Filesystems to mount in the new namespace.
 * num_mounts - Number of above.
 * devices - Device nodes to create.
 * num_devices - Number of above.
 * run_setfiles - Path of the setfiles command to run on externally mounted
 *                filesystems to enable selinux, or NULL to skip it.  The
 *                pointer is stored as given; it is not copied or freed here.
 */
struct container_config {
	char *rootfs;
	char **program_argv;
	size_t num_args;
	char *uid_map;
	char *gid_map;
	char *alt_syscall_table;
	struct container_mount *mounts;
	size_t num_mounts;
	struct container_device *devices;
	size_t num_devices;
	const char *run_setfiles;
};
80
81struct container_config *container_config_create()
82{
83 return calloc(1, sizeof(struct container_config));
84}
85
86void container_config_destroy(struct container_config *c)
87{
88 size_t i;
89
90 if (c == NULL)
91 return;
92 free(c->rootfs);
93 for (i = 0; i < c->num_args; ++i)
94 free(c->program_argv[i]);
95 free(c->program_argv);
96 free(c->uid_map);
97 free(c->gid_map);
98 free(c->alt_syscall_table);
99 for (i = 0; i < c->num_mounts; ++i) {
100 free(c->mounts[i].name);
101 free(c->mounts[i].source);
102 free(c->mounts[i].destination);
103 free(c->mounts[i].type);
104 free(c->mounts[i].data);
105 }
106 free(c->mounts);
107 for (i = 0; i < c->num_devices; ++i) {
108 free(c->devices[i].path);
109 }
110 free(c->devices);
111 free(c);
112}
113
114int container_config_rootfs(struct container_config *c, const char *rootfs)
115{
116 c->rootfs = strdup(rootfs);
117 if (!c->rootfs)
118 return -ENOMEM;
119 return 0;
120}
121
122int container_config_program_argv(struct container_config *c,
123 char **argv, size_t num_args)
124{
125 size_t i;
126
127 c->num_args = num_args;
128 c->program_argv = calloc(num_args + 1, sizeof(char *));
129 if (!c->program_argv)
130 return -ENOMEM;
131 for (i = 0; i < num_args; ++i) {
132 c->program_argv[i] = strdup(argv[i]);
133 if (!c->program_argv[i])
134 return -ENOMEM;
135 }
136 c->program_argv[num_args] = NULL;
137 return 0;
138}
139
140int container_config_uid_map(struct container_config *c, const char *uid_map)
141{
142 c->uid_map = strdup(uid_map);
143 if (!c->uid_map)
144 return -ENOMEM;
145 return 0;
146}
147
148int container_config_gid_map(struct container_config *c, const char *gid_map)
149{
150 c->gid_map = strdup(gid_map);
151 if (!c->gid_map)
152 return -ENOMEM;
153 return 0;
154}
155
156int container_config_alt_syscall_table(struct container_config *c,
157 const char *alt_syscall_table)
158{
159 c->alt_syscall_table = strdup(alt_syscall_table);
160 if (!c->alt_syscall_table)
161 return -ENOMEM;
162 return 0;
163}
164
165int container_config_add_mount(struct container_config *c,
166 const char *name,
167 const char *source,
168 const char *destination,
169 const char *type,
170 const char *data,
171 int flags,
172 int uid,
173 int gid,
174 int mode,
175 int mount_in_ns,
176 int create)
177{
178 struct container_mount *mount_ptr;
179
180 if (name == NULL || source == NULL ||
181 destination == NULL || type == NULL)
182 return -EINVAL;
183
184 mount_ptr = realloc(c->mounts,
185 sizeof(c->mounts[0]) * (c->num_mounts + 1));
186 if (!mount_ptr)
187 return -ENOMEM;
188 c->mounts = mount_ptr;
189 c->mounts[c->num_mounts].name = strdup(name);
190 if (!c->mounts[c->num_mounts].name)
191 return -ENOMEM;
192 c->mounts[c->num_mounts].source = strdup(source);
193 if (!c->mounts[c->num_mounts].source)
194 return -ENOMEM;
195 c->mounts[c->num_mounts].destination = strdup(destination);
196 if (!c->mounts[c->num_mounts].destination)
197 return -ENOMEM;
198 c->mounts[c->num_mounts].type = strdup(type);
199 if (!c->mounts[c->num_mounts].type)
200 return -ENOMEM;
201 if (data) {
202 c->mounts[c->num_mounts].data = strdup(data);
203 if (!c->mounts[c->num_mounts].data)
204 return -ENOMEM;
205 } else {
206 c->mounts[c->num_mounts].data = NULL;
207 }
208 c->mounts[c->num_mounts].flags = flags;
209 c->mounts[c->num_mounts].uid = uid;
210 c->mounts[c->num_mounts].gid = gid;
211 c->mounts[c->num_mounts].mode = mode;
212 c->mounts[c->num_mounts].mount_in_ns = mount_in_ns;
213 c->mounts[c->num_mounts].create = create;
214 ++c->num_mounts;
215 return 0;
216}
217
218int container_config_add_device(struct container_config *c,
219 char type,
220 const char *path,
221 int fs_permissions,
222 int major,
223 int minor,
224 int uid,
225 int gid,
226 int read_allowed,
227 int write_allowed,
228 int modify_allowed)
229{
230 struct container_device *dev_ptr;
231
232 if (path == NULL)
233 return -EINVAL;
234 dev_ptr = realloc(c->devices,
235 sizeof(c->devices[0]) * (c->num_devices + 1));
236 if (!dev_ptr)
237 return -ENOMEM;
238 c->devices = dev_ptr;
239 c->devices[c->num_devices].type = type;
240 c->devices[c->num_devices].path = strdup(path);
241 if (!c->devices[c->num_devices].path)
242 return -ENOMEM;
243 c->devices[c->num_devices].fs_permissions = fs_permissions;
244 c->devices[c->num_devices].major = major;
245 c->devices[c->num_devices].minor = minor;
246 c->devices[c->num_devices].uid = uid;
247 c->devices[c->num_devices].gid = gid;
248 c->devices[c->num_devices].read_allowed = read_allowed;
249 c->devices[c->num_devices].write_allowed = write_allowed;
250 c->devices[c->num_devices].modify_allowed = modify_allowed;
251 ++c->num_devices;
252 return 0;
253}
254
Dylan Reid2bd9ea92016-04-07 20:57:47 -0700255void container_config_run_setfiles(struct container_config *c,
256 const char *setfiles_cmd)
257{
258 c->run_setfiles = setfiles_cmd;
259}
Dylan Reid837c74a2016-01-22 17:25:21 -0800260
261/*
262 * Container manipulation
263 */
/*
 * Run-time state of one container.
 */
struct container {
	struct container_config *config; /* Freed by container_destroy(). */
	struct container_cgroup *cgroup; /* Created in container_new(). */
	struct minijail *jail;		/* Created in container_start(). */
	pid_t init_pid;			/* Pid reported by minijail at start. */
	char *runfs;			/* Temp dir created under rundir. */
	char *rundir;			/* Copy of the rundir argument. */
	char *runfsroot;		/* "<runfs>/root" mount point. */
	char *pid_file_path;		/* "<runfs>/container.pid". */
	const char *name;		/* Caller's pointer; not copied. */
};
275
276struct container *container_new(const char *name,
277 const char *rundir,
278 struct container_config *config)
279{
280 struct container *c;
281
282 if (!config)
283 return NULL;
284 if (!config->program_argv || !config->program_argv[0])
285 return NULL;
286
287 c = calloc(1, sizeof(*c));
288 c->name = name;
289 c->config = config;
290 c->cgroup = container_cgroup_new(name, "/sys/fs/cgroup");
291 c->rundir = strdup(rundir);
292 if (!c->rundir)
293 return NULL;
294 return c;
295}
296
297void container_destroy(struct container *c)
298{
299 container_config_destroy(c->config);
300 container_cgroup_destroy(c->cgroup);
301 free(c->rundir);
302 free(c);
303}
304
/*
 * Creates the directory `path`, then applies `mode` and the given owner.
 *
 * The explicit chmod() after mkdir() applies `mode` again to the new
 * directory.  Stops at the first step that fails.
 *
 * Returns 0 on success or the negated errno of the failing call.
 */
static int make_dir(const char *path, int uid, int gid, int mode)
{
	int rc = 0;

	if (mkdir(path, mode) != 0)
		rc = -errno;
	else if (chmod(path, mode) != 0)
		rc = -errno;
	else if (chown(path, uid, gid) != 0)
		rc = -errno;

	return rc;
}
315
/*
 * Creates the file `path` if it does not exist and sets its owner.
 *
 * `mode` is passed to open() and so only takes effect when the file is
 * created by this call.
 *
 * Returns 0 on success or the negated errno of the failing call.
 */
static int touch_file(const char *path, int uid, int gid, int mode)
{
	int rc = 0;
	int fd = open(path, O_RDWR | O_CREAT, mode);

	if (fd < 0)
		return -errno;

	/* Capture errno before close(), which is allowed to overwrite it. */
	if (fchown(fd, uid, gid))
		rc = -errno;
	close(fd);

	return rc;
}
329
330/* Make sure the mount target exists in the new rootfs. Create if needed and
331 * possible.
332 */
333static int setup_mount_destination(const struct container_mount *mnt,
334 const char *dest)
335{
336 int rc;
337 struct stat st_buf;
338
339 rc = stat(dest, &st_buf);
340 if (rc == 0) /* destination exists */
341 return 0;
342
343 /* Try to create the destination. Either make directory or touch a file
344 * depending on the source type.
345 */
346 rc = stat(mnt->source, &st_buf);
347 if (rc || S_ISDIR(st_buf.st_mode) || S_ISBLK(st_buf.st_mode))
348 return make_dir(dest, mnt->uid, mnt->gid, mnt->mode);
349
350 return touch_file(dest, mnt->uid, mnt->gid, mnt->mode);
351}
352
Dylan Reid2bd9ea92016-04-07 20:57:47 -0700353/* Fork and exec the setfiles command to configure the selinux policy. */
354static int run_setfiles_command(const struct container *c, const char *dest)
355{
356 int rc;
357 int status;
358 int pid;
359 char *context_path;
360
361 if (!c->config->run_setfiles)
362 return 0;
363
364 if (asprintf(&context_path, "%s/file_contexts",
365 c->runfsroot) < 0)
366 return -errno;
367
368 pid = fork();
369 if (pid == 0) {
370 const char *argv[] = {
371 c->config->run_setfiles,
372 "-r",
373 c->runfsroot,
374 context_path,
375 dest,
376 NULL,
377 };
378 const char *env[] = {
379 NULL,
380 };
381
382 execve(argv[0], (char *const*)argv, (char *const*)env);
383
384 /* Command failed to exec if execve returns. */
385 _exit(-errno);
386 }
387 free(context_path);
388 if (pid < 0)
389 return -errno;
390 do {
391 rc = waitpid(pid, &status, 0);
392 } while (rc == -1 && errno == EINTR);
393 if (rc < 0)
394 return -errno;
395 return status;
396}
397
Dylan Reid837c74a2016-01-22 17:25:21 -0800398int container_start(struct container *c)
399{
400 int rc;
401 unsigned int i;
402 const char *rootfs = c->config->rootfs;
403 char *runfs_template;
404
405 if (asprintf(&runfs_template, "%s/%s_XXXXXX", c->rundir, c->name) < 0)
406 return -errno;
407
408 c->runfs = mkdtemp(runfs_template);
409 if (!c->runfs) {
410 free(runfs_template);
411 return -errno;
412 }
413 if (asprintf(&c->runfsroot, "%s/root", c->runfs) < 0) {
414 free(runfs_template);
415 return -errno;
416 }
417
418 rc = mkdir(c->runfsroot, 0660);
419 if (rc)
420 goto error_rmdir;
421
422 rc = mount(rootfs, c->runfsroot, "", MS_BIND | MS_RDONLY | MS_NOEXEC,
423 NULL);
424 if (rc)
425 goto error_rmdir;
426
427 c->jail = minijail_new();
428
429 for (i = 0; i < c->config->num_mounts; ++i) {
430 const struct container_mount *mnt = &c->config->mounts[i];
431 char *dest;
432
433 if (asprintf(&dest, "%s%s", c->runfsroot, mnt->destination) < 0)
434 goto error_rmdir;
435
436 if (mnt->create) {
437 rc = setup_mount_destination(mnt, dest);
438 if (rc) {
439 free(dest);
440 goto error_rmdir;
441 }
442 }
443 if (mnt->mount_in_ns) {
444 /*
445 * We can mount this with minijail.
446 * If relative to rootfs, append source to rootfs.
447 */
448 char *tmpsrc = NULL;
449 if ((mnt->flags & MS_BIND) && mnt->source[0] != '/') {
450 if (asprintf(&tmpsrc, "%s/%s", c->runfsroot,
451 mnt->source) < 0) {
452 free(dest);
453 goto error_rmdir;
454 }
455 }
456 rc = minijail_mount(c->jail,
457 tmpsrc ? tmpsrc : mnt->source,
458 mnt->destination, mnt->type,
459 mnt->flags);
460 free(tmpsrc);
461 if (rc) {
462 free(dest);
463 goto error_rmdir;
464 }
465 } else {
466 /*
467 * Mount this externally and unmount it on exit. Don't
468 * allow execution from external mounts.
469 */
470 rc = mount(mnt->source, dest, mnt->type,
471 mnt->flags | MS_NOEXEC, mnt->data);
472 if (rc) {
473 free(dest);
474 goto error_rmdir;
475 }
Dylan Reid2bd9ea92016-04-07 20:57:47 -0700476
477 rc = run_setfiles_command(c, dest);
478 if (rc) {
479 free(dest);
480 goto error_rmdir;
481 }
482
Dylan Reid837c74a2016-01-22 17:25:21 -0800483 }
484 free(dest);
485 }
486
487 c->cgroup->ops->deny_all_devices(c->cgroup);
488
489 for (i = 0; i < c->config->num_devices; i++) {
490 const struct container_device *dev = &c->config->devices[i];
491 int mode;
492 char *path;
493
494 switch (dev->type) {
495 case 'b':
496 mode = S_IFBLK;
497 break;
498 case 'c':
499 mode = S_IFCHR;
500 break;
501 default:
502 goto error_rmdir;
503 }
504 mode |= dev->fs_permissions;
505
506 if (asprintf(&path, "%s%s", c->runfsroot, dev->path) < 0)
507 goto error_rmdir;
508 if (dev->minor >= 0) {
509 rc = mknod(path, mode, makedev(dev->major, dev->minor));
510 if (rc && errno != EEXIST) {
511 free(path);
512 goto error_rmdir;
513 }
514 rc = chown(path, dev->uid, dev->gid);
515 if (rc) {
516 free(path);
517 goto error_rmdir;
518 }
519 rc = chmod(path, dev->fs_permissions);
520 free(path);
521 if (rc)
522 goto error_rmdir;
523 }
524
525 rc = c->cgroup->ops->add_device(c->cgroup, dev->major,
526 dev->minor, dev->read_allowed,
527 dev->write_allowed,
528 dev->modify_allowed, dev->type);
529 if (rc)
530 goto error_rmdir;
531 }
532
533 /* Setup and start the container with libminijail. */
534 if (asprintf(&c->pid_file_path, "%s/container.pid", c->runfs) < 0)
535 goto error_rmdir;
536 minijail_write_pid_file(c->jail, c->pid_file_path);
537 minijail_reset_signal_mask(c->jail);
538
539 /* Setup container namespaces. */
540 minijail_namespace_ipc(c->jail);
541 minijail_namespace_vfs(c->jail);
542 minijail_namespace_net(c->jail);
543 minijail_namespace_pids(c->jail);
544/* TODO(dgreid) - Enable user namespaces
545 minijail_namespace_user(c->jail);
546 rc = minijail_uidmap(c->jail, c->config->uid_map);
547 if (rc)
548 goto error_rmdir;
549 rc = minijail_gidmap(c->jail, c->config->gid_map);
550 if (rc)
551 goto error_rmdir;
552*/
553
554 rc = minijail_enter_pivot_root(c->jail, c->runfsroot);
555 if (rc)
556 goto error_rmdir;
557
558 /* Add the cgroups configured above. */
559 rc = minijail_add_to_cgroup(c->jail, cgroup_cpu_tasks_path(c->cgroup));
560 if (rc)
561 goto error_rmdir;
562 rc = minijail_add_to_cgroup(c->jail,
563 cgroup_cpuacct_tasks_path(c->cgroup));
564 if (rc)
565 goto error_rmdir;
566 rc = minijail_add_to_cgroup(c->jail,
567 cgroup_devices_tasks_path(c->cgroup));
568 if (rc)
569 goto error_rmdir;
570 rc = minijail_add_to_cgroup(c->jail,
571 cgroup_freezer_tasks_path(c->cgroup));
572 if (rc)
573 goto error_rmdir;
574
575 if (c->config->alt_syscall_table)
576 minijail_use_alt_syscall(c->jail, c->config->alt_syscall_table);
577
578 minijail_run_as_init(c->jail);
579
580 /* Last mount is to make '/' executable in the container. */
581 rc = minijail_mount(c->jail, rootfs, "/", "",
582 MS_REMOUNT | MS_RDONLY);
583 if (rc)
584 goto error_rmdir;
585
586 rc = minijail_run_pid_pipes_no_preload(c->jail,
587 c->config->program_argv[0],
588 c->config->program_argv,
589 &c->init_pid, NULL, NULL,
590 NULL);
591 if (rc)
592 goto error_rmdir;
593 return 0;
594
595error_rmdir:
596 umount(c->runfsroot);
597 rmdir(c->runfsroot);
598 unlink(c->pid_file_path);
599 free(c->pid_file_path);
600 rmdir(c->runfs);
601 free(c->runfsroot);
602 free(c->runfs);
603 return rc;
604}
605
606const char *container_root(struct container *c)
607{
608 return c->runfs;
609}
610
611int container_pid(struct container *c)
612{
613 return c->init_pid;
614}
615
616static int container_teardown(struct container *c)
617{
618 int i;
619 int ret = 0;
620
621 /*
622 * Unmount anything we mounted in this mount namespace in the opposite
623 * order that they were mounted.
624 */
625 for (i = (int)c->config->num_mounts - 1; i >= 0; --i) {
626 const struct container_mount *mnt = &c->config->mounts[i];
627 char *dest;
628
629 if (mnt->mount_in_ns)
630 continue;
631 if (asprintf(&dest, "%s%s", c->runfsroot, mnt->destination) < 0)
632 continue;
633 if (umount(dest))
634 ret = -errno;
635 free(dest);
636 }
637 if (umount(c->runfsroot))
638 ret = -errno;
639 if (rmdir(c->runfsroot))
640 ret = -errno;
641 if (unlink(c->pid_file_path))
642 ret = -errno;
643 if (rmdir(c->runfs))
644 ret = -errno;
645 free(c->pid_file_path);
646 free(c->runfsroot);
647 free(c->runfs);
648 return ret;
649}
650
651int container_wait(struct container *c)
652{
653 minijail_wait(c->jail);
654 return container_teardown(c);
655}
656
657int container_kill(struct container *c)
658{
659 int rc;
660
661 rc = kill(c->init_pid, SIGKILL);
662 if (rc)
663 return -errno;
664 return container_wait(c);
665}