blob: 7bb3f1f88cd49f958494153881df33b72e5ec295 [file] [log] [blame]
Dylan Reid837c74a2016-01-22 17:25:21 -08001/* Copyright 2016 The Chromium OS Authors. All rights reserved.
2 * Use of this source code is governed by a BSD-style license that can be
3 * found in the LICENSE file.
4 */
5
6#define _GNU_SOURCE /* For asprintf */
7
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#if USE_device_mapper
#include <libdevmapper.h>
#endif
#include <malloc.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

#include <linux/loop.h>
25
Dylan Reid837c74a2016-01-22 17:25:21 -080026#include "container_cgroup.h"
27#include "libcontainer.h"
28#include "libminijail.h"
29
/*
 * Frees |ptr| and then resets it to NULL so the stale pointer cannot be
 * freed or dereferenced again. |ptr| must be a modifiable lvalue; note that
 * it is evaluated twice.
 */
#define FREE_AND_NULL(ptr) \
do { \
	free(ptr); \
	(ptr) = NULL; \
} while (0)
35
/* Capacity of the setfiles argv array, including its NULL terminator. */
#define MAX_NUM_SETFILES_ARGS 128

/* Control node queried for a free loop device number. */
static const char loopdev_ctl[] = "/dev/loop-control";
#if USE_device_mapper
/* Prefix prepended to a device mapper name to form its /dev path. */
static const char dm_dev_prefix[] = "/dev/mapper/";
#endif

static int container_teardown(struct container *c);
44
/*
 * Replaces the string held in |*dest| with a heap copy of |src|.
 *
 * The previous value of |*dest| is released only after the copy has been
 * made, so |*dest| is left untouched when the call fails.
 *
 * Returns 0 on success, -EINVAL if |src| is NULL, or -ENOMEM if the copy
 * cannot be allocated.
 */
static int strdup_and_free(char **dest, const char *src)
{
	char *copy;

	/* strdup(NULL) is undefined behavior, so reject it explicitly. */
	if (!src)
		return -EINVAL;

	copy = strdup(src);
	if (!copy)
		return -ENOMEM;
	/* free(NULL) is a no-op, so no guard is needed here. */
	free(*dest);
	*dest = copy;
	return 0;
}
55
/*
 * One filesystem to mount for the container. The string members are heap
 * copies made by container_config_add_mount().
 */
struct container_mount {
	char *name;
	char *source;
	char *destination;
	char *type;
	char *data;		/* Optional; NULL when not supplied */
	char *verity;		/* Optional; NULL when not supplied */
	int flags;
	int uid;		/* Owner used when the destination is created */
	int gid;		/* Group used when the destination is created */
	int mode;		/* Mode used when the destination is created */
	int mount_in_ns;	/* True if mount should happen in new vfs ns */
	int create;		/* True if target should be created if it doesn't exist */
	int loopback;		/* True if target should be mounted via loopback */
};
71
/* One device node to create for the container. */
struct container_device {
	char type;		/* 'c' or 'b' for char or block */
	char *path;		/* Heap copy of the node path */
	int fs_permissions;
	int major;
	int minor;
	int copy_minor;		/* Copy the minor from existing node, ignores |minor| */
	int uid;
	int gid;
};
82
/* One device cgroup permission entry. */
struct container_cgroup_device {
	int allow;
	char type;
	int major;		/* -1 means all */
	int minor;		/* -1 means all */
	int read;
	int write;
	int modify;
};
92
/* CPU cgroup parameters, filled in by the container_config_set_cpu_*() calls. */
struct container_cpu_cgroup {
	int shares;
	int quota;
	int period;
	int rt_runtime;
	int rt_period;
};
100
Dylan Reid837c74a2016-01-22 17:25:21 -0800101/*
102 * Structure that configures how the container is run.
103 *
Mike Frysingerb22acdf2017-01-08 02:02:35 -0500104 * config_root - Path to the root of the container itself.
Dylan Reid837c74a2016-01-22 17:25:21 -0800105 * rootfs - Path to the root of the container's filesystem.
Luis Hector Chavezc240e7e2016-09-22 10:33:03 -0700106 * rootfs_mount_flags - Flags that will be passed to mount() for the rootfs.
Keshav Santhanam0e4c3282016-07-14 10:25:16 -0700107 * premounted_runfs - Path to where the container will be run.
108 * pid_file_path - Path to the file where the pid should be written.
Dylan Reid837c74a2016-01-22 17:25:21 -0800109 * program_argv - The program to run and args, e.g. "/sbin/init".
110 * num_args - Number of args in program_argv.
Dylan Reid1874feb2016-06-22 17:53:50 -0700111 * uid - The uid the container will run as.
Dylan Reid837c74a2016-01-22 17:25:21 -0800112 * uid_map - Mapping of UIDs in the container, e.g. "0 100000 1024"
Dylan Reid1874feb2016-06-22 17:53:50 -0700113 * gid - The gid the container will run as.
Dylan Reid837c74a2016-01-22 17:25:21 -0800114 * gid_map - Mapping of GIDs in the container, e.g. "0 100000 1024"
115 * alt_syscall_table - Syscall table to use or NULL if none.
116 * mounts - Filesystems to mount in the new namespace.
117 * num_mounts - Number of above.
118 * devices - Device nodes to create.
119 * num_devices - Number of above.
Dylan Reid4843d6b2017-03-31 18:14:30 -0700120 * cgroup_devices - Device node cgroup permissions.
121 * num_cgroup_devices - Number of above.
Dylan Reid2bd9ea92016-04-07 20:57:47 -0700122 * run_setfiles - Should run setfiles on mounts to enable selinux.
Chinyue Chenfac909e2016-06-24 14:17:42 +0800123 * cpu_cgparams - CPU cgroup params.
Dylan Reid9e724af2016-07-21 09:58:07 -0700124 * cgroup_parent - Parent dir for cgroup creation
125 * cgroup_owner - uid to own the created cgroups
Dmitry Torokhov14eef722016-09-27 16:40:37 -0700126 * cgroup_group - gid to own the created cgroups
Keshav Santhanam1b6bf672016-08-10 18:35:12 -0700127 * share_host_netns - Enable sharing of the host network namespace.
Dylan Reidc4335842016-11-11 10:24:52 -0800128 * keep_fds_open - Allow the child process to keep open FDs (for stdin/out/err).
Dylan Reid837c74a2016-01-22 17:25:21 -0800129 */
130struct container_config {
Mike Frysingerb22acdf2017-01-08 02:02:35 -0500131 char *config_root;
Dylan Reid837c74a2016-01-22 17:25:21 -0800132 char *rootfs;
Luis Hector Chavezc240e7e2016-09-22 10:33:03 -0700133 unsigned long rootfs_mount_flags;
Keshav Santhanam0e4c3282016-07-14 10:25:16 -0700134 char *premounted_runfs;
135 char *pid_file_path;
Dylan Reid837c74a2016-01-22 17:25:21 -0800136 char **program_argv;
137 size_t num_args;
Dylan Reid1874feb2016-06-22 17:53:50 -0700138 uid_t uid;
Dylan Reid837c74a2016-01-22 17:25:21 -0800139 char *uid_map;
Dylan Reid1874feb2016-06-22 17:53:50 -0700140 gid_t gid;
Dylan Reid837c74a2016-01-22 17:25:21 -0800141 char *gid_map;
142 char *alt_syscall_table;
143 struct container_mount *mounts;
144 size_t num_mounts;
145 struct container_device *devices;
146 size_t num_devices;
Dylan Reid4843d6b2017-03-31 18:14:30 -0700147 struct container_cgroup_device *cgroup_devices;
148 size_t num_cgroup_devices;
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700149 char *run_setfiles;
Chinyue Chenfac909e2016-06-24 14:17:42 +0800150 struct container_cpu_cgroup cpu_cgparams;
Dylan Reid9e724af2016-07-21 09:58:07 -0700151 char *cgroup_parent;
152 uid_t cgroup_owner;
Dmitry Torokhov14eef722016-09-27 16:40:37 -0700153 gid_t cgroup_group;
Keshav Santhanam1b6bf672016-08-10 18:35:12 -0700154 int share_host_netns;
Dylan Reidc4335842016-11-11 10:24:52 -0800155 int keep_fds_open;
Dylan Reid837c74a2016-01-22 17:25:21 -0800156};
157
158struct container_config *container_config_create()
159{
160 return calloc(1, sizeof(struct container_config));
161}
162
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700163static void container_free_program_args(struct container_config *c)
164{
165 int i;
166
167 if (!c->program_argv)
168 return;
169 for (i = 0; i < c->num_args; ++i) {
170 FREE_AND_NULL(c->program_argv[i]);
171 }
172 FREE_AND_NULL(c->program_argv);
173}
174
175static void container_config_free_mount(struct container_mount *mount)
176{
177 FREE_AND_NULL(mount->name);
178 FREE_AND_NULL(mount->source);
179 FREE_AND_NULL(mount->destination);
180 FREE_AND_NULL(mount->type);
181 FREE_AND_NULL(mount->data);
182}
183
184static void container_config_free_device(struct container_device *device)
185{
186 FREE_AND_NULL(device->path);
187}
188
Dylan Reid837c74a2016-01-22 17:25:21 -0800189void container_config_destroy(struct container_config *c)
190{
191 size_t i;
192
193 if (c == NULL)
194 return;
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700195 FREE_AND_NULL(c->rootfs);
196 container_free_program_args(c);
Keshav Santhanam0e4c3282016-07-14 10:25:16 -0700197 FREE_AND_NULL(c->premounted_runfs);
198 FREE_AND_NULL(c->pid_file_path);
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700199 FREE_AND_NULL(c->uid_map);
200 FREE_AND_NULL(c->gid_map);
201 FREE_AND_NULL(c->alt_syscall_table);
Dylan Reid837c74a2016-01-22 17:25:21 -0800202 for (i = 0; i < c->num_mounts; ++i) {
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700203 container_config_free_mount(&c->mounts[i]);
Dylan Reid837c74a2016-01-22 17:25:21 -0800204 }
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700205 FREE_AND_NULL(c->mounts);
Dylan Reid837c74a2016-01-22 17:25:21 -0800206 for (i = 0; i < c->num_devices; ++i) {
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700207 container_config_free_device(&c->devices[i]);
Dylan Reid837c74a2016-01-22 17:25:21 -0800208 }
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700209 FREE_AND_NULL(c->devices);
210 FREE_AND_NULL(c->run_setfiles);
Dylan Reid9e724af2016-07-21 09:58:07 -0700211 FREE_AND_NULL(c->cgroup_parent);
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700212 FREE_AND_NULL(c);
Dylan Reid837c74a2016-01-22 17:25:21 -0800213}
214
Mike Frysingerb22acdf2017-01-08 02:02:35 -0500215int container_config_config_root(struct container_config *c,
216 const char *config_root)
217{
218 return strdup_and_free(&c->config_root, config_root);
219}
220
221const char *container_config_get_config_root(const struct container_config *c)
222{
223 return c->config_root;
224}
225
Dylan Reid837c74a2016-01-22 17:25:21 -0800226int container_config_rootfs(struct container_config *c, const char *rootfs)
227{
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700228 return strdup_and_free(&c->rootfs, rootfs);
Dylan Reid837c74a2016-01-22 17:25:21 -0800229}
230
Dylan Reid11456722016-05-02 11:24:50 -0700231const char *container_config_get_rootfs(const struct container_config *c)
232{
233 return c->rootfs;
234}
235
Luis Hector Chavezc240e7e2016-09-22 10:33:03 -0700236void container_config_rootfs_mount_flags(struct container_config *c,
237 unsigned long rootfs_mount_flags)
238{
239 /* Since we are going to add MS_REMOUNT anyways, add it here so we can
240 * simply check against zero later. MS_BIND is also added to avoid
241 * re-mounting the original filesystem, since the rootfs is always
242 * bind-mounted.
243 */
244 c->rootfs_mount_flags = MS_REMOUNT | MS_BIND | rootfs_mount_flags;
245}
246
247unsigned long container_config_get_rootfs_mount_flags(
248 const struct container_config *c)
249{
250 return c->rootfs_mount_flags;
251}
252
Keshav Santhanam0e4c3282016-07-14 10:25:16 -0700253int container_config_premounted_runfs(struct container_config *c, const char *runfs)
254{
255 return strdup_and_free(&c->premounted_runfs, runfs);
256}
257
258const char *container_config_get_premounted_runfs(const struct container_config *c)
259{
260 return c->premounted_runfs;
261}
262
263int container_config_pid_file(struct container_config *c, const char *path)
264{
265 return strdup_and_free(&c->pid_file_path, path);
266}
267
268const char *container_config_get_pid_file(const struct container_config *c)
269{
270 return c->pid_file_path;
271}
272
Dylan Reid837c74a2016-01-22 17:25:21 -0800273int container_config_program_argv(struct container_config *c,
Dylan Reid17fd53f2016-11-18 19:14:41 -0800274 const char **argv, size_t num_args)
Dylan Reid837c74a2016-01-22 17:25:21 -0800275{
276 size_t i;
277
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700278 container_free_program_args(c);
Dylan Reid837c74a2016-01-22 17:25:21 -0800279 c->num_args = num_args;
280 c->program_argv = calloc(num_args + 1, sizeof(char *));
281 if (!c->program_argv)
282 return -ENOMEM;
283 for (i = 0; i < num_args; ++i) {
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700284 if (strdup_and_free(&c->program_argv[i], argv[i]))
285 goto error_free_return;
Dylan Reid837c74a2016-01-22 17:25:21 -0800286 }
287 c->program_argv[num_args] = NULL;
288 return 0;
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700289
290error_free_return:
291 container_free_program_args(c);
292 return -ENOMEM;
Dylan Reid837c74a2016-01-22 17:25:21 -0800293}
294
Dylan Reid11456722016-05-02 11:24:50 -0700295size_t container_config_get_num_program_args(const struct container_config *c)
296{
297 return c->num_args;
298}
299
300const char *container_config_get_program_arg(const struct container_config *c,
301 size_t index)
302{
303 if (index >= c->num_args)
304 return NULL;
305 return c->program_argv[index];
306}
307
Dylan Reid1874feb2016-06-22 17:53:50 -0700308void container_config_uid(struct container_config *c, uid_t uid)
309{
310 c->uid = uid;
311}
312
313uid_t container_config_get_uid(const struct container_config *c)
314{
315 return c->uid;
316}
317
Dylan Reid837c74a2016-01-22 17:25:21 -0800318int container_config_uid_map(struct container_config *c, const char *uid_map)
319{
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700320 return strdup_and_free(&c->uid_map, uid_map);
Dylan Reid837c74a2016-01-22 17:25:21 -0800321}
322
Dylan Reid1874feb2016-06-22 17:53:50 -0700323void container_config_gid(struct container_config *c, gid_t gid)
324{
325 c->gid = gid;
326}
327
328gid_t container_config_get_gid(const struct container_config *c)
329{
330 return c->gid;
331}
332
Dylan Reid837c74a2016-01-22 17:25:21 -0800333int container_config_gid_map(struct container_config *c, const char *gid_map)
334{
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700335 return strdup_and_free(&c->gid_map, gid_map);
Dylan Reid837c74a2016-01-22 17:25:21 -0800336}
337
338int container_config_alt_syscall_table(struct container_config *c,
339 const char *alt_syscall_table)
340{
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700341 return strdup_and_free(&c->alt_syscall_table, alt_syscall_table);
Dylan Reid837c74a2016-01-22 17:25:21 -0800342}
343
344int container_config_add_mount(struct container_config *c,
345 const char *name,
346 const char *source,
347 const char *destination,
348 const char *type,
349 const char *data,
Mike Frysinger05e594e2017-01-10 02:11:08 -0500350 const char *verity,
Dylan Reid837c74a2016-01-22 17:25:21 -0800351 int flags,
352 int uid,
353 int gid,
354 int mode,
355 int mount_in_ns,
Mike Frysinger412dbd22017-01-06 01:50:34 -0500356 int create,
357 int loopback)
Dylan Reid837c74a2016-01-22 17:25:21 -0800358{
359 struct container_mount *mount_ptr;
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700360 struct container_mount *current_mount;
Dylan Reid837c74a2016-01-22 17:25:21 -0800361
362 if (name == NULL || source == NULL ||
363 destination == NULL || type == NULL)
364 return -EINVAL;
365
366 mount_ptr = realloc(c->mounts,
367 sizeof(c->mounts[0]) * (c->num_mounts + 1));
368 if (!mount_ptr)
369 return -ENOMEM;
370 c->mounts = mount_ptr;
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700371 current_mount = &c->mounts[c->num_mounts];
372 memset(current_mount, 0, sizeof(struct container_mount));
373
374 if (strdup_and_free(&current_mount->name, name))
375 goto error_free_return;
376 if (strdup_and_free(&current_mount->source, source))
377 goto error_free_return;
378 if (strdup_and_free(&current_mount->destination, destination))
379 goto error_free_return;
380 if (strdup_and_free(&current_mount->type, type))
381 goto error_free_return;
382 if (data && strdup_and_free(&current_mount->data, data))
383 goto error_free_return;
Mike Frysinger05e594e2017-01-10 02:11:08 -0500384 if (verity && strdup_and_free(&current_mount->verity, verity))
385 goto error_free_return;
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700386 current_mount->flags = flags;
387 current_mount->uid = uid;
388 current_mount->gid = gid;
389 current_mount->mode = mode;
390 current_mount->mount_in_ns = mount_in_ns;
391 current_mount->create = create;
Mike Frysinger412dbd22017-01-06 01:50:34 -0500392 current_mount->loopback = loopback;
Dylan Reid837c74a2016-01-22 17:25:21 -0800393 ++c->num_mounts;
394 return 0;
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700395
396error_free_return:
397 container_config_free_mount(current_mount);
398 return -ENOMEM;
Dylan Reid837c74a2016-01-22 17:25:21 -0800399}
400
Dylan Reid4843d6b2017-03-31 18:14:30 -0700401int container_config_add_cgroup_device(struct container_config *c,
402 int allow,
403 char type,
404 int major,
405 int minor,
406 int read,
407 int write,
408 int modify)
409{
410 struct container_cgroup_device *dev_ptr;
411 struct container_cgroup_device *current_dev;
412
413 dev_ptr = realloc(c->cgroup_devices,
414 sizeof(c->cgroup_devices[0]) *
415 (c->num_cgroup_devices + 1));
416 if (!dev_ptr)
417 return -ENOMEM;
418 c->cgroup_devices = dev_ptr;
419
420 current_dev = &c->cgroup_devices[c->num_cgroup_devices];
421 memset(current_dev, 0, sizeof(struct container_cgroup_device));
422 current_dev->allow = allow;
423 current_dev->type = type;
424 current_dev->major = major;
425 current_dev->minor = minor;
426 current_dev->read = read;
427 current_dev->write = write;
428 current_dev->modify = modify;
429 ++c->num_cgroup_devices;
430
431 return 0;
432}
433
Dylan Reid837c74a2016-01-22 17:25:21 -0800434int container_config_add_device(struct container_config *c,
435 char type,
436 const char *path,
437 int fs_permissions,
438 int major,
439 int minor,
Dylan Reid355d5e42016-04-29 16:53:31 -0700440 int copy_minor,
Dylan Reid837c74a2016-01-22 17:25:21 -0800441 int uid,
442 int gid,
443 int read_allowed,
444 int write_allowed,
445 int modify_allowed)
446{
447 struct container_device *dev_ptr;
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700448 struct container_device *current_dev;
Dylan Reid837c74a2016-01-22 17:25:21 -0800449
450 if (path == NULL)
451 return -EINVAL;
Dylan Reid355d5e42016-04-29 16:53:31 -0700452 /* If using a dynamic minor number, ensure that minor is -1. */
453 if (copy_minor && (minor != -1))
454 return -EINVAL;
455
Dylan Reid837c74a2016-01-22 17:25:21 -0800456 dev_ptr = realloc(c->devices,
457 sizeof(c->devices[0]) * (c->num_devices + 1));
458 if (!dev_ptr)
459 return -ENOMEM;
460 c->devices = dev_ptr;
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700461 current_dev = &c->devices[c->num_devices];
462 memset(current_dev, 0, sizeof(struct container_device));
463
464 current_dev->type = type;
465 if (strdup_and_free(&current_dev->path, path))
466 goto error_free_return;
467 current_dev->fs_permissions = fs_permissions;
468 current_dev->major = major;
469 current_dev->minor = minor;
470 current_dev->copy_minor = copy_minor;
471 current_dev->uid = uid;
472 current_dev->gid = gid;
Dylan Reid4843d6b2017-03-31 18:14:30 -0700473 if (read_allowed || write_allowed || modify_allowed) {
474 if (container_config_add_cgroup_device(c,
475 1,
476 type,
477 major,
478 minor,
479 read_allowed,
480 write_allowed,
481 modify_allowed))
482 goto error_free_return;
483 }
Dylan Reid837c74a2016-01-22 17:25:21 -0800484 ++c->num_devices;
485 return 0;
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700486
487error_free_return:
488 container_config_free_device(current_dev);
489 return -ENOMEM;
Dylan Reid837c74a2016-01-22 17:25:21 -0800490}
491
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700492int container_config_run_setfiles(struct container_config *c,
Dylan Reid2bd9ea92016-04-07 20:57:47 -0700493 const char *setfiles_cmd)
494{
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700495 return strdup_and_free(&c->run_setfiles, setfiles_cmd);
Dylan Reid2bd9ea92016-04-07 20:57:47 -0700496}
Dylan Reid837c74a2016-01-22 17:25:21 -0800497
Dylan Reid11456722016-05-02 11:24:50 -0700498const char *container_config_get_run_setfiles(const struct container_config *c)
499{
500 return c->run_setfiles;
501}
502
Chinyue Chenfac909e2016-06-24 14:17:42 +0800503int container_config_set_cpu_shares(struct container_config *c, int shares)
504{
505 /* CPU shares must be 2 or higher. */
506 if (shares < 2)
507 return -EINVAL;
508
509 c->cpu_cgparams.shares = shares;
510 return 0;
511}
512
513int container_config_set_cpu_cfs_params(struct container_config *c,
514 int quota,
515 int period)
516{
517 /*
518 * quota could be set higher than period to utilize more than one CPU.
519 * quota could also be set as -1 to indicate the cgroup does not adhere
520 * to any CPU time restrictions.
521 */
522 if (quota <= 0 && quota != -1)
523 return -EINVAL;
524 if (period <= 0)
525 return -EINVAL;
526
527 c->cpu_cgparams.quota = quota;
528 c->cpu_cgparams.period = period;
529 return 0;
530}
531
532int container_config_set_cpu_rt_params(struct container_config *c,
533 int rt_runtime,
534 int rt_period)
535{
536 /*
537 * rt_runtime could be set as 0 to prevent the cgroup from using
538 * realtime CPU.
539 */
540 if (rt_runtime < 0 || rt_runtime >= rt_period)
541 return -EINVAL;
542
543 c->cpu_cgparams.rt_runtime = rt_runtime;
544 c->cpu_cgparams.rt_period = rt_period;
545 return 0;
546}
547
Chinyue Chen4f3fd682016-07-01 14:11:42 +0800548int container_config_get_cpu_shares(struct container_config *c)
549{
550 return c->cpu_cgparams.shares;
551}
552
553int container_config_get_cpu_quota(struct container_config *c)
554{
555 return c->cpu_cgparams.quota;
556}
557
558int container_config_get_cpu_period(struct container_config *c)
559{
560 return c->cpu_cgparams.period;
561}
562
563int container_config_get_cpu_rt_runtime(struct container_config *c)
564{
565 return c->cpu_cgparams.rt_runtime;
566}
567
568int container_config_get_cpu_rt_period(struct container_config *c)
569{
570 return c->cpu_cgparams.rt_period;
571}
572
Dylan Reid9e724af2016-07-21 09:58:07 -0700573int container_config_set_cgroup_parent(struct container_config *c,
574 const char *parent,
Dmitry Torokhov14eef722016-09-27 16:40:37 -0700575 uid_t cgroup_owner, gid_t cgroup_group)
Dylan Reid9e724af2016-07-21 09:58:07 -0700576{
577 c->cgroup_owner = cgroup_owner;
Dmitry Torokhov14eef722016-09-27 16:40:37 -0700578 c->cgroup_group = cgroup_group;
Dylan Reid9e724af2016-07-21 09:58:07 -0700579 return strdup_and_free(&c->cgroup_parent, parent);
580}
581
582const char *container_config_get_cgroup_parent(struct container_config *c)
583{
584 return c->cgroup_parent;
585}
586
Keshav Santhanam1b6bf672016-08-10 18:35:12 -0700587void container_config_share_host_netns(struct container_config *c)
588{
589 c->share_host_netns = 1;
590}
591
592int get_container_config_share_host_netns(struct container_config *c)
593{
594 return c->share_host_netns;
595}
596
Dylan Reidc4335842016-11-11 10:24:52 -0800597void container_config_keep_fds_open(struct container_config *c)
598{
599 c->keep_fds_open = 1;
600}
601
Dylan Reid837c74a2016-01-22 17:25:21 -0800602/*
603 * Container manipulation
604 */
/* State tracked for one container instance. */
struct container {
	struct container_cgroup *cgroup;
	struct minijail *jail;
	pid_t init_pid;
	char *config_root;
	char *runfs;
	char *rundir;		/* Heap copy made by container_new() */
	char *runfsroot;
	char *pid_file_path;
	char **ext_mounts;	/* Mounts made outside of the minijail */
	size_t num_ext_mounts;
	char **loopdevs;
	size_t num_loopdevs;
	char **device_mappers;
	size_t num_device_mappers;
	char *name;		/* Heap copy made by container_new() */
};
622
623struct container *container_new(const char *name,
Dylan Reide040c6b2016-05-02 18:49:02 -0700624 const char *rundir)
Dylan Reid837c74a2016-01-22 17:25:21 -0800625{
626 struct container *c;
627
Dylan Reid837c74a2016-01-22 17:25:21 -0800628 c = calloc(1, sizeof(*c));
Dylan Reidb435c682016-04-12 04:17:49 -0700629 if (!c)
630 return NULL;
Dylan Reid837c74a2016-01-22 17:25:21 -0800631 c->rundir = strdup(rundir);
Luis Hector Chavez8e7b6d52016-06-02 20:40:43 -0700632 c->name = strdup(name);
Dylan Reida9966422016-07-21 10:11:34 -0700633 if (!c->rundir || !c->name) {
Dylan Reid684975e2016-05-02 15:44:47 -0700634 container_destroy(c);
Dylan Reid837c74a2016-01-22 17:25:21 -0800635 return NULL;
Dylan Reidb435c682016-04-12 04:17:49 -0700636 }
Dylan Reid837c74a2016-01-22 17:25:21 -0800637 return c;
638}
639
640void container_destroy(struct container *c)
641{
Dylan Reid684975e2016-05-02 15:44:47 -0700642 if (c->cgroup)
643 container_cgroup_destroy(c->cgroup);
Luis Hector Chavez8e7b6d52016-06-02 20:40:43 -0700644 if (c->jail)
645 minijail_destroy(c->jail);
Mike Frysingerb22acdf2017-01-08 02:02:35 -0500646 FREE_AND_NULL(c->config_root);
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700647 FREE_AND_NULL(c->name);
648 FREE_AND_NULL(c->rundir);
649 FREE_AND_NULL(c);
Dylan Reid837c74a2016-01-22 17:25:21 -0800650}
651
/*
 * Given a uid/gid map of "inside1 outside1 length1, ...", and an id
 * inside of the user namespace, return the equivalent outside id, or
 * return < 0 on error.
 *
 * Each entry covers the ids inside .. inside + length - 1. A NULL |map|
 * is treated as "no mapping configured" and |id| is returned unchanged.
 *
 * Returns -EINVAL if an entry is malformed (missing field, negative number,
 * or trailing junk) or if no entry covers |id|, and -ERANGE if a number in
 * the map or the translated id does not fit in an int. Entries are checked
 * in order and the scan stops at the first match.
 */
static int get_userns_outside_id(const char *map, int id)
{
	const char *cursor = map;

	if (map == NULL)
		return id;

	while (*cursor != '\0') {
		long field[3];	/* inside, outside, length */
		size_t i;

		for (i = 0; i < 3; ++i) {
			char *end;

			errno = 0;
			field[i] = strtol(cursor, &end, 10);
			/* end == cursor means no digits were consumed. */
			if (end == cursor || field[i] < 0)
				return -EINVAL;
			if (errno == ERANGE || field[i] > INT_MAX)
				return -ERANGE;
			cursor = end;
		}

		/* An entry must be followed by a comma or the end of the map. */
		cursor += strspn(cursor, " \t\r\n");
		if (*cursor == ',')
			++cursor;
		else if (*cursor != '\0')
			return -EINVAL;

		/*
		 * Compare the offset against the length rather than computing
		 * inside + length, which could overflow.
		 */
		if (id >= field[0] && id - field[0] < field[2]) {
			const long offset = id - field[0];

			if (field[1] > INT_MAX - offset)
				return -ERANGE;
			return (int)(field[1] + offset);
		}
	}

	return -EINVAL;
}
694
/*
 * Creates directory |path|, then sets its mode and its owner/group.
 * The explicit chmod() is presumably there because mkdir() applies the
 * process umask to |mode| - confirm before removing it.
 *
 * Returns 0 on success or -errno from the first call that fails.
 */
static int make_dir(const char *path, int uid, int gid, int mode)
{
	/* Short-circuit evaluation keeps errno from the failing call. */
	if (mkdir(path, mode) != 0 ||
	    chmod(path, mode) != 0 ||
	    chown(path, uid, gid) != 0)
		return -errno;
	return 0;
}
705
/*
 * Creates |path| if it does not exist (using |mode|) and sets its owner and
 * group to |uid| and |gid|.
 *
 * Returns 0 on success or -errno from open() or fchown().
 */
static int touch_file(const char *path, int uid, int gid, int mode)
{
	int rc = 0;
	int fd = open(path, O_RDWR | O_CREAT, mode);

	if (fd < 0)
		return -errno;
	/* Capture errno right away: close() below may overwrite it. */
	if (fchown(fd, uid, gid) != 0)
		rc = -errno;
	close(fd);
	return rc;
}
719
720/* Make sure the mount target exists in the new rootfs. Create if needed and
721 * possible.
722 */
Stephen Barber1a398c72017-01-23 12:39:44 -0800723static int setup_mount_destination(const struct container_config *config,
724 const struct container_mount *mnt,
Dylan Reid2149be92016-04-28 18:38:57 -0700725 const char *source,
Dylan Reid837c74a2016-01-22 17:25:21 -0800726 const char *dest)
727{
Stephen Barber1a398c72017-01-23 12:39:44 -0800728 int uid_userns, gid_userns;
Dylan Reid837c74a2016-01-22 17:25:21 -0800729 int rc;
730 struct stat st_buf;
731
732 rc = stat(dest, &st_buf);
733 if (rc == 0) /* destination exists */
734 return 0;
735
736 /* Try to create the destination. Either make directory or touch a file
737 * depending on the source type.
738 */
Stephen Barber1a398c72017-01-23 12:39:44 -0800739 uid_userns = get_userns_outside_id(config->uid_map, mnt->uid);
740 if (uid_userns < 0)
741 return uid_userns;
742 gid_userns = get_userns_outside_id(config->gid_map, mnt->gid);
743 if (gid_userns < 0)
744 return gid_userns;
745
Dylan Reid2149be92016-04-28 18:38:57 -0700746 rc = stat(source, &st_buf);
Dylan Reid837c74a2016-01-22 17:25:21 -0800747 if (rc || S_ISDIR(st_buf.st_mode) || S_ISBLK(st_buf.st_mode))
Stephen Barber1a398c72017-01-23 12:39:44 -0800748 return make_dir(dest, uid_userns, gid_userns, mnt->mode);
Dylan Reid837c74a2016-01-22 17:25:21 -0800749
Stephen Barber1a398c72017-01-23 12:39:44 -0800750 return touch_file(dest, uid_userns, gid_userns, mnt->mode);
Dylan Reid837c74a2016-01-22 17:25:21 -0800751}
752
Dylan Reid2bd9ea92016-04-07 20:57:47 -0700753/* Fork and exec the setfiles command to configure the selinux policy. */
Dylan Reide040c6b2016-05-02 18:49:02 -0700754static int run_setfiles_command(const struct container *c,
755 const struct container_config *config,
Yusuke Sato91f11f02016-12-02 16:15:13 -0800756 char *const *destinations, size_t num_destinations)
Dylan Reid2bd9ea92016-04-07 20:57:47 -0700757{
758 int rc;
759 int status;
760 int pid;
761 char *context_path;
762
Dylan Reide040c6b2016-05-02 18:49:02 -0700763 if (!config->run_setfiles)
Dylan Reid2bd9ea92016-04-07 20:57:47 -0700764 return 0;
765
766 if (asprintf(&context_path, "%s/file_contexts",
767 c->runfsroot) < 0)
768 return -errno;
769
770 pid = fork();
771 if (pid == 0) {
Yusuke Sato91f11f02016-12-02 16:15:13 -0800772 size_t i;
773 size_t arg_index = 0;
774 const char *argv[MAX_NUM_SETFILES_ARGS];
Dylan Reid2bd9ea92016-04-07 20:57:47 -0700775 const char *env[] = {
776 NULL,
777 };
778
Yusuke Sato91f11f02016-12-02 16:15:13 -0800779 argv[arg_index++] = config->run_setfiles;
780 argv[arg_index++] = "-r";
781 argv[arg_index++] = c->runfsroot;
782 argv[arg_index++] = context_path;
783 if (arg_index + num_destinations >= MAX_NUM_SETFILES_ARGS)
784 _exit(-E2BIG);
785 for (i = 0; i < num_destinations; ++i) {
786 argv[arg_index++] = destinations[i];
787 }
788 argv[arg_index] = NULL;
789
Dylan Reid2bd9ea92016-04-07 20:57:47 -0700790 execve(argv[0], (char *const*)argv, (char *const*)env);
791
792 /* Command failed to exec if execve returns. */
793 _exit(-errno);
794 }
795 free(context_path);
796 if (pid < 0)
797 return -errno;
798 do {
799 rc = waitpid(pid, &status, 0);
800 } while (rc == -1 && errno == EINTR);
801 if (rc < 0)
802 return -errno;
803 return status;
804}
805
Mike Frysinger412dbd22017-01-06 01:50:34 -0500806/* Find a free loop device and attach it. */
807static int loopdev_setup(char **loopdev_ret, const char *source)
808{
809 int ret = 0;
810 int source_fd = -1;
811 int control_fd = -1;
812 int loop_fd = -1;
813 char *loopdev = NULL;
814
815 source_fd = open(source, O_RDONLY|O_CLOEXEC);
816 if (source_fd < 0)
817 goto error;
818
819 control_fd = open(loopdev_ctl, O_RDWR|O_NOFOLLOW|O_CLOEXEC);
820 if (control_fd < 0)
821 goto error;
822
823 while (1) {
824 int num = ioctl(control_fd, LOOP_CTL_GET_FREE);
825 if (num < 0)
826 goto error;
827
828 if (asprintf(&loopdev, "/dev/loop%i", num) < 0)
829 goto error;
830
831 loop_fd = open(loopdev, O_RDONLY|O_NOFOLLOW|O_CLOEXEC);
832 if (loop_fd < 0)
833 goto error;
834
835 if (ioctl(loop_fd, LOOP_SET_FD, source_fd) == 0)
836 break;
837
838 if (errno != EBUSY)
839 goto error;
840
841 /* Clean up resources for the next pass. */
842 free(loopdev);
843 close(loop_fd);
844 }
845
846 *loopdev_ret = loopdev;
847 goto exit;
848
849error:
850 ret = -errno;
851 free(loopdev);
852exit:
853 if (source_fd != -1)
854 close(source_fd);
855 if (control_fd != -1)
856 close(control_fd);
857 if (loop_fd != -1)
858 close(loop_fd);
859 return ret;
860}
861
862/* Detach the specified loop device. */
863static int loopdev_detach(const char *loopdev)
864{
865 int ret = 0;
866 int fd;
867
868 fd = open(loopdev, O_RDONLY|O_NOFOLLOW|O_CLOEXEC);
869 if (fd < 0)
870 goto error;
871 if (ioctl(fd, LOOP_CLR_FD) < 0)
872 goto error;
873
874 goto exit;
875
876error:
877 ret = -errno;
878exit:
879 if (fd != -1)
880 close(fd);
881 return ret;
882}
883
#if USE_device_mapper
/*
 * Returns a malloc'd copy of |cmdline| in which every "@DEV@" is replaced by
 * |source|, or NULL if the allocation fails. Text inserted from |source| is
 * never rescanned, so a |source| containing "@DEV@" cannot loop forever.
 */
static char *dm_expand_dev_placeholder(const char *cmdline, const char *source)
{
	static const char placeholder[] = "@DEV@";
	const size_t placeholder_len = sizeof(placeholder) - 1;
	const size_t source_len = strlen(source);
	size_t count = 0;
	const char *in;
	const char *hit;
	char *result;
	char *out;

	for (in = cmdline; (hit = strstr(in, placeholder)) != NULL;
	     in = hit + placeholder_len)
		++count;

	/* Upper bound: every occurrence grows by at most source_len bytes. */
	result = malloc(strlen(cmdline) + count * source_len + 1);
	if (!result)
		return NULL;

	out = result;
	for (in = cmdline; (hit = strstr(in, placeholder)) != NULL;
	     in = hit + placeholder_len) {
		memcpy(out, in, hit - in);
		out += hit - in;
		memcpy(out, source, source_len);
		out += source_len;
	}
	strcpy(out, in);
	return result;
}
#endif

/*
 * Create a new device mapper target for the source.
 *
 * The target name is "cros-containers-" followed by |source| with every '/'
 * replaced by '_'. |verity_cmdline| must start with "<start> <size> <type>";
 * the rest, with "@DEV@" expanded to |source|, is passed as the target
 * parameters.
 *
 * On success returns 0 and stores malloc'd strings in |*dm_path_ret| (the
 * /dev path) and |*dm_name_ret| (the target name). On failure returns a
 * negative errno value and leaves both outputs untouched. When built without
 * USE_device_mapper this does nothing and returns 0.
 */
static int dm_setup(char **dm_path_ret, char **dm_name_ret, const char *source,
		    const char *verity_cmdline)
{
	int ret = 0;
#if USE_device_mapper
	char *p;
	char *dm_path = NULL;
	char *dm_name = NULL;
	char *verity = NULL;
	struct dm_task *dmt = NULL;
	uint32_t cookie = 0;
	char ttype[20];
	unsigned long long start, size;
	int n;

	/* Normalize the name into something unique-esque. */
	if (asprintf(&dm_name, "cros-containers-%s", source) < 0) {
		/* The pointer is undefined after a failed asprintf. */
		dm_name = NULL;
		errno = ENOMEM;
		goto error;
	}
	p = dm_name;
	while ((p = strchr(p, '/')) != NULL)
		*p++ = '_';

	/* Get the /dev path for the higher levels to mount. */
	if (asprintf(&dm_path, "%s%s", dm_dev_prefix, dm_name) < 0) {
		dm_path = NULL;
		errno = ENOMEM;
		goto error;
	}

	/* Insert the source path in the verity command line. */
	verity = dm_expand_dev_placeholder(verity_cmdline, source);
	if (!verity) {
		errno = ENOMEM;
		goto error;
	}

	/* Extract the first three parameters for dm-verity settings. */
	if (sscanf(verity, "%llu %llu %10s %n", &start, &size, ttype, &n) != 3) {
		/* sscanf does not set errno for a plain parse mismatch. */
		errno = EINVAL;
		goto error;
	}

	/* Finally create the device mapper. */
	dmt = dm_task_create(DM_DEVICE_CREATE);
	if (dmt == NULL)
		goto error;

	if (!dm_task_set_name(dmt, dm_name))
		goto error;

	if (!dm_task_set_ro(dmt))
		goto error;

	if (!dm_task_add_target(dmt, start, size, ttype, verity + n))
		goto error;

	if (!dm_task_set_cookie(dmt, &cookie, 0))
		goto error;

	if (!dm_task_run(dmt))
		goto error;

	/* Make sure the node exists before we continue. */
	dm_udev_wait(cookie);

	*dm_path_ret = dm_path;
	*dm_name_ret = dm_name;
	goto exit;

error:
	/*
	 * Never report success from the error path: fall back to -EIO when
	 * the failing call left errno at zero.
	 */
	ret = errno ? -errno : -EIO;
	free(dm_name);
	free(dm_path);
exit:
	free(verity);
	if (dmt)
		dm_task_destroy(dmt);
#endif
	return ret;
}
962
/*
 * Tear down the device mapper target named |dm_name|.
 *
 * Returns 0 on success or a negative errno value on failure.  When built
 * without device mapper support there is nothing to remove and the function
 * returns 0.
 */
static int dm_detach(const char *dm_name)
{
	int ret = 0;
#if USE_device_mapper
	struct dm_task *dmt;

	errno = 0;
	dmt = dm_task_create(DM_DEVICE_REMOVE);
	if (dmt == NULL) {
		/* No task was created, so there is nothing to destroy. */
		return errno ? -errno : -ENOMEM;
	}

	/*
	 * libdevmapper signals failure through the return value only; fall
	 * back to -EIO when errno was left untouched.
	 */
	if (!dm_task_set_name(dmt, dm_name) || !dm_task_run(dmt))
		ret = errno ? -errno : -EIO;

	dm_task_destroy(dmt);
#else
	(void)dm_name;
#endif
	return ret;
}
989
Dylan Reide040c6b2016-05-02 18:49:02 -0700990/*
991 * Unmounts anything we mounted in this mount namespace in the opposite order
992 * that they were mounted.
993 */
994static int unmount_external_mounts(struct container *c)
995{
996 int ret = 0;
997
998 while (c->num_ext_mounts) {
999 c->num_ext_mounts--;
Luis Hector Chavez479b95f2016-06-06 08:01:05 -07001000 if (!c->ext_mounts[c->num_ext_mounts])
1001 continue;
Dylan Reide040c6b2016-05-02 18:49:02 -07001002 if (umount(c->ext_mounts[c->num_ext_mounts]))
1003 ret = -errno;
Luis Hector Chavez479b95f2016-06-06 08:01:05 -07001004 FREE_AND_NULL(c->ext_mounts[c->num_ext_mounts]);
Dylan Reide040c6b2016-05-02 18:49:02 -07001005 }
Luis Hector Chavez479b95f2016-06-06 08:01:05 -07001006 FREE_AND_NULL(c->ext_mounts);
Mike Frysinger412dbd22017-01-06 01:50:34 -05001007
1008 while (c->num_loopdevs) {
1009 c->num_loopdevs--;
1010 if (loopdev_detach(c->loopdevs[c->num_loopdevs]))
1011 ret = -errno;
1012 FREE_AND_NULL(c->loopdevs[c->num_loopdevs]);
1013 }
1014 FREE_AND_NULL(c->loopdevs);
1015
Mike Frysinger05e594e2017-01-10 02:11:08 -05001016 while (c->num_device_mappers) {
1017 c->num_device_mappers--;
1018 if (dm_detach(c->device_mappers[c->num_device_mappers]))
1019 ret = -errno;
1020 FREE_AND_NULL(c->device_mappers[c->num_device_mappers]);
1021 }
1022 FREE_AND_NULL(c->device_mappers);
1023
Dylan Reide040c6b2016-05-02 18:49:02 -07001024 return ret;
1025}
1026
Junichi Uekawa5d272772016-07-21 16:07:19 +09001027/*
1028 * Match mount_one in minijail, mount one mountpoint with
1029 * consideration for combination of MS_BIND/MS_RDONLY flag.
1030 */
1031static int mount_external(const char *src, const char *dest, const char *type,
1032 unsigned long flags, const void *data)
1033{
1034 int remount_ro = 0;
1035
1036 /*
1037 * R/O bind mounts have to be remounted since 'bind' and 'ro'
1038 * can't both be specified in the original bind mount.
1039 * Remount R/O after the initial mount.
1040 */
1041 if ((flags & MS_BIND) && (flags & MS_RDONLY)) {
1042 remount_ro = 1;
1043 flags &= ~MS_RDONLY;
1044 }
1045
1046 if (mount(src, dest, type, flags, data) == -1)
1047 return -1;
1048
1049 if (remount_ro) {
1050 flags |= MS_RDONLY;
1051 if (mount(src, dest, NULL, flags | MS_REMOUNT, data) == -1)
1052 return -1;
1053 }
1054
1055 return 0;
1056}
1057
Luis Hector Chavez3341ed62016-06-06 08:04:04 -07001058static int do_container_mount(struct container *c,
Stephen Barber1a398c72017-01-23 12:39:44 -08001059 const struct container_config *config,
Luis Hector Chavez3341ed62016-06-06 08:04:04 -07001060 const struct container_mount *mnt)
1061{
Mike Frysinger05e594e2017-01-10 02:11:08 -05001062 char *dm_source = NULL;
Mike Frysinger412dbd22017-01-06 01:50:34 -05001063 char *loop_source = NULL;
Luis Hector Chavez3341ed62016-06-06 08:04:04 -07001064 char *source = NULL;
1065 char *dest = NULL;
1066 int rc = 0;
1067
1068 if (asprintf(&dest, "%s%s", c->runfsroot, mnt->destination) < 0)
1069 return -errno;
1070
1071 /*
1072 * If it's a bind mount relative to rootfs, append source to
1073 * rootfs path, otherwise source path is absolute.
1074 */
1075 if ((mnt->flags & MS_BIND) && mnt->source[0] != '/') {
1076 if (asprintf(&source, "%s/%s", c->runfsroot, mnt->source) < 0)
1077 goto error_free_return;
Mike Frysingerb22acdf2017-01-08 02:02:35 -05001078 } else if (mnt->loopback && mnt->source[0] != '/' && c->config_root) {
1079 if (asprintf(&source, "%s/%s", c->config_root, mnt->source) < 0)
1080 goto error_free_return;
Luis Hector Chavez3341ed62016-06-06 08:04:04 -07001081 } else {
1082 if (asprintf(&source, "%s", mnt->source) < 0)
1083 goto error_free_return;
1084 }
1085
1086 if (mnt->create) {
Stephen Barber1a398c72017-01-23 12:39:44 -08001087 rc = setup_mount_destination(config, mnt, source, dest);
Luis Hector Chavez3341ed62016-06-06 08:04:04 -07001088 if (rc)
1089 goto error_free_return;
1090 }
Mike Frysinger412dbd22017-01-06 01:50:34 -05001091 if (mnt->loopback) {
1092 /* Record this loopback file for cleanup later. */
1093 loop_source = source;
1094 source = NULL;
1095 rc = loopdev_setup(&source, loop_source);
1096 if (rc)
1097 goto error_free_return;
1098
Mike Frysinger05e594e2017-01-10 02:11:08 -05001099 /* Save this to cleanup when shutting down. */
Mike Frysinger412dbd22017-01-06 01:50:34 -05001100 rc = strdup_and_free(&c->loopdevs[c->num_loopdevs], source);
1101 if (rc)
1102 goto error_free_return;
1103 c->num_loopdevs++;
1104 }
Mike Frysinger05e594e2017-01-10 02:11:08 -05001105 if (mnt->verity) {
1106 /* Set this device up via dm-verity. */
1107 char *dm_name;
1108 dm_source = source;
1109 source = NULL;
1110 rc = dm_setup(&source, &dm_name, dm_source, mnt->verity);
1111 if (rc)
1112 goto error_free_return;
1113
1114 /* Save this to cleanup when shutting down. */
1115 rc = strdup_and_free(&c->device_mappers[c->num_device_mappers],
1116 dm_name);
1117 free(dm_name);
1118 if (rc)
1119 goto error_free_return;
1120 c->num_device_mappers++;
1121 }
Luis Hector Chavez3341ed62016-06-06 08:04:04 -07001122 if (mnt->mount_in_ns) {
1123 /* We can mount this with minijail. */
Dylan Reid36b9c012016-06-24 18:27:08 -07001124 rc = minijail_mount_with_data(c->jail, source, mnt->destination,
1125 mnt->type, mnt->flags, mnt->data);
Luis Hector Chavez3341ed62016-06-06 08:04:04 -07001126 if (rc)
1127 goto error_free_return;
1128 } else {
1129 /* Mount this externally and unmount it on exit. */
Junichi Uekawa5d272772016-07-21 16:07:19 +09001130 if (mount_external(source, dest, mnt->type, mnt->flags,
1131 mnt->data))
Luis Hector Chavez3341ed62016-06-06 08:04:04 -07001132 goto error_free_return;
1133 /* Save this to unmount when shutting down. */
Luis Hector Chavez479b95f2016-06-06 08:01:05 -07001134 rc = strdup_and_free(&c->ext_mounts[c->num_ext_mounts], dest);
1135 if (rc)
Luis Hector Chavez3341ed62016-06-06 08:04:04 -07001136 goto error_free_return;
1137 c->num_ext_mounts++;
1138 }
1139
1140 goto exit;
1141
1142error_free_return:
1143 if (!rc)
1144 rc = -errno;
1145exit:
Mike Frysinger05e594e2017-01-10 02:11:08 -05001146 free(dm_source);
Mike Frysinger412dbd22017-01-06 01:50:34 -05001147 free(loop_source);
Luis Hector Chavez3341ed62016-06-06 08:04:04 -07001148 free(source);
1149 free(dest);
1150 return rc;
1151}
1152
Dylan Reide040c6b2016-05-02 18:49:02 -07001153static int do_container_mounts(struct container *c,
1154 const struct container_config *config)
Dylan Reid7daf9982016-04-28 16:55:42 -07001155{
1156 unsigned int i;
Luis Hector Chavez8e7b6d52016-06-02 20:40:43 -07001157 int rc = 0;
Dylan Reid7daf9982016-04-28 16:55:42 -07001158
Luis Hector Chavez479b95f2016-06-06 08:01:05 -07001159 unmount_external_mounts(c);
Dylan Reide040c6b2016-05-02 18:49:02 -07001160 /*
1161 * Allocate space to track anything we mount in our mount namespace.
1162 * This over-allocates as it has space for all mounts.
1163 */
1164 c->ext_mounts = calloc(config->num_mounts, sizeof(*c->ext_mounts));
1165 if (!c->ext_mounts)
1166 return -errno;
Mike Frysinger412dbd22017-01-06 01:50:34 -05001167 c->loopdevs = calloc(config->num_mounts, sizeof(*c->loopdevs));
1168 if (!c->loopdevs)
1169 return -errno;
Mike Frysinger05e594e2017-01-10 02:11:08 -05001170 c->device_mappers = calloc(config->num_mounts, sizeof(*c->device_mappers));
1171 if (!c->device_mappers)
1172 return -errno;
Dylan Reide040c6b2016-05-02 18:49:02 -07001173
1174 for (i = 0; i < config->num_mounts; ++i) {
Stephen Barber1a398c72017-01-23 12:39:44 -08001175 rc = do_container_mount(c, config, &config->mounts[i]);
Luis Hector Chavez3341ed62016-06-06 08:04:04 -07001176 if (rc)
1177 goto error_free_return;
Dylan Reid7daf9982016-04-28 16:55:42 -07001178 }
Luis Hector Chavez479b95f2016-06-06 08:01:05 -07001179
Dylan Reid7daf9982016-04-28 16:55:42 -07001180 return 0;
Dylan Reid2149be92016-04-28 18:38:57 -07001181
1182error_free_return:
Dylan Reide040c6b2016-05-02 18:49:02 -07001183 unmount_external_mounts(c);
Luis Hector Chavez8e7b6d52016-06-02 20:40:43 -07001184 return rc;
Dylan Reid7daf9982016-04-28 16:55:42 -07001185}
1186
Luis Hector Chavez479b95f2016-06-06 08:01:05 -07001187static int container_create_device(const struct container *c,
Stephen Barber1a398c72017-01-23 12:39:44 -08001188 const struct container_config *config,
Luis Hector Chavez479b95f2016-06-06 08:01:05 -07001189 const struct container_device *dev,
1190 int minor)
1191{
1192 char *path = NULL;
1193 int rc = 0;
1194 int mode;
Stephen Barber1a398c72017-01-23 12:39:44 -08001195 int uid_userns, gid_userns;
Luis Hector Chavez479b95f2016-06-06 08:01:05 -07001196
1197 switch (dev->type) {
1198 case 'b':
1199 mode = S_IFBLK;
1200 break;
1201 case 'c':
1202 mode = S_IFCHR;
1203 break;
1204 default:
1205 return -EINVAL;
1206 }
1207 mode |= dev->fs_permissions;
1208
Stephen Barber1a398c72017-01-23 12:39:44 -08001209 uid_userns = get_userns_outside_id(config->uid_map, dev->uid);
1210 if (uid_userns < 0)
1211 return uid_userns;
1212 gid_userns = get_userns_outside_id(config->gid_map, dev->gid);
1213 if (gid_userns < 0)
1214 return gid_userns;
1215
Luis Hector Chavez479b95f2016-06-06 08:01:05 -07001216 if (asprintf(&path, "%s%s", c->runfsroot, dev->path) < 0)
1217 goto error_free_return;
1218 if (mknod(path, mode, makedev(dev->major, minor)) && errno != EEXIST)
1219 goto error_free_return;
Stephen Barber1a398c72017-01-23 12:39:44 -08001220 if (chown(path, uid_userns, gid_userns))
Luis Hector Chavez479b95f2016-06-06 08:01:05 -07001221 goto error_free_return;
1222 if (chmod(path, dev->fs_permissions))
1223 goto error_free_return;
1224
1225 goto exit;
1226
1227error_free_return:
1228 rc = -errno;
1229exit:
1230 free(path);
1231 return rc;
1232}
1233
Stephen Barber1a398c72017-01-23 12:39:44 -08001234
Keshav Santhanam0e4c3282016-07-14 10:25:16 -07001235static int mount_runfs(struct container *c, const struct container_config *config)
Dylan Reid837c74a2016-01-22 17:25:21 -08001236{
Dylan Reidb3621832016-03-24 10:24:57 -07001237 static const mode_t root_dir_mode = 0660;
Dylan Reide040c6b2016-05-02 18:49:02 -07001238 const char *rootfs = config->rootfs;
Luis Hector Chavez479b95f2016-06-06 08:01:05 -07001239 char *runfs_template = NULL;
Stephen Barber1a398c72017-01-23 12:39:44 -08001240 int uid_userns, gid_userns;
Dylan Reid837c74a2016-01-22 17:25:21 -08001241
Keshav Santhanam0e4c3282016-07-14 10:25:16 -07001242 if (asprintf(&runfs_template, "%s/%s_XXXXXX", c->rundir, c->name) < 0)
1243 return -ENOMEM;
1244
1245 c->runfs = mkdtemp(runfs_template);
1246 if (!c->runfs) {
1247 free(runfs_template);
1248 return -errno;
1249 }
1250
Stephen Barber1a398c72017-01-23 12:39:44 -08001251 uid_userns = get_userns_outside_id(config->uid_map, config->uid);
1252 if (uid_userns < 0)
1253 return uid_userns;
1254 gid_userns = get_userns_outside_id(config->gid_map, config->gid);
1255 if (gid_userns < 0)
1256 return gid_userns;
1257
Keshav Santhanam0e4c3282016-07-14 10:25:16 -07001258 /* Make sure the container uid can access the rootfs. */
1259 if (chmod(c->runfs, 0700))
1260 return -errno;
Stephen Barber1a398c72017-01-23 12:39:44 -08001261 if (chown(c->runfs, uid_userns, gid_userns))
Keshav Santhanam0e4c3282016-07-14 10:25:16 -07001262 return -errno;
1263
1264 if (asprintf(&c->runfsroot, "%s/root", c->runfs) < 0)
1265 return -errno;
1266
1267 if (mkdir(c->runfsroot, root_dir_mode))
1268 return -errno;
1269 if (chmod(c->runfsroot, root_dir_mode))
1270 return -errno;
1271
Luis Hector Chavezc240e7e2016-09-22 10:33:03 -07001272 if (mount(rootfs, c->runfsroot, "", MS_BIND, NULL))
Keshav Santhanam0e4c3282016-07-14 10:25:16 -07001273 return -errno;
1274
Luis Hector Chavezc240e7e2016-09-22 10:33:03 -07001275 /* MS_BIND ignores any flags passed to it (except MS_REC). We need a
1276 * second call to mount() to actually set them.
1277 */
1278 if (config->rootfs_mount_flags &&
1279 mount(rootfs, c->runfsroot, "",
1280 config->rootfs_mount_flags, NULL)) {
1281 return -errno;
1282 }
1283
Keshav Santhanam0e4c3282016-07-14 10:25:16 -07001284 return 0;
1285}
1286
Dylan Reidacedff92017-03-31 17:41:40 -07001287static int device_setup(struct container *c,
1288 const struct container_config *config)
1289{
1290 int rc, i;
1291
1292 c->cgroup->ops->deny_all_devices(c->cgroup);
1293
Dylan Reid4843d6b2017-03-31 18:14:30 -07001294 for (i = 0; i < config->num_cgroup_devices; i++) {
1295 const struct container_cgroup_device *dev =
1296 &config->cgroup_devices[i];
1297 rc = c->cgroup->ops->add_device(c->cgroup,
1298 dev->allow,
1299 dev->major,
1300 dev->minor,
1301 dev->read,
1302 dev->write,
1303 dev->modify,
1304 dev->type);
1305 if (rc)
1306 return rc;
1307 }
1308
Dylan Reidacedff92017-03-31 17:41:40 -07001309 for (i = 0; i < config->num_devices; i++) {
1310 const struct container_device *dev = &config->devices[i];
1311 int minor = dev->minor;
1312
1313 if (dev->copy_minor) {
1314 struct stat st_buff;
1315 if (stat(dev->path, &st_buff) < 0)
1316 continue;
1317 minor = minor(st_buff.st_rdev);
1318 }
1319 if (minor >= 0) {
1320 rc = container_create_device(c, config, dev, minor);
1321 if (rc)
1322 return rc;
1323 }
Dylan Reidacedff92017-03-31 17:41:40 -07001324 }
1325
1326 for (i = 0; i < c->num_loopdevs; ++i) {
1327 struct stat st;
1328
1329 if (stat(c->loopdevs[i], &st) < 0)
1330 return rc;
Dylan Reid4843d6b2017-03-31 18:14:30 -07001331 rc = c->cgroup->ops->add_device(c->cgroup, 1, major(st.st_rdev),
Dylan Reidacedff92017-03-31 17:41:40 -07001332 minor(st.st_rdev),
1333 1, 0, 0, 'b');
1334 if (rc)
1335 return rc;
1336 }
1337
1338 return 0;
1339}
1340
Keshav Santhanam0e4c3282016-07-14 10:25:16 -07001341int container_start(struct container *c, const struct container_config *config)
1342{
1343 int rc = 0;
1344 unsigned int i;
Stephen Barber1a398c72017-01-23 12:39:44 -08001345 int cgroup_uid, cgroup_gid;
Yusuke Sato91f11f02016-12-02 16:15:13 -08001346 char **destinations;
1347 size_t num_destinations;
Keshav Santhanam0e4c3282016-07-14 10:25:16 -07001348
Luis Hector Chavez479b95f2016-06-06 08:01:05 -07001349 if (!c)
1350 return -EINVAL;
Dylan Reide040c6b2016-05-02 18:49:02 -07001351 if (!config)
1352 return -EINVAL;
1353 if (!config->program_argv || !config->program_argv[0])
1354 return -EINVAL;
1355
Mike Frysingerb22acdf2017-01-08 02:02:35 -05001356 if (config->config_root) {
1357 c->config_root = strdup(config->config_root);
1358 if (!c->config_root) {
1359 rc = -ENOMEM;
1360 goto error_rmdir;
1361 }
1362 }
Keshav Santhanam0e4c3282016-07-14 10:25:16 -07001363 if (config->premounted_runfs) {
1364 c->runfs = NULL;
1365 c->runfsroot = strdup(config->premounted_runfs);
1366 if (!c->runfsroot) {
1367 rc = -ENOMEM;
1368 goto error_rmdir;
1369 }
1370 } else {
1371 rc = mount_runfs(c, config);
1372 if (rc)
1373 goto error_rmdir;
Dylan Reid837c74a2016-01-22 17:25:21 -08001374 }
Dylan Reid837c74a2016-01-22 17:25:21 -08001375
1376 c->jail = minijail_new();
Luis Hector Chavez479b95f2016-06-06 08:01:05 -07001377 if (!c->jail)
Luis Hector Chavez945af482016-06-03 08:39:34 -07001378 goto error_rmdir;
Dylan Reid837c74a2016-01-22 17:25:21 -08001379
Luis Hector Chavez8e7b6d52016-06-02 20:40:43 -07001380 rc = do_container_mounts(c, config);
1381 if (rc)
Dylan Reid7daf9982016-04-28 16:55:42 -07001382 goto error_rmdir;
Dylan Reid837c74a2016-01-22 17:25:21 -08001383
Stephen Barber1a398c72017-01-23 12:39:44 -08001384 cgroup_uid = get_userns_outside_id(config->uid_map,
1385 config->cgroup_owner);
1386 if (cgroup_uid < 0) {
1387 rc = cgroup_uid;
1388 goto error_rmdir;
1389 }
1390 cgroup_gid = get_userns_outside_id(config->gid_map,
1391 config->cgroup_group);
1392 if (cgroup_gid < 0) {
1393 rc = cgroup_gid;
1394 goto error_rmdir;
1395 }
1396
Dylan Reida9966422016-07-21 10:11:34 -07001397 c->cgroup = container_cgroup_new(c->name,
1398 "/sys/fs/cgroup",
1399 config->cgroup_parent,
Stephen Barber1a398c72017-01-23 12:39:44 -08001400 cgroup_uid,
1401 cgroup_gid);
Dylan Reida9966422016-07-21 10:11:34 -07001402 if (!c->cgroup)
1403 goto error_rmdir;
1404
Keshav Santhanam268fa032016-07-14 09:59:24 -07001405 /* Must be root to modify device cgroup or mknod */
1406 if (getuid() == 0) {
Dylan Reidacedff92017-03-31 17:41:40 -07001407 if (device_setup(c, config))
1408 goto error_rmdir;
Dylan Reid837c74a2016-01-22 17:25:21 -08001409 }
1410
Dylan Reidd7229582016-04-27 17:08:40 -07001411 /* Potentailly run setfiles on mounts configured outside of the jail */
Yusuke Sato91f11f02016-12-02 16:15:13 -08001412 destinations = calloc(config->num_mounts, sizeof(char *));
1413 num_destinations = 0;
Dylan Reide040c6b2016-05-02 18:49:02 -07001414 for (i = 0; i < config->num_mounts; i++) {
1415 const struct container_mount *mnt = &config->mounts[i];
Yusuke Sato91f11f02016-12-02 16:15:13 -08001416 char* dest = mnt->destination;
Dylan Reidd7229582016-04-27 17:08:40 -07001417
1418 if (mnt->mount_in_ns)
1419 continue;
Junichi Uekawa5d272772016-07-21 16:07:19 +09001420 if (mnt->flags & MS_RDONLY)
1421 continue;
Yusuke Sato91f11f02016-12-02 16:15:13 -08001422
Yusuke Satod33db432016-12-05 16:24:37 -08001423 /* A hack to avoid setfiles on /data and /cache. */
1424 if (!strcmp(dest, "/data") || !strcmp(dest, "/cache"))
Yusuke Sato91f11f02016-12-02 16:15:13 -08001425 continue;
1426
1427 if (asprintf(&dest, "%s%s", c->runfsroot, mnt->destination) < 0) {
1428 size_t j;
1429 for (j = 0; j < num_destinations; ++j) {
1430 free(destinations[j]);
1431 }
1432 free(destinations);
Dylan Reidd7229582016-04-27 17:08:40 -07001433 goto error_rmdir;
Yusuke Sato91f11f02016-12-02 16:15:13 -08001434 }
1435
1436 destinations[num_destinations++] = dest;
Dylan Reidd7229582016-04-27 17:08:40 -07001437 }
Yusuke Sato91f11f02016-12-02 16:15:13 -08001438 if (num_destinations) {
1439 size_t i;
1440 rc = run_setfiles_command(c, config, destinations, num_destinations);
1441 for (i = 0; i < num_destinations; ++i) {
1442 free(destinations[i]);
1443 }
1444 }
1445 free(destinations);
1446 if (rc)
1447 goto error_rmdir;
Dylan Reidd7229582016-04-27 17:08:40 -07001448
Chinyue Chenfac909e2016-06-24 14:17:42 +08001449 /* Setup CPU cgroup params. */
1450 if (config->cpu_cgparams.shares) {
1451 rc = c->cgroup->ops->set_cpu_shares(
1452 c->cgroup, config->cpu_cgparams.shares);
1453 if (rc)
1454 goto error_rmdir;
1455 }
1456 if (config->cpu_cgparams.period) {
1457 rc = c->cgroup->ops->set_cpu_quota(
1458 c->cgroup, config->cpu_cgparams.quota);
1459 if (rc)
1460 goto error_rmdir;
1461 rc = c->cgroup->ops->set_cpu_period(
1462 c->cgroup, config->cpu_cgparams.period);
1463 if (rc)
1464 goto error_rmdir;
1465 }
1466 if (config->cpu_cgparams.rt_period) {
1467 rc = c->cgroup->ops->set_cpu_rt_runtime(
1468 c->cgroup, config->cpu_cgparams.rt_runtime);
1469 if (rc)
1470 goto error_rmdir;
1471 rc = c->cgroup->ops->set_cpu_rt_period(
1472 c->cgroup, config->cpu_cgparams.rt_period);
1473 if (rc)
1474 goto error_rmdir;
1475 }
1476
Dylan Reid837c74a2016-01-22 17:25:21 -08001477 /* Setup and start the container with libminijail. */
Keshav Santhanam0e4c3282016-07-14 10:25:16 -07001478 if (config->pid_file_path) {
1479 c->pid_file_path = strdup(config->pid_file_path);
1480 if (!c->pid_file_path) {
1481 rc = -ENOMEM;
1482 goto error_rmdir;
1483 }
1484 } else if (c->runfs) {
1485 if (asprintf(&c->pid_file_path, "%s/container.pid", c->runfs) < 0) {
1486 rc = -ENOMEM;
1487 goto error_rmdir;
1488 }
1489 }
1490
1491 if (c->pid_file_path)
1492 minijail_write_pid_file(c->jail, c->pid_file_path);
Dylan Reid837c74a2016-01-22 17:25:21 -08001493 minijail_reset_signal_mask(c->jail);
1494
1495 /* Setup container namespaces. */
1496 minijail_namespace_ipc(c->jail);
1497 minijail_namespace_vfs(c->jail);
Keshav Santhanam1b6bf672016-08-10 18:35:12 -07001498 if (!config->share_host_netns)
1499 minijail_namespace_net(c->jail);
Dylan Reid837c74a2016-01-22 17:25:21 -08001500 minijail_namespace_pids(c->jail);
Dylan Reid837c74a2016-01-22 17:25:21 -08001501 minijail_namespace_user(c->jail);
Mike Frysingerfbd60552017-01-03 17:28:48 -05001502 if (getuid() != 0)
1503 minijail_namespace_user_disable_setgroups(c->jail);
Dylan Reidc6ca1042016-07-11 15:03:27 -07001504 minijail_namespace_cgroups(c->jail);
Dylan Reide040c6b2016-05-02 18:49:02 -07001505 rc = minijail_uidmap(c->jail, config->uid_map);
Dylan Reid837c74a2016-01-22 17:25:21 -08001506 if (rc)
1507 goto error_rmdir;
Dylan Reide040c6b2016-05-02 18:49:02 -07001508 rc = minijail_gidmap(c->jail, config->gid_map);
Dylan Reid837c74a2016-01-22 17:25:21 -08001509 if (rc)
1510 goto error_rmdir;
Dylan Reid837c74a2016-01-22 17:25:21 -08001511
Keshav Santhanam36485ff2016-08-02 16:21:02 -07001512 /* Set the UID/GID inside the container if not 0. */
Stephen Barber1a398c72017-01-23 12:39:44 -08001513 if (get_userns_outside_id(config->uid_map, config->uid) < 0)
Keshav Santhanam36485ff2016-08-02 16:21:02 -07001514 goto error_rmdir;
Stephen Barber1a398c72017-01-23 12:39:44 -08001515 else if (config->uid > 0)
1516 minijail_change_uid(c->jail, config->uid);
1517 if (get_userns_outside_id(config->gid_map, config->gid) < 0)
Keshav Santhanam36485ff2016-08-02 16:21:02 -07001518 goto error_rmdir;
Stephen Barber1a398c72017-01-23 12:39:44 -08001519 else if (config->gid > 0)
1520 minijail_change_gid(c->jail, config->gid);
Keshav Santhanam36485ff2016-08-02 16:21:02 -07001521
Dylan Reid837c74a2016-01-22 17:25:21 -08001522 rc = minijail_enter_pivot_root(c->jail, c->runfsroot);
1523 if (rc)
1524 goto error_rmdir;
1525
1526 /* Add the cgroups configured above. */
Dmitry Torokhov0d253a62017-01-05 09:41:33 -08001527 for (i = 0; i < NUM_CGROUP_TYPES; i++) {
1528 if (c->cgroup->cgroup_tasks_paths[i]) {
1529 rc = minijail_add_to_cgroup(c->jail,
1530 c->cgroup->cgroup_tasks_paths[i]);
1531 if (rc)
1532 goto error_rmdir;
1533 }
1534 }
Dylan Reid837c74a2016-01-22 17:25:21 -08001535
Dylan Reide040c6b2016-05-02 18:49:02 -07001536 if (config->alt_syscall_table)
1537 minijail_use_alt_syscall(c->jail, config->alt_syscall_table);
Dylan Reid837c74a2016-01-22 17:25:21 -08001538
1539 minijail_run_as_init(c->jail);
1540
Dylan Reid3da683b2016-04-05 03:35:35 -07001541 /* TODO(dgreid) - remove this once shared mounts are cleaned up. */
1542 minijail_skip_remount_private(c->jail);
1543
Dylan Reidc4335842016-11-11 10:24:52 -08001544 if (!config->keep_fds_open)
1545 minijail_close_open_fds(c->jail);
Luis Hector Chaveze18e7d42016-10-12 07:35:32 -07001546
Dylan Reid837c74a2016-01-22 17:25:21 -08001547 rc = minijail_run_pid_pipes_no_preload(c->jail,
Dylan Reide040c6b2016-05-02 18:49:02 -07001548 config->program_argv[0],
1549 config->program_argv,
Dylan Reid837c74a2016-01-22 17:25:21 -08001550 &c->init_pid, NULL, NULL,
1551 NULL);
1552 if (rc)
1553 goto error_rmdir;
1554 return 0;
1555
1556error_rmdir:
Luis Hector Chavez945af482016-06-03 08:39:34 -07001557 if (!rc)
1558 rc = -errno;
1559 container_teardown(c);
Dylan Reid837c74a2016-01-22 17:25:21 -08001560 return rc;
1561}
1562
1563const char *container_root(struct container *c)
1564{
1565 return c->runfs;
1566}
1567
1568int container_pid(struct container *c)
1569{
1570 return c->init_pid;
1571}
1572
1573static int container_teardown(struct container *c)
1574{
Dylan Reid837c74a2016-01-22 17:25:21 -08001575 int ret = 0;
1576
Dylan Reide040c6b2016-05-02 18:49:02 -07001577 unmount_external_mounts(c);
Keshav Santhanam0e4c3282016-07-14 10:25:16 -07001578 if (c->runfsroot && c->runfs) {
Luis Hector Chavez945af482016-06-03 08:39:34 -07001579 if (umount(c->runfsroot))
1580 ret = -errno;
1581 if (rmdir(c->runfsroot))
1582 ret = -errno;
Luis Hector Chavez479b95f2016-06-06 08:01:05 -07001583 FREE_AND_NULL(c->runfsroot);
Luis Hector Chavez945af482016-06-03 08:39:34 -07001584 }
1585 if (c->pid_file_path) {
1586 if (unlink(c->pid_file_path))
1587 ret = -errno;
Luis Hector Chavez479b95f2016-06-06 08:01:05 -07001588 FREE_AND_NULL(c->pid_file_path);
Luis Hector Chavez945af482016-06-03 08:39:34 -07001589 }
1590 if (c->runfs) {
1591 if (rmdir(c->runfs))
1592 ret = -errno;
Luis Hector Chavez479b95f2016-06-06 08:01:05 -07001593 FREE_AND_NULL(c->runfs);
Luis Hector Chavez945af482016-06-03 08:39:34 -07001594 }
Dylan Reid837c74a2016-01-22 17:25:21 -08001595 return ret;
1596}
1597
1598int container_wait(struct container *c)
1599{
Dylan Reidcf745c52016-04-22 10:18:03 -07001600 int rc;
1601
1602 do {
1603 rc = minijail_wait(c->jail);
Luis Hector Chavez4641e852016-06-02 15:40:19 -07001604 } while (rc == -EINTR);
Dylan Reidcf745c52016-04-22 10:18:03 -07001605
Luis Hector Chavez945af482016-06-03 08:39:34 -07001606 // If the process had already been reaped, still perform teardown.
1607 if (rc == -ECHILD || rc >= 0) {
Dylan Reidcf745c52016-04-22 10:18:03 -07001608 rc = container_teardown(c);
Luis Hector Chavez945af482016-06-03 08:39:34 -07001609 }
Dylan Reidcf745c52016-04-22 10:18:03 -07001610 return rc;
Dylan Reid837c74a2016-01-22 17:25:21 -08001611}
1612
1613int container_kill(struct container *c)
1614{
Luis Hector Chavez945af482016-06-03 08:39:34 -07001615 if (kill(c->init_pid, SIGKILL) && errno != ESRCH)
Dylan Reid837c74a2016-01-22 17:25:21 -08001616 return -errno;
1617 return container_wait(c);
1618}