blob: fca2cce4c2803dcd2149bd892ec62231d28a19e9 [file] [log] [blame]
Dylan Reid837c74a2016-01-22 17:25:21 -08001/* Copyright 2016 The Chromium OS Authors. All rights reserved.
2 * Use of this source code is governed by a BSD-style license that can be
3 * found in the LICENSE file.
4 */
5
6#define _GNU_SOURCE /* For asprintf */
7
8#include <errno.h>
9#include <fcntl.h>
Mike Frysinger05e594e2017-01-10 02:11:08 -050010#if USE_device_mapper
11#include <libdevmapper.h>
12#endif
Dylan Reid837c74a2016-01-22 17:25:21 -080013#include <malloc.h>
14#include <signal.h>
15#include <stdio.h>
16#include <stdlib.h>
17#include <string.h>
18#include <sys/mount.h>
19#include <sys/stat.h>
20#include <sys/types.h>
Dylan Reid2bd9ea92016-04-07 20:57:47 -070021#include <sys/wait.h>
Dylan Reid837c74a2016-01-22 17:25:21 -080022#include <unistd.h>
23
Mike Frysinger412dbd22017-01-06 01:50:34 -050024#include <linux/loop.h>
25
Dylan Reid837c74a2016-01-22 17:25:21 -080026#include "container_cgroup.h"
27#include "libcontainer.h"
28#include "libminijail.h"
29
/*
 * Frees |ptr| and resets it to NULL so the stale pointer can neither be freed
 * twice nor dereferenced. |ptr| must be a modifiable lvalue; it is evaluated
 * twice, so it must not have side effects.
 */
#define FREE_AND_NULL(ptr) \
do { \
	free((ptr)); \
	(ptr) = NULL; \
} while (0)
35
/* Capacity of the argv array (trailing NULL included) built for setfiles. */
#define MAX_NUM_SETFILES_ARGS 128

/* Control node queried for the number of a free loop device. */
static const char loopdev_ctl[] = "/dev/loop-control";
#if USE_device_mapper
/* Prefix prepended to a device-mapper name to form the node path. */
static const char dm_dev_prefix[] = "/dev/mapper/";
#endif

/* Forward declaration; the definition is outside this part of the file. */
static int container_teardown(struct container *c);
44
/*
 * Replaces the string held in |*dest| with a heap-allocated copy of |src|.
 *
 * The previous value of |*dest| is released only after the copy succeeded, so
 * |*dest| is left untouched on failure.
 *
 * Returns 0 on success, -EINVAL if |src| is NULL, or -ENOMEM if the copy
 * could not be allocated.
 */
static int strdup_and_free(char **dest, const char *src)
{
	char *copy;

	/* strdup(NULL) is undefined behavior; reject it explicitly. */
	if (!src)
		return -EINVAL;

	copy = strdup(src);
	if (!copy)
		return -ENOMEM;

	/* free(NULL) is a no-op, so the old value needs no guard. */
	free(*dest);
	*dest = copy;
	return 0;
}
55
/*
 * One filesystem mount recorded by container_config_add_mount(). Every string
 * member is a private heap copy owned by the entry.
 */
struct container_mount {
	char *name;
	char *source;
	char *destination;
	char *type;
	/* Optional; NULL when the caller supplied none. */
	char *data;
	/* Optional; NULL when the caller supplied none. */
	char *verity;
	int flags;
	/* Owner and mode used when the mount target has to be created. */
	int uid;
	int gid;
	int mode;
	/* True if mount should happen in new vfs ns */
	int mount_in_ns;
	/* True if target should be created if it doesn't exist */
	int create;
	/* True if target should be mounted via loopback */
	int loopback;
};
71
/*
 * One device node recorded by container_config_add_device(). |path| is a
 * private heap copy owned by the entry.
 */
struct container_device {
	/* 'c' or 'b' for char or block */
	char type;
	char *path;
	int fs_permissions;
	int major;
	int minor;
	/* Copy the minor from existing node, ignores |minor| */
	int copy_minor;
	int uid;
	int gid;
	int read_allowed;
	int write_allowed;
	int modify_allowed;
};
85
/* CPU cgroup parameters stored by the container_config_set_cpu_*() setters. */
struct container_cpu_cgroup {
	int shares;
	/* CFS quota and period. */
	int quota;
	int period;
	/* Realtime runtime and period. */
	int rt_runtime;
	int rt_period;
};
93
Dylan Reid837c74a2016-01-22 17:25:21 -080094/*
95 * Structure that configures how the container is run.
96 *
Mike Frysingerb22acdf2017-01-08 02:02:35 -050097 * config_root - Path to the root of the container itself.
Dylan Reid837c74a2016-01-22 17:25:21 -080098 * rootfs - Path to the root of the container's filesystem.
Luis Hector Chavezc240e7e2016-09-22 10:33:03 -070099 * rootfs_mount_flags - Flags that will be passed to mount() for the rootfs.
Keshav Santhanam0e4c3282016-07-14 10:25:16 -0700100 * premounted_runfs - Path to where the container will be run.
101 * pid_file_path - Path to the file where the pid should be written.
Dylan Reid837c74a2016-01-22 17:25:21 -0800102 * program_argv - The program to run and args, e.g. "/sbin/init".
103 * num_args - Number of args in program_argv.
Dylan Reid1874feb2016-06-22 17:53:50 -0700104 * uid - The uid the container will run as.
Dylan Reid837c74a2016-01-22 17:25:21 -0800105 * uid_map - Mapping of UIDs in the container, e.g. "0 100000 1024"
Dylan Reid1874feb2016-06-22 17:53:50 -0700106 * gid - The gid the container will run as.
Dylan Reid837c74a2016-01-22 17:25:21 -0800107 * gid_map - Mapping of GIDs in the container, e.g. "0 100000 1024"
108 * alt_syscall_table - Syscall table to use or NULL if none.
109 * mounts - Filesystems to mount in the new namespace.
110 * num_mounts - Number of above.
111 * devices - Device nodes to create.
112 * num_devices - Number of above.
Dylan Reid2bd9ea92016-04-07 20:57:47 -0700113 * run_setfiles - Should run setfiles on mounts to enable selinux.
Chinyue Chenfac909e2016-06-24 14:17:42 +0800114 * cpu_cgparams - CPU cgroup params.
Dylan Reid9e724af2016-07-21 09:58:07 -0700115 * cgroup_parent - Parent dir for cgroup creation
116 * cgroup_owner - uid to own the created cgroups
Dmitry Torokhov14eef722016-09-27 16:40:37 -0700117 * cgroup_group - gid to own the created cgroups
Keshav Santhanam1b6bf672016-08-10 18:35:12 -0700118 * share_host_netns - Enable sharing of the host network namespace.
Dylan Reidc4335842016-11-11 10:24:52 -0800119 * keep_fds_open - Allow the child process to keep open FDs (for stdin/out/err).
Dylan Reid837c74a2016-01-22 17:25:21 -0800120 */
121struct container_config {
Mike Frysingerb22acdf2017-01-08 02:02:35 -0500122 char *config_root;
Dylan Reid837c74a2016-01-22 17:25:21 -0800123 char *rootfs;
Luis Hector Chavezc240e7e2016-09-22 10:33:03 -0700124 unsigned long rootfs_mount_flags;
Keshav Santhanam0e4c3282016-07-14 10:25:16 -0700125 char *premounted_runfs;
126 char *pid_file_path;
Dylan Reid837c74a2016-01-22 17:25:21 -0800127 char **program_argv;
128 size_t num_args;
Dylan Reid1874feb2016-06-22 17:53:50 -0700129 uid_t uid;
Dylan Reid837c74a2016-01-22 17:25:21 -0800130 char *uid_map;
Dylan Reid1874feb2016-06-22 17:53:50 -0700131 gid_t gid;
Dylan Reid837c74a2016-01-22 17:25:21 -0800132 char *gid_map;
133 char *alt_syscall_table;
134 struct container_mount *mounts;
135 size_t num_mounts;
136 struct container_device *devices;
137 size_t num_devices;
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700138 char *run_setfiles;
Chinyue Chenfac909e2016-06-24 14:17:42 +0800139 struct container_cpu_cgroup cpu_cgparams;
Dylan Reid9e724af2016-07-21 09:58:07 -0700140 char *cgroup_parent;
141 uid_t cgroup_owner;
Dmitry Torokhov14eef722016-09-27 16:40:37 -0700142 gid_t cgroup_group;
Keshav Santhanam1b6bf672016-08-10 18:35:12 -0700143 int share_host_netns;
Dylan Reidc4335842016-11-11 10:24:52 -0800144 int keep_fds_open;
Dylan Reid837c74a2016-01-22 17:25:21 -0800145};
146
147struct container_config *container_config_create()
148{
149 return calloc(1, sizeof(struct container_config));
150}
151
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700152static void container_free_program_args(struct container_config *c)
153{
154 int i;
155
156 if (!c->program_argv)
157 return;
158 for (i = 0; i < c->num_args; ++i) {
159 FREE_AND_NULL(c->program_argv[i]);
160 }
161 FREE_AND_NULL(c->program_argv);
162}
163
164static void container_config_free_mount(struct container_mount *mount)
165{
166 FREE_AND_NULL(mount->name);
167 FREE_AND_NULL(mount->source);
168 FREE_AND_NULL(mount->destination);
169 FREE_AND_NULL(mount->type);
170 FREE_AND_NULL(mount->data);
171}
172
173static void container_config_free_device(struct container_device *device)
174{
175 FREE_AND_NULL(device->path);
176}
177
Dylan Reid837c74a2016-01-22 17:25:21 -0800178void container_config_destroy(struct container_config *c)
179{
180 size_t i;
181
182 if (c == NULL)
183 return;
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700184 FREE_AND_NULL(c->rootfs);
185 container_free_program_args(c);
Keshav Santhanam0e4c3282016-07-14 10:25:16 -0700186 FREE_AND_NULL(c->premounted_runfs);
187 FREE_AND_NULL(c->pid_file_path);
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700188 FREE_AND_NULL(c->uid_map);
189 FREE_AND_NULL(c->gid_map);
190 FREE_AND_NULL(c->alt_syscall_table);
Dylan Reid837c74a2016-01-22 17:25:21 -0800191 for (i = 0; i < c->num_mounts; ++i) {
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700192 container_config_free_mount(&c->mounts[i]);
Dylan Reid837c74a2016-01-22 17:25:21 -0800193 }
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700194 FREE_AND_NULL(c->mounts);
Dylan Reid837c74a2016-01-22 17:25:21 -0800195 for (i = 0; i < c->num_devices; ++i) {
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700196 container_config_free_device(&c->devices[i]);
Dylan Reid837c74a2016-01-22 17:25:21 -0800197 }
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700198 FREE_AND_NULL(c->devices);
199 FREE_AND_NULL(c->run_setfiles);
Dylan Reid9e724af2016-07-21 09:58:07 -0700200 FREE_AND_NULL(c->cgroup_parent);
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700201 FREE_AND_NULL(c);
Dylan Reid837c74a2016-01-22 17:25:21 -0800202}
203
Mike Frysingerb22acdf2017-01-08 02:02:35 -0500204int container_config_config_root(struct container_config *c,
205 const char *config_root)
206{
207 return strdup_and_free(&c->config_root, config_root);
208}
209
210const char *container_config_get_config_root(const struct container_config *c)
211{
212 return c->config_root;
213}
214
Dylan Reid837c74a2016-01-22 17:25:21 -0800215int container_config_rootfs(struct container_config *c, const char *rootfs)
216{
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700217 return strdup_and_free(&c->rootfs, rootfs);
Dylan Reid837c74a2016-01-22 17:25:21 -0800218}
219
Dylan Reid11456722016-05-02 11:24:50 -0700220const char *container_config_get_rootfs(const struct container_config *c)
221{
222 return c->rootfs;
223}
224
Luis Hector Chavezc240e7e2016-09-22 10:33:03 -0700225void container_config_rootfs_mount_flags(struct container_config *c,
226 unsigned long rootfs_mount_flags)
227{
228 /* Since we are going to add MS_REMOUNT anyways, add it here so we can
229 * simply check against zero later. MS_BIND is also added to avoid
230 * re-mounting the original filesystem, since the rootfs is always
231 * bind-mounted.
232 */
233 c->rootfs_mount_flags = MS_REMOUNT | MS_BIND | rootfs_mount_flags;
234}
235
236unsigned long container_config_get_rootfs_mount_flags(
237 const struct container_config *c)
238{
239 return c->rootfs_mount_flags;
240}
241
Keshav Santhanam0e4c3282016-07-14 10:25:16 -0700242int container_config_premounted_runfs(struct container_config *c, const char *runfs)
243{
244 return strdup_and_free(&c->premounted_runfs, runfs);
245}
246
247const char *container_config_get_premounted_runfs(const struct container_config *c)
248{
249 return c->premounted_runfs;
250}
251
252int container_config_pid_file(struct container_config *c, const char *path)
253{
254 return strdup_and_free(&c->pid_file_path, path);
255}
256
257const char *container_config_get_pid_file(const struct container_config *c)
258{
259 return c->pid_file_path;
260}
261
Dylan Reid837c74a2016-01-22 17:25:21 -0800262int container_config_program_argv(struct container_config *c,
Dylan Reid17fd53f2016-11-18 19:14:41 -0800263 const char **argv, size_t num_args)
Dylan Reid837c74a2016-01-22 17:25:21 -0800264{
265 size_t i;
266
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700267 container_free_program_args(c);
Dylan Reid837c74a2016-01-22 17:25:21 -0800268 c->num_args = num_args;
269 c->program_argv = calloc(num_args + 1, sizeof(char *));
270 if (!c->program_argv)
271 return -ENOMEM;
272 for (i = 0; i < num_args; ++i) {
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700273 if (strdup_and_free(&c->program_argv[i], argv[i]))
274 goto error_free_return;
Dylan Reid837c74a2016-01-22 17:25:21 -0800275 }
276 c->program_argv[num_args] = NULL;
277 return 0;
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700278
279error_free_return:
280 container_free_program_args(c);
281 return -ENOMEM;
Dylan Reid837c74a2016-01-22 17:25:21 -0800282}
283
Dylan Reid11456722016-05-02 11:24:50 -0700284size_t container_config_get_num_program_args(const struct container_config *c)
285{
286 return c->num_args;
287}
288
289const char *container_config_get_program_arg(const struct container_config *c,
290 size_t index)
291{
292 if (index >= c->num_args)
293 return NULL;
294 return c->program_argv[index];
295}
296
Dylan Reid1874feb2016-06-22 17:53:50 -0700297void container_config_uid(struct container_config *c, uid_t uid)
298{
299 c->uid = uid;
300}
301
302uid_t container_config_get_uid(const struct container_config *c)
303{
304 return c->uid;
305}
306
Dylan Reid837c74a2016-01-22 17:25:21 -0800307int container_config_uid_map(struct container_config *c, const char *uid_map)
308{
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700309 return strdup_and_free(&c->uid_map, uid_map);
Dylan Reid837c74a2016-01-22 17:25:21 -0800310}
311
Dylan Reid1874feb2016-06-22 17:53:50 -0700312void container_config_gid(struct container_config *c, gid_t gid)
313{
314 c->gid = gid;
315}
316
317gid_t container_config_get_gid(const struct container_config *c)
318{
319 return c->gid;
320}
321
Dylan Reid837c74a2016-01-22 17:25:21 -0800322int container_config_gid_map(struct container_config *c, const char *gid_map)
323{
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700324 return strdup_and_free(&c->gid_map, gid_map);
Dylan Reid837c74a2016-01-22 17:25:21 -0800325}
326
327int container_config_alt_syscall_table(struct container_config *c,
328 const char *alt_syscall_table)
329{
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700330 return strdup_and_free(&c->alt_syscall_table, alt_syscall_table);
Dylan Reid837c74a2016-01-22 17:25:21 -0800331}
332
333int container_config_add_mount(struct container_config *c,
334 const char *name,
335 const char *source,
336 const char *destination,
337 const char *type,
338 const char *data,
Mike Frysinger05e594e2017-01-10 02:11:08 -0500339 const char *verity,
Dylan Reid837c74a2016-01-22 17:25:21 -0800340 int flags,
341 int uid,
342 int gid,
343 int mode,
344 int mount_in_ns,
Mike Frysinger412dbd22017-01-06 01:50:34 -0500345 int create,
346 int loopback)
Dylan Reid837c74a2016-01-22 17:25:21 -0800347{
348 struct container_mount *mount_ptr;
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700349 struct container_mount *current_mount;
Dylan Reid837c74a2016-01-22 17:25:21 -0800350
351 if (name == NULL || source == NULL ||
352 destination == NULL || type == NULL)
353 return -EINVAL;
354
355 mount_ptr = realloc(c->mounts,
356 sizeof(c->mounts[0]) * (c->num_mounts + 1));
357 if (!mount_ptr)
358 return -ENOMEM;
359 c->mounts = mount_ptr;
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700360 current_mount = &c->mounts[c->num_mounts];
361 memset(current_mount, 0, sizeof(struct container_mount));
362
363 if (strdup_and_free(&current_mount->name, name))
364 goto error_free_return;
365 if (strdup_and_free(&current_mount->source, source))
366 goto error_free_return;
367 if (strdup_and_free(&current_mount->destination, destination))
368 goto error_free_return;
369 if (strdup_and_free(&current_mount->type, type))
370 goto error_free_return;
371 if (data && strdup_and_free(&current_mount->data, data))
372 goto error_free_return;
Mike Frysinger05e594e2017-01-10 02:11:08 -0500373 if (verity && strdup_and_free(&current_mount->verity, verity))
374 goto error_free_return;
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700375 current_mount->flags = flags;
376 current_mount->uid = uid;
377 current_mount->gid = gid;
378 current_mount->mode = mode;
379 current_mount->mount_in_ns = mount_in_ns;
380 current_mount->create = create;
Mike Frysinger412dbd22017-01-06 01:50:34 -0500381 current_mount->loopback = loopback;
Dylan Reid837c74a2016-01-22 17:25:21 -0800382 ++c->num_mounts;
383 return 0;
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700384
385error_free_return:
386 container_config_free_mount(current_mount);
387 return -ENOMEM;
Dylan Reid837c74a2016-01-22 17:25:21 -0800388}
389
390int container_config_add_device(struct container_config *c,
391 char type,
392 const char *path,
393 int fs_permissions,
394 int major,
395 int minor,
Dylan Reid355d5e42016-04-29 16:53:31 -0700396 int copy_minor,
Dylan Reid837c74a2016-01-22 17:25:21 -0800397 int uid,
398 int gid,
399 int read_allowed,
400 int write_allowed,
401 int modify_allowed)
402{
403 struct container_device *dev_ptr;
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700404 struct container_device *current_dev;
Dylan Reid837c74a2016-01-22 17:25:21 -0800405
406 if (path == NULL)
407 return -EINVAL;
Dylan Reid355d5e42016-04-29 16:53:31 -0700408 /* If using a dynamic minor number, ensure that minor is -1. */
409 if (copy_minor && (minor != -1))
410 return -EINVAL;
411
Dylan Reid837c74a2016-01-22 17:25:21 -0800412 dev_ptr = realloc(c->devices,
413 sizeof(c->devices[0]) * (c->num_devices + 1));
414 if (!dev_ptr)
415 return -ENOMEM;
416 c->devices = dev_ptr;
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700417 current_dev = &c->devices[c->num_devices];
418 memset(current_dev, 0, sizeof(struct container_device));
419
420 current_dev->type = type;
421 if (strdup_and_free(&current_dev->path, path))
422 goto error_free_return;
423 current_dev->fs_permissions = fs_permissions;
424 current_dev->major = major;
425 current_dev->minor = minor;
426 current_dev->copy_minor = copy_minor;
427 current_dev->uid = uid;
428 current_dev->gid = gid;
429 current_dev->read_allowed = read_allowed;
430 current_dev->write_allowed = write_allowed;
431 current_dev->modify_allowed = modify_allowed;
Dylan Reid837c74a2016-01-22 17:25:21 -0800432 ++c->num_devices;
433 return 0;
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700434
435error_free_return:
436 container_config_free_device(current_dev);
437 return -ENOMEM;
Dylan Reid837c74a2016-01-22 17:25:21 -0800438}
439
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700440int container_config_run_setfiles(struct container_config *c,
Dylan Reid2bd9ea92016-04-07 20:57:47 -0700441 const char *setfiles_cmd)
442{
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700443 return strdup_and_free(&c->run_setfiles, setfiles_cmd);
Dylan Reid2bd9ea92016-04-07 20:57:47 -0700444}
Dylan Reid837c74a2016-01-22 17:25:21 -0800445
Dylan Reid11456722016-05-02 11:24:50 -0700446const char *container_config_get_run_setfiles(const struct container_config *c)
447{
448 return c->run_setfiles;
449}
450
Chinyue Chenfac909e2016-06-24 14:17:42 +0800451int container_config_set_cpu_shares(struct container_config *c, int shares)
452{
453 /* CPU shares must be 2 or higher. */
454 if (shares < 2)
455 return -EINVAL;
456
457 c->cpu_cgparams.shares = shares;
458 return 0;
459}
460
461int container_config_set_cpu_cfs_params(struct container_config *c,
462 int quota,
463 int period)
464{
465 /*
466 * quota could be set higher than period to utilize more than one CPU.
467 * quota could also be set as -1 to indicate the cgroup does not adhere
468 * to any CPU time restrictions.
469 */
470 if (quota <= 0 && quota != -1)
471 return -EINVAL;
472 if (period <= 0)
473 return -EINVAL;
474
475 c->cpu_cgparams.quota = quota;
476 c->cpu_cgparams.period = period;
477 return 0;
478}
479
480int container_config_set_cpu_rt_params(struct container_config *c,
481 int rt_runtime,
482 int rt_period)
483{
484 /*
485 * rt_runtime could be set as 0 to prevent the cgroup from using
486 * realtime CPU.
487 */
488 if (rt_runtime < 0 || rt_runtime >= rt_period)
489 return -EINVAL;
490
491 c->cpu_cgparams.rt_runtime = rt_runtime;
492 c->cpu_cgparams.rt_period = rt_period;
493 return 0;
494}
495
Chinyue Chen4f3fd682016-07-01 14:11:42 +0800496int container_config_get_cpu_shares(struct container_config *c)
497{
498 return c->cpu_cgparams.shares;
499}
500
501int container_config_get_cpu_quota(struct container_config *c)
502{
503 return c->cpu_cgparams.quota;
504}
505
506int container_config_get_cpu_period(struct container_config *c)
507{
508 return c->cpu_cgparams.period;
509}
510
511int container_config_get_cpu_rt_runtime(struct container_config *c)
512{
513 return c->cpu_cgparams.rt_runtime;
514}
515
516int container_config_get_cpu_rt_period(struct container_config *c)
517{
518 return c->cpu_cgparams.rt_period;
519}
520
Dylan Reid9e724af2016-07-21 09:58:07 -0700521int container_config_set_cgroup_parent(struct container_config *c,
522 const char *parent,
Dmitry Torokhov14eef722016-09-27 16:40:37 -0700523 uid_t cgroup_owner, gid_t cgroup_group)
Dylan Reid9e724af2016-07-21 09:58:07 -0700524{
525 c->cgroup_owner = cgroup_owner;
Dmitry Torokhov14eef722016-09-27 16:40:37 -0700526 c->cgroup_group = cgroup_group;
Dylan Reid9e724af2016-07-21 09:58:07 -0700527 return strdup_and_free(&c->cgroup_parent, parent);
528}
529
530const char *container_config_get_cgroup_parent(struct container_config *c)
531{
532 return c->cgroup_parent;
533}
534
Keshav Santhanam1b6bf672016-08-10 18:35:12 -0700535void container_config_share_host_netns(struct container_config *c)
536{
537 c->share_host_netns = 1;
538}
539
540int get_container_config_share_host_netns(struct container_config *c)
541{
542 return c->share_host_netns;
543}
544
Dylan Reidc4335842016-11-11 10:24:52 -0800545void container_config_keep_fds_open(struct container_config *c)
546{
547 c->keep_fds_open = 1;
548}
549
/*
 * Container manipulation
 */

/* Runtime state of one container instance. */
struct container {
	struct container_cgroup *cgroup;
	struct minijail *jail;
	pid_t init_pid;
	char *config_root;
	char *runfs;
	/* Copy of the |rundir| argument given to container_new(). */
	char *rundir;
	char *runfsroot;
	char *pid_file_path;
	/* Mounts made outside of the minijail */
	char **ext_mounts;
	size_t num_ext_mounts;
	/* Loop devices to detach when external mounts are torn down. */
	char **loopdevs;
	size_t num_loopdevs;
	/* Device-mapper names to remove when external mounts are torn down. */
	char **device_mappers;
	size_t num_device_mappers;
	/* Copy of the |name| argument given to container_new(). */
	char *name;
};
570
571struct container *container_new(const char *name,
Dylan Reide040c6b2016-05-02 18:49:02 -0700572 const char *rundir)
Dylan Reid837c74a2016-01-22 17:25:21 -0800573{
574 struct container *c;
575
Dylan Reid837c74a2016-01-22 17:25:21 -0800576 c = calloc(1, sizeof(*c));
Dylan Reidb435c682016-04-12 04:17:49 -0700577 if (!c)
578 return NULL;
Dylan Reid837c74a2016-01-22 17:25:21 -0800579 c->rundir = strdup(rundir);
Luis Hector Chavez8e7b6d52016-06-02 20:40:43 -0700580 c->name = strdup(name);
Dylan Reida9966422016-07-21 10:11:34 -0700581 if (!c->rundir || !c->name) {
Dylan Reid684975e2016-05-02 15:44:47 -0700582 container_destroy(c);
Dylan Reid837c74a2016-01-22 17:25:21 -0800583 return NULL;
Dylan Reidb435c682016-04-12 04:17:49 -0700584 }
Dylan Reid837c74a2016-01-22 17:25:21 -0800585 return c;
586}
587
588void container_destroy(struct container *c)
589{
Dylan Reid684975e2016-05-02 15:44:47 -0700590 if (c->cgroup)
591 container_cgroup_destroy(c->cgroup);
Luis Hector Chavez8e7b6d52016-06-02 20:40:43 -0700592 if (c->jail)
593 minijail_destroy(c->jail);
Mike Frysingerb22acdf2017-01-08 02:02:35 -0500594 FREE_AND_NULL(c->config_root);
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700595 FREE_AND_NULL(c->name);
596 FREE_AND_NULL(c->rundir);
597 FREE_AND_NULL(c);
Dylan Reid837c74a2016-01-22 17:25:21 -0800598}
599
/*
 * Creates directory |path|, then applies |mode| and the |uid|/|gid| owner.
 *
 * Returns 0 on success or -errno of the first step that failed; later steps
 * are not attempted.
 */
static int make_dir(const char *path, int uid, int gid, int mode)
{
	/* Short-circuit evaluation keeps errno from the first failing call. */
	if (mkdir(path, mode) != 0 ||
	    chmod(path, mode) != 0 ||
	    chown(path, uid, gid) != 0)
		return -errno;
	return 0;
}
610
/*
 * Makes sure file |path| exists (creating it with |mode| if needed) and
 * hands it to |uid|/|gid|.
 *
 * Returns 0 on success, or -errno of the open() or fchown() call that failed.
 */
static int touch_file(const char *path, int uid, int gid, int mode)
{
	int rc = 0;
	int fd = open(path, O_RDWR | O_CREAT, mode);

	if (fd < 0)
		return -errno;

	/* Capture the error right away: close() below may overwrite errno. */
	if (fchown(fd, uid, gid))
		rc = -errno;
	close(fd);

	return rc;
}
624
625/* Make sure the mount target exists in the new rootfs. Create if needed and
626 * possible.
627 */
628static int setup_mount_destination(const struct container_mount *mnt,
Dylan Reid2149be92016-04-28 18:38:57 -0700629 const char *source,
Dylan Reid837c74a2016-01-22 17:25:21 -0800630 const char *dest)
631{
632 int rc;
633 struct stat st_buf;
634
635 rc = stat(dest, &st_buf);
636 if (rc == 0) /* destination exists */
637 return 0;
638
639 /* Try to create the destination. Either make directory or touch a file
640 * depending on the source type.
641 */
Dylan Reid2149be92016-04-28 18:38:57 -0700642 rc = stat(source, &st_buf);
Dylan Reid837c74a2016-01-22 17:25:21 -0800643 if (rc || S_ISDIR(st_buf.st_mode) || S_ISBLK(st_buf.st_mode))
644 return make_dir(dest, mnt->uid, mnt->gid, mnt->mode);
645
646 return touch_file(dest, mnt->uid, mnt->gid, mnt->mode);
647}
648
Dylan Reid2bd9ea92016-04-07 20:57:47 -0700649/* Fork and exec the setfiles command to configure the selinux policy. */
Dylan Reide040c6b2016-05-02 18:49:02 -0700650static int run_setfiles_command(const struct container *c,
651 const struct container_config *config,
Yusuke Sato91f11f02016-12-02 16:15:13 -0800652 char *const *destinations, size_t num_destinations)
Dylan Reid2bd9ea92016-04-07 20:57:47 -0700653{
654 int rc;
655 int status;
656 int pid;
657 char *context_path;
658
Dylan Reide040c6b2016-05-02 18:49:02 -0700659 if (!config->run_setfiles)
Dylan Reid2bd9ea92016-04-07 20:57:47 -0700660 return 0;
661
662 if (asprintf(&context_path, "%s/file_contexts",
663 c->runfsroot) < 0)
664 return -errno;
665
666 pid = fork();
667 if (pid == 0) {
Yusuke Sato91f11f02016-12-02 16:15:13 -0800668 size_t i;
669 size_t arg_index = 0;
670 const char *argv[MAX_NUM_SETFILES_ARGS];
Dylan Reid2bd9ea92016-04-07 20:57:47 -0700671 const char *env[] = {
672 NULL,
673 };
674
Yusuke Sato91f11f02016-12-02 16:15:13 -0800675 argv[arg_index++] = config->run_setfiles;
676 argv[arg_index++] = "-r";
677 argv[arg_index++] = c->runfsroot;
678 argv[arg_index++] = context_path;
679 if (arg_index + num_destinations >= MAX_NUM_SETFILES_ARGS)
680 _exit(-E2BIG);
681 for (i = 0; i < num_destinations; ++i) {
682 argv[arg_index++] = destinations[i];
683 }
684 argv[arg_index] = NULL;
685
Dylan Reid2bd9ea92016-04-07 20:57:47 -0700686 execve(argv[0], (char *const*)argv, (char *const*)env);
687
688 /* Command failed to exec if execve returns. */
689 _exit(-errno);
690 }
691 free(context_path);
692 if (pid < 0)
693 return -errno;
694 do {
695 rc = waitpid(pid, &status, 0);
696 } while (rc == -1 && errno == EINTR);
697 if (rc < 0)
698 return -errno;
699 return status;
700}
701
Mike Frysinger412dbd22017-01-06 01:50:34 -0500702/* Find a free loop device and attach it. */
703static int loopdev_setup(char **loopdev_ret, const char *source)
704{
705 int ret = 0;
706 int source_fd = -1;
707 int control_fd = -1;
708 int loop_fd = -1;
709 char *loopdev = NULL;
710
711 source_fd = open(source, O_RDONLY|O_CLOEXEC);
712 if (source_fd < 0)
713 goto error;
714
715 control_fd = open(loopdev_ctl, O_RDWR|O_NOFOLLOW|O_CLOEXEC);
716 if (control_fd < 0)
717 goto error;
718
719 while (1) {
720 int num = ioctl(control_fd, LOOP_CTL_GET_FREE);
721 if (num < 0)
722 goto error;
723
724 if (asprintf(&loopdev, "/dev/loop%i", num) < 0)
725 goto error;
726
727 loop_fd = open(loopdev, O_RDONLY|O_NOFOLLOW|O_CLOEXEC);
728 if (loop_fd < 0)
729 goto error;
730
731 if (ioctl(loop_fd, LOOP_SET_FD, source_fd) == 0)
732 break;
733
734 if (errno != EBUSY)
735 goto error;
736
737 /* Clean up resources for the next pass. */
738 free(loopdev);
739 close(loop_fd);
740 }
741
742 *loopdev_ret = loopdev;
743 goto exit;
744
745error:
746 ret = -errno;
747 free(loopdev);
748exit:
749 if (source_fd != -1)
750 close(source_fd);
751 if (control_fd != -1)
752 close(control_fd);
753 if (loop_fd != -1)
754 close(loop_fd);
755 return ret;
756}
757
/*
 * Detach the specified loop device.
 *
 * Returns 0 on success, or -errno of the open() or LOOP_CLR_FD ioctl that
 * failed.
 */
static int loopdev_detach(const char *loopdev)
{
	int status = 0;
	int loop_fd = open(loopdev, O_RDONLY|O_NOFOLLOW|O_CLOEXEC);

	if (loop_fd < 0)
		return -errno;

	/* Record the failure before close() gets a chance to touch errno. */
	if (ioctl(loop_fd, LOOP_CLR_FD) < 0)
		status = -errno;

	close(loop_fd);
	return status;
}
779
/*
 * Create a new device mapper target for the source.
 *
 * The target name is "cros-containers-" followed by |source| with every '/'
 * replaced by '_'. Every "@DEV@" in |verity_cmdline| is replaced by |source|;
 * the first three fields of the result (start, size, target type) are parsed
 * and the remainder is passed on as the target parameters.
 *
 * On success returns 0 and stores the heap-allocated node path and target
 * name in |*dm_path_ret| and |*dm_name_ret| (the caller frees both). On
 * failure returns a negative errno and leaves both outputs untouched.
 *
 * When built without USE_device_mapper this is a no-op that returns 0 and
 * does not touch the outputs.
 */
static int dm_setup(char **dm_path_ret, char **dm_name_ret, const char *source,
		    const char *verity_cmdline)
{
	int ret = 0;
#if USE_device_mapper
	static const char placeholder[] = "@DEV@";
	const size_t placeholder_len = sizeof(placeholder) - 1;
	const char *scan;
	char *p;
	char *dm_path = NULL;
	char *dm_name = NULL;
	char *verity = NULL;
	struct dm_task *dmt = NULL;
	uint32_t cookie = 0;
	size_t source_len;
	size_t num_placeholders = 0;
	char ttype[20];
	unsigned long long start, size;
	int n = -1;

	/* Normalize the name into something unique-esque. */
	if (asprintf(&dm_name, "cros-containers-%s", source) < 0) {
		/* The pointer is undefined after a failed asprintf(). */
		dm_name = NULL;
		errno = ENOMEM;
		goto error;
	}
	p = dm_name;
	while ((p = strchr(p, '/')) != NULL)
		*p++ = '_';

	/* Get the /dev path for the higher levels to mount. */
	if (asprintf(&dm_path, "%s%s", dm_dev_prefix, dm_name) < 0) {
		dm_path = NULL;
		errno = ENOMEM;
		goto error;
	}

	/*
	 * Insert the source path in the verity command line. Count the
	 * placeholders first so the buffer fits however many there are.
	 */
	source_len = strlen(source);
	for (scan = strstr(verity_cmdline, placeholder); scan != NULL;
	     scan = strstr(scan + placeholder_len, placeholder))
		num_placeholders++;

	verity = malloc(strlen(verity_cmdline) +
			num_placeholders * source_len + 1);
	if (verity == NULL) {
		errno = ENOMEM;
		goto error;
	}
	strcpy(verity, verity_cmdline);
	p = verity;
	while ((p = strstr(p, placeholder)) != NULL) {
		memmove(p + source_len, p + placeholder_len,
			strlen(p + placeholder_len) + 1);
		memcpy(p, source, source_len);
		/*
		 * Resume after the inserted text so a source that itself
		 * contains the placeholder cannot cause an endless loop.
		 */
		p += source_len;
	}

	/* Extract the first three parameters for dm-verity settings. */
	if (sscanf(verity, "%llu %llu %10s %n", &start, &size, ttype, &n) != 3 ||
	    n < 0) {
		errno = EINVAL;
		goto error;
	}

	/* Finally create the device mapper. */
	dmt = dm_task_create(DM_DEVICE_CREATE);
	if (dmt == NULL)
		goto error;

	if (!dm_task_set_name(dmt, dm_name))
		goto error;

	if (!dm_task_set_ro(dmt))
		goto error;

	if (!dm_task_add_target(dmt, start, size, ttype, verity + n))
		goto error;

	if (!dm_task_set_cookie(dmt, &cookie, 0))
		goto error;

	if (!dm_task_run(dmt))
		goto error;

	/* Make sure the node exists before we continue. */
	dm_udev_wait(cookie);

	*dm_path_ret = dm_path;
	*dm_name_ret = dm_name;
	goto exit;

error:
	/* A failing call may leave errno at 0; never report that as success. */
	ret = errno ? -errno : -EIO;
	free(dm_name);
	free(dm_path);
exit:
	free(verity);
	if (dmt)
		dm_task_destroy(dmt);
#endif
	return ret;
}
858
/*
 * Tear down the device mapper target.
 *
 * Returns 0 on success or a negative errno on failure. When built without
 * USE_device_mapper this is a no-op that returns 0.
 */
static int dm_detach(const char *dm_name)
{
	int ret = 0;
#if USE_device_mapper
	struct dm_task *dmt;

	dmt = dm_task_create(DM_DEVICE_REMOVE);
	/* No task was created, so there is nothing to destroy either. */
	if (dmt == NULL)
		return errno ? -errno : -EIO;

	/* A failing call may leave errno at 0; never report that as success. */
	if (!dm_task_set_name(dmt, dm_name) || !dm_task_run(dmt))
		ret = errno ? -errno : -EIO;

	dm_task_destroy(dmt);
#endif
	return ret;
}
885
Dylan Reide040c6b2016-05-02 18:49:02 -0700886/*
887 * Unmounts anything we mounted in this mount namespace in the opposite order
888 * that they were mounted.
889 */
890static int unmount_external_mounts(struct container *c)
891{
892 int ret = 0;
893
894 while (c->num_ext_mounts) {
895 c->num_ext_mounts--;
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700896 if (!c->ext_mounts[c->num_ext_mounts])
897 continue;
Dylan Reide040c6b2016-05-02 18:49:02 -0700898 if (umount(c->ext_mounts[c->num_ext_mounts]))
899 ret = -errno;
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700900 FREE_AND_NULL(c->ext_mounts[c->num_ext_mounts]);
Dylan Reide040c6b2016-05-02 18:49:02 -0700901 }
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700902 FREE_AND_NULL(c->ext_mounts);
Mike Frysinger412dbd22017-01-06 01:50:34 -0500903
904 while (c->num_loopdevs) {
905 c->num_loopdevs--;
906 if (loopdev_detach(c->loopdevs[c->num_loopdevs]))
907 ret = -errno;
908 FREE_AND_NULL(c->loopdevs[c->num_loopdevs]);
909 }
910 FREE_AND_NULL(c->loopdevs);
911
Mike Frysinger05e594e2017-01-10 02:11:08 -0500912 while (c->num_device_mappers) {
913 c->num_device_mappers--;
914 if (dm_detach(c->device_mappers[c->num_device_mappers]))
915 ret = -errno;
916 FREE_AND_NULL(c->device_mappers[c->num_device_mappers]);
917 }
918 FREE_AND_NULL(c->device_mappers);
919
Dylan Reide040c6b2016-05-02 18:49:02 -0700920 return ret;
921}
922
/*
 * Match mount_one in minijail, mount one mountpoint with
 * consideration for combination of MS_BIND/MS_RDONLY flag.
 *
 * Returns 0 on success, or -1 (with errno set by mount()) on failure.
 */
static int mount_external(const char *src, const char *dest, const char *type,
			  unsigned long flags, const void *data)
{
	/*
	 * R/O bind mounts have to be remounted since 'bind' and 'ro'
	 * can't both be specified in the original bind mount.
	 */
	const int ro_bind = (flags & MS_BIND) && (flags & MS_RDONLY);
	const unsigned long initial_flags =
		ro_bind ? (flags & ~MS_RDONLY) : flags;

	if (mount(src, dest, type, initial_flags, data) == -1)
		return -1;

	if (!ro_bind)
		return 0;

	/* Remount R/O after the initial mount. */
	if (mount(src, dest, NULL, initial_flags | MS_RDONLY | MS_REMOUNT,
		  data) == -1)
		return -1;

	return 0;
}
953
Luis Hector Chavez3341ed62016-06-06 08:04:04 -0700954static int do_container_mount(struct container *c,
955 const struct container_mount *mnt)
956{
Mike Frysinger05e594e2017-01-10 02:11:08 -0500957 char *dm_source = NULL;
Mike Frysinger412dbd22017-01-06 01:50:34 -0500958 char *loop_source = NULL;
Luis Hector Chavez3341ed62016-06-06 08:04:04 -0700959 char *source = NULL;
960 char *dest = NULL;
961 int rc = 0;
962
963 if (asprintf(&dest, "%s%s", c->runfsroot, mnt->destination) < 0)
964 return -errno;
965
966 /*
967 * If it's a bind mount relative to rootfs, append source to
968 * rootfs path, otherwise source path is absolute.
969 */
970 if ((mnt->flags & MS_BIND) && mnt->source[0] != '/') {
971 if (asprintf(&source, "%s/%s", c->runfsroot, mnt->source) < 0)
972 goto error_free_return;
Mike Frysingerb22acdf2017-01-08 02:02:35 -0500973 } else if (mnt->loopback && mnt->source[0] != '/' && c->config_root) {
974 if (asprintf(&source, "%s/%s", c->config_root, mnt->source) < 0)
975 goto error_free_return;
Luis Hector Chavez3341ed62016-06-06 08:04:04 -0700976 } else {
977 if (asprintf(&source, "%s", mnt->source) < 0)
978 goto error_free_return;
979 }
980
981 if (mnt->create) {
982 rc = setup_mount_destination(mnt, source, dest);
983 if (rc)
984 goto error_free_return;
985 }
Mike Frysinger412dbd22017-01-06 01:50:34 -0500986 if (mnt->loopback) {
987 /* Record this loopback file for cleanup later. */
988 loop_source = source;
989 source = NULL;
990 rc = loopdev_setup(&source, loop_source);
991 if (rc)
992 goto error_free_return;
993
Mike Frysinger05e594e2017-01-10 02:11:08 -0500994 /* Save this to cleanup when shutting down. */
Mike Frysinger412dbd22017-01-06 01:50:34 -0500995 rc = strdup_and_free(&c->loopdevs[c->num_loopdevs], source);
996 if (rc)
997 goto error_free_return;
998 c->num_loopdevs++;
999 }
Mike Frysinger05e594e2017-01-10 02:11:08 -05001000 if (mnt->verity) {
1001 /* Set this device up via dm-verity. */
1002 char *dm_name;
1003 dm_source = source;
1004 source = NULL;
1005 rc = dm_setup(&source, &dm_name, dm_source, mnt->verity);
1006 if (rc)
1007 goto error_free_return;
1008
1009 /* Save this to cleanup when shutting down. */
1010 rc = strdup_and_free(&c->device_mappers[c->num_device_mappers],
1011 dm_name);
1012 free(dm_name);
1013 if (rc)
1014 goto error_free_return;
1015 c->num_device_mappers++;
1016 }
Luis Hector Chavez3341ed62016-06-06 08:04:04 -07001017 if (mnt->mount_in_ns) {
1018 /* We can mount this with minijail. */
Dylan Reid36b9c012016-06-24 18:27:08 -07001019 rc = minijail_mount_with_data(c->jail, source, mnt->destination,
1020 mnt->type, mnt->flags, mnt->data);
Luis Hector Chavez3341ed62016-06-06 08:04:04 -07001021 if (rc)
1022 goto error_free_return;
1023 } else {
1024 /* Mount this externally and unmount it on exit. */
Junichi Uekawa5d272772016-07-21 16:07:19 +09001025 if (mount_external(source, dest, mnt->type, mnt->flags,
1026 mnt->data))
Luis Hector Chavez3341ed62016-06-06 08:04:04 -07001027 goto error_free_return;
1028 /* Save this to unmount when shutting down. */
Luis Hector Chavez479b95f2016-06-06 08:01:05 -07001029 rc = strdup_and_free(&c->ext_mounts[c->num_ext_mounts], dest);
1030 if (rc)
Luis Hector Chavez3341ed62016-06-06 08:04:04 -07001031 goto error_free_return;
1032 c->num_ext_mounts++;
1033 }
1034
1035 goto exit;
1036
1037error_free_return:
1038 if (!rc)
1039 rc = -errno;
1040exit:
Mike Frysinger05e594e2017-01-10 02:11:08 -05001041 free(dm_source);
Mike Frysinger412dbd22017-01-06 01:50:34 -05001042 free(loop_source);
Luis Hector Chavez3341ed62016-06-06 08:04:04 -07001043 free(source);
1044 free(dest);
1045 return rc;
1046}
1047
Dylan Reide040c6b2016-05-02 18:49:02 -07001048static int do_container_mounts(struct container *c,
1049 const struct container_config *config)
Dylan Reid7daf9982016-04-28 16:55:42 -07001050{
1051 unsigned int i;
Luis Hector Chavez8e7b6d52016-06-02 20:40:43 -07001052 int rc = 0;
Dylan Reid7daf9982016-04-28 16:55:42 -07001053
Luis Hector Chavez479b95f2016-06-06 08:01:05 -07001054 unmount_external_mounts(c);
Dylan Reide040c6b2016-05-02 18:49:02 -07001055 /*
1056 * Allocate space to track anything we mount in our mount namespace.
1057 * This over-allocates as it has space for all mounts.
1058 */
1059 c->ext_mounts = calloc(config->num_mounts, sizeof(*c->ext_mounts));
1060 if (!c->ext_mounts)
1061 return -errno;
Mike Frysinger412dbd22017-01-06 01:50:34 -05001062 c->loopdevs = calloc(config->num_mounts, sizeof(*c->loopdevs));
1063 if (!c->loopdevs)
1064 return -errno;
Mike Frysinger05e594e2017-01-10 02:11:08 -05001065 c->device_mappers = calloc(config->num_mounts, sizeof(*c->device_mappers));
1066 if (!c->device_mappers)
1067 return -errno;
Dylan Reide040c6b2016-05-02 18:49:02 -07001068
1069 for (i = 0; i < config->num_mounts; ++i) {
Luis Hector Chavez3341ed62016-06-06 08:04:04 -07001070 rc = do_container_mount(c, &config->mounts[i]);
1071 if (rc)
1072 goto error_free_return;
Dylan Reid7daf9982016-04-28 16:55:42 -07001073 }
Luis Hector Chavez479b95f2016-06-06 08:01:05 -07001074
Dylan Reid7daf9982016-04-28 16:55:42 -07001075 return 0;
Dylan Reid2149be92016-04-28 18:38:57 -07001076
1077error_free_return:
Dylan Reide040c6b2016-05-02 18:49:02 -07001078 unmount_external_mounts(c);
Luis Hector Chavez8e7b6d52016-06-02 20:40:43 -07001079 return rc;
Dylan Reid7daf9982016-04-28 16:55:42 -07001080}
1081
Luis Hector Chavez479b95f2016-06-06 08:01:05 -07001082static int container_create_device(const struct container *c,
1083 const struct container_device *dev,
1084 int minor)
1085{
1086 char *path = NULL;
1087 int rc = 0;
1088 int mode;
1089
1090 switch (dev->type) {
1091 case 'b':
1092 mode = S_IFBLK;
1093 break;
1094 case 'c':
1095 mode = S_IFCHR;
1096 break;
1097 default:
1098 return -EINVAL;
1099 }
1100 mode |= dev->fs_permissions;
1101
1102 if (asprintf(&path, "%s%s", c->runfsroot, dev->path) < 0)
1103 goto error_free_return;
1104 if (mknod(path, mode, makedev(dev->major, minor)) && errno != EEXIST)
1105 goto error_free_return;
1106 if (chown(path, dev->uid, dev->gid))
1107 goto error_free_return;
1108 if (chmod(path, dev->fs_permissions))
1109 goto error_free_return;
1110
1111 goto exit;
1112
1113error_free_return:
1114 rc = -errno;
1115exit:
1116 free(path);
1117 return rc;
1118}
1119
Keshav Santhanam0e4c3282016-07-14 10:25:16 -07001120static int mount_runfs(struct container *c, const struct container_config *config)
Dylan Reid837c74a2016-01-22 17:25:21 -08001121{
Dylan Reidb3621832016-03-24 10:24:57 -07001122 static const mode_t root_dir_mode = 0660;
Dylan Reide040c6b2016-05-02 18:49:02 -07001123 const char *rootfs = config->rootfs;
Luis Hector Chavez479b95f2016-06-06 08:01:05 -07001124 char *runfs_template = NULL;
Dylan Reid837c74a2016-01-22 17:25:21 -08001125
Keshav Santhanam0e4c3282016-07-14 10:25:16 -07001126 if (asprintf(&runfs_template, "%s/%s_XXXXXX", c->rundir, c->name) < 0)
1127 return -ENOMEM;
1128
1129 c->runfs = mkdtemp(runfs_template);
1130 if (!c->runfs) {
1131 free(runfs_template);
1132 return -errno;
1133 }
1134
1135 /* Make sure the container uid can access the rootfs. */
1136 if (chmod(c->runfs, 0700))
1137 return -errno;
1138 if (chown(c->runfs, config->uid, config->gid))
1139 return -errno;
1140
1141 if (asprintf(&c->runfsroot, "%s/root", c->runfs) < 0)
1142 return -errno;
1143
1144 if (mkdir(c->runfsroot, root_dir_mode))
1145 return -errno;
1146 if (chmod(c->runfsroot, root_dir_mode))
1147 return -errno;
1148
Luis Hector Chavezc240e7e2016-09-22 10:33:03 -07001149 if (mount(rootfs, c->runfsroot, "", MS_BIND, NULL))
Keshav Santhanam0e4c3282016-07-14 10:25:16 -07001150 return -errno;
1151
Luis Hector Chavezc240e7e2016-09-22 10:33:03 -07001152 /* MS_BIND ignores any flags passed to it (except MS_REC). We need a
1153 * second call to mount() to actually set them.
1154 */
1155 if (config->rootfs_mount_flags &&
1156 mount(rootfs, c->runfsroot, "",
1157 config->rootfs_mount_flags, NULL)) {
1158 return -errno;
1159 }
1160
Keshav Santhanam0e4c3282016-07-14 10:25:16 -07001161 return 0;
1162}
1163
/*
 * Translates |id| from outside the user namespace to the id it has inside,
 * according to |map|.
 *
 * |map| is a comma separated list of "inside outside length" entries.  The
 * first entry whose range [outside, outside + length) contains |id| is used.
 *
 * Returns the translated id (>= 0) on success.  On failure returns a negative
 * errno-style value and also sets errno to match: EINVAL if |map| is NULL,
 * malformed, contains negative numbers, or does not cover |id|; ERANGE if a
 * number in |map| does not fit.
 */
static int get_userns_id(const char *map, int id)
{
	const char *cursor = map;

	if (!map)
		goto invalid;

	for (;;) {
		long field[3]; /* inside, outside, length */
		long long offset;
		int k;

		/* Skip the separators between (or in front of) entries. */
		while (*cursor == ',')
			cursor++;
		if (*cursor == '\0')
			break;

		for (k = 0; k < 3; k++) {
			char *end;

			errno = 0;
			field[k] = strtol(cursor, &end, 10);
			if (errno)
				return -errno;
			/* A missing number or a negative one is malformed. */
			if (end == cursor || field[k] < 0)
				goto invalid;
			/* The value must survive being stored in an int. */
			if ((long)(int)field[k] != field[k]) {
				errno = ERANGE;
				return -ERANGE;
			}
			cursor = end;
		}

		/* |length| counts ids, so the upper bound is exclusive. */
		offset = (long long)id - field[1];
		if (offset >= 0 && offset < field[2]) {
			long long mapped = (long long)field[0] + offset;

			if ((long long)(int)mapped != mapped) {
				errno = ERANGE;
				return -ERANGE;
			}
			return (int)mapped;
		}

		/* Ignore anything trailing the entry up to the next comma. */
		while (*cursor != '\0' && *cursor != ',')
			cursor++;
	}

invalid:
	/* Callers may consult errno, so keep it in sync with the result. */
	errno = EINVAL;
	return -EINVAL;
}
1201
Keshav Santhanam0e4c3282016-07-14 10:25:16 -07001202int container_start(struct container *c, const struct container_config *config)
1203{
1204 int rc = 0;
1205 unsigned int i;
Keshav Santhanam36485ff2016-08-02 16:21:02 -07001206 int uid_userns, gid_userns;
Yusuke Sato91f11f02016-12-02 16:15:13 -08001207 char **destinations;
1208 size_t num_destinations;
Keshav Santhanam0e4c3282016-07-14 10:25:16 -07001209
Luis Hector Chavez479b95f2016-06-06 08:01:05 -07001210 if (!c)
1211 return -EINVAL;
Dylan Reide040c6b2016-05-02 18:49:02 -07001212 if (!config)
1213 return -EINVAL;
1214 if (!config->program_argv || !config->program_argv[0])
1215 return -EINVAL;
1216
Mike Frysingerb22acdf2017-01-08 02:02:35 -05001217 if (config->config_root) {
1218 c->config_root = strdup(config->config_root);
1219 if (!c->config_root) {
1220 rc = -ENOMEM;
1221 goto error_rmdir;
1222 }
1223 }
Keshav Santhanam0e4c3282016-07-14 10:25:16 -07001224 if (config->premounted_runfs) {
1225 c->runfs = NULL;
1226 c->runfsroot = strdup(config->premounted_runfs);
1227 if (!c->runfsroot) {
1228 rc = -ENOMEM;
1229 goto error_rmdir;
1230 }
1231 } else {
1232 rc = mount_runfs(c, config);
1233 if (rc)
1234 goto error_rmdir;
Dylan Reid837c74a2016-01-22 17:25:21 -08001235 }
Dylan Reid837c74a2016-01-22 17:25:21 -08001236
1237 c->jail = minijail_new();
Luis Hector Chavez479b95f2016-06-06 08:01:05 -07001238 if (!c->jail)
Luis Hector Chavez945af482016-06-03 08:39:34 -07001239 goto error_rmdir;
Dylan Reid837c74a2016-01-22 17:25:21 -08001240
Luis Hector Chavez8e7b6d52016-06-02 20:40:43 -07001241 rc = do_container_mounts(c, config);
1242 if (rc)
Dylan Reid7daf9982016-04-28 16:55:42 -07001243 goto error_rmdir;
Dylan Reid837c74a2016-01-22 17:25:21 -08001244
Dylan Reida9966422016-07-21 10:11:34 -07001245 c->cgroup = container_cgroup_new(c->name,
1246 "/sys/fs/cgroup",
1247 config->cgroup_parent,
Dmitry Torokhov14eef722016-09-27 16:40:37 -07001248 config->cgroup_owner,
1249 config->cgroup_group);
Dylan Reida9966422016-07-21 10:11:34 -07001250 if (!c->cgroup)
1251 goto error_rmdir;
1252
Keshav Santhanam268fa032016-07-14 09:59:24 -07001253 /* Must be root to modify device cgroup or mknod */
1254 if (getuid() == 0) {
1255 c->cgroup->ops->deny_all_devices(c->cgroup);
Dylan Reid837c74a2016-01-22 17:25:21 -08001256
Keshav Santhanam268fa032016-07-14 09:59:24 -07001257 for (i = 0; i < config->num_devices; i++) {
1258 const struct container_device *dev = &config->devices[i];
1259 int minor = dev->minor;
Dylan Reid837c74a2016-01-22 17:25:21 -08001260
Keshav Santhanam268fa032016-07-14 09:59:24 -07001261 if (dev->copy_minor) {
1262 struct stat st_buff;
1263 if (stat(dev->path, &st_buff) < 0)
1264 continue;
1265 /* Use the minor macro to extract the device number. */
1266 minor = minor(st_buff.st_rdev);
1267 }
1268 if (minor >= 0) {
1269 rc = container_create_device(c, dev, minor);
1270 if (rc)
1271 goto error_rmdir;
1272 }
1273
1274 rc = c->cgroup->ops->add_device(c->cgroup, dev->major,
1275 minor, dev->read_allowed,
1276 dev->write_allowed,
1277 dev->modify_allowed, dev->type);
Luis Hector Chavez479b95f2016-06-06 08:01:05 -07001278 if (rc)
Dylan Reid355d5e42016-04-29 16:53:31 -07001279 goto error_rmdir;
Dylan Reid837c74a2016-01-22 17:25:21 -08001280 }
Mike Frysinger412dbd22017-01-06 01:50:34 -05001281
1282 for (i = 0; i < c->num_loopdevs; ++i) {
1283 struct stat st;
1284
1285 if (stat(c->loopdevs[i], &st) < 0)
1286 goto error_rmdir;
1287 rc = c->cgroup->ops->add_device(c->cgroup, major(st.st_rdev),
1288 minor(st.st_rdev), 1, 0, 0, 'b');
1289 if (rc)
1290 goto error_rmdir;
1291 }
Dylan Reid837c74a2016-01-22 17:25:21 -08001292 }
1293
Dylan Reidd7229582016-04-27 17:08:40 -07001294 /* Potentailly run setfiles on mounts configured outside of the jail */
Yusuke Sato91f11f02016-12-02 16:15:13 -08001295 destinations = calloc(config->num_mounts, sizeof(char *));
1296 num_destinations = 0;
Dylan Reide040c6b2016-05-02 18:49:02 -07001297 for (i = 0; i < config->num_mounts; i++) {
1298 const struct container_mount *mnt = &config->mounts[i];
Yusuke Sato91f11f02016-12-02 16:15:13 -08001299 char* dest = mnt->destination;
Dylan Reidd7229582016-04-27 17:08:40 -07001300
1301 if (mnt->mount_in_ns)
1302 continue;
Junichi Uekawa5d272772016-07-21 16:07:19 +09001303 if (mnt->flags & MS_RDONLY)
1304 continue;
Yusuke Sato91f11f02016-12-02 16:15:13 -08001305
Yusuke Satod33db432016-12-05 16:24:37 -08001306 /* A hack to avoid setfiles on /data and /cache. */
1307 if (!strcmp(dest, "/data") || !strcmp(dest, "/cache"))
Yusuke Sato91f11f02016-12-02 16:15:13 -08001308 continue;
1309
1310 if (asprintf(&dest, "%s%s", c->runfsroot, mnt->destination) < 0) {
1311 size_t j;
1312 for (j = 0; j < num_destinations; ++j) {
1313 free(destinations[j]);
1314 }
1315 free(destinations);
Dylan Reidd7229582016-04-27 17:08:40 -07001316 goto error_rmdir;
Yusuke Sato91f11f02016-12-02 16:15:13 -08001317 }
1318
1319 destinations[num_destinations++] = dest;
Dylan Reidd7229582016-04-27 17:08:40 -07001320 }
Yusuke Sato91f11f02016-12-02 16:15:13 -08001321 if (num_destinations) {
1322 size_t i;
1323 rc = run_setfiles_command(c, config, destinations, num_destinations);
1324 for (i = 0; i < num_destinations; ++i) {
1325 free(destinations[i]);
1326 }
1327 }
1328 free(destinations);
1329 if (rc)
1330 goto error_rmdir;
Dylan Reidd7229582016-04-27 17:08:40 -07001331
Chinyue Chenfac909e2016-06-24 14:17:42 +08001332 /* Setup CPU cgroup params. */
1333 if (config->cpu_cgparams.shares) {
1334 rc = c->cgroup->ops->set_cpu_shares(
1335 c->cgroup, config->cpu_cgparams.shares);
1336 if (rc)
1337 goto error_rmdir;
1338 }
1339 if (config->cpu_cgparams.period) {
1340 rc = c->cgroup->ops->set_cpu_quota(
1341 c->cgroup, config->cpu_cgparams.quota);
1342 if (rc)
1343 goto error_rmdir;
1344 rc = c->cgroup->ops->set_cpu_period(
1345 c->cgroup, config->cpu_cgparams.period);
1346 if (rc)
1347 goto error_rmdir;
1348 }
1349 if (config->cpu_cgparams.rt_period) {
1350 rc = c->cgroup->ops->set_cpu_rt_runtime(
1351 c->cgroup, config->cpu_cgparams.rt_runtime);
1352 if (rc)
1353 goto error_rmdir;
1354 rc = c->cgroup->ops->set_cpu_rt_period(
1355 c->cgroup, config->cpu_cgparams.rt_period);
1356 if (rc)
1357 goto error_rmdir;
1358 }
1359
Dylan Reid837c74a2016-01-22 17:25:21 -08001360 /* Setup and start the container with libminijail. */
Keshav Santhanam0e4c3282016-07-14 10:25:16 -07001361 if (config->pid_file_path) {
1362 c->pid_file_path = strdup(config->pid_file_path);
1363 if (!c->pid_file_path) {
1364 rc = -ENOMEM;
1365 goto error_rmdir;
1366 }
1367 } else if (c->runfs) {
1368 if (asprintf(&c->pid_file_path, "%s/container.pid", c->runfs) < 0) {
1369 rc = -ENOMEM;
1370 goto error_rmdir;
1371 }
1372 }
1373
1374 if (c->pid_file_path)
1375 minijail_write_pid_file(c->jail, c->pid_file_path);
Dylan Reid837c74a2016-01-22 17:25:21 -08001376 minijail_reset_signal_mask(c->jail);
1377
1378 /* Setup container namespaces. */
1379 minijail_namespace_ipc(c->jail);
1380 minijail_namespace_vfs(c->jail);
Keshav Santhanam1b6bf672016-08-10 18:35:12 -07001381 if (!config->share_host_netns)
1382 minijail_namespace_net(c->jail);
Dylan Reid837c74a2016-01-22 17:25:21 -08001383 minijail_namespace_pids(c->jail);
Dylan Reid837c74a2016-01-22 17:25:21 -08001384 minijail_namespace_user(c->jail);
Mike Frysingerfbd60552017-01-03 17:28:48 -05001385 if (getuid() != 0)
1386 minijail_namespace_user_disable_setgroups(c->jail);
Dylan Reidc6ca1042016-07-11 15:03:27 -07001387 minijail_namespace_cgroups(c->jail);
Dylan Reide040c6b2016-05-02 18:49:02 -07001388 rc = minijail_uidmap(c->jail, config->uid_map);
Dylan Reid837c74a2016-01-22 17:25:21 -08001389 if (rc)
1390 goto error_rmdir;
Dylan Reide040c6b2016-05-02 18:49:02 -07001391 rc = minijail_gidmap(c->jail, config->gid_map);
Dylan Reid837c74a2016-01-22 17:25:21 -08001392 if (rc)
1393 goto error_rmdir;
Dylan Reid837c74a2016-01-22 17:25:21 -08001394
Keshav Santhanam36485ff2016-08-02 16:21:02 -07001395 /* Set the UID/GID inside the container if not 0. */
1396 uid_userns = get_userns_id(config->uid_map, config->uid);
1397 if (uid_userns < 0)
1398 goto error_rmdir;
1399 else if (uid_userns > 0)
1400 minijail_change_uid(c->jail, (uid_t) uid_userns);
1401 gid_userns = get_userns_id(config->gid_map, config->gid);
1402 if (gid_userns < 0)
1403 goto error_rmdir;
1404 else if (gid_userns > 0)
1405 minijail_change_gid(c->jail, (gid_t) gid_userns);
1406
Dylan Reid837c74a2016-01-22 17:25:21 -08001407 rc = minijail_enter_pivot_root(c->jail, c->runfsroot);
1408 if (rc)
1409 goto error_rmdir;
1410
1411 /* Add the cgroups configured above. */
Dmitry Torokhov0d253a62017-01-05 09:41:33 -08001412 for (i = 0; i < NUM_CGROUP_TYPES; i++) {
1413 if (c->cgroup->cgroup_tasks_paths[i]) {
1414 rc = minijail_add_to_cgroup(c->jail,
1415 c->cgroup->cgroup_tasks_paths[i]);
1416 if (rc)
1417 goto error_rmdir;
1418 }
1419 }
Dylan Reid837c74a2016-01-22 17:25:21 -08001420
Dylan Reide040c6b2016-05-02 18:49:02 -07001421 if (config->alt_syscall_table)
1422 minijail_use_alt_syscall(c->jail, config->alt_syscall_table);
Dylan Reid837c74a2016-01-22 17:25:21 -08001423
1424 minijail_run_as_init(c->jail);
1425
Dylan Reid3da683b2016-04-05 03:35:35 -07001426 /* TODO(dgreid) - remove this once shared mounts are cleaned up. */
1427 minijail_skip_remount_private(c->jail);
1428
Dylan Reidc4335842016-11-11 10:24:52 -08001429 if (!config->keep_fds_open)
1430 minijail_close_open_fds(c->jail);
Luis Hector Chaveze18e7d42016-10-12 07:35:32 -07001431
Dylan Reid837c74a2016-01-22 17:25:21 -08001432 rc = minijail_run_pid_pipes_no_preload(c->jail,
Dylan Reide040c6b2016-05-02 18:49:02 -07001433 config->program_argv[0],
1434 config->program_argv,
Dylan Reid837c74a2016-01-22 17:25:21 -08001435 &c->init_pid, NULL, NULL,
1436 NULL);
1437 if (rc)
1438 goto error_rmdir;
1439 return 0;
1440
1441error_rmdir:
Luis Hector Chavez945af482016-06-03 08:39:34 -07001442 if (!rc)
1443 rc = -errno;
1444 container_teardown(c);
Dylan Reid837c74a2016-01-22 17:25:21 -08001445 return rc;
1446}
1447
1448const char *container_root(struct container *c)
1449{
1450 return c->runfs;
1451}
1452
1453int container_pid(struct container *c)
1454{
1455 return c->init_pid;
1456}
1457
1458static int container_teardown(struct container *c)
1459{
Dylan Reid837c74a2016-01-22 17:25:21 -08001460 int ret = 0;
1461
Dylan Reide040c6b2016-05-02 18:49:02 -07001462 unmount_external_mounts(c);
Keshav Santhanam0e4c3282016-07-14 10:25:16 -07001463 if (c->runfsroot && c->runfs) {
Luis Hector Chavez945af482016-06-03 08:39:34 -07001464 if (umount(c->runfsroot))
1465 ret = -errno;
1466 if (rmdir(c->runfsroot))
1467 ret = -errno;
Luis Hector Chavez479b95f2016-06-06 08:01:05 -07001468 FREE_AND_NULL(c->runfsroot);
Luis Hector Chavez945af482016-06-03 08:39:34 -07001469 }
1470 if (c->pid_file_path) {
1471 if (unlink(c->pid_file_path))
1472 ret = -errno;
Luis Hector Chavez479b95f2016-06-06 08:01:05 -07001473 FREE_AND_NULL(c->pid_file_path);
Luis Hector Chavez945af482016-06-03 08:39:34 -07001474 }
1475 if (c->runfs) {
1476 if (rmdir(c->runfs))
1477 ret = -errno;
Luis Hector Chavez479b95f2016-06-06 08:01:05 -07001478 FREE_AND_NULL(c->runfs);
Luis Hector Chavez945af482016-06-03 08:39:34 -07001479 }
Dylan Reid837c74a2016-01-22 17:25:21 -08001480 return ret;
1481}
1482
1483int container_wait(struct container *c)
1484{
Dylan Reidcf745c52016-04-22 10:18:03 -07001485 int rc;
1486
1487 do {
1488 rc = minijail_wait(c->jail);
Luis Hector Chavez4641e852016-06-02 15:40:19 -07001489 } while (rc == -EINTR);
Dylan Reidcf745c52016-04-22 10:18:03 -07001490
Luis Hector Chavez945af482016-06-03 08:39:34 -07001491 // If the process had already been reaped, still perform teardown.
1492 if (rc == -ECHILD || rc >= 0) {
Dylan Reidcf745c52016-04-22 10:18:03 -07001493 rc = container_teardown(c);
Luis Hector Chavez945af482016-06-03 08:39:34 -07001494 }
Dylan Reidcf745c52016-04-22 10:18:03 -07001495 return rc;
Dylan Reid837c74a2016-01-22 17:25:21 -08001496}
1497
1498int container_kill(struct container *c)
1499{
Luis Hector Chavez945af482016-06-03 08:39:34 -07001500 if (kill(c->init_pid, SIGKILL) && errno != ESRCH)
Dylan Reid837c74a2016-01-22 17:25:21 -08001501 return -errno;
1502 return container_wait(c);
1503}