/* Copyright 2016 The Chromium OS Authors. All rights reserved.
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */
5
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#if USE_device_mapper
#include <libdevmapper.h>
#endif
#include <malloc.h>
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <syscall.h>
#include <unistd.h>

#include <libminijail.h>
#include <linux/loop.h>

#include "libcontainer/container_cgroup.h"
#include "libcontainer/libcontainer.h"
Dylan Reid837c74a2016-01-22 17:25:21 -080029
/*
 * Frees |ptr| and then resets it to nullptr so that it cannot be freed or
 * dereferenced again by accident.
 *
 * |ptr| must be a modifiable lvalue. It is expanded twice, so it must not
 * have side effects. The argument is parenthesized in the assignment so that
 * compound expressions (e.g. a conditional) are nulled as a whole.
 */
#define FREE_AND_NULL(ptr) \
  do {                     \
    free(ptr);             \
    (ptr) = nullptr;       \
  } while (0)
Luis Hector Chavez479b95f2016-06-06 08:01:05 -070035
/*
 * Presumably the maximum number of arguments passed to the setfiles command;
 * its use is outside this part of the file -- TODO confirm.
 */
#define MAX_NUM_SETFILES_ARGS 128

/* Capacity of container_config::rlimits. */
#define MAX_RLIMITS 32  // Linux defines 15 at the time of writing.

/* Path of the loop device control node. */
static const char loopdev_ctl[] = "/dev/loop-control";
#if USE_device_mapper
/* Directory prefix of device-mapper nodes. */
static const char dm_dev_prefix[] = "/dev/mapper/";
#endif

/* Defined later in the file; declared here so earlier code can call it. */
static int container_teardown(struct container* c);
Luis Hector Chavez945af482016-06-03 08:39:34 -070045
/*
 * Replaces the string owned by |*dest| with a heap-allocated copy of |src|.
 *
 * The previous value of |*dest| is released only after the copy has been
 * made, so on failure |*dest| is left untouched.
 *
 * Returns 0 on success, -EINVAL if |src| is null (strdup() on a null pointer
 * is undefined behaviour), or -ENOMEM if the copy cannot be allocated.
 */
static int strdup_and_free(char** dest, const char* src) {
  if (!src)
    return -EINVAL;

  char* copy = strdup(src);
  if (!copy)
    return -ENOMEM;

  /* free() accepts a null pointer, so no guard is needed. */
  free(*dest);
  *dest = copy;
  return 0;
}
55
/*
 * One filesystem to mount for the container. The string members are heap
 * copies made by container_config_add_mount(); |data| and |verity| are
 * optional and may be null.
 */
struct container_mount {
  char* name;
  char* source;
  char* destination;
  char* type;
  char* data;
  char* verity;
  int flags;
  int uid;
  int gid;
  int mode;
  /* True if mount should happen in new vfs ns. */
  int mount_in_ns;
  /* True if target should be created if it doesn't exist. */
  int create;
  /* True if target should be mounted via loopback. */
  int loopback;
};
71
/*
 * One device node to create for the container. |path| is a heap copy made by
 * container_config_add_device().
 */
struct container_device {
  /* 'c' or 'b' for char or block. */
  char type;
  char* path;
  int fs_permissions;
  int major;
  int minor;
  /* Copy the minor from existing node, ignores |minor|. */
  int copy_minor;
  int uid;
  int gid;
};
82
/* One device cgroup permission rule, as recorded by
 * container_config_add_cgroup_device().
 */
struct container_cgroup_device {
  int allow;
  char type;
  /* -1 means all. */
  int major;
  /* -1 means all. */
  int minor;
  int read;
  int write;
  int modify;
};
92
/*
 * CPU cgroup parameters. Filled in by container_config_set_cpu_shares(),
 * container_config_set_cpu_cfs_params() and
 * container_config_set_cpu_rt_params().
 */
struct container_cpu_cgroup {
  int shares;
  int quota;
  int period;
  int rt_runtime;
  int rt_period;
};
100
/* One resource limit entry recorded by container_config_add_rlimit(). */
struct container_rlimit {
  int type;
  uint32_t cur;
  uint32_t max;
};
106
Dylan Reid837c74a2016-01-22 17:25:21 -0800107/*
108 * Structure that configures how the container is run.
109 *
Mike Frysingerb22acdf2017-01-08 02:02:35 -0500110 * config_root - Path to the root of the container itself.
Dylan Reid837c74a2016-01-22 17:25:21 -0800111 * rootfs - Path to the root of the container's filesystem.
Luis Hector Chavezc240e7e2016-09-22 10:33:03 -0700112 * rootfs_mount_flags - Flags that will be passed to mount() for the rootfs.
Keshav Santhanam0e4c3282016-07-14 10:25:16 -0700113 * premounted_runfs - Path to where the container will be run.
114 * pid_file_path - Path to the file where the pid should be written.
Dylan Reid837c74a2016-01-22 17:25:21 -0800115 * program_argv - The program to run and args, e.g. "/sbin/init".
116 * num_args - Number of args in program_argv.
Dylan Reid1874feb2016-06-22 17:53:50 -0700117 * uid - The uid the container will run as.
Dylan Reid837c74a2016-01-22 17:25:21 -0800118 * uid_map - Mapping of UIDs in the container, e.g. "0 100000 1024"
Dylan Reid1874feb2016-06-22 17:53:50 -0700119 * gid - The gid the container will run as.
Dylan Reid837c74a2016-01-22 17:25:21 -0800120 * gid_map - Mapping of GIDs in the container, e.g. "0 100000 1024"
Luis Hector Chavez8b4b7a02017-09-15 08:06:06 -0700121 * alt_syscall_table - Syscall table to use or nullptr if none.
Dylan Reid837c74a2016-01-22 17:25:21 -0800122 * mounts - Filesystems to mount in the new namespace.
123 * num_mounts - Number of above.
124 * devices - Device nodes to create.
125 * num_devices - Number of above.
Dylan Reid4843d6b2017-03-31 18:14:30 -0700126 * cgroup_devices - Device node cgroup permissions.
127 * num_cgroup_devices - Number of above.
Dylan Reid2bd9ea92016-04-07 20:57:47 -0700128 * run_setfiles - Should run setfiles on mounts to enable selinux.
Chinyue Chenfac909e2016-06-24 14:17:42 +0800129 * cpu_cgparams - CPU cgroup params.
Dylan Reid9e724af2016-07-21 09:58:07 -0700130 * cgroup_parent - Parent dir for cgroup creation
131 * cgroup_owner - uid to own the created cgroups
Dmitry Torokhov14eef722016-09-27 16:40:37 -0700132 * cgroup_group - gid to own the created cgroups
Keshav Santhanam1b6bf672016-08-10 18:35:12 -0700133 * share_host_netns - Enable sharing of the host network namespace.
Dylan Reidc4335842016-11-11 10:24:52 -0800134 * keep_fds_open - Allow the child process to keep open FDs (for stdin/out/err).
Dylan Reid93fa4602017-06-06 13:39:31 -0700135 * rlimits - Array of rlimits for the contained process.
136 * num_rlimits - The number of elements in `rlimits`.
Luis Hector Chavezcd44ba72017-06-30 13:01:38 -0700137 * securebits_skip_mask - The mask of securebits to skip when restricting caps.
Luis Hector Chavezdac65c32017-07-21 10:30:23 -0700138 * do_init - Whether the container needs an extra process to be run as init.
Luis Hector Chavez15e8e672017-07-20 15:13:27 -0700139 * selinux_context - The SELinux context name the container will run under.
Luis Hector Chavezf8e8f4c2017-08-01 01:09:39 -0700140 * pre_start_hook - A function pointer to be called prior to calling execve(2).
141 * pre_start_hook_payload - Parameter that will be passed to pre_start_hook().
Dylan Reid837c74a2016-01-22 17:25:21 -0800142 */
143struct container_config {
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700144 char* config_root;
145 char* rootfs;
146 unsigned long rootfs_mount_flags;
147 char* premounted_runfs;
148 char* pid_file_path;
149 char** program_argv;
150 size_t num_args;
151 uid_t uid;
152 char* uid_map;
153 gid_t gid;
154 char* gid_map;
155 char* alt_syscall_table;
156 struct container_mount* mounts;
157 size_t num_mounts;
158 struct container_device* devices;
159 size_t num_devices;
160 struct container_cgroup_device* cgroup_devices;
161 size_t num_cgroup_devices;
162 char* run_setfiles;
163 struct container_cpu_cgroup cpu_cgparams;
164 char* cgroup_parent;
165 uid_t cgroup_owner;
166 gid_t cgroup_group;
167 int share_host_netns;
168 int keep_fds_open;
169 struct container_rlimit rlimits[MAX_RLIMITS];
170 int num_rlimits;
171 int use_capmask;
172 int use_capmask_ambient;
173 uint64_t capmask;
174 uint64_t securebits_skip_mask;
175 int do_init;
176 char* selinux_context;
177 minijail_hook_t pre_start_hook;
178 void* pre_start_hook_payload;
179 int* inherited_fds;
180 size_t inherited_fd_count;
Dylan Reid837c74a2016-01-22 17:25:21 -0800181};
182
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700183struct container_config* container_config_create() {
184 return reinterpret_cast<struct container_config*>(
185 calloc(1, sizeof(struct container_config)));
Dylan Reid837c74a2016-01-22 17:25:21 -0800186}
187
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700188static void container_free_program_args(struct container_config* c) {
189 unsigned int i;
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700190
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700191 if (!c->program_argv)
192 return;
193 for (i = 0; i < c->num_args; ++i) {
194 FREE_AND_NULL(c->program_argv[i]);
195 }
196 FREE_AND_NULL(c->program_argv);
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700197}
198
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700199static void container_config_free_mount(struct container_mount* mount) {
200 FREE_AND_NULL(mount->name);
201 FREE_AND_NULL(mount->source);
202 FREE_AND_NULL(mount->destination);
203 FREE_AND_NULL(mount->type);
204 FREE_AND_NULL(mount->data);
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700205}
206
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700207static void container_config_free_device(struct container_device* device) {
208 FREE_AND_NULL(device->path);
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700209}
210
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700211void container_config_destroy(struct container_config* c) {
212 size_t i;
Dylan Reid837c74a2016-01-22 17:25:21 -0800213
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700214 if (c == nullptr)
215 return;
216 FREE_AND_NULL(c->rootfs);
217 container_free_program_args(c);
218 FREE_AND_NULL(c->premounted_runfs);
219 FREE_AND_NULL(c->pid_file_path);
220 FREE_AND_NULL(c->uid_map);
221 FREE_AND_NULL(c->gid_map);
222 FREE_AND_NULL(c->alt_syscall_table);
223 for (i = 0; i < c->num_mounts; ++i) {
224 container_config_free_mount(&c->mounts[i]);
225 }
226 FREE_AND_NULL(c->mounts);
227 for (i = 0; i < c->num_devices; ++i) {
228 container_config_free_device(&c->devices[i]);
229 }
230 FREE_AND_NULL(c->devices);
231 FREE_AND_NULL(c->cgroup_devices);
232 FREE_AND_NULL(c->run_setfiles);
233 FREE_AND_NULL(c->cgroup_parent);
234 FREE_AND_NULL(c->selinux_context);
235 FREE_AND_NULL(c->inherited_fds);
236 FREE_AND_NULL(c);
Dylan Reid837c74a2016-01-22 17:25:21 -0800237}
238
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700239int container_config_config_root(struct container_config* c,
240 const char* config_root) {
241 return strdup_and_free(&c->config_root, config_root);
Mike Frysingerb22acdf2017-01-08 02:02:35 -0500242}
243
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700244const char* container_config_get_config_root(const struct container_config* c) {
245 return c->config_root;
Mike Frysingerb22acdf2017-01-08 02:02:35 -0500246}
247
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700248int container_config_rootfs(struct container_config* c, const char* rootfs) {
249 return strdup_and_free(&c->rootfs, rootfs);
Dylan Reid837c74a2016-01-22 17:25:21 -0800250}
251
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700252const char* container_config_get_rootfs(const struct container_config* c) {
253 return c->rootfs;
Dylan Reid11456722016-05-02 11:24:50 -0700254}
255
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700256void container_config_rootfs_mount_flags(struct container_config* c,
257 unsigned long rootfs_mount_flags) {
258 /* Since we are going to add MS_REMOUNT anyways, add it here so we can
259 * simply check against zero later. MS_BIND is also added to avoid
260 * re-mounting the original filesystem, since the rootfs is always
261 * bind-mounted.
262 */
263 c->rootfs_mount_flags = MS_REMOUNT | MS_BIND | rootfs_mount_flags;
Luis Hector Chavezc240e7e2016-09-22 10:33:03 -0700264}
265
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700266unsigned long container_config_get_rootfs_mount_flags(
267 const struct container_config* c) {
268 return c->rootfs_mount_flags;
Luis Hector Chavezc240e7e2016-09-22 10:33:03 -0700269}
270
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700271int container_config_premounted_runfs(struct container_config* c,
272 const char* runfs) {
273 return strdup_and_free(&c->premounted_runfs, runfs);
Keshav Santhanam0e4c3282016-07-14 10:25:16 -0700274}
275
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700276const char* container_config_get_premounted_runfs(
277 const struct container_config* c) {
278 return c->premounted_runfs;
Keshav Santhanam0e4c3282016-07-14 10:25:16 -0700279}
280
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700281int container_config_pid_file(struct container_config* c, const char* path) {
282 return strdup_and_free(&c->pid_file_path, path);
Keshav Santhanam0e4c3282016-07-14 10:25:16 -0700283}
284
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700285const char* container_config_get_pid_file(const struct container_config* c) {
286 return c->pid_file_path;
Keshav Santhanam0e4c3282016-07-14 10:25:16 -0700287}
288
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700289int container_config_program_argv(struct container_config* c,
290 const char** argv,
291 size_t num_args) {
292 size_t i;
Dylan Reid837c74a2016-01-22 17:25:21 -0800293
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700294 container_free_program_args(c);
295 c->num_args = num_args;
296 c->program_argv =
297 reinterpret_cast<char**>(calloc(num_args + 1, sizeof(char*)));
298 if (!c->program_argv)
299 return -ENOMEM;
300 for (i = 0; i < num_args; ++i) {
301 if (strdup_and_free(&c->program_argv[i], argv[i]))
302 goto error_free_return;
303 }
304 c->program_argv[num_args] = nullptr;
305 return 0;
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700306
307error_free_return:
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700308 container_free_program_args(c);
309 return -ENOMEM;
Dylan Reid837c74a2016-01-22 17:25:21 -0800310}
311
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700312size_t container_config_get_num_program_args(const struct container_config* c) {
313 return c->num_args;
Dylan Reid11456722016-05-02 11:24:50 -0700314}
315
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700316const char* container_config_get_program_arg(const struct container_config* c,
317 size_t index) {
318 if (index >= c->num_args)
319 return nullptr;
320 return c->program_argv[index];
Dylan Reid11456722016-05-02 11:24:50 -0700321}
322
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700323void container_config_uid(struct container_config* c, uid_t uid) {
324 c->uid = uid;
Dylan Reid1874feb2016-06-22 17:53:50 -0700325}
326
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700327uid_t container_config_get_uid(const struct container_config* c) {
328 return c->uid;
Dylan Reid1874feb2016-06-22 17:53:50 -0700329}
330
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700331int container_config_uid_map(struct container_config* c, const char* uid_map) {
332 return strdup_and_free(&c->uid_map, uid_map);
Dylan Reid837c74a2016-01-22 17:25:21 -0800333}
334
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700335void container_config_gid(struct container_config* c, gid_t gid) {
336 c->gid = gid;
Dylan Reid1874feb2016-06-22 17:53:50 -0700337}
338
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700339gid_t container_config_get_gid(const struct container_config* c) {
340 return c->gid;
Dylan Reid1874feb2016-06-22 17:53:50 -0700341}
342
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700343int container_config_gid_map(struct container_config* c, const char* gid_map) {
344 return strdup_and_free(&c->gid_map, gid_map);
Dylan Reid837c74a2016-01-22 17:25:21 -0800345}
346
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700347int container_config_alt_syscall_table(struct container_config* c,
348 const char* alt_syscall_table) {
349 return strdup_and_free(&c->alt_syscall_table, alt_syscall_table);
Dylan Reid837c74a2016-01-22 17:25:21 -0800350}
351
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700352int container_config_add_rlimit(struct container_config* c,
353 int type,
354 uint32_t cur,
355 uint32_t max) {
356 if (c->num_rlimits >= MAX_RLIMITS) {
357 return -ENOMEM;
358 }
359 c->rlimits[c->num_rlimits].type = type;
360 c->rlimits[c->num_rlimits].cur = cur;
361 c->rlimits[c->num_rlimits].max = max;
362 c->num_rlimits++;
363 return 0;
Dylan Reid93fa4602017-06-06 13:39:31 -0700364}
365
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700366int container_config_add_mount(struct container_config* c,
367 const char* name,
368 const char* source,
369 const char* destination,
370 const char* type,
371 const char* data,
372 const char* verity,
373 int flags,
374 int uid,
375 int gid,
376 int mode,
377 int mount_in_ns,
378 int create,
379 int loopback) {
380 struct container_mount* mount_ptr;
381 struct container_mount* current_mount;
Dylan Reid837c74a2016-01-22 17:25:21 -0800382
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700383 if (name == nullptr || source == nullptr || destination == nullptr ||
384 type == nullptr)
385 return -EINVAL;
Dylan Reid837c74a2016-01-22 17:25:21 -0800386
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700387 mount_ptr = reinterpret_cast<struct container_mount*>(
388 realloc(c->mounts, sizeof(c->mounts[0]) * (c->num_mounts + 1)));
389 if (!mount_ptr)
390 return -ENOMEM;
391 c->mounts = mount_ptr;
392 current_mount = &c->mounts[c->num_mounts];
393 memset(current_mount, 0, sizeof(struct container_mount));
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700394
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700395 if (strdup_and_free(&current_mount->name, name))
396 goto error_free_return;
397 if (strdup_and_free(&current_mount->source, source))
398 goto error_free_return;
399 if (strdup_and_free(&current_mount->destination, destination))
400 goto error_free_return;
401 if (strdup_and_free(&current_mount->type, type))
402 goto error_free_return;
403 if (data && strdup_and_free(&current_mount->data, data))
404 goto error_free_return;
405 if (verity && strdup_and_free(&current_mount->verity, verity))
406 goto error_free_return;
407 current_mount->flags = flags;
408 current_mount->uid = uid;
409 current_mount->gid = gid;
410 current_mount->mode = mode;
411 current_mount->mount_in_ns = mount_in_ns;
412 current_mount->create = create;
413 current_mount->loopback = loopback;
414 ++c->num_mounts;
415 return 0;
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700416
417error_free_return:
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700418 container_config_free_mount(current_mount);
419 return -ENOMEM;
Dylan Reid837c74a2016-01-22 17:25:21 -0800420}
421
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700422int container_config_add_cgroup_device(struct container_config* c,
423 int allow,
424 char type,
425 int major,
426 int minor,
427 int read,
428 int write,
429 int modify) {
430 struct container_cgroup_device* dev_ptr;
431 struct container_cgroup_device* current_dev;
Dylan Reid4843d6b2017-03-31 18:14:30 -0700432
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700433 dev_ptr = reinterpret_cast<struct container_cgroup_device*>(
434 realloc(c->cgroup_devices,
435 sizeof(c->cgroup_devices[0]) * (c->num_cgroup_devices + 1)));
436 if (!dev_ptr)
437 return -ENOMEM;
438 c->cgroup_devices = dev_ptr;
Dylan Reid4843d6b2017-03-31 18:14:30 -0700439
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700440 current_dev = &c->cgroup_devices[c->num_cgroup_devices];
441 memset(current_dev, 0, sizeof(struct container_cgroup_device));
442 current_dev->allow = allow;
443 current_dev->type = type;
444 current_dev->major = major;
445 current_dev->minor = minor;
446 current_dev->read = read;
447 current_dev->write = write;
448 current_dev->modify = modify;
449 ++c->num_cgroup_devices;
Dylan Reid4843d6b2017-03-31 18:14:30 -0700450
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700451 return 0;
Dylan Reid4843d6b2017-03-31 18:14:30 -0700452}
453
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700454int container_config_add_device(struct container_config* c,
455 char type,
456 const char* path,
457 int fs_permissions,
458 int major,
459 int minor,
460 int copy_minor,
461 int uid,
462 int gid,
463 int read_allowed,
464 int write_allowed,
465 int modify_allowed) {
466 struct container_device* dev_ptr;
467 struct container_device* current_dev;
Dylan Reid837c74a2016-01-22 17:25:21 -0800468
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700469 if (path == nullptr)
470 return -EINVAL;
471 /* If using a dynamic minor number, ensure that minor is -1. */
472 if (copy_minor && (minor != -1))
473 return -EINVAL;
Dylan Reid355d5e42016-04-29 16:53:31 -0700474
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700475 dev_ptr = reinterpret_cast<struct container_device*>(
476 realloc(c->devices, sizeof(c->devices[0]) * (c->num_devices + 1)));
477 if (!dev_ptr)
478 return -ENOMEM;
479 c->devices = dev_ptr;
480 current_dev = &c->devices[c->num_devices];
481 memset(current_dev, 0, sizeof(struct container_device));
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700482
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700483 current_dev->type = type;
484 if (strdup_and_free(&current_dev->path, path))
485 goto error_free_return;
486 current_dev->fs_permissions = fs_permissions;
487 current_dev->major = major;
488 current_dev->minor = minor;
489 current_dev->copy_minor = copy_minor;
490 current_dev->uid = uid;
491 current_dev->gid = gid;
492 if (read_allowed || write_allowed || modify_allowed) {
493 if (container_config_add_cgroup_device(c,
494 1,
495 type,
496 major,
497 minor,
498 read_allowed,
499 write_allowed,
500 modify_allowed))
501 goto error_free_return;
502 }
503 ++c->num_devices;
504 return 0;
Luis Hector Chavez479b95f2016-06-06 08:01:05 -0700505
506error_free_return:
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700507 container_config_free_device(current_dev);
508 return -ENOMEM;
Dylan Reid837c74a2016-01-22 17:25:21 -0800509}
510
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700511int container_config_run_setfiles(struct container_config* c,
512 const char* setfiles_cmd) {
513 return strdup_and_free(&c->run_setfiles, setfiles_cmd);
Dylan Reid2bd9ea92016-04-07 20:57:47 -0700514}
Dylan Reid837c74a2016-01-22 17:25:21 -0800515
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700516const char* container_config_get_run_setfiles(
517 const struct container_config* c) {
518 return c->run_setfiles;
Dylan Reid11456722016-05-02 11:24:50 -0700519}
520
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700521int container_config_set_cpu_shares(struct container_config* c, int shares) {
522 /* CPU shares must be 2 or higher. */
523 if (shares < 2)
524 return -EINVAL;
Chinyue Chenfac909e2016-06-24 14:17:42 +0800525
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700526 c->cpu_cgparams.shares = shares;
527 return 0;
Chinyue Chenfac909e2016-06-24 14:17:42 +0800528}
529
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700530int container_config_set_cpu_cfs_params(struct container_config* c,
531 int quota,
532 int period) {
533 /*
534 * quota could be set higher than period to utilize more than one CPU.
535 * quota could also be set as -1 to indicate the cgroup does not adhere
536 * to any CPU time restrictions.
537 */
538 if (quota <= 0 && quota != -1)
539 return -EINVAL;
540 if (period <= 0)
541 return -EINVAL;
Chinyue Chenfac909e2016-06-24 14:17:42 +0800542
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700543 c->cpu_cgparams.quota = quota;
544 c->cpu_cgparams.period = period;
545 return 0;
Chinyue Chenfac909e2016-06-24 14:17:42 +0800546}
547
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700548int container_config_set_cpu_rt_params(struct container_config* c,
549 int rt_runtime,
550 int rt_period) {
551 /*
552 * rt_runtime could be set as 0 to prevent the cgroup from using
553 * realtime CPU.
554 */
555 if (rt_runtime < 0 || rt_runtime >= rt_period)
556 return -EINVAL;
Chinyue Chenfac909e2016-06-24 14:17:42 +0800557
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700558 c->cpu_cgparams.rt_runtime = rt_runtime;
559 c->cpu_cgparams.rt_period = rt_period;
560 return 0;
Chinyue Chenfac909e2016-06-24 14:17:42 +0800561}
562
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700563int container_config_get_cpu_shares(struct container_config* c) {
564 return c->cpu_cgparams.shares;
Chinyue Chen4f3fd682016-07-01 14:11:42 +0800565}
566
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700567int container_config_get_cpu_quota(struct container_config* c) {
568 return c->cpu_cgparams.quota;
Chinyue Chen4f3fd682016-07-01 14:11:42 +0800569}
570
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700571int container_config_get_cpu_period(struct container_config* c) {
572 return c->cpu_cgparams.period;
Chinyue Chen4f3fd682016-07-01 14:11:42 +0800573}
574
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700575int container_config_get_cpu_rt_runtime(struct container_config* c) {
576 return c->cpu_cgparams.rt_runtime;
Chinyue Chen4f3fd682016-07-01 14:11:42 +0800577}
578
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700579int container_config_get_cpu_rt_period(struct container_config* c) {
580 return c->cpu_cgparams.rt_period;
Chinyue Chen4f3fd682016-07-01 14:11:42 +0800581}
582
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700583int container_config_set_cgroup_parent(struct container_config* c,
584 const char* parent,
585 uid_t cgroup_owner,
586 gid_t cgroup_group) {
587 c->cgroup_owner = cgroup_owner;
588 c->cgroup_group = cgroup_group;
589 return strdup_and_free(&c->cgroup_parent, parent);
Dylan Reid9e724af2016-07-21 09:58:07 -0700590}
591
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700592const char* container_config_get_cgroup_parent(struct container_config* c) {
593 return c->cgroup_parent;
Dylan Reid9e724af2016-07-21 09:58:07 -0700594}
595
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700596void container_config_share_host_netns(struct container_config* c) {
597 c->share_host_netns = 1;
Keshav Santhanam1b6bf672016-08-10 18:35:12 -0700598}
599
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700600int get_container_config_share_host_netns(struct container_config* c) {
601 return c->share_host_netns;
Keshav Santhanam1b6bf672016-08-10 18:35:12 -0700602}
603
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700604void container_config_keep_fds_open(struct container_config* c) {
605 c->keep_fds_open = 1;
Dylan Reidc4335842016-11-11 10:24:52 -0800606}
607
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700608void container_config_set_capmask(struct container_config* c,
609 uint64_t capmask,
610 int ambient) {
611 c->use_capmask = 1;
612 c->capmask = capmask;
613 c->use_capmask_ambient = ambient;
Luis Hector Chavezff5978f2017-06-27 12:52:58 -0700614}
615
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700616void container_config_set_securebits_skip_mask(struct container_config* c,
617 uint64_t securebits_skip_mask) {
618 c->securebits_skip_mask = securebits_skip_mask;
Luis Hector Chavezcd44ba72017-06-30 13:01:38 -0700619}
620
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700621void container_config_set_run_as_init(struct container_config* c,
622 int run_as_init) {
623 c->do_init = !run_as_init;
Luis Hector Chavezdac65c32017-07-21 10:30:23 -0700624}
625
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700626int container_config_set_selinux_context(struct container_config* c,
627 const char* context) {
628 if (!context)
629 return -EINVAL;
630 c->selinux_context = strdup(context);
631 if (c->selinux_context)
632 return -ENOMEM;
633 return 0;
Luis Hector Chavez15e8e672017-07-20 15:13:27 -0700634}
635
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700636void container_config_set_pre_execve_hook(struct container_config* c,
637 int (*hook)(void*),
638 void* payload) {
639 c->pre_start_hook = hook;
640 c->pre_start_hook_payload = payload;
Luis Hector Chavezf8e8f4c2017-08-01 01:09:39 -0700641}
642
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700643int container_config_inherit_fds(struct container_config* c,
644 int* inherited_fds,
645 size_t inherited_fd_count) {
646 if (c->inherited_fds)
647 return -EINVAL;
648 c->inherited_fds =
649 reinterpret_cast<int*>(calloc(inherited_fd_count, sizeof(int)));
650 if (!c->inherited_fds)
651 return -ENOMEM;
652 memcpy(c->inherited_fds, inherited_fds, inherited_fd_count * sizeof(int));
653 c->inherited_fd_count = inherited_fd_count;
654 return 0;
Luis Hector Chavezf8e8f4c2017-08-01 01:09:39 -0700655}
656
/*
 * Container manipulation
 */

/*
 * State of one container. |rundir| and |name| are heap copies made by
 * container_new(); how the remaining members are populated is outside this
 * part of the file.
 */
struct container {
  struct container_cgroup* cgroup;
  struct minijail* jail;
  pid_t init_pid;
  char* config_root;
  char* runfs;
  char* rundir;
  char* runfsroot;
  char* pid_file_path;
  /* Mounts made outside of the minijail. */
  char** ext_mounts;
  size_t num_ext_mounts;
  char** loopdevs;
  size_t num_loopdevs;
  char** device_mappers;
  size_t num_device_mappers;
  char* name;
};
677
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700678struct container* container_new(const char* name, const char* rundir) {
679 struct container* c;
Dylan Reid837c74a2016-01-22 17:25:21 -0800680
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700681 c = reinterpret_cast<struct container*>(calloc(1, sizeof(*c)));
682 if (!c)
683 return nullptr;
684 c->rundir = strdup(rundir);
685 c->name = strdup(name);
686 if (!c->rundir || !c->name) {
687 container_destroy(c);
688 return nullptr;
689 }
690 return c;
Dylan Reid837c74a2016-01-22 17:25:21 -0800691}
692
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700693void container_destroy(struct container* c) {
694 if (c->cgroup)
695 container_cgroup_destroy(c->cgroup);
696 if (c->jail)
697 minijail_destroy(c->jail);
698 FREE_AND_NULL(c->config_root);
699 FREE_AND_NULL(c->name);
700 FREE_AND_NULL(c->rundir);
701 FREE_AND_NULL(c);
Dylan Reid837c74a2016-01-22 17:25:21 -0800702}
703
/*
 * Parses one non-negative decimal field of a uid/gid map entry.
 *
 * Returns 0 and stores the value in |*out| on success, -EINVAL if |token| is
 * null, does not start with a number, or is negative, and -ERANGE if the
 * value does not fit in an int.
 */
static int parse_userns_map_field(const char* token, int* out) {
  if (!token)
    return -EINVAL;

  char* end = nullptr;
  errno = 0;
  const long value = strtol(token, &end, 10);
  if (end == token)
    return -EINVAL;
  if (errno == ERANGE || value > INT_MAX)
    return -ERANGE;
  if (value < 0)
    return -EINVAL;

  *out = static_cast<int>(value);
  return 0;
}

/*
 * Given a uid/gid map of "inside1 outside1 length1, ...", and an id
 * inside of the user namespace, return the equivalent outside id, or
 * return < 0 on error.
 *
 * An entry covers the ids [inside, inside + length). Entries are examined in
 * order and the first malformed entry aborts the lookup.
 *
 * Returns -EINVAL if |map| is null, an entry is malformed, or no entry covers
 * |id|; -ERANGE if a field or the translated id does not fit in an int; and
 * -ENOMEM if the working copy of |map| cannot be allocated.
 */
static int get_userns_outside_id(const char* map, int id) {
  if (!map)
    return -EINVAL;

  /* strtok_r() modifies its input, so work on a private copy. */
  char* map_copy = strdup(map);
  if (!map_copy)
    return -ENOMEM;

  int result = -EINVAL; /* Reported when no entry covers |id|. */
  char* entry_state = nullptr;
  for (char* entry = strtok_r(map_copy, ",", &entry_state); entry;
       entry = strtok_r(nullptr, ",", &entry_state)) {
    char* field_state = nullptr;
    int inside = 0;
    int outside = 0;
    int length = 0;

    /* A missing field yields a null token, which the parser rejects. */
    int rc = parse_userns_map_field(strtok_r(entry, " ", &field_state),
                                    &inside);
    if (!rc)
      rc = parse_userns_map_field(strtok_r(nullptr, " ", &field_state),
                                  &outside);
    if (!rc)
      rc = parse_userns_map_field(strtok_r(nullptr, " ", &field_state),
                                  &length);
    if (rc) {
      result = rc;
      break;
    }

    /* Written as a subtraction so that inside + length cannot overflow. */
    if (id >= inside && id - inside < length) {
      const int offset = id - inside;
      result = (outside > INT_MAX - offset) ? -ERANGE : outside + offset;
      break;
    }
  }

  free(map_copy);
  return result;
}
745
/*
 * Creates the directory |path|, then explicitly applies |mode| with chmod()
 * and hands the directory to |uid|:|gid|. Stops at the first failing call
 * and returns its negated errno; returns 0 when all three succeed.
 */
static int make_dir(const char* path, int uid, int gid, int mode) {
  /* Short-circuiting keeps errno pointing at the call that failed. */
  const bool created = mkdir(path, mode) == 0 &&
                       chmod(path, mode) == 0 &&
                       chown(path, uid, gid) == 0;
  return created ? 0 : -errno;
}
755
/*
 * Opens |path| read-write, creating it with |mode| when it does not exist,
 * and assigns the file to |uid|:|gid|.
 * Returns 0 on success or the negated errno of the failing open()/fchown().
 */
static int touch_file(const char* path, int uid, int gid, int mode) {
  int fd = open(path, O_RDWR | O_CREAT, mode);
  if (fd < 0)
    return -errno;

  /* Latch the fchown() error now: close() below may overwrite errno. */
  int rc = 0;
  if (fchown(fd, uid, gid))
    rc = -errno;
  close(fd);

  return rc;
}
768
769/* Make sure the mount target exists in the new rootfs. Create if needed and
770 * possible.
771 */
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700772static int setup_mount_destination(const struct container_config* config,
773 const struct container_mount* mnt,
774 const char* source,
775 const char* dest) {
776 int uid_userns, gid_userns;
777 int rc;
778 struct stat st_buf;
Dylan Reid837c74a2016-01-22 17:25:21 -0800779
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700780 rc = stat(dest, &st_buf);
781 if (rc == 0) /* destination exists */
782 return 0;
Dylan Reid837c74a2016-01-22 17:25:21 -0800783
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700784 /* Try to create the destination. Either make directory or touch a file
785 * depending on the source type.
786 */
787 uid_userns = get_userns_outside_id(config->uid_map, mnt->uid);
788 if (uid_userns < 0)
789 return uid_userns;
790 gid_userns = get_userns_outside_id(config->gid_map, mnt->gid);
791 if (gid_userns < 0)
792 return gid_userns;
Stephen Barber1a398c72017-01-23 12:39:44 -0800793
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700794 rc = stat(source, &st_buf);
795 if (rc || S_ISDIR(st_buf.st_mode) || S_ISBLK(st_buf.st_mode))
796 return make_dir(dest, uid_userns, gid_userns, mnt->mode);
Dylan Reid837c74a2016-01-22 17:25:21 -0800797
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700798 return touch_file(dest, uid_userns, gid_userns, mnt->mode);
Dylan Reid837c74a2016-01-22 17:25:21 -0800799}
800
Dylan Reid2bd9ea92016-04-07 20:57:47 -0700801/* Fork and exec the setfiles command to configure the selinux policy. */
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700802static int run_setfiles_command(const struct container* c,
803 const struct container_config* config,
804 char* const* destinations,
805 size_t num_destinations) {
806 int rc;
807 int status;
808 int pid;
809 char* context_path;
Dylan Reid2bd9ea92016-04-07 20:57:47 -0700810
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700811 if (!config->run_setfiles)
812 return 0;
Dylan Reid2bd9ea92016-04-07 20:57:47 -0700813
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700814 if (asprintf(&context_path, "%s/file_contexts", c->runfsroot) < 0)
815 return -errno;
Dylan Reid2bd9ea92016-04-07 20:57:47 -0700816
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700817 pid = fork();
818 if (pid == 0) {
819 size_t i;
820 size_t arg_index = 0;
821 const char* argv[MAX_NUM_SETFILES_ARGS];
822 const char* env[] = {
823 nullptr,
824 };
Dylan Reid2bd9ea92016-04-07 20:57:47 -0700825
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700826 argv[arg_index++] = config->run_setfiles;
827 argv[arg_index++] = "-r";
828 argv[arg_index++] = c->runfsroot;
829 argv[arg_index++] = context_path;
830 if (arg_index + num_destinations >= MAX_NUM_SETFILES_ARGS)
831 _exit(-E2BIG);
832 for (i = 0; i < num_destinations; ++i) {
833 argv[arg_index++] = destinations[i];
834 }
835 argv[arg_index] = nullptr;
Yusuke Sato91f11f02016-12-02 16:15:13 -0800836
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700837 execve(argv[0], (char* const*)argv, (char* const*)env);
Dylan Reid2bd9ea92016-04-07 20:57:47 -0700838
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700839 /* Command failed to exec if execve returns. */
840 _exit(-errno);
841 }
842 free(context_path);
843 if (pid < 0)
844 return -errno;
845 do {
846 rc = waitpid(pid, &status, 0);
847 } while (rc == -1 && errno == EINTR);
848 if (rc < 0)
849 return -errno;
850 return status;
Dylan Reid2bd9ea92016-04-07 20:57:47 -0700851}
852
Mike Frysinger412dbd22017-01-06 01:50:34 -0500853/* Find a free loop device and attach it. */
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700854static int loopdev_setup(char** loopdev_ret, const char* source) {
855 int ret = 0;
856 int source_fd = -1;
857 int control_fd = -1;
858 int loop_fd = -1;
859 char* loopdev = nullptr;
Mike Frysinger412dbd22017-01-06 01:50:34 -0500860
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700861 source_fd = open(source, O_RDONLY | O_CLOEXEC);
862 if (source_fd < 0)
863 goto error;
Mike Frysinger412dbd22017-01-06 01:50:34 -0500864
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700865 control_fd = open(loopdev_ctl, O_RDWR | O_NOFOLLOW | O_CLOEXEC);
866 if (control_fd < 0)
867 goto error;
Mike Frysinger412dbd22017-01-06 01:50:34 -0500868
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700869 while (1) {
870 int num = ioctl(control_fd, LOOP_CTL_GET_FREE);
871 if (num < 0)
872 goto error;
Mike Frysinger412dbd22017-01-06 01:50:34 -0500873
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700874 if (asprintf(&loopdev, "/dev/loop%i", num) < 0)
875 goto error;
Mike Frysinger412dbd22017-01-06 01:50:34 -0500876
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700877 loop_fd = open(loopdev, O_RDONLY | O_NOFOLLOW | O_CLOEXEC);
878 if (loop_fd < 0)
879 goto error;
Mike Frysinger412dbd22017-01-06 01:50:34 -0500880
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700881 if (ioctl(loop_fd, LOOP_SET_FD, source_fd) == 0)
882 break;
Mike Frysinger412dbd22017-01-06 01:50:34 -0500883
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700884 if (errno != EBUSY)
885 goto error;
Mike Frysinger412dbd22017-01-06 01:50:34 -0500886
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700887 /* Clean up resources for the next pass. */
888 free(loopdev);
889 close(loop_fd);
890 }
Mike Frysinger412dbd22017-01-06 01:50:34 -0500891
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700892 *loopdev_ret = loopdev;
893 goto exit;
Mike Frysinger412dbd22017-01-06 01:50:34 -0500894
895error:
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700896 ret = -errno;
897 free(loopdev);
Mike Frysinger412dbd22017-01-06 01:50:34 -0500898exit:
Luis Hector Chavez31735bc2017-09-15 08:17:10 -0700899 if (source_fd != -1)
900 close(source_fd);
901 if (control_fd != -1)
902 close(control_fd);
903 if (loop_fd != -1)
904 close(loop_fd);
905 return ret;
Mike Frysinger412dbd22017-01-06 01:50:34 -0500906}
907
/*
 * Detach the specified loop device by issuing LOOP_CLR_FD on it.
 * Returns 0 on success or the negated errno of the failing open()/ioctl().
 */
static int loopdev_detach(const char* loopdev) {
  const int loop_fd = open(loopdev, O_RDONLY | O_NOFOLLOW | O_CLOEXEC);
  if (loop_fd < 0)
    return -errno;

  /* Record the ioctl result before close() so its errno is the one kept. */
  int status = 0;
  if (ioctl(loop_fd, LOOP_CLR_FD) < 0)
    status = -errno;

  close(loop_fd);
  return status;
}
928
/*
 * Create a new device mapper target for the source.
 *
 * The target is named "cros-containers-<source>" with every '/' replaced by
 * '_'. |verity_cmdline| supplies the table line: each "@DEV@" in it is
 * replaced by |source|, the first three fields (start, size, target type)
 * are parsed out and the remainder is passed on as the target parameters.
 *
 * On success returns 0 and hands ownership of two malloc()ed strings to the
 * caller: the device node path in |*dm_path_ret| and the target name in
 * |*dm_name_ret|. On failure returns < 0 and leaves both outputs untouched.
 * When built without USE_device_mapper this is a no-op returning 0.
 */
static int dm_setup(char** dm_path_ret,
                    char** dm_name_ret,
                    const char* source,
                    const char* verity_cmdline) {
  int ret = 0;
#if USE_device_mapper
  static const char dev_token[] = "@DEV@";
  const size_t token_len = sizeof(dev_token) - 1;
  char* p;
  char* out;
  const char* scan;
  const char* hit;
  char* dm_path = nullptr;
  char* dm_name = nullptr;
  char* verity = nullptr;
  struct dm_task* dmt = nullptr;
  uint32_t cookie = 0;
  size_t source_len = 0;
  size_t num_tokens = 0;
  char ttype[20];
  unsigned long long start, size;
  int n;

  /* Normalize the name into something unique-esque. */
  if (asprintf(&dm_name, "cros-containers-%s", source) < 0) {
    /* The output pointer is undefined after a failed asprintf(). */
    dm_name = nullptr;
    errno = ENOMEM;
    goto error;
  }
  p = dm_name;
  while ((p = strchr(p, '/')) != nullptr)
    *p++ = '_';

  /* Get the /dev path for the higher levels to mount. */
  if (asprintf(&dm_path, "%s%s", dm_dev_prefix, dm_name) < 0) {
    dm_path = nullptr;
    errno = ENOMEM;
    goto error;
  }

  /*
   * Insert the source path in the verity command line. Count the tokens
   * first so the buffer is large enough for any number of them.
   */
  source_len = strlen(source);
  for (scan = verity_cmdline; (hit = strstr(scan, dev_token)) != nullptr;
       scan = hit + token_len) {
    ++num_tokens;
  }
  verity = reinterpret_cast<char*>(
      malloc(strlen(verity_cmdline) + num_tokens * source_len + 1));
  if (verity == nullptr) {
    errno = ENOMEM;
    goto error;
  }

  /*
   * Build the result in one left-to-right pass over the input. The output
   * is never rescanned, so a |source| that itself contains the token cannot
   * make the substitution loop forever.
   */
  out = verity;
  for (scan = verity_cmdline; (hit = strstr(scan, dev_token)) != nullptr;
       scan = hit + token_len) {
    memcpy(out, scan, (size_t)(hit - scan));
    out += hit - scan;
    memcpy(out, source, source_len);
    out += source_len;
  }
  /* Copy whatever follows the last token, terminator included. */
  memcpy(out, scan, strlen(scan) + 1);

  /* Extract the first three parameters for dm-verity settings. */
  if (sscanf(verity, "%llu %llu %10s %n", &start, &size, ttype, &n) != 3) {
    /* A malformed command line does not set errno by itself. */
    errno = EINVAL;
    goto error;
  }

  /* Finally create the device mapper. */
  dmt = dm_task_create(DM_DEVICE_CREATE);
  if (dmt == nullptr)
    goto error;

  if (!dm_task_set_name(dmt, dm_name))
    goto error;

  if (!dm_task_set_ro(dmt))
    goto error;

  if (!dm_task_add_target(dmt, start, size, ttype, verity + n))
    goto error;

  if (!dm_task_set_cookie(dmt, &cookie, 0))
    goto error;

  if (!dm_task_run(dmt))
    goto error;

  /* Make sure the node exists before we continue. */
  dm_udev_wait(cookie);

  *dm_path_ret = dm_path;
  *dm_name_ret = dm_name;
  goto exit;

error:
  /*
   * errno may still be 0 here if a libdevmapper call failed without setting
   * it; never report success from the error path.
   */
  ret = errno ? -errno : -EIO;
  free(dm_name);
  free(dm_path);
exit:
  free(verity);
  if (dmt)
    dm_task_destroy(dmt);
#endif
  return ret;
}
1010
/*
 * Tear down the device mapper target.
 *
 * Runs a DM_DEVICE_REMOVE task for the target called |dm_name|.
 * Returns 0 on success and < 0 on failure. When built without
 * USE_device_mapper this is a no-op returning 0.
 */
static int dm_detach(const char* dm_name) {
  int ret = 0;
#if USE_device_mapper
  struct dm_task* dmt = dm_task_create(DM_DEVICE_REMOVE);
  /*
   * Bail out before dm_task_destroy(): there is no task to destroy, and
   * dm_setup() likewise only destroys a non-null task.
   */
  if (dmt == nullptr)
    return errno ? -errno : -EIO;

  /* errno may be 0 if libdevmapper failed without setting it. */
  if (!dm_task_set_name(dmt, dm_name) || !dm_task_run(dmt))
    ret = errno ? -errno : -EIO;

  dm_task_destroy(dmt);
#endif
  return ret;
}
1036
Dylan Reide040c6b2016-05-02 18:49:02 -07001037/*
1038 * Unmounts anything we mounted in this mount namespace in the opposite order
1039 * that they were mounted.
1040 */
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001041static int unmount_external_mounts(struct container* c) {
1042 int ret = 0;
Dylan Reide040c6b2016-05-02 18:49:02 -07001043
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001044 while (c->num_ext_mounts) {
1045 c->num_ext_mounts--;
1046 if (!c->ext_mounts[c->num_ext_mounts])
1047 continue;
1048 if (umount(c->ext_mounts[c->num_ext_mounts]))
1049 ret = -errno;
1050 FREE_AND_NULL(c->ext_mounts[c->num_ext_mounts]);
1051 }
1052 FREE_AND_NULL(c->ext_mounts);
Mike Frysinger412dbd22017-01-06 01:50:34 -05001053
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001054 while (c->num_loopdevs) {
1055 c->num_loopdevs--;
1056 if (loopdev_detach(c->loopdevs[c->num_loopdevs]))
1057 ret = -errno;
1058 FREE_AND_NULL(c->loopdevs[c->num_loopdevs]);
1059 }
1060 FREE_AND_NULL(c->loopdevs);
Mike Frysinger412dbd22017-01-06 01:50:34 -05001061
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001062 while (c->num_device_mappers) {
1063 c->num_device_mappers--;
1064 if (dm_detach(c->device_mappers[c->num_device_mappers]))
1065 ret = -errno;
1066 FREE_AND_NULL(c->device_mappers[c->num_device_mappers]);
1067 }
1068 FREE_AND_NULL(c->device_mappers);
Mike Frysinger05e594e2017-01-10 02:11:08 -05001069
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001070 return ret;
Dylan Reide040c6b2016-05-02 18:49:02 -07001071}
1072
/*
 * Match mount_one in minijail, mount one mountpoint with
 * consideration for combination of MS_BIND/MS_RDONLY flag.
 *
 * Returns 0 on success and -1 (errno left as set by mount()) on failure.
 */
static int mount_external(const char* src,
                          const char* dest,
                          const char* type,
                          unsigned long flags,
                          const void* data) {
  /*
   * 'bind' and 'ro' can't both be given on the initial bind mount, so a
   * read-only bind mount is done in two steps: bind without MS_RDONLY,
   * then remount with it.
   */
  const bool ro_bind = (flags & MS_BIND) && (flags & MS_RDONLY);
  const unsigned long initial_flags = ro_bind ? (flags & ~MS_RDONLY) : flags;

  if (mount(src, dest, type, initial_flags, data) == -1)
    return -1;

  if (!ro_bind)
    return 0;

  /* |flags| still carries MS_RDONLY here; add MS_REMOUNT for step two. */
  if (mount(src, dest, nullptr, flags | MS_REMOUNT, data) == -1)
    return -1;

  return 0;
}
1105
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001106static int do_container_mount(struct container* c,
1107 const struct container_config* config,
1108 const struct container_mount* mnt) {
1109 char* dm_source = nullptr;
1110 char* loop_source = nullptr;
1111 char* source = nullptr;
1112 char* dest = nullptr;
1113 int rc = 0;
Luis Hector Chavez3341ed62016-06-06 08:04:04 -07001114
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001115 if (asprintf(&dest, "%s%s", c->runfsroot, mnt->destination) < 0)
1116 return -errno;
Luis Hector Chavez3341ed62016-06-06 08:04:04 -07001117
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001118 /*
1119 * If it's a bind mount relative to rootfs, append source to
1120 * rootfs path, otherwise source path is absolute.
1121 */
1122 if ((mnt->flags & MS_BIND) && mnt->source[0] != '/') {
1123 if (asprintf(&source, "%s/%s", c->runfsroot, mnt->source) < 0)
1124 goto error_free_return;
1125 } else if (mnt->loopback && mnt->source[0] != '/' && c->config_root) {
1126 if (asprintf(&source, "%s/%s", c->config_root, mnt->source) < 0)
1127 goto error_free_return;
1128 } else {
1129 if (asprintf(&source, "%s", mnt->source) < 0)
1130 goto error_free_return;
1131 }
Luis Hector Chavez3341ed62016-06-06 08:04:04 -07001132
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001133 // Only create the destinations for external mounts, minijail will take
1134 // care of those mounted in the new namespace.
1135 if (mnt->create && !mnt->mount_in_ns) {
1136 rc = setup_mount_destination(config, mnt, source, dest);
1137 if (rc)
1138 goto error_free_return;
1139 }
1140 if (mnt->loopback) {
1141 /* Record this loopback file for cleanup later. */
1142 loop_source = source;
1143 source = nullptr;
1144 rc = loopdev_setup(&source, loop_source);
1145 if (rc)
1146 goto error_free_return;
Mike Frysinger412dbd22017-01-06 01:50:34 -05001147
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001148 /* Save this to cleanup when shutting down. */
1149 rc = strdup_and_free(&c->loopdevs[c->num_loopdevs], source);
1150 if (rc)
1151 goto error_free_return;
1152 c->num_loopdevs++;
1153 }
1154 if (mnt->verity) {
1155 /* Set this device up via dm-verity. */
1156 char* dm_name = nullptr;
1157 dm_source = source;
1158 source = nullptr;
1159 rc = dm_setup(&source, &dm_name, dm_source, mnt->verity);
1160 if (rc)
1161 goto error_free_return;
Mike Frysinger05e594e2017-01-10 02:11:08 -05001162
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001163 /* Save this to cleanup when shutting down. */
1164 rc = strdup_and_free(&c->device_mappers[c->num_device_mappers], dm_name);
1165 free(dm_name);
1166 if (rc)
1167 goto error_free_return;
1168 c->num_device_mappers++;
1169 }
1170 if (mnt->mount_in_ns) {
1171 /* We can mount this with minijail. */
1172 rc = minijail_mount_with_data(
1173 c->jail, source, mnt->destination, mnt->type, mnt->flags, mnt->data);
1174 if (rc)
1175 goto error_free_return;
1176 } else {
1177 /* Mount this externally and unmount it on exit. */
1178 if (mount_external(source, dest, mnt->type, mnt->flags, mnt->data))
1179 goto error_free_return;
1180 /* Save this to unmount when shutting down. */
1181 rc = strdup_and_free(&c->ext_mounts[c->num_ext_mounts], dest);
1182 if (rc)
1183 goto error_free_return;
1184 c->num_ext_mounts++;
1185 }
Luis Hector Chavez3341ed62016-06-06 08:04:04 -07001186
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001187 goto exit;
Luis Hector Chavez3341ed62016-06-06 08:04:04 -07001188
1189error_free_return:
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001190 if (!rc)
1191 rc = -errno;
Luis Hector Chavez3341ed62016-06-06 08:04:04 -07001192exit:
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001193 free(dm_source);
1194 free(loop_source);
1195 free(source);
1196 free(dest);
1197 return rc;
Luis Hector Chavez3341ed62016-06-06 08:04:04 -07001198}
1199
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001200static int do_container_mounts(struct container* c,
1201 const struct container_config* config) {
1202 unsigned int i;
1203 int rc = 0;
Dylan Reid7daf9982016-04-28 16:55:42 -07001204
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001205 unmount_external_mounts(c);
1206 /*
1207 * Allocate space to track anything we mount in our mount namespace.
1208 * This over-allocates as it has space for all mounts.
1209 */
1210 c->ext_mounts = reinterpret_cast<char**>(
1211 calloc(config->num_mounts, sizeof(*c->ext_mounts)));
1212 if (!c->ext_mounts)
1213 return -errno;
1214 c->loopdevs = reinterpret_cast<char**>(
1215 calloc(config->num_mounts, sizeof(*c->loopdevs)));
1216 if (!c->loopdevs)
1217 return -errno;
1218 c->device_mappers = reinterpret_cast<char**>(
1219 calloc(config->num_mounts, sizeof(*c->device_mappers)));
1220 if (!c->device_mappers)
1221 return -errno;
Dylan Reide040c6b2016-05-02 18:49:02 -07001222
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001223 for (i = 0; i < config->num_mounts; ++i) {
1224 rc = do_container_mount(c, config, &config->mounts[i]);
1225 if (rc)
1226 goto error_free_return;
1227 }
Luis Hector Chavez479b95f2016-06-06 08:01:05 -07001228
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001229 return 0;
Dylan Reid2149be92016-04-28 18:38:57 -07001230
1231error_free_return:
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001232 unmount_external_mounts(c);
1233 return rc;
Dylan Reid7daf9982016-04-28 16:55:42 -07001234}
1235
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001236static int container_create_device(const struct container* c,
1237 const struct container_config* config,
1238 const struct container_device* dev,
1239 int minor) {
1240 char* path = nullptr;
1241 int rc = 0;
1242 int mode;
1243 int uid_userns, gid_userns;
Luis Hector Chavez479b95f2016-06-06 08:01:05 -07001244
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001245 switch (dev->type) {
1246 case 'b':
1247 mode = S_IFBLK;
1248 break;
1249 case 'c':
1250 mode = S_IFCHR;
1251 break;
1252 default:
1253 return -EINVAL;
1254 }
1255 mode |= dev->fs_permissions;
Luis Hector Chavez479b95f2016-06-06 08:01:05 -07001256
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001257 uid_userns = get_userns_outside_id(config->uid_map, dev->uid);
1258 if (uid_userns < 0)
1259 return uid_userns;
1260 gid_userns = get_userns_outside_id(config->gid_map, dev->gid);
1261 if (gid_userns < 0)
1262 return gid_userns;
Stephen Barber1a398c72017-01-23 12:39:44 -08001263
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001264 if (asprintf(&path, "%s%s", c->runfsroot, dev->path) < 0)
1265 goto error_free_return;
1266 if (mknod(path, mode, makedev(dev->major, minor)) && errno != EEXIST)
1267 goto error_free_return;
1268 if (chown(path, uid_userns, gid_userns))
1269 goto error_free_return;
1270 if (chmod(path, dev->fs_permissions))
1271 goto error_free_return;
Luis Hector Chavez479b95f2016-06-06 08:01:05 -07001272
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001273 goto exit;
Luis Hector Chavez479b95f2016-06-06 08:01:05 -07001274
1275error_free_return:
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001276 rc = -errno;
Luis Hector Chavez479b95f2016-06-06 08:01:05 -07001277exit:
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001278 free(path);
1279 return rc;
Luis Hector Chavez479b95f2016-06-06 08:01:05 -07001280}
1281
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001282static int mount_runfs(struct container* c,
1283 const struct container_config* config) {
1284 static const mode_t root_dir_mode = 0660;
1285 const char* rootfs = config->rootfs;
1286 char* runfs_template = nullptr;
1287 int uid_userns, gid_userns;
Dylan Reid837c74a2016-01-22 17:25:21 -08001288
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001289 if (asprintf(&runfs_template, "%s/%s_XXXXXX", c->rundir, c->name) < 0)
1290 return -ENOMEM;
Keshav Santhanam0e4c3282016-07-14 10:25:16 -07001291
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001292 c->runfs = mkdtemp(runfs_template);
1293 if (!c->runfs) {
1294 free(runfs_template);
1295 return -errno;
1296 }
Keshav Santhanam0e4c3282016-07-14 10:25:16 -07001297
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001298 uid_userns = get_userns_outside_id(config->uid_map, config->uid);
1299 if (uid_userns < 0)
1300 return uid_userns;
1301 gid_userns = get_userns_outside_id(config->gid_map, config->gid);
1302 if (gid_userns < 0)
1303 return gid_userns;
Stephen Barber1a398c72017-01-23 12:39:44 -08001304
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001305 /* Make sure the container uid can access the rootfs. */
1306 if (chmod(c->runfs, 0700))
1307 return -errno;
1308 if (chown(c->runfs, uid_userns, gid_userns))
1309 return -errno;
Keshav Santhanam0e4c3282016-07-14 10:25:16 -07001310
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001311 if (asprintf(&c->runfsroot, "%s/root", c->runfs) < 0)
1312 return -errno;
Keshav Santhanam0e4c3282016-07-14 10:25:16 -07001313
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001314 if (mkdir(c->runfsroot, root_dir_mode))
1315 return -errno;
1316 if (chmod(c->runfsroot, root_dir_mode))
1317 return -errno;
Keshav Santhanam0e4c3282016-07-14 10:25:16 -07001318
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001319 if (mount(rootfs,
1320 c->runfsroot,
1321 "",
1322 MS_BIND | (config->rootfs_mount_flags & MS_REC),
1323 nullptr)) {
1324 return -errno;
1325 }
Keshav Santhanam0e4c3282016-07-14 10:25:16 -07001326
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001327 /* MS_BIND ignores any flags passed to it (except MS_REC). We need a
1328 * second call to mount() to actually set them.
1329 */
1330 if (config->rootfs_mount_flags &&
1331 mount(rootfs,
1332 c->runfsroot,
1333 "",
1334 (config->rootfs_mount_flags & ~MS_REC),
1335 nullptr)) {
1336 return -errno;
1337 }
Luis Hector Chavezc240e7e2016-09-22 10:33:03 -07001338
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001339 return 0;
Keshav Santhanam0e4c3282016-07-14 10:25:16 -07001340}
1341
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001342static int device_setup(struct container* c,
1343 const struct container_config* config) {
1344 int rc;
1345 size_t i;
Dylan Reidacedff92017-03-31 17:41:40 -07001346
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001347 c->cgroup->ops->deny_all_devices(c->cgroup);
Dylan Reidacedff92017-03-31 17:41:40 -07001348
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001349 for (i = 0; i < config->num_cgroup_devices; i++) {
1350 const struct container_cgroup_device* dev = &config->cgroup_devices[i];
1351 rc = c->cgroup->ops->add_device(c->cgroup,
1352 dev->allow,
1353 dev->major,
1354 dev->minor,
1355 dev->read,
1356 dev->write,
1357 dev->modify,
1358 dev->type);
1359 if (rc)
1360 return rc;
1361 }
Dylan Reid4843d6b2017-03-31 18:14:30 -07001362
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001363 for (i = 0; i < config->num_devices; i++) {
1364 const struct container_device* dev = &config->devices[i];
1365 int minor = dev->minor;
Dylan Reidacedff92017-03-31 17:41:40 -07001366
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001367 if (dev->copy_minor) {
1368 struct stat st_buff;
1369 if (stat(dev->path, &st_buff) < 0)
1370 continue;
1371 minor = minor(st_buff.st_rdev);
1372 }
1373 if (minor >= 0) {
1374 rc = container_create_device(c, config, dev, minor);
1375 if (rc)
1376 return rc;
1377 }
1378 }
Dylan Reidacedff92017-03-31 17:41:40 -07001379
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001380 for (i = 0; i < c->num_loopdevs; ++i) {
1381 struct stat st;
Dylan Reidacedff92017-03-31 17:41:40 -07001382
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001383 rc = stat(c->loopdevs[i], &st);
1384 if (rc < 0)
1385 return -errno;
1386 rc = c->cgroup->ops->add_device(
1387 c->cgroup, 1, major(st.st_rdev), minor(st.st_rdev), 1, 0, 0, 'b');
1388 if (rc)
1389 return rc;
1390 }
Dylan Reidacedff92017-03-31 17:41:40 -07001391
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001392 return 0;
Dylan Reidacedff92017-03-31 17:41:40 -07001393}
1394
/*
 * Writes the string passed in |payload| to the calling thread's
 * /proc/self/task/<tid>/attr/exec file (presumably the SELinux exec
 * context, going by the names used here - confirm against the caller).
 *
 * |payload| must point to a NUL-terminated string.
 * Returns 0 when the whole string was written, -EIO on a short write, and
 * otherwise the negated errno of the failing call.
 */
static int setexeccon(void* payload) {
  char* init_domain = reinterpret_cast<char*>(payload);

  const pid_t tid = syscall(SYS_gettid);
  if (tid == -1)
    return -errno;

  char* exec_path = nullptr;
  /* asprintf() only fails on allocation problems; errno is not specified. */
  if (asprintf(&exec_path, "/proc/self/task/%d/attr/exec", tid) < 0)
    return -ENOMEM;

  const int fd = open(exec_path, O_WRONLY | O_CLOEXEC);
  /* Capture the open() error before free() gets a chance to disturb it. */
  const int open_errno = errno;
  free(exec_path);
  if (fd == -1)
    return -open_errno;

  const size_t domain_len = strlen(init_domain);
  const ssize_t written = write(fd, init_domain, domain_len);

  int rc = 0;
  if (written < 0) {
    rc = -errno;
  } else if ((size_t)written != domain_len) {
    /* A short write does not set errno; do not report it as success. */
    rc = -EIO;
  }

  close(fd);
  return rc;
}
1424
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001425int container_start(struct container* c,
1426 const struct container_config* config) {
1427 int rc = 0;
1428 unsigned int i;
1429 int cgroup_uid, cgroup_gid;
1430 char** destinations;
1431 size_t num_destinations;
Keshav Santhanam0e4c3282016-07-14 10:25:16 -07001432
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001433 if (!c)
1434 return -EINVAL;
1435 if (!config)
1436 return -EINVAL;
1437 if (!config->program_argv || !config->program_argv[0])
1438 return -EINVAL;
Dylan Reide040c6b2016-05-02 18:49:02 -07001439
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001440 if (config->config_root) {
1441 c->config_root = strdup(config->config_root);
1442 if (!c->config_root) {
1443 rc = -ENOMEM;
1444 goto error_rmdir;
1445 }
1446 }
1447 if (config->premounted_runfs) {
1448 c->runfs = nullptr;
1449 c->runfsroot = strdup(config->premounted_runfs);
1450 if (!c->runfsroot) {
1451 rc = -ENOMEM;
1452 goto error_rmdir;
1453 }
1454 } else {
1455 rc = mount_runfs(c, config);
1456 if (rc)
1457 goto error_rmdir;
1458 }
Dylan Reid837c74a2016-01-22 17:25:21 -08001459
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001460 c->jail = minijail_new();
1461 if (!c->jail)
1462 goto error_rmdir;
Dylan Reid837c74a2016-01-22 17:25:21 -08001463
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001464 rc = do_container_mounts(c, config);
1465 if (rc)
1466 goto error_rmdir;
Dylan Reid837c74a2016-01-22 17:25:21 -08001467
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001468 cgroup_uid = get_userns_outside_id(config->uid_map, config->cgroup_owner);
1469 if (cgroup_uid < 0) {
1470 rc = cgroup_uid;
1471 goto error_rmdir;
1472 }
1473 cgroup_gid = get_userns_outside_id(config->gid_map, config->cgroup_group);
1474 if (cgroup_gid < 0) {
1475 rc = cgroup_gid;
1476 goto error_rmdir;
1477 }
Stephen Barber1a398c72017-01-23 12:39:44 -08001478
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001479 c->cgroup = container_cgroup_new(
1480 c->name, "/sys/fs/cgroup", config->cgroup_parent, cgroup_uid, cgroup_gid);
1481 if (!c->cgroup)
1482 goto error_rmdir;
Dylan Reida9966422016-07-21 10:11:34 -07001483
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001484 /* Must be root to modify device cgroup or mknod */
1485 if (getuid() == 0) {
1486 if (device_setup(c, config))
1487 goto error_rmdir;
1488 }
Dylan Reid837c74a2016-01-22 17:25:21 -08001489
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001490 /* Potentailly run setfiles on mounts configured outside of the jail */
1491 destinations =
1492 reinterpret_cast<char**>(calloc(config->num_mounts, sizeof(char*)));
1493 num_destinations = 0;
1494 for (i = 0; i < config->num_mounts; i++) {
1495 const struct container_mount* mnt = &config->mounts[i];
1496 char* dest = mnt->destination;
Dylan Reidd7229582016-04-27 17:08:40 -07001497
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001498 if (mnt->mount_in_ns)
1499 continue;
1500 if (mnt->flags & MS_RDONLY)
1501 continue;
Yusuke Sato91f11f02016-12-02 16:15:13 -08001502
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001503 /* A hack to avoid setfiles on /data and /cache. */
1504 if (!strcmp(dest, "/data") || !strcmp(dest, "/cache"))
1505 continue;
Yusuke Sato91f11f02016-12-02 16:15:13 -08001506
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001507 if (asprintf(&dest, "%s%s", c->runfsroot, mnt->destination) < 0) {
1508 size_t j;
1509 for (j = 0; j < num_destinations; ++j) {
1510 free(destinations[j]);
1511 }
1512 free(destinations);
1513 goto error_rmdir;
1514 }
Yusuke Sato91f11f02016-12-02 16:15:13 -08001515
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001516 destinations[num_destinations++] = dest;
1517 }
1518 if (num_destinations) {
1519 size_t i;
1520 rc = run_setfiles_command(c, config, destinations, num_destinations);
1521 for (i = 0; i < num_destinations; ++i) {
1522 free(destinations[i]);
1523 }
1524 }
1525 free(destinations);
1526 if (rc)
1527 goto error_rmdir;
Dylan Reidd7229582016-04-27 17:08:40 -07001528
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001529 /* Setup CPU cgroup params. */
1530 if (config->cpu_cgparams.shares) {
1531 rc = c->cgroup->ops->set_cpu_shares(c->cgroup, config->cpu_cgparams.shares);
1532 if (rc)
1533 goto error_rmdir;
1534 }
1535 if (config->cpu_cgparams.period) {
1536 rc = c->cgroup->ops->set_cpu_quota(c->cgroup, config->cpu_cgparams.quota);
1537 if (rc)
1538 goto error_rmdir;
1539 rc = c->cgroup->ops->set_cpu_period(c->cgroup, config->cpu_cgparams.period);
1540 if (rc)
1541 goto error_rmdir;
1542 }
1543 if (config->cpu_cgparams.rt_period) {
1544 rc = c->cgroup->ops->set_cpu_rt_runtime(c->cgroup,
1545 config->cpu_cgparams.rt_runtime);
1546 if (rc)
1547 goto error_rmdir;
1548 rc = c->cgroup->ops->set_cpu_rt_period(c->cgroup,
1549 config->cpu_cgparams.rt_period);
1550 if (rc)
1551 goto error_rmdir;
1552 }
Chinyue Chenfac909e2016-06-24 14:17:42 +08001553
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001554 /* Setup and start the container with libminijail. */
1555 if (config->pid_file_path) {
1556 c->pid_file_path = strdup(config->pid_file_path);
1557 if (!c->pid_file_path) {
1558 rc = -ENOMEM;
1559 goto error_rmdir;
1560 }
1561 } else if (c->runfs) {
1562 if (asprintf(&c->pid_file_path, "%s/container.pid", c->runfs) < 0) {
1563 rc = -ENOMEM;
1564 goto error_rmdir;
1565 }
1566 }
Keshav Santhanam0e4c3282016-07-14 10:25:16 -07001567
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001568 if (c->pid_file_path)
1569 minijail_write_pid_file(c->jail, c->pid_file_path);
1570 minijail_reset_signal_mask(c->jail);
Dylan Reid837c74a2016-01-22 17:25:21 -08001571
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001572 /* Setup container namespaces. */
1573 minijail_namespace_ipc(c->jail);
1574 minijail_namespace_vfs(c->jail);
1575 if (!config->share_host_netns)
1576 minijail_namespace_net(c->jail);
1577 minijail_namespace_pids(c->jail);
1578 minijail_namespace_user(c->jail);
1579 if (getuid() != 0)
1580 minijail_namespace_user_disable_setgroups(c->jail);
1581 minijail_namespace_cgroups(c->jail);
1582 rc = minijail_uidmap(c->jail, config->uid_map);
1583 if (rc)
1584 goto error_rmdir;
1585 rc = minijail_gidmap(c->jail, config->gid_map);
1586 if (rc)
1587 goto error_rmdir;
Dylan Reid837c74a2016-01-22 17:25:21 -08001588
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001589 /* Set the UID/GID inside the container if not 0. */
1590 if (get_userns_outside_id(config->uid_map, config->uid) < 0)
1591 goto error_rmdir;
1592 else if (config->uid > 0)
1593 minijail_change_uid(c->jail, config->uid);
1594 if (get_userns_outside_id(config->gid_map, config->gid) < 0)
1595 goto error_rmdir;
1596 else if (config->gid > 0)
1597 minijail_change_gid(c->jail, config->gid);
Keshav Santhanam36485ff2016-08-02 16:21:02 -07001598
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001599 rc = minijail_enter_pivot_root(c->jail, c->runfsroot);
1600 if (rc)
1601 goto error_rmdir;
Dylan Reid837c74a2016-01-22 17:25:21 -08001602
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001603 /* Add the cgroups configured above. */
1604 for (i = 0; i < NUM_CGROUP_TYPES; i++) {
1605 if (c->cgroup->cgroup_tasks_paths[i]) {
1606 rc = minijail_add_to_cgroup(c->jail, c->cgroup->cgroup_tasks_paths[i]);
1607 if (rc)
1608 goto error_rmdir;
1609 }
1610 }
Dylan Reid837c74a2016-01-22 17:25:21 -08001611
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001612 if (config->alt_syscall_table)
1613 minijail_use_alt_syscall(c->jail, config->alt_syscall_table);
Dylan Reid837c74a2016-01-22 17:25:21 -08001614
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001615 for (i = 0; i < static_cast<unsigned int>(config->num_rlimits); i++) {
1616 const struct container_rlimit* lim = &config->rlimits[i];
1617 rc = minijail_rlimit(c->jail, lim->type, lim->cur, lim->max);
1618 if (rc)
1619 goto error_rmdir;
1620 }
Dylan Reid93fa4602017-06-06 13:39:31 -07001621
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001622 if (config->selinux_context) {
1623 rc = minijail_add_hook(c->jail,
1624 &setexeccon,
1625 config->selinux_context,
1626 MINIJAIL_HOOK_EVENT_PRE_EXECVE);
1627 if (rc)
1628 goto error_rmdir;
1629 }
Dylan Reid837c74a2016-01-22 17:25:21 -08001630
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001631 if (config->pre_start_hook) {
1632 rc = minijail_add_hook(c->jail,
1633 config->pre_start_hook,
1634 config->pre_start_hook_payload,
1635 MINIJAIL_HOOK_EVENT_PRE_EXECVE);
1636 if (rc)
1637 goto error_rmdir;
1638 }
Luis Hector Chavezf8e8f4c2017-08-01 01:09:39 -07001639
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001640 for (i = 0; i < config->inherited_fd_count; i++) {
1641 rc = minijail_preserve_fd(
1642 c->jail, config->inherited_fds[i], config->inherited_fds[i]);
1643 if (rc)
1644 goto error_rmdir;
1645 }
Luis Hector Chavezf8e8f4c2017-08-01 01:09:39 -07001646
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001647 /* TODO(dgreid) - remove this once shared mounts are cleaned up. */
1648 minijail_skip_remount_private(c->jail);
Dylan Reid3da683b2016-04-05 03:35:35 -07001649
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001650 if (!config->keep_fds_open)
1651 minijail_close_open_fds(c->jail);
Luis Hector Chaveze18e7d42016-10-12 07:35:32 -07001652
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001653 if (config->use_capmask) {
1654 minijail_use_caps(c->jail, config->capmask);
1655 if (config->use_capmask_ambient) {
1656 minijail_set_ambient_caps(c->jail);
1657 }
1658 if (config->securebits_skip_mask) {
1659 minijail_skip_setting_securebits(c->jail, config->securebits_skip_mask);
1660 }
1661 }
Luis Hector Chavezff5978f2017-06-27 12:52:58 -07001662
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001663 if (!config->do_init)
1664 minijail_run_as_init(c->jail);
Luis Hector Chavezdac65c32017-07-21 10:30:23 -07001665
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001666 rc = minijail_run_pid_pipes_no_preload(c->jail,
1667 config->program_argv[0],
1668 config->program_argv,
1669 &c->init_pid,
1670 nullptr,
1671 nullptr,
1672 nullptr);
1673 if (rc)
1674 goto error_rmdir;
1675 return 0;
Dylan Reid837c74a2016-01-22 17:25:21 -08001676
1677error_rmdir:
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001678 if (!rc)
1679 rc = -errno;
1680 container_teardown(c);
1681 return rc;
Dylan Reid837c74a2016-01-22 17:25:21 -08001682}
1683
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001684const char* container_root(struct container* c) {
1685 return c->runfs;
Dylan Reid837c74a2016-01-22 17:25:21 -08001686}
1687
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001688int container_pid(struct container* c) {
1689 return c->init_pid;
Dylan Reid837c74a2016-01-22 17:25:21 -08001690}
1691
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001692static int container_teardown(struct container* c) {
1693 int ret = 0;
Dylan Reid837c74a2016-01-22 17:25:21 -08001694
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001695 unmount_external_mounts(c);
1696 if (c->runfsroot && c->runfs) {
1697 /* |c->runfsroot| may have been mounted recursively. Thus use
1698 * MNT_DETACH to "immediately disconnect the filesystem and all
1699 * filesystems mounted below it from each other and from the
1700 * mount table". Otherwise one would need to unmount every
1701 * single dependent mount before unmounting |c->runfsroot|
1702 * itself.
1703 */
1704 if (umount2(c->runfsroot, MNT_DETACH))
1705 ret = -errno;
1706 if (rmdir(c->runfsroot))
1707 ret = -errno;
1708 FREE_AND_NULL(c->runfsroot);
1709 }
1710 if (c->pid_file_path) {
1711 if (unlink(c->pid_file_path))
1712 ret = -errno;
1713 FREE_AND_NULL(c->pid_file_path);
1714 }
1715 if (c->runfs) {
1716 if (rmdir(c->runfs))
1717 ret = -errno;
1718 FREE_AND_NULL(c->runfs);
1719 }
1720 return ret;
Dylan Reid837c74a2016-01-22 17:25:21 -08001721}
1722
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001723int container_wait(struct container* c) {
1724 int rc;
Dylan Reidcf745c52016-04-22 10:18:03 -07001725
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001726 do {
1727 rc = minijail_wait(c->jail);
1728 } while (rc == -EINTR);
Dylan Reidcf745c52016-04-22 10:18:03 -07001729
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001730 // If the process had already been reaped, still perform teardown.
1731 if (rc == -ECHILD || rc >= 0) {
1732 rc = container_teardown(c);
1733 }
1734 return rc;
Dylan Reid837c74a2016-01-22 17:25:21 -08001735}
1736
Luis Hector Chavez31735bc2017-09-15 08:17:10 -07001737int container_kill(struct container* c) {
1738 if (kill(c->init_pid, SIGKILL) && errno != ESRCH)
1739 return -errno;
1740 return container_wait(c);
Dylan Reid837c74a2016-01-22 17:25:21 -08001741}