blob: ee93eee1fdc700181ca9b1b42be1ab3c8dc3cc44 [file] [log] [blame]
Jorge Lucangeli Obesd613ab22015-03-03 14:22:50 -08001/* Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
Elly Jonescd7a9042011-07-22 13:56:51 -04002 * Use of this source code is governed by a BSD-style license that can be
Will Drewry32ac9f52011-08-18 21:36:27 -05003 * found in the LICENSE file.
4 */
Elly Jonescd7a9042011-07-22 13:56:51 -04005
6#define _BSD_SOURCE
Arthur Gautier7a569072016-04-23 17:25:20 +00007#define _DEFAULT_SOURCE
Elly Jonescd7a9042011-07-22 13:56:51 -04008#define _GNU_SOURCE
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -07009
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -080010#include <asm/unistd.h>
Luis Hector Chavez43ff0802016-10-07 12:21:07 -070011#include <dirent.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040012#include <errno.h>
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -070013#include <fcntl.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040014#include <grp.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040015#include <linux/capability.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040016#include <sched.h>
17#include <signal.h>
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -070018#include <stdbool.h>
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -080019#include <stddef.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040020#include <stdio.h>
21#include <stdlib.h>
22#include <string.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040023#include <sys/capability.h>
24#include <sys/mount.h>
Will Drewryf89aef52011-09-16 16:48:57 -050025#include <sys/param.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040026#include <sys/prctl.h>
Dylan Reid0f72ef42017-06-06 15:42:49 -070027#include <sys/resource.h>
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -070028#include <sys/stat.h>
Mike Frysinger33ffef32017-01-13 19:53:19 -050029#include <sys/sysmacros.h>
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -070030#include <sys/types.h>
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -080031#include <sys/user.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040032#include <sys/wait.h>
Luis Hector Chavezfb449ab2016-10-14 09:49:22 -070033#include <syscall.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040034#include <unistd.h>
35
36#include "libminijail.h"
37#include "libminijail-private.h"
38
Jorge Lucangeli Obesa21c8fc2015-07-15 16:22:34 -070039#include "signal_handler.h"
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -080040#include "syscall_filter.h"
Jorge Lucangeli Obes13650612016-09-02 11:27:29 -040041#include "syscall_wrapper.h"
Jorge Lucangeli Obes0b208772017-04-19 14:15:46 -040042#include "system.h"
Jorge Lucangeli Obesa6b034d2012-08-07 15:29:20 -070043#include "util.h"
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -080044
/*
 * Fallback definitions for constants that older kernel headers may not
 * provide. Each one is only defined when the headers did not supply it.
 */
#ifndef PR_ALT_SYSCALL
# define PR_ALT_SYSCALL 0x43724f53
#endif

/* Seccomp filter related flags. */
#ifndef PR_SET_NO_NEW_PRIVS
# define PR_SET_NO_NEW_PRIVS 38
#endif

#ifndef SECCOMP_MODE_FILTER
# define SECCOMP_MODE_FILTER 2 /* Uses user-supplied filter. */
#endif

#ifndef SECCOMP_SET_MODE_STRICT
# define SECCOMP_SET_MODE_STRICT 0
#endif
#ifndef SECCOMP_SET_MODE_FILTER
# define SECCOMP_SET_MODE_FILTER 1
#endif

#ifndef SECCOMP_FILTER_FLAG_TSYNC
# define SECCOMP_FILTER_FLAG_TSYNC 1
#endif
/* End seccomp filter related flags. */

/* New cgroup namespace might not be in linux-headers yet. */
#ifndef CLONE_NEWCGROUP
# define CLONE_NEWCGROUP 0x02000000
#endif
75
/* Capacities of the fixed-size tables embedded in struct minijail. */
#define MAX_CGROUPS 10 /* 10 different controllers supported by Linux. */
#define MAX_RLIMITS 32 /* Currently there are 15 supported by Linux. */
#define MAX_PRESERVED_FDS 32U

/* Keyctl commands. */
#define KEYCTL_JOIN_SESSION_KEYRING 1
84
/* One resource limit recorded by minijail_rlimit(). */
struct minijail_rlimit {
	int type;	/* Resource identifier supplied by the caller. */
	rlim_t cur;	/* Requested current limit. */
	rlim_t max;	/* Requested maximum limit. */
};
90
/*
 * Node of the singly linked list of mounts requested for a jail. The string
 * members are heap copies owned by the node.
 */
struct mountpoint {
	char *src;		/* Mount source. */
	char *dest;		/* Mount destination. */
	char *type;		/* Filesystem type string. */
	char *data;		/* Optional mount data; NULL when absent. */
	int has_data;		/* Non-zero when |data| was supplied. */
	unsigned long flags;	/* Mount flags supplied by the caller. */
	struct mountpoint *next;
};
100
Luis Hector Chaveze0ba4ce2017-07-20 15:12:22 -0700101struct hook {
102 minijail_hook_t hook;
103 void *payload;
104 minijail_hook_event_t event;
105 struct hook *next;
106};
107
/* A file descriptor pair recorded by minijail_preserve_fd(). */
struct preserved_fd {
	int parent_fd;	/* Descriptor number supplied as |parent_fd|. */
	int child_fd;	/* Descriptor number supplied as |child_fd|. */
};
112
Will Drewryf89aef52011-09-16 16:48:57 -0500113struct minijail {
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700114 /*
Jorge Lucangeli Obesc8b21e12014-06-13 14:26:16 -0700115 * WARNING: if you add a flag here you need to make sure it's
116 * accounted for in minijail_pre{enter|exec}() below.
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700117 */
Elly Jonese1749eb2011-10-07 13:54:59 -0400118 struct {
Luis Hector Chavezfb449ab2016-10-14 09:49:22 -0700119 int uid : 1;
120 int gid : 1;
Lutz Justen13807cb2017-01-03 17:11:55 +0100121 int inherit_suppl_gids : 1;
122 int set_suppl_gids : 1;
123 int keep_suppl_gids : 1;
Luis Hector Chavezfb449ab2016-10-14 09:49:22 -0700124 int use_caps : 1;
125 int capbset_drop : 1;
Jorge Lucangeli Obesa6eb21a2017-04-20 10:44:00 -0400126 int set_ambient_caps : 1;
Luis Hector Chavezfb449ab2016-10-14 09:49:22 -0700127 int vfs : 1;
128 int enter_vfs : 1;
Luis Hector Chavezfb449ab2016-10-14 09:49:22 -0700129 int pids : 1;
130 int ipc : 1;
Mike Frysingerb9a7b162017-05-30 15:25:49 -0400131 int uts : 1;
Luis Hector Chavezfb449ab2016-10-14 09:49:22 -0700132 int net : 1;
133 int enter_net : 1;
134 int ns_cgroups : 1;
135 int userns : 1;
136 int disable_setgroups : 1;
137 int seccomp : 1;
138 int remount_proc_ro : 1;
139 int no_new_privs : 1;
140 int seccomp_filter : 1;
141 int seccomp_filter_tsync : 1;
142 int seccomp_filter_logging : 1;
143 int chroot : 1;
144 int pivot_root : 1;
Mike Frysinger33ffef32017-01-13 19:53:19 -0500145 int mount_dev : 1;
Luis Hector Chavezfb449ab2016-10-14 09:49:22 -0700146 int mount_tmp : 1;
147 int do_init : 1;
Luis Hector Chavezac981fc2017-09-18 15:52:38 -0700148 int run_as_init : 1;
Luis Hector Chavezfb449ab2016-10-14 09:49:22 -0700149 int pid_file : 1;
150 int cgroups : 1;
151 int alt_syscall : 1;
152 int reset_signal_mask : 1;
Luis Hector Chaveza27118a2018-04-04 08:18:01 -0700153 int reset_signal_handlers : 1;
Luis Hector Chavezfb449ab2016-10-14 09:49:22 -0700154 int close_open_fds : 1;
Chirantan Ekbote866bb3a2017-02-07 12:26:42 -0800155 int new_session_keyring : 1;
Jorge Lucangeli Obesdba62092017-05-18 17:10:23 -0400156 int forward_signals : 1;
Elly Jonese1749eb2011-10-07 13:54:59 -0400157 } flags;
158 uid_t uid;
159 gid_t gid;
160 gid_t usergid;
161 char *user;
Jorge Lucangeli Obesd16ac492015-12-03 14:44:35 -0800162 size_t suppl_gid_count;
163 gid_t *suppl_gid_list;
Elly Jonese1749eb2011-10-07 13:54:59 -0400164 uint64_t caps;
Jorge Lucangeli Obesf9fcdbe2016-02-19 15:04:09 -0800165 uint64_t cap_bset;
Elly Jonese1749eb2011-10-07 13:54:59 -0400166 pid_t initpid;
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -0700167 int mountns_fd;
Dylan Reid1102f5a2015-09-15 11:52:20 -0700168 int netns_fd;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400169 char *chrootdir;
Yu-Hsi Chiang3cc05ea2015-08-11 11:23:17 +0800170 char *pid_file_path;
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800171 char *uidmap;
172 char *gidmap;
Mike Frysingerb9a7b162017-05-30 15:25:49 -0400173 char *hostname;
Jorge Lucangeli Obesc2ba9f52015-12-01 07:58:10 -0800174 size_t filter_len;
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800175 struct sock_fprog *filter_prog;
Jorge Lucangeli Obesc2ba9f52015-12-01 07:58:10 -0800176 char *alt_syscall_table;
Dylan Reid648b2202015-10-23 00:50:00 -0700177 struct mountpoint *mounts_head;
178 struct mountpoint *mounts_tail;
Jorge Lucangeli Obesc2ba9f52015-12-01 07:58:10 -0800179 size_t mounts_count;
Mike Frysinger785b1c32018-02-23 15:47:24 -0500180 unsigned long remount_mode;
Martin Pelikánab9eb442017-01-25 11:53:58 +1100181 size_t tmpfs_size;
Dylan Reid605ce7f2016-01-19 19:21:00 -0800182 char *cgroups[MAX_CGROUPS];
183 size_t cgroup_count;
Dylan Reid0f72ef42017-06-06 15:42:49 -0700184 struct minijail_rlimit rlimits[MAX_RLIMITS];
185 size_t rlimit_count;
Luis Hector Chavezec0a2c12017-06-29 20:29:57 -0700186 uint64_t securebits_skip_mask;
Luis Hector Chaveze0ba4ce2017-07-20 15:12:22 -0700187 struct hook *hooks_head;
188 struct hook *hooks_tail;
Luis Hector Chavez1617f632017-08-01 18:32:30 -0700189 struct preserved_fd preserved_fds[MAX_PRESERVED_FDS];
190 size_t preserved_fd_count;
Will Drewryf89aef52011-09-16 16:48:57 -0500191};
192
Luis Hector Chavez64730af2017-09-13 13:18:59 -0700193static void run_hooks_or_die(const struct minijail *j,
194 minijail_hook_event_t event);
195
Mike Frysingerac08a682017-10-10 02:04:50 -0400196static void free_mounts_list(struct minijail *j)
197{
198 while (j->mounts_head) {
199 struct mountpoint *m = j->mounts_head;
200 j->mounts_head = j->mounts_head->next;
201 free(m->data);
202 free(m->type);
203 free(m->dest);
204 free(m->src);
205 free(m);
206 }
207 // No need to clear mounts_head as we know it's NULL after the loop.
208 j->mounts_tail = NULL;
209}
210
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700211/*
212 * Strip out flags meant for the parent.
213 * We keep things that are not inherited across execve(2) (e.g. capabilities),
214 * or are easier to set after execve(2) (e.g. seccomp filters).
215 */
216void minijail_preenter(struct minijail *j)
217{
218 j->flags.vfs = 0;
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -0700219 j->flags.enter_vfs = 0;
Dylan Reid791f5772015-09-14 20:02:42 -0700220 j->flags.remount_proc_ro = 0;
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700221 j->flags.pids = 0;
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +0800222 j->flags.do_init = 0;
Luis Hector Chavezac981fc2017-09-18 15:52:38 -0700223 j->flags.run_as_init = 0;
Yu-Hsi Chiang3cc05ea2015-08-11 11:23:17 +0800224 j->flags.pid_file = 0;
Jorge Lucangeli Obesb8a51382016-01-25 20:08:22 -0800225 j->flags.cgroups = 0;
Jorge Lucangeli Obesdba62092017-05-18 17:10:23 -0400226 j->flags.forward_signals = 0;
Mike Frysinger785b1c32018-02-23 15:47:24 -0500227 j->remount_mode = 0;
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700228}
229
230/*
231 * Strip out flags meant for the child.
232 * We keep things that are inherited across execve(2).
233 */
234void minijail_preexec(struct minijail *j)
235{
236 int vfs = j->flags.vfs;
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -0700237 int enter_vfs = j->flags.enter_vfs;
Dylan Reid791f5772015-09-14 20:02:42 -0700238 int remount_proc_ro = j->flags.remount_proc_ro;
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800239 int userns = j->flags.userns;
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700240 if (j->user)
241 free(j->user);
242 j->user = NULL;
Jorge Lucangeli Obese81a52f2015-12-04 16:05:23 -0800243 if (j->suppl_gid_list)
244 free(j->suppl_gid_list);
245 j->suppl_gid_list = NULL;
Mike Frysingerac08a682017-10-10 02:04:50 -0400246 free_mounts_list(j);
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700247 memset(&j->flags, 0, sizeof(j->flags));
248 /* Now restore anything we meant to keep. */
249 j->flags.vfs = vfs;
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -0700250 j->flags.enter_vfs = enter_vfs;
Dylan Reid791f5772015-09-14 20:02:42 -0700251 j->flags.remount_proc_ro = remount_proc_ro;
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800252 j->flags.userns = userns;
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700253 /* Note, |pids| will already have been used before this call. */
254}
255
256/* Minijail API. */
257
Will Drewry6ac91122011-10-21 16:38:58 -0500258struct minijail API *minijail_new(void)
Elly Jonese1749eb2011-10-07 13:54:59 -0400259{
Mike Frysinger785b1c32018-02-23 15:47:24 -0500260 struct minijail *j = calloc(1, sizeof(struct minijail));
261 j->remount_mode = MS_PRIVATE;
262 return j;
Elly Jonescd7a9042011-07-22 13:56:51 -0400263}
264
Will Drewry6ac91122011-10-21 16:38:58 -0500265void API minijail_change_uid(struct minijail *j, uid_t uid)
Elly Jonese1749eb2011-10-07 13:54:59 -0400266{
267 if (uid == 0)
268 die("useless change to uid 0");
269 j->uid = uid;
270 j->flags.uid = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400271}
272
Will Drewry6ac91122011-10-21 16:38:58 -0500273void API minijail_change_gid(struct minijail *j, gid_t gid)
Elly Jonese1749eb2011-10-07 13:54:59 -0400274{
275 if (gid == 0)
276 die("useless change to gid 0");
277 j->gid = gid;
278 j->flags.gid = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400279}
280
Jorge Lucangeli Obesbc67f442016-01-08 14:43:45 -0800281void API minijail_set_supplementary_gids(struct minijail *j, size_t size,
282 const gid_t *list)
Jorge Lucangeli Obesd16ac492015-12-03 14:44:35 -0800283{
Jorge Lucangeli Obes06940be2015-12-04 18:09:21 -0800284 size_t i;
285
Jorge Lucangeli Obes34543192017-01-11 16:07:57 -0500286 if (j->flags.inherit_suppl_gids)
287 die("cannot inherit *and* set supplementary groups");
288 if (j->flags.keep_suppl_gids)
289 die("cannot keep *and* set supplementary groups");
Jorge Lucangeli Obesd16ac492015-12-03 14:44:35 -0800290
Jorge Lucangeli Obesfd5fc562016-01-08 10:29:27 -0800291 if (size == 0) {
292 /* Clear supplementary groups. */
293 j->suppl_gid_list = NULL;
294 j->suppl_gid_count = 0;
Lutz Justen13807cb2017-01-03 17:11:55 +0100295 j->flags.set_suppl_gids = 1;
Jorge Lucangeli Obesbc67f442016-01-08 14:43:45 -0800296 return;
Jorge Lucangeli Obesfd5fc562016-01-08 10:29:27 -0800297 }
Jorge Lucangeli Obesd16ac492015-12-03 14:44:35 -0800298
299 /* Copy the gid_t array. */
300 j->suppl_gid_list = calloc(size, sizeof(gid_t));
301 if (!j->suppl_gid_list) {
Jorge Lucangeli Obesfd5fc562016-01-08 10:29:27 -0800302 die("failed to allocate internal supplementary group array");
Jorge Lucangeli Obesd16ac492015-12-03 14:44:35 -0800303 }
Jorge Lucangeli Obes06940be2015-12-04 18:09:21 -0800304 for (i = 0; i < size; i++) {
Jorge Lucangeli Obesd16ac492015-12-03 14:44:35 -0800305 j->suppl_gid_list[i] = list[i];
306 }
307 j->suppl_gid_count = size;
Lutz Justen13807cb2017-01-03 17:11:55 +0100308 j->flags.set_suppl_gids = 1;
309}
310
311void API minijail_keep_supplementary_gids(struct minijail *j) {
312 j->flags.keep_suppl_gids = 1;
Jorge Lucangeli Obesd16ac492015-12-03 14:44:35 -0800313}
314
Will Drewry6ac91122011-10-21 16:38:58 -0500315int API minijail_change_user(struct minijail *j, const char *user)
Elly Jonese1749eb2011-10-07 13:54:59 -0400316{
Luis Hector Chavez71323552017-09-05 09:17:22 -0700317 uid_t uid;
318 gid_t gid;
319 int rc = lookup_user(user, &uid, &gid);
320 if (rc)
321 return rc;
322 minijail_change_uid(j, uid);
Elly Jonese1749eb2011-10-07 13:54:59 -0400323 j->user = strdup(user);
324 if (!j->user)
325 return -ENOMEM;
Luis Hector Chavez71323552017-09-05 09:17:22 -0700326 j->usergid = gid;
Elly Jonese1749eb2011-10-07 13:54:59 -0400327 return 0;
Elly Jonescd7a9042011-07-22 13:56:51 -0400328}
329
Will Drewry6ac91122011-10-21 16:38:58 -0500330int API minijail_change_group(struct minijail *j, const char *group)
Elly Jonese1749eb2011-10-07 13:54:59 -0400331{
Luis Hector Chavez71323552017-09-05 09:17:22 -0700332 gid_t gid;
333 int rc = lookup_group(group, &gid);
334 if (rc)
335 return rc;
336 minijail_change_gid(j, gid);
Elly Jonese1749eb2011-10-07 13:54:59 -0400337 return 0;
Elly Jonescd7a9042011-07-22 13:56:51 -0400338}
339
Will Drewry6ac91122011-10-21 16:38:58 -0500340void API minijail_use_seccomp(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400341{
342 j->flags.seccomp = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400343}
344
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -0700345void API minijail_no_new_privs(struct minijail *j)
346{
347 j->flags.no_new_privs = 1;
348}
349
Will Drewry6ac91122011-10-21 16:38:58 -0500350void API minijail_use_seccomp_filter(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400351{
352 j->flags.seccomp_filter = 1;
Will Drewry32ac9f52011-08-18 21:36:27 -0500353}
354
Jorge Lucangeli Obes13650612016-09-02 11:27:29 -0400355void API minijail_set_seccomp_filter_tsync(struct minijail *j)
356{
Jorge Lucangeli Obes713f6fb2016-10-03 13:03:25 -0400357 if (j->filter_len > 0 && j->filter_prog != NULL) {
358 die("minijail_set_seccomp_filter_tsync() must be called "
359 "before minijail_parse_seccomp_filters()");
360 }
Jorge Lucangeli Obes13650612016-09-02 11:27:29 -0400361 j->flags.seccomp_filter_tsync = 1;
362}
363
Jorge Lucangeli Obesbda833c2012-07-31 16:25:56 -0700364void API minijail_log_seccomp_filter_failures(struct minijail *j)
365{
Jorge Lucangeli Obes713f6fb2016-10-03 13:03:25 -0400366 if (j->filter_len > 0 && j->filter_prog != NULL) {
367 die("minijail_log_seccomp_filter_failures() must be called "
368 "before minijail_parse_seccomp_filters()");
369 }
370 j->flags.seccomp_filter_logging = 1;
Jorge Lucangeli Obesbda833c2012-07-31 16:25:56 -0700371}
372
Will Drewry6ac91122011-10-21 16:38:58 -0500373void API minijail_use_caps(struct minijail *j, uint64_t capmask)
Elly Jonese1749eb2011-10-07 13:54:59 -0400374{
Jorge Lucangeli Obesf9fcdbe2016-02-19 15:04:09 -0800375 /*
376 * 'minijail_use_caps' configures a runtime-capabilities-only
377 * environment, including a bounding set matching the thread's runtime
378 * (permitted|inheritable|effective) sets.
379 * Therefore, it will override any existing bounding set configurations
380 * since the latter would allow gaining extra runtime capabilities from
381 * file capabilities.
382 */
383 if (j->flags.capbset_drop) {
384 warn("overriding bounding set configuration");
385 j->cap_bset = 0;
386 j->flags.capbset_drop = 0;
387 }
Elly Jonese1749eb2011-10-07 13:54:59 -0400388 j->caps = capmask;
Jorge Lucangeli Obesf9fcdbe2016-02-19 15:04:09 -0800389 j->flags.use_caps = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400390}
391
Jorge Lucangeli Obesf9fcdbe2016-02-19 15:04:09 -0800392void API minijail_capbset_drop(struct minijail *j, uint64_t capmask)
393{
394 if (j->flags.use_caps) {
395 /*
396 * 'minijail_use_caps' will have already configured a capability
397 * bounding set matching the (permitted|inheritable|effective)
398 * sets. Abort if the user tries to configure a separate
399 * bounding set. 'minijail_capbset_drop' and 'minijail_use_caps'
400 * are mutually exclusive.
401 */
402 die("runtime capabilities already configured, can't drop "
403 "bounding set separately");
404 }
405 j->cap_bset = capmask;
406 j->flags.capbset_drop = 1;
407}
408
Jorge Lucangeli Obesa6eb21a2017-04-20 10:44:00 -0400409void API minijail_set_ambient_caps(struct minijail *j)
410{
411 j->flags.set_ambient_caps = 1;
412}
413
Jorge Lucangeli Obesf9fcdbe2016-02-19 15:04:09 -0800414void API minijail_reset_signal_mask(struct minijail *j)
415{
Peter Qiu2860c462015-12-16 15:13:06 -0800416 j->flags.reset_signal_mask = 1;
417}
418
Luis Hector Chaveza27118a2018-04-04 08:18:01 -0700419void API minijail_reset_signal_handlers(struct minijail *j)
420{
421 j->flags.reset_signal_handlers = 1;
422}
423
Will Drewry6ac91122011-10-21 16:38:58 -0500424void API minijail_namespace_vfs(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400425{
426 j->flags.vfs = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400427}
428
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -0700429void API minijail_namespace_enter_vfs(struct minijail *j, const char *ns_path)
430{
Ricky Zhoubce609d2016-03-02 21:47:56 -0800431 int ns_fd = open(ns_path, O_RDONLY | O_CLOEXEC);
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -0700432 if (ns_fd < 0) {
433 pdie("failed to open namespace '%s'", ns_path);
434 }
435 j->mountns_fd = ns_fd;
436 j->flags.enter_vfs = 1;
437}
438
Chirantan Ekbote866bb3a2017-02-07 12:26:42 -0800439void API minijail_new_session_keyring(struct minijail *j)
440{
441 j->flags.new_session_keyring = 1;
442}
443
Luis Hector Chavezec0a2c12017-06-29 20:29:57 -0700444void API minijail_skip_setting_securebits(struct minijail *j,
445 uint64_t securebits_skip_mask)
446{
447 j->securebits_skip_mask = securebits_skip_mask;
448}
449
Mike Frysinger785b1c32018-02-23 15:47:24 -0500450void API minijail_remount_mode(struct minijail *j, unsigned long mode)
451{
452 j->remount_mode = mode;
453}
454
Jorge Lucangeli Obesa521bee2016-03-03 13:47:57 -0800455void API minijail_skip_remount_private(struct minijail *j)
456{
Mike Frysinger785b1c32018-02-23 15:47:24 -0500457 j->remount_mode = 0;
Jorge Lucangeli Obesa521bee2016-03-03 13:47:57 -0800458}
459
Will Drewry6ac91122011-10-21 16:38:58 -0500460void API minijail_namespace_pids(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400461{
Elly Jonese58176c2012-01-23 11:46:17 -0500462 j->flags.vfs = 1;
Dylan Reid791f5772015-09-14 20:02:42 -0700463 j->flags.remount_proc_ro = 1;
Elly Jonese1749eb2011-10-07 13:54:59 -0400464 j->flags.pids = 1;
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +0800465 j->flags.do_init = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400466}
467
Dylan Reidf7942472015-11-18 17:55:26 -0800468void API minijail_namespace_ipc(struct minijail *j)
469{
470 j->flags.ipc = 1;
471}
472
Mike Frysingerb9a7b162017-05-30 15:25:49 -0400473void API minijail_namespace_uts(struct minijail *j)
474{
475 j->flags.uts = 1;
476}
477
478int API minijail_namespace_set_hostname(struct minijail *j, const char *name)
479{
480 if (j->hostname)
481 return -EINVAL;
482 minijail_namespace_uts(j);
483 j->hostname = strdup(name);
484 if (!j->hostname)
485 return -ENOMEM;
486 return 0;
487}
488
Elly Fong-Jones6c086302013-03-20 17:15:28 -0400489void API minijail_namespace_net(struct minijail *j)
490{
491 j->flags.net = 1;
492}
493
Dylan Reid1102f5a2015-09-15 11:52:20 -0700494void API minijail_namespace_enter_net(struct minijail *j, const char *ns_path)
495{
Ricky Zhoubce609d2016-03-02 21:47:56 -0800496 int ns_fd = open(ns_path, O_RDONLY | O_CLOEXEC);
Dylan Reid1102f5a2015-09-15 11:52:20 -0700497 if (ns_fd < 0) {
498 pdie("failed to open namespace '%s'", ns_path);
499 }
500 j->netns_fd = ns_fd;
501 j->flags.enter_net = 1;
502}
503
Dylan Reid4cbc2a52016-06-17 19:06:07 -0700504void API minijail_namespace_cgroups(struct minijail *j)
505{
506 j->flags.ns_cgroups = 1;
507}
508
Luis Hector Chavez43ff0802016-10-07 12:21:07 -0700509void API minijail_close_open_fds(struct minijail *j)
510{
511 j->flags.close_open_fds = 1;
512}
513
Dylan Reid791f5772015-09-14 20:02:42 -0700514void API minijail_remount_proc_readonly(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400515{
516 j->flags.vfs = 1;
Dylan Reid791f5772015-09-14 20:02:42 -0700517 j->flags.remount_proc_ro = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400518}
519
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800520void API minijail_namespace_user(struct minijail *j)
521{
522 j->flags.userns = 1;
523}
524
Jorge Lucangeli Obes200299c2016-09-23 15:21:57 -0400525void API minijail_namespace_user_disable_setgroups(struct minijail *j)
526{
527 j->flags.disable_setgroups = 1;
528}
529
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800530int API minijail_uidmap(struct minijail *j, const char *uidmap)
531{
532 j->uidmap = strdup(uidmap);
533 if (!j->uidmap)
534 return -ENOMEM;
Yu-Hsi Chiang1912c5b2015-08-31 18:59:49 +0800535 char *ch;
536 for (ch = j->uidmap; *ch; ch++) {
537 if (*ch == ',')
538 *ch = '\n';
539 }
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800540 return 0;
541}
542
543int API minijail_gidmap(struct minijail *j, const char *gidmap)
544{
545 j->gidmap = strdup(gidmap);
546 if (!j->gidmap)
547 return -ENOMEM;
Yu-Hsi Chiang1912c5b2015-08-31 18:59:49 +0800548 char *ch;
549 for (ch = j->gidmap; *ch; ch++) {
550 if (*ch == ',')
551 *ch = '\n';
552 }
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800553 return 0;
554}
555
Will Drewry6ac91122011-10-21 16:38:58 -0500556void API minijail_inherit_usergroups(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400557{
Lutz Justen13807cb2017-01-03 17:11:55 +0100558 j->flags.inherit_suppl_gids = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400559}
560
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +0800561void API minijail_run_as_init(struct minijail *j)
562{
563 /*
564 * Since the jailed program will become 'init' in the new PID namespace,
565 * Minijail does not need to fork an 'init' process.
566 */
Luis Hector Chavezac981fc2017-09-18 15:52:38 -0700567 j->flags.run_as_init = 1;
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +0800568}
569
Jorge Lucangeli Obesc8b21e12014-06-13 14:26:16 -0700570int API minijail_enter_chroot(struct minijail *j, const char *dir)
571{
Elly Jones51a5b6c2011-10-12 19:09:26 -0400572 if (j->chrootdir)
573 return -EINVAL;
574 j->chrootdir = strdup(dir);
575 if (!j->chrootdir)
576 return -ENOMEM;
577 j->flags.chroot = 1;
578 return 0;
579}
580
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +0800581int API minijail_enter_pivot_root(struct minijail *j, const char *dir)
582{
583 if (j->chrootdir)
584 return -EINVAL;
585 j->chrootdir = strdup(dir);
586 if (!j->chrootdir)
587 return -ENOMEM;
588 j->flags.pivot_root = 1;
589 return 0;
590}
591
Dylan Reida14e08d2015-10-22 21:05:29 -0700592char API *minijail_get_original_path(struct minijail *j,
593 const char *path_inside_chroot)
594{
Dylan Reid648b2202015-10-23 00:50:00 -0700595 struct mountpoint *b;
Dylan Reida14e08d2015-10-22 21:05:29 -0700596
Dylan Reid648b2202015-10-23 00:50:00 -0700597 b = j->mounts_head;
Dylan Reida14e08d2015-10-22 21:05:29 -0700598 while (b) {
599 /*
600 * If |path_inside_chroot| is the exact destination of a
Dylan Reid648b2202015-10-23 00:50:00 -0700601 * mount, then the original path is exactly the source of
602 * the mount.
Dylan Reida14e08d2015-10-22 21:05:29 -0700603 * for example: "-b /some/path/exe,/chroot/path/exe"
Dylan Reid648b2202015-10-23 00:50:00 -0700604 * mount source = /some/path/exe, mount dest =
605 * /chroot/path/exe Then when getting the original path of
606 * "/chroot/path/exe", the source of that mount,
607 * "/some/path/exe" is what should be returned.
Dylan Reida14e08d2015-10-22 21:05:29 -0700608 */
609 if (!strcmp(b->dest, path_inside_chroot))
610 return strdup(b->src);
611
612 /*
613 * If |path_inside_chroot| is within the destination path of a
Dylan Reid648b2202015-10-23 00:50:00 -0700614 * mount, take the suffix of the chroot path relative to the
615 * mount destination path, and append it to the mount source
616 * path.
Dylan Reida14e08d2015-10-22 21:05:29 -0700617 */
618 if (!strncmp(b->dest, path_inside_chroot, strlen(b->dest))) {
619 const char *relative_path =
620 path_inside_chroot + strlen(b->dest);
Jorge Lucangeli Obes7b2e29c2016-08-04 12:21:03 -0400621 return path_join(b->src, relative_path);
Dylan Reida14e08d2015-10-22 21:05:29 -0700622 }
623 b = b->next;
624 }
625
626 /* If there is a chroot path, append |path_inside_chroot| to that. */
627 if (j->chrootdir)
Jorge Lucangeli Obes7b2e29c2016-08-04 12:21:03 -0400628 return path_join(j->chrootdir, path_inside_chroot);
Dylan Reida14e08d2015-10-22 21:05:29 -0700629
630 /* No chroot, so the path outside is the same as it is inside. */
631 return strdup(path_inside_chroot);
Dylan Reid08946cc2015-09-16 19:10:57 -0700632}
633
Martin Pelikánab9eb442017-01-25 11:53:58 +1100634size_t minijail_get_tmpfs_size(const struct minijail *j)
635{
636 return j->tmpfs_size;
637}
638
Mike Frysinger33ffef32017-01-13 19:53:19 -0500639void API minijail_mount_dev(struct minijail *j)
640{
641 j->flags.mount_dev = 1;
642}
643
Lee Campbell11af0622014-05-22 12:36:04 -0700644void API minijail_mount_tmp(struct minijail *j)
645{
Martin Pelikánab9eb442017-01-25 11:53:58 +1100646 minijail_mount_tmp_size(j, 64 * 1024 * 1024);
647}
648
649void API minijail_mount_tmp_size(struct minijail *j, size_t size)
650{
651 j->tmpfs_size = size;
Lee Campbell11af0622014-05-22 12:36:04 -0700652 j->flags.mount_tmp = 1;
653}
654
Yu-Hsi Chiang3cc05ea2015-08-11 11:23:17 +0800655int API minijail_write_pid_file(struct minijail *j, const char *path)
656{
657 j->pid_file_path = strdup(path);
658 if (!j->pid_file_path)
659 return -ENOMEM;
660 j->flags.pid_file = 1;
661 return 0;
662}
663
Dylan Reid605ce7f2016-01-19 19:21:00 -0800664int API minijail_add_to_cgroup(struct minijail *j, const char *path)
665{
666 if (j->cgroup_count >= MAX_CGROUPS)
667 return -ENOMEM;
668 j->cgroups[j->cgroup_count] = strdup(path);
669 if (!j->cgroups[j->cgroup_count])
670 return -ENOMEM;
671 j->cgroup_count++;
Jorge Lucangeli Obesb8a51382016-01-25 20:08:22 -0800672 j->flags.cgroups = 1;
Dylan Reid605ce7f2016-01-19 19:21:00 -0800673 return 0;
674}
675
Luis Hector Chavez7058a2d2018-01-29 08:41:34 -0800676int API minijail_rlimit(struct minijail *j, int type, rlim_t cur, rlim_t max)
Dylan Reid0f72ef42017-06-06 15:42:49 -0700677{
678 size_t i;
679
680 if (j->rlimit_count >= MAX_RLIMITS)
681 return -ENOMEM;
682 /* It's an error if the caller sets the same rlimit multiple times. */
683 for (i = 0; i < j->rlimit_count; i++) {
684 if (j->rlimits[i].type == type)
685 return -EEXIST;
686 }
687
688 j->rlimits[j->rlimit_count].type = type;
689 j->rlimits[j->rlimit_count].cur = cur;
690 j->rlimits[j->rlimit_count].max = max;
691 j->rlimit_count++;
692 return 0;
693}
694
Jorge Lucangeli Obesdba62092017-05-18 17:10:23 -0400695int API minijail_forward_signals(struct minijail *j)
696{
697 j->flags.forward_signals = 1;
698 return 0;
699}
700
Dylan Reid81e23972016-05-18 14:06:35 -0700701int API minijail_mount_with_data(struct minijail *j, const char *src,
702 const char *dest, const char *type,
703 unsigned long flags, const char *data)
Jorge Lucangeli Obesc8b21e12014-06-13 14:26:16 -0700704{
Dylan Reid648b2202015-10-23 00:50:00 -0700705 struct mountpoint *m;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400706
707 if (*dest != '/')
708 return -EINVAL;
Dylan Reid648b2202015-10-23 00:50:00 -0700709 m = calloc(1, sizeof(*m));
710 if (!m)
Elly Jones51a5b6c2011-10-12 19:09:26 -0400711 return -ENOMEM;
Dylan Reid648b2202015-10-23 00:50:00 -0700712 m->dest = strdup(dest);
713 if (!m->dest)
Elly Jones51a5b6c2011-10-12 19:09:26 -0400714 goto error;
Dylan Reid648b2202015-10-23 00:50:00 -0700715 m->src = strdup(src);
716 if (!m->src)
Elly Jones51a5b6c2011-10-12 19:09:26 -0400717 goto error;
Dylan Reid648b2202015-10-23 00:50:00 -0700718 m->type = strdup(type);
719 if (!m->type)
720 goto error;
Dylan Reid81e23972016-05-18 14:06:35 -0700721 if (data) {
722 m->data = strdup(data);
723 if (!m->data)
724 goto error;
725 m->has_data = 1;
726 }
Dylan Reid648b2202015-10-23 00:50:00 -0700727 m->flags = flags;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400728
Jorge Lucangeli Obes6c755d22016-01-28 15:24:40 -0800729 info("mount %s -> %s type '%s'", src, dest, type);
Elly Jones51a5b6c2011-10-12 19:09:26 -0400730
Elly Jonesdd3e8512012-01-23 15:13:38 -0500731 /*
Dylan Reid648b2202015-10-23 00:50:00 -0700732 * Force vfs namespacing so the mounts don't leak out into the
Elly Jones51a5b6c2011-10-12 19:09:26 -0400733 * containing vfs namespace.
734 */
735 minijail_namespace_vfs(j);
736
Dylan Reid648b2202015-10-23 00:50:00 -0700737 if (j->mounts_tail)
738 j->mounts_tail->next = m;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400739 else
Dylan Reid648b2202015-10-23 00:50:00 -0700740 j->mounts_head = m;
741 j->mounts_tail = m;
742 j->mounts_count++;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400743
744 return 0;
745
746error:
Dylan Reid81e23972016-05-18 14:06:35 -0700747 free(m->type);
Dylan Reid648b2202015-10-23 00:50:00 -0700748 free(m->src);
749 free(m->dest);
750 free(m);
Elly Jones51a5b6c2011-10-12 19:09:26 -0400751 return -ENOMEM;
752}
753
Dylan Reid81e23972016-05-18 14:06:35 -0700754int API minijail_mount(struct minijail *j, const char *src, const char *dest,
755 const char *type, unsigned long flags)
756{
757 return minijail_mount_with_data(j, src, dest, type, flags, NULL);
758}
759
Dylan Reid648b2202015-10-23 00:50:00 -0700760int API minijail_bind(struct minijail *j, const char *src, const char *dest,
761 int writeable)
762{
763 unsigned long flags = MS_BIND;
764
765 if (!writeable)
766 flags |= MS_RDONLY;
767
768 return minijail_mount(j, src, dest, "", flags);
769}
770
Luis Hector Chaveze0ba4ce2017-07-20 15:12:22 -0700771int API minijail_add_hook(struct minijail *j, minijail_hook_t hook,
772 void *payload, minijail_hook_event_t event)
773{
774 struct hook *c;
775
776 if (hook == NULL)
777 return -EINVAL;
778 if (event >= MINIJAIL_HOOK_EVENT_MAX)
779 return -EINVAL;
780 c = calloc(1, sizeof(*c));
781 if (!c)
782 return -ENOMEM;
783
784 c->hook = hook;
785 c->payload = payload;
786 c->event = event;
787
788 if (j->hooks_tail)
789 j->hooks_tail->next = c;
790 else
791 j->hooks_head = c;
792 j->hooks_tail = c;
793
794 return 0;
795}
796
Luis Hector Chavez1617f632017-08-01 18:32:30 -0700797int API minijail_preserve_fd(struct minijail *j, int parent_fd, int child_fd)
798{
799 if (parent_fd < 0 || child_fd < 0)
800 return -EINVAL;
801 if (j->preserved_fd_count >= MAX_PRESERVED_FDS)
802 return -ENOMEM;
803 j->preserved_fds[j->preserved_fd_count].parent_fd = parent_fd;
804 j->preserved_fds[j->preserved_fd_count].child_fd = child_fd;
805 j->preserved_fd_count++;
806 return 0;
807}
808
Jorge Lucangeli Obes13650612016-09-02 11:27:29 -0400809static void clear_seccomp_options(struct minijail *j)
810{
811 j->flags.seccomp_filter = 0;
812 j->flags.seccomp_filter_tsync = 0;
Jorge Lucangeli Obes713f6fb2016-10-03 13:03:25 -0400813 j->flags.seccomp_filter_logging = 0;
Jorge Lucangeli Obes13650612016-09-02 11:27:29 -0400814 j->filter_len = 0;
815 j->filter_prog = NULL;
816 j->flags.no_new_privs = 0;
817}
818
Jorge Lucangeli Obes4d4b3be2016-08-16 16:58:14 -0400819static int seccomp_should_parse_filters(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400820{
Jorge Lucangeli Obes13650612016-09-02 11:27:29 -0400821 if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL) == -1) {
Jorge Lucangeli Obes4d4b3be2016-08-16 16:58:14 -0400822 /*
823 * |errno| will be set to EINVAL when seccomp has not been
824 * compiled into the kernel. On certain platforms and kernel
825 * versions this is not a fatal failure. In that case, and only
826 * in that case, disable seccomp and skip loading the filters.
827 */
Jorge Lucangeli Obes7b2e29c2016-08-04 12:21:03 -0400828 if ((errno == EINVAL) && seccomp_can_softfail()) {
Jorge Lucangeli Obes13650612016-09-02 11:27:29 -0400829 warn("not loading seccomp filters, seccomp filter not "
830 "supported");
831 clear_seccomp_options(j);
Jorge Lucangeli Obes4d4b3be2016-08-16 16:58:14 -0400832 return 0;
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -0700833 }
Jorge Lucangeli Obes4d4b3be2016-08-16 16:58:14 -0400834 /*
835 * If |errno| != EINVAL or seccomp_can_softfail() is false,
836 * we can proceed. Worst case scenario minijail_enter() will
837 * abort() if seccomp fails.
838 */
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -0700839 }
Jorge Lucangeli Obes13650612016-09-02 11:27:29 -0400840 if (j->flags.seccomp_filter_tsync) {
841 /* Are the seccomp(2) syscall and the TSYNC option supported? */
842 if (sys_seccomp(SECCOMP_SET_MODE_FILTER,
843 SECCOMP_FILTER_FLAG_TSYNC, NULL) == -1) {
844 int saved_errno = errno;
Jorge Lucangeli Obes713f6fb2016-10-03 13:03:25 -0400845 if (saved_errno == ENOSYS && seccomp_can_softfail()) {
846 warn("seccomp(2) syscall not supported");
847 clear_seccomp_options(j);
848 return 0;
849 } else if (saved_errno == EINVAL &&
850 seccomp_can_softfail()) {
851 warn(
852 "seccomp filter thread sync not supported");
853 clear_seccomp_options(j);
Jorge Lucangeli Obes13650612016-09-02 11:27:29 -0400854 return 0;
855 }
856 /*
857 * Similar logic here. If seccomp_can_softfail() is
858 * false, or |errno| != ENOSYS, or |errno| != EINVAL,
859 * we can proceed. Worst case scenario minijail_enter()
860 * will abort() if seccomp or TSYNC fail.
861 */
862 }
863 }
Jorge Lucangeli Obes4d4b3be2016-08-16 16:58:14 -0400864 return 1;
865}
866
Luis Hector Chavez7624e712017-08-28 19:30:59 -0700867static int parse_seccomp_filters(struct minijail *j, const char *filename,
868 FILE *policy_file)
Jorge Lucangeli Obes4d4b3be2016-08-16 16:58:14 -0400869{
870 struct sock_fprog *fprog = malloc(sizeof(struct sock_fprog));
Jorge Lucangeli Obes713f6fb2016-10-03 13:03:25 -0400871 int use_ret_trap =
872 j->flags.seccomp_filter_tsync || j->flags.seccomp_filter_logging;
873 int allow_logging = j->flags.seccomp_filter_logging;
874
Luis Hector Chavez7624e712017-08-28 19:30:59 -0700875 if (compile_filter(filename, policy_file, fprog, use_ret_trap,
876 allow_logging)) {
Jorge Lucangeli Obes4d4b3be2016-08-16 16:58:14 -0400877 free(fprog);
878 return -1;
879 }
880
881 j->filter_len = fprog->len;
882 j->filter_prog = fprog;
883 return 0;
884}
885
886void API minijail_parse_seccomp_filters(struct minijail *j, const char *path)
887{
888 if (!seccomp_should_parse_filters(j))
889 return;
890
Elly Jonese1749eb2011-10-07 13:54:59 -0400891 FILE *file = fopen(path, "r");
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800892 if (!file) {
Jorge Lucangeli Obes224e4272012-08-02 14:31:39 -0700893 pdie("failed to open seccomp filter file '%s'", path);
Elly Jonese1749eb2011-10-07 13:54:59 -0400894 }
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800895
Luis Hector Chavez7624e712017-08-28 19:30:59 -0700896 if (parse_seccomp_filters(j, path, file) != 0) {
Jorge Lucangeli Obesbda833c2012-07-31 16:25:56 -0700897 die("failed to compile seccomp filter BPF program in '%s'",
898 path);
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800899 }
Jorge Lucangeli Obes4d4b3be2016-08-16 16:58:14 -0400900 fclose(file);
901}
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800902
Jorge Lucangeli Obes4d4b3be2016-08-16 16:58:14 -0400903void API minijail_parse_seccomp_filters_from_fd(struct minijail *j, int fd)
904{
Luis Hector Chavez7624e712017-08-28 19:30:59 -0700905 char *fd_path, *path;
906 FILE *file;
907
Jorge Lucangeli Obes4d4b3be2016-08-16 16:58:14 -0400908 if (!seccomp_should_parse_filters(j))
909 return;
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800910
Luis Hector Chavez7624e712017-08-28 19:30:59 -0700911 file = fdopen(fd, "r");
Jorge Lucangeli Obes4d4b3be2016-08-16 16:58:14 -0400912 if (!file) {
913 pdie("failed to associate stream with fd %d", fd);
914 }
915
Luis Hector Chavez7624e712017-08-28 19:30:59 -0700916 if (asprintf(&fd_path, "/proc/self/fd/%d", fd) == -1)
917 pdie("failed to create path for fd %d", fd);
918 path = realpath(fd_path, NULL);
919 if (path == NULL)
920 pwarn("failed to get path of fd %d", fd);
921 free(fd_path);
922
923 if (parse_seccomp_filters(j, path ? path : "<fd>", file) != 0) {
Jorge Lucangeli Obes4d4b3be2016-08-16 16:58:14 -0400924 die("failed to compile seccomp filter BPF program from fd %d",
925 fd);
926 }
Luis Hector Chavez7624e712017-08-28 19:30:59 -0700927 free(path);
Elly Jonese1749eb2011-10-07 13:54:59 -0400928 fclose(file);
Will Drewry32ac9f52011-08-18 21:36:27 -0500929}
930
Andrew Brestickereac28942015-11-11 16:04:46 -0800931int API minijail_use_alt_syscall(struct minijail *j, const char *table)
932{
933 j->alt_syscall_table = strdup(table);
934 if (!j->alt_syscall_table)
935 return -ENOMEM;
936 j->flags.alt_syscall = 1;
937 return 0;
938}
939
/*
 * Bookkeeping for serializing into a fixed-size buffer.
 *
 * |available| is the space left in |buf|, |buf| is the current write
 * position and |total| is the number of bytes the complete serialization
 * needs, whether or not they all fitted.
 */
struct marshal_state {
	size_t available;
	size_t total;
	char *buf;
};

/* Starts a serialization into |buf|, which can hold |available| bytes. */
void marshal_state_init(struct marshal_state *state, char *buf,
			size_t available)
{
	*state = (struct marshal_state){
		.available = available,
		.total = 0,
		.buf = buf,
	};
}

/*
 * Appends |length| bytes from |src|. Only as many bytes as still fit are
 * copied, but |total| always grows by the full |length|, so it ends up
 * holding the size a complete serialization requires.
 */
void marshal_append(struct marshal_state *state, void *src, size_t length)
{
	const size_t writable =
	    length < state->available ? length : state->available;

	/* |total| will contain the expected length. */
	state->total += length;

	if (writable == 0)
		return;

	/* Up to |available| will be written. */
	memcpy(state->buf, src, writable);
	state->available -= writable;
	state->buf += writable;
}
967
Jorge Lucangeli Obes7b2e29c2016-08-04 12:21:03 -0400968void marshal_mount(struct marshal_state *state, const struct mountpoint *m)
Dylan Reid81e23972016-05-18 14:06:35 -0700969{
970 marshal_append(state, m->src, strlen(m->src) + 1);
971 marshal_append(state, m->dest, strlen(m->dest) + 1);
972 marshal_append(state, m->type, strlen(m->type) + 1);
973 marshal_append(state, (char *)&m->has_data, sizeof(m->has_data));
974 if (m->has_data)
975 marshal_append(state, m->data, strlen(m->data) + 1);
976 marshal_append(state, (char *)&m->flags, sizeof(m->flags));
977}
978
Will Drewry6ac91122011-10-21 16:38:58 -0500979void minijail_marshal_helper(struct marshal_state *state,
980 const struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400981{
Dylan Reid648b2202015-10-23 00:50:00 -0700982 struct mountpoint *m = NULL;
Dylan Reid605ce7f2016-01-19 19:21:00 -0800983 size_t i;
984
Elly Jonese1749eb2011-10-07 13:54:59 -0400985 marshal_append(state, (char *)j, sizeof(*j));
986 if (j->user)
987 marshal_append(state, j->user, strlen(j->user) + 1);
Jorge Lucangeli Obesde02a5b2015-12-11 15:28:52 -0800988 if (j->suppl_gid_list) {
989 marshal_append(state, j->suppl_gid_list,
990 j->suppl_gid_count * sizeof(gid_t));
991 }
Elly Jones51a5b6c2011-10-12 19:09:26 -0400992 if (j->chrootdir)
993 marshal_append(state, j->chrootdir, strlen(j->chrootdir) + 1);
Mike Frysingerb9a7b162017-05-30 15:25:49 -0400994 if (j->hostname)
995 marshal_append(state, j->hostname, strlen(j->hostname) + 1);
Andrew Brestickereac28942015-11-11 16:04:46 -0800996 if (j->alt_syscall_table) {
997 marshal_append(state, j->alt_syscall_table,
998 strlen(j->alt_syscall_table) + 1);
999 }
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -08001000 if (j->flags.seccomp_filter && j->filter_prog) {
1001 struct sock_fprog *fp = j->filter_prog;
1002 marshal_append(state, (char *)fp->filter,
Jorge Lucangeli Obesa521bee2016-03-03 13:47:57 -08001003 fp->len * sizeof(struct sock_filter));
Elly Jonese1749eb2011-10-07 13:54:59 -04001004 }
Dylan Reid648b2202015-10-23 00:50:00 -07001005 for (m = j->mounts_head; m; m = m->next) {
Jorge Lucangeli Obes7b2e29c2016-08-04 12:21:03 -04001006 marshal_mount(state, m);
Elly Jones51a5b6c2011-10-12 19:09:26 -04001007 }
Dylan Reid605ce7f2016-01-19 19:21:00 -08001008 for (i = 0; i < j->cgroup_count; ++i)
1009 marshal_append(state, j->cgroups[i], strlen(j->cgroups[i]) + 1);
Will Drewryf89aef52011-09-16 16:48:57 -05001010}
1011
Will Drewry6ac91122011-10-21 16:38:58 -05001012size_t API minijail_size(const struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -04001013{
1014 struct marshal_state state;
1015 marshal_state_init(&state, NULL, 0);
1016 minijail_marshal_helper(&state, j);
1017 return state.total;
Will Drewry2ddaad02011-09-16 11:36:08 -05001018}
1019
Elly Jonese1749eb2011-10-07 13:54:59 -04001020int minijail_marshal(const struct minijail *j, char *buf, size_t available)
1021{
1022 struct marshal_state state;
1023 marshal_state_init(&state, buf, available);
1024 minijail_marshal_helper(&state, j);
1025 return (state.total > available);
Will Drewry2ddaad02011-09-16 11:36:08 -05001026}
1027
Elly Jonese1749eb2011-10-07 13:54:59 -04001028int minijail_unmarshal(struct minijail *j, char *serialized, size_t length)
1029{
Jorge Lucangeli Obesc2ba9f52015-12-01 07:58:10 -08001030 size_t i;
1031 size_t count;
Will Drewrybee7ba72011-10-21 20:47:01 -05001032 int ret = -EINVAL;
1033
Elly Jonese1749eb2011-10-07 13:54:59 -04001034 if (length < sizeof(*j))
Will Drewrybee7ba72011-10-21 20:47:01 -05001035 goto out;
Elly Jonese1749eb2011-10-07 13:54:59 -04001036 memcpy((void *)j, serialized, sizeof(*j));
1037 serialized += sizeof(*j);
1038 length -= sizeof(*j);
Will Drewryf89aef52011-09-16 16:48:57 -05001039
Will Drewrybee7ba72011-10-21 20:47:01 -05001040 /* Potentially stale pointers not used as signals. */
Jorge Lucangeli Obes3b2e6e42016-08-04 12:26:19 -04001041 j->pid_file_path = NULL;
1042 j->uidmap = NULL;
1043 j->gidmap = NULL;
Dylan Reid648b2202015-10-23 00:50:00 -07001044 j->mounts_head = NULL;
1045 j->mounts_tail = NULL;
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -08001046 j->filter_prog = NULL;
Luis Hector Chaveze0ba4ce2017-07-20 15:12:22 -07001047 j->hooks_head = NULL;
1048 j->hooks_tail = NULL;
Will Drewrybee7ba72011-10-21 20:47:01 -05001049
Elly Jonese1749eb2011-10-07 13:54:59 -04001050 if (j->user) { /* stale pointer */
Elly Jones51a5b6c2011-10-12 19:09:26 -04001051 char *user = consumestr(&serialized, &length);
1052 if (!user)
Will Drewrybee7ba72011-10-21 20:47:01 -05001053 goto clear_pointers;
Elly Jones51a5b6c2011-10-12 19:09:26 -04001054 j->user = strdup(user);
Will Drewrybee7ba72011-10-21 20:47:01 -05001055 if (!j->user)
1056 goto clear_pointers;
Elly Jonese1749eb2011-10-07 13:54:59 -04001057 }
Will Drewryf89aef52011-09-16 16:48:57 -05001058
Jorge Lucangeli Obesde02a5b2015-12-11 15:28:52 -08001059 if (j->suppl_gid_list) { /* stale pointer */
1060 if (j->suppl_gid_count > NGROUPS_MAX) {
1061 goto bad_gid_list;
1062 }
1063 size_t gid_list_size = j->suppl_gid_count * sizeof(gid_t);
1064 void *gid_list_bytes =
1065 consumebytes(gid_list_size, &serialized, &length);
1066 if (!gid_list_bytes)
1067 goto bad_gid_list;
1068
1069 j->suppl_gid_list = calloc(j->suppl_gid_count, sizeof(gid_t));
1070 if (!j->suppl_gid_list)
1071 goto bad_gid_list;
1072
1073 memcpy(j->suppl_gid_list, gid_list_bytes, gid_list_size);
1074 }
1075
Elly Jonesa8d1e1b2011-10-21 15:38:00 -04001076 if (j->chrootdir) { /* stale pointer */
1077 char *chrootdir = consumestr(&serialized, &length);
1078 if (!chrootdir)
Will Drewrybee7ba72011-10-21 20:47:01 -05001079 goto bad_chrootdir;
Elly Jonesa8d1e1b2011-10-21 15:38:00 -04001080 j->chrootdir = strdup(chrootdir);
Will Drewrybee7ba72011-10-21 20:47:01 -05001081 if (!j->chrootdir)
1082 goto bad_chrootdir;
Elly Jonesa8d1e1b2011-10-21 15:38:00 -04001083 }
1084
Mike Frysingerb9a7b162017-05-30 15:25:49 -04001085 if (j->hostname) { /* stale pointer */
1086 char *hostname = consumestr(&serialized, &length);
1087 if (!hostname)
1088 goto bad_hostname;
1089 j->hostname = strdup(hostname);
1090 if (!j->hostname)
1091 goto bad_hostname;
1092 }
1093
Andrew Brestickereac28942015-11-11 16:04:46 -08001094 if (j->alt_syscall_table) { /* stale pointer */
1095 char *alt_syscall_table = consumestr(&serialized, &length);
1096 if (!alt_syscall_table)
1097 goto bad_syscall_table;
1098 j->alt_syscall_table = strdup(alt_syscall_table);
1099 if (!j->alt_syscall_table)
1100 goto bad_syscall_table;
1101 }
1102
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -08001103 if (j->flags.seccomp_filter && j->filter_len > 0) {
1104 size_t ninstrs = j->filter_len;
1105 if (ninstrs > (SIZE_MAX / sizeof(struct sock_filter)) ||
1106 ninstrs > USHRT_MAX)
1107 goto bad_filters;
1108
1109 size_t program_len = ninstrs * sizeof(struct sock_filter);
1110 void *program = consumebytes(program_len, &serialized, &length);
1111 if (!program)
1112 goto bad_filters;
1113
1114 j->filter_prog = malloc(sizeof(struct sock_fprog));
Jorge Lucangeli Obesde02a5b2015-12-11 15:28:52 -08001115 if (!j->filter_prog)
1116 goto bad_filters;
1117
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -08001118 j->filter_prog->len = ninstrs;
1119 j->filter_prog->filter = malloc(program_len);
Jorge Lucangeli Obesde02a5b2015-12-11 15:28:52 -08001120 if (!j->filter_prog->filter)
1121 goto bad_filter_prog_instrs;
1122
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -08001123 memcpy(j->filter_prog->filter, program, program_len);
Elly Jonese1749eb2011-10-07 13:54:59 -04001124 }
Elly Jones51a5b6c2011-10-12 19:09:26 -04001125
Dylan Reid648b2202015-10-23 00:50:00 -07001126 count = j->mounts_count;
1127 j->mounts_count = 0;
Elly Jones51a5b6c2011-10-12 19:09:26 -04001128 for (i = 0; i < count; ++i) {
Dylan Reid648b2202015-10-23 00:50:00 -07001129 unsigned long *flags;
Dylan Reid81e23972016-05-18 14:06:35 -07001130 int *has_data;
Elly Jones51a5b6c2011-10-12 19:09:26 -04001131 const char *dest;
Dylan Reid648b2202015-10-23 00:50:00 -07001132 const char *type;
Dylan Reid81e23972016-05-18 14:06:35 -07001133 const char *data = NULL;
Elly Jones51a5b6c2011-10-12 19:09:26 -04001134 const char *src = consumestr(&serialized, &length);
1135 if (!src)
Dylan Reid648b2202015-10-23 00:50:00 -07001136 goto bad_mounts;
Elly Jones51a5b6c2011-10-12 19:09:26 -04001137 dest = consumestr(&serialized, &length);
1138 if (!dest)
Dylan Reid648b2202015-10-23 00:50:00 -07001139 goto bad_mounts;
1140 type = consumestr(&serialized, &length);
1141 if (!type)
1142 goto bad_mounts;
Dylan Reid81e23972016-05-18 14:06:35 -07001143 has_data = consumebytes(sizeof(*has_data), &serialized,
1144 &length);
1145 if (!has_data)
1146 goto bad_mounts;
1147 if (*has_data) {
1148 data = consumestr(&serialized, &length);
1149 if (!data)
1150 goto bad_mounts;
1151 }
Dylan Reid648b2202015-10-23 00:50:00 -07001152 flags = consumebytes(sizeof(*flags), &serialized, &length);
1153 if (!flags)
1154 goto bad_mounts;
Dylan Reid81e23972016-05-18 14:06:35 -07001155 if (minijail_mount_with_data(j, src, dest, type, *flags, data))
Dylan Reid648b2202015-10-23 00:50:00 -07001156 goto bad_mounts;
Elly Jones51a5b6c2011-10-12 19:09:26 -04001157 }
1158
Dylan Reid605ce7f2016-01-19 19:21:00 -08001159 count = j->cgroup_count;
1160 j->cgroup_count = 0;
1161 for (i = 0; i < count; ++i) {
1162 char *cgroup = consumestr(&serialized, &length);
1163 if (!cgroup)
1164 goto bad_cgroups;
1165 j->cgroups[i] = strdup(cgroup);
1166 if (!j->cgroups[i])
1167 goto bad_cgroups;
1168 ++j->cgroup_count;
1169 }
1170
Elly Jonese1749eb2011-10-07 13:54:59 -04001171 return 0;
Will Drewrybee7ba72011-10-21 20:47:01 -05001172
Dylan Reid605ce7f2016-01-19 19:21:00 -08001173bad_cgroups:
Mike Frysingerac08a682017-10-10 02:04:50 -04001174 free_mounts_list(j);
Dylan Reid605ce7f2016-01-19 19:21:00 -08001175 for (i = 0; i < j->cgroup_count; ++i)
1176 free(j->cgroups[i]);
Dylan Reid648b2202015-10-23 00:50:00 -07001177bad_mounts:
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -08001178 if (j->flags.seccomp_filter && j->filter_len > 0) {
1179 free(j->filter_prog->filter);
1180 free(j->filter_prog);
1181 }
Jorge Lucangeli Obesde02a5b2015-12-11 15:28:52 -08001182bad_filter_prog_instrs:
1183 if (j->filter_prog)
1184 free(j->filter_prog);
Will Drewrybee7ba72011-10-21 20:47:01 -05001185bad_filters:
Andrew Brestickereac28942015-11-11 16:04:46 -08001186 if (j->alt_syscall_table)
1187 free(j->alt_syscall_table);
1188bad_syscall_table:
Will Drewrybee7ba72011-10-21 20:47:01 -05001189 if (j->chrootdir)
1190 free(j->chrootdir);
1191bad_chrootdir:
Mike Frysingerb9a7b162017-05-30 15:25:49 -04001192 if (j->hostname)
1193 free(j->hostname);
1194bad_hostname:
Jorge Lucangeli Obesde02a5b2015-12-11 15:28:52 -08001195 if (j->suppl_gid_list)
1196 free(j->suppl_gid_list);
1197bad_gid_list:
Will Drewrybee7ba72011-10-21 20:47:01 -05001198 if (j->user)
1199 free(j->user);
1200clear_pointers:
1201 j->user = NULL;
Jorge Lucangeli Obesde02a5b2015-12-11 15:28:52 -08001202 j->suppl_gid_list = NULL;
Will Drewrybee7ba72011-10-21 20:47:01 -05001203 j->chrootdir = NULL;
Mike Frysingerb9a7b162017-05-30 15:25:49 -04001204 j->hostname = NULL;
Andrew Brestickereac28942015-11-11 16:04:46 -08001205 j->alt_syscall_table = NULL;
Dylan Reid605ce7f2016-01-19 19:21:00 -08001206 j->cgroup_count = 0;
Will Drewrybee7ba72011-10-21 20:47:01 -05001207out:
1208 return ret;
Will Drewry2ddaad02011-09-16 11:36:08 -05001209}
1210
Mike Frysinger33ffef32017-01-13 19:53:19 -05001211struct dev_spec {
1212 const char *name;
1213 mode_t mode;
1214 dev_t major, minor;
1215};
1216
1217static const struct dev_spec device_nodes[] = {
1218 {
1219 "null",
1220 S_IFCHR | 0666, 1, 3,
1221 },
1222 {
1223 "zero",
1224 S_IFCHR | 0666, 1, 5,
1225 },
1226 {
1227 "full",
1228 S_IFCHR | 0666, 1, 7,
1229 },
1230 {
1231 "urandom",
1232 S_IFCHR | 0444, 1, 9,
1233 },
1234 {
1235 "tty",
1236 S_IFCHR | 0666, 5, 0,
1237 },
1238};
1239
/*
 * Describes one symlink to create: |source| is the name of the link and
 * |dest| is the target it points at.
 */
struct dev_sym_spec {
	const char *source, *dest;
};

/* Symlinks that mount_dev() creates in the temporary /dev. */
static const struct dev_sym_spec device_symlinks[] = {
	{ .source = "ptmx", .dest = "pts/ptmx" },
	{ .source = "fd", .dest = "/proc/self/fd" },
	{ .source = "stdin", .dest = "fd/0" },
	{ .source = "stdout", .dest = "fd/1" },
	{ .source = "stderr", .dest = "fd/2" },
};
1251
/*
 * Tears down the temporary dev path: lazily unmounts whatever is mounted
 * there, removes the directory and frees the path string, so that failures
 * do not leave empty tempdirs behind.
 *
 * Takes ownership of |dev_path|. Failures of the individual steps are
 * ignored.
 */
static void mount_dev_cleanup(char *dev_path)
{
	(void)umount2(dev_path, MNT_DETACH);
	(void)rmdir(dev_path);
	free(dev_path);
}
1262
1263/*
1264 * Set up the pseudo /dev path at the temporary location.
1265 * See mount_dev_finalize for more details.
1266 */
1267static int mount_dev(char **dev_path_ret)
1268{
1269 int ret;
1270 int dev_fd;
1271 size_t i;
1272 mode_t mask;
1273 char *dev_path;
1274
1275 /*
1276 * Create a temp path for the /dev init. We'll relocate this to the
1277 * final location later on in the startup process.
1278 */
1279 dev_path = *dev_path_ret = strdup("/tmp/minijail.dev.XXXXXX");
1280 if (dev_path == NULL || mkdtemp(dev_path) == NULL)
1281 pdie("could not create temp path for /dev");
1282
1283 /* Set up the empty /dev mount point first. */
1284 ret = mount("minijail-devfs", dev_path, "tmpfs",
1285 MS_NOEXEC | MS_NOSUID, "size=5M,mode=755");
1286 if (ret) {
1287 rmdir(dev_path);
1288 return ret;
1289 }
1290
1291 /* We want to set the mode directly from the spec. */
1292 mask = umask(0);
1293
1294 /* Get a handle to the temp dev path for *at funcs below. */
1295 dev_fd = open(dev_path, O_DIRECTORY|O_PATH|O_CLOEXEC);
1296 if (dev_fd < 0) {
1297 ret = 1;
1298 goto done;
1299 }
1300
1301 /* Create all the nodes in /dev. */
1302 for (i = 0; i < ARRAY_SIZE(device_nodes); ++i) {
1303 const struct dev_spec *ds = &device_nodes[i];
1304 ret = mknodat(dev_fd, ds->name, ds->mode,
1305 makedev(ds->major, ds->minor));
1306 if (ret)
1307 goto done;
1308 }
1309
1310 /* Create all the symlinks in /dev. */
1311 for (i = 0; i < ARRAY_SIZE(device_symlinks); ++i) {
1312 const struct dev_sym_spec *ds = &device_symlinks[i];
1313 ret = symlinkat(ds->dest, dev_fd, ds->source);
1314 if (ret)
1315 goto done;
1316 }
1317
1318 /* Restore old mask. */
1319 done:
1320 close(dev_fd);
1321 umask(mask);
1322
1323 if (ret)
1324 mount_dev_cleanup(dev_path);
1325
1326 return ret;
1327}
1328
1329/*
1330 * Relocate the temporary /dev mount to its final /dev place.
1331 * We have to do this two step process so people can bind mount extra
1332 * /dev paths like /dev/log.
1333 */
1334static int mount_dev_finalize(const struct minijail *j, char *dev_path)
1335{
1336 int ret = -1;
1337 char *dest = NULL;
1338
1339 /* Unmount the /dev mount if possible. */
1340 if (umount2("/dev", MNT_DETACH))
1341 goto done;
1342
1343 if (asprintf(&dest, "%s/dev", j->chrootdir ? : "") < 0)
1344 goto done;
1345
1346 if (mount(dev_path, dest, NULL, MS_MOVE, NULL))
1347 goto done;
1348
1349 ret = 0;
1350 done:
1351 free(dest);
1352 mount_dev_cleanup(dev_path);
1353
1354 return ret;
1355}
1356
Jorge Lucangeli Obesd0a6e2f2015-11-24 14:21:21 -08001357/*
1358 * mount_one: Applies mounts from @m for @j, recursing as needed.
Dylan Reid648b2202015-10-23 00:50:00 -07001359 * @j Minijail these mounts are for
1360 * @m Head of list of mounts
Elly Jones51a5b6c2011-10-12 19:09:26 -04001361 *
1362 * Returns 0 for success.
1363 */
Mike Frysinger33ffef32017-01-13 19:53:19 -05001364static int mount_one(const struct minijail *j, struct mountpoint *m,
1365 const char *dev_path)
Jorge Lucangeli Obesc8b21e12014-06-13 14:26:16 -07001366{
Dylan Reid648b2202015-10-23 00:50:00 -07001367 int ret;
1368 char *dest;
1369 int remount_ro = 0;
1370
Mike Frysinger33ffef32017-01-13 19:53:19 -05001371 /* We assume |dest| has a leading "/". */
1372 if (dev_path && strncmp("/dev/", m->dest, 5) == 0) {
1373 /* Since the temp path is rooted at /dev, skip that dest part. */
1374 if (asprintf(&dest, "%s%s", dev_path, m->dest + 4) < 0)
1375 return -ENOMEM;
1376 } else {
Mike Frysingerac08a682017-10-10 02:04:50 -04001377 if (asprintf(&dest, "%s%s", j->chrootdir ?: "", m->dest) < 0)
Mike Frysinger33ffef32017-01-13 19:53:19 -05001378 return -ENOMEM;
1379 }
Dylan Reid648b2202015-10-23 00:50:00 -07001380
Mike Frysinger33ffef32017-01-13 19:53:19 -05001381 ret = setup_mount_destination(m->src, dest, j->uid, j->gid,
1382 (m->flags & MS_BIND));
1383 if (ret) {
yusukes1b32f852018-03-05 10:24:58 -08001384 warn("creating mount target '%s' failed", dest);
Luis Hector Chavez8c3acbc2017-10-24 16:45:00 -07001385 goto error;
Mike Frysinger33ffef32017-01-13 19:53:19 -05001386 }
Dylan Reideec77962016-06-30 19:35:10 -07001387
Dylan Reid648b2202015-10-23 00:50:00 -07001388 /*
Jorge Lucangeli Obes2b12ba42016-01-26 10:37:51 -08001389 * R/O bind mounts have to be remounted since 'bind' and 'ro'
1390 * can't both be specified in the original bind mount.
1391 * Remount R/O after the initial mount.
Dylan Reid648b2202015-10-23 00:50:00 -07001392 */
1393 if ((m->flags & MS_BIND) && (m->flags & MS_RDONLY)) {
1394 remount_ro = 1;
1395 m->flags &= ~MS_RDONLY;
Elly Jonesa1059632011-12-15 15:17:07 -05001396 }
Dylan Reid648b2202015-10-23 00:50:00 -07001397
Dylan Reid81e23972016-05-18 14:06:35 -07001398 ret = mount(m->src, dest, m->type, m->flags, m->data);
Mike Frysinger33ffef32017-01-13 19:53:19 -05001399 if (ret) {
1400 pwarn("mount: %s -> %s", m->src, dest);
Luis Hector Chavez8c3acbc2017-10-24 16:45:00 -07001401 goto error;
Mike Frysinger33ffef32017-01-13 19:53:19 -05001402 }
Dylan Reid648b2202015-10-23 00:50:00 -07001403
1404 if (remount_ro) {
1405 m->flags |= MS_RDONLY;
1406 ret = mount(m->src, dest, NULL,
Dylan Reid81e23972016-05-18 14:06:35 -07001407 m->flags | MS_REMOUNT, m->data);
Mike Frysinger33ffef32017-01-13 19:53:19 -05001408 if (ret) {
1409 pwarn("bind ro: %s -> %s", m->src, dest);
Luis Hector Chavez8c3acbc2017-10-24 16:45:00 -07001410 goto error;
Mike Frysinger33ffef32017-01-13 19:53:19 -05001411 }
Dylan Reid648b2202015-10-23 00:50:00 -07001412 }
1413
Elly Jones51a5b6c2011-10-12 19:09:26 -04001414 free(dest);
Dylan Reid648b2202015-10-23 00:50:00 -07001415 if (m->next)
Mike Frysinger33ffef32017-01-13 19:53:19 -05001416 return mount_one(j, m->next, dev_path);
Luis Hector Chavez8c3acbc2017-10-24 16:45:00 -07001417 return 0;
1418
1419error:
1420 free(dest);
Elly Jones51a5b6c2011-10-12 19:09:26 -04001421 return ret;
1422}
1423
Mike Frysingerac08a682017-10-10 02:04:50 -04001424static void process_mounts_or_die(const struct minijail *j)
Jorge Lucangeli Obesc8b21e12014-06-13 14:26:16 -07001425{
Mike Frysingerac08a682017-10-10 02:04:50 -04001426 /*
1427 * We have to mount /dev first in case there are bind mounts from
1428 * the original /dev into the new unique tmpfs one.
1429 */
1430 char *dev_path = NULL;
1431 if (j->flags.mount_dev && mount_dev(&dev_path))
1432 pdie("mount_dev failed");
Dylan Reid648b2202015-10-23 00:50:00 -07001433
Mike Frysingerac08a682017-10-10 02:04:50 -04001434 if (j->mounts_head && mount_one(j, j->mounts_head, dev_path)) {
1435 if (dev_path) {
1436 int saved_errno = errno;
1437 mount_dev_cleanup(dev_path);
1438 errno = saved_errno;
1439 }
1440 pdie("mount_one failed");
1441 }
Mike Frysinger33ffef32017-01-13 19:53:19 -05001442
1443 /*
Mike Frysingerac08a682017-10-10 02:04:50 -04001444 * Once all bind mounts have been processed, move the temp dev to
1445 * its final /dev home.
Mike Frysinger33ffef32017-01-13 19:53:19 -05001446 */
1447 if (j->flags.mount_dev && mount_dev_finalize(j, dev_path))
Mike Frysingerac08a682017-10-10 02:04:50 -04001448 pdie("mount_dev_finalize failed");
1449}
Elly Jones51a5b6c2011-10-12 19:09:26 -04001450
Mike Frysingerac08a682017-10-10 02:04:50 -04001451static int enter_chroot(const struct minijail *j)
1452{
Luis Hector Chavez64730af2017-09-13 13:18:59 -07001453 run_hooks_or_die(j, MINIJAIL_HOOK_EVENT_PRE_CHROOT);
1454
Elly Jones51a5b6c2011-10-12 19:09:26 -04001455 if (chroot(j->chrootdir))
1456 return -errno;
1457
1458 if (chdir("/"))
1459 return -errno;
1460
1461 return 0;
1462}
1463
Mike Frysingerac08a682017-10-10 02:04:50 -04001464static int enter_pivot_root(const struct minijail *j)
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +08001465{
Mike Frysingerac08a682017-10-10 02:04:50 -04001466 int oldroot, newroot;
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +08001467
Luis Hector Chavez64730af2017-09-13 13:18:59 -07001468 run_hooks_or_die(j, MINIJAIL_HOOK_EVENT_PRE_CHROOT);
1469
Jorge Lucangeli Obes43a6a862015-12-04 14:53:36 -08001470 /*
1471 * Keep the fd for both old and new root.
Jorge Lucangeli Obes6b0de9b2016-03-16 22:41:34 -07001472 * It will be used in fchdir(2) later.
Jorge Lucangeli Obes43a6a862015-12-04 14:53:36 -08001473 */
Ricky Zhoubce609d2016-03-02 21:47:56 -08001474 oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC);
Yu-Hsi Chiange0a530e2015-09-08 18:49:49 +08001475 if (oldroot < 0)
1476 pdie("failed to open / for fchdir");
Ricky Zhoubce609d2016-03-02 21:47:56 -08001477 newroot = open(j->chrootdir, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
Yu-Hsi Chiange0a530e2015-09-08 18:49:49 +08001478 if (newroot < 0)
1479 pdie("failed to open %s for fchdir", j->chrootdir);
1480
Jorge Lucangeli Obes43a6a862015-12-04 14:53:36 -08001481 /*
Jorge Lucangeli Obes6b0de9b2016-03-16 22:41:34 -07001482 * To ensure j->chrootdir is the root of a filesystem,
Jorge Lucangeli Obes43a6a862015-12-04 14:53:36 -08001483 * do a self bind mount.
1484 */
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +08001485 if (mount(j->chrootdir, j->chrootdir, "bind", MS_BIND | MS_REC, ""))
1486 pdie("failed to bind mount '%s'", j->chrootdir);
1487 if (chdir(j->chrootdir))
1488 return -errno;
Yu-Hsi Chiange0a530e2015-09-08 18:49:49 +08001489 if (syscall(SYS_pivot_root, ".", "."))
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +08001490 pdie("pivot_root");
Yu-Hsi Chiange0a530e2015-09-08 18:49:49 +08001491
1492 /*
Jorge Lucangeli Obes6b0de9b2016-03-16 22:41:34 -07001493 * Now the old root is mounted on top of the new root. Use fchdir(2) to
Yu-Hsi Chiange0a530e2015-09-08 18:49:49 +08001494 * change to the old root and unmount it.
1495 */
1496 if (fchdir(oldroot))
1497 pdie("failed to fchdir to old /");
Hidehiko Abe097b7192016-03-16 18:00:36 +09001498
1499 /*
Mike Frysinger785b1c32018-02-23 15:47:24 -05001500 * If skip_remount_private was enabled for minijail_enter(),
Jorge Lucangeli Obesdf7fab12016-06-01 17:15:31 -07001501 * there could be a shared mount point under |oldroot|. In that case,
1502 * mounts under this shared mount point will be unmounted below, and
1503 * this unmounting will propagate to the original mount namespace
1504 * (because the mount point is shared). To prevent this unexpected
1505 * unmounting, remove these mounts from their peer groups by recursively
1506 * remounting them as MS_PRIVATE.
Hidehiko Abe097b7192016-03-16 18:00:36 +09001507 */
1508 if (mount(NULL, ".", NULL, MS_REC | MS_PRIVATE, NULL))
Jorge Lucangeli Obes6b0de9b2016-03-16 22:41:34 -07001509 pdie("failed to mount(/, private) before umount(/)");
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +08001510 /* The old root might be busy, so use lazy unmount. */
Yu-Hsi Chiange0a530e2015-09-08 18:49:49 +08001511 if (umount2(".", MNT_DETACH))
1512 pdie("umount(/)");
1513 /* Change back to the new root. */
1514 if (fchdir(newroot))
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +08001515 return -errno;
Ricky Zhoubce609d2016-03-02 21:47:56 -08001516 if (close(oldroot))
1517 return -errno;
1518 if (close(newroot))
1519 return -errno;
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +08001520 if (chroot("/"))
1521 return -errno;
Jorge Lucangeli Obes46a55092015-10-12 15:31:59 -07001522 /* Set correct CWD for getcwd(3). */
1523 if (chdir("/"))
1524 return -errno;
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +08001525
1526 return 0;
1527}
1528
Martin Pelikánab9eb442017-01-25 11:53:58 +11001529static int mount_tmp(const struct minijail *j)
Lee Campbell11af0622014-05-22 12:36:04 -07001530{
Martin Pelikánab9eb442017-01-25 11:53:58 +11001531 const char fmt[] = "size=%zu,mode=1777";
1532 /* Count for the user storing ULLONG_MAX literally + extra space. */
1533 char data[sizeof(fmt) + sizeof("18446744073709551615ULL")];
1534 int ret;
1535
1536 ret = snprintf(data, sizeof(data), fmt, j->tmpfs_size);
1537
1538 if (ret <= 0)
1539 pdie("tmpfs size spec error");
1540 else if ((size_t)ret >= sizeof(data))
1541 pdie("tmpfs size spec too large");
Mike Frysingerb91d4042017-01-13 19:03:34 -05001542 return mount("none", "/tmp", "tmpfs", MS_NODEV | MS_NOEXEC | MS_NOSUID,
Martin Pelikánab9eb442017-01-25 11:53:58 +11001543 data);
Lee Campbell11af0622014-05-22 12:36:04 -07001544}
1545
Jorge Lucangeli Obesf205fff2016-08-06 09:06:21 -04001546static int remount_proc_readonly(const struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -04001547{
1548 const char *kProcPath = "/proc";
1549 const unsigned int kSafeFlags = MS_NODEV | MS_NOEXEC | MS_NOSUID;
Elly Jonesdd3e8512012-01-23 15:13:38 -05001550 /*
1551 * Right now, we're holding a reference to our parent's old mount of
Elly Jonese1749eb2011-10-07 13:54:59 -04001552 * /proc in our namespace, which means using MS_REMOUNT here would
1553 * mutate our parent's mount as well, even though we're in a VFS
Jorge Lucangeli Obesdf7fab12016-06-01 17:15:31 -07001554 * namespace (!). Instead, remove their mount from our namespace lazily
1555 * (MNT_DETACH) and make our own.
Elly Jonese1749eb2011-10-07 13:54:59 -04001556 */
Jorge Lucangeli Obesdf7fab12016-06-01 17:15:31 -07001557 if (umount2(kProcPath, MNT_DETACH)) {
1558 /*
1559 * If we are in a new user namespace, umount(2) will fail.
1560 * See http://man7.org/linux/man-pages/man7/user_namespaces.7.html
1561 */
1562 if (j->flags.userns) {
1563 info("umount(/proc, MNT_DETACH) failed, "
1564 "this is expected when using user namespaces");
1565 } else {
1566 return -errno;
1567 }
1568 }
Mike Frysinger3ba81572017-01-17 23:33:28 -05001569 if (mount("proc", kProcPath, "proc", kSafeFlags | MS_RDONLY, ""))
Elly Jonese1749eb2011-10-07 13:54:59 -04001570 return -errno;
1571 return 0;
Elly Jonescd7a9042011-07-22 13:56:51 -04001572}
1573
Jorge Lucangeli Obesf205fff2016-08-06 09:06:21 -04001574static void kill_child_and_die(const struct minijail *j, const char *msg)
Yu-Hsi Chiang3cc05ea2015-08-11 11:23:17 +08001575{
Jorge Lucangeli Obesf205fff2016-08-06 09:06:21 -04001576 kill(j->initpid, SIGKILL);
1577 die("%s", msg);
Dylan Reid605ce7f2016-01-19 19:21:00 -08001578}
1579
Jorge Lucangeli Obesf205fff2016-08-06 09:06:21 -04001580static void write_pid_file_or_die(const struct minijail *j)
Dylan Reid605ce7f2016-01-19 19:21:00 -08001581{
Keshav Santhanamdb6dab42016-08-10 16:33:34 -07001582 if (write_pid_to_path(j->initpid, j->pid_file_path))
Jorge Lucangeli Obesf205fff2016-08-06 09:06:21 -04001583 kill_child_and_die(j, "failed to write pid file");
Dylan Reid605ce7f2016-01-19 19:21:00 -08001584}
1585
Jorge Lucangeli Obesf205fff2016-08-06 09:06:21 -04001586static void add_to_cgroups_or_die(const struct minijail *j)
Dylan Reid605ce7f2016-01-19 19:21:00 -08001587{
1588 size_t i;
1589
Jorge Lucangeli Obesf205fff2016-08-06 09:06:21 -04001590 for (i = 0; i < j->cgroup_count; ++i) {
Keshav Santhanamdb6dab42016-08-10 16:33:34 -07001591 if (write_pid_to_path(j->initpid, j->cgroups[i]))
Jorge Lucangeli Obesf205fff2016-08-06 09:06:21 -04001592 kill_child_and_die(j, "failed to add to cgroups");
1593 }
Yu-Hsi Chiang3cc05ea2015-08-11 11:23:17 +08001594}
1595
Dylan Reid0f72ef42017-06-06 15:42:49 -07001596static void set_rlimits_or_die(const struct minijail *j)
1597{
1598 size_t i;
1599
1600 for (i = 0; i < j->rlimit_count; ++i) {
1601 struct rlimit limit;
1602 limit.rlim_cur = j->rlimits[i].cur;
1603 limit.rlim_max = j->rlimits[i].max;
1604 if (prlimit(j->initpid, j->rlimits[i].type, &limit, NULL))
1605 kill_child_and_die(j, "failed to set rlimit");
1606 }
1607}
1608
Jorge Lucangeli Obesf205fff2016-08-06 09:06:21 -04001609static void write_ugid_maps_or_die(const struct minijail *j)
1610{
1611 if (j->uidmap && write_proc_file(j->initpid, j->uidmap, "uid_map") != 0)
1612 kill_child_and_die(j, "failed to write uid_map");
Mike Frysinger6b190c02017-01-04 17:18:42 -05001613 if (j->gidmap && j->flags.disable_setgroups) {
1614 /* Older kernels might not have the /proc/<pid>/setgroups files. */
1615 int ret = write_proc_file(j->initpid, "deny", "setgroups");
Mike Frysingereea841b2017-01-13 18:11:57 -05001616 if (ret != 0) {
Mike Frysinger6b190c02017-01-04 17:18:42 -05001617 if (ret == -ENOENT) {
1618 /* See http://man7.org/linux/man-pages/man7/user_namespaces.7.html. */
1619 warn("could not disable setgroups(2)");
1620 } else
1621 kill_child_and_die(j, "failed to disable setgroups(2)");
1622 }
1623 }
Jorge Lucangeli Obesf205fff2016-08-06 09:06:21 -04001624 if (j->gidmap && write_proc_file(j->initpid, j->gidmap, "gid_map") != 0)
1625 kill_child_and_die(j, "failed to write gid_map");
1626}
1627
1628static void enter_user_namespace(const struct minijail *j)
1629{
Luis Hector Chavez71323552017-09-05 09:17:22 -07001630 int uid = j->flags.uid ? j->uid : 0;
1631 int gid = j->flags.gid ? j->gid : 0;
1632 if (j->gidmap && setresgid(gid, gid, gid)) {
1633 pdie("user_namespaces: setresgid(%d, %d, %d) failed", gid, gid,
1634 gid);
1635 }
1636 if (j->uidmap && setresuid(uid, uid, uid)) {
1637 pdie("user_namespaces: setresuid(%d, %d, %d) failed", uid, uid,
1638 uid);
1639 }
Jorge Lucangeli Obesf205fff2016-08-06 09:06:21 -04001640}
1641
/*
 * parent_setup_complete: Closes both ends of the synchronization pipe
 * |pipe_fds|, read end first. The return values of close(2) are ignored.
 */
static void parent_setup_complete(int *pipe_fds)
{
	for (size_t end = 0; end < 2; ++end)
		close(pipe_fds[end]);
}
1647
/*
 * wait_for_parent_setup: Called by the child process to block until the
 * parent has finished its side of the setup.
 *
 * The child closes its copy of the write end and then reads one byte from
 * the read end. Only a read returning 0 is accepted; any other result makes
 * the child die.
 */
static void wait_for_parent_setup(int *pipe_fds)
{
	char sink;
	ssize_t got;

	close(pipe_fds[1]);

	/* Wait for parent to complete setup and close the pipe. */
	got = read(pipe_fds[0], &sink, 1);
	if (got != 0)
		die("failed to sync with parent");

	close(pipe_fds[0]);
}
1663
1664static void drop_ugid(const struct minijail *j)
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07001665{
Lutz Justen13807cb2017-01-03 17:11:55 +01001666 if (j->flags.inherit_suppl_gids + j->flags.keep_suppl_gids +
1667 j->flags.set_suppl_gids > 1) {
Jorge Lucangeli Obes34543192017-01-11 16:07:57 -05001668 die("can only do one of inherit, keep, or set supplementary "
1669 "groups");
Jorge Lucangeli Obesd16ac492015-12-03 14:44:35 -08001670 }
1671
Lutz Justen13807cb2017-01-03 17:11:55 +01001672 if (j->flags.inherit_suppl_gids) {
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07001673 if (initgroups(j->user, j->usergid))
Jorge Lucangeli Obes457a5e32016-11-23 15:18:56 -05001674 pdie("initgroups(%s, %d) failed", j->user, j->usergid);
Lutz Justen13807cb2017-01-03 17:11:55 +01001675 } else if (j->flags.set_suppl_gids) {
1676 if (setgroups(j->suppl_gid_count, j->suppl_gid_list))
Jorge Lucangeli Obes457a5e32016-11-23 15:18:56 -05001677 pdie("setgroups(suppl_gids) failed");
Luis Hector Chavez71323552017-09-05 09:17:22 -07001678 } else if (!j->flags.keep_suppl_gids && !j->flags.disable_setgroups) {
Jorge Lucangeli Obesd0a6e2f2015-11-24 14:21:21 -08001679 /*
Jorge Lucangeli Obesd16ac492015-12-03 14:44:35 -08001680 * Only attempt to clear supplementary groups if we are changing
Luis Hector Chavez71323552017-09-05 09:17:22 -07001681 * users or groups, and if the caller did not request to disable
1682 * setgroups (used when entering a user namespace as a
1683 * non-privileged user).
Jorge Lucangeli Obesd0a6e2f2015-11-24 14:21:21 -08001684 */
Jorge Lucangeli Obes24499562016-12-01 11:59:27 -05001685 if ((j->flags.uid || j->flags.gid) && setgroups(0, NULL))
Jorge Lucangeli Obes457a5e32016-11-23 15:18:56 -05001686 pdie("setgroups(0, NULL) failed");
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07001687 }
1688
1689 if (j->flags.gid && setresgid(j->gid, j->gid, j->gid))
Jorge Lucangeli Obes457a5e32016-11-23 15:18:56 -05001690 pdie("setresgid(%d, %d, %d) failed", j->gid, j->gid, j->gid);
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07001691
1692 if (j->flags.uid && setresuid(j->uid, j->uid, j->uid))
Jorge Lucangeli Obes457a5e32016-11-23 15:18:56 -05001693 pdie("setresuid(%d, %d, %d) failed", j->uid, j->uid, j->uid);
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07001694}
1695
/*
 * drop_capbset: Removes from the capability bounding set every capability
 * in [0, |last_valid_cap|] whose bit is not set in |keep_mask|.
 *
 * Dies if any PR_CAPBSET_DROP call fails.
 */
static void drop_capbset(uint64_t keep_mask, unsigned int last_valid_cap)
{
	const size_t mask_bits = sizeof(keep_mask) * 8;

	for (unsigned int cap = 0; cap < mask_bits && cap <= last_valid_cap;
	     ++cap) {
		const bool keep = ((keep_mask >> cap) & 1) != 0;

		if (!keep && prctl(PR_CAPBSET_DROP, cap))
			pdie("could not drop capability from bounding set");
	}
}
1707
Jorge Lucangeli Obesf205fff2016-08-06 09:06:21 -04001708static void drop_caps(const struct minijail *j, unsigned int last_valid_cap)
Elly Jonese1749eb2011-10-07 13:54:59 -04001709{
Jorge Lucangeli Obes7ea269e2016-02-26 22:07:09 -08001710 if (!j->flags.use_caps)
1711 return;
1712
Elly Jonese1749eb2011-10-07 13:54:59 -04001713 cap_t caps = cap_get_proc();
Kees Cook323878a2013-02-05 15:35:24 -08001714 cap_value_t flag[1];
Jorge Lucangeli Obesa6eb21a2017-04-20 10:44:00 -04001715 const size_t ncaps = sizeof(j->caps) * 8;
Kees Cooke5609ac2013-02-06 14:12:41 -08001716 const uint64_t one = 1;
Elly Jonese1749eb2011-10-07 13:54:59 -04001717 unsigned int i;
1718 if (!caps)
1719 die("can't get process caps");
Jorge Lucangeli Obesa6eb21a2017-04-20 10:44:00 -04001720 if (cap_clear(caps))
1721 die("can't clear caps");
1722
1723 for (i = 0; i < ncaps && i <= last_valid_cap; ++i) {
Kees Cook323878a2013-02-05 15:35:24 -08001724 /* Keep CAP_SETPCAP for dropping bounding set bits. */
Kees Cooke5609ac2013-02-06 14:12:41 -08001725 if (i != CAP_SETPCAP && !(j->caps & (one << i)))
Elly Jonese1749eb2011-10-07 13:54:59 -04001726 continue;
Kees Cook323878a2013-02-05 15:35:24 -08001727 flag[0] = i;
1728 if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_SET))
Elly Jonese1749eb2011-10-07 13:54:59 -04001729 die("can't add effective cap");
Kees Cook323878a2013-02-05 15:35:24 -08001730 if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_SET))
Elly Jonese1749eb2011-10-07 13:54:59 -04001731 die("can't add permitted cap");
Kees Cook323878a2013-02-05 15:35:24 -08001732 if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_SET))
Elly Jonese1749eb2011-10-07 13:54:59 -04001733 die("can't add inheritable cap");
1734 }
1735 if (cap_set_proc(caps))
Kees Cook323878a2013-02-05 15:35:24 -08001736 die("can't apply initial cleaned capset");
1737
1738 /*
Jorge Lucangeli Obes54234212018-04-26 11:52:15 -04001739 * Instead of dropping the bounding set first, do it here in case
Kees Cook323878a2013-02-05 15:35:24 -08001740 * the caller had a more permissive bounding set which could
1741 * have been used above to raise a capability that wasn't already
1742 * present. This requires CAP_SETPCAP, so we raised/kept it above.
Jorge Lucangeli Obes54234212018-04-26 11:52:15 -04001743 *
1744 * However, if we're asked to skip setting *and* locking the
1745 * SECURE_NOROOT securebit, also skip dropping the bounding set.
1746 * If the caller wants to regain all capabilities when executing a
1747 * set-user-ID-root program, allow them to do so. The default behavior
1748 * (i.e. the behavior without |securebits_skip_mask| set) will still put
1749 * the jailed process tree in a capabilities-only environment.
1750 *
1751 * We check the negated skip mask for SECURE_NOROOT and
1752 * SECURE_NOROOT_LOCKED. If the bits are set in the negated mask they
1753 * will *not* be skipped in lock_securebits(), and therefore we should
1754 * drop the bounding set.
Kees Cook323878a2013-02-05 15:35:24 -08001755 */
Jorge Lucangeli Obes54234212018-04-26 11:52:15 -04001756 if (secure_noroot_set_and_locked(~j->securebits_skip_mask)) {
1757 drop_capbset(j->caps, last_valid_cap);
1758 } else {
1759 warn("SECURE_NOROOT not set, not dropping bounding set");
1760 }
Kees Cook323878a2013-02-05 15:35:24 -08001761
1762 /* If CAP_SETPCAP wasn't specifically requested, now we remove it. */
Kees Cooke5609ac2013-02-06 14:12:41 -08001763 if ((j->caps & (one << CAP_SETPCAP)) == 0) {
Kees Cook323878a2013-02-05 15:35:24 -08001764 flag[0] = CAP_SETPCAP;
1765 if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_CLEAR))
1766 die("can't clear effective cap");
1767 if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_CLEAR))
1768 die("can't clear permitted cap");
1769 if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_CLEAR))
1770 die("can't clear inheritable cap");
1771 }
1772
1773 if (cap_set_proc(caps))
1774 die("can't apply final cleaned capset");
1775
Jorge Lucangeli Obesa6eb21a2017-04-20 10:44:00 -04001776 /*
1777 * If ambient capabilities are supported, clear all capabilities first,
1778 * then raise the requested ones.
1779 */
1780 if (j->flags.set_ambient_caps) {
1781 if (!cap_ambient_supported()) {
1782 pdie("ambient capabilities not supported");
1783 }
Jorge Lucangeli Obesf6058c32017-04-26 10:26:59 -04001784 if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_CLEAR_ALL, 0, 0, 0) !=
1785 0) {
Jorge Lucangeli Obesa6eb21a2017-04-20 10:44:00 -04001786 pdie("can't clear ambient capabilities");
1787 }
1788
1789 for (i = 0; i < ncaps && i <= last_valid_cap; ++i) {
1790 if (!(j->caps & (one << i)))
1791 continue;
1792
1793 if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, i, 0,
1794 0) != 0) {
1795 pdie("prctl(PR_CAP_AMBIENT, "
1796 "PR_CAP_AMBIENT_RAISE, %u) failed",
1797 i);
1798 }
1799 }
1800 }
1801
Kees Cook323878a2013-02-05 15:35:24 -08001802 cap_free(caps);
Elly Jonescd7a9042011-07-22 13:56:51 -04001803}
1804
Jorge Lucangeli Obesf205fff2016-08-06 09:06:21 -04001805static void set_seccomp_filter(const struct minijail *j)
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07001806{
1807 /*
1808 * Set no_new_privs. See </kernel/seccomp.c> and </kernel/sys.c>
1809 * in the kernel source tree for an explanation of the parameters.
1810 */
1811 if (j->flags.no_new_privs) {
1812 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
1813 pdie("prctl(PR_SET_NO_NEW_PRIVS)");
1814 }
1815
1816 /*
Jorge Lucangeli Obes2413f372016-04-06 18:43:10 -07001817 * Code running with ASan
1818 * (https://github.com/google/sanitizers/wiki/AddressSanitizer)
1819 * will make system calls not included in the syscall filter policy,
1820 * which will likely crash the program. Skip setting seccomp filter in
1821 * that case.
1822 * 'running_with_asan()' has no inputs and is completely defined at
1823 * build time, so this cannot be used by an attacker to skip setting
1824 * seccomp filter.
1825 */
1826 if (j->flags.seccomp_filter && running_with_asan()) {
1827 warn("running with ASan, not setting seccomp filter");
1828 return;
1829 }
1830
Jorge Lucangeli Obes713f6fb2016-10-03 13:03:25 -04001831 if (j->flags.seccomp_filter) {
1832 if (j->flags.seccomp_filter_logging) {
1833 /*
1834 * If logging seccomp filter failures,
1835 * install the SIGSYS handler first.
1836 */
1837 if (install_sigsys_handler())
1838 pdie("failed to install SIGSYS handler");
1839 warn("logging seccomp filter failures");
1840 } else if (j->flags.seccomp_filter_tsync) {
1841 /*
1842 * If setting thread sync,
1843 * reset the SIGSYS signal handler so that
1844 * the entire thread group is killed.
1845 */
1846 if (signal(SIGSYS, SIG_DFL) == SIG_ERR)
1847 pdie("failed to reset SIGSYS disposition");
1848 info("reset SIGSYS disposition");
1849 }
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07001850 }
1851
1852 /*
1853 * Install the syscall filter.
1854 */
1855 if (j->flags.seccomp_filter) {
Jorge Lucangeli Obes13650612016-09-02 11:27:29 -04001856 if (j->flags.seccomp_filter_tsync) {
1857 if (sys_seccomp(SECCOMP_SET_MODE_FILTER,
1858 SECCOMP_FILTER_FLAG_TSYNC,
1859 j->filter_prog)) {
1860 pdie("seccomp(tsync) failed");
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -07001861 }
Jorge Lucangeli Obes13650612016-09-02 11:27:29 -04001862 } else {
1863 if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER,
1864 j->filter_prog)) {
1865 pdie("prctl(seccomp_filter) failed");
1866 }
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -07001867 }
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07001868 }
1869}
1870
Jorge Lucangeli Obesdba62092017-05-18 17:10:23 -04001871static pid_t forward_pid = -1;
1872
Mike Frysinger33d051a2018-05-30 16:41:10 -04001873static void forward_signal(int sig,
Mike Frysingerd9ef07c2018-05-30 16:51:36 -04001874 siginfo_t *siginfo attribute_unused,
1875 void *void_context attribute_unused)
Jorge Lucangeli Obesdba62092017-05-18 17:10:23 -04001876{
1877 if (forward_pid != -1) {
Mike Frysinger33d051a2018-05-30 16:41:10 -04001878 kill(forward_pid, sig);
Jorge Lucangeli Obesdba62092017-05-18 17:10:23 -04001879 }
1880}
1881
1882static void install_signal_handlers(void)
1883{
1884 struct sigaction act;
1885
1886 memset(&act, 0, sizeof(act));
1887 act.sa_sigaction = &forward_signal;
1888 act.sa_flags = SA_SIGINFO | SA_RESTART;
1889
1890 /* Handle all signals, except SIGCHLD. */
Mike Frysinger33d051a2018-05-30 16:41:10 -04001891 for (int sig = 1; sig < NSIG; sig++) {
Jorge Lucangeli Obesdba62092017-05-18 17:10:23 -04001892 /*
1893 * We don't care if we get EINVAL: that just means that we
1894 * can't handle this signal, so let's skip it and continue.
1895 */
Mike Frysinger33d051a2018-05-30 16:41:10 -04001896 sigaction(sig, &act, NULL);
Jorge Lucangeli Obesdba62092017-05-18 17:10:23 -04001897 }
1898 /* Reset SIGCHLD's handler. */
1899 signal(SIGCHLD, SIG_DFL);
1900
1901 /* Handle real-time signals. */
Mike Frysinger33d051a2018-05-30 16:41:10 -04001902 for (int sig = SIGRTMIN; sig <= SIGRTMAX; sig++) {
1903 sigaction(sig, &act, NULL);
Jorge Lucangeli Obesdba62092017-05-18 17:10:23 -04001904 }
1905}
1906
Luis Hector Chaveze0ba4ce2017-07-20 15:12:22 -07001907static const char *lookup_hook_name(minijail_hook_event_t event)
1908{
1909 switch (event) {
1910 case MINIJAIL_HOOK_EVENT_PRE_DROP_CAPS:
1911 return "pre-drop-caps";
1912 case MINIJAIL_HOOK_EVENT_PRE_EXECVE:
1913 return "pre-execve";
Luis Hector Chavez64730af2017-09-13 13:18:59 -07001914 case MINIJAIL_HOOK_EVENT_PRE_CHROOT:
1915 return "pre-chroot";
Luis Hector Chaveze0ba4ce2017-07-20 15:12:22 -07001916 case MINIJAIL_HOOK_EVENT_MAX:
1917 /*
1918 * Adding this in favor of a default case to force the
1919 * compiler to error out if a new enum value is added.
1920 */
1921 break;
1922 }
1923 return "unknown";
1924}
1925
1926static void run_hooks_or_die(const struct minijail *j,
1927 minijail_hook_event_t event)
1928{
1929 int rc;
1930 int hook_index = 0;
1931 for (struct hook *c = j->hooks_head; c; c = c->next) {
1932 if (c->event != event)
1933 continue;
1934 rc = c->hook(c->payload);
1935 if (rc != 0) {
1936 errno = -rc;
1937 pdie("%s hook (index %d) failed",
1938 lookup_hook_name(event), hook_index);
1939 }
1940 /* Only increase the index within the same hook event type. */
1941 ++hook_index;
1942 }
1943}
1944
Will Drewry6ac91122011-10-21 16:38:58 -05001945void API minijail_enter(const struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -04001946{
Dylan Reidf682d472015-09-17 21:39:07 -07001947 /*
Jorge Lucangeli Obes43e29b32015-12-08 21:07:14 -08001948 * If we're dropping caps, get the last valid cap from /proc now,
1949 * since /proc can be unmounted before drop_caps() is called.
Dylan Reidf682d472015-09-17 21:39:07 -07001950 */
Jorge Lucangeli Obes43e29b32015-12-08 21:07:14 -08001951 unsigned int last_valid_cap = 0;
Jorge Lucangeli Obesf9fcdbe2016-02-19 15:04:09 -08001952 if (j->flags.capbset_drop || j->flags.use_caps)
Jorge Lucangeli Obes43e29b32015-12-08 21:07:14 -08001953 last_valid_cap = get_last_valid_cap();
Dylan Reidf682d472015-09-17 21:39:07 -07001954
Elly Jonese1749eb2011-10-07 13:54:59 -04001955 if (j->flags.pids)
1956 die("tried to enter a pid-namespaced jail;"
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -07001957 " try minijail_run()?");
Elly Jonescd7a9042011-07-22 13:56:51 -04001958
Lutz Justen13807cb2017-01-03 17:11:55 +01001959 if (j->flags.inherit_suppl_gids && !j->user)
Jorge Lucangeli Obes34543192017-01-11 16:07:57 -05001960 die("cannot inherit supplementary groups without setting a "
1961 "username");
Elly Jonescd7a9042011-07-22 13:56:51 -04001962
Elly Jonesdd3e8512012-01-23 15:13:38 -05001963 /*
1964 * We can't recover from failures if we've dropped privileges partially,
Elly Jonese1749eb2011-10-07 13:54:59 -04001965 * so we don't even try. If any of our operations fail, we abort() the
1966 * entire process.
1967 */
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -07001968 if (j->flags.enter_vfs && setns(j->mountns_fd, CLONE_NEWNS))
Jorge Lucangeli Obes457a5e32016-11-23 15:18:56 -05001969 pdie("setns(CLONE_NEWNS) failed");
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -07001970
Jorge Lucangeli Obes805be392015-10-12 15:55:59 -07001971 if (j->flags.vfs) {
Jorge Lucangeli Obesf7a38682015-12-04 15:43:30 -08001972 if (unshare(CLONE_NEWNS))
Jorge Lucangeli Obes457a5e32016-11-23 15:18:56 -05001973 pdie("unshare(CLONE_NEWNS) failed");
Jorge Lucangeli Obesf7a38682015-12-04 15:43:30 -08001974 /*
Mike Frysinger785b1c32018-02-23 15:47:24 -05001975 * By default, remount all filesystems as private, unless
1976 * - Passed a specific remount mode, in which case remount with that,
1977 * - Asked not to remount at all, in which case skip the mount(2) call.
Jorge Lucangeli Obesf7a38682015-12-04 15:43:30 -08001978 * https://www.kernel.org/doc/Documentation/filesystems/sharedsubtree.txt
1979 */
Mike Frysinger785b1c32018-02-23 15:47:24 -05001980 if (j->remount_mode) {
1981 if (mount(NULL, "/", NULL, MS_REC | j->remount_mode, NULL))
Jorge Lucangeli Obes457a5e32016-11-23 15:18:56 -05001982 pdie("mount(NULL, /, NULL, MS_REC | MS_PRIVATE,"
1983 " NULL) failed");
Jorge Lucangeli Obesa521bee2016-03-03 13:47:57 -08001984 }
Jorge Lucangeli Obesf7a38682015-12-04 15:43:30 -08001985 }
Elly Fong-Jones6c086302013-03-20 17:15:28 -04001986
Dylan Reidf7942472015-11-18 17:55:26 -08001987 if (j->flags.ipc && unshare(CLONE_NEWIPC)) {
Jorge Lucangeli Obes457a5e32016-11-23 15:18:56 -05001988 pdie("unshare(CLONE_NEWIPC) failed");
Dylan Reidf7942472015-11-18 17:55:26 -08001989 }
1990
Mike Frysingerb9a7b162017-05-30 15:25:49 -04001991 if (j->flags.uts) {
1992 if (unshare(CLONE_NEWUTS))
1993 pdie("unshare(CLONE_NEWUTS) failed");
1994
1995 if (j->hostname && sethostname(j->hostname, strlen(j->hostname)))
1996 pdie("sethostname(%s) failed", j->hostname);
1997 }
1998
Dylan Reid1102f5a2015-09-15 11:52:20 -07001999 if (j->flags.enter_net) {
2000 if (setns(j->netns_fd, CLONE_NEWNET))
Jorge Lucangeli Obes457a5e32016-11-23 15:18:56 -05002001 pdie("setns(CLONE_NEWNET) failed");
Mike Frysinger7559dfe2016-11-15 18:58:39 -05002002 } else if (j->flags.net) {
2003 if (unshare(CLONE_NEWNET))
Jorge Lucangeli Obes457a5e32016-11-23 15:18:56 -05002004 pdie("unshare(CLONE_NEWNET) failed");
2005 config_net_loopback();
Dylan Reid1102f5a2015-09-15 11:52:20 -07002006 }
Elly Jonescd7a9042011-07-22 13:56:51 -04002007
Dylan Reid4cbc2a52016-06-17 19:06:07 -07002008 if (j->flags.ns_cgroups && unshare(CLONE_NEWCGROUP))
Jorge Lucangeli Obes457a5e32016-11-23 15:18:56 -05002009 pdie("unshare(CLONE_NEWCGROUP) failed");
Dylan Reid4cbc2a52016-06-17 19:06:07 -07002010
Chirantan Ekbote866bb3a2017-02-07 12:26:42 -08002011 if (j->flags.new_session_keyring) {
2012 if (syscall(SYS_keyctl, KEYCTL_JOIN_SESSION_KEYRING, NULL) < 0)
2013 pdie("keyctl(KEYCTL_JOIN_SESSION_KEYRING) failed");
2014 }
2015
Mike Frysingerac08a682017-10-10 02:04:50 -04002016 /* We have to process all the mounts before we chroot/pivot_root. */
2017 process_mounts_or_die(j);
Elly Jones51a5b6c2011-10-12 19:09:26 -04002018
Mike Frysingerac08a682017-10-10 02:04:50 -04002019 if (j->flags.chroot && enter_chroot(j))
Mike Frysinger33ffef32017-01-13 19:53:19 -05002020 pdie("chroot");
Mike Frysinger33ffef32017-01-13 19:53:19 -05002021
Mike Frysingerac08a682017-10-10 02:04:50 -04002022 if (j->flags.pivot_root && enter_pivot_root(j))
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +08002023 pdie("pivot_root");
2024
Martin Pelikánab9eb442017-01-25 11:53:58 +11002025 if (j->flags.mount_tmp && mount_tmp(j))
Lee Campbell11af0622014-05-22 12:36:04 -07002026 pdie("mount_tmp");
2027
Dylan Reid791f5772015-09-14 20:02:42 -07002028 if (j->flags.remount_proc_ro && remount_proc_readonly(j))
Elly Jonese1749eb2011-10-07 13:54:59 -04002029 pdie("remount");
Elly Jonescd7a9042011-07-22 13:56:51 -04002030
Luis Hector Chaveze0ba4ce2017-07-20 15:12:22 -07002031 run_hooks_or_die(j, MINIJAIL_HOOK_EVENT_PRE_DROP_CAPS);
2032
Jorge Lucangeli Obesf9fcdbe2016-02-19 15:04:09 -08002033 /*
2034 * If we're only dropping capabilities from the bounding set, but not
2035 * from the thread's (permitted|inheritable|effective) sets, do it now.
2036 */
2037 if (j->flags.capbset_drop) {
2038 drop_capbset(j->cap_bset, last_valid_cap);
2039 }
2040
2041 if (j->flags.use_caps) {
Elly Jonesdd3e8512012-01-23 15:13:38 -05002042 /*
2043 * POSIX capabilities are a bit tricky. If we drop our
Elly Jonese1749eb2011-10-07 13:54:59 -04002044 * capability to change uids, our attempt to use setuid()
2045 * below will fail. Hang on to root caps across setuid(), then
2046 * lock securebits.
2047 */
2048 if (prctl(PR_SET_KEEPCAPS, 1))
Jorge Lucangeli Obes457a5e32016-11-23 15:18:56 -05002049 pdie("prctl(PR_SET_KEEPCAPS) failed");
Jorge Lucangeli Obesf783b522016-03-14 14:34:10 -07002050
Luis Hector Chavezec0a2c12017-06-29 20:29:57 -07002051 if (lock_securebits(j->securebits_skip_mask) < 0) {
Jorge Lucangeli Obes0b208772017-04-19 14:15:46 -04002052 pdie("locking securebits failed");
Jorge Lucangeli Obesf783b522016-03-14 14:34:10 -07002053 }
Elly Jonese1749eb2011-10-07 13:54:59 -04002054 }
Elly Jonescd7a9042011-07-22 13:56:51 -04002055
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -07002056 if (j->flags.no_new_privs) {
Jorge Lucangeli Obesd8c82052016-02-25 16:00:32 -08002057 /*
2058 * If we're setting no_new_privs, we can drop privileges
2059 * before setting seccomp filter. This way filter policies
2060 * don't need to allow privilege-dropping syscalls.
2061 */
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07002062 drop_ugid(j);
Jorge Lucangeli Obesd8c82052016-02-25 16:00:32 -08002063 drop_caps(j, last_valid_cap);
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07002064 set_seccomp_filter(j);
Elly Jonese1749eb2011-10-07 13:54:59 -04002065 } else {
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07002066 /*
2067 * If we're not setting no_new_privs,
2068 * we need to set seccomp filter *before* dropping privileges.
2069 * WARNING: this means that filter policies *must* allow
2070 * setgroups()/setresgid()/setresuid() for dropping root and
2071 * capget()/capset()/prctl() for dropping caps.
2072 */
2073 set_seccomp_filter(j);
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07002074 drop_ugid(j);
Jorge Lucangeli Obesd8c82052016-02-25 16:00:32 -08002075 drop_caps(j, last_valid_cap);
Elly Jonese1749eb2011-10-07 13:54:59 -04002076 }
Elly Jonescd7a9042011-07-22 13:56:51 -04002077
Elly Jonesdd3e8512012-01-23 15:13:38 -05002078 /*
Andrew Brestickereac28942015-11-11 16:04:46 -08002079 * Select the specified alternate syscall table. The table must not
2080 * block prctl(2) if we're using seccomp as well.
2081 */
2082 if (j->flags.alt_syscall) {
2083 if (prctl(PR_ALT_SYSCALL, 1, j->alt_syscall_table))
Jorge Lucangeli Obes457a5e32016-11-23 15:18:56 -05002084 pdie("prctl(PR_ALT_SYSCALL) failed");
Andrew Brestickereac28942015-11-11 16:04:46 -08002085 }
2086
2087 /*
Elly Jonesdd3e8512012-01-23 15:13:38 -05002088 * seccomp has to come last since it cuts off all the other
Elly Jonese1749eb2011-10-07 13:54:59 -04002089 * privilege-dropping syscalls :)
2090 */
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -07002091 if (j->flags.seccomp && prctl(PR_SET_SECCOMP, 1)) {
Jorge Lucangeli Obes7b2e29c2016-08-04 12:21:03 -04002092 if ((errno == EINVAL) && seccomp_can_softfail()) {
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -07002093 warn("seccomp not supported");
2094 return;
2095 }
Jorge Lucangeli Obes457a5e32016-11-23 15:18:56 -05002096 pdie("prctl(PR_SET_SECCOMP) failed");
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -07002097 }
Elly Jonescd7a9042011-07-22 13:56:51 -04002098}
2099
Jorge Lucangeli Obesdb0bc672016-08-03 10:45:21 -04002100/* TODO(wad): will visibility affect this variable? */
Elly Jonescd7a9042011-07-22 13:56:51 -04002101static int init_exitstatus = 0;
2102
Mike Frysingerd9ef07c2018-05-30 16:51:36 -04002103void init_term(int sig attribute_unused)
Elly Jonese1749eb2011-10-07 13:54:59 -04002104{
2105 _exit(init_exitstatus);
Elly Jonescd7a9042011-07-22 13:56:51 -04002106}
2107
Jorge Lucangeli Obes963eeec2016-08-10 16:02:43 -04002108void init(pid_t rootpid)
Elly Jonese1749eb2011-10-07 13:54:59 -04002109{
2110 pid_t pid;
2111 int status;
Jorge Lucangeli Obesdb0bc672016-08-03 10:45:21 -04002112 /* So that we exit with the right status. */
Elly Jonese1749eb2011-10-07 13:54:59 -04002113 signal(SIGTERM, init_term);
Jorge Lucangeli Obesdb0bc672016-08-03 10:45:21 -04002114 /* TODO(wad): self jail with seccomp filters here. */
Elly Jonese1749eb2011-10-07 13:54:59 -04002115 while ((pid = wait(&status)) > 0) {
Elly Jonesdd3e8512012-01-23 15:13:38 -05002116 /*
2117 * This loop will only end when either there are no processes
Elly Jonese1749eb2011-10-07 13:54:59 -04002118 * left inside our pid namespace or we get a signal.
2119 */
2120 if (pid == rootpid)
2121 init_exitstatus = status;
2122 }
2123 if (!WIFEXITED(init_exitstatus))
2124 _exit(MINIJAIL_ERR_INIT);
2125 _exit(WEXITSTATUS(init_exitstatus));
Elly Jonescd7a9042011-07-22 13:56:51 -04002126}
2127
Will Drewry6ac91122011-10-21 16:38:58 -05002128int API minijail_from_fd(int fd, struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -04002129{
2130 size_t sz = 0;
2131 size_t bytes = read(fd, &sz, sizeof(sz));
2132 char *buf;
2133 int r;
2134 if (sizeof(sz) != bytes)
2135 return -EINVAL;
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07002136 if (sz > USHRT_MAX) /* arbitrary sanity check */
Elly Jonese1749eb2011-10-07 13:54:59 -04002137 return -E2BIG;
2138 buf = malloc(sz);
2139 if (!buf)
2140 return -ENOMEM;
2141 bytes = read(fd, buf, sz);
2142 if (bytes != sz) {
2143 free(buf);
2144 return -EINVAL;
2145 }
2146 r = minijail_unmarshal(j, buf, sz);
2147 free(buf);
2148 return r;
Will Drewry2f54b6a2011-09-16 13:45:31 -05002149}
2150
Will Drewry6ac91122011-10-21 16:38:58 -05002151int API minijail_to_fd(struct minijail *j, int fd)
Elly Jonese1749eb2011-10-07 13:54:59 -04002152{
2153 char *buf;
2154 size_t sz = minijail_size(j);
2155 ssize_t written;
2156 int r;
Elly Jonescd7a9042011-07-22 13:56:51 -04002157
Elly Jonese1749eb2011-10-07 13:54:59 -04002158 if (!sz)
2159 return -EINVAL;
2160 buf = malloc(sz);
2161 r = minijail_marshal(j, buf, sz);
2162 if (r) {
2163 free(buf);
2164 return r;
2165 }
2166 /* Sends [size][minijail]. */
2167 written = write(fd, &sz, sizeof(sz));
2168 if (written != sizeof(sz)) {
2169 free(buf);
2170 return -EFAULT;
2171 }
2172 written = write(fd, buf, sz);
2173 if (written < 0 || (size_t) written != sz) {
2174 free(buf);
2175 return -EFAULT;
2176 }
2177 free(buf);
2178 return 0;
Will Drewry2f54b6a2011-09-16 13:45:31 -05002179}
Elly Jonescd7a9042011-07-22 13:56:51 -04002180
Will Drewry6ac91122011-10-21 16:38:58 -05002181int setup_preload(void)
Elly Jonese1749eb2011-10-07 13:54:59 -04002182{
Daniel Erat5b7a3182015-08-19 16:06:22 -06002183#if defined(__ANDROID__)
Jorge Lucangeli Obes0b208772017-04-19 14:15:46 -04002184 /* Don't use LDPRELOAD on Android. */
Jorge Lucangeli Obesa21c8fc2015-07-15 16:22:34 -07002185 return 0;
2186#else
Elly Jonese1749eb2011-10-07 13:54:59 -04002187 char *oldenv = getenv(kLdPreloadEnvVar) ? : "";
2188 char *newenv = malloc(strlen(oldenv) + 2 + strlen(PRELOADPATH));
2189 if (!newenv)
2190 return -ENOMEM;
Elly Jonescd7a9042011-07-22 13:56:51 -04002191
Elly Jonese1749eb2011-10-07 13:54:59 -04002192 /* Only insert a separating space if we have something to separate... */
2193 sprintf(newenv, "%s%s%s", oldenv, strlen(oldenv) ? " " : "",
2194 PRELOADPATH);
Elly Jonescd7a9042011-07-22 13:56:51 -04002195
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07002196 /* setenv() makes a copy of the string we give it. */
Elly Jonese1749eb2011-10-07 13:54:59 -04002197 setenv(kLdPreloadEnvVar, newenv, 1);
2198 free(newenv);
2199 return 0;
Jorge Lucangeli Obesa21c8fc2015-07-15 16:22:34 -07002200#endif
Elly Jonescd7a9042011-07-22 13:56:51 -04002201}
2202
Jorge Lucangeli Obes0b208772017-04-19 14:15:46 -04002203static int setup_pipe(int fds[2])
Elly Jonese1749eb2011-10-07 13:54:59 -04002204{
2205 int r = pipe(fds);
2206 char fd_buf[11];
2207 if (r)
2208 return r;
2209 r = snprintf(fd_buf, sizeof(fd_buf), "%d", fds[0]);
2210 if (r <= 0)
2211 return -EINVAL;
2212 setenv(kFdEnvVar, fd_buf, 1);
2213 return 0;
Will Drewryf89aef52011-09-16 16:48:57 -05002214}
2215
/*
 * fd_is_inheritable: returns true if |fd| appears among the first |count|
 * entries of |fds|.
 */
static bool fd_is_inheritable(int fd, const int *fds, size_t count)
{
	size_t idx;

	for (idx = 0; idx < count; ++idx) {
		if (fds[idx] == fd)
			return true;
	}
	return false;
}

/*
 * close_open_fds: closes every file descriptor listed in /proc/self/fd
 * except the |size| descriptors in |inheritable_fds| and the descriptor used
 * to walk the directory itself.
 *
 * Returns 0 on success, or -1 if /proc/self/fd cannot be opened.
 */
static int close_open_fds(int *inheritable_fds, size_t size)
{
	DIR *fd_dir = opendir("/proc/self/fd");
	struct dirent *entry;
	int walker_fd;

	if (fd_dir == NULL)
		return -1;
	walker_fd = dirfd(fd_dir);

	for (entry = readdir(fd_dir); entry != NULL;
	     entry = readdir(fd_dir)) {
		char *tail;
		const int candidate = strtol(entry->d_name, &tail, 10);

		/* Skip entries whose names are not purely numeric. */
		if (*tail != '\0')
			continue;
		/* Avoid closing the directory fd we are iterating with. */
		if (candidate == walker_fd)
			continue;
		/*
		 * We might have set up some pipes that we want to share with
		 * the parent process, and should not be closed.
		 */
		if (fd_is_inheritable(candidate, inheritable_fds, size))
			continue;
		close(candidate);
	}

	closedir(fd_dir);
	return 0;
}
2252
Luis Hector Chavez1617f632017-08-01 18:32:30 -07002253static int redirect_fds(struct minijail *j)
2254{
2255 size_t i, i2;
2256 int closeable;
2257 for (i = 0; i < j->preserved_fd_count; i++) {
2258 if (dup2(j->preserved_fds[i].parent_fd,
2259 j->preserved_fds[i].child_fd) == -1) {
2260 return -1;
2261 }
2262 }
2263 /*
2264 * After all fds have been duped, we are now free to close all parent
2265 * fds that are *not* child fds.
2266 */
2267 for (i = 0; i < j->preserved_fd_count; i++) {
2268 closeable = true;
2269 for (i2 = 0; i2 < j->preserved_fd_count; i2++) {
2270 closeable &= j->preserved_fds[i].parent_fd !=
2271 j->preserved_fds[i2].child_fd;
2272 }
2273 if (closeable)
2274 close(j->preserved_fds[i].parent_fd);
2275 }
2276 return 0;
2277}
2278
/*
 * Describes how minijail_run_internal() should start the jailed child.
 */
struct minijail_run_config {
	/* Program to exec in the child. Required if |exec_in_child| is set. */
	const char *filename;
	/* Arguments for the child program. Required if |exec_in_child| is set. */
	char *const *argv;
	/* If true, use LD_PRELOAD. */
	int use_preload;
	/*
	 * If true, run |filename| in the child. Otherwise, the child returns
	 * to the caller.
	 */
	int exec_in_child;
};
2294
/*
 * Output locations that minijail_run_internal() fills in for its caller.
 * Any member may be NULL when the caller does not need that value.
 */
struct minijail_run_status {
	/* If non-NULL, a stdin pipe is created and its fd is stored here. */
	int *pstdin_fd;
	/* If non-NULL, a stdout pipe is created and its fd is stored here. */
	int *pstdout_fd;
	/* If non-NULL, a stderr pipe is created and its fd is stored here. */
	int *pstderr_fd;
	/* If non-NULL, receives the pid of the child process. */
	pid_t *pchild_pid;
};
2310
Dylan Reid18c49c82017-08-25 14:52:27 -07002311static int minijail_run_internal(struct minijail *j,
2312 const struct minijail_run_config *config,
2313 struct minijail_run_status *status_out);
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07002314
Will Drewry6ac91122011-10-21 16:38:58 -05002315int API minijail_run(struct minijail *j, const char *filename,
2316 char *const argv[])
Elly Jonese1749eb2011-10-07 13:54:59 -04002317{
Dylan Reidacfb8be2017-08-25 12:56:51 -07002318 struct minijail_run_config config = {
2319 .filename = filename,
2320 .argv = argv,
2321 .use_preload = true,
Dylan Reid0412dcc2017-08-24 11:33:15 -07002322 .exec_in_child = true,
Dylan Reidacfb8be2017-08-25 12:56:51 -07002323 };
2324 struct minijail_run_status status = {};
2325 return minijail_run_internal(j, &config, &status);
Jorge Lucangeli Obes9807d032012-04-17 13:36:00 -07002326}
2327
2328int API minijail_run_pid(struct minijail *j, const char *filename,
2329 char *const argv[], pid_t *pchild_pid)
2330{
Dylan Reidacfb8be2017-08-25 12:56:51 -07002331 struct minijail_run_config config = {
2332 .filename = filename,
2333 .argv = argv,
2334 .use_preload = true,
Dylan Reid0412dcc2017-08-24 11:33:15 -07002335 .exec_in_child = true,
Dylan Reidacfb8be2017-08-25 12:56:51 -07002336 };
2337 struct minijail_run_status status = {
2338 .pchild_pid = pchild_pid,
2339 };
2340 return minijail_run_internal(j, &config, &status);
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07002341}
2342
2343int API minijail_run_pipe(struct minijail *j, const char *filename,
Jorge Lucangeli Obes6537a562012-09-05 10:39:40 -07002344 char *const argv[], int *pstdin_fd)
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07002345{
Dylan Reidacfb8be2017-08-25 12:56:51 -07002346 struct minijail_run_config config = {
2347 .filename = filename,
2348 .argv = argv,
2349 .use_preload = true,
Dylan Reid0412dcc2017-08-24 11:33:15 -07002350 .exec_in_child = true,
Dylan Reidacfb8be2017-08-25 12:56:51 -07002351 };
2352 struct minijail_run_status status = {
2353 .pstdin_fd = pstdin_fd,
2354 };
2355 return minijail_run_internal(j, &config, &status);
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07002356}
2357
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08002358int API minijail_run_pid_pipes(struct minijail *j, const char *filename,
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -07002359 char *const argv[], pid_t *pchild_pid,
2360 int *pstdin_fd, int *pstdout_fd, int *pstderr_fd)
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08002361{
Dylan Reidacfb8be2017-08-25 12:56:51 -07002362 struct minijail_run_config config = {
2363 .filename = filename,
2364 .argv = argv,
2365 .use_preload = true,
Dylan Reid0412dcc2017-08-24 11:33:15 -07002366 .exec_in_child = true,
Dylan Reidacfb8be2017-08-25 12:56:51 -07002367 };
2368 struct minijail_run_status status = {
2369 .pstdin_fd = pstdin_fd,
2370 .pstdout_fd = pstdout_fd,
2371 .pstderr_fd = pstderr_fd,
2372 .pchild_pid = pchild_pid,
2373 };
2374 return minijail_run_internal(j, &config, &status);
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07002375}
2376
2377int API minijail_run_no_preload(struct minijail *j, const char *filename,
2378 char *const argv[])
2379{
Dylan Reidacfb8be2017-08-25 12:56:51 -07002380 struct minijail_run_config config = {
2381 .filename = filename,
2382 .argv = argv,
2383 .use_preload = false,
Dylan Reid0412dcc2017-08-24 11:33:15 -07002384 .exec_in_child = true,
Dylan Reidacfb8be2017-08-25 12:56:51 -07002385 };
2386 struct minijail_run_status status = {};
2387 return minijail_run_internal(j, &config, &status);
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07002388}
2389
Samuel Tan63187f42015-10-16 13:01:53 -07002390int API minijail_run_pid_pipes_no_preload(struct minijail *j,
Jorge Lucangeli Obes43a6a862015-12-04 14:53:36 -08002391 const char *filename,
2392 char *const argv[],
Samuel Tan63187f42015-10-16 13:01:53 -07002393 pid_t *pchild_pid,
Dylan Reidacfb8be2017-08-25 12:56:51 -07002394 int *pstdin_fd,
2395 int *pstdout_fd,
Jorge Lucangeli Obesa521bee2016-03-03 13:47:57 -08002396 int *pstderr_fd)
2397{
Dylan Reidacfb8be2017-08-25 12:56:51 -07002398 struct minijail_run_config config = {
2399 .filename = filename,
2400 .argv = argv,
2401 .use_preload = false,
Dylan Reid0412dcc2017-08-24 11:33:15 -07002402 .exec_in_child = true,
Dylan Reidacfb8be2017-08-25 12:56:51 -07002403 };
2404 struct minijail_run_status status = {
2405 .pstdin_fd = pstdin_fd,
2406 .pstdout_fd = pstdout_fd,
2407 .pstderr_fd = pstderr_fd,
2408 .pchild_pid = pchild_pid,
2409 };
2410 return minijail_run_internal(j, &config, &status);
Samuel Tan63187f42015-10-16 13:01:53 -07002411}
2412
Dylan Reid0412dcc2017-08-24 11:33:15 -07002413pid_t API minijail_fork(struct minijail *j)
2414{
2415 struct minijail_run_config config = {};
2416 struct minijail_run_status status = {};
2417 return minijail_run_internal(j, &config, &status);
2418}
2419
Dylan Reid18c49c82017-08-25 14:52:27 -07002420static int minijail_run_internal(struct minijail *j,
2421 const struct minijail_run_config *config,
2422 struct minijail_run_status *status_out)
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07002423{
Elly Jonese1749eb2011-10-07 13:54:59 -04002424 char *oldenv, *oldenv_copy = NULL;
2425 pid_t child_pid;
2426 int pipe_fds[2];
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07002427 int stdin_fds[2];
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08002428 int stdout_fds[2];
2429 int stderr_fds[2];
Dylan Reidce5b55e2016-01-13 11:04:16 -08002430 int child_sync_pipe_fds[2];
2431 int sync_child = 0;
Elly Jonese1749eb2011-10-07 13:54:59 -04002432 int ret;
Elly Jonesa05d7bb2012-06-14 14:09:27 -04002433 /* We need to remember this across the minijail_preexec() call. */
2434 int pid_namespace = j->flags.pids;
Luis Hector Chavezac981fc2017-09-18 15:52:38 -07002435 /*
2436 * Create an init process if we are entering a pid namespace, unless the
2437 * user has explicitly opted out by calling minijail_run_as_init().
2438 */
2439 int do_init = j->flags.do_init && !j->flags.run_as_init;
Dylan Reidacfb8be2017-08-25 12:56:51 -07002440 int use_preload = config->use_preload;
Ben Chan541c7e52011-08-26 14:55:53 -07002441
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07002442 if (use_preload) {
Dylan Reid0412dcc2017-08-24 11:33:15 -07002443 if (j->hooks_head != NULL)
2444 die("Minijail hooks are not supported with LD_PRELOAD");
2445 if (!config->exec_in_child)
2446 die("minijail_fork is not supported with LD_PRELOAD");
2447
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07002448 oldenv = getenv(kLdPreloadEnvVar);
2449 if (oldenv) {
2450 oldenv_copy = strdup(oldenv);
2451 if (!oldenv_copy)
2452 return -ENOMEM;
2453 }
2454
2455 if (setup_preload())
2456 return -EFAULT;
Elly Jonese1749eb2011-10-07 13:54:59 -04002457 }
Will Drewryf89aef52011-09-16 16:48:57 -05002458
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07002459 if (!use_preload) {
Luis Hector Chavezfe5fb8e2017-06-29 10:41:27 -07002460 if (j->flags.use_caps && j->caps != 0 &&
2461 !j->flags.set_ambient_caps) {
2462 die("non-empty, non-ambient capabilities are not "
2463 "supported without LD_PRELOAD");
2464 }
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07002465 }
Will Drewry2f54b6a2011-09-16 13:45:31 -05002466
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07002467 if (use_preload) {
2468 /*
2469 * Before we fork(2) and execve(2) the child process, we need
2470 * to open a pipe(2) to send the minijail configuration over.
2471 */
2472 if (setup_pipe(pipe_fds))
2473 return -EFAULT;
2474 }
Elly Jonescd7a9042011-07-22 13:56:51 -04002475
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07002476 /*
2477 * If we want to write to the child process' standard input,
2478 * create the pipe(2) now.
2479 */
Dylan Reidacfb8be2017-08-25 12:56:51 -07002480 if (status_out->pstdin_fd) {
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07002481 if (pipe(stdin_fds))
2482 return -EFAULT;
2483 }
2484
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08002485 /*
2486 * If we want to read from the child process' standard output,
2487 * create the pipe(2) now.
2488 */
Dylan Reidacfb8be2017-08-25 12:56:51 -07002489 if (status_out->pstdout_fd) {
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08002490 if (pipe(stdout_fds))
2491 return -EFAULT;
2492 }
2493
2494 /*
2495 * If we want to read from the child process' standard error,
2496 * create the pipe(2) now.
2497 */
Dylan Reidacfb8be2017-08-25 12:56:51 -07002498 if (status_out->pstderr_fd) {
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08002499 if (pipe(stderr_fds))
2500 return -EFAULT;
2501 }
2502
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08002503 /*
Jorge Lucangeli Obesab6fa6f2016-08-04 15:42:48 -04002504 * If we want to set up a new uid/gid map in the user namespace,
Jorge Lucangeli Obesb8a51382016-01-25 20:08:22 -08002505 * or if we need to add the child process to cgroups, create the pipe(2)
2506 * to sync between parent and child.
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08002507 */
Jorge Lucangeli Obesb8a51382016-01-25 20:08:22 -08002508 if (j->flags.userns || j->flags.cgroups) {
Dylan Reidce5b55e2016-01-13 11:04:16 -08002509 sync_child = 1;
2510 if (pipe(child_sync_pipe_fds))
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08002511 return -EFAULT;
2512 }
2513
Jorge Lucangeli Obesd0a6e2f2015-11-24 14:21:21 -08002514 /*
2515 * Use sys_clone() if and only if we're creating a pid namespace.
Elly Jones761b7412012-06-13 15:49:52 -04002516 *
2517 * tl;dr: WARNING: do not mix pid namespaces and multithreading.
2518 *
2519 * In multithreaded programs, there are a bunch of locks inside libc,
2520 * some of which may be held by other threads at the time that we call
2521 * minijail_run_pid(). If we call fork(), glibc does its level best to
2522 * ensure that we hold all of these locks before it calls clone()
2523 * internally and drop them after clone() returns, but when we call
2524 * sys_clone(2) directly, all that gets bypassed and we end up with a
2525 * child address space where some of libc's important locks are held by
2526 * other threads (which did not get cloned, and hence will never release
2527 * those locks). This is okay so long as we call exec() immediately
2528 * after, but a bunch of seemingly-innocent libc functions like setenv()
2529 * take locks.
2530 *
2531 * Hence, only call sys_clone() if we need to, in order to get at pid
2532 * namespacing. If we follow this path, the child's address space might
2533 * have broken locks; you may only call functions that do not acquire
2534 * any locks.
2535 *
2536 * Unfortunately, fork() acquires every lock it can get its hands on, as
2537 * previously detailed, so this function is highly likely to deadlock
2538 * later on (see "deadlock here") if we're multithreaded.
2539 *
2540 * We might hack around this by having the clone()d child (init of the
2541 * pid namespace) return directly, rather than leaving the clone()d
2542 * process hanging around to be init for the new namespace (and having
Jorge Lucangeli Obesa521bee2016-03-03 13:47:57 -08002543 * its fork()ed child return in turn), but that process would be
2544 * crippled with its libc locks potentially broken. We might try
2545 * fork()ing in the parent before we clone() to ensure that we own all
2546 * the locks, but then we have to have the forked child hanging around
2547 * consuming resources (and possibly having file descriptors / shared
2548 * memory regions / etc attached). We'd need to keep the child around to
2549 * avoid having its children get reparented to init.
Elly Jones761b7412012-06-13 15:49:52 -04002550 *
2551 * TODO(ellyjones): figure out if the "forked child hanging around"
2552 * problem is fixable or not. It would be nice if we worked in this
2553 * case.
2554 */
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08002555 if (pid_namespace) {
2556 int clone_flags = CLONE_NEWPID | SIGCHLD;
2557 if (j->flags.userns)
2558 clone_flags |= CLONE_NEWUSER;
2559 child_pid = syscall(SYS_clone, clone_flags, NULL);
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07002560 } else {
Elly Jones761b7412012-06-13 15:49:52 -04002561 child_pid = fork();
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07002562 }
Elly Jones761b7412012-06-13 15:49:52 -04002563
Elly Jonese1749eb2011-10-07 13:54:59 -04002564 if (child_pid < 0) {
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07002565 if (use_preload) {
2566 free(oldenv_copy);
2567 }
Lee Campbell1e4fc6a2014-06-06 17:40:02 -07002568 die("failed to fork child");
Elly Jonese1749eb2011-10-07 13:54:59 -04002569 }
Will Drewryf89aef52011-09-16 16:48:57 -05002570
Elly Jonese1749eb2011-10-07 13:54:59 -04002571 if (child_pid) {
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07002572 if (use_preload) {
2573 /* Restore parent's LD_PRELOAD. */
2574 if (oldenv_copy) {
2575 setenv(kLdPreloadEnvVar, oldenv_copy, 1);
2576 free(oldenv_copy);
2577 } else {
2578 unsetenv(kLdPreloadEnvVar);
2579 }
2580 unsetenv(kFdEnvVar);
Elly Jonese1749eb2011-10-07 13:54:59 -04002581 }
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07002582
Elly Jonese1749eb2011-10-07 13:54:59 -04002583 j->initpid = child_pid;
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07002584
Jorge Lucangeli Obesdba62092017-05-18 17:10:23 -04002585 if (j->flags.forward_signals) {
2586 forward_pid = child_pid;
2587 install_signal_handlers();
2588 }
2589
Yu-Hsi Chiang3cc05ea2015-08-11 11:23:17 +08002590 if (j->flags.pid_file)
Jorge Lucangeli Obesf205fff2016-08-06 09:06:21 -04002591 write_pid_file_or_die(j);
Yu-Hsi Chiang3cc05ea2015-08-11 11:23:17 +08002592
Jorge Lucangeli Obesb8a51382016-01-25 20:08:22 -08002593 if (j->flags.cgroups)
Jorge Lucangeli Obesf205fff2016-08-06 09:06:21 -04002594 add_to_cgroups_or_die(j);
Dylan Reid605ce7f2016-01-19 19:21:00 -08002595
Dylan Reid0f72ef42017-06-06 15:42:49 -07002596 if (j->rlimit_count)
2597 set_rlimits_or_die(j);
2598
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08002599 if (j->flags.userns)
Jorge Lucangeli Obesf205fff2016-08-06 09:06:21 -04002600 write_ugid_maps_or_die(j);
Dylan Reidce5b55e2016-01-13 11:04:16 -08002601
2602 if (sync_child)
2603 parent_setup_complete(child_sync_pipe_fds);
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08002604
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07002605 if (use_preload) {
2606 /* Send marshalled minijail. */
2607 close(pipe_fds[0]); /* read endpoint */
2608 ret = minijail_to_fd(j, pipe_fds[1]);
2609 close(pipe_fds[1]); /* write endpoint */
2610 if (ret) {
2611 kill(j->initpid, SIGKILL);
2612 die("failed to send marshalled minijail");
2613 }
Elly Jonese1749eb2011-10-07 13:54:59 -04002614 }
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07002615
Dylan Reidacfb8be2017-08-25 12:56:51 -07002616 if (status_out->pchild_pid)
2617 *status_out->pchild_pid = child_pid;
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07002618
2619 /*
2620 * If we want to write to the child process' standard input,
2621 * set up the write end of the pipe.
2622 */
Dylan Reidacfb8be2017-08-25 12:56:51 -07002623 if (status_out->pstdin_fd)
2624 *status_out->pstdin_fd =
2625 setup_pipe_end(stdin_fds, 1 /* write end */);
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08002626
2627 /*
2628 * If we want to read from the child process' standard output,
2629 * set up the read end of the pipe.
2630 */
Dylan Reidacfb8be2017-08-25 12:56:51 -07002631 if (status_out->pstdout_fd)
2632 *status_out->pstdout_fd =
2633 setup_pipe_end(stdout_fds, 0 /* read end */);
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08002634
2635 /*
2636 * If we want to read from the child process' standard error,
2637 * set up the read end of the pipe.
2638 */
Dylan Reidacfb8be2017-08-25 12:56:51 -07002639 if (status_out->pstderr_fd)
2640 *status_out->pstderr_fd =
2641 setup_pipe_end(stderr_fds, 0 /* read end */);
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07002642
Dylan Reid0412dcc2017-08-24 11:33:15 -07002643 /*
2644 * If forking return the child pid, in the normal exec case
2645 * return 0 for success.
2646 */
2647 if (!config->exec_in_child)
2648 return child_pid;
Elly Jonese1749eb2011-10-07 13:54:59 -04002649 return 0;
2650 }
Jorge Lucangeli Obesa2053902016-08-02 12:08:15 -04002651 /* Child process. */
Elly Jonese1749eb2011-10-07 13:54:59 -04002652 free(oldenv_copy);
Ben Chan541c7e52011-08-26 14:55:53 -07002653
Peter Qiu2860c462015-12-16 15:13:06 -08002654 if (j->flags.reset_signal_mask) {
2655 sigset_t signal_mask;
2656 if (sigemptyset(&signal_mask) != 0)
2657 pdie("sigemptyset failed");
2658 if (sigprocmask(SIG_SETMASK, &signal_mask, NULL) != 0)
2659 pdie("sigprocmask failed");
2660 }
2661
Luis Hector Chaveza27118a2018-04-04 08:18:01 -07002662 if (j->flags.reset_signal_handlers) {
2663 int signum;
2664 for (signum = 0; signum <= SIGRTMAX; signum++) {
2665 /*
2666 * Ignore EINVAL since some signal numbers in the range
2667 * might not be valid.
2668 */
2669 if (signal(signum, SIG_DFL) == SIG_ERR &&
2670 errno != EINVAL) {
2671 pdie("failed to reset signal %d disposition",
2672 signum);
2673 }
2674 }
2675 }
2676
Luis Hector Chavez43ff0802016-10-07 12:21:07 -07002677 if (j->flags.close_open_fds) {
Luis Hector Chavez1617f632017-08-01 18:32:30 -07002678 const size_t kMaxInheritableFdsSize = 10 + MAX_PRESERVED_FDS;
Luis Hector Chavez43ff0802016-10-07 12:21:07 -07002679 int inheritable_fds[kMaxInheritableFdsSize];
2680 size_t size = 0;
Luis Hector Chavez1617f632017-08-01 18:32:30 -07002681 size_t i;
Luis Hector Chavez43ff0802016-10-07 12:21:07 -07002682 if (use_preload) {
2683 inheritable_fds[size++] = pipe_fds[0];
2684 inheritable_fds[size++] = pipe_fds[1];
2685 }
2686 if (sync_child) {
2687 inheritable_fds[size++] = child_sync_pipe_fds[0];
2688 inheritable_fds[size++] = child_sync_pipe_fds[1];
2689 }
Dylan Reidacfb8be2017-08-25 12:56:51 -07002690 if (status_out->pstdin_fd) {
Luis Hector Chavez43ff0802016-10-07 12:21:07 -07002691 inheritable_fds[size++] = stdin_fds[0];
2692 inheritable_fds[size++] = stdin_fds[1];
2693 }
Dylan Reidacfb8be2017-08-25 12:56:51 -07002694 if (status_out->pstdout_fd) {
Luis Hector Chavez43ff0802016-10-07 12:21:07 -07002695 inheritable_fds[size++] = stdout_fds[0];
2696 inheritable_fds[size++] = stdout_fds[1];
2697 }
Dylan Reidacfb8be2017-08-25 12:56:51 -07002698 if (status_out->pstderr_fd) {
Luis Hector Chavez43ff0802016-10-07 12:21:07 -07002699 inheritable_fds[size++] = stderr_fds[0];
2700 inheritable_fds[size++] = stderr_fds[1];
2701 }
Luis Hector Chavez1617f632017-08-01 18:32:30 -07002702 for (i = 0; i < j->preserved_fd_count; i++) {
2703 /*
2704 * Preserve all parent_fds. They will be dup2(2)-ed in
2705 * the child later.
2706 */
2707 inheritable_fds[size++] = j->preserved_fds[i].parent_fd;
2708 }
Luis Hector Chavez43ff0802016-10-07 12:21:07 -07002709
2710 if (close_open_fds(inheritable_fds, size) < 0)
2711 die("failed to close open file descriptors");
2712 }
2713
Luis Hector Chavez1617f632017-08-01 18:32:30 -07002714 if (redirect_fds(j))
2715 die("failed to set up fd redirections");
2716
Dylan Reidce5b55e2016-01-13 11:04:16 -08002717 if (sync_child)
2718 wait_for_parent_setup(child_sync_pipe_fds);
2719
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08002720 if (j->flags.userns)
Dylan Reidce5b55e2016-01-13 11:04:16 -08002721 enter_user_namespace(j);
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08002722
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07002723 /*
2724 * If we want to write to the jailed process' standard input,
2725 * set up the read end of the pipe.
2726 */
Dylan Reidacfb8be2017-08-25 12:56:51 -07002727 if (status_out->pstdin_fd) {
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08002728 if (setup_and_dupe_pipe_end(stdin_fds, 0 /* read end */,
2729 STDIN_FILENO) < 0)
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07002730 die("failed to set up stdin pipe");
2731 }
2732
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08002733 /*
2734 * If we want to read from the jailed process' standard output,
2735 * set up the write end of the pipe.
2736 */
Dylan Reidacfb8be2017-08-25 12:56:51 -07002737 if (status_out->pstdout_fd) {
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08002738 if (setup_and_dupe_pipe_end(stdout_fds, 1 /* write end */,
2739 STDOUT_FILENO) < 0)
2740 die("failed to set up stdout pipe");
2741 }
2742
2743 /*
2744 * If we want to read from the jailed process' standard error,
2745 * set up the write end of the pipe.
2746 */
Dylan Reidacfb8be2017-08-25 12:56:51 -07002747 if (status_out->pstderr_fd) {
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08002748 if (setup_and_dupe_pipe_end(stderr_fds, 1 /* write end */,
2749 STDERR_FILENO) < 0)
2750 die("failed to set up stderr pipe");
2751 }
2752
Jorge Lucangeli Obesaa235b92016-11-23 13:48:15 -05002753 /*
2754 * If any of stdin, stdout, or stderr are TTYs, create a new session.
2755 * This prevents the jailed process from using the TIOCSTI ioctl
2756 * to push characters into the parent process terminal's input buffer,
2757 * therefore escaping the jail.
Stephen Barber5dd5b1b2017-10-16 23:02:39 -07002758 *
2759 * Since it has just forked, the child will not be a process group
2760 * leader, and this call to setsid() should always succeed.
Jorge Lucangeli Obesaa235b92016-11-23 13:48:15 -05002761 */
2762 if (isatty(STDIN_FILENO) || isatty(STDOUT_FILENO) ||
2763 isatty(STDERR_FILENO)) {
2764 if (setsid() < 0) {
2765 pdie("setsid() failed");
2766 }
2767 }
2768
Dylan Reid791f5772015-09-14 20:02:42 -07002769 /* If running an init program, let it decide when/how to mount /proc. */
2770 if (pid_namespace && !do_init)
2771 j->flags.remount_proc_ro = 0;
2772
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07002773 if (use_preload) {
2774 /* Strip out flags that cannot be inherited across execve(2). */
2775 minijail_preexec(j);
2776 } else {
Jorge Lucangeli Obesa2053902016-08-02 12:08:15 -04002777 /*
2778 * If not using LD_PRELOAD, do all jailing before execve(2).
2779 * Note that PID namespaces can only be entered on fork(2),
2780 * so that flag is still cleared.
2781 */
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07002782 j->flags.pids = 0;
2783 }
Dylan Reid0412dcc2017-08-24 11:33:15 -07002784
2785 /*
2786 * Jail this process.
2787 * If forking, return.
2788 * If not, execve(2) the target.
2789 */
Elly Jonese1749eb2011-10-07 13:54:59 -04002790 minijail_enter(j);
Elly Jonescd7a9042011-07-22 13:56:51 -04002791
Dylan Reid0412dcc2017-08-24 11:33:15 -07002792 if (config->exec_in_child && pid_namespace && do_init) {
Elly Jonesdd3e8512012-01-23 15:13:38 -05002793 /*
2794 * pid namespace: this process will become init inside the new
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +08002795 * namespace. We don't want all programs we might exec to have
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07002796 * to know how to be init. Normally (do_init == 1) we fork off
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +08002797 * a child to actually run the program. If |do_init == 0|, we
2798 * let the program keep pid 1 and be init.
Elly Jones761b7412012-06-13 15:49:52 -04002799 *
2800 * If we're multithreaded, we'll probably deadlock here. See
2801 * WARNING above.
Elly Jonese1749eb2011-10-07 13:54:59 -04002802 */
2803 child_pid = fork();
Jorge Lucangeli Obes963eeec2016-08-10 16:02:43 -04002804 if (child_pid < 0) {
Elly Jonese1749eb2011-10-07 13:54:59 -04002805 _exit(child_pid);
Jorge Lucangeli Obes963eeec2016-08-10 16:02:43 -04002806 } else if (child_pid > 0) {
Jorge Lucangeli Obes13650612016-09-02 11:27:29 -04002807 /*
2808 * Best effort. Don't bother checking the return value.
2809 */
Jorge Lucangeli Obes963eeec2016-08-10 16:02:43 -04002810 prctl(PR_SET_NAME, "minijail-init");
2811 init(child_pid); /* Never returns. */
2812 }
Elly Jonese1749eb2011-10-07 13:54:59 -04002813 }
Elly Jonescd7a9042011-07-22 13:56:51 -04002814
Luis Hector Chaveze0ba4ce2017-07-20 15:12:22 -07002815 run_hooks_or_die(j, MINIJAIL_HOOK_EVENT_PRE_EXECVE);
2816
Dylan Reid0412dcc2017-08-24 11:33:15 -07002817 if (!config->exec_in_child)
2818 return 0;
2819
Elly Jonesdd3e8512012-01-23 15:13:38 -05002820 /*
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07002821 * If we aren't pid-namespaced, or the jailed program asked to be init:
Elly Jonese1749eb2011-10-07 13:54:59 -04002822 * calling process
2823 * -> execve()-ing process
2824 * If we are:
2825 * calling process
2826 * -> init()-ing process
2827 * -> execve()-ing process
2828 */
Dylan Reidacfb8be2017-08-25 12:56:51 -07002829 ret = execve(config->filename, config->argv, environ);
Jorge Lucangeli Obesa2053902016-08-02 12:08:15 -04002830 if (ret == -1) {
Dylan Reidacfb8be2017-08-25 12:56:51 -07002831 pwarn("execve(%s) failed", config->filename);
Jorge Lucangeli Obesa2053902016-08-02 12:08:15 -04002832 }
2833 _exit(ret);
Elly Jonescd7a9042011-07-22 13:56:51 -04002834}
2835
Will Drewry6ac91122011-10-21 16:38:58 -05002836int API minijail_kill(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -04002837{
2838 int st;
2839 if (kill(j->initpid, SIGTERM))
2840 return -errno;
2841 if (waitpid(j->initpid, &st, 0) < 0)
2842 return -errno;
2843 return st;
Elly Jonescd7a9042011-07-22 13:56:51 -04002844}
2845
Will Drewry6ac91122011-10-21 16:38:58 -05002846int API minijail_wait(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -04002847{
2848 int st;
2849 if (waitpid(j->initpid, &st, 0) < 0)
2850 return -errno;
Jorge Lucangeli Obes1530b742012-12-11 14:08:09 -08002851
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -07002852 if (!WIFEXITED(st)) {
Jorge Lucangeli Obes18d1eba2014-04-18 13:58:20 -07002853 int error_status = st;
2854 if (WIFSIGNALED(st)) {
2855 int signum = WTERMSIG(st);
mukesh agrawalc420a262013-06-11 17:22:42 -07002856 warn("child process %d received signal %d",
Jorge Lucangeli Obes18d1eba2014-04-18 13:58:20 -07002857 j->initpid, signum);
2858 /*
2859 * We return MINIJAIL_ERR_JAIL if the process received
2860 * SIGSYS, which happens when a syscall is blocked by
2861 * seccomp filters.
2862 * If not, we do what bash(1) does:
2863 * $? = 128 + signum
2864 */
2865 if (signum == SIGSYS) {
2866 error_status = MINIJAIL_ERR_JAIL;
2867 } else {
2868 error_status = 128 + signum;
2869 }
2870 }
2871 return error_status;
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -07002872 }
Jorge Lucangeli Obes1530b742012-12-11 14:08:09 -08002873
2874 int exit_status = WEXITSTATUS(st);
2875 if (exit_status != 0)
mukesh agrawalc420a262013-06-11 17:22:42 -07002876 info("child process %d exited with status %d",
2877 j->initpid, exit_status);
Jorge Lucangeli Obes1530b742012-12-11 14:08:09 -08002878
2879 return exit_status;
Elly Jonescd7a9042011-07-22 13:56:51 -04002880}
2881
Will Drewry6ac91122011-10-21 16:38:58 -05002882void API minijail_destroy(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -04002883{
Dylan Reid605ce7f2016-01-19 19:21:00 -08002884 size_t i;
2885
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -08002886 if (j->flags.seccomp_filter && j->filter_prog) {
2887 free(j->filter_prog->filter);
2888 free(j->filter_prog);
Elly Jonese1749eb2011-10-07 13:54:59 -04002889 }
Mike Frysingerac08a682017-10-10 02:04:50 -04002890 free_mounts_list(j);
Luis Hector Chaveze0ba4ce2017-07-20 15:12:22 -07002891 while (j->hooks_head) {
2892 struct hook *c = j->hooks_head;
2893 j->hooks_head = c->next;
2894 free(c);
2895 }
2896 j->hooks_tail = NULL;
Elly Jonese1749eb2011-10-07 13:54:59 -04002897 if (j->user)
2898 free(j->user);
Jorge Lucangeli Obese81a52f2015-12-04 16:05:23 -08002899 if (j->suppl_gid_list)
2900 free(j->suppl_gid_list);
Will Drewrybee7ba72011-10-21 20:47:01 -05002901 if (j->chrootdir)
2902 free(j->chrootdir);
Jorge Lucangeli Obes3b2e6e42016-08-04 12:26:19 -04002903 if (j->pid_file_path)
2904 free(j->pid_file_path);
2905 if (j->uidmap)
2906 free(j->uidmap);
2907 if (j->gidmap)
2908 free(j->gidmap);
Mike Frysingerb9a7b162017-05-30 15:25:49 -04002909 if (j->hostname)
2910 free(j->hostname);
Andrew Brestickereac28942015-11-11 16:04:46 -08002911 if (j->alt_syscall_table)
2912 free(j->alt_syscall_table);
Dylan Reid605ce7f2016-01-19 19:21:00 -08002913 for (i = 0; i < j->cgroup_count; ++i)
2914 free(j->cgroups[i]);
Elly Jonese1749eb2011-10-07 13:54:59 -04002915 free(j);
Elly Jonescd7a9042011-07-22 13:56:51 -04002916}
Luis Hector Chavez114a9302017-09-05 20:36:58 -07002917
2918void API minijail_log_to_fd(int fd, int min_priority)
2919{
2920 init_logging(LOG_TO_FD, fd, min_priority);
2921}