# HG changeset patch # User Nattee Niparnan # Date 2018-07-19 10:56:19 # Node ID 78af34fd4a2ece7e8cfa4aae4983c64f7d1c9de0 # Parent dd4194a3c9c88f0cc1e4ec070fc49f86849ea358 - add isolate - more comment and output for each script diff --git a/isolate/.gitignore b/isolate/.gitignore new file mode 100644 --- /dev/null +++ b/isolate/.gitignore @@ -0,0 +1,5 @@ +docbook-xsl.css +isolate +isolate.1 +isolate.1.html +*.o diff --git a/isolate/.travis.yml b/isolate/.travis.yml new file mode 100644 --- /dev/null +++ b/isolate/.travis.yml @@ -0,0 +1,17 @@ +language: c + +compiler: gcc + +addons: + apt: + packages: + - asciidoc + - libcap-dev + - libxml2-utils + - xsltproc + - docbook-xml + - docbook-xsl + +script: + - make DESTDIR=/tmp/isolate + - make DESTDIR=/tmp/isolate install diff --git a/isolate/LICENSE b/isolate/LICENSE new file mode 100644 --- /dev/null +++ b/isolate/LICENSE @@ -0,0 +1,12 @@ +Isolate is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +If you have less than 10 copies of the GPL on your system :-), +you can find it at http://www.gnu.org/licenses/. 
diff --git a/isolate/Makefile b/isolate/Makefile new file mode 100644 --- /dev/null +++ b/isolate/Makefile @@ -0,0 +1,67 @@ +# Makefile for Isolate +# (c) 2015--2018 Martin Mares +# (c) 2017 Bernard Blackham + +all: isolate isolate.1 isolate.1.html isolate-check-environment + +CC=gcc +CFLAGS=-std=gnu99 -Wall -Wextra -Wno-parentheses -Wno-unused-result -Wno-missing-field-initializers -Wstrict-prototypes -Wmissing-prototypes -D_GNU_SOURCE +LIBS=-lcap + +VERSION=1.5 +YEAR=2018 +BUILD_DATE:=$(shell date '+%Y-%m-%d') +BUILD_COMMIT:=$(shell if git rev-parse >/dev/null 2>/dev/null ; then git describe --always --tags ; else echo '' ; fi) + +PREFIX = $(DESTDIR)/usr/local +VARPREFIX = $(DESTDIR)/var/local +CONFIGDIR = $(PREFIX)/etc +CONFIG = $(CONFIGDIR)/isolate +BINDIR = $(PREFIX)/bin +DATAROOTDIR = $(PREFIX)/share +DATADIR = $(DATAROOTDIR) +MANDIR = $(DATADIR)/man +MAN1DIR = $(MANDIR)/man1 +BOXDIR = $(VARPREFIX)/lib/isolate + +isolate: isolate.o util.o rules.o cg.o config.o + $(CC) $(LDFLAGS) -o $@ $^ $(LIBS) + +%.o: %.c isolate.h config.h + $(CC) $(CFLAGS) -c -o $@ $< + +isolate.o: CFLAGS += -DVERSION='"$(VERSION)"' -DYEAR='"$(YEAR)"' -DBUILD_DATE='"$(BUILD_DATE)"' -DBUILD_COMMIT='"$(BUILD_COMMIT)"' +config.o: CFLAGS += -DCONFIG_FILE='"$(CONFIG)"' + +isolate.1: isolate.1.txt + a2x -f manpage $< + +# The dependency on isolate.1 is there to serialize both calls of asciidoc, +# which does not name temporary files safely. +isolate.1.html: isolate.1.txt isolate.1 + a2x -f xhtml -D . 
$< + +clean: + rm -f *.o + rm -f isolate isolate.1 isolate.1.html + rm -f docbook-xsl.css + +install: isolate isolate-check-environment + install -d $(BINDIR) $(BOXDIR) $(CONFIGDIR) + install isolate-check-environment $(BINDIR) + install -m 4755 isolate $(BINDIR) + install -m 644 default.cf $(CONFIG) + +install-doc: isolate.1 + install -d $(MAN1DIR) + install -m 644 $< $(MAN1DIR)/$< + +release: isolate.1.html + git tag v$(VERSION) + git push --tags + git archive --format=tar --prefix=isolate-$(VERSION)/ HEAD | gzip >isolate-$(VERSION).tar.gz + rsync isolate-$(VERSION).tar.gz atrey:ftp/isolate/ + rsync isolate.1.html jw:/var/www/moe/ + ssh jw 'cd web && bin/release-prog isolate $(VERSION)' + +.PHONY: all clean install install-doc release diff --git a/isolate/README.md b/isolate/README.md new file mode 100644 --- /dev/null +++ b/isolate/README.md @@ -0,0 +1,29 @@ +isolate +======= + +Isolate is a sandbox built to safely run untrusted executables, +offering them a limited-access environment and preventing them from +affecting the host system. It takes advantage of features specific to +the Linux kernel, like namespaces and control groups. + +Isolate was developed by Martin Mareš () and Bernard Blackham +(), who still maintain it. Several other people +contributed patches for features and bug fixes (see Git history for a list). +Thanks! + +Originally, Isolate was a part of the [Moe Contest Environment](http://www.ucw.cz/moe/), +but it evolved to a separate project used by different +contest systems, most prominently [CMS](https://github.com/cms-dev/cms). +It now lives at [GitHub](https://github.com/ioi/isolate), +where you can submit bug reports and feature requests. + +If you are interested in more details, please read Martin's +and Bernard's [paper](http://mj.ucw.cz/papers/isolate.pdf) presented +at the IOI Conference. Also, Isolate's [manual page](http://www.ucw.cz/moe/isolate.1.html) +is available online. 
+ +To compile Isolate, you need the headers for the libcap library +(usually available in a libcap-dev package). + +You may need `a2x` (found in [AsciiDoc](http://www.methods.co.nz/asciidoc/a2x.1.html)) for building manual. +But if you only want the isolate binary, you can just run `make isolate` diff --git a/isolate/TODO b/isolate/TODO new file mode 100644 --- /dev/null +++ b/isolate/TODO @@ -0,0 +1,1 @@ +Examine the use of taskstats for measuring memory diff --git a/isolate/cg.c b/isolate/cg.c new file mode 100644 --- /dev/null +++ b/isolate/cg.c @@ -0,0 +1,327 @@ +/* + * Process Isolator -- Control Groups + * + * (c) 2012-2016 Martin Mares + * (c) 2012-2014 Bernard Blackham + */ + +#include "isolate.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +struct cg_controller_desc { + const char *name; + int optional; +}; + +typedef enum { + CG_MEMORY = 0, + CG_CPUACCT, + CG_CPUSET, + CG_NUM_CONTROLLERS, + CG_PARENT = 256, +} cg_controller; + +static const struct cg_controller_desc cg_controllers[CG_NUM_CONTROLLERS+1] = { + [CG_MEMORY] = { "memory", 0 }, + [CG_CPUACCT] = { "cpuacct", 0 }, + [CG_CPUSET] = { "cpuset", 1 }, + [CG_NUM_CONTROLLERS] = { NULL, 0 }, +}; + +#define FOREACH_CG_CONTROLLER(_controller) \ + for (cg_controller (_controller) = 0; \ + (_controller) < CG_NUM_CONTROLLERS; (_controller)++) + +static const char * +cg_controller_name(cg_controller c) +{ + assert(c < CG_NUM_CONTROLLERS); + return cg_controllers[c].name; +} + +static int +cg_controller_optional(cg_controller c) +{ + assert(c < CG_NUM_CONTROLLERS); + return cg_controllers[c].optional; +} + +static char cg_name[256]; +static char cg_parent_name[256]; + +#define CG_BUFSIZE 1024 + +static void +cg_makepath(char *buf, size_t len, cg_controller c, const char *attr) +{ + snprintf(buf, len, "%s/%s/%s/%s", + cf_cg_root, + cg_controller_name(c & ~CG_PARENT), + (c & CG_PARENT) ? 
cg_parent_name : cg_name, + attr); +} + +static int +cg_read(cg_controller controller, const char *attr, char *buf) +{ + int result = 0; + int maybe = 0; + if (attr[0] == '?') + { + attr++; + maybe = 1; + } + + char path[256]; + cg_makepath(path, sizeof(path), controller, attr); + + int fd = open(path, O_RDONLY); + if (fd < 0) + { + if (maybe) + goto fail; + die("Cannot read %s: %m", path); + } + + int n = read(fd, buf, CG_BUFSIZE); + if (n < 0) + { + if (maybe) + goto fail_close; + die("Cannot read %s: %m", path); + } + if (n >= CG_BUFSIZE - 1) + die("Attribute %s too long", path); + if (n > 0 && buf[n-1] == '\n') + n--; + buf[n] = 0; + + if (verbose > 1) + msg("CG: Read %s = <%s>\n", attr, buf); + + result = 1; +fail_close: + close(fd); +fail: + return result; +} + +static void __attribute__((format(printf,3,4))) +cg_write(cg_controller controller, const char *attr, const char *fmt, ...) +{ + int maybe = 0; + if (attr[0] == '?') + { + attr++; + maybe = 1; + } + + va_list args; + va_start(args, fmt); + + char buf[CG_BUFSIZE]; + int n = vsnprintf(buf, sizeof(buf), fmt, args); + if (n >= CG_BUFSIZE) + die("cg_write: Value for attribute %s is too long", attr); + + if (verbose > 1) + msg("CG: Write %s = %s", attr, buf); + + char path[256]; + cg_makepath(path, sizeof(path), controller, attr); + + int fd = open(path, O_WRONLY | O_TRUNC); + if (fd < 0) + { + if (maybe) + goto fail; + else + die("Cannot write %s: %m", path); + } + + int written = write(fd, buf, n); + if (written < 0) + { + if (maybe) + goto fail_close; + else + die("Cannot set %s to %s: %m", path, buf); + } + if (written != n) + die("Short write to %s (%d out of %d bytes)", path, written, n); + +fail_close: + close(fd); +fail: + va_end(args); +} + +void +cg_init(void) +{ + if (!cg_enable) + return; + + if (!dir_exists(cf_cg_root)) + die("Control group filesystem at %s not mounted", cf_cg_root); + + if (cf_cg_parent) + { + snprintf(cg_name, sizeof(cg_name), "%s/box-%d", cf_cg_parent, box_id); + 
snprintf(cg_parent_name, sizeof(cg_parent_name), "%s", cf_cg_parent); + } + else + { + snprintf(cg_name, sizeof(cg_name), "box-%d", box_id); + strcpy(cg_parent_name, "."); + } + msg("Using control group %s under parent %s\n", cg_name, cg_parent_name); +} + +void +cg_prepare(void) +{ + if (!cg_enable) + return; + + struct stat st; + char buf[CG_BUFSIZE]; + char path[256]; + + FOREACH_CG_CONTROLLER(controller) + { + cg_makepath(path, sizeof(path), controller, ""); + if (stat(path, &st) >= 0 || errno != ENOENT) + { + msg("Control group %s already exists, trying to empty it.\n", path); + if (rmdir(path) < 0) + die("Failed to reset control group %s: %m", path); + } + + if (mkdir(path, 0777) < 0 && !cg_controller_optional(controller)) + die("Failed to create control group %s: %m", path); + } + + // If the cpuset module is enabled, set up allowed cpus and memory nodes. + // If per-box configuration exists, use it; otherwise, inherit the settings + // from the parent cgroup. + struct cf_per_box *cf = cf_current_box(); + if (cg_read(CG_PARENT | CG_CPUSET, "?cpuset.cpus", buf)) + cg_write(CG_CPUSET, "cpuset.cpus", "%s", cf->cpus ? cf->cpus : buf); + if (cg_read(CG_PARENT | CG_CPUSET, "?cpuset.mems", buf)) + cg_write(CG_CPUSET, "cpuset.mems", "%s", cf->mems ? 
cf->mems : buf); +} + +void +cg_enter(void) +{ + if (!cg_enable) + return; + + msg("Entering control group %s\n", cg_name); + + FOREACH_CG_CONTROLLER(controller) + { + if (cg_controller_optional(controller)) + cg_write(controller, "?tasks", "%d\n", (int) getpid()); + else + cg_write(controller, "tasks", "%d\n", (int) getpid()); + } + + if (cg_memory_limit) + { + cg_write(CG_MEMORY, "memory.limit_in_bytes", "%lld\n", (long long) cg_memory_limit << 10); + cg_write(CG_MEMORY, "?memory.memsw.limit_in_bytes", "%lld\n", (long long) cg_memory_limit << 10); + cg_write(CG_MEMORY, "memory.max_usage_in_bytes", "0\n"); + cg_write(CG_MEMORY, "?memory.memsw.max_usage_in_bytes", "0\n"); + } + + if (cg_timing) + cg_write(CG_CPUACCT, "cpuacct.usage", "0\n"); +} + +int +cg_get_run_time_ms(void) +{ + if (!cg_enable) + return 0; + + char buf[CG_BUFSIZE]; + cg_read(CG_CPUACCT, "cpuacct.usage", buf); + unsigned long long ns = atoll(buf); + return ns / 1000000; +} + +void +cg_stats(void) +{ + if (!cg_enable) + return; + + char buf[CG_BUFSIZE]; + + // Memory usage statistics + unsigned long long mem=0, memsw=0; + if (cg_read(CG_MEMORY, "?memory.max_usage_in_bytes", buf)) + mem = atoll(buf); + if (cg_read(CG_MEMORY, "?memory.memsw.max_usage_in_bytes", buf)) + { + memsw = atoll(buf); + if (memsw > mem) + mem = memsw; + } + if (mem) + meta_printf("cg-mem:%lld\n", mem >> 10); + + // OOM kill detection + if (cg_read(CG_MEMORY, "?memory.oom_control", buf)) + { + int oom_killed = 0; + char *s = buf; + while (s) + { + if (sscanf(s, "oom_kill %d", &oom_killed) == 1 && oom_killed) + { + meta_printf("cg-oom-killed:1\n"); + break; + } + s = strchr(s, '\n'); + if (s) + s++; + } + } +} + +void +cg_remove(void) +{ + char buf[CG_BUFSIZE]; + + if (!cg_enable) + return; + + FOREACH_CG_CONTROLLER(controller) + { + // The cgroup can be non-existent at this moment (e.g., --cleanup before the first --init) + if (!cg_read(controller, "?tasks", buf)) + continue; + + if (buf[0]) + die("Some tasks left in 
controller %s of cgroup %s, failed to remove it", + cg_controller_name(controller), cg_name); + + char path[256]; + cg_makepath(path, sizeof(path), controller, ""); + + if (rmdir(path) < 0) + die("Cannot remove control group %s: %m", path); + } +} diff --git a/isolate/config.c b/isolate/config.c new file mode 100644 --- /dev/null +++ b/isolate/config.c @@ -0,0 +1,168 @@ +/* + * Process Isolator -- Configuration File + * + * (c) 2016 Martin Mares + */ + +#include "isolate.h" + +#include +#include +#include +#include + +#define MAX_LINE_LEN 1024 + +char *cf_box_root; +char *cf_cg_root; +char *cf_cg_parent; +int cf_first_uid; +int cf_first_gid; +int cf_num_boxes; + +static int line_number; +static struct cf_per_box *per_box_configs; + +static void NONRET +cf_err(char *msg) +{ + die("Error in config file, line %d: %s", line_number, msg); +} + +static char * +cf_string(char *val) +{ + return xstrdup(val); +} + +static int +cf_int(char *val) +{ + char *end; + errno = 0; + long int x = strtol(val, &end, 10); + if (errno || end == val || end && *end) + cf_err("Invalid number"); + if ((long int)(int) x != x) + cf_err("Number out of range"); + return x; +} + +static void +cf_entry_toplevel(char *key, char *val) +{ + if (!strcmp(key, "box_root")) + cf_box_root = cf_string(val); + else if (!strcmp(key, "cg_root")) + cf_cg_root = cf_string(val); + else if (!strcmp(key, "cg_parent")) + cf_cg_parent = cf_string(val); + else if (!strcmp(key, "first_uid")) + cf_first_uid = cf_int(val); + else if (!strcmp(key, "first_gid")) + cf_first_gid = cf_int(val); + else if (!strcmp(key, "num_boxes")) + cf_num_boxes = cf_int(val); + else + cf_err("Unknown configuration item"); +} + +static void +cf_entry_compound(char *key, char *subkey, char *val) +{ + if (strncmp(key, "box", 3)) + cf_err("Unknown configuration section"); + int box_id = cf_int(key + 3); + struct cf_per_box *c = cf_per_box(box_id); + + if (!strcmp(subkey, "cpus")) + c->cpus = cf_string(val); + else if (!strcmp(subkey, "mems")) 
+ c->mems = cf_string(val); + else + cf_err("Unknown per-box configuration item"); +} + +static void +cf_entry(char *key, char *val) +{ + char *dot = strchr(key, '.'); + if (!dot) + cf_entry_toplevel(key, val); + else + { + *dot++ = 0; + cf_entry_compound(key, dot, val); + } +} + +static void +cf_check(void) +{ + if (!cf_box_root || + !cf_cg_root || + !cf_first_uid || + !cf_first_gid || + !cf_num_boxes) + cf_err("Configuration is not complete"); +} + +void +cf_parse(void) +{ + FILE *f = fopen(CONFIG_FILE, "r"); + if (!f) + die("Cannot open %s: %m", CONFIG_FILE); + + char line[MAX_LINE_LEN]; + while (fgets(line, sizeof(line), f)) + { + line_number++; + char *nl = strchr(line, '\n'); + if (!nl) + cf_err("Line not terminated or too long"); + *nl = 0; + + if (!line[0] || line[0] == '#') + continue; + + char *s = line; + while (*s && *s != ' ' && *s != '\t' && *s != '=') + s++; + while (*s == ' ' || *s == '\t') + *s++ = 0; + if (*s != '=') + cf_err("Syntax error, expecting key=value"); + *s++ = 0; + while (*s == ' ' || *s == '\t') + *s++ = 0; + + cf_entry(line, s); + } + + fclose(f); + cf_check(); +} + +struct cf_per_box * +cf_per_box(int box_id) +{ + struct cf_per_box *c; + + for (c = per_box_configs; c; c = c->next) + if (c->box_id == box_id) + return c; + + c = xmalloc(sizeof(*c)); + memset(c, 0, sizeof(*c)); + c->next = per_box_configs; + per_box_configs = c; + c->box_id = box_id; + return c; +} + +struct cf_per_box * +cf_current_box(void) +{ + return cf_per_box(box_id); +} diff --git a/isolate/default.cf b/isolate/default.cf new file mode 100644 --- /dev/null +++ b/isolate/default.cf @@ -0,0 +1,24 @@ +# This is a configuration file for Isolate + +# All sandboxes are created under this directory. +# To avoid symlink attacks, this directory and all its ancestors +# must be writeable only to root. 
+box_root = /var/local/lib/isolate + +# Root of the control group hierarchy +cg_root = /sys/fs/cgroup + +# If the following variable is defined, the per-box cgroups +# are created as sub-groups of the named cgroup +#cg_parent = boxes + +# Block of UIDs and GIDs reserved for sandboxes +first_uid = 60000 +first_gid = 60000 +num_boxes = 1000 + +# Per-box settings of the set of allowed CPUs and NUMA nodes +# (see linux/Documentation/cgroups/cpusets.txt for precise syntax) + +#box0.cpus = 4-7 +#box0.mems = 1 diff --git a/isolate/isolate-check-environment b/isolate/isolate-check-environment new file mode 100755 --- /dev/null +++ b/isolate/isolate-check-environment @@ -0,0 +1,224 @@ +#!/bin/sh +# +# Identifies potential sources of issues when using isolate. +# +# (c) 2017 Bernard Blackham +# + +usage() { + cat <<EOT >&2 +Usage: $0 [-q|--quiet] [-e|--execute] + +Use this script to identify sources of run-time variability and other issues on +Linux machines which may affect isolate. If --execute is not specified, the +recommended actions are written to stdout as an executable shell script, +otherwise, using --execute will attempt to make changes to make the system +behave more deterministically. The changes performed by --execute persist only +until a reboot. To persist across reboots, the standard output from this script +should be added to /etc/rc.local or some other script that is run on each boot. +Alternately, you could add the following line to /etc/rc.local to automatically +apply these changes on boot, but use this with caution as not all issues can +be resolved in this way. + + isolate-check-environment --quiet --execute + +The exit status of this script will be 0 if all checks pass, or 1 if some +checks have failed. + +Note that there are more strategies to reduce run-time variability further. +See the man page of isolate for details under REPRODUCIBILITY. +EOT + exit 2 +} + +# Parse options. 
+args=$(getopt -o "ehq" --long "execute,help,quiet" -- "$@") || usage +eval set -- "$args" +quiet= +execute= +while : ; do + case "$1" in + -q|--quiet) quiet=1 ; shift ;; + -e|--execute) execute=1 ; shift ;; + -h|--help) usage ;; + --) shift ; break ;; + *) usage ;; + esac +done +[ -n "$*" ] && usage + +# Some helper boilerplate machinery. +exit_status=0 +red=$(tput setaf 1) +green=$(tput setaf 2) +yellow=$(tput setaf 3) +normal=$(tput sgr0) + +# Return true (0) if we are being quiet. +quiet() { + [ -n "$quiet" ] +} + +# Print all arguments to stderr as warning. +warn() { + quiet || echo WARNING: "$*" >&2 +} + +# Print first argument to stderr as warning, and second argument to stdout as +# the recommended remedial action, or execute if --execute is given. +action() { + quiet || warn "$1" + if [ -n "$execute" ] ; then + quiet || echo "+ $2" + sh -c "$2" + else + quiet || echo $2 + fi +} + +print_start_check() { + quiet && return + print_check_status=1 + echo -n "Checking for $@ ... " >&2 +} + +print_fail() { + exit_status=1 + quiet && return + [ -n "$print_check_status" ] && echo "${red}FAIL${normal}" >&2 + print_check_status= +} + +print_dubious() { + exit_status=1 + quiet && return + [ -n "$print_check_status" ] && echo "${yellow}CAUTION${normal}" >&2 + print_check_status= +} + +print_skipped() { + quiet && return + [ -n "$print_check_status" ] && echo "SKIPPED (not detected)" >&2 + print_check_status= +} + +print_finish() { + quiet && return + [ -n "$print_check_status" ] && echo "${green}PASS${normal}" >&2 + print_check_status= +} + +# Check that cgroups are enabled. +cgroup_check() { + local cgroup=$1 + print_start_check "cgroup support for $cgroup" + if ! test -f "/sys/fs/cgroup/$cgroup/tasks" ; then + print_dubious + warn "the $cgroup is not present. isolate --cg cannot be used." + fi + print_finish +} +cgroup_check memory +cgroup_check cpuacct +cgroup_check cpuset + +# Check that swap is either disabled or accounted for. 
+swap_check() { + print_start_check "swap" + # If swap is disabled, there is nothing to worry about. + local swaps + swaps=$(swapon --noheadings) + if [ -n "$swaps" ] ; then + # Swap is enabled. We had better have the memsw support in the memory + # cgroup. + if ! test -f "/sys/fs/cgroup/memory/memory.memsw.usage_in_bytes" ; then + print_fail + action \ + "swap is enabled, but swap accounting is not. isolate will not be able to enforce memory limits." \ + "swapoff -a" + else + print_dubious + warn "swap is enabled, and although accounted for, may still give run-time variability under memory pressure." + fi + fi + print_finish +} +swap_check + +# Check that CPU frequency scaling is disabled. +cpufreq_check() { + print_start_check "CPU frequency scaling" + local anycpus policy + anycpus= + # Ensure cpufreq governor is set to performance on all CPUs + for cpufreq_file in $(find /sys/devices/system/cpu/cpufreq/ -name scaling_governor) ; do + policy=$(cat $cpufreq_file) + if [ "$policy" != "performance" ] ; then + print_fail + action \ + "cpufreq governor set to '$policy', but 'performance' would be better" \ + "echo performance > $cpufreq_file" + fi + anycpus=1 + done + [ -z "$anycpus" ] && print_skipped + print_finish +} +cpufreq_check + +# Check that address space layout randomisation is disabled. +aslr_check() { + print_start_check "kernel address space randomisation" + local val + if val=$(cat /proc/sys/kernel/randomize_va_space 2>/dev/null) ; then + if [ "$val" -ne 0 ] ; then + print_fail + action \ + "address space randomisation is enabled." \ + "echo 0 > /proc/sys/kernel/randomize_va_space" + fi + else + print_skipped + fi + print_finish +} +aslr_check + +# Check that transparent huge-pages are disabled, as this leads to +# non-determinism depending on whether the kernel can allocate 2 MiB pages or +# not. 
+thp_check() { + print_start_check "transparent hugepage support" + local val + if val=$(cat /sys/kernel/mm/transparent_hugepage/enabled 2>/dev/null) ; then + case $val in + *'[never]'*) ;; + *) print_fail + action \ + "transparent hugepages are enabled." \ + "echo never > /sys/kernel/mm/transparent_hugepage/enabled" ;; + esac + fi + if val=$(cat /sys/kernel/mm/transparent_hugepage/defrag 2>/dev/null) ; then + case $val in + *'[never]'*) ;; + *) print_fail + action \ + "transparent hugepage defrag is enabled." \ + "echo never > /sys/kernel/mm/transparent_hugepage/defrag" ;; + esac + fi + if val=$(cat /sys/kernel/mm/transparent_hugepage/khugepaged/defrag 2>/dev/null) ; then + if [ "$val" -ne 0 ] ; then + print_fail + action \ + "khugepaged defrag is enabled." \ + "echo 0 > /sys/kernel/mm/transparent_hugepage/khugepaged/defrag" + fi + fi + print_finish +} +thp_check + + +exit $exit_status diff --git a/isolate/isolate.1.txt b/isolate/isolate.1.txt new file mode 100644 --- /dev/null +++ b/isolate/isolate.1.txt @@ -0,0 +1,348 @@ +ISOLATE(1) +========== + +NAME +---- +isolate - Isolate a process using Linux Containers + +SYNOPSIS +-------- +*isolate* 'options' *--init* + +*isolate* 'options' *--run* +--+ 'program' 'arguments' + +*isolate* 'options' *--cleanup* + +DESCRIPTION +----------- +Run 'program' within a sandbox, so that it cannot communicate with the +outside world and its resource consumption is limited. This can be used +for example in a programming contest to run untrusted programs submitted +by contestants in a controlled environment. + +The sandbox is used in the following way: + +* Run *isolate --init*, which initializes the sandbox, creates its working directory and +prints its name to the standard output. Fails if the sandbox already existed. + +* Populate the directory with the executable file of the program and its +input files. + +* Call *isolate --run* to run the program. 
A single line describing the +status of the program is written to the standard error stream. + +* Fetch the output of the program from the directory. + +* Run *isolate --cleanup* to remove temporary files. Does nothing if the sandbox +was already cleaned up. + +Please note that by default, the program is not allowed to start multiple +processes or threads. If you need that, turn on the control group mode +(see below). + +OPTIONS +------- +*-M, --meta=*'file':: + Output meta-data on the execution of the program to a given file. + See below for syntax of the meta-files. + +*-m, --mem=*'size':: + Limit address space of the program to 'size' kilobytes. If more processes + are allowed, this applies to each of them separately. + +*-t, --time=*'time':: + Limit run time of the program to 'time' seconds. Fractional numbers are allowed. + Time in which the OS assigns the processor to different tasks is not counted. + +*-w, --wall-time=*'time':: + Limit wall-clock time to 'time' seconds. Fractional values are allowed. + This clock measures the time from the start of the program to its exit, + so it does not stop when the program has lost the CPU or when it is waiting + for an external event. We recommend using *--time* as the main limit, + but set *--wall-time* to a much higher value as a precaution against + sleeping programs. + +*-x, --extra-time=*'time':: + When a time limit is exceeded, wait for extra 'time' seconds before + killing the program. This has the advantage that the real execution time + is reported, even though it slightly exceeds the limit. Fractional + numbers are again allowed. + +*-b, --box-id=*'id':: + When you run multiple sandboxes in parallel, you have to assign a unique + ID to each of them using this option. See the discussion on UIDs in the INSTALLATION + section. The ID defaults to 0. + +*-k, --stack=*'size':: + Limit process stack to 'size' kilobytes. 
By default, the whole address + space is available for the stack, but it is subject to the *--mem* limit. + +*-f, --fsize=*'size':: + Limit size of files created (or modified) by the program to 'size' kilobytes. + In most cases, it is better to restrict overall disk usage by a disk quota + (see below). This option can help in cases when quotas are not enabled + on the underlying filesystem. + +*-q, --quota=*'blocks'*,*'inodes':: + Set disk quota to a given number of blocks and inodes. This requires the + filesystem to be mounted with support for quotas. Please note that this + currently works only on the ext family of filesystems (other filesystems + use other interfaces for setting quotas). + +*-i, --stdin=*'file':: + Redirect standard input from 'file'. The 'file' has to be accessible + inside the sandbox. Otherwise, standard input is inherited from the + parent process. + +*-o, --stdout=*'file':: + Redirect standard output to 'file'. The 'file' has to be accessible + inside the sandbox. Otherwise, standard output is inherited from the + parent process and the sandbox manager does not write anything to it. + +*-r, --stderr=*'file':: + Redirect standard error output to 'file'. The 'file' has to be accessible + inside the sandbox. Otherwise, standard error output is inherited from the + parent process. See also *--stderr-to-stdout*. + +*--stderr-to-stdout*:: + Redirect standard error output to standard output. This is performed after + the standard output is redirected by *--stdout*. Mutually exclusive with *--stderr*. + +*-c, --chdir=*'dir':: + Change directory to 'dir' before executing the program. This path must be + relative to the root of the sandbox. + +*-p, --processes*[*=*'max']:: + Permit the program to create up to 'max' processes and/or threads. Please + keep in mind that time and memory limit do not work with multiple processes + unless you enable the control group mode. If 'max' is not given, an arbitrary + number of processes can be run. 
By default, only one process is permitted. + +*--share-net*:: + By default, isolate creates a new network namespace for its child process. + This namespace contains no network devices except for a per-namespace loopback. + This prevents the program from communicating with the outside world. If you want + to permit communication, you can use this switch to keep the child process + in parent's network namespace. + +*--inherit-fds*:: + By default, isolate closes all file descriptors passed from its parent + except for descriptors 0, 1, and 2. + This prevents unintentional descriptor leaks. In some cases, passing extra + descriptors to the sandbox can be desirable, so you can use this switch + to make them survive. + +*-v, --verbose*:: + Tell the sandbox manager to be verbose and report on what is going on. + Using *-v* multiple times produces even more jabber. + +*-s, --silent*:: + Tell the sandbox manager to keep silence. No status messages are printed + to stderr except for fatal errors of the sandbox itself. The combination of + *--verbose* and *--silent* has an undefined effect. + +ENVIRONMENT RULES +----------------- +UNIX processes normally inherit all environment variables from their parent. The +sandbox however passes only those variables which are explicitly requested by +environment rules: + +*-E, --env=*'var':: + Inherit the variable 'var' from the parent. + +*-E, --env=*'var'*=*'value':: + Set the variable 'var' to 'value'. When the 'value' is empty, the + variable is removed from the environment. + +*-e, --full-env*:: + Inherit all variables from the parent. + +The rules are applied in the order in which they were given, except for +*--full-env*, which is applied first. + +The list of rules is automatically initialized with *-ELIBC_FATAL_STDERR_=1*. 
+ +DIRECTORY RULES +--------------- +The sandboxed process gets its own filesystem namespace, which contains only subtrees +requested by directory rules: + +*-d, --dir=*'in'*=*'out'[*:*'options']:: + Bind the directory 'out' as seen by the caller to the path 'in' inside the sandbox. + If there already was a directory rule for 'in', it is replaced. + +*-d, --dir=*'dir'[*:*'options']:: + Bind the directory +/+'dir' to 'dir' inside the sandbox. + If there already was a directory rule for 'in', it is replaced. + +*-d, --dir=*'in'*=*:: + Remove a directory rule for the path 'in' inside the sandbox. + +By default, all directories are bound read-only and restricted (no devices, +no setuid binaries). This behavior can be modified using the 'options': + +*rw*:: + Allow read-write access. + +*dev*:: + Allow access to character and block devices. + +*noexec*:: + Disallow execution of binaries. + +*maybe*:: + Silently ignore the rule if the directory to be bound does not exist. + +*fs*:: + Instead of binding a directory, mount a device-less filesystem called 'in'. + For example, this can be 'proc' or 'sysfs'. + +Unless *--no-default-dirs* is specified, the default set of directory rules binds +/bin+, ++/dev+ (with devices allowed), +/lib+, +/lib64+ (if it exists), and +/usr+. It also binds +the working directory to +/box+ (read-write) and mounts the proc filesystem at +/proc+. + +*-D, --no-default-dirs*:: + Do not bind the default set of directories. Care has to be taken to specify + the correct set of rules (using *--dir*) for the executed program to run + correctly. In particular, +/box+ has to be bound. + +CONTROL GROUPS +-------------- +Isolate can make use of system control groups provided by the kernel +to constrain programs consisting of multiple processes. Please note +that this feature needs special system setup described in the INSTALLATION +section. + +*--cg*:: + Enable use of control groups. This should be specified with *--init*, + *--run* and *--cleanup*. 
+ +*--cg-mem=*'size':: + Limit total memory usage by the whole control group to 'size' kilobytes. + This should be specified with *--run*. + +*--cg-timing*:: + Use control groups for timing, so that the *--time* switch affects the + total run time of all processes and threads in the control group. + This should be specified with *--run*. + This option is turned on by default, use *--no-cg-timing* to turn off. + +META-FILES +---------- +The meta-file contains miscellaneous meta-information on execution of the +program within the sandbox. It is a textual file consisting of lines +of format 'key'*:*'value'. The following keys are defined: + +*cg-mem*:: + When control groups are enabled, this is the total memory use + by the whole control group (in kilobytes). +*cg-oom-killed*:: + Present when the program was killed by the out-of-memory killer + (e.g., because it has exceeded the memory limit of its control group). + This is reported only on Linux 4.13 and later. +*csw-forced*:: + Number of context switches forced by the kernel. +*csw-voluntary*:: + Number of context switches caused by the process giving up the CPU + voluntarily. +*exitcode*:: + The program has exited normally with this exit code. +*exitsig*:: + The program has exited after receiving this fatal signal. +*killed*:: + Present when the program was terminated by the sandbox + (e.g., because it has exceeded the time limit). +*max-rss*:: + Maximum resident set size of the process (in kilobytes). +*message*:: + Status message, not intended for machine processing. + E.g., "Time limit exceeded." +*status*:: + Two-letter status code: + * *RE* -- run-time error, i.e., exited with a non-zero exit code + * *SG* -- program died on a signal + * *TO* -- timed out + * *XX* -- internal error of the sandbox +*time*:: + Run time of the program in fractional seconds. +*time-wall*:: + Wall clock time of the program in fractional seconds. + +Please note that not all keys have to be present. 
+For example, neither *status* nor *message* is reported upon normal termination. + +RETURN VALUE +------------ +When the program inside the sandbox finishes correctly, the sandbox returns 0. +If it finishes incorrectly, it returns 1. +All other return codes signal an internal error. + +INSTALLATION +------------ +Isolate depends on several advanced features of the Linux kernel. Please +make sure that your kernel supports +PID namespaces (+CONFIG_PID_NS+), +IPC namespaces (+CONFIG_IPC_NS+), and +network namespaces (+CONFIG_NET_NS+). +If you want to use control groups, you need +the cpusets (+CONFIG_CPUSETS+), +CPU accounting controller (+CONFIG_CGROUP_CPUACCT+), and +memory resource controller (+CONFIG_MEMCG+). If your machine has swap enabled, +you should also enable the swap controller (+CONFIG_MEMCG_SWAP+). + +Debian 7.x and newer require enabling the memory and swap cgroup controllers by +adding the parameters "cgroup_enable=memory swapaccount=1" to the kernel +command-line, which can be set using +GRUB_CMDLINE_LINUX_DEFAULT+ in +/etc/default/grub. + +Isolate is designed to run setuid to root. The sub-process inside the sandbox +then switches to a non-privileged user ID (different for each *--box-id*). +The range of UIDs available and several filesystem paths are set in a configuration +file, by default located in /usr/local/etc/isolate. + +Before you run isolate with control groups, you need to ensure that the cgroup +filesystem is enabled and mounted. Most modern Linux distributions already +provide cgroup support through a tmpfs mounted at /sys/fs/cgroup, with +individual controllers mounted within subdirectories. + +REPRODUCIBILITY +--------------- + +The reproducibility of results can be improved by tuning some kernel +parameters, listed below. Some of these parameters can be checked using the +program isolate-check-environment. + +* Disable address space randomization: +sysctl kernel.randomize_va_space=0+. 
+Address space randomization can affect timing, memory usage, and program +behavior. This setting can be made persistent through /etc/sysctl.d/. + +* Disable dynamic CPU frequency scaling. This requires setting the cpufreq +scaling governor to +performance+. The process for doing this varies between +distributions. + +* Consider disabling Turboboost on CPUs that might support it (most i3/i5/i7 +Intel CPUs). Approach this one with caution. Disabling it on a CPU that Turboboosts +from 2.3 GHz to 2.6 GHz would have minimal impact on run-times in exchange +for determinism, but the same on a CPU that Turboboosts from 1.6 GHz to 2.8 +GHz will incur a much more dramatic slowdown. Perhaps if the ambient +temperature is controlled and only one single-threaded task is keeping the +CPU busy at 100%, then TB's behaviour may be reasonably deterministic; +this requires further experimentation to confirm. + +* Run evaluations on a single CPU (core). The Linux scheduler has a tendency to randomly +migrate tasks between CPUs, incurring cache migration costs. You can use isolate's +configuration file to pin the process to a specified CPU. + +* Disable automatic kernel support for transparent huge pages. Both /sys/kernel/mm/transparent_hugepage/enabled +and /sys/kernel/mm/transparent_hugepage/defrag should be set to "madvise" or "never", and +/sys/kernel/mm/transparent_hugepage/khugepaged/defrag to 0. + +* Disable swapping. If you really need swap space and you are using cgroups, +make sure that you have the memsw controller enabled, so that swap space is +properly accounted for. + +LICENSE +------- +Isolate was written by Martin Mares and Bernard Blackham. +It can be distributed and used under the terms of the GNU +General Public License version 2 or any later version. 
diff --git a/isolate/isolate.c b/isolate/isolate.c new file mode 100644 --- /dev/null +++ b/isolate/isolate.c @@ -0,0 +1,1122 @@ +/* + * A Process Isolator based on Linux Containers + * + * (c) 2012-2018 Martin Mares + * (c) 2012-2014 Bernard Blackham + */ + +#include "isolate.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* May not be defined in older glibc headers */ +#ifndef MS_PRIVATE +#warning "Working around old glibc: no MS_PRIVATE" +#define MS_PRIVATE (1 << 18) +#endif +#ifndef MS_REC +#warning "Working around old glibc: no MS_REC" +#define MS_REC (1 << 14) +#endif + +/* + * Theory of operation + * + * Generally, we want to run a process inside a namespace/cgroup and watch it + * from the outside. However, the reality is a little bit more complicated as we + * do not want the inside process to become the init process of the PID namespace + * (we want to have all signals properly delivered). + * + * We are running three processes: + * + * - Keeper process (root privileges, parent namespace, parent cgroups) + * - Proxy process (UID/GID of the calling user, init process of the child + * namespace, parent cgroups) + * - Inside process (per-box UID/GID, child namespace, child cgroups) + * + * The proxy process just waits for the inside process to exit and then it passes + * the exit status to the keeper. + * + * We use two pipes: + * + * - Error pipe for error messages produced by the proxy process and the early + * stages of the inside process (until exec()). Listened to by the keeper. + * - Status pipe for passing the PID of the inside process and its exit status + * from the proxy to the keeper. 
+ */ + +#define TIMER_INTERVAL_US 100000 + +static int timeout; /* milliseconds */ +static int wall_timeout; +static int extra_timeout; +int pass_environ; +int verbose; +static int silent; +static int fsize_limit; +static int memory_limit; +static int stack_limit; +int block_quota; +int inode_quota; +static int max_processes = 1; +static char *redir_stdin, *redir_stdout, *redir_stderr; +static int redir_stderr_to_stdout; +static char *set_cwd; +static int share_net; +static int inherit_fds; +static int default_dirs = 1; + +int cg_enable; +int cg_memory_limit; +int cg_timing = 1; + +int box_id; +static char box_dir[1024]; +static pid_t box_pid; +static pid_t proxy_pid; + +uid_t box_uid; +gid_t box_gid; +uid_t orig_uid; +gid_t orig_gid; + +static int partial_line; +static int cleanup_ownership; + +static struct timeval start_time; +static int ticks_per_sec; +static int total_ms, wall_ms; +static volatile sig_atomic_t timer_tick, interrupt; + +static int error_pipes[2]; +static int write_errors_to_fd; +static int read_errors_from_fd; + +static int status_pipes[2]; + +static int get_wall_time_ms(void); +static int get_run_time_ms(struct rusage *rus); + +/*** Messages and exits ***/ + +static void +final_stats(struct rusage *rus) +{ + total_ms = get_run_time_ms(rus); + wall_ms = get_wall_time_ms(); + + meta_printf("time:%d.%03d\n", total_ms/1000, total_ms%1000); + meta_printf("time-wall:%d.%03d\n", wall_ms/1000, wall_ms%1000); + meta_printf("max-rss:%ld\n", rus->ru_maxrss); + meta_printf("csw-voluntary:%ld\n", rus->ru_nvcsw); + meta_printf("csw-forced:%ld\n", rus->ru_nivcsw); + + cg_stats(); +} + +static void NONRET +box_exit(int rc) +{ + if (proxy_pid > 0) + { + if (box_pid > 0) + { + kill(-box_pid, SIGKILL); + kill(box_pid, SIGKILL); + } + kill(-proxy_pid, SIGKILL); + kill(proxy_pid, SIGKILL); + meta_printf("killed:1\n"); + + struct rusage rus; + int p, stat; + do + p = wait4(proxy_pid, &stat, 0, &rus); + while (p < 0 && errno == EINTR); + if (p < 0) + fprintf(stderr, 
"UGH: Lost track of the process (%m)\n"); + else + final_stats(&rus); + } + + if (rc < 2 && cleanup_ownership) + chowntree("box", orig_uid, orig_gid); + + meta_close(); + exit(rc); +} + +static void +flush_line(void) +{ + if (partial_line) + fputc('\n', stderr); + partial_line = 0; +} + +/* Report an error of the sandbox itself */ +void NONRET __attribute__((format(printf,1,2))) +die(char *msg, ...) +{ + va_list args; + va_start(args, msg); + char buf[1024]; + int n = vsnprintf(buf, sizeof(buf), msg, args); + + // If the child processes are still running, show no mercy. + if (box_pid > 0) + { + kill(-box_pid, SIGKILL); + kill(box_pid, SIGKILL); + } + if (proxy_pid > 0) + { + kill(-proxy_pid, SIGKILL); + kill(proxy_pid, SIGKILL); + } + + if (write_errors_to_fd) + { + // We are inside the box, have to use error pipe for error reporting. + // We hope that the whole error message fits in PIPE_BUF bytes. + write(write_errors_to_fd, buf, n); + exit(2); + } + + // Otherwise, we in the box keeper process, so we report errors normally + flush_line(); + meta_printf("status:XX\nmessage:%s\n", buf); + fputs(buf, stderr); + fputc('\n', stderr); + box_exit(2); +} + +/* Report an error of the program inside the sandbox */ +void NONRET __attribute__((format(printf,1,2))) +err(char *msg, ...) +{ + va_list args; + va_start(args, msg); + flush_line(); + if (msg[0] && msg[1] && msg[2] == ':' && msg[3] == ' ') + { + meta_printf("status:%c%c\n", msg[0], msg[1]); + msg += 4; + } + char buf[1024]; + vsnprintf(buf, sizeof(buf), msg, args); + meta_printf("message:%s\n", buf); + if (!silent) + { + fputs(buf, stderr); + fputc('\n', stderr); + } + box_exit(1); +} + +/* Write a message, but only if in verbose mode */ +void __attribute__((format(printf,1,2))) +msg(char *msg, ...) 
+{ + va_list args; + va_start(args, msg); + if (verbose) + { + int len = strlen(msg); + if (len > 0) + partial_line = (msg[len-1] != '\n'); + vfprintf(stderr, msg, args); + fflush(stderr); + } + va_end(args); +} + +/*** Signal handling in keeper process ***/ + +/* + * Signal handling is tricky. We must set up signal handlers before + * we start the child process (and reset them in the child process). + * Otherwise, there is a short time window where a SIGINT can kill + * us and leave the child process running. + */ + +struct signal_rule { + int signum; + enum { SIGNAL_IGNORE, SIGNAL_INTERRUPT, SIGNAL_FATAL } action; +}; + +static const struct signal_rule signal_rules[] = { + { SIGHUP, SIGNAL_INTERRUPT }, + { SIGINT, SIGNAL_INTERRUPT }, + { SIGQUIT, SIGNAL_INTERRUPT }, + { SIGILL, SIGNAL_FATAL }, + { SIGABRT, SIGNAL_FATAL }, + { SIGFPE, SIGNAL_FATAL }, + { SIGSEGV, SIGNAL_FATAL }, + { SIGPIPE, SIGNAL_IGNORE }, + { SIGTERM, SIGNAL_INTERRUPT }, + { SIGUSR1, SIGNAL_IGNORE }, + { SIGUSR2, SIGNAL_IGNORE }, + { SIGBUS, SIGNAL_FATAL }, +}; + +static void +signal_alarm(int unused UNUSED) +{ + /* Time limit checks are synchronous, so we only schedule them there. */ + timer_tick = 1; + msg("[timer]"); +} + +static void +signal_int(int signum) +{ + /* Interrupts (e.g., SIGINT) are synchronous, too. */ + interrupt = signum; +} + +static void +signal_fatal(int signum) +{ + /* If we receive SIGSEGV or a similar signal, we try to die gracefully. 
*/ + die("Sandbox keeper received fatal signal %d", signum); +} + +static void +setup_signals(void) +{ + struct sigaction sa_int, sa_fatal; + bzero(&sa_int, sizeof(sa_int)); + sa_int.sa_handler = signal_int; + bzero(&sa_fatal, sizeof(sa_fatal)); + sa_fatal.sa_handler = signal_fatal; + + for (int i=0; i < ARRAY_SIZE(signal_rules); i++) + { + const struct signal_rule *sr = &signal_rules[i]; + switch (sr->action) + { + case SIGNAL_IGNORE: + signal(sr->signum, SIG_IGN); + break; + case SIGNAL_INTERRUPT: + sigaction(sr->signum, &sa_int, NULL); + break; + case SIGNAL_FATAL: + sigaction(sr->signum, &sa_fatal, NULL); + break; + default: + die("Invalid signal rule"); + } + } +} + +static void +reset_signals(void) +{ + for (int i=0; i < ARRAY_SIZE(signal_rules); i++) + signal(signal_rules[i].signum, SIG_DFL); +} + +/*** The keeper process ***/ + +#define PROC_BUF_SIZE 4096 +static int +read_proc_file(char *buf, char *name, int *fdp) +{ + int c; + + if (*fdp < 0) + { + snprintf(buf, PROC_BUF_SIZE, "/proc/%d/%s", (int) box_pid, name); + *fdp = open(buf, O_RDONLY); + if (*fdp < 0) + return 0; // This is OK, the process could have finished + } + lseek(*fdp, 0, SEEK_SET); + if ((c = read(*fdp, buf, PROC_BUF_SIZE-1)) < 0) + { + // Even this could fail if the process disappeared since open() + return 0; + } + if (c >= PROC_BUF_SIZE-1) + die("/proc/$pid/%s too long", name); + buf[c] = 0; + return 1; +} + +static int +get_wall_time_ms(void) +{ + struct timeval now, wall; + gettimeofday(&now, NULL); + timersub(&now, &start_time, &wall); + return wall.tv_sec*1000 + wall.tv_usec/1000; +} + +static int +get_run_time_ms(struct rusage *rus) +{ + if (cg_enable && cg_timing) + return cg_get_run_time_ms(); + + if (rus) + { + struct timeval total; + timeradd(&rus->ru_utime, &rus->ru_stime, &total); + return total.tv_sec*1000 + total.tv_usec/1000; + } + + // It might happen that we do not know the box_pid (see comments in find_box_pid()) + if (!box_pid) + return 0; + + char buf[PROC_BUF_SIZE], 
*x; + int utime, stime; + static int proc_stat_fd = -1; + + if (!read_proc_file(buf, "stat", &proc_stat_fd)) + return 0; + x = buf; + while (*x && *x != ' ') + x++; + while (*x == ' ') + x++; + if (*x++ != '(') + die("proc stat syntax error 1"); + while (*x && (*x != ')' || x[1] != ' ')) + x++; + while (*x == ')' || *x == ' ') + x++; + if (sscanf(x, "%*c %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d %d %d", &utime, &stime) != 2) + die("proc stat syntax error 2"); + + return (utime + stime) * 1000 / ticks_per_sec; +} + +static void +check_timeout(void) +{ + if (wall_timeout) + { + int wall_ms = get_wall_time_ms(); + if (wall_ms > wall_timeout) + err("TO: Time limit exceeded (wall clock)"); + if (verbose > 1) + fprintf(stderr, "[wall time check: %d msec]\n", wall_ms); + } + if (timeout) + { + int ms = get_run_time_ms(NULL); + if (verbose > 1) + fprintf(stderr, "[time check: %d msec]\n", ms); + if (ms > timeout && ms > extra_timeout) + err("TO: Time limit exceeded"); + } +} + +static void +box_keeper(void) +{ + read_errors_from_fd = error_pipes[0]; + close(error_pipes[1]); + close(status_pipes[1]); + + gettimeofday(&start_time, NULL); + ticks_per_sec = sysconf(_SC_CLK_TCK); + if (ticks_per_sec <= 0) + die("Invalid ticks_per_sec!"); + + if (timeout || wall_timeout) + { + struct sigaction sa; + bzero(&sa, sizeof(sa)); + sa.sa_handler = signal_alarm; + sigaction(SIGALRM, &sa, NULL); + struct itimerval timer = { + .it_interval = { .tv_usec = TIMER_INTERVAL_US }, + .it_value = { .tv_usec = TIMER_INTERVAL_US }, + }; + setitimer(ITIMER_REAL, &timer, NULL); + } + + for(;;) + { + struct rusage rus; + int stat; + pid_t p; + if (interrupt) + { + meta_printf("exitsig:%d\n", interrupt); + err("SG: Interrupted"); + } + if (timer_tick) + { + check_timeout(); + timer_tick = 0; + } + p = wait4(proxy_pid, &stat, 0, &rus); + if (p < 0) + { + if (errno == EINTR) + continue; + die("wait4: %m"); + } + if (p != proxy_pid) + die("wait4: unknown pid %d exited!", p); + proxy_pid = 0; + + // Check 
error pipe if there is an internal error passed from inside the box + char interr[1024]; + int n = read(read_errors_from_fd, interr, sizeof(interr) - 1); + if (n > 0) + { + interr[n] = 0; + die("%s", interr); + } + + // Check status pipe if there is an exit status reported by the proxy process + n = read(status_pipes[0], &stat, sizeof(stat)); + if (n != sizeof(stat)) + die("Did not receive exit status from proxy"); + + final_stats(&rus); + if (timeout && total_ms > timeout) + err("TO: Time limit exceeded"); + if (wall_timeout && wall_ms > wall_timeout) + err("TO: Time limit exceeded (wall clock)"); + + if (WIFEXITED(stat)) + { + meta_printf("exitcode:%d\n", WEXITSTATUS(stat)); + if (WEXITSTATUS(stat)) + err("RE: Exited with error status %d", WEXITSTATUS(stat)); + flush_line(); + if (!silent) + { + fprintf(stderr, "OK (%d.%03d sec real, %d.%03d sec wall)\n", + total_ms/1000, total_ms%1000, + wall_ms/1000, wall_ms%1000); + } + box_exit(0); + } + else if (WIFSIGNALED(stat)) + { + meta_printf("exitsig:%d\n", WTERMSIG(stat)); + err("SG: Caught fatal signal %d", WTERMSIG(stat)); + } + else if (WIFSTOPPED(stat)) + { + meta_printf("exitsig:%d\n", WSTOPSIG(stat)); + err("SG: Stopped by signal %d", WSTOPSIG(stat)); + } + else + die("wait4: unknown status %x, giving up!", stat); + } +} + +/*** The process running inside the box ***/ + +static void +setup_root(void) +{ + if (mkdir("root", 0750) < 0 && errno != EEXIST) + die("mkdir('root'): %m"); + + /* + * Ensure all mounts are private, not shared. We don't want our mounts + * appearing outside of our namespace. + * (systemd since version 188 mounts filesystems shared by default). 
+ */ + if (mount(NULL, "/", NULL, MS_REC|MS_PRIVATE, NULL) < 0) + die("Cannot privatize mounts: %m"); + + if (mount("none", "root", "tmpfs", 0, "mode=755") < 0) + die("Cannot mount root ramdisk: %m"); + + apply_dir_rules(default_dirs); + + if (chroot("root") < 0) + die("Chroot failed: %m"); + + if (chdir("root/box") < 0) + die("Cannot change current directory: %m"); +} + +static void +setup_credentials(void) +{ + if (setresgid(box_gid, box_gid, box_gid) < 0) + die("setresgid: %m"); + if (setgroups(0, NULL) < 0) + die("setgroups: %m"); + if (setresuid(box_uid, box_uid, box_uid) < 0) + die("setresuid: %m"); + setpgrp(); +} + +static void +setup_fds(void) +{ + if (redir_stdin) + { + close(0); + if (open(redir_stdin, O_RDONLY) != 0) + die("open(\"%s\"): %m", redir_stdin); + } + if (redir_stdout) + { + close(1); + if (open(redir_stdout, O_WRONLY | O_CREAT | O_TRUNC, 0666) != 1) + die("open(\"%s\"): %m", redir_stdout); + } + if (redir_stderr) + { + close(2); + if (open(redir_stderr, O_WRONLY | O_CREAT | O_TRUNC, 0666) != 2) + die("open(\"%s\"): %m", redir_stderr); + } + if (redir_stderr_to_stdout) + { + if (dup2(1, 2) < 0) + die("Cannot dup stdout to stderr: %m"); + } +} + +static void +setup_rlim(const char *res_name, int res, rlim_t limit) +{ + struct rlimit rl = { .rlim_cur = limit, .rlim_max = limit }; + if (setrlimit(res, &rl) < 0) + die("setrlimit(%s, %jd)", res_name, (intmax_t) limit); +} + +static void +setup_rlimits(void) +{ +#define RLIM(res, val) setup_rlim("RLIMIT_" #res, RLIMIT_##res, val) + + if (memory_limit) + RLIM(AS, (rlim_t)memory_limit * 1024); + + if (fsize_limit) + RLIM(FSIZE, (rlim_t)fsize_limit * 1024); + + RLIM(STACK, (stack_limit ? 
(rlim_t)stack_limit * 1024 : RLIM_INFINITY)); + RLIM(NOFILE, 64); + RLIM(MEMLOCK, 0); + + if (max_processes) + RLIM(NPROC, max_processes); + +#undef RLIM +} + +static int +box_inside(char **args) +{ + cg_enter(); + setup_root(); + setup_rlimits(); + setup_credentials(); + setup_fds(); + char **env = setup_environment(); + + if (set_cwd && chdir(set_cwd)) + die("chdir: %m"); + + execve(args[0], args, env); + die("execve(\"%s\"): %m", args[0]); +} + +/*** Proxy ***/ + +static void +setup_orig_credentials(void) +{ + if (setresgid(orig_gid, orig_gid, orig_gid) < 0) + die("setresgid: %m"); + if (setgroups(0, NULL) < 0) + die("setgroups: %m"); + if (setresuid(orig_uid, orig_uid, orig_uid) < 0) + die("setresuid: %m"); +} + +static int +box_proxy(void *arg) +{ + char **args = arg; + + write_errors_to_fd = error_pipes[1]; + close(error_pipes[0]); + close(status_pipes[0]); + meta_close(); + reset_signals(); + + pid_t inside_pid = fork(); + if (inside_pid < 0) + die("Cannot run process, fork failed: %m"); + else if (!inside_pid) + { + close(status_pipes[1]); + box_inside(args); + _exit(42); // We should never get here + } + + setup_orig_credentials(); + if (write(status_pipes[1], &inside_pid, sizeof(inside_pid)) != sizeof(inside_pid)) + die("Proxy write to pipe failed: %m"); + + int stat; + pid_t p = waitpid(inside_pid, &stat, 0); + if (p < 0) + die("Proxy waitpid() failed: %m"); + + if (write(status_pipes[1], &stat, sizeof(stat)) != sizeof(stat)) + die("Proxy write to pipe failed: %m"); + + _exit(0); +} + +static void +box_init(void) +{ + if (box_id < 0 || box_id >= cf_num_boxes) + die("Sandbox ID out of range (allowed: 0-%d)", cf_num_boxes-1); + box_uid = cf_first_uid + box_id; + box_gid = cf_first_gid + box_id; + + snprintf(box_dir, sizeof(box_dir), "%s/%d", cf_box_root, box_id); + make_dir(box_dir); + if (chdir(box_dir) < 0) + die("chdir(%s): %m", box_dir); +} + +/*** Commands ***/ + +static const char * +self_name(void) +{ + return cg_enable ? 
"isolate --cg" : "isolate"; +} + +static void +init(void) +{ + msg("Preparing sandbox directory\n"); + if (mkdir("box", 0700) < 0) + { + if (errno == EEXIST) + die("Box already exists, run `%s --cleanup' first", self_name()); + else + die("Cannot create box: %m"); + } + if (chown("box", orig_uid, orig_gid) < 0) + die("Cannot chown box: %m"); + + cg_prepare(); + set_quota(); + + puts(box_dir); +} + +static void +cleanup(void) +{ + if (!dir_exists("box")) + { + msg("Nothing to do -- box directory did not exist\n"); + return; + } + + msg("Deleting sandbox directory\n"); + rmtree(box_dir); + cg_remove(); +} + +static void +setup_pipe(int *fds, int nonblocking) +{ + if (pipe(fds) < 0) + die("pipe: %m"); + for (int i=0; i<2; i++) + if (fcntl(fds[i], F_SETFD, fcntl(fds[i], F_GETFD) | FD_CLOEXEC) < 0 || + nonblocking && fcntl(fds[i], F_SETFL, fcntl(fds[i], F_GETFL) | O_NONBLOCK) < 0) + die("fcntl on pipe: %m"); +} + +static void +find_box_pid(void) +{ + /* + * The box keeper process wants to poll status of the inside process, + * so it needs to know the box_pid. However, it is not easy to obtain: + * we got the PID from the proxy, but it is local to the PID namespace. + * Instead, we ask /proc to enumerate the children of the proxy. + * + * CAVEAT: The timing is tricky. We know that the inside process was + * already started (passing the PID from the proxy to us guarantees it), + * but it might already have exited and be reaped by the proxy. Therefore + * it is correct if we fail to find anything. 
+ */ + + char namebuf[256]; + snprintf(namebuf, sizeof(namebuf), "/proc/%d/task/%d/children", (int) proxy_pid, (int) proxy_pid); + FILE *f = fopen(namebuf, "r"); + if (!f) + return; + + int child; + if (fscanf(f, "%d", &child) != 1) + { + fclose(f); + return; + } + box_pid = child; + + if (fscanf(f, "%d", &child) == 1) + die("Error parsing %s: unexpected children found", namebuf); + + fclose(f); +} + +static void +run(char **argv) +{ + if (!dir_exists("box")) + die("Box directory not found, did you run `%s --init'?", self_name()); + + if (!inherit_fds) + close_all_fds(); + + chowntree("box", box_uid, box_gid); + cleanup_ownership = 1; + + setup_pipe(error_pipes, 1); + setup_pipe(status_pipes, 0); + setup_signals(); + + proxy_pid = clone( + box_proxy, // Function to execute as the body of the new process + argv, // Pass our stack + SIGCHLD | CLONE_NEWIPC | (share_net ? 0 : CLONE_NEWNET) | CLONE_NEWNS | CLONE_NEWPID, + argv); // Pass the arguments + if (proxy_pid < 0) + die("Cannot run proxy, clone failed: %m"); + if (!proxy_pid) + die("Cannot run proxy, clone returned 0"); + + pid_t box_pid_inside_ns; + int n = read(status_pipes[0], &box_pid_inside_ns, sizeof(box_pid_inside_ns)); + if (n != sizeof(box_pid_inside_ns)) + die("Proxy failed before it passed box_pid: %m"); + find_box_pid(); + msg("Started proxy_pid=%d box_pid=%d box_pid_inside_ns=%d\n", (int) proxy_pid, (int) box_pid, (int) box_pid_inside_ns); + + box_keeper(); +} + +static void +show_version(void) +{ + printf("The process isolator " VERSION "\n"); + printf("(c) 2012--" YEAR " Martin Mares and Bernard Blackham\n"); + printf("Built on " BUILD_DATE " from Git commit " BUILD_COMMIT "\n"); +} + +/*** Options ***/ + +static void __attribute__((format(printf,1,2))) +usage(const char *msg, ...) 
+{ + if (msg != NULL) + { + va_list args; + va_start(args, msg); + vfprintf(stderr, msg, args); + va_end(args); + } + printf("\ +Usage: isolate [] \n\ +\n\ +Options:\n\ +-b, --box-id=\tWhen multiple sandboxes are used in parallel, each must get a unique ID\n\ + --cg\t\tEnable use of control groups\n\ + --cg-mem=\tLimit memory usage of the control group to KB\n\ + --cg-timing\t\tTime limits affects total run time of the control group\n\ +\t\t\t(this is turned on by default, use --no-cg-timing to turn off)\n\ +-c, --chdir=\tChange directory to before executing the program\n\ +-d, --dir=\t\tMake a directory visible inside the sandbox\n\ + --dir==\tMake a directory outside visible as inside\n\ + --dir==\t\tDelete a previously defined directory rule (even a default one)\n\ + --dir=...:\tSpecify options for a rule:\n\ +\t\t\t\tdev\tAllow access to special files\n\ +\t\t\t\tfs\tMount a filesystem (e.g., --dir=/proc:proc:fs)\n\ +\t\t\t\tmaybe\tSkip the rule if does not exist\n\ +\t\t\t\tnoexec\tDo not allow execution of binaries\n\ +\t\t\t\trw\tAllow read-write access\n\ +-D, --no-default-dirs\tDo not add default directory rules\n\ +-f, --fsize=\tMax size (in KB) of files that can be created\n\ +-E, --env=\t\tInherit the environment variable from the parent process\n\ +-E, --env==\tSet the environment variable to ; unset it if is empty\n\ +-x, --extra-time=