From: Andre Noll Date: Fri, 13 Dec 2019 14:04:27 +0000 (+0100) Subject: Initial commit. X-Git-Tag: v1.0.0~17 X-Git-Url: http://git.tue.mpg.de/?a=commitdiff_plain;h=e8cbe0823fdc68c668d8889d4c62d0f6bc0c29f8;p=micoforia.git Initial commit. This project was started in late 2018. After 2 weeks the first feature complete version was ready. During 2019 the repo received only a moderate number of commits, mostly bug fixes, documentation improvements and the addition of non-essential features. As of version 0.9.0, the project was made public. All commits that led to this version have been discarded, so this repository contains only the final result as a single commit. --- e8cbe0823fdc68c668d8889d4c62d0f6bc0c29f8 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..98c75c0 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +micoforia +micoforia.8 +build +*.swp +Makefile.local diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..4441e04 --- /dev/null +++ b/Makefile @@ -0,0 +1,253 @@ +# SPDX-License-Identifier: GPL-2.0-only +.SUFFIXES: +MAKEFLAGS += -Rr +ifeq ("$(origin CC)", "default") + CC := cc +endif +ifeq ("$(origin V)", "command line") + SAY = +else + SAY = @echo '$(strip $(1))' +endif + +.ONESHELL: +.SHELLFLAGS := -ec +PREFIX ?= /usr/local +INSTALL ?= install +MKDIR_P := mkdir -p +RM := rm -f +CHMOD := chmod +B := build +all := micoforia micoforia.8 +all: $(all) + +PACKAGE := micoforia +SLOGAN := Minimal Containers for Instant Availability +AUTHOR := Andre Noll +EMAIL := maan@tuebingen.mpg.de +COPYRIGHT_YEAR := 2019 +URL := http://people.tuebingen.mpg.de/maan/$(PACKAGE)/ +CLONE_URL := git://git.tuebingen.mpg.de/$(PACKAGE) +GITWEB_URL := http://git.tuebingen.mpg.de/$(PACKAGE).git +HOME_URL := http://people.tuebingen.mpg.de/maan/ +LICENSE := GNU GPL version 3 +LICENSE_URL := https://www.gnu.org/licenses/gpl-3.0-standalone.html +LOGLEVELS := LL_DEBUG,LL_INFO,LL_NOTICE,LL_WARNING,LL_ERROR,LL_CRIT,LL_EMERG + +units := micoforia util version 
micoforia.lsg +deps := $(addprefix $(B)/, $(addsuffix .d, $(units))) +objs := $(addprefix $(B)/, $(addsuffix .o, $(units))) + +ifeq ($(findstring clean, $(MAKECMDGOALS)),) +ifeq ($(findstring README, $(MAKECMDGOALS)),) +-include $(deps) +-include $(B)/config.mak +endif +endif + +XCPPFLAGS := +XCPPFLAGS += -I$(B) +XCPPFLAGS += -Wunused-macros +XCPPFLAGS += -DCOPYRIGHT_YEAR='"$(COPYRIGHT_YEAR)"' +XCPPFLAGS += -DPACKAGE='"$(PACKAGE)"' +XCPPFLAGS += -DAUTHOR='"$(AUTHOR)"' +XCPPFLAGS += -DEMAIL='"$(EMAIL)"' +XCPPFLAGS += -DURL='"$(URL)"' +XCPPFLAGS += -DCLONE_URL='"$(CLONE_URL)"' +XCPPFLAGS += -DGITWEB_URL='"$(GITWEB_URL)"' +XCPPFLAGS += -DHOME_URL='"$(HOME_URL)"' +XCPPFLAGS += -DGET_VERSION='$(PACKAGE)_version' +XCPPFLAGS += -DLOGLEVELS='$(LOGLEVELS)' +XCPPFLAGS += -DBUILD_DATE='"$(build_date)"' +XCPPFLAGS += -DCC_VERSION='"$(cc_version)"' +XCPPFLAGS += -DUNAME_RS='"$(uname_rs)"' +XCPPFLAGS += -DLICENSE='"$(LICENSE)"' +XCPPFLAGS += -DLICENSE_URL='"$(LICENSE_URL)"' + +XCFLAGS := +XCFLAGS += -fno-strict-aliasing +XCFLAGS += -g +XCFLAGS += -Os +XCFLAGS += -Wundef -W -Wuninitialized +XCFLAGS += -Wchar-subscripts +XCFLAGS += -Werror-implicit-function-declaration +XCFLAGS += -Wmissing-noreturn +XCFLAGS += -Wbad-function-cast +XCFLAGS += -Wredundant-decls +XCFLAGS += -Wno-sign-compare -Wno-unknown-pragmas +XCFLAGS += -Wdeclaration-after-statement +XCFLAGS += -Wformat -Wformat-security -Wmissing-format-attribute +XCFLAGS += -fsanitize=undefined +XCFLAGS += -fdata-sections -ffunction-sections +XCFLAGS += -Wstrict-prototypes +XCFLAGS += -Wshadow +XCFLAGS += -Wunused -Wall +XCFLAGS += -Wformat-signedness +XCFLAGS += -Wdiscarded-qualifiers + +XLDFLAGS := -lubsan -Wl,--gc-sections +version_file := $(B)/version.c +GIT_VERSION := $(shell $(MKDIR_P) $(B) && ./version-gen.sh $(PACKAGE) $(version_file)) + +CC_CMD = $(CC) -c -o $@ $(XCPPFLAGS) $(CPPFLAGS) \ + $(XCFLAGS) $(CFLAGS) -MMD -MF $(B)/$(*F).d -MT $@ + +$(objs): m7a.h $(B)/micoforia.lsg.h + +$(B): + @$(MKDIR_P) $@ + 
+$(B)/config.h.in: configure.ac | $(B) + $(call SAY, AH $<) + cd $(B) + autoheader -f ../configure.ac +$(B)/configure.sh: configure.ac | $(B) + $(call SAY, AC $<) + cd $(B) + autoconf ../configure.ac > configure.sh + $(CHMOD) 755 configure.sh +$(B)/config.status: $(B)/configure.sh | $(B) + $(call SAY, SH $<) + cd $(B) + if test -x config.status; then \ + ./config.status --quiet --recheck; \ + else \ + ./configure.sh --no-create; \ + fi +$(B)/config.mak $(B)/config.h: $(B)/config.status config.mak.in $(B)/config.h.in + $(call SAY, CS $@) + cd $(B) + ln -f ../config.mak.in + ./config.status -q + test -f config.h && touch config.h + +define DESCRIPTION1 := + $(PACKAGE) is a lightweight container implementation for Linux. + It consists of a single program which reads a single configuration + file that describes all containers. $(PACKAGE) was written with + performance and simplicity in mind, and is designed for trusted + in-house web application hosting. +endef + +define DESCRIPTION2 := + Like other container frameworks, $(PACKAGE) employs Linux namespaces + for isolation and cgroup controllers to limit the resource utilization + of the containers. Networking is implemented through bridging and + virtual ethernet device pairs. There is built-in support for the cpu, + memory, I/O and device controllers. Further customization is possible + via startup hooks. For example, the startup hook could activate + additional cgroup controllers, make the container enter a different + namespace, and mount additional file systems. +endef + +define DESCRIPTION3 := + The micoforia program supports a couple of subcommands. Besides + the start subcommand which starts one or more containers, there are + subcommands for listing, killing or rebooting containers. 
+endef + +# dependency on config.mak is because the command below depends on $(M4) +$(B)/index.html $(B)/micoforia.suite: $(B)/%: %.m4 Makefile $(B)/config.mak + $(call SAY, M4 $<) + $(M4) -D "AUTHOR=$(AUTHOR)" -D "COPYRIGHT_YEAR=$(COPYRIGHT_YEAR)" \ + -D "PACKAGE=$(PACKAGE)" \ + -D "SLOGAN=$(SLOGAN)" \ + -D "EMAIL=$(EMAIL)" \ + -D "URL=$(URL)" \ + -D "CLONE_URL=$(CLONE_URL)" \ + -D "GITWEB_URL=$(GITWEB_URL)" \ + -D "HOME_URL=$(HOME_URL)" \ + -D "LICENSE=$(LICENSE)" \ + -D "LICENSE_URL=$(LICENSE_URL)" \ + -D "DESCRIPTION1=$(DESCRIPTION1)" \ + -D "DESCRIPTION2=$(DESCRIPTION2)" \ + -D "DESCRIPTION3=$(DESCRIPTION3)" $< > $@ +$(B)/%.lsg.c: $(B)/%.suite + $(call SAY, LSGC $<) + $(LOPSUBGEN) --gen-c --output-dir $(B) < $< +$(B)/%.lsg.h: $(B)/%.suite + $(call SAY, LSGH $<) + $(LOPSUBGEN) --gen-header --output-dir $(B) < $< +%.8: $(B)/%.suite $(B)/version.c + $(call SAY, LSGM $<) + $(LOPSUBGEN) --gen-man=$(*F).8 --version-string $(GIT_VERSION) < $< + +$(B)/%.o: %.c | $(B) + $(call SAY, CC $<) + $(CC_CMD) $< +$(B)/%.o: $(B)/%.c + $(call SAY, CC $<) + $(CC_CMD) $< +micoforia: $(objs) + $(call SAY, LD $@) + $(CC) -o $@ $^ $(XLDFLAGS) $(LDFLAGS) -llopsub -lmnl -lutil -lcap + +mandir := $(datarootdir)/man/man8 +INSTALL ?= install +INSTALL_PROGRAM ?= $(INSTALL) -m 755 +INSTALL_DATA ?= $(INSTALL) -m 644 +ifneq ($(findstring strip, $(MAKECMDGOALS)),) + strip_option := -s +endif +install install-strip: all + $(MKDIR_P) $(DESTDIR)$(sbindir) $(DESTDIR)$(mandir) + $(INSTALL_PROGRAM) $(strip_option) micoforia $(DESTDIR)$(sbindir) + $(INSTALL_DATA) micoforia.8 $(DESTDIR)$(mandir) + +clean: + $(RM) $(B)/*.o $(all) +distclean: clean + $(RM) -r $(B) +maintainer-clean: + git clean -dfqx > /dev/null 2>&1 + +define README := +$(PACKAGE) - $(SLOGAN) + +$(DESCRIPTION1) + +$(DESCRIPTION2) + +$(DESCRIPTION3) + +Resources +~~~~~~~~~ +| web page: $(URL) +| git clone URL: $(CLONE_URL) +| gitweb: $(GITWEB_URL) +| author's home page: $(HOME_URL) +| Send feedback to: $(AUTHOR) <$(EMAIL)> + +License 
+~~~~~~~ +Open source, licensed under the $(LICENSE). + +Documentation +~~~~~~~~~~~~~ +See micoforia.suite.m4. Or build the man page with \"make\" and run +\"man -l micoforia.8\". + +Dependencies +~~~~~~~~~~~~ +This package requires m4, autoconf, gnu make, gcc or clang, and +lopsub. The configure script checks if all dependencies are installed +and prints a meaningful error message if one of them is missing. + +Building +~~~~~~~~ +Run \"make\" to build the package with the default settings. Run +\"./configure -h\" to list configuration options. + +Installation +~~~~~~~~~~~~ +Run \"sudo make install\" to install to /usr/local. To install to +/somewhere/else, run \"./configure --prefix /somewhere/else && make\" +first. +endef + +README: + @printf '%s\n' "$(README)" + +.PRECIOUS: $(B)/%.lsg.c $(B)/%.lsg.h $(B)/%.8 +# install-strip names a command, not a file: keep it phony like install +.PHONY: all clean install install-strip distclean maintainer-clean README +-include Makefile.local diff --git a/README b/README new file mode 100644 index 0000000..52a1fd7 --- /dev/null +++ b/README @@ -0,0 +1 @@ +Run "make README". 
diff --git a/config.mak.in b/config.mak.in new file mode 100644 index 0000000..ee258b4 --- /dev/null +++ b/config.mak.in @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: GPL-2.0-only + +prefix := @prefix@ +exec_prefix := @exec_prefix@ + +# These two use prefix and exec_prefix +sbindir := @sbindir@ +datarootdir := @datarootdir@ + +LOPSUBGEN := @LOPSUBGEN@ +M4 := @M4@ diff --git a/configure b/configure new file mode 100755 index 0000000..ad2ec3f --- /dev/null +++ b/configure @@ -0,0 +1,12 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0-only + +set -e + +mkdir -p build +cd build +autoconf ../configure.ac > configure.sh +chmod 755 configure.sh +ln -f ../config.mak.in +autoheader ../configure.ac +sh configure.sh "$@" diff --git a/configure.ac b/configure.ac new file mode 100644 index 0000000..e29968f --- /dev/null +++ b/configure.ac @@ -0,0 +1,47 @@ +# SPDX-License-Identifier: GPL-2.0-only + +AC_PREREQ([2.61]) +# only for configure -h, see Makefile +AC_INIT([software], [packages]) +AC_CONFIG_HEADERS([config.h]) +AC_CONFIG_FILES([config.mak]) +AC_USE_SYSTEM_EXTENSIONS +AC_PROG_CC +AC_PROG_CPP + +AC_DEFUN([REQUIRE_EXECUTABLE], [ + AC_PATH_PROG(m4_toupper([$1]), [$1]) + test -z "$m4_toupper([$1])" && AC_MSG_ERROR([$2]) +]) +REQUIRE_EXECUTABLE([m4], [m4 is required to build this package]) + +AC_DEFUN([LOPSUB_NOT_FOUND], [ +The lopsub library is required to build this software, but the checks +indicate it is not installed on your system. Run the following +command to download a copy. + git clone git://git.tuebingen.mpg.de/lopsub.git +Install the library, then run this configure script again. + +If you installed lopsub at a non-standard location, make sure to set +PATH, CPPFLAGS and LDFLAGS accordingly. 
For example: + + pfx=/prefix/where/lopsub/is/installed + export PATH=\$pfx/bin:\$PATH + export CPPFLAGS=-I\$pfx/include + export LDFLAGS=-L\$pfx/lib +]) +REQUIRE_EXECUTABLE([lopsubgen], [LOPSUB_NOT_FOUND()]) +AC_CHECK_HEADER(lopsub.h, [], [AC_MSG_ERROR([LOPSUB_NOT_FOUND()])]) +AC_CHECK_LIB([lopsub], [lls_merge], [], [AC_MSG_ERROR([LOPSUB_NOT_FOUND()])]) + +AC_DEFUN([LIBCAP_NOT_FOUND], [the libcap library is required to build dnl +this software. Package: libcap-dev]) +AC_CHECK_HEADER([sys/capability.h], [], [AC_MSG_ERROR([LIBCAP_NOT_FOUND()])]) +AC_CHECK_LIB([cap], [cap_from_text], [], [AC_MSG_ERROR([LIBCAP_NOT_FOUND()])]) + +AC_DEFUN([LIBMNL_NOT_FOUND], [the libmnl library is required to build dnl +this software. Package: libmnl-dev]) +AC_CHECK_HEADER([libmnl/libmnl.h], [], [AC_MSG_ERROR([LIBMNL_NOT_FOUND()])]) +AC_CHECK_LIB([mnl], [mnl_socket_open], [], [AC_MSG_ERROR([LIBMNL_NOT_FOUND()])]) + +AC_OUTPUT diff --git a/index.html.m4 b/index.html.m4 new file mode 100644 index 0000000..a0d8ecc --- /dev/null +++ b/index.html.m4 @@ -0,0 +1,64 @@ +dnl SPDX-License-Identifier: GPL-2.0-only + + + + + + PACKAGE() + + + + + + + + +
+

+ PACKAGE() - SLOGAN() +

+
+ + +
+

DESCRIPTION1()

+

DESCRIPTION2()

+

DESCRIPTION3()

+ +

Resources

+ + +

License

+ Open source, licensed under the LICENSE() + +

Documentation

+ See the manual page for details. + +

Programming Language

+ Plain C. + +

Dependencies

+ A working C compiler and a couple of other dependencies, + most of which are standard (autoconf, make, m4, + libmnl, libcap). The notable exception is the lopsub library. + + diff --git a/m7a.h b/m7a.h new file mode 100644 index 0000000..77c3cad --- /dev/null +++ b/m7a.h @@ -0,0 +1,118 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "config.h" + +#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) + +#define CMD_PTR(_cname) lls_cmd(LSG_MICOFORIA_CMD_ ## _cname, micoforia_suite) +#define OPT_RESULT(_cname, _oname) (lls_opt_result(\ + LSG_MICOFORIA_ ## _cname ## _OPT_ ## _oname, \ + (CMD_PTR(_cname) == CMD_PTR(MICOFORIA))? lpr : sublpr)) +#define OPT_GIVEN(_cname, _oname) (lls_opt_given(OPT_RESULT(_cname, _oname))) +#define OPT_UINT32_VAL_N(_n, _cname, _oname) (lls_uint32_val(_n, \ + OPT_RESULT(_cname, _oname))) +#define OPT_UINT32_VAL(_cname, _oname) (OPT_UINT32_VAL_N(0, _cname, _oname)) +#define OPT_STRING_VAL_N(_n, _cname, _oname) (lls_string_val(_n, \ + OPT_RESULT(_cname, _oname))) +#define OPT_STRING_VAL(_cname, _oname) (OPT_STRING_VAL_N(0, _cname, _oname)) + +struct micoforia_user_data {bool (*handler)(void);}; +#define EXPORT_CMD_HANDLER(_cmd) const struct micoforia_user_data \ + lsg_micoforia_com_ ## _cmd ## _user_data = { \ + .handler = com_ ## _cmd \ + }; + + +__attribute__ ((warn_unused_result)) +void *xrealloc(void *p, size_t size); + +__attribute__ ((warn_unused_result)) +void *xmalloc(size_t size); + +__attribute__ ((warn_unused_result)) +void *xzmalloc(size_t size); + +void *xstrdup(const char *s); +char *xstrcat(char *a, const char *b); + +__attribute__ ((format (printf, 1, 2))) __attribute__ ((warn_unused_result)) +char *msg(const char *fmt, ...); + +enum loglevels {LOGLEVELS, NUM_LOGLEVELS}; +extern unsigned loglevel_arg_val; + 
+__attribute__ ((format (printf, 2, 3))) +void m7a_log(int ll, const char* fmt,...); + +#define DEBUG_LOG(f,...) m7a_log(LL_DEBUG, "%s: " f, __FUNCTION__, ## __VA_ARGS__) +#define INFO_LOG(f,...) m7a_log(LL_INFO, "%s: " f, __FUNCTION__, ## __VA_ARGS__) +#define NOTICE_LOG(f,...) m7a_log(LL_NOTICE, "%s: " f, __FUNCTION__, ## __VA_ARGS__) +#define WARNING_LOG(f,...) m7a_log(LL_WARNING, "%s: " f, __FUNCTION__, ## __VA_ARGS__) +#define ERROR_LOG(f,...) m7a_log(LL_ERROR, "%s: " f, __FUNCTION__, ## __VA_ARGS__) +#define CRIT_LOG(f,...) m7a_log(LL_CRIT, "%s: " f, __FUNCTION__, ## __VA_ARGS__) +#define EMERG_LOG(f,...) m7a_log(LL_EMERG, "%s: " f, __FUNCTION__, ## __VA_ARGS__) + +__attribute__ ((noreturn)) +__attribute__ ((format (printf, 1, 2))) +void die(const char *fmt, ...); + +__attribute__ ((noreturn)) +__attribute__ ((format (printf, 1, 2))) +void die_errno(const char *fmt, ...); + +__attribute__ ((noreturn)) +void die_empty_arg(const char *opt); + +void check_range(uint32_t val, uint32_t min, uint32_t max, const char *opt); + +bool xexec(char * const argv[], const struct iovec *iov); +void valid_fd012(void); +void check_name(const char *arg); +void parse_compound_arg(const char *arg, const char *opt, char **name, char **val); +char *parse_cgroup_acl(const char *arg); +char *make_hwaddr(const char *name, const char *bridge); +void parse_ifspec(const char *arg, char **bridge, uint8_t *hwaddr); +uint32_t atou32(const char *str, const char *opt); +bool remove_subdirs_recursively(const char *path); +void daemonize(const char *logfile); +bool acquire_lock(const char *string); +bool try_lock(const char *string, pid_t *pid); +bool release_lock(const char *string); +bool is_locked(const char *string, pid_t *pid); +bool attach_to_bridge(const char *iface, const char *bridge); +bool rename_interface(const char *before, const char *after); +void pretty_print_hwaddr(const uint8_t *hwaddr, char *result); +bool set_hwaddr(const char *iface, const uint8_t *hwaddr); +bool 
link_del(const char *iface); +bool link_up(const char *iface); +bool create_veth_device_pair(const char *name, char *peer); +bool set_netns(const char *iface, pid_t pid); +int request_fd(const char *socket_path, char *msg, int *result); +bool request_int(const char *socket_path, char *msg, int *result); +bool listen_on_unix_socket(const char *socket_path, int *result); +bool recv_cred_buffer(int socketfd, char *buf, size_t size, + int *clientfd, uid_t *uid); +bool pass_fd(int passfd, int socketfd); + +extern int signal_pipe[2]; +void init_signal_handling(void); +int next_signal(void); diff --git a/micoforia.c b/micoforia.c new file mode 100644 index 0000000..4d267ec --- /dev/null +++ b/micoforia.c @@ -0,0 +1,1996 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#include "m7a.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "micoforia.lsg.h" + +static struct lls_parse_result *lpr, *sublpr; +unsigned loglevel_arg_val = 4; + +struct ifspec { + char *bridge; + uint8_t hwaddr[6]; +}; + +struct container { + char *name; + char *pre_start_hook; + char *pre_exec_hook; + char *root_dir; + char *init; + struct ifspec *ifspec; + /* this is never zero, even if no ifspec was given */ + unsigned num_ifspecs; + char **dacl; + unsigned num_dac_entries; + char **io_max; + unsigned num_io_max_entries; + /* ~0U: not given, 0: unlimited */ + unsigned cpu_cores; + unsigned memory_limit; + /* ~0U: not given */ + unsigned init_type; + cap_value_t *capdrop; + unsigned num_capdrops; + uint32_t *tty; + unsigned num_ttys; +}; + +static struct container **container; +static unsigned num_containers; + +struct container_runtime { + int pipe1[2], pipe2[2]; /* for startup communication */ + uint32_t *tty; + unsigned num_ttys; + int *master, *slave, *client; + + int init_pid; /* in the parent namespace */ + char *pts, *root, *dev; + int socket_fd; +}; + +static char **default_dacl, **default_io_max; +unsigned 
num_default_dac_entries, num_default_io_max_entries; +static cap_value_t *default_capdrop; +unsigned num_default_capdrops; +uint32_t *default_tty; +unsigned num_default_ttys; +static const struct lls_command *subcmd; +/* does not allocate memory */ +void m7a_log(int ll, const char* fmt,...) +{ + va_list argp; + + if (ll < loglevel_arg_val) + return; + va_start(argp, fmt); + if (subcmd == lls_cmd(LSG_MICOFORIA_CMD_START, micoforia_suite)) { + char str[100]; + struct timespec t; + struct tm *tm; + assert(clock_gettime(CLOCK_REALTIME, &t) == 0); + tm = localtime(&t.tv_sec); + strftime(str, sizeof(str), "%b %d %H:%M:%S", tm); + fprintf(stderr, "%s:%04lu ", str, + (long unsigned)t.tv_nsec / 1000 / 1000); + fprintf(stderr, "(%u) ", (unsigned)getpid()); + } + vfprintf(stderr, fmt, argp); + va_end(argp); +} + +static void die_lopsub(int lopsub_ret, char **errctx) +{ + const char *m = lls_strerror(-lopsub_ret); + if (*errctx) + ERROR_LOG("%s: %s\n", *errctx, m); + else + ERROR_LOG("%s\n", m); + free(*errctx); + *errctx = NULL; + die("lopsub error"); +} + +#define FOR_EACH_CONTAINER(_c) for ( \ + struct container **_cp = container; \ + ((_c) = *(_cp)); \ + (_cp)++, (_c) = *(_cp) \ +) + +static struct container *get_container(const char *name) +{ + struct container *c; + FOR_EACH_CONTAINER(c) { + if (!strcmp(c->name, name)) + return c; + } + return NULL; +} + +static struct container *get_or_append_container(const char *name) +{ + struct container *c = get_container(name); + if (c) + return c; + container = xrealloc(container, + (++num_containers + 1) * sizeof(struct container *)); + c = container[num_containers - 1] = xzmalloc(sizeof(struct container)); + c->name = xstrdup(name); + /* ~0U means: not given */ + c->cpu_cores = ~0U; + c->memory_limit = ~0U; + c->init_type = ~0U; + container[num_containers] = NULL; + return c; +} + +static unsigned get_container_ttys(const struct container *c, uint32_t **result) +{ + static uint32_t dflt = {1}; + if (c->num_ttys > 0) { + *result 
= c->tty; + return c->num_ttys; + } + if (num_default_ttys > 0) { + *result = default_tty; + return num_default_ttys; + } + *result = &dflt; + return 1; +} + +enum clo_given_counter { + CLOGC_DEFAULT_CGROUP_DAC, + CLOGC_CGROUP_DAC, + CLOGC_DEFAULT_IO_MAX, + CLOGC_IO_MAX, + NUM_CLOGCS +}; + +static unsigned clo_given_counter[NUM_CLOGCS]; + +static void append_dac_entry(const char *arg, char ***listp, unsigned *count) +{ + char *val = parse_cgroup_acl(arg); + (*count)++; + *listp = xrealloc(*listp, (*count + 1) * sizeof(char *)); + (*listp)[*count - 1] = val; + (*listp)[*count] = NULL; +} + +static void append_io_max_entry(const char *arg, char ***listp, unsigned *count) +{ + (*count)++; + *listp = xrealloc(*listp, (*count + 1) * sizeof(char *)); + (*listp)[*count - 1] = xstrdup(arg); + (*listp)[*count] = NULL; +} + +/* Fill container[] and the default_* lists from the parsed options. */ +static void check_options(void) +{ + unsigned n, m; + const char *arg; + char *name, *val; + struct container *c; + uint32_t u32; + + container = xzmalloc(sizeof(struct container *)); + /* loop backwards to let command line opts override config file opts */ + for (n = OPT_GIVEN(MICOFORIA, CONTAINER) - 1; n != ~0U; n--) { + arg = OPT_STRING_VAL_N(n, MICOFORIA, CONTAINER); + check_name(arg); + get_or_append_container(arg); + } + for (n = OPT_GIVEN(MICOFORIA, PRE_START_HOOK) - 1; n != ~0U; n--) { + arg = OPT_STRING_VAL_N(n, MICOFORIA, PRE_START_HOOK); + parse_compound_arg(arg, "pre-start-hook", &name, &val); + c = get_or_append_container(name); + free(name); + free(c->pre_start_hook); + c->pre_start_hook = val; + } + for (n = OPT_GIVEN(MICOFORIA, PRE_EXEC_HOOK) - 1; n != ~0U; n--) { + arg = OPT_STRING_VAL_N(n, MICOFORIA, PRE_EXEC_HOOK); + parse_compound_arg(arg, "pre-exec-hook", &name, &val); + c = get_or_append_container(name); + free(name); + free(c->pre_exec_hook); + c->pre_exec_hook = val; + } + for (n = OPT_GIVEN(MICOFORIA, CAPDROP) - 1; n != ~0U; n--) { + cap_value_t cap_val; + arg = OPT_STRING_VAL_N(n, MICOFORIA, CAPDROP); + parse_compound_arg(arg, 
"capabilities", &name, &val); + c = get_or_append_container(name); + if (cap_from_name(val, &cap_val) < 0) + die_errno("%s: invalid capability: %s", name, val); + c->capdrop = xrealloc(c->capdrop, + ++c->num_capdrops * sizeof(cap_value_t)); + c->capdrop[c->num_capdrops - 1] = cap_val; + free(name); + free(val); + } + for (n = 0; n < OPT_GIVEN(MICOFORIA, DEFAULT_CAPDROP); n++) { + cap_value_t cap_val; + arg = OPT_STRING_VAL_N(n, MICOFORIA, DEFAULT_CAPDROP); + if (cap_from_name(arg, &cap_val) < 0) + die_errno("invalid default capability: %s", arg); /* val is not assigned in this loop */ + default_capdrop = xrealloc(default_capdrop, + ++num_default_capdrops * sizeof(cap_value_t)); + default_capdrop[num_default_capdrops - 1] = cap_val; + } + for (n = OPT_GIVEN(MICOFORIA, TTY) - 1; n != ~0U; n--) { + uint32_t minor; + arg = OPT_STRING_VAL_N(n, MICOFORIA, TTY); + parse_compound_arg(arg, "tty", &name, &val); + c = get_or_append_container(name); + minor = atou32(val, "tty"); + if (minor == 0) + die("can not capture tty0"); + c->tty = xrealloc(c->tty, ++c->num_ttys * sizeof(uint32_t)); + c->tty[c->num_ttys - 1] = minor; + free(name); + free(val); + } + for (n = 0; n < OPT_GIVEN(MICOFORIA, DEFAULT_TTY); n++) { + uint32_t minor = OPT_UINT32_VAL_N(n, MICOFORIA, DEFAULT_TTY); + if (minor == 0) + die("can not capture tty0"); + default_tty = xrealloc(default_tty, + ++num_default_ttys * sizeof(uint32_t)); + default_tty[num_default_ttys - 1] = minor; + } + + for (n = OPT_GIVEN(MICOFORIA, ROOT_DIRECTORY) - 1; n != ~0U ; n--) { + arg = OPT_STRING_VAL_N(n, MICOFORIA, ROOT_DIRECTORY); + parse_compound_arg(arg, "root-directory", &name, &val); + c = get_or_append_container(name); + free(name); + free(c->root_dir); + c->root_dir = val; + } + u32 = OPT_UINT32_VAL(MICOFORIA, DEFAULT_CPU_CORES); + check_range(u32, 0, 65536, "default-cpu-cores"); + for (n = OPT_GIVEN(MICOFORIA, CPU_CORES) - 1; n != ~0U ; n--) { + arg = OPT_STRING_VAL_N(n, MICOFORIA, CPU_CORES); + parse_compound_arg(arg, "cpu-cores", &name, &val); + c = 
get_or_append_container(name); + free(name); + u32 = atou32(val, "cpu-cores"); + free(val); + check_range(u32, 0, 65536, "cpu-cores"); + c->cpu_cores = u32; + } + u32 = OPT_UINT32_VAL(MICOFORIA, DEFAULT_MEMORY_LIMIT); + check_range(u32, 0, 1024 * 1024, "default-memory-limit"); + for (n = OPT_GIVEN(MICOFORIA, MEMORY_LIMIT) - 1; n != ~0U ; n--) { + arg = OPT_STRING_VAL_N(n, MICOFORIA, MEMORY_LIMIT); + parse_compound_arg(arg, "memory-limit", &name, &val); + c = get_or_append_container(name); + free(name); + u32 = atou32(val, "memory-limit"); + free(val); + check_range(u32, 0, 1024 * 1024, "memory-limit"); + c->memory_limit = u32; + } + for (n = OPT_GIVEN(MICOFORIA, INIT) - 1; n != ~0U ; n--) { + arg = OPT_STRING_VAL_N(n, MICOFORIA, INIT); + parse_compound_arg(arg, "init", &name, &val); + c = get_or_append_container(name); + free(name); + free(c->init); + c->init = val; + } + for (n = 0; n < OPT_GIVEN(MICOFORIA, NET); n++) { + struct ifspec *ifspec; + arg = OPT_STRING_VAL_N(n, MICOFORIA, NET); + parse_compound_arg(arg, "net", &name, &val); + c = get_or_append_container(name); + free(name); + c->ifspec = xrealloc(c->ifspec, + ++c->num_ifspecs * sizeof(struct ifspec)); + ifspec = c->ifspec + c->num_ifspecs - 1; + parse_ifspec(val, &ifspec->bridge, ifspec->hwaddr); + free(val); + } + + m = clo_given_counter[CLOGC_DEFAULT_CGROUP_DAC]; + for (n = m; n < OPT_GIVEN(MICOFORIA, DEFAULT_CGROUP_DAC); n++) { + arg = OPT_STRING_VAL_N(n, MICOFORIA, DEFAULT_CGROUP_DAC); + append_dac_entry(arg, &default_dacl, &num_default_dac_entries); + } + for (n = 0; n < m; n++) { + arg = OPT_STRING_VAL_N(n, MICOFORIA, DEFAULT_CGROUP_DAC); + append_dac_entry(arg, &default_dacl, &num_default_dac_entries); + } + m = clo_given_counter[CLOGC_CGROUP_DAC]; + for (n = m; n < OPT_GIVEN(MICOFORIA, CGROUP_DAC); n++) { + arg = OPT_STRING_VAL_N(n, MICOFORIA, CGROUP_DAC); + parse_compound_arg(arg, "cgroup-dac", &name, &val); + c = get_or_append_container(name); + free(name); + append_dac_entry(val, &c->dacl, 
&c->num_dac_entries); + free(val); + } + for (n = 0; n < m; n++) { + arg = OPT_STRING_VAL_N(n, MICOFORIA, CGROUP_DAC); + parse_compound_arg(arg, "cgroup-dac", &name, &val); + c = get_or_append_container(name); + free(name); + append_dac_entry(val, &c->dacl, &c->num_dac_entries); + free(val); + } + + m = clo_given_counter[CLOGC_DEFAULT_IO_MAX]; + for (n = m; n < OPT_GIVEN(MICOFORIA, DEFAULT_IO_MAX); n++) { + arg = OPT_STRING_VAL_N(n, MICOFORIA, DEFAULT_IO_MAX); + append_io_max_entry(arg, &default_io_max, &num_default_io_max_entries); + } + for (n = 0; n < m; n++) { + arg = OPT_STRING_VAL_N(n, MICOFORIA, DEFAULT_IO_MAX); + append_io_max_entry(arg, &default_io_max, &num_default_io_max_entries); + } + m = clo_given_counter[CLOGC_IO_MAX]; + for (n = m; n < OPT_GIVEN(MICOFORIA, IO_MAX); n++) { + arg = OPT_STRING_VAL_N(n, MICOFORIA, IO_MAX); + parse_compound_arg(arg, "io-max", &name, &val); + c = get_or_append_container(name); + free(name); + append_io_max_entry(val, &c->io_max, &c->num_io_max_entries); + free(val); + } + for (n = 0; n < m; n++) { + arg = OPT_STRING_VAL_N(n, MICOFORIA, IO_MAX); + parse_compound_arg(arg, "io-max", &name, &val); + c = get_or_append_container(name); + free(name); + append_io_max_entry(val, &c->io_max, &c->num_io_max_entries); + free(val); + } + + /* init default c->ifspec[] */ + FOR_EACH_CONTAINER(c) { + if (c->num_ifspecs == 0) { + const char *br = OPT_STRING_VAL(MICOFORIA, DEFAULT_BRIDGE); + c->num_ifspecs = 1; + c->ifspec = xmalloc(sizeof(struct ifspec)); + c->ifspec[0].bridge = xstrdup(br); + memset(c->ifspec[0].hwaddr, 0, 6); + continue; + } + } +} + +static void show_subcommand_summary(bool verbose) +{ + int i; + +#define LSG_MICOFORIA_CMD(_name) #_name + static const char * const subcommand_names[] = {LSG_MICOFORIA_SUBCOMMANDS NULL}; +#undef LSG_MICOFORIA_CMD + printf("Available subcommands:\n"); + if (verbose) { + const struct lls_command *cmd; + for (i = 1; (cmd = lls_cmd(i, micoforia_suite)); i++) { + const char *purpose = 
lls_purpose(cmd); + const char *name = lls_command_name(cmd); + printf("%-12s%s\n", name, purpose); + } + } else { + unsigned n = 8; + printf("\t"); + for (i = 0; i < LSG_NUM_MICOFORIA_SUBCOMMANDS; i++) { + if (i > 0) + n += printf(", "); + if (n > 70) { + printf("\n\t"); + n = 8; + } + n += printf("%s", subcommand_names[i]); + } + printf("\n"); + } +} + +const char *GET_VERSION(void); +static void handle_version_and_help(void) +{ + char *help; + + if (OPT_GIVEN(MICOFORIA, VERSION)) { + printf(PACKAGE " %s\n" + "Copyright (C) " COPYRIGHT_YEAR " " AUTHOR ".\n" + "License: " LICENSE " <" LICENSE_URL ">.\n" + "This is free software: you are free to change and redistribute it.\n" + "There is NO WARRANTY, to the extent permitted by law.\n" + "\n" + "Web page: " URL "\n" + "Clone URL: " CLONE_URL "\n" + "Gitweb: " GITWEB_URL "\n" + "Author's Home Page: " HOME_URL "\n" + "Send feedback to: " AUTHOR " <" EMAIL ">\n" + , + GET_VERSION() + ); + exit(EXIT_SUCCESS); + } + if (OPT_GIVEN(MICOFORIA, DETAILED_HELP)) + help = lls_long_help(CMD_PTR(MICOFORIA)); + else if (OPT_GIVEN(MICOFORIA, HELP)) + help = lls_short_help(CMD_PTR(MICOFORIA)); + else if (lls_num_inputs(lpr) == 0) { + show_subcommand_summary(true /* verbose */); + exit(EXIT_SUCCESS); + } else + return; + printf("%s\n", help); + free(help); + exit(EXIT_SUCCESS); +} + +static char *get_config_file_path(void) +{ + struct passwd *pw; + const char *home; + + if (OPT_GIVEN(MICOFORIA, CONFIG_FILE)) + return xstrdup(OPT_STRING_VAL(MICOFORIA, CONFIG_FILE)); + pw = getpwuid(getuid()); + home = pw? 
pw->pw_dir : "/root"; + return msg("%s/.micoforiarc", home); +} + +static void parse_options(int argc, char **argv, const struct lls_command *cmd, + struct lls_parse_result **lprp) +{ + int ret, fd = -1; + char *config_file; + struct stat statbuf; + void *map; + size_t sz; + int cf_argc; + char **cf_argv, *errctx = NULL; + const char *subcmd_name; + struct lls_parse_result *merged_lpr, *cf_lpr; + + ret = lls_parse(argc, argv, cmd, lprp, &errctx); + if (ret < 0) + die_lopsub(ret, &errctx); + handle_version_and_help(); + clo_given_counter[CLOGC_DEFAULT_CGROUP_DAC] = OPT_GIVEN(MICOFORIA, + DEFAULT_CGROUP_DAC); + clo_given_counter[CLOGC_CGROUP_DAC] = OPT_GIVEN(MICOFORIA, CGROUP_DAC); + clo_given_counter[CLOGC_DEFAULT_IO_MAX] = + OPT_GIVEN(MICOFORIA, DEFAULT_IO_MAX); + clo_given_counter[CLOGC_IO_MAX] = OPT_GIVEN(MICOFORIA, IO_MAX); + config_file = get_config_file_path(); + ret = open(config_file, O_RDONLY); + if (ret < 0) { + if (errno != ENOENT || OPT_GIVEN(MICOFORIA, CONFIG_FILE)) + die_errno("can not open config file %s", config_file); + /* no config file -- nothing to do */ + ret = 0; + goto success; + } + fd = ret; + ret = fstat(fd, &statbuf); + if (ret < 0) + die_errno("failed to stat config file %s", config_file); + sz = statbuf.st_size; + if (sz == 0) { /* config file is empty -- nothing to do */ + ret = 0; + goto success; + } + map = mmap(NULL, sz, PROT_READ, MAP_PRIVATE, fd, 0); + if (map == MAP_FAILED) + die_errno("failed to mmap config file %s", config_file); + subcmd_name = (cmd == CMD_PTR(MICOFORIA))? 
NULL : lls_command_name(cmd); + ret = lls_convert_config(map, sz, subcmd_name, &cf_argv, + &errctx); + munmap(map, sz); + if (ret < 0) { + ERROR_LOG("failed to convert config file %s\n", config_file); + die_lopsub(ret, &errctx); + } + cf_argc = ret; + ret = lls_parse(cf_argc, cf_argv, cmd, &cf_lpr, &errctx); + lls_free_argv(cf_argv); + if (ret < 0) + die_lopsub(ret, &errctx); + /* command line options override config file options */ + ret = lls_merge(*lprp, cf_lpr, cmd, &merged_lpr, &errctx); + if (ret < 0) + die_lopsub(ret, &errctx); + lls_free_parse_result(cf_lpr, cmd); + lls_free_parse_result(*lprp, cmd); + *lprp = merged_lpr; +success: + if (fd >= 0) + close(fd); + free(config_file); +} + +static const char *get_pre_start_hook(const struct container *c) +{ + if (c->pre_start_hook) + return c->pre_start_hook; + return OPT_STRING_VAL(MICOFORIA, DEFAULT_PRE_START_HOOK); +} + +static const char *get_pre_exec_hook(const struct container *c) +{ + if (c->pre_exec_hook) + return c->pre_exec_hook; + return OPT_STRING_VAL(MICOFORIA, DEFAULT_PRE_EXEC_HOOK); +} + +static char *get_root_dir(const struct container *c) +{ + if (c->root_dir) + return xstrdup(c->root_dir); + return msg("%s/%s", OPT_STRING_VAL(MICOFORIA, DEFAULT_ROOT_PREFIX), c->name); +} + +static char *get_ifspec_string(const struct container *c) +{ + unsigned n; + char *str = NULL; + + assert(c->num_ifspecs > 0); + for (n = 0; n < c->num_ifspecs; n++) { + uint8_t *x = c->ifspec[n].hwaddr; + char *tmp = msg("%s%s%s:%02x:%02x:%02x:%02x:%02x:%02x", + str? str : "", + str? " " : "", + c->ifspec[n].bridge, + x[0], x[1], x[2], x[3], x[4], x[5] + ); + free(str); + str = tmp; + } + return str; +} + +static char *interface_name(const struct container *c, unsigned idx, bool peer) +{ + assert(idx < c->num_ifspecs); + if (c->num_ifspecs == 1) + return peer? 
msg("%s-g", c->name) : xstrdup(c->name); + if (peer) + return msg("%s-%s-g", c->name, c->ifspec[idx].bridge); + return msg("%s-%s", c->name, c->ifspec[idx].bridge); +} + +static void set_m7a_root_dir_env(const struct container *c) +{ + char *root = get_root_dir(c); + DEBUG_LOG("root dir: %s\n", root); + setenv("MICOFORIA_ROOT_DIR", root, 1); + free(root); +} + +static bool run_pre_start_hook(const struct container *c) +{ + char *ifspec; + char *cmd = xstrdup(get_pre_start_hook(c)); + char *argv[] = {"/bin/sh", "-c", cmd, NULL}; + bool success; + + setenv("MICOFORIA_CONTAINER_NAME", c->name, 1); + set_m7a_root_dir_env(c); + + ifspec = get_ifspec_string(c); + DEBUG_LOG("ifspecs: %s\n", ifspec); + setenv("MICOFORIA_IFSPECS", ifspec, 1); + free(ifspec); + + INFO_LOG("running pre-start hook %s\n", cmd); + success = xexec(argv, NULL); + free(cmd); + if (!success) + ERROR_LOG("pre-start hook failed\n"); + unsetenv("MICOFORIA_CONTAINER_NAME"); + unsetenv("MICOFORIA_IFSPECS"); + unsetenv("MICOFORIA_ROOT_DIR"); + return success; +} + +static void run_pre_exec_hook(const struct container *c) +{ + char *cmd = xstrdup(get_pre_exec_hook(c)); + char *argv[] = {"/bin/sh", "-c", cmd, NULL}; + + INFO_LOG("/bin/sh -c '%s'\n", cmd); + set_m7a_root_dir_env(c); + if (!xexec(argv, NULL)) + die("%s: pre-exec hook failed", c->name); + free(cmd); + unsetenv("MICOFORIA_ROOT_DIR"); +} + +static void write_cgroup(const char *path, const char *txt) +{ + int fd; + size_t sz; + + if ((fd = open(path, O_WRONLY)) < 0) + die_errno("open %s", path); + sz = strlen(txt); + if (write(fd, txt, sz) != sz) + die_errno("could not write to %s", path); + close(fd); +} + +static unsigned get_dacl(const struct container *c, char ***result) +{ + static char *dflt[] = { + "da", /* deny access to all devices except the ones below */ + "ac 1:3 rwm", /* null */ + "ac 1:5 rwm", /* zero */ + "ac 1:7 rwm", /* full */ + "ac 1:8 rwm", /* random */ + "ac 1:9 rwm", /* urandom */ + "ac 4:* rwm", /* tty?* */ + "ac 5:0 rwm", 
/* tty */ + "ac 5:2 rwm", /* ptmx */ + "ac 136:* rwm", /* pts */ + }; + if (c->num_dac_entries > 0) { + *result = c->dacl; + return c->num_dac_entries; + } + if (num_default_dac_entries > 0) { + *result = default_dacl; + return num_default_dac_entries; + } + *result = dflt; + return ARRAY_SIZE(dflt); +} + +static void apply_dacl(const struct container *c) +{ + char **dacl; + unsigned n, num_entries; + char *m7a_dir, *container_dir, *allow, *deny, *procs, *txt; + int fd, allow_fd, deny_fd; + size_t sz; + + m7a_dir = msg("/var/cgroup/micoforia"); + container_dir = msg("%s/%s", m7a_dir, c->name); + allow = msg("%s/devices.allow", container_dir); + deny = msg("%s/devices.deny", container_dir); + procs = msg("%s/cgroup.procs", container_dir); + + if (mkdir(m7a_dir, 0777) < 0 && errno != EEXIST) + die_errno("mkdir %s", m7a_dir); + free(m7a_dir); + if (mkdir(container_dir, 0777) < 0 && errno != EEXIST) + die_errno("mkdir %s", container_dir); + free(container_dir); + if ((allow_fd = open(allow, O_WRONLY)) < 0) + die_errno("open %s", allow); + free(allow); + if ((deny_fd = open(deny, O_WRONLY)) < 0) + die_errno("open %s", deny); + free(deny); + + num_entries = get_dacl(c, &dacl); + INFO_LOG("applying %u entr%s\n", num_entries, num_entries == 1? + "y" : "ies"); + for (n = 0; n < num_entries; n++) { + char *entry = dacl[n]; + DEBUG_LOG("dac entry #%u: %s %s\n", n, dacl[n][0] == 'a'? + "allow" : "deny", dacl[n] + 1); + txt = msg("%s\n", entry + 1); + sz = strlen(txt); + fd = entry[0] == 'a'? allow_fd : deny_fd; + if (write(fd, txt, sz) != sz) + die_errno("could not write to cgroup devices.%s file", + entry[0] == 'a'? 
"allow" : "deny"); + free(txt); + } + close(allow_fd); + close(deny_fd); + txt = msg("%u\n", (unsigned)getpid()); + write_cgroup(procs, txt); + free(txt); +} + +static void cgroup_init(void) +{ + const char controllers[] = "+cpu +memory +io\n"; + char *m7a_dir, *ctl; + + if (access("/var/cgroup/cgroup.clone_children", F_OK) < 0) + die("cgroup v1 not mounted at /var/cgroup/"); + if (access("/var/cgroup2/cgroup.subtree_control", F_OK) < 0) + die("cgroup v1 not mounted at /var/cgroup/"); + write_cgroup("/var/cgroup2/cgroup.subtree_control", controllers); + m7a_dir = msg("/var/cgroup2/micoforia"); + if (mkdir(m7a_dir, 0777) < 0 && errno != EEXIST) + die_errno("mkdir %s", m7a_dir); + ctl = msg("%s/cgroup.subtree_control", m7a_dir); + free(m7a_dir); + write_cgroup(ctl, controllers); + free(ctl); +} + +static void create_cgroup_v2(const struct container *c) +{ + char buf[10]; + char *ctl, *dir = msg("/var/cgroup2/micoforia/%s", c->name); + + if (mkdir(dir, 0777) < 0 && errno != EEXIST) + die_errno("mkdir %s", dir); + ctl = msg("%s/cgroup.procs", dir); + free(dir); + sprintf(buf, "%u\n", (unsigned)getpid()); + write_cgroup(ctl, buf); + free(ctl); +} + +static unsigned get_cpu_cores(const struct container *c) +{ + return c->cpu_cores != ~0U? c->cpu_cores : + OPT_UINT32_VAL(MICOFORIA, DEFAULT_CPU_CORES); +} + +static void apply_cpu_limit(const struct container *c) +{ + char *str, *ctl; + unsigned cores = get_cpu_cores(c); + + if (cores == 0) /* unlimited */ + return; + assert(cores != ~0U); + INFO_LOG("%u core%s\n", cores, cores == 1? "" : "s"); + ctl = msg("/var/cgroup2/micoforia/%s/cpu.max", c->name); + str = msg("%u 1000000\n", 1000000 * cores); + write_cgroup(ctl, str); + free(ctl); + free(str); +} + +static unsigned get_memory_limit(const struct container *c) +{ + return c->memory_limit != ~0U? 
c->memory_limit : + OPT_UINT32_VAL(MICOFORIA, DEFAULT_MEMORY_LIMIT); +} + +static void apply_memory_limit(const struct container *c) +{ + char *str, *ctl; + unsigned gigs = get_memory_limit(c); + + if (gigs == 0) /* unlimited */ + return; + assert(gigs != ~0U); + INFO_LOG("%uG\n", gigs); + ctl = msg("/var/cgroup2/micoforia/%s/memory.high", c->name); + str = msg("%llu\n", 1024LLU * 1024LLU * 1024LLU * gigs); + write_cgroup(ctl, str); + free(ctl); + free(str); +} + +static unsigned get_iospecs(const struct container *c, char ***result) +{ + if (c->num_io_max_entries > 0) { + *result = c->dacl; + return c->num_io_max_entries; + } + if (num_default_io_max_entries > 0) { + *result = default_io_max; + return num_default_io_max_entries; + } + *result = NULL; + return 0; +} + +static void apply_io_limit(const struct container *c) +{ + unsigned n, num_entries; + char *io_max; + char **iospec; + + num_entries = get_iospecs(c, &iospec); + if (num_entries == 0) + return; + INFO_LOG("%u entries\n", num_entries); + io_max = msg("/var/cgroup2/micoforia/%s/io.max", c->name); + for (n = 0; n < num_entries; n++) + write_cgroup(io_max, iospec[n]); + free(io_max); +} + +static void cgroup_cleanup(const struct container *c) +{ + char *dir = msg("/var/cgroup/micoforia/%s", c->name); + remove_subdirs_recursively(dir); + free(dir); + dir = msg("/var/cgroup2/micoforia/%s", c->name); + remove_subdirs_recursively(dir); + free(dir); +} + +static bool setup_network(const struct container *c) +{ + unsigned n; + char *iface, *peer; + + if (!link_up("lo")) + WARNING_LOG("could not set establish loopback link\n"); + for (n = 0; n < c->num_ifspecs; n++) { + iface = interface_name(c, n, false); + peer = interface_name(c, n, true); + link_del(iface); /* ignore errors */ + if (!create_veth_device_pair(iface, peer)) + goto fail; + if (!set_hwaddr(peer, c->ifspec[n].hwaddr)) + goto fail; + if (!attach_to_bridge(iface, c->ifspec[n].bridge)) + goto fail; + if (!link_up(iface)) + goto fail; + free(iface); 
+ free(peer); + } + return true; +fail: + free(iface); + free(peer); + return false; +} + +static void setup_termios(int fd) +{ + struct winsize wsz; /* see ioctl_tty(2) */ + struct termios tios; + + if (!isatty(fd)) + return; + if (tcgetattr(fd, &tios)) { + ERROR_LOG("tcgetattr: %m\n"); + return; + } + tios.c_lflag &= ~(ECHO | ISIG | ICANON); + tios.c_cc[VMIN] = 1; + tios.c_cc[VTIME] = 0; + if (tcsetattr(fd, TCSAFLUSH, &tios) < 0) + ERROR_LOG("tcsetattr: %m\n"); + if (ioctl(STDIN_FILENO, TIOCGWINSZ, &wsz) >= 0) + ioctl(fd, TIOCSWINSZ, &wsz); +} + +struct device_node_info { + unsigned major, minor; + mode_t mode; + const char *name; +}; + +static void create_standard_device_nodes(struct container_runtime *cr) +{ + const struct device_node_info devices[] = { + {.major = 1, .minor = 3, .mode = 0666, .name = "null"}, + {.major = 1, .minor = 5, .mode = 0666, .name = "zero"}, + {.major = 1, .minor = 7, .mode = 0666, .name = "full"}, + {.major = 1, .minor = 8, .mode = 0666, .name = "random"}, + {.major = 1, .minor = 9, .mode = 0666, .name = "urandom"}, + {.major = 4, .minor = 0, .mode = 0620, .name = "tty0"}, + {.major = 5, .minor = 1, .mode = 0600, .name = "console"}, + {.major = 5, .minor = 2, .mode = 0666, .name = "ptmx"}, + }; + unsigned n; + + for (n = 0; n < ARRAY_SIZE(devices); n++) { + const struct device_node_info *d = devices + n; + char *path = msg("%s/%s", cr->dev, d->name); + if (mknod(path, S_IFCHR, makedev(d->major, d->minor)) < 0) + die_errno("mknod %s", d->name); + chmod(path, d->mode); + free(path); + } +} + +static void init_console(struct container_runtime *cr) +{ + char *console; + unsigned n; + + if (mount(NULL, cr->dev, "tmpfs", 0, "size=500000,mode=755") < 0) + die("mount tmpfs at %s: %m", cr->dev); + create_standard_device_nodes(cr); + for (n = 0; n < cr->num_ttys; n++) { + char *tty = msg("%s/tty%u", cr->dev, cr->tty[n]); + unlink(tty); + if (mknod(tty, S_IFCHR, makedev(4, cr->tty[n])) < 0) + die("mknod %s: %m", tty); + chmod(tty, 0660); + 
setup_termios(cr->slave[n]); + INFO_LOG("bind mounting %s -> %s\n", ttyname(cr->slave[n]), tty); + if (mount(ttyname(cr->slave[n]), tty, "none", + MS_BIND | MS_PRIVATE, NULL) < 0) + die("failed to bind mount %s: %m\n", tty); + free(tty); + } + console = msg("%s/console", cr->dev); + if (mount(ttyname(cr->slave[0]), console, "none", + MS_BIND | MS_PRIVATE, NULL) < 0) + die("failed to bind mount %s: %m\n", console); + free(console); +} + +/* + * These umounts fail if the container shutdown already umounted the bind + * mounted devices. This is not fatal, so log only with low severity. + */ +static void shutdown_console(struct container_runtime *cr) +{ + unsigned n; + char *console; + + for (n = 0; n < cr->num_ttys; n++) { + char *tty = msg("%s/tty1", cr->dev); + if (umount2(tty, MNT_DETACH) < 0) + DEBUG_LOG("umount %s: %m\n", tty); + free(tty); + } + console = msg("%s/console", cr->dev); + if (umount2(console, MNT_DETACH) < 0) + DEBUG_LOG("umount %s: %m\n", console); + free(console); +} + +static char *get_socket_path(const char *container_name) +{ + return msg("micoforia/%s", container_name); +} + +/* Ignore everything the client sends us, but invalidate the fd on EOF. 
*/ +static void dispatch_client(int *client) +{ + char buf[1024]; + if (read(*client, buf, sizeof(buf)) <= 0) { + NOTICE_LOG("detaching client on fd %d\n", *client); + close(*client); + *client = -1; + } +} + +static void dispatch_socket_request(struct container_runtime *cr) +{ + uid_t uid; + char buf[32]; + int cfd; + uint32_t minor; + unsigned n; + bool force; + + memset(buf, 0, sizeof(buf)); + if (!recv_cred_buffer(cr->socket_fd, buf, sizeof(buf) - 1, &cfd, &uid)) + return; + if (uid != getuid()) { + const char msg[] = "\1EACCES"; + send(cfd, msg, sizeof(msg), MSG_DONTWAIT); + NOTICE_LOG("access denied for uid %d\n", (int)uid); + goto out; + } + if (strcmp(buf, "init_pid") == 0) { + buf[0] = '\0'; + memcpy(buf + 1, &cr->init_pid, sizeof(int)); + send(cfd, buf, 1 + sizeof(int), MSG_DONTWAIT); + goto out; + } + if (sscanf(buf, "attach %u", &minor) == 1) { + force = false; + } else if (sscanf(buf, "force-attach %u", &minor) == 1) { + force = true; + } else { + const char msg[] = "\1EINVAL"; + send(cfd, msg, sizeof(msg), MSG_DONTWAIT); + NOTICE_LOG("invalid request: %s\n", buf); + goto out; + } + for (n = 0; n < cr->num_ttys; n++) { + INFO_LOG("n: %u, tty[n]: %u\n", n, cr->tty[n]); + if (cr->tty[n] == minor) + break; + } + if (n == cr->num_ttys) { + const char msg[] = "\1ENOTTY"; + send(cfd, msg, sizeof(msg), MSG_DONTWAIT); + NOTICE_LOG("tty%u is not being forwarded\n", minor); + goto out; + } + if (cr->client[n] >= 0) { + if (force) { + close(cr->client[n]); + cr->client[n] = -1; + } else { + const char msg[] = "\1EBUSY"; + send(cfd, msg, sizeof(msg), MSG_DONTWAIT); + ERROR_LOG("tty%u is already in use\n", minor); + goto out; + } + } + if (!pass_fd(cr->master[n], cfd)) { + ERROR_LOG("could not pass master fd\n"); + goto out; + } + NOTICE_LOG("attached client on fd %d to tty%u\n", cfd, minor); + cr->client[n] = cfd; + return; +out: + close(cfd); +} + +/* discards read data if dst < 0 */ +static bool copy(int src, int dst) +{ + ssize_t sz1, sz2; + char buf[1024]; 
+again: + sz1 = read(src, buf, sizeof(buf)); + if (sz1 < 0) { + if (errno == EINTR) + goto again; + DEBUG_LOG("read from fd %d: %m\n", src); + } + if (sz1 <= 0) + return false; + if (dst < 0) + return true; + sz2 = write(dst, buf, sz1); + if (sz2 < 0) { + DEBUG_LOG("write to fd %d: %m\n", dst); + return false; + } + if (sz1 != sz2) { + DEBUG_LOG("short write to fd %d\n", dst); + return false; + } + return true; +} + +/* + * The function returns only when the process receives SIGCHLD. In this case + * the return value is 0 for success, 1 for failure, and 2 if the child's exit + * code indicates a reboot request. Other signals are pushed down to the child + * process. + */ +static int parent_loop(pid_t pid, const struct container *c, + struct container_runtime *cr) +{ + unsigned n; + + init_signal_handling(); + for (;;) { + int sig, max_fileno = 0; + fd_set fds; + + FD_ZERO(&fds); + if (OPT_GIVEN(START, FOREGROUND)) { + FD_SET(STDIN_FILENO, &fds); + if (STDIN_FILENO > max_fileno) + max_fileno = STDIN_FILENO; + } + FD_SET(signal_pipe[0], &fds); + if (signal_pipe[0] > max_fileno) + max_fileno = signal_pipe[0]; + FD_SET(cr->socket_fd, &fds); + if (cr->socket_fd > max_fileno) + max_fileno = cr->socket_fd; + for (n = 0; n < cr->num_ttys; n++) { + if (cr->client[n] >= 0) { /* detached */ + FD_SET(cr->client[n], &fds); + if (cr->client[n] > max_fileno) + max_fileno = cr->client[n]; + } else { + FD_SET(cr->master[n], &fds); + if (cr->master[n] > max_fileno) + max_fileno = cr->master[n]; + } + } + if (select(max_fileno + 1, &fds, NULL, NULL, NULL) < 0) { + if (errno != EINTR) + ERROR_LOG("select: %m\n"); + continue; + } + do { + if (!FD_ISSET(signal_pipe[0], &fds)) + break; + sig = next_signal(); + if (sig == SIGCHLD) { + int wstatus; + if (waitpid(pid, &wstatus, WNOHANG) < 0) { + WARNING_LOG("wait: %m\n"); + break; + } + cgroup_cleanup(c); + if (!WIFEXITED(wstatus)) + return 1; + if (WEXITSTATUS(wstatus) == 2) + return 2; + return WEXITSTATUS(wstatus) != EXIT_SUCCESS; + } + 
kill(pid, sig); + } while (0); + if (FD_ISSET(cr->socket_fd, &fds)) + dispatch_socket_request(cr); + for (n = 0; n < cr->num_ttys; n++) { + if (cr->client[n] >= 0) { + if FD_ISSET(cr->client[n], &fds) + dispatch_client(cr->client + n); + } else { /* stdout is /dev/null in background mode */ + if (FD_ISSET(cr->master[n], &fds)) + copy(cr->master[n], n == 0? + STDOUT_FILENO : -1); + } + } + if (OPT_GIVEN(START, FOREGROUND)) { + if (FD_ISSET(STDIN_FILENO, &fds)) + copy(STDIN_FILENO, cr->master[0]); + } + } +} + +/* Set net namespace of child and call parent_loop(). */ +static int run_parent(pid_t child_pid, const struct container *c, + struct container_runtime *cr) +{ + unsigned n; + bool success; + + close(cr->pipe1[1]); + close(cr->pipe2[0]); + if (read(cr->pipe1[0], &cr->init_pid, 4) != 4) { + ERROR_LOG("pipe1 read error\n"); + close(cr->pipe1[0]); + close(cr->pipe2[1]); + return false; + } + INFO_LOG("received grand child pid: %u\n", (unsigned)cr->init_pid); + close(cr->pipe1[0]); + for (n = 0; n < c->num_ifspecs; n++) { + char *peer = interface_name(c, n, true); + success = set_netns(peer, child_pid); + free(peer); + if (!success) { + ERROR_LOG("set_netns error\n"); + close(cr->pipe2[1]); + return false; + } + } + success = write(cr->pipe2[1], "\0", 1) == 1; + close(cr->pipe2[1]); + if (!success) { + ERROR_LOG("pipe2 write error\n"); + return false; + } + return parent_loop(child_pid, c, cr); +} + +static unsigned get_capdrops(const struct container *c, cap_value_t **result) +{ + static cap_value_t builtin_capdrop[] = {CAP_SYS_MODULE, CAP_SYS_TIME, + CAP_SYS_RESOURCE}; + + if (c->capdrop) { + *result = c->capdrop; + return c->num_capdrops; + } + if (OPT_GIVEN(MICOFORIA, DEFAULT_CAPDROP)) { + *result = default_capdrop; + return num_default_capdrops; + } + *result = builtin_capdrop; + return ARRAY_SIZE(builtin_capdrop); +} + +static void drop_caps(const struct container *c) +{ + cap_value_t *capdrop; + unsigned n, num_capdrops; + + INFO_LOG("lowering bounding set 
capabilities\n"); + num_capdrops = get_capdrops(c, &capdrop); + for (n = 0; n < num_capdrops; n++) { + char *name = cap_to_name(capdrop[n]); + DEBUG_LOG("dropping %s\n", name); + cap_free(name); + if (cap_drop_bound(capdrop[n]) < 0) + die_errno("cap_drop_bound"); + } +} + +__attribute ((noreturn)) +static void child_loop(pid_t pid, struct container_runtime *cr) +{ + int wstatus; + + INFO_LOG("parent: %u, child: %u, init: %u\n", (unsigned) getppid(), + (unsigned)getpid(), (unsigned)pid); + init_signal_handling(); + setsid(); + + for (;;) { + int max_fileno = 0; + fd_set fds; + + FD_ZERO(&fds); + FD_SET(signal_pipe[0], &fds); + if (signal_pipe[0] > max_fileno) + max_fileno = signal_pipe[0]; + if (select(max_fileno + 1, &fds, NULL, NULL, NULL) < 0) { + if (errno != EINTR) + ERROR_LOG("select: %m\n"); + continue; + } + do { if (FD_ISSET(signal_pipe[0], &fds)) { + int sig = next_signal(); + if (sig == SIGCHLD) { + if (waitpid(pid, &wstatus, WNOHANG) < 0) { + WARNING_LOG("wait: %m\n"); + break; + } + shutdown_console(cr); + if (WIFSIGNALED(wstatus) && + WTERMSIG(wstatus) == 1) { + NOTICE_LOG("reboot requested\n"); + exit(2); + } + NOTICE_LOG("container terminated\n"); + exit(EXIT_SUCCESS); + } + NOTICE_LOG("sending signal %d to container init\n", + sig); + kill(pid, sig == SIGINT? SIGINT : SIGKILL); + }} while(0); + } +} + +static const char *get_init_path(const struct container *c) +{ + return c->init? c->init : OPT_STRING_VAL(MICOFORIA, DEFAULT_INIT); +} + +/* + * The child process unshares namespaces, spawns the init process which runs + * the pre-exec hook and executes the container init process. This function + * never returns, but both the child and the init process exit when the + * container terminates. The exit code of the child tells the parent whether + * it should restart the container. 
+ */ +__attribute ((noreturn)) +static void run_child(const struct container *c, struct container_runtime *cr) +{ + unsigned n; + char *init, *put_old; + char ch; + pid_t pid; + + close(cr->socket_fd); + for (n = 0; n < cr->num_ttys; n++) + close(cr->master[n]); + close(cr->pipe1[0]); + close(cr->pipe2[1]); + if (unshare(CLONE_NEWNET) < 0) + die_errno("unshare net ns\n"); + if (unshare(CLONE_NEWPID) < 0) + die_errno("unshare pid ns\n"); + /* fork again to become pid 1 in the new pid namespace */ + if ((pid = fork()) < 0) + die_errno("fork"); + /* + * By writing to pipe1 we tell the parent (a) we've unshared the net + * namespace, and (b) the pid of the init process in the parent + * namespace. + */ + if (pid > 0) { + close(cr->pipe2[0]); + if (write(cr->pipe1[1], (const char *)&pid, 4) != 4) + die_errno("pipe write error"); + close(cr->pipe1[1]); + child_loop(pid, cr); /* never returns */ + } + pid = getpid(); + DEBUG_LOG("now running as pid %d\n", pid); + if (read(cr->pipe2[0], &ch, 1) != 1) + die_errno("pipe read error"); + close(cr->pipe1[1]); + close(cr->pipe2[0]); + if (unshare(CLONE_NEWNS | CLONE_NEWIPC | CLONE_NEWUTS) < 0) + die_errno("unshare"); + mkdir(cr->dev, 0777); + init_console(cr); + for (n = 0; n < cr->num_ttys; n++) + close(cr->slave[n]); + INFO_LOG("setting hostname to %s\n", c->name); + if (sethostname(c->name, strlen(c->name)) < 0) + die_errno("sethostname error"); + if (chdir(cr->root) < 0) + die_errno("chdir %s", cr->root); + drop_caps(c); + apply_dacl(c); + apply_cpu_limit(c); + apply_memory_limit(c); + apply_io_limit(c); + for (n = 0; n < c->num_ifspecs; n++) { + char *peer = interface_name(c, n, true); + char *renamed = msg("eth%u", n); + if (!rename_interface(peer, renamed)) + die("can not rename %s to %s\n", peer, renamed); + free(peer); + free(renamed); + } + run_pre_exec_hook(c); + setup_termios(STDIN_FILENO); + put_old = msg("%s/mnt", cr->root); + /* glibc does not provide a wrapper for pivot_root */ + if (syscall(SYS_pivot_root, ".", 
put_old) < 0) + die_errno("pivot_root (put_old: %s)", put_old); + if (umount2("/mnt", MNT_DETACH) < 0) + die_errno("umount %s", put_old); + free(put_old); + close(STDIN_FILENO); + init = xstrdup(get_init_path(c)); + INFO_LOG("handing over control to container init: %s\n", init); + execve(init, (char *[]){init, NULL}, NULL); + die_errno("failed to exec init process %s", c->init); +} + +/* + * We need three processes, called parent, child, init, because we want one + * process run with namespaces unmodified, requiring one fork. After the child + * has unshared its PID namespace, it keeps its old PID, so we need to fork + * again to get pid 1. The child can not terminate because the parent can not + * wait(2) on its grandchild. + */ +static bool exec_container(const struct container *c) +{ + bool success; + pid_t pid; + unsigned n; + struct container_runtime cr = {0}; + char *socket_path; + int ret; + + create_cgroup_v2(c); + socket_path = get_socket_path(c->name); + success = listen_on_unix_socket(socket_path, &cr.socket_fd); + if (!success) + ERROR_LOG("can not listen on unix socket %s\n", socket_path); + free(socket_path); + if (!success) + return 1; + cr.root = get_root_dir(c); + cr.dev = msg("%s/dev", cr.root); + cr.pts = realpath("/proc/self/fd/0", NULL); + DEBUG_LOG("pts: %s\n", cr.pts); + cr.num_ttys = get_container_ttys(c, &cr.tty); + cr.master = xmalloc(cr.num_ttys * sizeof(int)); + cr.slave = xmalloc(cr.num_ttys * sizeof(int)); + cr.client = xmalloc(cr.num_ttys * sizeof(int)); + for (n = 0; n < cr.num_ttys; n++) + cr.client[n] = -1; +reboot: + NOTICE_LOG("starting %s\n", c->name); + for (n = 0; n < cr.num_ttys; n++) { + if (openpty(cr.master + n, cr.slave + n, NULL, NULL, NULL) < 0) + die("openpty: %m"); + DEBUG_LOG("pty (tty%u <-> %s)\n", n, ttyname(cr.slave[n])); + } + /* mount rw, ignore errors */ + mount(NULL, cr.root, NULL, MS_REMOUNT, NULL); + if (!setup_network(c)) + return false; + if (!run_pre_start_hook(c)) + return false; + if (pipe(cr.pipe1) < 
0) /* child -> parent */ + die_errno("pipe1"); + if (pipe(cr.pipe2) < 0) + die_errno("pipe2"); /* parent -> child */ + if ((pid = fork()) < 0) + die_errno("fork"); + if (pid == 0) + run_child(c, &cr); /* never returns */ + ret = run_parent(pid, c, &cr); + if (ret != 2) + return ret == 0; + NOTICE_LOG("rebooting\n"); + for (n = 0; n < cr.num_ttys; n++) { + close(cr.master[n]); + close(cr.slave[n]); + } + goto reboot; +} + +static char *get_container_logfile(const char *name) +{ + return msg("%s/%s", OPT_STRING_VAL(MICOFORIA, LOGDIR), name); +} + +static bool start_container(const struct container *c) +{ + pid_t pid; + char *logfile; + struct termios tios; + bool success; + + if (is_locked(c->name, &pid)) { + ERROR_LOG("%s is locked by pid %u\n", c->name, (unsigned)pid); + return false; + } + if (OPT_GIVEN(START, FOREGROUND)) { + if (!isatty(STDIN_FILENO) || !isatty(STDOUT_FILENO)) { + ERROR_LOG("both stdin and stdout must be terminals\n"); + return false; + } + if (tcgetattr(STDIN_FILENO, &tios) < 0) { + ERROR_LOG("tcgetattr: %m\n"); + return false; + } + } else { + if ((pid = fork()) < 0) + die_errno("fork"); + if (pid > 0) + return true; + logfile = get_container_logfile(c->name); + daemonize(logfile); + free(logfile); + } + if (!try_lock(c->name, &pid)) + die("%s is locked by pid %u", c->name, (unsigned)pid); + success = exec_container(c); + if (OPT_GIVEN(START, FOREGROUND)) { + if (tcsetattr(STDIN_FILENO, TCSAFLUSH, &tios) < 0) + ERROR_LOG("tcsetattr: %m\n"); + } + exit(success? 
EXIT_SUCCESS : EXIT_FAILURE); +} + +static void check_container_args(void) +{ + unsigned n, num_inputs; + struct container *c; + + num_inputs = lls_num_inputs(sublpr); + if (num_inputs == 0) { + if (num_containers == 0) + die("no container configured\n"); + if (OPT_GIVEN(START, FOREGROUND) && num_containers > 1) + die("must specify container for foreground mode"); + } else { + if (OPT_GIVEN(START, FOREGROUND) && num_inputs > 1) + die("can start only one container in foreground mode"); + for (n = 0; n < num_inputs; n++) { + const char *name = lls_input(n, sublpr); + c = get_container(name); + if (!c) + die("container not configured: %s", name); + } + } +} + +struct container_arg_iter { + unsigned idx; +}; + +#define INITIALIZED_CAI(_cai) {.idx = 0} + +static struct container *cai_next(struct container_arg_iter *cai, bool *skipped) +{ + unsigned num_inputs = lls_num_inputs(sublpr); + + if (skipped) + *skipped = false; + if (num_inputs == 0) { + if (cai->idx >= num_containers) + return NULL; + return container[cai->idx++]; + } + for (; cai->idx < num_inputs; cai->idx++) { + const char *name = lls_input(cai->idx, sublpr); + struct container *c = get_container(name); + if (!c) { + ERROR_LOG("%s: not configured\n", name); + if (skipped) + *skipped = true; + continue; + } + cai->idx++; + return c; + } + return NULL; +} + +static bool for_each_container_arg(bool (*f)(const struct container *c)) +{ + struct container *c; + bool success = true; + bool skipped; + struct container_arg_iter cai = INITIALIZED_CAI(cai); + + while ((c = cai_next(&cai, &skipped))) + if (!f(c) || skipped) + success = false; + return success; +} + +static bool com_start(void) +{ + const char *logdir = OPT_STRING_VAL(MICOFORIA, LOGDIR); + + check_container_args(); + if (logdir[0] == '\0') + die_empty_arg("loggir"); + cgroup_init(); + if (mkdir(logdir, 0777) < 0 && errno != EEXIST) + die_errno("mkdir %s", logdir); + return for_each_container_arg(start_container); +} +EXPORT_CMD_HANDLER(start); + 
+static bool send_signal_to_container(int signum, const struct container *c) +{ + pid_t pid; + bool success; + + if (!is_locked(c->name, &pid)) { + INFO_LOG("%s is not running\n", c->name); + return false; + } + DEBUG_LOG("sending signal %d to pid %u\n", signum, (unsigned)pid); + success = kill(pid, signum) >= 0; + if (!success) + ERROR_LOG("kill %s: %m\n", c->name); + return success; +} + +static void clean_env(void) +{ + char *term = getenv("TERM"); + + clearenv(); + if (term) + setenv("TERM", term, 0); + setenv("PATH", "/root/bin:/usr/local/sbin:/usr/local/bin" + ":/sbin:/usr/sbin:/bin:/usr/bin", 0); + setenv("USER", "root", 0); + setenv("LOGNAME", "root", 0); + setenv("HOME", "/root", 0); +} + +static bool request_init_pid(const char *name, int *result) +{ + char *socket_path = get_socket_path(name); + bool success; + + *result = -1; + success = request_int(socket_path, "init_pid", result); + free(socket_path); + if (!success) + ERROR_LOG("could not determine init pid of %s\n", name); + return success; +} + +static bool shutdown_container(const struct container *c) +{ + pid_t pid; + char str[20]; + char *argv[] = {"nsenter", "-w", "-a", "-r", "-t", str, "halt", NULL}; + + if (!is_locked(c->name, NULL)) { + if (lls_num_inputs(sublpr) == 0) + return true; + ERROR_LOG("container not running: %s\n", c->name); + return false; + } + pid = fork(); + if (pid < 0) + return false; + if (pid > 0) + return true; + if (!request_init_pid(c->name, &pid)) + _exit(EXIT_FAILURE); + sprintf(str, "%d", pid); + clean_env(); + execvp(argv[0], argv); + _exit(EXIT_FAILURE); +} + +static bool container_is_dead(const struct container *c) +{ + return !is_locked(c->name, NULL); +} + +static bool wait_for_containers_to_die(void) +{ + bool success; + unsigned ms = 32; + struct timespec ts; + + while (ms < 20000) { + ts.tv_sec = ms / 1000; + ts.tv_nsec = (ms % 1000) * 1000 * 1000; + if (nanosleep(&ts, NULL) < 0) + return false; + success = for_each_container_arg(container_is_dead); + if 
(success) + return true; + ms *= 2; + } + return false; +} + +static bool com_stop(void) +{ + bool success = for_each_container_arg(shutdown_container); + + if (!success) + return false; + if (!OPT_GIVEN(STOP, WAIT)) + return true; + return wait_for_containers_to_die(); +} +EXPORT_CMD_HANDLER(stop); + +static bool reboot_container(const struct container *c) +{ + return send_signal_to_container(SIGINT, c); +} + +static bool com_reboot(void) +{ + return for_each_container_arg(reboot_container); +} +EXPORT_CMD_HANDLER(reboot); + +static bool kill_container(const struct container *c) +{ + return send_signal_to_container(SIGUSR1, c); +} + +static bool com_kill(void) +{ + bool success = for_each_container_arg(kill_container); + + if (!success) + return false; + if (!OPT_GIVEN(KILL, WAIT)) + return true; + return wait_for_containers_to_die(); +} +EXPORT_CMD_HANDLER(kill); + +static void list_container_verbose(const struct container *c) +{ + char *root; + unsigned n, N; + char **word_list; + cap_value_t *capdrop; + uint32_t *tty; + char cores_str[25] = "unlimited"; + unsigned cores = get_cpu_cores(c); + + printf("%s:\n", c->name); + printf("\tpre-start hook: %s\n", get_pre_start_hook(c)); + printf("\tpre-exec hook: %s\n", get_pre_exec_hook(c)); + root = get_root_dir(c); + printf("\troot dir: %s\n", root); + free(root); + printf("\tinit path: %s\n", get_init_path(c)); + for (n = 0; n < c->num_ifspecs; n++) { + char pretty_hwaddr[18]; + char *iface = interface_name(c, n, false); + pretty_print_hwaddr(c->ifspec[n].hwaddr, pretty_hwaddr); + printf("\tinterface #%u: %s (%s)\n", n, iface, pretty_hwaddr); + free(iface); + } + N = get_dacl(c, &word_list); + for (n = 0; n < N; n++) + printf("\tdac entry #%u: %s %s\n", n, word_list[n][0] == 'a'? 
+ "allow" : "deny", word_list[n] + 1); + N = get_iospecs(c, &word_list); + for (n = 0; n < N; n++) + printf("\tiospec #%u: %s\n", n, word_list[n]); + if (cores > 0) + sprintf(cores_str, "%u", cores); + printf("\tCPU core limit: %s\n", cores_str); + printf("\tmemory limit: %uG\n", get_memory_limit(c)); + N = get_capdrops(c, &capdrop); + for (n = 0; n < N; n++) + printf("\tcapdrop #%u: %s\n", n, cap_to_name(capdrop[n])); + N = get_container_ttys(c, &tty); + for (n = 0; n < N; n++) + printf("\ttty #%u: %u\n", n, tty[n]); +} + +static bool com_ls(void) +{ + struct container *c; + bool skipped, success = true; + struct container_arg_iter cai = INITIALIZED_CAI(cai); + + while ((c = cai_next(&cai, &skipped))) { + pid_t pid; + if (skipped) + success = false; + if (!is_locked(c->name, &pid)) { + if (!OPT_GIVEN(LS, ALL)) { + success =false; + continue; + } + pid = 0; + } + if (OPT_GIVEN(LS, VERBOSE)) { + list_container_verbose(c); + continue; + } + if (OPT_GIVEN(LS, LONG)) { + if (pid > 0) + printf("%u\t", (unsigned)pid); + else + printf("-\t"); + printf("%u\t", get_cpu_cores(c)); + printf("%uG\t", get_memory_limit(c)); + printf("%s\n", c->name); + continue; + } + if (!OPT_GIVEN(LS, QUIET)) + printf("%s\n", c->name); + } + if (skipped) /* needed if the last given container arg is invalid */ + success = false; + return success; +} +EXPORT_CMD_HANDLER(ls); + +static bool list_container_processes(const struct container *c) +{ + int pid; + char str[20]; + char *argv[] = {"pstree", "-anp", str, NULL}; + bool success; + + success = is_locked(c->name, &pid); + if (!success) { + if (lls_num_inputs(sublpr) == 0) + return true; + ERROR_LOG("container \"%s\" is not running\n", c->name); + return false; + } + if (!OPT_GIVEN(PS, ALL) && !request_init_pid(c->name, &pid)) + return false; + sprintf(str, "%d", pid); + success = xexec(argv, NULL); + return success; +} + +static bool com_ps(void) +{ + return for_each_container_arg(list_container_processes); +} +EXPORT_CMD_HANDLER(ps); + 
+static bool com_attach(void) +{ + char *errctx; + const char *arg; + pid_t pid; + char *socket_path; + int master, ret, socket_fd; + bool have_escape = false; + struct termios tios; + uint32_t minor = OPT_UINT32_VAL(ATTACH, TTY); + char *rq; + + if (!isatty(STDIN_FILENO) || !isatty(STDOUT_FILENO)) { + ERROR_LOG("both stdin and stdout must be terminals\n"); + return false; + } + if (tcgetattr(STDIN_FILENO, &tios) < 0) + die_errno("tcgetattr"); + ret = lls_check_arg_count(sublpr, 1, 1, &errctx); + if (ret < 0) + die_lopsub(ret, &errctx); + arg = lls_input(0, sublpr); + if (!is_locked(arg, &pid)) { + ERROR_LOG("container not running: %s\n", arg); + return false; + } + socket_path = get_socket_path(arg); + if (OPT_GIVEN(ATTACH, FORCE)) + rq = msg("force-attach %u", minor); + else + rq = msg("attach %u", minor); + socket_fd = request_fd(socket_path, rq, &master); + free(rq); + free(socket_path); + INFO_LOG("Attached to /dev/tty%u of container %s\n", minor, arg); + NOTICE_LOG("Type CTRL+a q to quit\n"); + setup_termios(STDIN_FILENO); + setup_termios(master); + for (;;) { + int max_fileno = 0; + fd_set fds; + FD_ZERO(&fds); + FD_SET(STDIN_FILENO, &fds); + if (STDIN_FILENO > max_fileno) + max_fileno = STDIN_FILENO; + FD_SET(master, &fds); + if (master > max_fileno) + max_fileno = master; + FD_SET(socket_fd, &fds); + if (socket_fd > max_fileno) + max_fileno = socket_fd; + if (select(max_fileno + 1, &fds, NULL, NULL, NULL) < 0) { + if (errno != EINTR) + ERROR_LOG("select: %m\n"); + continue; + } + if (FD_ISSET(socket_fd, &fds)) + break; + if (FD_ISSET(STDIN_FILENO, &fds)) { + char c; + if (read(STDIN_FILENO, &c, 1) <= 0) + break; + if (c == 1 && !have_escape) + have_escape = true; + else if (c == 'q' && have_escape) + break; + else if (write(master, &c, 1) != 1) + break; + } + if (FD_ISSET(master, &fds)) { + if (!copy(master, STDOUT_FILENO)) + break; + } + } + if (tcsetattr(STDIN_FILENO, TCSAFLUSH, &tios) < 0) + ERROR_LOG("tcsetattr: %m\n"); + printf("\n"); + return false; 
+} +EXPORT_CMD_HANDLER(attach); + /* "help" subcommand: without an argument print the subcommand summary; otherwise look up the named subcommand and print its short help, or its long help if --long was given. */ +static bool com_help(void) +{ + int ret; + char *errctx, *help; + const char *arg; + const struct lls_command *cmd; + + ret = lls_check_arg_count(sublpr, 0, 1, &errctx); + if (ret < 0) + die_lopsub(ret, &errctx); + if (lls_num_inputs(sublpr) == 0) { + show_subcommand_summary(OPT_GIVEN(HELP, LONG)); + return true; + } + arg = lls_input(0, sublpr); + ret = lls_lookup_subcmd(arg, micoforia_suite, &errctx); + if (ret < 0) + die_lopsub(ret, &errctx); + cmd = lls_cmd(ret, micoforia_suite); + if (OPT_GIVEN(HELP, LONG)) + help = lls_long_help(cmd); + else + help = lls_short_help(cmd); + printf("%s\n", help); + free(help); + return true; +} +EXPORT_CMD_HANDLER(help); + /* "configtest" subcommand: only prints "Syntax Ok". NOTE(review): presumably the actual checks already ran during option parsing in main() before this handler is called - confirm. */ +static bool com_configtest(void) +{ + printf("Syntax Ok\n"); + return true; +} +EXPORT_CMD_HANDLER(configtest); + /* "edit" subcommand: run $EDITOR (vi if unset) on the path returned by get_config_file_path() and return the result of xexec(). */ +static bool com_edit(void) +{ + char *ed = getenv("EDITOR"); /* must not be freed */ + char *conf = get_config_file_path(); + char *argv[] = {ed? ed : "vi", conf, NULL}; + bool success = xexec(argv, NULL); + + free(conf); + return success; +} +EXPORT_CMD_HANDLER(edit); + /* "enter" subcommand: build an nsenter command line for the given container and run it via xexec(). The first input is the container name; any further inputs form the command to run, with "login -f root" as the default. */ +static bool com_enter(void) +{ + char str[20]; /* holds the pid printed as decimal, passed as argument to -t */ + char **argv; + char *nsenter_args[] = {"nsenter", "-w", "-a", "-r", "-t"}; + const unsigned nna = ARRAY_SIZE(nsenter_args); /* num nsenter args */ + char *dflt_cmd[] = {"login", "-f", "root"}; + unsigned n, N, ni = lls_num_inputs(sublpr); + unsigned nea = ni > 1? 
ni - 1 : ARRAY_SIZE(dflt_cmd); /* num extra args */ + const char *arg; + bool success; + int ret, pid; + char *errctx; + + ret = lls_check_arg_count(sublpr, 1, INT_MAX, &errctx); + if (ret < 0) + die_lopsub(ret, &errctx); + arg = lls_input(0, sublpr); + if (!is_locked(arg, &pid)) { + ERROR_LOG("container not running: %s\n", arg); + return false; + } + if (!request_init_pid(arg, &pid)) /* stores into the same pid variable that is_locked() was given */ + return false; + N = nna + nea + 2; /* +1 for arg to -t and +1 for terminating NULL */ + argv = xmalloc(N * sizeof(char *)); + for (n = 0; n < nna; n++) + argv[n] = nsenter_args[n]; + sprintf(str, "%d", pid); + argv[nna] = str; /* argument to the trailing -t of nsenter_args */ + for (n = 0; n < nea; n++) /* command given by the user if any, else the default command */ + argv[nna + 1 + n] = ni > 1? (char *)lls_input(n + 1, sublpr) + : dflt_cmd[n]; + argv[N - 1] = NULL; + clean_env(); + success = xexec(argv, NULL); + free(argv); + return success; +} +EXPORT_CMD_HANDLER(enter); + /* "log" subcommand: show the file returned by get_container_logfile() with "less" if both stdin and stdout are terminals, with "cat" otherwise. */ +static bool com_log(void) +{ + int ret; + char *errctx, *logfile; + bool success, use_less = isatty(STDIN_FILENO) && isatty(STDOUT_FILENO); + char *argv[] = {use_less? "less" : "cat", NULL /* filename */, NULL}; + + ret = lls_check_arg_count(sublpr, 1, 1, &errctx); + if (ret < 0) + die_lopsub(ret, &errctx); + logfile = get_container_logfile(lls_input(0, sublpr)); + argv[1] = logfile; + success = xexec(argv, NULL); + free(logfile); + return success; +} +EXPORT_CMD_HANDLER(log); + /* Parse the global options, look up the subcommand named by the first non-option argument, parse the subcommand options and exit with a status derived from the return value of the subcommand handler. NOTE(review): with num_inputs == 0 the lookup receives argv[argc], i.e. NULL - presumably check_options() deals with the no-subcommand case before; confirm. */ +int main(int argc, char *argv[]) +{ + int ret; + char *errctx; + const struct micoforia_user_data *ud; + unsigned num_inputs; + + valid_fd012(); + parse_options(argc, argv, CMD_PTR(MICOFORIA), &lpr); + loglevel_arg_val = OPT_UINT32_VAL(MICOFORIA, LOGLEVEL); + check_options(); + num_inputs = lls_num_inputs(lpr); + ret = lls_lookup_subcmd(argv[argc - num_inputs], micoforia_suite, &errctx); + if (ret < 0) + die_lopsub(ret, &errctx); + subcmd = lls_cmd(ret, micoforia_suite); + parse_options(num_inputs, argv + argc - num_inputs, subcmd, &sublpr); /* the subcommand sees its own name as argv[0] */ + ud = lls_user_data(subcmd); + exit(ud->handler()? 
EXIT_SUCCESS : EXIT_FAILURE); +} diff --git a/micoforia.suite.m4 b/micoforia.suite.m4 new file mode 100644 index 0000000..697edbc --- /dev/null +++ b/micoforia.suite.m4 @@ -0,0 +1,754 @@ +# SPDX-License-Identifier: GPL-2.0-only +[suite micoforia] + caption = Subcommands + mansect = 8 + manual_title = System Manager's Manual +[supercommand micoforia] + [description] + DESCRIPTION1() + + DESCRIPTION2() + + DESCRIPTION3() + + In addition to global options which apply to all subcommands, each + subcommand has its own set of options. The usual "--" separator must + be used to separate global options from subcommand specific options. + + [/description] + synopsis = [global-options...] [--] [ [subcommand-options...]] + purpose = SLOGAN() + + [option general-options-section] + summary = General options + flag ignored + [option help] + summary = print help and exit + short_opt = h + [option detailed-help] + summary = print help, including all details, and exit + [option version] + summary = print version and exit + short_opt = V + [option config-file] + short_opt = c + summary = use alternative config file (default: ~/.mismarc) + typestr = path + arg_info = required_arg + arg_type = string + [help] + Options may be given at the command line or in the configuration + file. As usual, if an option is given both at the command line and + in the configuration file, the command line option takes precedence. + + The config file may contain global options as well as options for + any subcommand, but subcommand specific options must be placed in a + separate section. See the Examples section of the man page. 
+ [/help] + [option loglevel] + summary = control amount of logging + short_opt = l + arg_info = required_arg + arg_type = string + typestr = severity + values = { + LSGLL_DEBUG = "debug", + LSGLL_INFO = "info", + LSGLL_NOTICE = "notice", + LSGLL_WARNING = "warning", + LSGLL_ERROR = "error", + LSGLL_CRIT = "crit", + LSGLL_EMERG = "emerg" + } + default_val = warning + [help] + Log only messages with severity greater than or equal to the given + value. Possible values: + + debug: produces really noisy output. + info: still noisy, but won't fill up the disk quickly. + notice: indicates a normal, but significant event. + warning: unexpected events that can be handled. + error: unhandled error condition. + crit: system might be unreliable. + emerg: last message before exit. + [/help] + + [option general-options-section] + summary = Global Container Options + flag ignored + [help] + The options in this section apply to all containers. Most of them + have a per-container counterpart which can be specified to override + the global default. + [/help] + [option default-root-prefix] + summary = path to the parent directory of the container root file systems + typestr = directory + arg_info = required_arg + arg_type = string + default_val = /var/lib/micoforia + [help] + For containers which do not specify their own root directory the path + to the container root is derived from the argument of this option by + appending a slash and the container name. + [/help] + [option logdir] + summary = directory which contains the container log files + arg_info = required_arg + arg_type = string + typestr = directory + default_val = /var/log/micoforia + [help] + The log messages of each container are written to a dedicated + logfile. This option controls in which directory these files are + written (start subcommand) or expected (log subcommand). + + Nothing is written to the logfile if the container is started in + foreground mode. 
+ [/help] + [option default-pre-start-hook] + summary = command to be executed before the container starts + typestr = command + arg_info = required_arg + arg_type = string + default_val = true + [help] + This hook is run early during container startup. All veth device + pairs have been created, but no namespace or cgroup operations have + been performed at this point. + + If the root file system of the container must be prepared, this is the + right place to perform this task. Unlike the pre exec hook described + below, this hook is only called once. + + The following environment variables are set: MICOFORIA_CONTAINER_NAME, + MICOFORIA_IFSPECS, MICOFORIA_ROOT_DIR. + [/help] + [option default-pre-exec-hook] + summary = command to be executed before /sbin/init is executed + typestr = command + arg_info = required_arg + arg_type = string + default_val = true + [help] + This runs with all namespaces already unshared and cgroup settings + applied but before the root directory is switched to the container + root. The hostname has already been changed to the container name + and the network interfaces have been renamed to eth0, eth1, etc. + + This is the right place to perform additional cgroup or namespace + operations. When the container is rebooted, the pre-exec is called + again, just before control is handed over to the new init process. + + Only MICOFORIA_ROOT_DIR is set in this hook. + [/help] + [option default-init] + summary = control the handover to the init process of the container + typestr = command + arg_info = required_arg + arg_type = string + default_val = /sbin/init + [help] + This program is executed as the last step of the container startup + procedure as pid 1. At this point the root directory of the process + has already been changed, so the given argument refers to a path + relative to the container root directory. 
+ [/help] + [option default-bridge] + summary = ethernet bridge to use by default + typestr = bridge + flag multiple + arg_info = required_arg + arg_type = string + default_val = micoforia + [help] + Applies to all containers which do not specify their own network + interface(s) with --net. If this is given multiple times, containers + will be equipped with multiple interfaces. + [/help] + [option default-cgroup-dac] + summary = specify which device nodes containers may access/create by default + typestr = dacspec + flag multiple + arg_info = required_arg + arg_type = string + [help] + Applies to all containers which do not specify their own access + control lists. May be given multiple times. Each device access control + specifier must be of the form {allow|deny} , where + is a suitable device access control string for the devices.allow or + devices.deny file of the cgroup-v1 controller. Order matters. + + If this option is not given, and the corresponding per-container + option is not given either, a reasonable default applies which allows + access to the most common character devices (/dev/zero, /dev/null, + /dev/urandom, etc.) but denies access to most other devices including + all block devices. + + Example: allow c 1:5 rwm + [/help] + [option default-cpu-cores] + summary = Number of cores to use by default (zero means unlimited) + typestr = num + arg_info = required_arg + arg_type = uint32 + default_val = 0 + [help] + The limit is enforced by the cpu cgroup-v2 controller. Note that in + contrast to the cpuset controller of cgroup-v1 this controller does not + restrict the container to a set of admissible CPUs. Instead, it limits + the number of CPU cycles per time unit for the processes in the cgroup. 
+ [/help] + [option default-memory-limit] + summary = Memory usage throttle limit (zero means no limit) + typestr = gigabytes + arg_info = required_arg + arg_type = uint32 + default_val = 0 + [help] + The value specified here is written to the cgroup-v2 memory.high + control file of all containers which do not specify their own limit. + [/help] + [option default-io-max] + summary = I/O limit (zero means no limit) + flag multiple + typestr = iospec + arg_info = required_arg + arg_type = string + [help] + The I/O specifier argument must be a valid string for the io.max file + of the cgroup-v2 controller. For example, the string "1:5 rbps=1024" + limits the read I/O rate for the /dev/zero device to 1K per second. + [/help] + [option default-capdrop] + summary = Capabilities to drop by default + typestr = capspec + flag multiple + arg_info = required_arg + arg_type = string + [help] + The capability specifier argument is the text representation of a + capability, like CAP_SYS_MODULE. All given capabilities will be dropped + from the bounding set of the container init process, hence from + all processes of the container. If this option is not given, and no + per-container capabilities to drop are given either, CAP_SYS_MODULE, + CAP_SYS_TIME, and CAP_SYS_RESOURCE are dropped. + + See capabilities(7) for the list of capabilities and their meaning. + [/help] + [option default-tty] + summary = Minor number of a tty device to capture by default + typestr = minor + flag multiple + arg_info = required_arg + arg_type = uint32 + [help] + Normally the container's init process starts at least one "getty" + login session on a tty port /dev/ttyX, where X is the minor device + ID. This option lets you capture these login sessions and forward them + to another micoforia process executing the "attach" subcommand. + Each time the option is given, the device with the given minor device + number is captured. + + If this is not given, /dev/tty1 will be captured. 
+ [/help] + [option general-options-section] + summary = Per-Container Options + flag ignored + [help] + These override the global container options above. Most of them take + a compound argument of the form , where the first part + is the name of the container to which the option should be applied. + + Unless noted otherwise, if both a global option and the corresponding + per-container option is given, the per-container option takes + precedence. + [/help] + [option container] + summary = name of the container + flag multiple + typestr = name + arg_info = required_arg + arg_type = string + [help] + Used for the hostname, the name of the veth interfaces and the name of + the cgroup directory. The name may only contain characters of the set + [a-zA-Z0-9-] and the length must not exceed 32 characters. + + This does not need to be given if one of the compound options below + are given instead. + [/help] + [option pre-start-hook] + summary = See --default-pre-start-hook + flag multiple + typestr = name:command + arg_info = required_arg + arg_type = string + [option pre-exec-hook] + summary = See --default-pre-exec-hook + flag multiple + typestr = name:command + arg_info = required_arg + arg_type = string + [option init] + summary = See --default-init + typestr = name:command + flag multiple + arg_info = required_arg + arg_type = string + [option net] + summary = Equip the container with a non-default network interface + flag multiple + typestr = name:ifspec + arg_info = required_arg + arg_type = string + [help] + The interface specifier is of the form bridge[:hwaddr]. If no hardware + address is given, a random address will be used. See --default-bridge. + + Unlike the other compound options of this section, this option is + cumulative in that multiple options with the same container name do + not override each other but accumulate, resulting in a container with + multiple network interfaces. 
+ [/help] + [option root-directory] + summary = Path to the container root directory. See --default-root-prefix. + flag multiple + typestr = name:path + arg_info = required_arg + arg_type = string + [help] + [/help] + [option cgroup-dac] + summary = See --default-cgroup-dac + typestr = name:dacspec + flag multiple + arg_info = required_arg + arg_type = string + [option cpu-cores] + summary = See --default-cpu-cores + typestr = name:num + flag multiple + arg_info = required_arg + arg_type = string + [option memory-limit] + summary = See --default-memory-limit + typestr = name:gigabytes + flag multiple + arg_info = required_arg + arg_type = string + [option io-max] + summary = See --default-io-max + flag multiple + typestr = name:iospec + arg_info = required_arg + arg_type = string + [option capdrop] + summary = See --default-capdrop + flag multiple + typestr = name:capspec + arg_info = required_arg + arg_type = string + [option tty] + summary = See --default-tty + typestr = name:minor + flag multiple + arg_info = required_arg + arg_type = string + +[introduction] + micoforia supports the subcommands described below. If no subcommand + is given, the list of available subcommands is shown and the program + terminates successfully without performing any further action. +[/introduction] + +[subcommand start] + purpose = start one or more containers + non-opts-name = [...] + [description] + If no container is given, all configured containers are started. + [/description] + [option foreground] + short_opt = F + summary = do not run as background daemon + [help] + Normally, the process detaches from the console and continues to run + in the background. When this option is given, only a single container + can be started, and this container will run with its /dev/console + device redirected to the local tty, making the container startup + messages visible on the local tty. 
+ + Moreover, stdin is forwarded to the first configured tty device + (/dev/tty1 by default) of the container, and anything received from + the other end of the forwarding is dumped to stdout. This allows for + logins on the "local" console of the container, provided the container + starts getty process which listens on the tty device. + [/help] +[subcommand stop] + purpose = shutdown one or more containers + non-opts-name = [...] + [description] + This subcommand works by executing halt(8) in container context. + If no container is given, halt(8) is executed in all configured + container contexts. + [/description] + [option wait] + short_opt = w + summary = wait until all containers have terminated + [help] + Without --wait the micoforia process which executes the stop + subcommand exits after spawning one halt(8) process per container + to be stopped. If --wait is given, the subcommand waits until all + containers have terminated or the timeout expires. This is handy for + system shutdown scripts which are supposed to terminate all running + containers. + [/help] + [closing] + If --wait is not given, the subcommand exits successfully if and only + if all signals were sent successfully. With --wait the subcommand + exits successfully if, additionally, all signalled processes have + terminated before the timeout expires. + [/closing] + +[subcommand reboot] + purpose = reboot containers + non-opts-name = [...] + [description] + Containers are rebooted and killed by sending a signal to a micoforia + process which executes the start subcommand. + [/description] +[subcommand kill] + purpose = force containers to terminate + non-opts-name = [...] + [description] + This works like the reboot subcommand, but a different signal is used + to notify the container. 
+ [/description] + [option wait] + short_opt = w + summary = wait until all signalled containers have terminated + [help] + Without --wait the micoforia process which executes the kill subcommand + exits right after the underlying kill(2) system call returns. At this + point the signalled process might still be alive although SIGKILL + was sent. If --wait is given, the process waits until the signalled + processes have terminated or the timeout expires. + [/help] +[subcommand ls] + purpose = list containers + non-opts-name = [...] + [description] + Several listing modes are available. By default, only the running + containers are listed. If no container name is given, all configured + containers are taken into account. + + [/description] + [option all] + short_opt = a + summary = Also list containers which are not running + [option quiet] + short_opt = q + summary = Do not print any output + [help] + For scripts to determine from the exit code whether all of the given + containers are running. + [/help] + [option long] + short_opt = l + summary = Show also the pid, and the cpu and memory limits + [help] + This overrides --quiet. That is, if both --quiet and --long are given, + the long listing is shown, + [/help] + [option verbose] + short_opt = v + summary = Show all container settings, one setting per line + [help] + This overrides --quiet and --long. + [/help] + [closing] + The subcommand exits successfully if and only if all given/configured + containers could be listed. Unless --all is given, it is considered + an error if a given container is not running. In particular, when ls + is executed with no arguments at all, it exits successfully if and + only if all configured containers are running. + [/closing] +[subcommand ps] + purpose = print process list of one or more containers + non-opts-name = [...] + [description] + This runs pstree(1). The container init process is always the third + process shown. 
Process IDs refer to the parent PID namespace, which + is why the process ID of the container init is not shown as 1. + [/description] + [option all] + short_opt = a + summary = also show the two micoforia processes +[subcommand attach] + purpose = map the console of a running container to the local terminal. + non-opts-name = [...] + [description] + It is an error if stdin is not associated with a terminal device. + [/description] + [option tty] + short_opt = t + summary = terminal to connect + arg_info = required_arg + arg_type = uint32 + typestr = minor + default_val = 1 + [help] + This operation can only succeed if the given tty is forwarded by the + container. See --tty and --default-tty. + [/help] + [option force] + short_opt = f + summary = don't fail but steal the tty if it is already attached +[subcommand help] + purpose = list available subcommands or print subcommand-specific help + non-opts-name = [subcommand] + [description] + Without any arguments, help prints the list of available + subcommands. When called with a subcommand name argument, it prints + the help text of the given subcommand. + [/description] + [option long] + short_opt = l + summary = show the long help text + [help] + If the optional argument is supplied, the long help text contains the + synopsis, the purpose and the description of the specified subcommand, + followed by the option list including summary and help text of each + option. Without --long, the short help is shown instead. This omits + the description of the subcommand and the option help. + + If no subcommand is supplied but --long is given, the list contains the + purpose of each subcommand. + [/help] + +[subcommand configtest] + purpose = run a configuration file syntax test + [description] + This subcommand checks the command line options and the configuration + file for syntactic and semantic correctness. 
It either reports + "Syntax Ok" and exits successfully or prints information about the + first error and terminates with exit code 1. + [/description] + +[subcommand edit] + purpose = edit the configuration file + [description] + The editor to start is derived from the EDITOR environment variable. + If this variable is not set, vi is assumed. + [/description] + +[subcommand enter] + purpose = run a command in a container namespace + non-opts-name = [ [arg...]] + [description] + This executes the nsenter(1) command to enter the namespaces of + the init process of the given container. If no command is given, + the login command is run to start a root shell. + [/description] + +[subcommand log] + purpose = show the log file for the given container + non-opts-name = [] + [description] + This executes cat(1) or less(1), depending on whether or not stdin + and stdout are associated with a terminal device. + [/description] +[section Notes] +.SS The Cgroup File Systems + There are two implementations of Linux control groups called + .I cgroup-v1 + and + .IR cgroup-v2 . + Both come with their own pseudo filesystem. + .B micoforia + requires both file systems to be mounted at + .IR /var/cgroup + and + .IR /var/cgroup2 . + Version 1 cgroups are only used to enforce device access control for + the containers, so the cgroup-v1 pseudo filesystem should be mounted + with only this controller enabled. See the Examples section below + for how to do this. Future versions of + .B micoforia + might switch to the devices controller of cgroup-v2. +.SS Container Names + The container name is used also for the name of the network device + and as a directory name if no explicit root directory is given with + --root-prefix. Therefore container names must not exceed 32 characters, + which must all be alphanumeric or '-'. In particular, whitespace and + underscore ('_') are not permitted. 
+ +[/section] +[section Examples] + .IP \(bu 2 + Create a bash alias named + .I m7a + for + .I micoforia + which activates debug messages and already includes the double dash + to separate global options from subcommand options: + + .RS 6 + .EX + .B alias m7a='micoforia --loglevel debug --' + .EE + .RE + .IP \(bu 2 + Set up an ethernet bridge named + .IR micoforia , + add the physical interface + .I eth1 + to it and give the bridge interface an IP address: + + .RS 6 + .EX + .B brctl addbr micoforia + .B ip link set up micoforia + .B brctl addif micoforia eth1 + .B ip a a 192.168.137.1/24 dev micoforia + .EE + .RE + .IP \(bu 2 + Mount the two cgroup file systems, but only activate the + .I devices + controller of cgroup-v1: + + .RS 6 + .EX + .B mkdir -p /var/cgroup && mount -t cgroup -o devices cgroup /var/cgroup + .B mkdir -p /var/cgroup2 && mount -t cgroup2 cgroup2 /var/cgroup2 + .EE + .RE + .IP \(bu 2 + Entries for + .I /etc/fstab + to mount the cgroup file systems automatically at boot: + + .RS 6 + .EX + .B none /var/cgroup cgroup devices 0 0 + .B none /var/cgroup2 cgroup2 defaults 0 0 + .EE + .RE + .IP \(bu 2 + Download a Debian10 root file system to + .IR /var/lib/micoforia/debian10 , + set the root password and let micoforia set the hostname + + .RS 6 + .EX + .B debootstrap --variant=minbase buster /var/lib/micoforia/debian10 http://deb.debian.org/debian/ + .B chroot /var/lib/micoforia/debian10 passwd + .B rm -f /var/lib/micoforia/debian10/etc/hostname + .EE + .RE + .IP \(bu 2 + Download a minimal Ubuntu-18.04 root file system to + .IR /var/lib/micoforia/c1 , + set the root password and configure the + .I eth0 + interface, using a static IP address: + + .RS 6 + .EX + .B debootstrap --include openssh-server --include ifupdown bionic /var/lib/micoforia/c1 http://de.archive.ubuntu.com/ubuntu + .B chroot /var/lib/micoforia/c1 passwd + .B printf 'auto eth0\(rsniface eth0 inet static\(rsnaddress 192.168.137.2/24\(rsn' \ + >> 
/var/lib/micoforia/c1/etc/network/interfaces + .B echo 'PermitRootLogin yes' >> /var/lib/micoforia/c1/etc/ssh/sshd_config + .EE + .RE + .IP \(bu 2 + Start the container in foreground mode: + + .RS 6 + .EX + .B micoforia --container c1 -- start --foreground + .EE + .RE + .IP \(bu 2 + Attach to + .I tty1 + of the running container: + + .RS 6 + .EX + .B m7a attach c1 + .EE + .RE + .IP \(bu 2 + Ask the container to shut down, and wait for the shutdown procedure + to complete: + + .RS 6 + .EX + .B m7a stop --wait c1 + .EE + .RE + .IP \(bu 2 + Check whether the container is running: + + .RS 6 + .EX + .B m7a ls --quiet c1 && echo yes || echo no + .EE + .RE + .IP \(bu 2 + A simple config file: + + .RS 6 + .EX + .B # two global options + .B loglevel info + .B container c1 + .B # an option for the "attach" subcommand + .B [attach] + .B \ \ \ \ tty 2 + .EE + .RE + +[/section] +[section copyright] + Written by AUTHOR() + .br + Copyright (C) COPYRIGHT_YEAR() AUTHOR() + .br + License: LICENSE() + .br + This is free software: you are free to change and redistribute it. + .br + There is NO WARRANTY, to the extent permitted by law. + .P + Web page: + .UR URL() + .UE + .br + Git clone `URL': + .UR CLONE_URL() + .UE + .br + Gitweb: + .UR GITWEB_URL() + .UE + .br + Author's home page: + .UR HOME_URL() + .UE + .br + Report bugs to + .MT EMAIL() + AUTHOR() + .ME +[/section] +[section see also] + .BR lxc (7), + .BR brctl (8), + .BR ip (8), + .BR pstree (1) +[/section] diff --git a/micoforia.svg b/micoforia.svg new file mode 100644 index 0000000..4c7a3f9 --- /dev/null +++ b/micoforia.svg @@ -0,0 +1,26 @@ + + + + + + + + diff --git a/util.c b/util.c new file mode 100644 index 0000000..ebe5b1e --- /dev/null +++ b/util.c @@ -0,0 +1,1142 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#include "m7a.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +void die(const char *fmt, ...) 
+{ + char *str; + va_list argp; + int ret; + + va_start(argp, fmt); + ret = vasprintf(&str, fmt, argp); + va_end(argp); + if (ret < 0) { /* give up */ + EMERG_LOG("OOM\n"); + exit(EXIT_FAILURE); + } + m7a_log(LL_EMERG, "%s\n", str); + exit(EXIT_FAILURE); +} + +/* Like die(), but append the strerror() text of the errno value seen on entry. */ +void die_errno(const char *fmt, ...) +{ + char *str; + va_list argp; + int ret, save_errno = errno; /* saved because the calls below may modify errno */ + + va_start(argp, fmt); + ret = vasprintf(&str, fmt, argp); + va_end(argp); + if (ret < 0) { + EMERG_LOG("OOM\n"); + exit(EXIT_FAILURE); + } + m7a_log(LL_EMERG, "%s: %s\n", str, strerror(save_errno)); + exit(EXIT_FAILURE); +} + +/* Like realloc(3), but never returns NULL: the process is aborted if the allocation fails. size must be positive. */ +void *xrealloc(void *p, size_t size) +{ + assert(size > 0); + /* Keep the call outside of assert(): with NDEBUG the whole assert() expression, including the realloc() itself, would be compiled out. */ + p = realloc(p, size); + if (!p) { + EMERG_LOG("OOM\n"); + abort(); + } + return p; +} + +/* Allocate size bytes via xrealloc(). */ +void *xmalloc(size_t size) +{ + return xrealloc(NULL, size); +} + +/* Like xmalloc(), but the returned memory is zeroed. */ +void *xzmalloc(size_t size) +{ + void *p = xrealloc(NULL, size); + memset(p, 0, size); + return p; +} + +/* Like strdup(3), but a NULL argument is treated as the empty string. */ +void *xstrdup(const char *s) +{ + char *ret = strdup(s? s: ""); + + assert(ret); + return ret; +} + +/* Return a newly allocated string formatted printf-style; the buffer starts at 100 bytes and is enlarged once if the output does not fit. */ +char *msg(const char *fmt, ...) +{ + char *m; + size_t size = 100; + + m = xmalloc(size); + while (1) { + int n; + va_list ap; + + /* Try to print in the allocated space. */ + va_start(ap, fmt); + n = vsnprintf(m, size, fmt, ap); + va_end(ap); + /* If that worked, return the string. */ + if (n < size) + return m; + /* Else try again with more space. 
*/ + size = n + 1; /* precisely what is needed */ + m = xrealloc(m, size); + } +} + +char *xstrcat(char *a, const char *b) +{ + char *tmp; + + if (!a) + return xstrdup(b); + if (!b) + return a; + tmp = msg("%s%s", a, b); + free(a); + return tmp; +} + +void die_empty_arg(const char *opt) +{ + die("argument to --%s must not be empty", opt); +} + +__attribute__ ((noreturn)) +static void die_range(const char *opt) +{ + die("argument to --%s is out of range", opt); +} + +void check_range(uint32_t val, uint32_t min, uint32_t max, const char *opt) +{ + if (val < min || val > max) + die_range(opt); +} + +bool fd2buf(int fd, const struct iovec *iov) +{ + ssize_t ret, nread = 0, max; + char *buf = iov->iov_base; + + assert(iov->iov_len > 1); + max = iov->iov_len - 1; + for (;;) { + ret = read(fd, buf + nread, max - nread); + if (ret < 0) { + if (errno == EAGAIN || errno == EINTR) + continue; + ERROR_LOG("read error: %s\n", strerror(errno)); + return false; + } + if (ret == 0) { + buf[nread] = '\0'; + DEBUG_LOG("read %zd bytes\n", nread); + return true; + } + nread += ret; + if (nread >= max) { + ERROR_LOG("cmd output truncated\n"); + return false; + } + } +} + +bool xexec(char * const argv[], const struct iovec *iov) +{ + pid_t pid; + int pipefd[2] = {-1, -1}; + unsigned n; + + for (n = 0; argv[n]; n++) + DEBUG_LOG("argv[%u]=%s\n", n, argv[n]); + if (iov) { + if (pipe(pipefd) < 0) + die_errno("pipe"); + } + if ((pid = fork()) < 0) + die_errno("fork"); + if (pid > 0) { /* parent */ + int wstatus; + bool success = true; + if (iov) { + close(pipefd[1]); + success = fd2buf(pipefd[0], iov); + close(pipefd[0]); + } + if (waitpid(pid, &wstatus, 0) < 0) + die_errno("waitp"); + if (!success) + return false; + if (!WIFEXITED(wstatus)) + return false; + if (WEXITSTATUS(wstatus) != EXIT_SUCCESS) + return false; + return true; + } + if (pipefd[0] >= 0) + close(pipefd[0]); + if (pipefd[1] >= 0 && pipefd[1] != STDOUT_FILENO) { + if (dup2(pipefd[1], STDOUT_FILENO) < 0) + 
die_errno("dup2()"); + close(pipefd[1]); + } + execvp(argv[0], argv); + EMERG_LOG("execvp error: %s\n", strerror(errno)); + _exit(EXIT_FAILURE); +} + +void valid_fd012(void) +{ + /* Ensure that file descriptors 0, 1, and 2 are valid. */ + while (1) { + int fd = open("/dev/null", O_RDWR); + if (fd < 0) + die_errno("open"); + if (fd > 2) { + close(fd); + break; + } + } +} + +void check_name(const char *arg) +{ + size_t m, len; + char c; + + len = strlen(arg); + if (len == 0) + die("empty name"); + if (len > 32) + die("name too long: %s", arg); + for (m = 0; m < len; m++) { + c = arg[m]; + if (!isascii(c)) + goto invalid; + if (!isalnum(c) && c != '-') + goto invalid; + } + return; +invalid: + die("invalid character '%c' in name %s", c, arg); +} + +/* allocates two new strings that should be freed by the caller */ +void parse_compound_arg(const char *arg, const char *opt, char **name, char **val) +{ + char *copy, *p; + + if (arg[0] == '\0') + die_empty_arg(opt); + copy = xstrdup(arg); + p = strchr(copy, ':'); + if (!p) + die("could not parse argument to --%s", opt); + *p = '\0'; + check_name(copy); + *name = copy; + p++; + *val = xstrdup(p); +} + +char *parse_cgroup_acl(const char *arg) +{ + if (!strncmp(arg, "allow ", 6)) + return msg("a%s", arg + 6); + if (!strncmp(arg, "deny ", 5)) + return msg("d%s", arg + 5); + die("invalid cgroup access specifier: %s", arg); +} + +void parse_ifspec(const char *arg, char **bridge, uint8_t *hwaddr) +{ + const char *colon = strchr(arg, ':'); + size_t len; + unsigned n, x[6]; + + if (colon) { + len = colon - arg; + *bridge = xmalloc(len + 1); + memcpy(*bridge, arg, len); + (*bridge)[len] = '\0'; + } else + *bridge = xstrdup(arg); + check_name(*bridge); + if (!colon) { + memset(hwaddr, 0, 6); + return; + } + if (sscanf(colon + 1, "%02x:%02x:%02x:%02x:%02x:%02x", + x, x + 1, x + 2, x + 3, x + 4, x + 5) != 6) + die("invalid hwaddress for ifspec %s", arg); + if (colon[1 + 6 * 2 + 5] != '\0') + die("trailing garbage at the end of ifspec 
%s", arg); + for (n = 0; n < 6; n++) + hwaddr[n] = x[n]; +} + +uint32_t atou32(const char *str, const char *opt) +{ + char *endptr; + long long tmp; + + errno = 0; /* To distinguish success/failure after call */ + tmp = strtoll(str, &endptr, 10); + if (errno == ERANGE && (tmp == LLONG_MAX || tmp == LLONG_MIN)) + die_range(opt); + if (tmp < 0 || tmp > (uint32_t)-1) + die_range(opt); + /* + * If there were no digits at all, strtoll() stores the original value + * of str in *endptr. + */ + if (endptr == str) + die_empty_arg(opt); + /* + * The implementation may also set errno and return 0 in case no + * conversion was performed. + */ + if (errno != 0 && tmp == 0) + die_empty_arg(opt); + if (*endptr != '\0') /* Further characters after number */ + die("--%s: trailing characters after number", opt); + return tmp; +} + +bool remove_subdirs_recursively(const char *path) +{ + DIR *d = opendir(path); + struct dirent *entry; + int dfd; + struct stat stat; + + if (!d) { + ERROR_LOG("opendir %s: %m\n", path); + return false; + } + dfd = dirfd(d); + assert(dfd >= 0); + while ((entry = readdir(d))) { + char *subpath; + if (!strcmp(entry->d_name, ".")) + continue; + if (!strcmp(entry->d_name, "..")) + continue; + if (fstatat(dfd, entry->d_name, &stat, 0) == -1) { + WARNING_LOG("%s/%s: %m", path, entry->d_name); + continue; + } + if (!S_ISDIR(stat.st_mode)) + continue; + subpath = msg("%s/%s", path, entry->d_name); + remove_subdirs_recursively(subpath); + DEBUG_LOG("removing %s\n", subpath); + if (rmdir(subpath) < 0) { + ERROR_LOG("rmdir %s: %m\n", subpath); + return false; + } + free(subpath); + } + closedir(d); + return true; +} + +void daemonize(const char *logfile) +{ + pid_t pid; + int nullfd, logfd; + + if ((pid = fork()) < 0) + die_errno("fork"); + if (pid) /* parent exits */ + exit(EXIT_SUCCESS); + valid_fd012(); + /* become session leader */ + if (setsid() < 0) + die_errno("setsid"); + if ((nullfd = open("/dev/null", O_RDWR)) < 0) + die_errno("open /dev/null"); + logfile 
= logfile? logfile : "/dev/null"; + if ((logfd = open(logfile, O_WRONLY | O_APPEND | O_CREAT, 0666)) < 0) + die_errno("open %s", logfile); + NOTICE_LOG("subsequent log messages go to %s\n", logfile); + if (dup2(nullfd, STDIN_FILENO) < 0) + die_errno("dup2"); + close(nullfd); + if (dup2(logfd, STDOUT_FILENO) < 0) + die_errno("dup2"); + if (dup2(logfd, STDERR_FILENO) < 0) + die_errno("dup2"); + close(logfd); + if (chdir("/") < 0) + die_errno("chdir"); +} + +static int super_dull_hash(const char *input) +{ + const uint8_t *x = (typeof(x))input; + const unsigned p1 = 16777619, p2 = 2971215073; + unsigned n, m, h, result = 0; + + for (n = 0; n < 4; n++) { + h = p1 * (x[0] + n); + for (m = 1; x[m] != 0; m++) + h = p2 * (h ^ x[m]); + result = (result << 8) | (h % 256); + } + return result >> 1; +} + +/** + * We use a semaphore set with two semaphores. The first semaphore is modified + * in all locking related functions while the second semaphore is modified only + * in try_lock() and aquire_lock(). This allows us to obtain the PID of the + * lock holder by querying the PID that last performed an operation on the + * second semaphore. This is achieved by passing GETPID as the control + * operation to semctl(). 
+ */ + +static bool get_lock(const char *string, pid_t *pid, bool wait) +{ + int semid, ret; + struct sembuf sops[4]; + key_t key = super_dull_hash(string); + bool success; + short sem_flg = SEM_UNDO; + + if (!wait) + sem_flg |= IPC_NOWAIT; + ret = semget(key, 2, IPC_CREAT | 0600); + if (ret < 0) { + ERROR_LOG("semget: %m\n"); + return false; + } + semid = ret; + DEBUG_LOG("key: 0x%0x, semid: %d\n", (unsigned)key, semid); + ret = semctl(semid, 1, GETPID); + if (ret < 0) + return false; + if (pid) + *pid = ret; + sops[0].sem_num = 0; + sops[0].sem_op = 0; + sops[0].sem_flg = sem_flg; + + sops[1].sem_num = 0; + sops[1].sem_op = 1; + sops[1].sem_flg = sem_flg; + + sops[2].sem_num = 1; + sops[2].sem_op = 0; + sops[2].sem_flg = sem_flg; + + sops[3].sem_num = 1; + sops[3].sem_op = 1; + sops[3].sem_flg = sem_flg; + + success = semop(semid, sops, 4) >= 0; + if (!success) + INFO_LOG("semop: %m\n"); + return success; +} + +bool try_lock(const char *string, pid_t *pid) +{ + return get_lock(string, pid, false /* don't wait */); +} + +bool acquire_lock(const char *string) +{ + return get_lock(string, NULL /* don't need pid */, true /* do wait */); +} + +bool release_lock(const char *string) +{ + int semid, ret; + struct sembuf sops[2]; + key_t key = super_dull_hash(string); + bool success; + + ret = semget(key, 2, IPC_CREAT | 0600); + if (ret < 0) { + ERROR_LOG("semget: %m\n"); + return false; + } + semid = ret; + DEBUG_LOG("key: 0x%0x, semid: %d\n", (unsigned)key, semid); + sops[0].sem_num = 0; + sops[0].sem_op = -1; + sops[0].sem_flg = SEM_UNDO; + sops[1].sem_num = 1; + sops[1].sem_op = -1; + sops[1].sem_flg = SEM_UNDO; + success = semop(semid, sops, 2) >= 0; + if (!success) + INFO_LOG("semop: %m\n"); + return success; +} + +bool is_locked(const char *string, pid_t *pid) +{ + int ret, semid; + struct sembuf sops = { + .sem_num = 0, + .sem_op = 0, + .sem_flg = SEM_UNDO | IPC_NOWAIT + }; + key_t key = super_dull_hash(string); + + if (pid) + *pid = 0; + ret = semget(key, 2, 0); 
+ if (ret < 0) + return false; + semid = ret; + DEBUG_LOG("key: 0x%0x, semid: %d\n", (unsigned)key, semid); + if (semop(semid, &sops, 1) >= 0) + return false; + ret = semctl(semid, 1, GETPID); + if (ret < 0) + return false; + if (pid) + *pid = ret; + return true; +} + +bool attach_to_bridge(const char *iface, const char *bridge) +{ + int fd, idx; + struct ifreq ifr; + bool success; + + INFO_LOG("adding interface %s to bridge %s\n", iface, bridge); + if (!(idx = if_nametoindex(iface))) { + ERROR_LOG("no index for %s\n", iface); + return false; + } + if ((fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) { + ERROR_LOG("socket: %m\n"); + return false; + } + strncpy(ifr.ifr_name, bridge, IFNAMSIZ - 1); + ifr.ifr_name[IFNAMSIZ - 1] = '\0'; + ifr.ifr_ifindex = idx; + success = ioctl(fd, SIOCBRADDIF, &ifr) == 0; + if (!success) + ERROR_LOG("interface %s, bridge %s: ioctl SIOCBRADDIF: %m\n", + iface, bridge); + close(fd); + return success; +} + + +#define NLMSG_TAIL(nmsg) \ + ((struct rtattr *) (((void *) (nmsg)) + NLMSG_ALIGN((nmsg)->nlmsg_len))) + +static void addattr_l(struct nlmsghdr *nlh, int type, const void *data, + int alen) +{ + int len = RTA_LENGTH(alen); + struct rtattr *rta; + + rta = NLMSG_TAIL(nlh); + rta->rta_type = type; + rta->rta_len = len; + if (alen > 0) + memcpy(RTA_DATA(rta), data, alen); + nlh->nlmsg_len = NLMSG_ALIGN(nlh->nlmsg_len) + RTA_ALIGN(len); +} + +static struct rtattr *addattr_nest(struct nlmsghdr *n, int type) +{ + struct rtattr *nest = NLMSG_TAIL(n); + addattr_l(n, type, NULL, 0); + return nest; +} + +static void end_nest(struct nlmsghdr *nlh, struct rtattr *attr) +{ + attr->rta_len = (void *)NLMSG_TAIL(nlh) - (void *)attr; +} + +static struct mnl_socket *get_and_bind_netlink_socket(void) +{ + struct mnl_socket *nl = mnl_socket_open(NETLINK_ROUTE); + + if (!nl) { + ERROR_LOG("mnl_socket_open error\n"); + return NULL; + } + if (mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID) < 0) { + ERROR_LOG("mnl_socket_bind\n"); + mnl_socket_close(nl); + return 
NULL; + } + return nl; +} + +static struct nlmsghdr *prepare_netlink_msg_header(char *buf) +{ + struct nlmsghdr *nlh = mnl_nlmsg_put_header(buf); + nlh->nlmsg_flags = NLM_F_REQUEST; + nlh->nlmsg_seq = time(NULL); + return nlh; +} + +bool rename_interface(const char *before, const char *after) +{ + int idx; + struct mnl_socket *nl; + char buf[MNL_SOCKET_BUFFER_SIZE]; + struct nlmsghdr *nlh; + struct ifinfomsg *ifm; + bool success; + + INFO_LOG("%s -> %s\n", before, after); + if (!(idx = if_nametoindex(before))) { + ERROR_LOG("no index for %s\n", before); + return false; + } + if (!(nl = get_and_bind_netlink_socket())) + return false; + + nlh = prepare_netlink_msg_header(buf); + nlh->nlmsg_type = RTM_NEWLINK; + + ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm)); + ifm->ifi_family = AF_UNSPEC; + ifm->ifi_index = idx; + addattr_l(nlh, IFLA_IFNAME, after, strlen(after) + 1); + if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) < 0) { + ERROR_LOG("mnl_socket_sendto failed\n"); + success = false; + goto close; + } + success = true; +close: + mnl_socket_close(nl); + return success; +} + +void pretty_print_hwaddr(const uint8_t *hwaddr, char *result) +{ + sprintf(result, "%02x:%02x:%02x:%02x:%02x:%02x", hwaddr[0], hwaddr[1], + hwaddr[2], hwaddr[3], hwaddr[4], hwaddr[5]); +} + +bool set_hwaddr(const char *iface, const uint8_t *hwaddr) +{ + struct mnl_socket *nl; + char buf[MNL_SOCKET_BUFFER_SIZE]; + struct nlmsghdr *nlh; + struct ifinfomsg *ifm; + bool success; + const uint8_t zero[6] = {0}; + char pretty_hwaddr[18]; + + if (!memcmp(hwaddr, zero, 6)) + return true; /* no hwaddr specified, nothing to do */ + pretty_print_hwaddr(hwaddr, pretty_hwaddr); + INFO_LOG("hardware address of %s: %s\n", iface, pretty_hwaddr); + if (!(nl = get_and_bind_netlink_socket())) + return false; + + nlh = prepare_netlink_msg_header(buf); + nlh->nlmsg_type = RTM_NEWLINK; + + ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm)); + ifm->ifi_family = AF_UNSPEC; + addattr_l(nlh, IFLA_ADDRESS, hwaddr, 
6); + addattr_l(nlh, IFLA_IFNAME, iface, strlen(iface) + 1); + if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) < 0) { + ERROR_LOG("%s: mnl_socket_sendto failed\n", iface); + success = false; + goto close; + } + success = true; +close: + mnl_socket_close(nl); + return success; +} + +bool link_del(const char *iface) +{ + struct mnl_socket *nl; + char buf[MNL_SOCKET_BUFFER_SIZE]; + struct nlmsghdr *nlh; + struct ifinfomsg *ifm; + bool success; + + INFO_LOG("removing interface %s\n", iface); + if (!(nl = get_and_bind_netlink_socket())) + return false; + + nlh = prepare_netlink_msg_header(buf); + nlh->nlmsg_type = RTM_DELLINK; + + ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm)); + ifm->ifi_family = AF_UNSPEC; + ifm->ifi_change = IFF_UP; + ifm->ifi_flags = IFF_UP; + addattr_l(nlh, IFLA_IFNAME, iface, strlen(iface) + 1); + if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) < 0) { + ERROR_LOG("%s: mnl_socket_sendto failed\n", iface); + success = false; + goto close; + } + success = true; +close: + mnl_socket_close(nl); + return success; +} + +bool link_up(const char *iface) +{ + struct mnl_socket *nl; + char buf[MNL_SOCKET_BUFFER_SIZE]; + struct nlmsghdr *nlh; + struct ifinfomsg *ifm; + bool success; + + INFO_LOG("activating interface %s\n", iface); + if (!(nl = get_and_bind_netlink_socket())) + return false; + nlh = prepare_netlink_msg_header(buf); + nlh->nlmsg_type = RTM_NEWLINK; + + ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm)); + ifm->ifi_family = AF_UNSPEC; + ifm->ifi_change = IFF_UP; + ifm->ifi_flags = IFF_UP; + addattr_l(nlh, IFLA_IFNAME, iface, strlen(iface) + 1); + if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) < 0) { + ERROR_LOG("%s: mnl_socket_sendto failed\n", iface); + success = false; + goto close; + } + success = true; +close: + mnl_socket_close(nl); + return success; +} + +#ifndef VETH_INFO_PEER +#define VETH_INFO_PEER 1 +#endif + +bool create_veth_device_pair(const char *name, char *peer) +{ + struct mnl_socket *nl; + char buf[MNL_SOCKET_BUFFER_SIZE]; 
+ struct rtattr *n1, *n2, *n3; + struct nlmsghdr *nlh; + struct ifinfomsg *ifm; + bool success; + + INFO_LOG("new pair: %s <-> %s\n", name, peer); + if (!(nl = get_and_bind_netlink_socket())) + return false; + + nlh = prepare_netlink_msg_header(buf); + nlh->nlmsg_type = RTM_NEWLINK; + nlh->nlmsg_flags |= NLM_F_CREATE | NLM_F_EXCL; + + ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm)); + ifm->ifi_family = AF_UNSPEC; + n1 = addattr_nest(nlh, IFLA_LINKINFO); + addattr_l(nlh, IFLA_INFO_KIND, "veth", 5); + n2 = addattr_nest(nlh, IFLA_INFO_DATA); + n3 = addattr_nest(nlh, VETH_INFO_PEER); + ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm)); + ifm->ifi_family = AF_UNSPEC; + addattr_l(nlh, IFLA_IFNAME, peer, strlen(peer) + 1); + end_nest(nlh, n3); + end_nest(nlh, n2); + end_nest(nlh, n1); + addattr_l(nlh, IFLA_IFNAME, name, strlen(name) + 1); + if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) < 0) { + ERROR_LOG("%s: mnl_socket_sendto\n", name); + success = false; + goto close; + } + success = true; +close: + mnl_socket_close(nl); + return success; +} + +bool set_netns(const char *iface, pid_t pid) +{ + struct mnl_socket *nl; + char buf[MNL_SOCKET_BUFFER_SIZE]; + struct nlmsghdr *nlh; + struct ifinfomsg *ifm; + + INFO_LOG("changing net namespace of interface %s to pid %d\n", + iface, (int)pid); + if (!(nl = get_and_bind_netlink_socket())) + return false; + + nlh = prepare_netlink_msg_header(buf); + nlh->nlmsg_type = RTM_NEWLINK; + + ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm)); + ifm->ifi_family = AF_UNSPEC; + ifm->ifi_change = 0; + ifm->ifi_flags = 0; + addattr_l(nlh, IFLA_NET_NS_PID, &pid, sizeof(pid)); + mnl_attr_put_str(nlh, IFLA_IFNAME, iface); + + if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) < 0) { + ERROR_LOG("%s: mnl_socket_sendto failed\n", iface); + return false; + } + mnl_socket_close(nl); + return true; +} + +#ifndef UNIX_PATH_MAX +#define UNIX_PATH_MAX (sizeof(((struct sockaddr_un *)0)->sun_path)) +#endif + +static bool init_unix_socket(const char 
*socket_path, int *socketfd, + struct sockaddr_un *sau) +{ + int fd; + + *socketfd = -1; + if (strlen(socket_path) + 1 >= UNIX_PATH_MAX) { + ERROR_LOG("socket path to long: %s\n", socket_path); + return false; + } + memset(sau, 0, sizeof(struct sockaddr_un)); + sau->sun_family = PF_UNIX; + sau->sun_path[0] = '\0'; /* use the abstract socket namespace */ + strcpy(sau->sun_path + 1, socket_path); + fd = socket(PF_UNIX, SOCK_STREAM, 0); + if (fd < 0) { + ERROR_LOG("socket: %m\n"); + return false; + } + *socketfd = fd; + return true; +} + +bool listen_on_unix_socket(const char *socket_path, int *result) +{ + struct sockaddr_un sau; + int fd, flags; + bool success = false; + + if (!init_unix_socket(socket_path, &fd, &sau)) + return false; + flags = fcntl(fd, F_GETFL); + if (flags < 0) { + ERROR_LOG("fcntl (F_GETFL): %m\n"); + goto fail; + } + flags = fcntl(fd, F_SETFL, ((long)flags) | O_NONBLOCK); + if (flags < 0) { + ERROR_LOG("fcntl (F_SETFL): %m\n"); + goto fail; + } + if (bind(fd, (struct sockaddr *)&sau, sizeof(sau)) < 0) { + ERROR_LOG("bind: %m\n"); + goto fail; + } + if (listen(fd , 5) < 0) { + ERROR_LOG("listen: %m\n"); + goto fail; + } + *result = fd; + NOTICE_LOG("listening on fd %d\n", fd); + return true; +fail: + close(fd); + return success; +} +/* + * Send a buffer and the credentials of the current process to a socket. + * + * buf must be zero-terminated. + * return the return value of the underlying call to sendmsg(). 
+ */ +static bool send_cred_buffer(int sock, char *buf) +{ + char control[255] __attribute__((__aligned__(8))); + struct msghdr msg; + struct cmsghdr *cmsg; + static struct iovec iov; + struct ucred c; + + /* Response data */ + iov.iov_base = buf; + iov.iov_len = strlen(buf) + 1; + c.pid = getpid(); + c.uid = getuid(); + c.gid = getgid(); + /* compose the message */ + memset(&msg, 0, sizeof(msg)); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = control; + msg.msg_controllen = sizeof(control); + /* attach the ucred struct */ + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_CREDENTIALS; + cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred)); + *(struct ucred *)CMSG_DATA(cmsg) = c; + msg.msg_controllen = cmsg->cmsg_len; + if (sendmsg(sock, &msg, 0) < 0) { + ERROR_LOG("sendmsg: %m\n"); + return false; + } + return true; +} + +static void dispose_fds(int *fds, unsigned num) +{ + int i; + + for (i = 0; i < num; i++) + close(fds[i]); +} + +/* Receive a buffer and the Unix credentials of the sending process. 
*/ +bool recv_cred_buffer(int socketfd, char *buf, size_t size, + int *clientfd, uid_t *uid) +{ + char control[255] __attribute__((__aligned__(8))); + struct msghdr msg; + struct cmsghdr *cmsg; + struct iovec iov; + int yes = 1, cfd, ret; + struct ucred cred; + struct sockaddr_un sau; + socklen_t sizeof_sau = sizeof(sau); + + ret = accept(socketfd, (struct sockaddr *)&sau, &sizeof_sau); + if (ret < 0) { + ERROR_LOG("accept: %m\n"); + return false; + } + cfd = ret; + setsockopt(cfd, SOL_SOCKET, SO_PASSCRED, &yes, sizeof(int)); + memset(&msg, 0, sizeof(msg)); + iov.iov_base = buf; + iov.iov_len = size; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = control; + msg.msg_controllen = sizeof(control); + if (recvmsg(cfd, &msg, 0) < 0) { + ERROR_LOG("recvmsg: %m\n"); + goto fail; + } + cmsg = CMSG_FIRSTHDR(&msg); + while (cmsg) { + if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type + == SCM_CREDENTIALS) { + memcpy(&cred, CMSG_DATA(cmsg), sizeof(struct ucred)); + *uid = cred.uid; + *clientfd = cfd; + return true; + } else + if (cmsg->cmsg_level == SOL_SOCKET + && cmsg->cmsg_type == SCM_RIGHTS) { + dispose_fds((int *)CMSG_DATA(cmsg), + (cmsg->cmsg_len - CMSG_LEN(0)) + / sizeof(int)); + } + cmsg = CMSG_NXTHDR(&msg, cmsg); + } +fail: + close(*clientfd); + *clientfd = -1; + return false; +} + +bool pass_fd(int passfd, int socketfd) +{ + struct msghdr msg = {.msg_iov = NULL}; + struct cmsghdr *cmsg; + char control[255] __attribute__((__aligned__(8))); + struct iovec iov; + char buf[] = "\0OK"; + + iov.iov_base = buf; + iov.iov_len = sizeof(buf); + + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + msg.msg_control = control; + msg.msg_controllen = sizeof(control); + + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + *(int *)CMSG_DATA(cmsg) = passfd; + + /* Sum of the length of all control messages in the buffer */ + msg.msg_controllen = cmsg->cmsg_len; + DEBUG_LOG("passing %s 
and fd %d\n", buf, passfd); + if (sendmsg(socketfd, &msg, 0) < 0) { + ERROR_LOG("sendmsg: %m\n"); + return false; + } + return true; +} + +static bool recv_fd(int socketfd, int *recvfd) +{ + char control[255] __attribute__((__aligned__(8))); + struct msghdr msg = {.msg_iov = NULL}; + struct cmsghdr *cmsg; + struct iovec iov; + char buf[100]; + ssize_t sz = sizeof(buf), ssz; + + *recvfd = -1; + iov.iov_base = buf; + iov.iov_len = sz - 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = control; + msg.msg_controllen = sizeof(control); + memset(buf, 0, sz); + ssz = recvmsg(socketfd, &msg, 0); + if (ssz < 0) { + ERROR_LOG("recvmsg: %m\n"); + return false; + } + buf[ssz] = '\0'; + INFO_LOG("server response: %u (%s)\n", (unsigned)buf[0], buf + 1); + for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) { + if (cmsg->cmsg_level != SOL_SOCKET + || cmsg->cmsg_type != SCM_RIGHTS) + continue; + if ((cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int) != 1) + continue; + *recvfd = *(int *)CMSG_DATA(cmsg); + return true; + } + return false; +} + +int request_fd(const char *socket_path, char *msg, int *result) +{ + struct sockaddr_un sau; + int socketfd, receivefd; + + if (!init_unix_socket(socket_path, &socketfd, &sau)) + die("could not init socket"); + if (connect(socketfd, (struct sockaddr *)&sau, sizeof(sau)) < 0) + die_errno("connect"); + if (!send_cred_buffer(socketfd, msg)) + die("could not send cred buffer"); + if (!recv_fd(socketfd, &receivefd)) + die("did not receive tty fd"); + NOTICE_LOG("received fd %d\n", receivefd); + *result = receivefd; + return socketfd; +} + +bool request_int(const char *socket_path, char *msg, int *result) +{ + struct sockaddr_un sau; + int socketfd; + bool success = false; + char buf[100]; + ssize_t ssz; + + *result = -1; + if (!init_unix_socket(socket_path, &socketfd, &sau)) + return false; + if (connect(socketfd, (struct sockaddr *)&sau, sizeof(sau)) < 0) { + ERROR_LOG("connect: %m\n"); + goto close; + } + if 
(!send_cred_buffer(socketfd, msg)) { + ERROR_LOG("could not send cred msg \"%s\"\n", msg); + goto close; + } + ssz = read(socketfd, buf, sizeof(buf) - 1); + if (ssz < 0) { + ERROR_LOG("did not receive integer: %m\n"); + goto close; + } + if (buf[0] != 0) { + ERROR_LOG("did not receive integer: %s\n", buf + 1); + goto close; + } + if (ssz != sizeof(int) + 1) { + ERROR_LOG("protocol mismatch, server msg: %s\n", buf + 1); + goto close; + } + memcpy(result, buf + 1, sizeof(int)); + DEBUG_LOG("received integer: %d\n", *result); + success = true; +close: + close(socketfd); + return success; +} + +int signal_pipe[2]; + +static void signal_handler(int signum) +{ + uint8_t u = signum; + int save_errno = errno; + assert(signum > 0 && signum < 256); + if (write(signal_pipe[1], &u, 1) < 0) + ERROR_LOG("write to signal pipe: %m\n"); + errno = save_errno; +} + +void init_signal_handling(void) +{ + struct sigaction act; + + if (pipe(signal_pipe) < 0) + die_errno("signal pipe"); + act.sa_handler = signal_handler; + sigemptyset(&act.sa_mask); + act.sa_flags = SA_RESTART; + if (sigaction(SIGINT, &act, NULL) < 0) + die_errno("sigaction"); + if (sigaction(SIGTERM, &act, NULL) < 0) + die_errno("sigaction"); + if (sigaction(SIGCHLD, &act, NULL) < 0) + die_errno("sigaction"); +} + +int next_signal(void) +{ + uint8_t u = 0; +again: + if (read(signal_pipe[0], &u, 1) < 0) { + if (errno != EINTR) + die_errno("read"); + goto again; + } + DEBUG_LOG("process %d received signal %u\n", getpid(), u); + return u; +} diff --git a/version-gen.sh b/version-gen.sh new file mode 100755 index 0000000..5e554ee --- /dev/null +++ b/version-gen.sh @@ -0,0 +1,27 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0-only + +package="$1" +version_file="$2" + +ver='unnamed_version' +# First try git, then gitweb, then default. 
if [ -e '.git' ] || [ -e '../.git' ]; then
	git_ver=$(git describe --abbrev=4 HEAD 2>/dev/null)
	[ -z "$git_ver" ] && git_ver="$ver"
	# update stat information in index to match working tree
	git update-index -q --refresh > /dev/null
	# if there are differences (exit code 1), the working tree is dirty
	git diff-index --quiet HEAD || git_ver=$git_ver-dirty
	ver=$git_ver
else
	# No git repository: if the last component of the current directory is
	# named <package>-<version>, as in an unpacked snapshot, take the
	# version from the directory name.
	case "${PWD##*/}" in
	"$package"-*) ver=${PWD##*/"$package"-} ;;
	esac
fi
# strip the leading "v" of tag names like v1.0.0
ver=${ver#v}

echo "$ver"
[ -z "${version_file}" ] && exit 0
# update version file if necessary
content="const char *${package}_version(void) {return \"$ver\";};"
# leave an up-to-date file alone so that its timestamp does not change
[ -r "$version_file" ] && echo "$content" | cmp -s - "$version_file" && exit 0
echo "$content" > "$version_file"