From e8cbe0823fdc68c668d8889d4c62d0f6bc0c29f8 Mon Sep 17 00:00:00 2001 From: Andre Noll Date: Fri, 13 Dec 2019 15:04:27 +0100 Subject: [PATCH] Initial commit. This project was started in late 2018. After 2 weeks the first feature complete version was ready. During 2019 the repo received only a moderate number of commits, mostly bug fixes, documentation improvements and the addition of non-essential features. As of version 0.9.0, the project was made public. All commits that led to this version have been discarded, so this repository contains only the final result as a single commit. --- .gitignore | 5 + Makefile | 253 ++++++ README | 1 + config.mak.in | 11 + configure | 12 + configure.ac | 47 ++ index.html.m4 | 64 ++ m7a.h | 118 +++ micoforia.c | 1996 ++++++++++++++++++++++++++++++++++++++++++++ micoforia.suite.m4 | 754 +++++++++++++++++ micoforia.svg | 26 + util.c | 1142 +++++++++++++++++++++++++ version-gen.sh | 27 + 13 files changed, 4456 insertions(+) create mode 100644 .gitignore create mode 100644 Makefile create mode 100644 README create mode 100644 config.mak.in create mode 100755 configure create mode 100644 configure.ac create mode 100644 index.html.m4 create mode 100644 m7a.h create mode 100644 micoforia.c create mode 100644 micoforia.suite.m4 create mode 100644 micoforia.svg create mode 100644 util.c create mode 100755 version-gen.sh diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..98c75c0 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +micoforia +micoforia.8 +build +*.swp +Makefile.local diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..4441e04 --- /dev/null +++ b/Makefile @@ -0,0 +1,253 @@ +# SPDX-License-Identifier: GPL-2.0-only +.SUFFIXES: +MAKEFLAGS += -Rr +ifeq ("$(origin CC)", "default") + CC := cc +endif +ifeq ("$(origin V)", "command line") + SAY = +else + SAY = @echo '$(strip $(1))' +endif + +.ONESHELL: +.SHELLFLAGS := -ec +PREFIX ?= /usr/local +INSTALL ?= install +MKDIR_P := mkdir -p +RM := rm -f 
+CHMOD := chmod +B := build +all := micoforia micoforia.8 +all: $(all) + +PACKAGE := micoforia +SLOGAN := Minimal Containers for Instant Availability +AUTHOR := Andre Noll +EMAIL := maan@tuebingen.mpg.de +COPYRIGHT_YEAR := 2019 +URL := http://people.tuebingen.mpg.de/maan/$(PACKAGE)/ +CLONE_URL := git://git.tuebingen.mpg.de/$(PACKAGE) +GITWEB_URL := http://git.tuebingen.mpg.de/$(PACKAGE).git +HOME_URL := http://people.tuebingen.mpg.de/maan/ +LICENSE := GNU GPL version 3 +LICENSE_URL := https://www.gnu.org/licenses/gpl-3.0-standalone.html +LOGLEVELS := LL_DEBUG,LL_INFO,LL_NOTICE,LL_WARNING,LL_ERROR,LL_CRIT,LL_EMERG + +units := micoforia util version micoforia.lsg +deps := $(addprefix $(B)/, $(addsuffix .d, $(units))) +objs := $(addprefix $(B)/, $(addsuffix .o, $(units))) + +ifeq ($(findstring clean, $(MAKECMDGOALS)),) +ifeq ($(findstring README, $(MAKECMDGOALS)),) +-include $(deps) +-include $(B)/config.mak +endif +endif + +XCPPFLAGS := +XCPPFLAGS += -I$(B) +XCPPFLAGS += -Wunused-macros +XCPPFLAGS += -DCOPYRIGHT_YEAR='"$(COPYRIGHT_YEAR)"' +XCPPFLAGS += -DPACKAGE='"$(PACKAGE)"' +XCPPFLAGS += -DAUTHOR='"$(AUTHOR)"' +XCPPFLAGS += -DEMAIL='"$(EMAIL)"' +XCPPFLAGS += -DURL='"$(URL)"' +XCPPFLAGS += -DCLONE_URL='"$(CLONE_URL)"' +XCPPFLAGS += -DGITWEB_URL='"$(GITWEB_URL)"' +XCPPFLAGS += -DHOME_URL='"$(HOME_URL)"' +XCPPFLAGS += -DGET_VERSION='$(PACKAGE)_version' +XCPPFLAGS += -DLOGLEVELS='$(LOGLEVELS)' +XCPPFLAGS += -DBUILD_DATE='"$(build_date)"' +XCPPFLAGS += -DCC_VERSION='"$(cc_version)"' +XCPPFLAGS += -DUNAME_RS='"$(uname_rs)"' +XCPPFLAGS += -DLICENSE='"$(LICENSE)"' +XCPPFLAGS += -DLICENSE_URL='"$(LICENSE_URL)"' + +XCFLAGS := +XCFLAGS += -fno-strict-aliasing +XCFLAGS += -g +XCFLAGS += -Os +XCFLAGS += -Wundef -W -Wuninitialized +XCFLAGS += -Wchar-subscripts +XCFLAGS += -Werror-implicit-function-declaration +XCFLAGS += -Wmissing-noreturn +XCFLAGS += -Wbad-function-cast +XCFLAGS += -Wredundant-decls +XCFLAGS += -Wno-sign-compare -Wno-unknown-pragmas +XCFLAGS += 
-Wdeclaration-after-statement +XCFLAGS += -Wformat -Wformat-security -Wmissing-format-attribute +XCFLAGS += -fsanitize=undefined +XCFLAGS += -fdata-sections -ffunction-sections +XCFLAGS += -Wstrict-prototypes +XCFLAGS += -Wshadow +XCFLAGS += -Wunused -Wall +XCFLAGS += -Wformat-signedness +XCFLAGS += -Wdiscarded-qualifiers + +XLDFLAGS := -lubsan -Wl,--gc-sections +version_file := $(B)/version.c +GIT_VERSION := $(shell $(MKDIR_P) $(B) && ./version-gen.sh $(PACKAGE) $(version_file)) + +CC_CMD = $(CC) -c -o $@ $(XCPPFLAGS) $(CPPFLAGS) \ + $(XCFLAGS) $(CFLAGS) -MMD -MF $(B)/$(*F).d -MT $@ + +$(objs): m7a.h $(B)/micoforia.lsg.h + +$(B): + @$(MKDIR_P) $@ + +$(B)/config.h.in: configure.ac | $(B) + $(call SAY, AH $<) + cd $(B) + autoheader -f ../configure.ac +$(B)/configure.sh: configure.ac | $(B) + $(call SAY, AC $<) + cd $(B) + autoconf ../configure.ac > configure.sh + $(CHMOD) 755 configure.sh +$(B)/config.status: $(B)/configure.sh | $(B) + $(call SAY, SH $<) + cd $(B) + if test -x config.status; then \ + ./config.status --quiet --recheck; \ + else \ + ./configure.sh --no-create; \ + fi +$(B)/config.mak $(B)/config.h: $(B)/config.status config.mak.in $(B)/config.h.in + $(call SAY, CS $@) + cd $(B) + ln -f ../config.mak.in + ./config.status -q + test -f config.h && touch config.h + +define DESCRIPTION1 := + $(PACKAGE) is a lightweight container implementation for Linux. + It consists of a single program which reads a single configuration + file that describes all containers. $(PACKAGE) was written with + performance and simplicity in mind, and is designed for trusted + in-house web application hosting. +endef + +define DESCRIPTION2 := + Like other container frameworks, $(PACKAGE) employs Linux namespaces + for isolation and cgroup controllers to limit the resource utilization + of the containers. Networking is implemented through bridging and + virtual ethernet device pairs. There is built-in support for the cpu, + memory, I/O and device controllers. 
Further customization is possible + via startup hooks. For example, the startup hook could activate + additional cgroup controllers, make the container enter a different + namespace, and mount additional file systems. +endef + +define DESCRIPTION3 := + The micoforia program supports a couple of subcommands. Besides + the start subcommand which starts one or more containers, there are + subcommands for listing, killing or rebooting containers. +endef + +# dependency on config.mak is because the command below depends on $(M4) +$(B)/index.html $(B)/micoforia.suite: $(B)/%: %.m4 Makefile $(B)/config.mak + $(call SAY, M4 $<) + $(M4) -D "AUTHOR=$(AUTHOR)" -D "COPYRIGHT_YEAR=$(COPYRIGHT_YEAR)" \ + -D "PACKAGE=$(PACKAGE)" \ + -D "SLOGAN=$(SLOGAN)" \ + -D "EMAIL=$(EMAIL)" \ + -D "URL=$(URL)" \ + -D "CLONE_URL=$(CLONE_URL)" \ + -D "GITWEB_URL=$(GITWEB_URL)" \ + -D "HOME_URL=$(HOME_URL)" \ + -D "LICENSE=$(LICENSE)" \ + -D "LICENSE_URL=$(LICENSE_URL)" \ + -D "DESCRIPTION1=$(DESCRIPTION1)" \ + -D "DESCRIPTION2=$(DESCRIPTION2)" \ + -D "DESCRIPTION3=$(DESCRIPTION3)" $< > $@ +$(B)/%.lsg.c: $(B)/%.suite + $(call SAY, LSGC $<) + $(LOPSUBGEN) --gen-c --output-dir $(B) < $< +$(B)/%.lsg.h: $(B)/%.suite + $(call SAY, LSGH $<) + $(LOPSUBGEN) --gen-header --output-dir $(B) < $< +%.8: $(B)/%.suite $(B)/version.c + $(call SAY, LSGM $<) + $(LOPSUBGEN) --gen-man=$(*F).8 --version-string $(GIT_VERSION) < $< + +$(B)/%.o: %.c | $(B) + $(call SAY, CC $<) + $(CC_CMD) $< +$(B)/%.o: $(B)/%.c + $(call SAY, CC $<) + $(CC_CMD) $< +micoforia: $(objs) + $(call SAY, LD $@) + $(CC) -o $@ $^ $(XLDFLAGS) $(LDFLAGS) -llopsub -lmnl -lutil -lcap + +mandir := $(datarootdir)/man/man8 +INSTALL ?= install +INSTALL_PROGRAM ?= $(INSTALL) -m 755 +INSTALL_DATA ?= $(INSTALL) -m 644 +ifneq ($(findstring strip, $(MAKECMDGOALS)),) + strip_option := -s +endif +install install-strip: all + $(MKDIR_P) $(DESTDIR)$(sbindir) $(DESTDIR)$(mandir) + $(INSTALL_PROGRAM) $(strip_option) micoforia $(DESTDIR)$(sbindir) + $(INSTALL_DATA) 
micoforia.8 $(DESTDIR)$(mandir) + +clean: + $(RM) $(B)/*.o $(all) +distclean: clean + $(RM) -r $(B) +maintainer-clean: + git clean -dfqx > /dev/null 2>&1 + +define README := +$(PACKAGE) - $(SLOGAN) + +$(DESCRIPTION1) + +$(DESCRIPTION2) + +$(DESCRIPTION3) + +Resources +~~~~~~~~~ +| web page: $(URL) +| git clone URL: $(CLONE_URL) +| gitweb: $(GITWEB_URL) +| author's home page: $(HOME_URL) +| Send feedback to: $(AUTHOR) <$(EMAIL)> + +License +~~~~~~~ +Open source, licensed under the $(LICENSE). + +Documentation +~~~~~~~~~~~~~ +See micoforia.suite.m4. Or build the man page with \"make\" and run +\"man -l micoforia.8\". + +Dependencies +~~~~~~~~~~~~ +This package requires m4, autoconf, gnu make, gcc or clang, and +lopsub. The configure script checks if all dependencies are installed +and prints a meaningful error message if one of them is missing. + +Building +~~~~~~~~ +Run \"make\" to build the package with the default settings. Run +\"./configure -h\" to list configuration options. + +Installation +~~~~~~~~~~~~ +Run \"sudo make install\" to install to /usr/local. To install to +/somewhere/else, run \"./configure --prefix /somewhere/else && make\" +first. +endef + +README: + @printf '%s\n' "$(README)" + +.PRECIOUS: $(B)/%.lsg.c $(B)/%.lsg.h $(B)/%.8 +.PHONY: all clean install install-strip distclean maintainer-clean README +-include Makefile.local diff --git a/README b/README new file mode 100644 index 0000000..52a1fd7 --- /dev/null +++ b/README @@ -0,0 +1 @@ +Run "make README". 
diff --git a/config.mak.in b/config.mak.in new file mode 100644 index 0000000..ee258b4 --- /dev/null +++ b/config.mak.in @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: GPL-2.0-only + +prefix := @prefix@ +exec_prefix := @exec_prefix@ + +# These two use prefix and exec_prefix +sbindir := @sbindir@ +datarootdir := @datarootdir@ + +LOPSUBGEN := @LOPSUBGEN@ +M4 := @M4@ diff --git a/configure b/configure new file mode 100755 index 0000000..ad2ec3f --- /dev/null +++ b/configure @@ -0,0 +1,12 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0-only + +set -e + +mkdir -p build +cd build +autoconf ../configure.ac > configure.sh +chmod 755 configure.sh +ln -f ../config.mak.in +autoheader ../configure.ac +sh configure.sh "$@" diff --git a/configure.ac b/configure.ac new file mode 100644 index 0000000..e29968f --- /dev/null +++ b/configure.ac @@ -0,0 +1,47 @@ +# SPDX-License-Identifier: GPL-2.0-only + +AC_PREREQ([2.61]) +# only for configure -h, see Makefile +AC_INIT([software], [packages]) +AC_CONFIG_HEADERS([config.h]) +AC_CONFIG_FILES([config.mak]) +AC_USE_SYSTEM_EXTENSIONS +AC_PROG_CC +AC_PROG_CPP + +AC_DEFUN([REQUIRE_EXECUTABLE], [ + AC_PATH_PROG(m4_toupper([$1]), [$1]) + test -z "$m4_toupper([$1])" && AC_MSG_ERROR([$2]) +]) +REQUIRE_EXECUTABLE([m4], [m4 is required to build this package]) + +AC_DEFUN([LOPSUB_NOT_FOUND], [ +The lopsub library is required to build this software, but the checks +indicate it is not installed on your system. Run the following +command to download a copy. + git clone git://git.tuebingen.mpg.de/lopsub.git +Install the library, then run this configure script again. + +If you installed lopsub at a non-standard location, make sure to set +PATH, CPPFLAGS and LDFLAGS accordingly. 
For example: + + pfx=/prefix/where/lopsub/is/installed + export PATH=\$pfx/bin:\$PATH + export CPPFLAGS=-I\$pfx/include + export LDFLAGS=-L\$pfx/lib +]) +REQUIRE_EXECUTABLE([lopsubgen], [LOPSUB_NOT_FOUND()]) +AC_CHECK_HEADER(lopsub.h, [], [AC_MSG_ERROR([LOPSUB_NOT_FOUND()])]) +AC_CHECK_LIB([lopsub], [lls_merge], [], [AC_MSG_ERROR([LOPSUB_NOT_FOUND()])]) + +AC_DEFUN([LIBCAP_NOT_FOUND], [the libcap library is required to build dnl +this software. Package: libcap-dev]) +AC_CHECK_HEADER([sys/capability.h], [], [AC_MSG_ERROR([LIBCAP_NOT_FOUND()])]) +AC_CHECK_LIB([cap], [cap_from_text], [], [AC_MSG_ERROR([LIBCAP_NOT_FOUND()])]) + +AC_DEFUN([LIBMNL_NOT_FOUND], [the libmnl library is required to build dnl +this software. Package: libmnl-dev]) +AC_CHECK_HEADER([libmnl/libmnl.h], [], [AC_MSG_ERROR([LIBMNL_NOT_FOUND()])]) +AC_CHECK_LIB([mnl], [mnl_socket_open], [], [AC_MSG_ERROR([LIBMNL_NOT_FOUND()])]) + +AC_OUTPUT diff --git a/index.html.m4 b/index.html.m4 new file mode 100644 index 0000000..a0d8ecc --- /dev/null +++ b/index.html.m4 @@ -0,0 +1,64 @@ +dnl SPDX-License-Identifier: GPL-2.0-only + + + + + + PACKAGE() + + + + + + + + +
+

+ PACKAGE() - SLOGAN() +

+
+ + +
+

DESCRIPTION1()

+

DESCRIPTION2()

+

DESCRIPTION3()

+ +

Resources

+ + +

License

+ Open source, licensed under the LICENSE() + +

Documentation

+ See the manual page for details. + +

Programming Language

+ Plain C. + +

Dependencies

+ A working C compiler and a couple of other dependencies, + most of which are standard (autoconf, make, m4, + libmnl, libcap). The notable exception is the lopsub library. + + diff --git a/m7a.h b/m7a.h new file mode 100644 index 0000000..77c3cad --- /dev/null +++ b/m7a.h @@ -0,0 +1,118 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "config.h" + +#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) + +#define CMD_PTR(_cname) lls_cmd(LSG_MICOFORIA_CMD_ ## _cname, micoforia_suite) +#define OPT_RESULT(_cname, _oname) (lls_opt_result(\ + LSG_MICOFORIA_ ## _cname ## _OPT_ ## _oname, \ + (CMD_PTR(_cname) == CMD_PTR(MICOFORIA))? lpr : sublpr)) +#define OPT_GIVEN(_cname, _oname) (lls_opt_given(OPT_RESULT(_cname, _oname))) +#define OPT_UINT32_VAL_N(_n, _cname, _oname) (lls_uint32_val(_n, \ + OPT_RESULT(_cname, _oname))) +#define OPT_UINT32_VAL(_cname, _oname) (OPT_UINT32_VAL_N(0, _cname, _oname)) +#define OPT_STRING_VAL_N(_n, _cname, _oname) (lls_string_val(_n, \ + OPT_RESULT(_cname, _oname))) +#define OPT_STRING_VAL(_cname, _oname) (OPT_STRING_VAL_N(0, _cname, _oname)) + +struct micoforia_user_data {bool (*handler)(void);}; +#define EXPORT_CMD_HANDLER(_cmd) const struct micoforia_user_data \ + lsg_micoforia_com_ ## _cmd ## _user_data = { \ + .handler = com_ ## _cmd \ + }; + + +__attribute__ ((warn_unused_result)) +void *xrealloc(void *p, size_t size); + +__attribute__ ((warn_unused_result)) +void *xmalloc(size_t size); + +__attribute__ ((warn_unused_result)) +void *xzmalloc(size_t size); + +void *xstrdup(const char *s); +char *xstrcat(char *a, const char *b); + +__attribute__ ((format (printf, 1, 2))) __attribute__ ((warn_unused_result)) +char *msg(const char *fmt, ...); + +enum loglevels {LOGLEVELS, NUM_LOGLEVELS}; +extern unsigned loglevel_arg_val; + 
+__attribute__ ((format (printf, 2, 3))) +void m7a_log(int ll, const char* fmt,...); + +#define DEBUG_LOG(f,...) m7a_log(LL_DEBUG, "%s: " f, __FUNCTION__, ## __VA_ARGS__) +#define INFO_LOG(f,...) m7a_log(LL_INFO, "%s: " f, __FUNCTION__, ## __VA_ARGS__) +#define NOTICE_LOG(f,...) m7a_log(LL_NOTICE, "%s: " f, __FUNCTION__, ## __VA_ARGS__) +#define WARNING_LOG(f,...) m7a_log(LL_WARNING, "%s: " f, __FUNCTION__, ## __VA_ARGS__) +#define ERROR_LOG(f,...) m7a_log(LL_ERROR, "%s: " f, __FUNCTION__, ## __VA_ARGS__) +#define CRIT_LOG(f,...) m7a_log(LL_CRIT, "%s: " f, __FUNCTION__, ## __VA_ARGS__) +#define EMERG_LOG(f,...) m7a_log(LL_EMERG, "%s: " f, __FUNCTION__, ## __VA_ARGS__) + +__attribute__ ((noreturn)) +__attribute__ ((format (printf, 1, 2))) +void die(const char *fmt, ...); + +__attribute__ ((noreturn)) +__attribute__ ((format (printf, 1, 2))) +void die_errno(const char *fmt, ...); + +__attribute__ ((noreturn)) +void die_empty_arg(const char *opt); + +void check_range(uint32_t val, uint32_t min, uint32_t max, const char *opt); + +bool xexec(char * const argv[], const struct iovec *iov); +void valid_fd012(void); +void check_name(const char *arg); +void parse_compound_arg(const char *arg, const char *opt, char **name, char **val); +char *parse_cgroup_acl(const char *arg); +char *make_hwaddr(const char *name, const char *bridge); +void parse_ifspec(const char *arg, char **bridge, uint8_t *hwaddr); +uint32_t atou32(const char *str, const char *opt); +bool remove_subdirs_recursively(const char *path); +void daemonize(const char *logfile); +bool acquire_lock(const char *string); +bool try_lock(const char *string, pid_t *pid); +bool release_lock(const char *string); +bool is_locked(const char *string, pid_t *pid); +bool attach_to_bridge(const char *iface, const char *bridge); +bool rename_interface(const char *before, const char *after); +void pretty_print_hwaddr(const uint8_t *hwaddr, char *result); +bool set_hwaddr(const char *iface, const uint8_t *hwaddr); +bool 
link_del(const char *iface); +bool link_up(const char *iface); +bool create_veth_device_pair(const char *name, char *peer); +bool set_netns(const char *iface, pid_t pid); +int request_fd(const char *socket_path, char *msg, int *result); +bool request_int(const char *socket_path, char *msg, int *result); +bool listen_on_unix_socket(const char *socket_path, int *result); +bool recv_cred_buffer(int socketfd, char *buf, size_t size, + int *clientfd, uid_t *uid); +bool pass_fd(int passfd, int socketfd); + +extern int signal_pipe[2]; +void init_signal_handling(void); +int next_signal(void); diff --git a/micoforia.c b/micoforia.c new file mode 100644 index 0000000..4d267ec --- /dev/null +++ b/micoforia.c @@ -0,0 +1,1996 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#include "m7a.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "micoforia.lsg.h" + +static struct lls_parse_result *lpr, *sublpr; +unsigned loglevel_arg_val = 4; + +struct ifspec { + char *bridge; + uint8_t hwaddr[6]; +}; + +struct container { + char *name; + char *pre_start_hook; + char *pre_exec_hook; + char *root_dir; + char *init; + struct ifspec *ifspec; + /* this is never zero, even if no ifspec was given */ + unsigned num_ifspecs; + char **dacl; + unsigned num_dac_entries; + char **io_max; + unsigned num_io_max_entries; + /* ~0U: not given, 0: unlimited */ + unsigned cpu_cores; + unsigned memory_limit; + /* ~0U: not given */ + unsigned init_type; + cap_value_t *capdrop; + unsigned num_capdrops; + uint32_t *tty; + unsigned num_ttys; +}; + +static struct container **container; +static unsigned num_containers; + +struct container_runtime { + int pipe1[2], pipe2[2]; /* for startup communication */ + uint32_t *tty; + unsigned num_ttys; + int *master, *slave, *client; + + int init_pid; /* in the parent namespace */ + char *pts, *root, *dev; + int socket_fd; +}; + +static char **default_dacl, **default_io_max; +unsigned 
num_default_dac_entries, num_default_io_max_entries; +static cap_value_t *default_capdrop; +unsigned num_default_capdrops; +uint32_t *default_tty; +unsigned num_default_ttys; +static const struct lls_command *subcmd; +/* does not allocate memory */ +void m7a_log(int ll, const char* fmt,...) +{ + va_list argp; + + if (ll < loglevel_arg_val) + return; + va_start(argp, fmt); + if (subcmd == lls_cmd(LSG_MICOFORIA_CMD_START, micoforia_suite)) { + char str[100]; + struct timespec t; + struct tm *tm; + assert(clock_gettime(CLOCK_REALTIME, &t) == 0); + tm = localtime(&t.tv_sec); + strftime(str, sizeof(str), "%b %d %H:%M:%S", tm); + fprintf(stderr, "%s:%04lu ", str, + (long unsigned)t.tv_nsec / 1000 / 1000); + fprintf(stderr, "(%u) ", (unsigned)getpid()); + } + vfprintf(stderr, fmt, argp); + va_end(argp); +} + +static void die_lopsub(int lopsub_ret, char **errctx) +{ + const char *m = lls_strerror(-lopsub_ret); + if (*errctx) + ERROR_LOG("%s: %s\n", *errctx, m); + else + ERROR_LOG("%s\n", m); + free(*errctx); + *errctx = NULL; + die("lopsub error"); +} + +#define FOR_EACH_CONTAINER(_c) for ( \ + struct container **_cp = container; \ + ((_c) = *(_cp)); \ + (_cp)++, (_c) = *(_cp) \ +) + +static struct container *get_container(const char *name) +{ + struct container *c; + FOR_EACH_CONTAINER(c) { + if (!strcmp(c->name, name)) + return c; + } + return NULL; +} + +static struct container *get_or_append_container(const char *name) +{ + struct container *c = get_container(name); + if (c) + return c; + container = xrealloc(container, + (++num_containers + 1) * sizeof(struct container *)); + c = container[num_containers - 1] = xzmalloc(sizeof(struct container)); + c->name = xstrdup(name); + /* ~0U means: not given */ + c->cpu_cores = ~0U; + c->memory_limit = ~0U; + c->init_type = ~0U; + container[num_containers] = NULL; + return c; +} + +static unsigned get_container_ttys(const struct container *c, uint32_t **result) +{ + static uint32_t dflt = {1}; + if (c->num_ttys > 0) { + *result 
= c->tty; + return c->num_ttys; + } + if (num_default_ttys > 0) { + *result = default_tty; + return num_default_ttys; + } + *result = &dflt; + return 1; +} + +enum clo_given_counter { + CLOGC_DEFAULT_CGROUP_DAC, + CLOGC_CGROUP_DAC, + CLOGC_DEFAULT_IO_MAX, + CLOGC_IO_MAX, + NUM_CLOGCS +}; + +static unsigned clo_given_counter[NUM_CLOGCS]; + +static void append_dac_entry(const char *arg, char ***listp, unsigned *count) +{ + char *val = parse_cgroup_acl(arg); + (*count)++; + *listp = xrealloc(*listp, (*count + 1) * sizeof(char *)); + (*listp)[*count - 1] = val; + (*listp)[*count] = NULL; +} + +static void append_io_max_entry(const char *arg, char ***listp, unsigned *count) +{ + (*count)++; + *listp = xrealloc(*listp, (*count + 1) * sizeof(char *)); + (*listp)[*count - 1] = xstrdup(arg); + (*listp)[*count] = NULL; +} + +static void check_options(void) +{ + unsigned n, m; + const char *arg; + char *name, *val; + struct container *c; + uint32_t u32; + + container = xzmalloc(sizeof(struct container *)); + /* loop backwards to let command line opts override config file opts */ + for (n = OPT_GIVEN(MICOFORIA, CONTAINER) - 1; n != ~0U; n--) { + arg = OPT_STRING_VAL_N(n, MICOFORIA, CONTAINER); + check_name(arg); + get_or_append_container(arg); + } + for (n = OPT_GIVEN(MICOFORIA, PRE_START_HOOK) - 1; n != ~0U; n--) { + arg = OPT_STRING_VAL_N(n, MICOFORIA, PRE_START_HOOK); + parse_compound_arg(arg, "pre-start-hook", &name, &val); + c = get_or_append_container(name); + free(name); + free(c->pre_start_hook); + c->pre_start_hook = val; + } + for (n = OPT_GIVEN(MICOFORIA, PRE_EXEC_HOOK) - 1; n != ~0U; n--) { + arg = OPT_STRING_VAL_N(n, MICOFORIA, PRE_EXEC_HOOK); + parse_compound_arg(arg, "pre-exec-hook", &name, &val); + c = get_or_append_container(name); + free(name); + free(c->pre_exec_hook); + c->pre_exec_hook = val; + } + for (n = OPT_GIVEN(MICOFORIA, CAPDROP) - 1; n != ~0U; n--) { + cap_value_t cap_val; + arg = OPT_STRING_VAL_N(n, MICOFORIA, CAPDROP); + parse_compound_arg(arg, 
"capabilities", &name, &val); + c = get_or_append_container(name); + if (cap_from_name(val, &cap_val) < 0) + die_errno("%s: invalid capability: %s", name, val); + c->capdrop = xrealloc(c->capdrop, + ++c->num_capdrops * sizeof(cap_value_t)); + c->capdrop[c->num_capdrops - 1] = cap_val; + free(name); + free(val); + } + for (n = 0; n < OPT_GIVEN(MICOFORIA, DEFAULT_CAPDROP); n++) { + cap_value_t cap_val; + arg = OPT_STRING_VAL_N(n, MICOFORIA, DEFAULT_CAPDROP); + if (cap_from_name(arg, &cap_val) < 0) + die_errno("invalid default capability: %s", arg); + default_capdrop = xrealloc(default_capdrop, + ++num_default_capdrops * sizeof(cap_value_t)); + default_capdrop[num_default_capdrops - 1] = cap_val; + } + for (n = OPT_GIVEN(MICOFORIA, TTY) - 1; n != ~0U; n--) { + uint32_t minor; + arg = OPT_STRING_VAL_N(n, MICOFORIA, TTY); + parse_compound_arg(arg, "tty", &name, &val); + c = get_or_append_container(name); + minor = atou32(val, "tty"); + if (minor == 0) + die("can not capture tty0"); + c->tty = xrealloc(c->tty, ++c->num_ttys * sizeof(uint32_t)); + c->tty[c->num_ttys - 1] = minor; + free(name); + free(val); + } + for (n = 0; n < OPT_GIVEN(MICOFORIA, DEFAULT_TTY); n++) { + uint32_t minor = OPT_UINT32_VAL_N(n, MICOFORIA, DEFAULT_TTY); + if (minor == 0) + die("can not capture tty0"); + default_tty = xrealloc(default_tty, + ++num_default_ttys * sizeof(uint32_t)); + default_tty[num_default_ttys - 1] = minor; + } + + for (n = OPT_GIVEN(MICOFORIA, ROOT_DIRECTORY) - 1; n != ~0U ; n--) { + arg = OPT_STRING_VAL_N(n, MICOFORIA, ROOT_DIRECTORY); + parse_compound_arg(arg, "root-directory", &name, &val); + c = get_or_append_container(name); + free(name); + free(c->root_dir); + c->root_dir = val; + } + u32 = OPT_UINT32_VAL(MICOFORIA, DEFAULT_CPU_CORES); + check_range(u32, 0, 65536, "default-cpu-cores"); + for (n = OPT_GIVEN(MICOFORIA, CPU_CORES) - 1; n != ~0U ; n--) { + arg = OPT_STRING_VAL_N(n, MICOFORIA, CPU_CORES); + parse_compound_arg(arg, "cpu-cores", &name, &val); + c = 
get_or_append_container(name); + free(name); + u32 = atou32(val, "cpu-cores"); + free(val); + check_range(u32, 0, 65536, "cpu-cores"); + c->cpu_cores = u32; + } + u32 = OPT_UINT32_VAL(MICOFORIA, DEFAULT_MEMORY_LIMIT); + check_range(u32, 0, 1024 * 1024, "default-memory-limit"); + for (n = OPT_GIVEN(MICOFORIA, MEMORY_LIMIT) - 1; n != ~0U ; n--) { + arg = OPT_STRING_VAL_N(n, MICOFORIA, MEMORY_LIMIT); + parse_compound_arg(arg, "memory-limit", &name, &val); + c = get_or_append_container(name); + free(name); + u32 = atou32(val, "memory-limit"); + free(val); + check_range(u32, 0, 1024 * 1024, "memory-limit"); + c->memory_limit = u32; + } + for (n = OPT_GIVEN(MICOFORIA, INIT) - 1; n != ~0U ; n--) { + arg = OPT_STRING_VAL_N(n, MICOFORIA, INIT); + parse_compound_arg(arg, "init", &name, &val); + c = get_or_append_container(name); + free(name); + free(c->init); + c->init = val; + } + for (n = 0; n < OPT_GIVEN(MICOFORIA, NET); n++) { + struct ifspec *ifspec; + arg = OPT_STRING_VAL_N(n, MICOFORIA, NET); + parse_compound_arg(arg, "net", &name, &val); + c = get_or_append_container(name); + free(name); + c->ifspec = xrealloc(c->ifspec, + ++c->num_ifspecs * sizeof(struct ifspec)); + ifspec = c->ifspec + c->num_ifspecs - 1; + parse_ifspec(val, &ifspec->bridge, ifspec->hwaddr); + free(val); + } + + m = clo_given_counter[CLOGC_DEFAULT_CGROUP_DAC]; + for (n = m; n < OPT_GIVEN(MICOFORIA, DEFAULT_CGROUP_DAC); n++) { + arg = OPT_STRING_VAL_N(n, MICOFORIA, DEFAULT_CGROUP_DAC); + append_dac_entry(arg, &default_dacl, &num_default_dac_entries); + } + for (n = 0; n < m; n++) { + arg = OPT_STRING_VAL_N(n, MICOFORIA, DEFAULT_CGROUP_DAC); + append_dac_entry(arg, &default_dacl, &num_default_dac_entries); + } + m = clo_given_counter[CLOGC_CGROUP_DAC]; + for (n = m; n < OPT_GIVEN(MICOFORIA, CGROUP_DAC); n++) { + arg = OPT_STRING_VAL_N(n, MICOFORIA, CGROUP_DAC); + parse_compound_arg(arg, "cgroup-dac", &name, &val); + c = get_or_append_container(name); + free(name); + append_dac_entry(val, &c->dacl, 
&c->num_dac_entries); + free(val); + } + for (n = 0; n < m; n++) { + arg = OPT_STRING_VAL_N(n, MICOFORIA, CGROUP_DAC); + parse_compound_arg(arg, "cgroup-dac", &name, &val); + c = get_or_append_container(name); + free(name); + append_dac_entry(val, &c->dacl, &c->num_dac_entries); + free(val); + } + + m = clo_given_counter[CLOGC_DEFAULT_IO_MAX]; + for (n = m; n < OPT_GIVEN(MICOFORIA, DEFAULT_IO_MAX); n++) { + arg = OPT_STRING_VAL_N(n, MICOFORIA, DEFAULT_IO_MAX); + append_io_max_entry(arg, &default_io_max, &num_default_io_max_entries); + } + for (n = 0; n < m; n++) { + arg = OPT_STRING_VAL_N(n, MICOFORIA, DEFAULT_IO_MAX); + append_io_max_entry(arg, &default_io_max, &num_default_io_max_entries); + } + m = clo_given_counter[CLOGC_IO_MAX]; + for (n = m; n < OPT_GIVEN(MICOFORIA, IO_MAX); n++) { + arg = OPT_STRING_VAL_N(n, MICOFORIA, IO_MAX); + parse_compound_arg(arg, "io-max", &name, &val); + c = get_or_append_container(name); + free(name); + append_io_max_entry(val, &c->io_max, &c->num_io_max_entries); + free(val); + } + for (n = 0; n < m; n++) { + arg = OPT_STRING_VAL_N(n, MICOFORIA, IO_MAX); + parse_compound_arg(arg, "io-max", &name, &val); + c = get_or_append_container(name); + free(name); + append_io_max_entry(val, &c->io_max, &c->num_io_max_entries); + free(val); + } + + /* init default c->ifspec[] */ + FOR_EACH_CONTAINER(c) { + if (c->num_ifspecs == 0) { + const char *br = OPT_STRING_VAL(MICOFORIA, DEFAULT_BRIDGE); + c->num_ifspecs = 1; + c->ifspec = xmalloc(sizeof(struct ifspec)); + c->ifspec[0].bridge = xstrdup(br); + memset(c->ifspec[0].hwaddr, 0, 6); + continue; + } + } +} + +static void show_subcommand_summary(bool verbose) +{ + int i; + +#define LSG_MICOFORIA_CMD(_name) #_name + static const char * const subcommand_names[] = {LSG_MICOFORIA_SUBCOMMANDS NULL}; +#undef LSG_MICOFORIA_CMD + printf("Available subcommands:\n"); + if (verbose) { + const struct lls_command *cmd; + for (i = 1; (cmd = lls_cmd(i, micoforia_suite)); i++) { + const char *purpose = 
lls_purpose(cmd); + const char *name = lls_command_name(cmd); + printf("%-12s%s\n", name, purpose); + } + } else { + unsigned n = 8; + printf("\t"); + for (i = 0; i < LSG_NUM_MICOFORIA_SUBCOMMANDS; i++) { + if (i > 0) + n += printf(", "); + if (n > 70) { + printf("\n\t"); + n = 8; + } + n += printf("%s", subcommand_names[i]); + } + printf("\n"); + } +} + +const char *GET_VERSION(void); +static void handle_version_and_help(void) +{ + char *help; + + if (OPT_GIVEN(MICOFORIA, VERSION)) { + printf(PACKAGE " %s\n" + "Copyright (C) " COPYRIGHT_YEAR " " AUTHOR ".\n" + "License: " LICENSE " <" LICENSE_URL ">.\n" + "This is free software: you are free to change and redistribute it.\n" + "There is NO WARRANTY, to the extent permitted by law.\n" + "\n" + "Web page: " URL "\n" + "Clone URL: " CLONE_URL "\n" + "Gitweb: " GITWEB_URL "\n" + "Author's Home Page: " HOME_URL "\n" + "Send feedback to: " AUTHOR " <" EMAIL ">\n" + , + GET_VERSION() + ); + exit(EXIT_SUCCESS); + } + if (OPT_GIVEN(MICOFORIA, DETAILED_HELP)) + help = lls_long_help(CMD_PTR(MICOFORIA)); + else if (OPT_GIVEN(MICOFORIA, HELP)) + help = lls_short_help(CMD_PTR(MICOFORIA)); + else if (lls_num_inputs(lpr) == 0) { + show_subcommand_summary(true /* verbose */); + exit(EXIT_SUCCESS); + } else + return; + printf("%s\n", help); + free(help); + exit(EXIT_SUCCESS); +} + +static char *get_config_file_path(void) +{ + struct passwd *pw; + const char *home; + + if (OPT_GIVEN(MICOFORIA, CONFIG_FILE)) + return xstrdup(OPT_STRING_VAL(MICOFORIA, CONFIG_FILE)); + pw = getpwuid(getuid()); + home = pw? 
pw->pw_dir : "/root"; + return msg("%s/.micoforiarc", home); +} + +static void parse_options(int argc, char **argv, const struct lls_command *cmd, + struct lls_parse_result **lprp) +{ + int ret, fd = -1; + char *config_file; + struct stat statbuf; + void *map; + size_t sz; + int cf_argc; + char **cf_argv, *errctx = NULL; + const char *subcmd_name; + struct lls_parse_result *merged_lpr, *cf_lpr; + + ret = lls_parse(argc, argv, cmd, lprp, &errctx); + if (ret < 0) + die_lopsub(ret, &errctx); + handle_version_and_help(); + clo_given_counter[CLOGC_DEFAULT_CGROUP_DAC] = OPT_GIVEN(MICOFORIA, + DEFAULT_CGROUP_DAC); + clo_given_counter[CLOGC_CGROUP_DAC] = OPT_GIVEN(MICOFORIA, CGROUP_DAC); + clo_given_counter[CLOGC_DEFAULT_IO_MAX] = + OPT_GIVEN(MICOFORIA, DEFAULT_IO_MAX); + clo_given_counter[CLOGC_IO_MAX] = OPT_GIVEN(MICOFORIA, IO_MAX); + config_file = get_config_file_path(); + ret = open(config_file, O_RDONLY); + if (ret < 0) { + if (errno != ENOENT || OPT_GIVEN(MICOFORIA, CONFIG_FILE)) + die_errno("can not open config file %s", config_file); + /* no config file -- nothing to do */ + ret = 0; + goto success; + } + fd = ret; + ret = fstat(fd, &statbuf); + if (ret < 0) + die_errno("failed to stat config file %s", config_file); + sz = statbuf.st_size; + if (sz == 0) { /* config file is empty -- nothing to do */ + ret = 0; + goto success; + } + map = mmap(NULL, sz, PROT_READ, MAP_PRIVATE, fd, 0); + if (map == MAP_FAILED) + die_errno("failed to mmap config file %s", config_file); + subcmd_name = (cmd == CMD_PTR(MICOFORIA))? 
NULL : lls_command_name(cmd); + ret = lls_convert_config(map, sz, subcmd_name, &cf_argv, + &errctx); + munmap(map, sz); + if (ret < 0) { + ERROR_LOG("failed to convert config file %s\n", config_file); + die_lopsub(ret, &errctx); + } + cf_argc = ret; + ret = lls_parse(cf_argc, cf_argv, cmd, &cf_lpr, &errctx); + lls_free_argv(cf_argv); + if (ret < 0) + die_lopsub(ret, &errctx); + /* command line options override config file options */ + ret = lls_merge(*lprp, cf_lpr, cmd, &merged_lpr, &errctx); + if (ret < 0) + die_lopsub(ret, &errctx); + lls_free_parse_result(cf_lpr, cmd); + lls_free_parse_result(*lprp, cmd); + *lprp = merged_lpr; +success: + if (fd >= 0) + close(fd); + free(config_file); +} + +static const char *get_pre_start_hook(const struct container *c) +{ + if (c->pre_start_hook) + return c->pre_start_hook; + return OPT_STRING_VAL(MICOFORIA, DEFAULT_PRE_START_HOOK); +} + +static const char *get_pre_exec_hook(const struct container *c) +{ + if (c->pre_exec_hook) + return c->pre_exec_hook; + return OPT_STRING_VAL(MICOFORIA, DEFAULT_PRE_EXEC_HOOK); +} + +static char *get_root_dir(const struct container *c) +{ + if (c->root_dir) + return xstrdup(c->root_dir); + return msg("%s/%s", OPT_STRING_VAL(MICOFORIA, DEFAULT_ROOT_PREFIX), c->name); +} + +static char *get_ifspec_string(const struct container *c) +{ + unsigned n; + char *str = NULL; + + assert(c->num_ifspecs > 0); + for (n = 0; n < c->num_ifspecs; n++) { + uint8_t *x = c->ifspec[n].hwaddr; + char *tmp = msg("%s%s%s:%02x:%02x:%02x:%02x:%02x:%02x", + str? str : "", + str? " " : "", + c->ifspec[n].bridge, + x[0], x[1], x[2], x[3], x[4], x[5] + ); + free(str); + str = tmp; + } + return str; +} + +static char *interface_name(const struct container *c, unsigned idx, bool peer) +{ + assert(idx < c->num_ifspecs); + if (c->num_ifspecs == 1) + return peer? 
msg("%s-g", c->name) : xstrdup(c->name);
+	if (peer)
+		return msg("%s-%s-g", c->name, c->ifspec[idx].bridge);
+	return msg("%s-%s", c->name, c->ifspec[idx].bridge);
+}
+
+/* Export the root directory of the container as MICOFORIA_ROOT_DIR. */
+static void set_m7a_root_dir_env(const struct container *c)
+{
+	char *root = get_root_dir(c);
+	DEBUG_LOG("root dir: %s\n", root);
+	setenv("MICOFORIA_ROOT_DIR", root, 1);
+	free(root);
+}
+
+/*
+ * Run the pre-start hook through /bin/sh -c. The hook sees the container
+ * name, the root directory and the interface specifications in the
+ * MICOFORIA_* environment variables, which are removed again afterwards.
+ * Returns false if the hook failed.
+ */
+static bool run_pre_start_hook(const struct container *c)
+{
+	char *ifspec;
+	char *cmd = xstrdup(get_pre_start_hook(c));
+	char *argv[] = {"/bin/sh", "-c", cmd, NULL};
+	bool success;
+
+	setenv("MICOFORIA_CONTAINER_NAME", c->name, 1);
+	set_m7a_root_dir_env(c);
+
+	ifspec = get_ifspec_string(c);
+	DEBUG_LOG("ifspecs: %s\n", ifspec);
+	setenv("MICOFORIA_IFSPECS", ifspec, 1);
+	free(ifspec);
+
+	INFO_LOG("running pre-start hook %s\n", cmd);
+	success = xexec(argv, NULL);
+	free(cmd);
+	if (!success)
+		ERROR_LOG("pre-start hook failed\n");
+	unsetenv("MICOFORIA_CONTAINER_NAME");
+	unsetenv("MICOFORIA_IFSPECS");
+	unsetenv("MICOFORIA_ROOT_DIR");
+	return success;
+}
+
+/*
+ * Run the pre-exec hook through /bin/sh -c with MICOFORIA_ROOT_DIR set.
+ * Unlike the pre-start hook, a failing pre-exec hook is fatal.
+ */
+static void run_pre_exec_hook(const struct container *c)
+{
+	char *cmd = xstrdup(get_pre_exec_hook(c));
+	char *argv[] = {"/bin/sh", "-c", cmd, NULL};
+
+	INFO_LOG("/bin/sh -c '%s'\n", cmd);
+	set_m7a_root_dir_env(c);
+	if (!xexec(argv, NULL))
+		die("%s: pre-exec hook failed", c->name);
+	free(cmd);
+	unsetenv("MICOFORIA_ROOT_DIR");
+}
+
+/* Write txt to the existing cgroup control file at path, or die. */
+static void write_cgroup(const char *path, const char *txt)
+{
+	int fd;
+	size_t sz;
+
+	if ((fd = open(path, O_WRONLY)) < 0)
+		die_errno("open %s", path);
+	sz = strlen(txt);
+	if (write(fd, txt, sz) != sz)
+		die_errno("could not write to %s", path);
+	close(fd);
+}
+
+/*
+ * Return the device access control list which applies to the container:
+ * the per-container list if there is one, else the configured default list,
+ * else the builtin list below. The first character of each entry tells
+ * whether the rest is written to devices.allow ('a') or devices.deny.
+ */
+static unsigned get_dacl(const struct container *c, char ***result)
+{
+	static char *dflt[] = {
+		"da", /* deny access to all devices except the ones below */
+		"ac 1:3 rwm", /* null */
+		"ac 1:5 rwm", /* zero */
+		"ac 1:7 rwm", /* full */
+		"ac 1:8 rwm", /* random */
+		"ac 1:9 rwm", /* urandom */
+		"ac 4:* rwm", /* tty?* */
+		"ac 5:0 rwm", /* tty */
+		"ac 5:2 rwm", /* ptmx */
+		"ac 136:* rwm", /* pts */
+	};
+	if (c->num_dac_entries > 0) {
+		*result = c->dacl;
+		return c->num_dac_entries;
+	}
+	if (num_default_dac_entries > 0) {
+		*result = default_dacl;
+		return num_default_dac_entries;
+	}
+	*result = dflt;
+	return ARRAY_SIZE(dflt);
+}
+
+/*
+ * Create the devices cgroup (v1) of the container below
+ * /var/cgroup/micoforia, write all device access control entries to it and
+ * move the calling process into the new cgroup. All errors are fatal.
+ */
+static void apply_dacl(const struct container *c)
+{
+	char **dacl;
+	unsigned n, num_entries;
+	char *m7a_dir, *container_dir, *allow, *deny, *procs, *txt;
+	int fd, allow_fd, deny_fd;
+	size_t sz;
+
+	m7a_dir = msg("/var/cgroup/micoforia");
+	container_dir = msg("%s/%s", m7a_dir, c->name);
+	allow = msg("%s/devices.allow", container_dir);
+	deny = msg("%s/devices.deny", container_dir);
+	procs = msg("%s/cgroup.procs", container_dir);
+
+	if (mkdir(m7a_dir, 0777) < 0 && errno != EEXIST)
+		die_errno("mkdir %s", m7a_dir);
+	free(m7a_dir);
+	if (mkdir(container_dir, 0777) < 0 && errno != EEXIST)
+		die_errno("mkdir %s", container_dir);
+	free(container_dir);
+	if ((allow_fd = open(allow, O_WRONLY)) < 0)
+		die_errno("open %s", allow);
+	free(allow);
+	if ((deny_fd = open(deny, O_WRONLY)) < 0)
+		die_errno("open %s", deny);
+	free(deny);
+
+	num_entries = get_dacl(c, &dacl);
+	INFO_LOG("applying %u entr%s\n", num_entries, num_entries == 1?
+		"y" : "ies");
+	for (n = 0; n < num_entries; n++) {
+		char *entry = dacl[n];
+		DEBUG_LOG("dac entry #%u: %s %s\n", n, dacl[n][0] == 'a'?
+			"allow" : "deny", dacl[n] + 1);
+		/* skip the leading allow/deny marker, terminate with newline */
+		txt = msg("%s\n", entry + 1);
+		sz = strlen(txt);
+		fd = entry[0] == 'a'? allow_fd : deny_fd;
+		if (write(fd, txt, sz) != sz)
+			die_errno("could not write to cgroup devices.%s file",
+				entry[0] == 'a'? "allow" : "deny");
+		free(txt);
+	}
+	close(allow_fd);
+	close(deny_fd);
+	txt = msg("%u\n", (unsigned)getpid());
+	write_cgroup(procs, txt);
+	free(txt);
+	free(procs); /* was leaked */
+}
+
+/*
+ * Check that both cgroup hierarchies are mounted where we expect them,
+ * enable the cpu, memory and io controllers at the top level of the v2
+ * hierarchy and for the micoforia subtree, creating the latter if needed.
+ */
+static void cgroup_init(void)
+{
+	const char controllers[] = "+cpu +memory +io\n";
+	char *m7a_dir, *ctl;
+
+	if (access("/var/cgroup/cgroup.clone_children", F_OK) < 0)
+		die("cgroup v1 not mounted at /var/cgroup/");
+	if (access("/var/cgroup2/cgroup.subtree_control", F_OK) < 0)
+		die("cgroup v2 not mounted at /var/cgroup2/");
+	write_cgroup("/var/cgroup2/cgroup.subtree_control", controllers);
+	m7a_dir = msg("/var/cgroup2/micoforia");
+	if (mkdir(m7a_dir, 0777) < 0 && errno != EEXIST)
+		die_errno("mkdir %s", m7a_dir);
+	ctl = msg("%s/cgroup.subtree_control", m7a_dir);
+	free(m7a_dir);
+	write_cgroup(ctl, controllers);
+	free(ctl);
+}
+
+/* Create the v2 cgroup of the container and move the caller into it. */
+static void create_cgroup_v2(const struct container *c)
+{
+	/* room for all digits of a 32 bit number, the newline and the NUL */
+	char buf[16];
+	char *ctl, *dir = msg("/var/cgroup2/micoforia/%s", c->name);
+
+	if (mkdir(dir, 0777) < 0 && errno != EEXIST)
+		die_errno("mkdir %s", dir);
+	ctl = msg("%s/cgroup.procs", dir);
+	free(dir);
+	snprintf(buf, sizeof(buf), "%u\n", (unsigned)getpid());
+	write_cgroup(ctl, buf);
+	free(ctl);
+}
+
+/* Per-container core limit, ~0U means: use the DEFAULT_CPU_CORES option. */
+static unsigned get_cpu_cores(const struct container *c)
+{
+	return c->cpu_cores != ~0U? c->cpu_cores :
+		OPT_UINT32_VAL(MICOFORIA, DEFAULT_CPU_CORES);
+}
+
+/*
+ * Write the quota and the period (both in microseconds) to cpu.max of the
+ * container's v2 cgroup. A core count of zero means unlimited, in which
+ * case nothing is written.
+ */
+static void apply_cpu_limit(const struct container *c)
+{
+	char *str, *ctl;
+	unsigned cores = get_cpu_cores(c);
+
+	if (cores == 0) /* unlimited */
+		return;
+	assert(cores != ~0U);
+	INFO_LOG("%u core%s\n", cores, cores == 1? "" : "s");
+	ctl = msg("/var/cgroup2/micoforia/%s/cpu.max", c->name);
+	/* compute in 64 bits, the product overflows 32 bits for cores > 4294 */
+	str = msg("%llu 1000000\n", 1000000LLU * cores);
+	write_cgroup(ctl, str);
+	free(ctl);
+	free(str);
+}
+
+/* Per-container limit in GiB, ~0U means: use DEFAULT_MEMORY_LIMIT. */
+static unsigned get_memory_limit(const struct container *c)
+{
+	return c->memory_limit != ~0U?
c->memory_limit :
+		OPT_UINT32_VAL(MICOFORIA, DEFAULT_MEMORY_LIMIT);
+}
+
+/*
+ * Write the memory limit (given in GiB, written in bytes) to memory.high of
+ * the container's v2 cgroup. Zero means unlimited: nothing is written.
+ */
+static void apply_memory_limit(const struct container *c)
+{
+	char *str, *ctl;
+	unsigned gigs = get_memory_limit(c);
+
+	if (gigs == 0) /* unlimited */
+		return;
+	assert(gigs != ~0U);
+	INFO_LOG("%uG\n", gigs);
+	ctl = msg("/var/cgroup2/micoforia/%s/memory.high", c->name);
+	str = msg("%llu\n", 1024LLU * 1024LLU * 1024LLU * gigs);
+	write_cgroup(ctl, str);
+	free(ctl);
+	free(str);
+}
+
+/*
+ * Return the io.max entries which apply to the container: the per-container
+ * entries if any were given, else the configured defaults, else nothing.
+ *
+ * NOTE(review): the per-container branch hands out c->dacl, i.e. the device
+ * access control list, although it is guarded by num_io_max_entries. This
+ * looks like a copy and paste error from get_dacl(); presumably the
+ * per-container io.max array was meant. Confirm against struct container
+ * and fix the member name.
+ */
+static unsigned get_iospecs(const struct container *c, char ***result)
+{
+	if (c->num_io_max_entries > 0) {
+		*result = c->dacl;
+		return c->num_io_max_entries;
+	}
+	if (num_default_io_max_entries > 0) {
+		*result = default_io_max;
+		return num_default_io_max_entries;
+	}
+	*result = NULL;
+	return 0;
+}
+
+/* Write each I/O limit entry to io.max of the container's v2 cgroup. */
+static void apply_io_limit(const struct container *c)
+{
+	unsigned n, num_entries;
+	char *io_max;
+	char **iospec;
+
+	num_entries = get_iospecs(c, &iospec);
+	if (num_entries == 0)
+		return;
+	INFO_LOG("%u entries\n", num_entries);
+	io_max = msg("/var/cgroup2/micoforia/%s/io.max", c->name);
+	for (n = 0; n < num_entries; n++)
+		write_cgroup(io_max, iospec[n]);
+	free(io_max);
+}
+
+/* Remove the v1 and v2 cgroup directories of the container. */
+static void cgroup_cleanup(const struct container *c)
+{
+	char *dir = msg("/var/cgroup/micoforia/%s", c->name);
+	remove_subdirs_recursively(dir);
+	free(dir);
+	dir = msg("/var/cgroup2/micoforia/%s", c->name);
+	remove_subdirs_recursively(dir);
+	free(dir);
+}
+
+/*
+ * For each interface specification, create a veth pair, set the hardware
+ * address of the peer device, attach the other end to the bridge and bring
+ * it up. A stale device of the same name is removed first. Returns false on
+ * the first failure.
+ */
+static bool setup_network(const struct container *c)
+{
+	unsigned n;
+	char *iface, *peer;
+
+	if (!link_up("lo"))
+		WARNING_LOG("could not establish loopback link\n");
+	for (n = 0; n < c->num_ifspecs; n++) {
+		iface = interface_name(c, n, false);
+		peer = interface_name(c, n, true);
+		link_del(iface); /* ignore errors */
+		if (!create_veth_device_pair(iface, peer))
+			goto fail;
+		if (!set_hwaddr(peer, c->ifspec[n].hwaddr))
+			goto fail;
+		if (!attach_to_bridge(iface, c->ifspec[n].bridge))
+			goto fail;
+		if (!link_up(iface))
+			goto fail;
+		free(iface);
+ free(peer); + } + return true; +fail: + free(iface); + free(peer); + return false; +} + +static void setup_termios(int fd) +{ + struct winsize wsz; /* see ioctl_tty(2) */ + struct termios tios; + + if (!isatty(fd)) + return; + if (tcgetattr(fd, &tios)) { + ERROR_LOG("tcgetattr: %m\n"); + return; + } + tios.c_lflag &= ~(ECHO | ISIG | ICANON); + tios.c_cc[VMIN] = 1; + tios.c_cc[VTIME] = 0; + if (tcsetattr(fd, TCSAFLUSH, &tios) < 0) + ERROR_LOG("tcsetattr: %m\n"); + if (ioctl(STDIN_FILENO, TIOCGWINSZ, &wsz) >= 0) + ioctl(fd, TIOCSWINSZ, &wsz); +} + +struct device_node_info { + unsigned major, minor; + mode_t mode; + const char *name; +}; + +static void create_standard_device_nodes(struct container_runtime *cr) +{ + const struct device_node_info devices[] = { + {.major = 1, .minor = 3, .mode = 0666, .name = "null"}, + {.major = 1, .minor = 5, .mode = 0666, .name = "zero"}, + {.major = 1, .minor = 7, .mode = 0666, .name = "full"}, + {.major = 1, .minor = 8, .mode = 0666, .name = "random"}, + {.major = 1, .minor = 9, .mode = 0666, .name = "urandom"}, + {.major = 4, .minor = 0, .mode = 0620, .name = "tty0"}, + {.major = 5, .minor = 1, .mode = 0600, .name = "console"}, + {.major = 5, .minor = 2, .mode = 0666, .name = "ptmx"}, + }; + unsigned n; + + for (n = 0; n < ARRAY_SIZE(devices); n++) { + const struct device_node_info *d = devices + n; + char *path = msg("%s/%s", cr->dev, d->name); + if (mknod(path, S_IFCHR, makedev(d->major, d->minor)) < 0) + die_errno("mknod %s", d->name); + chmod(path, d->mode); + free(path); + } +} + +static void init_console(struct container_runtime *cr) +{ + char *console; + unsigned n; + + if (mount(NULL, cr->dev, "tmpfs", 0, "size=500000,mode=755") < 0) + die("mount tmpfs at %s: %m", cr->dev); + create_standard_device_nodes(cr); + for (n = 0; n < cr->num_ttys; n++) { + char *tty = msg("%s/tty%u", cr->dev, cr->tty[n]); + unlink(tty); + if (mknod(tty, S_IFCHR, makedev(4, cr->tty[n])) < 0) + die("mknod %s: %m", tty); + chmod(tty, 0660); + 
setup_termios(cr->slave[n]); + INFO_LOG("bind mounting %s -> %s\n", ttyname(cr->slave[n]), tty); + if (mount(ttyname(cr->slave[n]), tty, "none", + MS_BIND | MS_PRIVATE, NULL) < 0) + die("failed to bind mount %s: %m\n", tty); + free(tty); + } + console = msg("%s/console", cr->dev); + if (mount(ttyname(cr->slave[0]), console, "none", + MS_BIND | MS_PRIVATE, NULL) < 0) + die("failed to bind mount %s: %m\n", console); + free(console); +} + +/* + * These umounts fail if the container shutdown already umounted the bind + * mounted devices. This is not fatal, so log only with low severity. + */ +static void shutdown_console(struct container_runtime *cr) +{ + unsigned n; + char *console; + + for (n = 0; n < cr->num_ttys; n++) { + char *tty = msg("%s/tty1", cr->dev); + if (umount2(tty, MNT_DETACH) < 0) + DEBUG_LOG("umount %s: %m\n", tty); + free(tty); + } + console = msg("%s/console", cr->dev); + if (umount2(console, MNT_DETACH) < 0) + DEBUG_LOG("umount %s: %m\n", console); + free(console); +} + +static char *get_socket_path(const char *container_name) +{ + return msg("micoforia/%s", container_name); +} + +/* Ignore everything the client sends us, but invalidate the fd on EOF. 
*/
+static void dispatch_client(int *client)
+{
+	char buf[1024];
+	if (read(*client, buf, sizeof(buf)) <= 0) { /* EOF or read error */
+		NOTICE_LOG("detaching client on fd %d\n", *client);
+		close(*client);
+		*client = -1;
+	}
+}
+/*
+ * Receive one request on the unix socket and answer it on the connection fd
+ * obtained from recv_cred_buffer(). Requests from other uids are rejected.
+ *
+ * Known requests are "init_pid", "attach <minor>" and "force-attach <minor>".
+ * Error replies start with the byte 1, followed by an errno name. The reply
+ * to init_pid is a zero byte followed by the pid. On a successful attach the
+ * master fd of the pty is passed to the client and the connection fd is kept
+ * in cr->client[] rather than closed. force-attach first closes the
+ * connection of a client which is already attached to the same tty.
+ */
+static void dispatch_socket_request(struct container_runtime *cr)
+{
+	uid_t uid;
+	char buf[32];
+	int cfd;
+	uint32_t minor;
+	unsigned n;
+	bool force;
+
+	memset(buf, 0, sizeof(buf)); /* guarantees NUL termination below */
+	if (!recv_cred_buffer(cr->socket_fd, buf, sizeof(buf) - 1, &cfd, &uid))
+		return;
+	if (uid != getuid()) {
+		const char msg[] = "\1EACCES";
+		send(cfd, msg, sizeof(msg), MSG_DONTWAIT);
+		NOTICE_LOG("access denied for uid %d\n", (int)uid);
+		goto out;
+	}
+	if (strcmp(buf, "init_pid") == 0) {
+		buf[0] = '\0'; /* status byte: success */
+		memcpy(buf + 1, &cr->init_pid, sizeof(int));
+		send(cfd, buf, 1 + sizeof(int), MSG_DONTWAIT);
+		goto out;
+	}
+	if (sscanf(buf, "attach %u", &minor) == 1) {
+		force = false;
+	} else if (sscanf(buf, "force-attach %u", &minor) == 1) {
+		force = true;
+	} else {
+		const char msg[] = "\1EINVAL";
+		send(cfd, msg, sizeof(msg), MSG_DONTWAIT);
+		NOTICE_LOG("invalid request: %s\n", buf);
+		goto out;
+	}
+	for (n = 0; n < cr->num_ttys; n++) { /* look up the pty for this minor */
+		INFO_LOG("n: %u, tty[n]: %u\n", n, cr->tty[n]);
+		if (cr->tty[n] == minor)
+			break;
+	}
+	if (n == cr->num_ttys) {
+		const char msg[] = "\1ENOTTY";
+		send(cfd, msg, sizeof(msg), MSG_DONTWAIT);
+		NOTICE_LOG("tty%u is not being forwarded\n", minor);
+		goto out;
+	}
+	if (cr->client[n] >= 0) { /* another client is attached already */
+		if (force) {
+			close(cr->client[n]);
+			cr->client[n] = -1;
+		} else {
+			const char msg[] = "\1EBUSY";
+			send(cfd, msg, sizeof(msg), MSG_DONTWAIT);
+			ERROR_LOG("tty%u is already in use\n", minor);
+			goto out;
+		}
+	}
+	if (!pass_fd(cr->master[n], cfd)) {
+		ERROR_LOG("could not pass master fd\n");
+		goto out;
+	}
+	NOTICE_LOG("attached client on fd %d to tty%u\n", cfd, minor);
+	cr->client[n] = cfd; /* stays open until the client detaches */
+	return;
+out:
+	close(cfd);
+}
+
+/*
+ * Read up to one buffer from src and write it to dst. Returns false on EOF,
+ * read errors and failed or short writes.
+ *
+ * discards read data if dst < 0
+ */
+static bool copy(int src, int dst)
+{
+	ssize_t sz1, sz2;
+	char buf[1024];
+again: + sz1 = read(src, buf, sizeof(buf)); + if (sz1 < 0) { + if (errno == EINTR) + goto again; + DEBUG_LOG("read from fd %d: %m\n", src); + } + if (sz1 <= 0) + return false; + if (dst < 0) + return true; + sz2 = write(dst, buf, sz1); + if (sz2 < 0) { + DEBUG_LOG("write to fd %d: %m\n", dst); + return false; + } + if (sz1 != sz2) { + DEBUG_LOG("short write to fd %d\n", dst); + return false; + } + return true; +} + +/* + * The function returns only when the process receives SIGCHLD. In this case + * the return value is 0 for success, 1 for failure, and 2 if the child's exit + * code indicates a reboot request. Other signals are pushed down to the child + * process. + */ +static int parent_loop(pid_t pid, const struct container *c, + struct container_runtime *cr) +{ + unsigned n; + + init_signal_handling(); + for (;;) { + int sig, max_fileno = 0; + fd_set fds; + + FD_ZERO(&fds); + if (OPT_GIVEN(START, FOREGROUND)) { + FD_SET(STDIN_FILENO, &fds); + if (STDIN_FILENO > max_fileno) + max_fileno = STDIN_FILENO; + } + FD_SET(signal_pipe[0], &fds); + if (signal_pipe[0] > max_fileno) + max_fileno = signal_pipe[0]; + FD_SET(cr->socket_fd, &fds); + if (cr->socket_fd > max_fileno) + max_fileno = cr->socket_fd; + for (n = 0; n < cr->num_ttys; n++) { + if (cr->client[n] >= 0) { /* detached */ + FD_SET(cr->client[n], &fds); + if (cr->client[n] > max_fileno) + max_fileno = cr->client[n]; + } else { + FD_SET(cr->master[n], &fds); + if (cr->master[n] > max_fileno) + max_fileno = cr->master[n]; + } + } + if (select(max_fileno + 1, &fds, NULL, NULL, NULL) < 0) { + if (errno != EINTR) + ERROR_LOG("select: %m\n"); + continue; + } + do { + if (!FD_ISSET(signal_pipe[0], &fds)) + break; + sig = next_signal(); + if (sig == SIGCHLD) { + int wstatus; + if (waitpid(pid, &wstatus, WNOHANG) < 0) { + WARNING_LOG("wait: %m\n"); + break; + } + cgroup_cleanup(c); + if (!WIFEXITED(wstatus)) + return 1; + if (WEXITSTATUS(wstatus) == 2) + return 2; + return WEXITSTATUS(wstatus) != EXIT_SUCCESS; + } + 
kill(pid, sig); + } while (0); + if (FD_ISSET(cr->socket_fd, &fds)) + dispatch_socket_request(cr); + for (n = 0; n < cr->num_ttys; n++) { + if (cr->client[n] >= 0) { + if FD_ISSET(cr->client[n], &fds) + dispatch_client(cr->client + n); + } else { /* stdout is /dev/null in background mode */ + if (FD_ISSET(cr->master[n], &fds)) + copy(cr->master[n], n == 0? + STDOUT_FILENO : -1); + } + } + if (OPT_GIVEN(START, FOREGROUND)) { + if (FD_ISSET(STDIN_FILENO, &fds)) + copy(STDIN_FILENO, cr->master[0]); + } + } +} + +/* Set net namespace of child and call parent_loop(). */ +static int run_parent(pid_t child_pid, const struct container *c, + struct container_runtime *cr) +{ + unsigned n; + bool success; + + close(cr->pipe1[1]); + close(cr->pipe2[0]); + if (read(cr->pipe1[0], &cr->init_pid, 4) != 4) { + ERROR_LOG("pipe1 read error\n"); + close(cr->pipe1[0]); + close(cr->pipe2[1]); + return false; + } + INFO_LOG("received grand child pid: %u\n", (unsigned)cr->init_pid); + close(cr->pipe1[0]); + for (n = 0; n < c->num_ifspecs; n++) { + char *peer = interface_name(c, n, true); + success = set_netns(peer, child_pid); + free(peer); + if (!success) { + ERROR_LOG("set_netns error\n"); + close(cr->pipe2[1]); + return false; + } + } + success = write(cr->pipe2[1], "\0", 1) == 1; + close(cr->pipe2[1]); + if (!success) { + ERROR_LOG("pipe2 write error\n"); + return false; + } + return parent_loop(child_pid, c, cr); +} + +static unsigned get_capdrops(const struct container *c, cap_value_t **result) +{ + static cap_value_t builtin_capdrop[] = {CAP_SYS_MODULE, CAP_SYS_TIME, + CAP_SYS_RESOURCE}; + + if (c->capdrop) { + *result = c->capdrop; + return c->num_capdrops; + } + if (OPT_GIVEN(MICOFORIA, DEFAULT_CAPDROP)) { + *result = default_capdrop; + return num_default_capdrops; + } + *result = builtin_capdrop; + return ARRAY_SIZE(builtin_capdrop); +} + +static void drop_caps(const struct container *c) +{ + cap_value_t *capdrop; + unsigned n, num_capdrops; + + INFO_LOG("lowering bounding set 
capabilities\n"); + num_capdrops = get_capdrops(c, &capdrop); + for (n = 0; n < num_capdrops; n++) { + char *name = cap_to_name(capdrop[n]); + DEBUG_LOG("dropping %s\n", name); + cap_free(name); + if (cap_drop_bound(capdrop[n]) < 0) + die_errno("cap_drop_bound"); + } +} + +__attribute ((noreturn)) +static void child_loop(pid_t pid, struct container_runtime *cr) +{ + int wstatus; + + INFO_LOG("parent: %u, child: %u, init: %u\n", (unsigned) getppid(), + (unsigned)getpid(), (unsigned)pid); + init_signal_handling(); + setsid(); + + for (;;) { + int max_fileno = 0; + fd_set fds; + + FD_ZERO(&fds); + FD_SET(signal_pipe[0], &fds); + if (signal_pipe[0] > max_fileno) + max_fileno = signal_pipe[0]; + if (select(max_fileno + 1, &fds, NULL, NULL, NULL) < 0) { + if (errno != EINTR) + ERROR_LOG("select: %m\n"); + continue; + } + do { if (FD_ISSET(signal_pipe[0], &fds)) { + int sig = next_signal(); + if (sig == SIGCHLD) { + if (waitpid(pid, &wstatus, WNOHANG) < 0) { + WARNING_LOG("wait: %m\n"); + break; + } + shutdown_console(cr); + if (WIFSIGNALED(wstatus) && + WTERMSIG(wstatus) == 1) { + NOTICE_LOG("reboot requested\n"); + exit(2); + } + NOTICE_LOG("container terminated\n"); + exit(EXIT_SUCCESS); + } + NOTICE_LOG("sending signal %d to container init\n", + sig); + kill(pid, sig == SIGINT? SIGINT : SIGKILL); + }} while(0); + } +} + +static const char *get_init_path(const struct container *c) +{ + return c->init? c->init : OPT_STRING_VAL(MICOFORIA, DEFAULT_INIT); +} + +/* + * The child process unshares namespaces, spawns the init process which runs + * the pre-exec hook and executes the container init process. This function + * never returns, but both the child and the init process exit when the + * container terminates. The exit code of the child tells the parent whether + * it should restart the container. 
+ */
+__attribute ((noreturn))
+static void run_child(const struct container *c, struct container_runtime *cr)
+{
+	unsigned n;
+	char *init, *put_old;
+	char ch;
+	pid_t pid;
+
+	/* close the fds which are only used by the parent */
+	close(cr->socket_fd);
+	for (n = 0; n < cr->num_ttys; n++)
+		close(cr->master[n]);
+	close(cr->pipe1[0]);
+	close(cr->pipe2[1]);
+	if (unshare(CLONE_NEWNET) < 0)
+		die_errno("unshare net ns");
+	if (unshare(CLONE_NEWPID) < 0)
+		die_errno("unshare pid ns");
+	/* fork again to become pid 1 in the new pid namespace */
+	if ((pid = fork()) < 0)
+		die_errno("fork");
+	/*
+	 * By writing to pipe1 we tell the parent (a) we've unshared the net
+	 * namespace, and (b) the pid of the init process in the parent
+	 * namespace.
+	 */
+	if (pid > 0) {
+		close(cr->pipe2[0]);
+		if (write(cr->pipe1[1], (const char *)&pid, 4) != 4)
+			die_errno("pipe write error");
+		close(cr->pipe1[1]);
+		child_loop(pid, cr); /* never returns */
+	}
+	pid = getpid();
+	DEBUG_LOG("now running as pid %d\n", pid);
+	/* block until the parent has moved the interfaces into our netns */
+	if (read(cr->pipe2[0], &ch, 1) != 1)
+		die_errno("pipe read error");
+	close(cr->pipe1[1]);
+	close(cr->pipe2[0]);
+	if (unshare(CLONE_NEWNS | CLONE_NEWIPC | CLONE_NEWUTS) < 0)
+		die_errno("unshare");
+	mkdir(cr->dev, 0777);
+	init_console(cr);
+	for (n = 0; n < cr->num_ttys; n++)
+		close(cr->slave[n]);
+	INFO_LOG("setting hostname to %s\n", c->name);
+	if (sethostname(c->name, strlen(c->name)) < 0)
+		die_errno("sethostname error");
+	if (chdir(cr->root) < 0)
+		die_errno("chdir %s", cr->root);
+	drop_caps(c);
+	apply_dacl(c);
+	apply_cpu_limit(c);
+	apply_memory_limit(c);
+	apply_io_limit(c);
+	for (n = 0; n < c->num_ifspecs; n++) {
+		char *peer = interface_name(c, n, true);
+		char *renamed = msg("eth%u", n);
+		if (!rename_interface(peer, renamed))
+			die("can not rename %s to %s\n", peer, renamed);
+		free(peer);
+		free(renamed);
+	}
+	run_pre_exec_hook(c);
+	setup_termios(STDIN_FILENO);
+	put_old = msg("%s/mnt", cr->root);
+	/* glibc does not provide a wrapper for pivot_root */
+	if (syscall(SYS_pivot_root, ".", put_old) < 0)
+		die_errno("pivot_root (put_old: %s)", put_old);
+	if (umount2("/mnt", MNT_DETACH) < 0)
+		die_errno("umount %s", put_old);
+	free(put_old);
+	close(STDIN_FILENO);
+	init = xstrdup(get_init_path(c));
+	INFO_LOG("handing over control to container init: %s\n", init);
+	execve(init, (char *[]){init, NULL}, NULL);
+	/* report the path we tried, c->init is NULL if the default was used */
+	die_errno("failed to exec init process %s", init);
+}
+
+/*
+ * We need three processes, called parent, child, init, because we want one
+ * process run with namespaces unmodified, requiring one fork. After the child
+ * has unshared its PID namespace, it keeps its old PID, so we need to fork
+ * again to get pid 1. The child can not terminate because the parent can not
+ * wait(2) on its grandchild.
+ *
+ * Returns true if the container terminated successfully, false otherwise.
+ * The container is restarted as long as a reboot is requested.
+ */
+static bool exec_container(const struct container *c)
+{
+	bool success;
+	pid_t pid;
+	unsigned n;
+	struct container_runtime cr = {0};
+	char *socket_path;
+	int ret;
+
+	create_cgroup_v2(c);
+	socket_path = get_socket_path(c->name);
+	success = listen_on_unix_socket(socket_path, &cr.socket_fd);
+	if (!success)
+		ERROR_LOG("can not listen on unix socket %s\n", socket_path);
+	free(socket_path);
+	if (!success)
+		return false;
+	cr.root = get_root_dir(c);
+	cr.dev = msg("%s/dev", cr.root);
+	cr.pts = realpath("/proc/self/fd/0", NULL);
+	DEBUG_LOG("pts: %s\n", cr.pts);
+	cr.num_ttys = get_container_ttys(c, &cr.tty);
+	cr.master = xmalloc(cr.num_ttys * sizeof(int));
+	cr.slave = xmalloc(cr.num_ttys * sizeof(int));
+	cr.client = xmalloc(cr.num_ttys * sizeof(int));
+	for (n = 0; n < cr.num_ttys; n++)
+		cr.client[n] = -1; /* nobody is attached initially */
+reboot:
+	NOTICE_LOG("starting %s\n", c->name);
+	for (n = 0; n < cr.num_ttys; n++) {
+		if (openpty(cr.master + n, cr.slave + n, NULL, NULL, NULL) < 0)
+			die("openpty: %m");
+		DEBUG_LOG("pty (tty%u <-> %s)\n", n, ttyname(cr.slave[n]));
+	}
+	/* mount rw, ignore errors */
+	mount(NULL, cr.root, NULL, MS_REMOUNT, NULL);
+	if (!setup_network(c))
+		return false;
+	if (!run_pre_start_hook(c))
+		return false;
+	if (pipe(cr.pipe1) <
0) /* child -> parent */
+		die_errno("pipe1");
+	if (pipe(cr.pipe2) < 0)
+		die_errno("pipe2"); /* parent -> child */
+	if ((pid = fork()) < 0)
+		die_errno("fork");
+	if (pid == 0)
+		run_child(c, &cr); /* never returns */
+	ret = run_parent(pid, c, &cr);
+	if (ret != 2) /* anything but a reboot request ends the container */
+		return ret == 0;
+	NOTICE_LOG("rebooting\n");
+	for (n = 0; n < cr.num_ttys; n++) { /* fresh ptys are opened after the jump */
+		close(cr.master[n]);
+		close(cr.slave[n]);
+	}
+	goto reboot;
+}
+/* Path of the log file of the named container, to be freed by the caller. */
+static char *get_container_logfile(const char *name)
+{
+	return msg("%s/%s", OPT_STRING_VAL(MICOFORIA, LOGDIR), name);
+}
+/*
+ * Start a single container, either in the foreground or in the background.
+ *
+ * In foreground mode both stdin and stdout must be terminals. The terminal
+ * settings of stdin are saved and restored after the container terminated.
+ * In background mode we fork: the parent returns true right away while the
+ * child daemonizes with the container logfile as its argument.
+ *
+ * The process which runs the container never returns from this function
+ * but calls exit() with the status of the container. The function returns
+ * false if the container is locked already or the foreground checks fail.
+ */
+static bool start_container(const struct container *c)
+{
+	pid_t pid;
+	char *logfile;
+	struct termios tios;
+	bool success;
+
+	if (is_locked(c->name, &pid)) {
+		ERROR_LOG("%s is locked by pid %u\n", c->name, (unsigned)pid);
+		return false;
+	}
+	if (OPT_GIVEN(START, FOREGROUND)) {
+		if (!isatty(STDIN_FILENO) || !isatty(STDOUT_FILENO)) {
+			ERROR_LOG("both stdin and stdout must be terminals\n");
+			return false;
+		}
+		if (tcgetattr(STDIN_FILENO, &tios) < 0) {
+			ERROR_LOG("tcgetattr: %m\n");
+			return false;
+		}
+	} else {
+		if ((pid = fork()) < 0)
+			die_errno("fork");
+		if (pid > 0)
+			return true;
+		logfile = get_container_logfile(c->name);
+		daemonize(logfile);
+		free(logfile);
+	}
+	if (!try_lock(c->name, &pid)) /* somebody else won the race */
+		die("%s is locked by pid %u", c->name, (unsigned)pid);
+	success = exec_container(c);
+	if (OPT_GIVEN(START, FOREGROUND)) { /* tios is only set in this mode */
+		if (tcsetattr(STDIN_FILENO, TCSAFLUSH, &tios) < 0)
+			ERROR_LOG("tcsetattr: %m\n");
+	}
+	exit(success?
EXIT_SUCCESS : EXIT_FAILURE); +} + +static void check_container_args(void) +{ + unsigned n, num_inputs; + struct container *c; + + num_inputs = lls_num_inputs(sublpr); + if (num_inputs == 0) { + if (num_containers == 0) + die("no container configured\n"); + if (OPT_GIVEN(START, FOREGROUND) && num_containers > 1) + die("must specify container for foreground mode"); + } else { + if (OPT_GIVEN(START, FOREGROUND) && num_inputs > 1) + die("can start only one container in foreground mode"); + for (n = 0; n < num_inputs; n++) { + const char *name = lls_input(n, sublpr); + c = get_container(name); + if (!c) + die("container not configured: %s", name); + } + } +} + +struct container_arg_iter { + unsigned idx; +}; + +#define INITIALIZED_CAI(_cai) {.idx = 0} + +static struct container *cai_next(struct container_arg_iter *cai, bool *skipped) +{ + unsigned num_inputs = lls_num_inputs(sublpr); + + if (skipped) + *skipped = false; + if (num_inputs == 0) { + if (cai->idx >= num_containers) + return NULL; + return container[cai->idx++]; + } + for (; cai->idx < num_inputs; cai->idx++) { + const char *name = lls_input(cai->idx, sublpr); + struct container *c = get_container(name); + if (!c) { + ERROR_LOG("%s: not configured\n", name); + if (skipped) + *skipped = true; + continue; + } + cai->idx++; + return c; + } + return NULL; +} + +static bool for_each_container_arg(bool (*f)(const struct container *c)) +{ + struct container *c; + bool success = true; + bool skipped; + struct container_arg_iter cai = INITIALIZED_CAI(cai); + + while ((c = cai_next(&cai, &skipped))) + if (!f(c) || skipped) + success = false; + return success; +} + +static bool com_start(void) +{ + const char *logdir = OPT_STRING_VAL(MICOFORIA, LOGDIR); + + check_container_args(); + if (logdir[0] == '\0') + die_empty_arg("loggir"); + cgroup_init(); + if (mkdir(logdir, 0777) < 0 && errno != EEXIST) + die_errno("mkdir %s", logdir); + return for_each_container_arg(start_container); +} +EXPORT_CMD_HANDLER(start); + 
+static bool send_signal_to_container(int signum, const struct container *c)
+{ /* signals the pid reported by is_locked(); false if not locked or kill(2) fails */
+	pid_t pid;
+	bool success;
+
+	if (!is_locked(c->name, &pid)) {
+		INFO_LOG("%s is not running\n", c->name);
+		return false;
+	}
+	DEBUG_LOG("sending signal %d to pid %u\n", signum, (unsigned)pid);
+	success = kill(pid, signum) >= 0;
+	if (!success)
+		ERROR_LOG("kill %s: %m\n", c->name);
+	return success;
+}
+/*
+ * Replace the environment by a minimal one suitable for running commands
+ * as root inside the container. Only TERM is taken over from the current
+ * environment.
+ */
+static void clean_env(void)
+{
+	char *term = getenv("TERM"); /* must be looked up before clearenv() */
+
+	clearenv();
+	if (term)
+		setenv("TERM", term, 0);
+	setenv("PATH", "/root/bin:/usr/local/sbin:/usr/local/bin"
+		":/sbin:/usr/sbin:/bin:/usr/bin", 0);
+	setenv("USER", "root", 0);
+	setenv("LOGNAME", "root", 0);
+	setenv("HOME", "/root", 0);
+}
+/*
+ * Send an "init_pid" request to the unix socket of the named container.
+ * *result is set to -1 before the request is made. Returns false and logs
+ * an error if the request failed.
+ */
+static bool request_init_pid(const char *name, int *result)
+{
+	char *socket_path = get_socket_path(name);
+	bool success;
+
+	*result = -1;
+	success = request_int(socket_path, "init_pid", result);
+	free(socket_path);
+	if (!success)
+		ERROR_LOG("could not determine init pid of %s\n", name);
+	return success;
+}
+/*
+ * Fork a process which enters the namespaces of the container init process
+ * by means of nsenter(1) and runs "halt" there. The caller does not wait
+ * for this process, so true only means that the shutdown was initiated.
+ *
+ * A container which is not running counts as an error only if containers
+ * were given as arguments.
+ */
+static bool shutdown_container(const struct container *c)
+{
+	pid_t pid;
+	char str[20];
+	char *argv[] = {"nsenter", "-w", "-a", "-r", "-t", str, "halt", NULL};
+
+	if (!is_locked(c->name, NULL)) {
+		if (lls_num_inputs(sublpr) == 0)
+			return true;
+		ERROR_LOG("container not running: %s\n", c->name);
+		return false;
+	}
+	pid = fork();
+	if (pid < 0)
+		return false;
+	if (pid > 0)
+		return true;
+	if (!request_init_pid(c->name, &pid)) /* from here on: forked process */
+		_exit(EXIT_FAILURE);
+	sprintf(str, "%d", pid); /* fills in the argument to -t */
+	clean_env();
+	execvp(argv[0], argv);
+	_exit(EXIT_FAILURE);
+}
+/* True if nobody holds the lock of the container. */
+static bool container_is_dead(const struct container *c)
+{
+	return !is_locked(c->name, NULL);
+}
+/*
+ * Poll until all selected containers have terminated. The sleep time starts
+ * at 32 milliseconds and doubles after each unsuccessful check. We give up
+ * and return false once it would reach 20 seconds, or if nanosleep(2)
+ * fails.
+ */
+static bool wait_for_containers_to_die(void)
+{
+	bool success;
+	unsigned ms = 32;
+	struct timespec ts;
+
+	while (ms < 20000) {
+		ts.tv_sec = ms / 1000;
+		ts.tv_nsec = (ms % 1000) * 1000 * 1000;
+		if (nanosleep(&ts, NULL) < 0)
+			return false;
+		success = for_each_container_arg(container_is_dead);
+		if
(success)
+			return true;
+		ms *= 2;
+	}
+	return false;
+}
+
+/* The stop subcommand: shut down containers, optionally wait for them. */
+static bool com_stop(void)
+{
+	bool success = for_each_container_arg(shutdown_container);
+
+	if (!success)
+		return false;
+	if (!OPT_GIVEN(STOP, WAIT))
+		return true;
+	return wait_for_containers_to_die();
+}
+EXPORT_CMD_HANDLER(stop);
+
+/* SIGINT is passed on unmodified to the init process of the container. */
+static bool reboot_container(const struct container *c)
+{
+	return send_signal_to_container(SIGINT, c);
+}
+
+/* The reboot subcommand. */
+static bool com_reboot(void)
+{
+	return for_each_container_arg(reboot_container);
+}
+EXPORT_CMD_HANDLER(reboot);
+
+/* Any signal other than SIGINT makes the child send SIGKILL to init. */
+static bool kill_container(const struct container *c)
+{
+	return send_signal_to_container(SIGUSR1, c);
+}
+
+/* The kill subcommand: kill containers, optionally wait for them. */
+static bool com_kill(void)
+{
+	bool success = for_each_container_arg(kill_container);
+
+	if (!success)
+		return false;
+	if (!OPT_GIVEN(KILL, WAIT))
+		return true;
+	return wait_for_containers_to_die();
+}
+EXPORT_CMD_HANDLER(kill);
+
+/* Print the effective configuration of the container to stdout. */
+static void list_container_verbose(const struct container *c)
+{
+	char *root;
+	unsigned n, N;
+	char **word_list;
+	cap_value_t *capdrop;
+	uint32_t *tty;
+	char cores_str[25] = "unlimited";
+	unsigned cores = get_cpu_cores(c);
+
+	printf("%s:\n", c->name);
+	printf("\tpre-start hook: %s\n", get_pre_start_hook(c));
+	printf("\tpre-exec hook: %s\n", get_pre_exec_hook(c));
+	root = get_root_dir(c);
+	printf("\troot dir: %s\n", root);
+	free(root);
+	printf("\tinit path: %s\n", get_init_path(c));
+	for (n = 0; n < c->num_ifspecs; n++) {
+		char pretty_hwaddr[18];
+		char *iface = interface_name(c, n, false);
+		pretty_print_hwaddr(c->ifspec[n].hwaddr, pretty_hwaddr);
+		printf("\tinterface #%u: %s (%s)\n", n, iface, pretty_hwaddr);
+		free(iface);
+	}
+	N = get_dacl(c, &word_list);
+	for (n = 0; n < N; n++)
+		printf("\tdac entry #%u: %s %s\n", n, word_list[n][0] == 'a'?
+			"allow" : "deny", word_list[n] + 1);
+	N = get_iospecs(c, &word_list);
+	for (n = 0; n < N; n++)
+		printf("\tiospec #%u: %s\n", n, word_list[n]);
+	if (cores > 0)
+		sprintf(cores_str, "%u", cores);
+	printf("\tCPU core limit: %s\n", cores_str);
+	printf("\tmemory limit: %uG\n", get_memory_limit(c));
+	N = get_capdrops(c, &capdrop);
+	for (n = 0; n < N; n++) {
+		/* the name is allocated by libcap and must be released */
+		char *name = cap_to_name(capdrop[n]);
+		printf("\tcapdrop #%u: %s\n", n, name);
+		cap_free(name);
+	}
+	N = get_container_ttys(c, &tty);
+	for (n = 0; n < N; n++)
+		printf("\ttty #%u: %u\n", n, tty[n]);
+}
+
+/*
+ * The ls subcommand. Containers which are not running are listed only if
+ * ALL was given, and make the command fail otherwise.
+ */
+static bool com_ls(void)
+{
+	struct container *c;
+	bool skipped, success = true;
+	struct container_arg_iter cai = INITIALIZED_CAI(cai);
+
+	while ((c = cai_next(&cai, &skipped))) {
+		pid_t pid;
+		if (skipped)
+			success = false;
+		if (!is_locked(c->name, &pid)) {
+			if (!OPT_GIVEN(LS, ALL)) {
+				success = false;
+				continue;
+			}
+			pid = 0; /* printed as "-" in long mode */
+		}
+		if (OPT_GIVEN(LS, VERBOSE)) {
+			list_container_verbose(c);
+			continue;
+		}
+		if (OPT_GIVEN(LS, LONG)) {
+			if (pid > 0)
+				printf("%u\t", (unsigned)pid);
+			else
+				printf("-\t");
+			printf("%u\t", get_cpu_cores(c));
+			printf("%uG\t", get_memory_limit(c));
+			printf("%s\n", c->name);
+			continue;
+		}
+		if (!OPT_GIVEN(LS, QUIET))
+			printf("%s\n", c->name);
+	}
+	if (skipped) /* needed if the last given container arg is invalid */
+		success = false;
+	return success;
+}
+EXPORT_CMD_HANDLER(ls);
+
+/*
+ * Run pstree(1) on the container. With ALL, the tree starts at the process
+ * which holds the lock, otherwise at the init process of the container.
+ */
+static bool list_container_processes(const struct container *c)
+{
+	int pid;
+	char str[20];
+	char *argv[] = {"pstree", "-anp", str, NULL};
+	bool success;
+
+	success = is_locked(c->name, &pid);
+	if (!success) {
+		if (lls_num_inputs(sublpr) == 0)
+			return true;
+		ERROR_LOG("container \"%s\" is not running\n", c->name);
+		return false;
+	}
+	if (!OPT_GIVEN(PS, ALL) && !request_init_pid(c->name, &pid))
+		return false;
+	sprintf(str, "%d", pid);
+	success = xexec(argv, NULL);
+	return success;
+}
+
+/* The ps subcommand. */
+static bool com_ps(void)
+{
+	return for_each_container_arg(list_container_processes);
+}
+EXPORT_CMD_HANDLER(ps);
+
+/*
+ * The attach subcommand: connect the terminal to a tty of a running
+ * container. The master fd of the pty is requested from the process which
+ * runs the container. The session ends when either side closes its fd or
+ * when the user types CTRL+a followed by q.
+ *
+ * NOTE(review): the function returns false even after a regular detach,
+ * which presumably makes the command exit with a failure status. Confirm
+ * whether this is intended.
+ */
+static bool com_attach(void)
+{
+	char *errctx;
+	const char *arg;
+	pid_t pid;
+	char *socket_path;
+	int master, ret, socket_fd;
+	bool have_escape = false;
+	struct termios tios;
+	uint32_t minor = OPT_UINT32_VAL(ATTACH, TTY);
+	char *rq;
+
+	if (!isatty(STDIN_FILENO) || !isatty(STDOUT_FILENO)) {
+		ERROR_LOG("both stdin and stdout must be terminals\n");
+		return false;
+	}
+	if (tcgetattr(STDIN_FILENO, &tios) < 0)
+		die_errno("tcgetattr");
+	ret = lls_check_arg_count(sublpr, 1, 1, &errctx);
+	if (ret < 0)
+		die_lopsub(ret, &errctx);
+	arg = lls_input(0, sublpr);
+	if (!is_locked(arg, &pid)) {
+		ERROR_LOG("container not running: %s\n", arg);
+		return false;
+	}
+	socket_path = get_socket_path(arg);
+	if (OPT_GIVEN(ATTACH, FORCE))
+		rq = msg("force-attach %u", minor);
+	else
+		rq = msg("attach %u", minor);
+	socket_fd = request_fd(socket_path, rq, &master);
+	free(rq);
+	free(socket_path);
+	/* a negative fd must never be passed to FD_SET() */
+	if (socket_fd < 0) {
+		ERROR_LOG("could not attach to tty%u of %s\n", minor, arg);
+		return false;
+	}
+	INFO_LOG("Attached to /dev/tty%u of container %s\n", minor, arg);
+	NOTICE_LOG("Type CTRL+a q to quit\n");
+	setup_termios(STDIN_FILENO);
+	setup_termios(master);
+	for (;;) {
+		int max_fileno = 0;
+		fd_set fds;
+		FD_ZERO(&fds);
+		FD_SET(STDIN_FILENO, &fds);
+		if (STDIN_FILENO > max_fileno)
+			max_fileno = STDIN_FILENO;
+		FD_SET(master, &fds);
+		if (master > max_fileno)
+			max_fileno = master;
+		FD_SET(socket_fd, &fds);
+		if (socket_fd > max_fileno)
+			max_fileno = socket_fd;
+		if (select(max_fileno + 1, &fds, NULL, NULL, NULL) < 0) {
+			if (errno != EINTR)
+				ERROR_LOG("select: %m\n");
+			continue;
+		}
+		/* activity on the socket means we have been detached */
+		if (FD_ISSET(socket_fd, &fds))
+			break;
+		if (FD_ISSET(STDIN_FILENO, &fds)) {
+			char c;
+			if (read(STDIN_FILENO, &c, 1) <= 0)
+				break;
+			if (c == 1 && !have_escape) { /* CTRL+a */
+				have_escape = true;
+			} else {
+				if (c == 'q' && have_escape)
+					break;
+				/*
+				 * The escape applies only to the character
+				 * which follows it. Without the reset, any
+				 * later 'q' would end the session.
+				 */
+				have_escape = false;
+				if (write(master, &c, 1) != 1)
+					break;
+			}
+		}
+		if (FD_ISSET(master, &fds)) {
+			if (!copy(master, STDOUT_FILENO))
+				break;
+		}
+	}
+	if (tcsetattr(STDIN_FILENO, TCSAFLUSH, &tios) < 0)
+		ERROR_LOG("tcsetattr: %m\n");
+	printf("\n");
+	return false;
+}
+EXPORT_CMD_HANDLER(attach);
+
+/*
+ * The help subcommand: without argument print the summary of all
+ * subcommands, otherwise the short or long help of the given subcommand.
+ */
+static bool com_help(void)
+{
+	int ret;
+	char *errctx, *help;
+	const char *arg;
+	const struct lls_command *cmd;
+
+	ret = lls_check_arg_count(sublpr, 0, 1, &errctx);
+	if (ret < 0)
+		die_lopsub(ret, &errctx);
+	if (lls_num_inputs(sublpr) == 0) {
+		show_subcommand_summary(OPT_GIVEN(HELP, LONG));
+		return true;
+	}
+	arg = lls_input(0, sublpr);
+	ret = lls_lookup_subcmd(arg, micoforia_suite, &errctx);
+	if (ret < 0)
+		die_lopsub(ret, &errctx);
+	cmd = lls_cmd(ret, micoforia_suite);
+	if (OPT_GIVEN(HELP, LONG))
+		help = lls_long_help(cmd);
+	else
+		help = lls_short_help(cmd);
+	printf("%s\n", help);
+	free(help);
+	return true;
+}
+EXPORT_CMD_HANDLER(help);
+
+/*
+ * The configtest subcommand. Nothing is checked here: by the time a handler
+ * runs, main() has already called parse_options() and check_options().
+ */
+static bool com_configtest(void)
+{
+	printf("Syntax Ok\n");
+	return true;
+}
+EXPORT_CMD_HANDLER(configtest);
+
+/* The edit subcommand: open the config file in $EDITOR, or in vi. */
+static bool com_edit(void)
+{
+	char *ed = getenv("EDITOR"); /* must not be freed */
+	char *conf = get_config_file_path();
+	char *argv[] = {ed? ed : "vi", conf, NULL};
+	bool success = xexec(argv, NULL);
+
+	free(conf);
+	return success;
+}
+EXPORT_CMD_HANDLER(edit);
+
+/*
+ * The enter subcommand: run a command in the namespaces of a running
+ * container by means of nsenter(1). The first argument is the container
+ * name, the remaining arguments, if any, are the command to execute. The
+ * default is to log in as root.
+ */
+static bool com_enter(void)
+{
+	char str[20];
+	char **argv;
+	char *nsenter_args[] = {"nsenter", "-w", "-a", "-r", "-t"};
+	const unsigned nna = ARRAY_SIZE(nsenter_args); /* num nsenter args */
+	char *dflt_cmd[] = {"login", "-f", "root"};
+	unsigned n, N, ni = lls_num_inputs(sublpr);
+	unsigned nea = ni > 1?
ni - 1 : ARRAY_SIZE(dflt_cmd); /* num extra args */ + const char *arg; + bool success; + int ret, pid; + char *errctx; + + ret = lls_check_arg_count(sublpr, 1, INT_MAX, &errctx); + if (ret < 0) + die_lopsub(ret, &errctx); + arg = lls_input(0, sublpr); + if (!is_locked(arg, &pid)) { + ERROR_LOG("container not running: %s\n", arg); + return false; + } + if (!request_init_pid(arg, &pid)) + return false; + N = nna + nea + 2; /* +1 for arg to -t and +1 for terminating NULL */ + argv = xmalloc(N * sizeof(char *)); + for (n = 0; n < nna; n++) + argv[n] = nsenter_args[n]; + sprintf(str, "%d", pid); + argv[nna] = str; + for (n = 0; n < nea; n++) + argv[nna + 1 + n] = ni > 1? (char *)lls_input(n + 1, sublpr) + : dflt_cmd[n]; + argv[N - 1] = NULL; + clean_env(); + success = xexec(argv, NULL); + free(argv); + return success; +} +EXPORT_CMD_HANDLER(enter); + +static bool com_log(void) +{ + int ret; + char *errctx, *logfile; + bool success, use_less = isatty(STDIN_FILENO) && isatty(STDOUT_FILENO); + char *argv[] = {use_less? "less" : "cat", NULL /* filename */, NULL}; + + ret = lls_check_arg_count(sublpr, 1, 1, &errctx); + if (ret < 0) + die_lopsub(ret, &errctx); + logfile = get_container_logfile(lls_input(0, sublpr)); + argv[1] = logfile; + success = xexec(argv, NULL); + free(logfile); + return success; +} +EXPORT_CMD_HANDLER(log); + +int main(int argc, char *argv[]) +{ + int ret; + char *errctx; + const struct micoforia_user_data *ud; + unsigned num_inputs; + + valid_fd012(); + parse_options(argc, argv, CMD_PTR(MICOFORIA), &lpr); + loglevel_arg_val = OPT_UINT32_VAL(MICOFORIA, LOGLEVEL); + check_options(); + num_inputs = lls_num_inputs(lpr); + ret = lls_lookup_subcmd(argv[argc - num_inputs], micoforia_suite, &errctx); + if (ret < 0) + die_lopsub(ret, &errctx); + subcmd = lls_cmd(ret, micoforia_suite); + parse_options(num_inputs, argv + argc - num_inputs, subcmd, &sublpr); + ud = lls_user_data(subcmd); + exit(ud->handler()? 
EXIT_SUCCESS : EXIT_FAILURE); +} diff --git a/micoforia.suite.m4 b/micoforia.suite.m4 new file mode 100644 index 0000000..697edbc --- /dev/null +++ b/micoforia.suite.m4 @@ -0,0 +1,754 @@ +# SPDX-License-Identifier: GPL-2.0-only +[suite micoforia] + caption = Subcommands + mansect = 8 + manual_title = System Manager's Manual +[supercommand micoforia] + [description] + DESCRIPTION1() + + DESCRIPTION2() + + DESCRIPTION3() + + In addition to global options which apply to all subcommands, each + subcommand has its own set of options. The usual "--" separator must + be used to separate global options from subcommand specific options. + + [/description] + synopsis = [global-options...] [--] [ [subcommand-options...]] + purpose = SLOGAN() + + [option general-options-section] + summary = General options + flag ignored + [option help] + summary = print help and exit + short_opt = h + [option detailed-help] + summary = print help, including all details, and exit + [option version] + summary = print version and exit + short_opt = V + [option config-file] + short_opt = c + summary = use alternative config file (default: ~/.mismarc) + typestr = path + arg_info = required_arg + arg_type = string + [help] + Options may be given at the command line or in the configuration + file. As usual, if an option is given both at the command line and + in the configuration file, the command line option takes precedence. + + The config file may contain global options as well as options for + any subcommand, but subcommand specific options must be placed in a + separate section. See the Examples section of the man page. 
+ [/help] + [option loglevel] + summary = control amount of logging + short_opt = l + arg_info = required_arg + arg_type = string + typestr = severity + values = { + LSGLL_DEBUG = "debug", + LSGLL_INFO = "info", + LSGLL_NOTICE = "notice", + LSGLL_WARNING = "warning", + LSGLL_ERROR = "error", + LSGLL_CRIT = "crit", + LSGLL_EMERG = "emerg" + } + default_val = warning + [help] + Log only messages with severity greater than or equal to the given + value. Possible values: + + debug: produces really noisy output. + info: still noisy, but won't fill up the disk quickly. + notice: indicates normal, but significant event. + warning: unexpected events that can be handled. + error: unhandled error condition. + crit: system might be unreliable. + emerg: last message before exit. + [/help] + + [option general-options-section] + summary = Global Container Options + flag ignored + [help] + The options in this section apply to all containers. Most of them + have a per-container counterpart which can be specified to override + the global default. + [/help] + [option default-root-prefix] + summary = path to the parent directory of the container root file systems + typestr = directory + arg_info = required_arg + arg_type = string + default_val = /var/lib/micoforia + [help] + For containers which do not specify their own root directory the path + to the container root is derived from the argument of this option by + appending a slash and the container name. + [/help] + [option logdir] + summary = directory which contains the container log files + arg_info = required_arg + arg_type = string + typestr = directory + default_val = /var/log/micoforia + [help] + The log messages of each container are written to a dedicated + logfile. This option controls in which directory these files are + written (start subcommand) or expected (log subcommand). + + Nothing is written to the logfile if the container is started in + foreground mode. 
+ [/help] + [option default-pre-start-hook] + summary = command to be executed before the container starts + typestr = command + arg_info = required_arg + arg_type = string + default_val = true + [help] + This hook is run early during container startup. All veth device + pairs have been created, but no namespace or cgroup operations have + been performed at this point. + + If the root file system of the container must be prepared, this is the + right place to perform this task. Unlike the pre exec hook described + below, this hook is only called once. + + The following environment variables are set: MICOFORIA_CONTAINER_NAME, + MICOFORIA_IFSPECS, MICOFORIA_ROOT_DIR. + [/help] + [option default-pre-exec-hook] + summary = command to be executed before /sbin/init is executed + typestr = command + arg_info = required_arg + arg_type = string + default_val = true + [help] + This runs with all namespaces already unshared and cgroup settings + applied but before the root directory is switched to the container + root. The hostname has already been changed to the container name + and the network interfaces have been renamed to eth0, eth1, etc. + + This is the right place to perform additional cgroup or namespace + operations. When the container is rebooted, the pre-exec is called + again, just before control is handed over to the new init process. + + Only MICOFORIA_ROOT_DIR is set in this hook. + [/help] + [option default-init] + summary = control the handover to the init process of the container + typestr = command + arg_info = required_arg + arg_type = string + default_val = /sbin/init + [help] + This program is executed as the last step of the container startup + procedure as pid 1. At this point the root directory of the process + has already been changed, so the given argument refers to a path + relative to the container root directory. 
+ [/help] + [option default-bridge] + summary = ethernet bridge to use by default + typestr = bridge + flag multiple + arg_info = required_arg + arg_type = string + default_val = micoforia + [help] + Applies to all containers which do not specify their own network + interface(s) with --net. If this is given multiple times, containers + will be equipped with multiple interfaces. + [/help] + [option default-cgroup-dac] + summary = specify which device nodes containers may access/create by default + typestr = dacspec + flag multiple + arg_info = required_arg + arg_type = string + [help] + Applies to all containers which do not specify their own access + control lists. May be given multiple times. Each device access control + specifier must be of the form {allow|deny} , where + is a suitable device access control string for the devices.allow or + devices.deny file of the cgroup-v1 controller. Order matters. + + If this option is not given, and the corresponding per-container + option is not given either, a reasonable default applies which allows + access to the most common character devices (/dev/zero, /dev/null, + /dev/urandom, etc.) but denies access to most other devices including + all block devices. + + Example: allow c 1:5 rwm + [/help] + [option default-cpu-cores] + summary = Number of cores to use by default (zero means unlimited) + typestr = num + arg_info = required_arg + arg_type = uint32 + default_val = 0 + [help] + The limit is enforced by the cpu cgroup-v2 controller. Note that in + contrast to the cpuset controller of cgroup-v1 this controller does not + restrict the container to a set of admissible CPUs. Instead, it limits + the number of CPU cycles per time unit for the processes in the cgroup. 
+ [/help] + [option default-memory-limit] + summary = Memory usage throttle limit (zero means no limit) + typestr = gigabytes + arg_info = required_arg + arg_type = uint32 + default_val = 0 + [help] + The value specified here is written to the cgroup-v2 memory.high + control file of all containers which do not specify their own limit. + [/help] + [option default-io-max] + summary = I/O limit (zero means no limit) + flag multiple + typestr = iospec + arg_info = required_arg + arg_type = string + [help] + The I/O specifier argument must be a valid string for the io.max file + of the cgroup-v2 controller. For example, the string "1:5 rbps=1024" + limits the read I/O rate for the /dev/zero device to 1K per second. + [/help] + [option default-capdrop] + summary = Capabilities to drop by default + typestr = capspec + flag multiple + arg_info = required_arg + arg_type = string + [help] + The capability specifier argument is the text representation of a + capability, like CAP_SYS_MODULE. All given capabilities will be dropped + from the bounding set of the container init process, hence from + all processes of the container. If this option is not given, and no + per-container capabilities to drop are given either, CAP_SYS_MODULE, + CAP_SYS_TIME, and CAP_SYS_RESOURCE are dropped. + + See capabilities(7) for the list of capabilities and their meaning. + [/help] + [option default-tty] + summary = Minor number of a tty device to capture by default + typestr = minor + flag multiple + arg_info = required_arg + arg_type = uint32 + [help] + Normally the container's init process starts at least one "getty" + login session on a tty port /dev/ttyX, where X is the minor device + ID. This option lets you capture these login sessions and forward them + to another micoforia process executing the "attach" subcommand. For + each time the option is given, the device with the given minor device + number is captured. + + If this is not given, /dev/tty1 will be captured. 
+ [/help] + [option general-options-section] + summary = Per-Container Options + flag ignored + [help] + These override the global container options above. Most of them take + a compound argument of the form , where the first part + is the name of the container to which the option should be applied. + + Unless noted otherwise, if both a global option and the corresponding + per-container option is given, the per-container option takes + precedence. + [/help] + [option container] + summary = name of the container + flag multiple + typestr = name + arg_info = required_arg + arg_type = string + [help] + Used for the hostname, the name of the veth interfaces and the name of + the cgroup directory. The name may only contain characters of the set + [a-zA-Z0-9-] and the length must not exceed 32 characters. + + This does not need to be given if one of the compound options below + are given instead. + [/help] + [option pre-start-hook] + summary = See --default-pre-start-hook + flag multiple + typestr = name:command + arg_info = required_arg + arg_type = string + [option pre-exec-hook] + summary = See --default-pre-exec-hook + flag multiple + typestr = name:command + arg_info = required_arg + arg_type = string + [option init] + summary = See --default-init + typestr = name:command + flag multiple + arg_info = required_arg + arg_type = string + [option net] + summary = Equip the container with a non-default network interface + flag multiple + typestr = name:ifspec + arg_info = required_arg + arg_type = string + [help] + The interface specifier is of the form bridge[:hwaddr]. If no hardware + address is given, a random address will be used. See --default-bridge. + + Unlike the other compound options of this section, this option is + cumulative in that multiple options with the same container name do + not override each other but accumulate, resulting in a container with + multiple network interfaces. 
+ [/help] + [option root-directory] + summary = Path to the container root directory. See --default-root-prefix. + flag multiple + typestr = name:path + arg_info = required_arg + arg_type = string + [help] + [/help] + [option cgroup-dac] + summary = See --default-cgroup-dac + typestr = name:dacspec + flag multiple + arg_info = required_arg + arg_type = string + [option cpu-cores] + summary = See --default-cpu-cores + typestr = name:num + flag multiple + arg_info = required_arg + arg_type = string + [option memory-limit] + summary = See --default-memory-limit + typestr = name:gigabytes + flag multiple + arg_info = required_arg + arg_type = string + [option io-max] + summary = See --default-io-max + flag multiple + typestr = name:iospec + arg_info = required_arg + arg_type = string + [option capdrop] + summary = See --default-capdrop + flag multiple + typestr = name:capspec + arg_info = required_arg + arg_type = string + [option tty] + summary = See --default-tty + typestr = name:minor + flag multiple + arg_info = required_arg + arg_type = string + +[introduction] + micoforia supports the subcommands described below. If no subcommand + is given, the list of available subcommands is shown and the program + terminates successfully without performing any further action. +[/introduction] + +[subcommand start] + purpose = start one or more containers + non-opts-name = [...] + [description] + If no container is given, all configured containers are started. + [/description] + [option foreground] + short_opt = F + summary = do not run as background daemon + [help] + Normally, the process detaches from the console and continues to run + in the background. When this option is given, only a single container + can be started, and this container will run with its /dev/console + device redirected to the local tty, making the container startup + messages visible on the local tty. 
+ + Moreover, stdin is forwarded to the first configured tty device + (/dev/tty1 by default) of the container, and anything received from + the other end of the forwarding is dumped to stdout. This allows for + logins on the "local" console of the container, provided the container + starts a getty process which listens on the tty device. + [/help] +[subcommand stop] + purpose = shutdown one or more containers + non-opts-name = [...] + [description] + This subcommand works by executing halt(8) in container context. + If no container is given, halt(8) is executed in all configured + container contexts. + [/description] + [option wait] + short_opt = w + summary = wait until all containers have terminated + [help] + Without --wait the micoforia process which executes the stop + subcommand exits after spawning one halt(8) process per container + to be stopped. If --wait is given, the subcommand waits until all + containers have terminated or the timeout expires. This is handy for + system shutdown scripts which are supposed to terminate all running + containers. + [/help] + [closing] + If --wait is not given, the subcommand exits successfully if and only + if all signals were sent successfully. With --wait the subcommand + exits successfully if, additionally, all signalled processes have + terminated before the timeout expires. + [/closing] + +[subcommand reboot] + purpose = reboot containers + non-opts-name = [...] + [description] + Containers are rebooted and killed by sending a signal to a micoforia + process which executes the start subcommand. + [/description] +[subcommand kill] + purpose = force containers to terminate + non-opts-name = [...] + [description] + This works like the reboot subcommand, but a different signal is used + to notify the container. 
+ [/description] + [option wait] + short_opt = w + summary = wait until all signalled containers have terminated + [help] + Without --wait the micoforia process which executes the kill subcommand + exits right after the underlying kill(2) system call returns. At this + point the signalled process might still be alive although SIGKILL + was sent. If --wait is given, the process waits until the signalled + processes have terminated or the timeout expires. + [/help] +[subcommand ls] + purpose = list containers + non-opts-name = [...] + [description] + Several listing modes are available. By default, only the running + containers are listed. If no container name is given, all configured + containers are taken into account. + + [/description] + [option all] + short_opt = a + summary = Also list containers which are not running + [option quiet] + short_opt = q + summary = Do not print any output + [help] + For scripts to determine from the exit code whether all of the given + containers are running. + [/help] + [option long] + short_opt = l + summary = Show also the pid, and the cpu and memory limits + [help] + This overrides --quiet. That is, if both --quiet and --long are given, + the long listing is shown, + [/help] + [option verbose] + short_opt = v + summary = Show all container settings, one setting per line + [help] + This overrides --quiet and --long. + [/help] + [closing] + The subcommand exits successfully if and only if all given/configured + containers could be listed. Unless --all is given, it is considered + an error if a given container is not running. In particular, when ls + is executed with no arguments at all, it exits successfully if and + only if all configured containers are running. + [/closing] +[subcommand ps] + purpose = print process list of one or more containers + non-opts-name = [...] + [description] + This runs pstree(1). The container init process is always the third + process shown. 
Process IDs refer to the parent PID namespace, which + is why the process ID of the container init is not shown as 1. + [/description] + [option all] + short_opt = a + summary = also show the two micoforia processes +[subcommand attach] + purpose = map the console of a running container to the local terminal. + non-opts-name = [...] + [description] + It is an error if stdin or stdout is not associated with a terminal + device. + [/description] + [option tty] + short_opt = t + summary = terminal to connect + arg_info = required_arg + arg_type = uint32 + typestr = minor + default_val = 1 + [help] + This operation can only succeed if the given tty is forwarded by the + container. See --tty and --default-tty. + [/help] + [option force] + short_opt = f + summary = don't fail but steal the tty if it is already attached +[subcommand help] + purpose = list available subcommands or print subcommand-specific help + non-opts-name = [subcommand] + [description] + Without any arguments, help prints the list of available + subcommands. When called with a subcommand name argument, it prints + the help text of the given subcommand. + [/description] + [option long] + short_opt = l + summary = show the long help text + [help] + If the optional argument is supplied, the long help text contains the + synopsis, the purpose and the description of the specified subcommand, + followed by the option list including summary and help text of each + option. Without --long, the short help is shown instead. This omits + the description of the subcommand and the option help. + + If no subcommand is supplied but --long is given, the list contains the + purpose of each subcommand. + [/help] + +[subcommand configtest] + purpose = run a configuration file syntax test + [description] + This subcommand checks the command line options and the configuration + file for syntactic and semantic correctness. 
It either reports + "Syntax Ok" and exits successfully or prints information about the + first error and terminates with exit code 1. + [/description] + +[subcommand edit] + purpose = edit the configuration file + [description] + The editor to start is derived from the EDITOR environment variable. + If this variable is not set, vi is assumed. + [/description] + +[subcommand enter] + purpose = run a command in a container namespace + non-opts-name = [ [arg...]] + [description] + This executes the nsenter(1) command to enter the namespaces of + the init process of the given container. If no command is given, + the login command is run to start a root shell. + [/description] + +[subcommand log] + purpose = show the log file for the given container + non-opts-name = [] + [description] + This executes cat(1) or less(1), depending on whether or not stdin + and stdout are associated with a terminal device. + [/description] +[section Notes] +.SS The Cgroup File Systems + There are two implementations of Linux control groups called + .I cgroup-v1 + and + .IR cgroup-v2 . + Both come with their own pseudo filesystem. + .B micoforia + requires both file systems to be mounted at + .IR /var/cgroup + and + .IR /var/cgroup2 . + Version 1 cgroups are only used to enforce device access control for + the containers, so the cgroup-v1 pseudo filesystem should be mounted + with only this controller enabled. See the Examples section below + for how to do this. Future versions of + .B micoforia + might switch to the devices controller of cgroup-v2. +.SS Container Names + The container name is also used for the name of the network device + and as a directory name if no explicit root directory is given with + --root-directory. Therefore container names must not exceed 32 characters, + which must all be alphanumeric or '-'. In particular, whitespace and + underscore ('_') are not permitted. 
+ +[/section] +[section Examples] + .IP \(bu 2 + Create a bash alias named + .I m7a + for + .I micoforia + which activates debug messages and already includes the double dash + to separate global options from subcommand options: + + .RS 6 + .EX + .B alias m7a='micoforia --loglevel debug --' + .EE + .RE + .IP \(bu 2 + Set up an ethernet bridge named + .IR micoforia , + add the physical interface + .I eth1 + to it and give the bridge interface an IP address: + + .RS 6 + .EX + .B brctl addbr micoforia + .B ip link set up micoforia + .B brctl addif micoforia eth1 + .B ip a a 192.168.137.1/24 dev micoforia + .EE + .RE + .IP \(bu 2 + Mount the two cgroup file systems, but only activate the + .I devices + controller of cgroup-v1: + + .RS 6 + .EX + .B mkdir -p /var/cgroup && mount -t cgroup -o devices cgroup /var/cgroup + .B mkdir -p /var/cgroup2 && mount -t cgroup2 cgroup2 /var/cgroup2 + .EE + .RE + .IP \(bu 2 + Entries for + .I /etc/fstab + to mount the cgroup file systems automatically at boot: + + .RS 6 + .EX + .B none /var/cgroup cgroup devices 0 0 + .B none /var/cgroup2 cgroup2 defaults 0 0 + .EE + .RE + .IP \(bu 2 + Download a Debian10 root file system to + .IR /var/lib/micoforia/debian10 , + set the root password and let micoforia set the hostname + + .RS 6 + .EX + .B debootstrap --variant=minbase buster /var/lib/micoforia/debian10 http://deb.debian.org/debian/ + .B chroot /var/lib/micoforia/debian10 passwd + .B rm -f /var/lib/micoforia/debian10/etc/hostname + .EE + .RE + .IP \(bu 2 + Download a minimal Ubuntu-18.04 root file system to + .IR /var/lib/micoforia/c1 , + set the root password and configure the + .I eth0 + interface, using a static IP address: + + .RS 6 + .EX + .B debootstrap --include openssh-server --include ifupdown bionic /var/lib/micoforia/c1 http://de.archive.ubuntu.com/ubuntu + .B chroot /var/lib/micoforia/c1 passwd + .B printf 'auto eth0\(rsniface eth0 inet static\(rsnaddress 192.168.137.2/24\(rsn' \ + >> 
/var/lib/micoforia/c1/etc/network/interfaces + .B echo 'PermitRootLogin yes' >> /var/lib/micoforia/c1/etc/ssh/sshd_config + .EE + .RE + .IP \(bu 2 + Start the container in foreground mode: + + .RS 6 + .EX + .B micoforia --container c1 -- start --foreground + .EE + .RE + .IP \(bu 2 + Attach to + .I tty1 + of the running container: + + .RS 6 + .EX + .B m7a attach c1 + .EE + .RE + .IP \(bu 2 + Ask the container to shut down, and wait for the shutdown procedure + to complete: + + .RS 6 + .EX + .B m7a stop --wait c1 + .EE + .RE + .IP \(bu 2 + Check whether the container is running: + + .RS 6 + .EX + .B m7a ls --quiet c1 && echo yes || echo no + .EE + .RE + .IP \(bu 2 + A simple config file: + + .RS 6 + .EX + .B # two global options + .B loglevel info + .B container c1 + .B # an option for the "attach" subcommand + .B [attach] + .B \ \ \ \ tty 2 + .EE + .RE + +[/section] +[section copyright] + Written by AUTHOR() + .br + Copyright (C) COPYRIGHT_YEAR() AUTHOR() + .br + License: LICENSE() + .br + This is free software: you are free to change and redistribute it. + .br + There is NO WARRANTY, to the extent permitted by law. + .P + Web page: + .UR URL() + .UE + .br + Git clone `URL': + .UR CLONE_URL() + .UE + .br + Gitweb: + .UR GITWEB_URL() + .UE + .br + Author's home page: + .UR HOME_URL() + .UE + .br + Report bugs to + .MT EMAIL() + AUTHOR() + .ME +[/section] +[section see also] + .BR lxc (7), + .BR brctl (8), + .BR ip (8), + .BR pstree (1) +[/section] diff --git a/micoforia.svg b/micoforia.svg new file mode 100644 index 0000000..4c7a3f9 --- /dev/null +++ b/micoforia.svg @@ -0,0 +1,26 @@ + + + + + + + + diff --git a/util.c b/util.c new file mode 100644 index 0000000..ebe5b1e --- /dev/null +++ b/util.c @@ -0,0 +1,1142 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#include "m7a.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +void die(const char *fmt, ...) 
+{
+	char *str;
+	va_list argp;
+	int ret;
+
+	va_start(argp, fmt);
+	ret = vasprintf(&str, fmt, argp);
+	va_end(argp);
+	if (ret < 0) { /* give up */
+		EMERG_LOG("OOM\n");
+		exit(EXIT_FAILURE);
+	}
+	m7a_log(LL_EMERG, "%s\n", str);
+	exit(EXIT_FAILURE);
+}
+
+/*
+ * Like die(), but append a colon and the error string which corresponds to
+ * the value of errno at the time of the call.
+ */
+void die_errno(const char *fmt, ...)
+{
+	char *str;
+	va_list argp;
+	int ret, save_errno = errno;
+
+	va_start(argp, fmt);
+	ret = vasprintf(&str, fmt, argp);
+	va_end(argp);
+	if (ret < 0) {
+		EMERG_LOG("OOM\n");
+		exit(EXIT_FAILURE);
+	}
+	m7a_log(LL_EMERG, "%s: %s\n", str, strerror(save_errno));
+	exit(EXIT_FAILURE);
+}
+
+/*
+ * realloc(3) wrapper which never returns NULL.
+ *
+ * The size must be positive. If the allocation fails, a message is logged
+ * and the process exits.
+ */
+void *xrealloc(void *p, size_t size)
+{
+	void *q;
+
+	assert(size > 0);
+	/*
+	 * The call must not be the argument of assert(): with NDEBUG
+	 * defined the whole expression, including the allocation, would be
+	 * compiled out.
+	 */
+	q = realloc(p, size);
+	if (!q) {
+		EMERG_LOG("OOM\n");
+		exit(EXIT_FAILURE);
+	}
+	return q;
+}
+
+/* Allocate size bytes of uninitialized memory, never returns NULL. */
+void *xmalloc(size_t size)
+{
+	return xrealloc(NULL, size);
+}
+
+/* Allocate size bytes of zeroed memory, never returns NULL. */
+void *xzmalloc(size_t size)
+{
+	void *p = xrealloc(NULL, size);
+	memset(p, 0, size);
+	return p;
+}
+
+/*
+ * strdup(3) wrapper which never returns NULL. A NULL argument is treated
+ * like the empty string.
+ */
+void *xstrdup(const char *s)
+{
+	char *ret = strdup(s? s: "");
+
+	/* Check explicitly so that the test survives NDEBUG builds. */
+	if (!ret) {
+		EMERG_LOG("OOM\n");
+		exit(EXIT_FAILURE);
+	}
+	return ret;
+}
+
+/*
+ * Return a newly allocated string which contains the formatted output.
+ * The caller is responsible for freeing the result.
+ */
+char *msg(const char *fmt, ...)
+{
+	char *m;
+	size_t size = 100;
+
+	m = xmalloc(size);
+	while (1) {
+		int n;
+		va_list ap;
+
+		/* Try to print in the allocated space. */
+		va_start(ap, fmt);
+		n = vsnprintf(m, size, fmt, ap);
+		va_end(ap);
+		/*
+		 * A negative value indicates an output error. It must be
+		 * caught before the comparison below, where it would be
+		 * converted to a huge unsigned number.
+		 */
+		if (n < 0)
+			die_errno("vsnprintf");
+		/* If that worked, return the string. */
+		if ((size_t)n < size)
+			return m;
+		/* Else try again with more space. */
+		size = (size_t)n + 1; /* precisely what is needed */
+		m = xrealloc(m, size);
+	}
+}
+
+/*
+ * Append b to a.
+ *
+ * If a is NULL, a copy of b is returned. If b is NULL, a is returned
+ * unmodified. Otherwise a is freed and the concatenation is returned.
+ */
+char *xstrcat(char *a, const char *b)
+{
+	char *tmp;
+
+	if (!a)
+		return xstrdup(b);
+	if (!b)
+		return a;
+	tmp = msg("%s%s", a, b);
+	free(a);
+	return tmp;
+}
+
+/* Abort because the argument to the given option is empty. */
+void die_empty_arg(const char *opt)
+{
+	die("argument to --%s must not be empty", opt);
+}
+
+/* Abort because the argument to the given option is out of range. */
+__attribute__ ((noreturn))
+static void die_range(const char *opt)
+{
+	die("argument to --%s is out of range", opt);
+}
+
+/* Abort unless min <= val <= max. */
+void check_range(uint32_t val, uint32_t min, uint32_t max, const char *opt)
+{
+	if (val < min || val > max)
+		die_range(opt);
+}
+
+/*
+ * Read from fd until end of file and store the data, followed by a
+ * terminating zero byte, in the buffer described by iov.
+ *
+ * Returns false on read errors and if the buffer fills up before end of
+ * file is seen. At most iov_len - 2 bytes of payload are accepted because
+ * a completely filled buffer is reported as truncation.
+ */
+bool fd2buf(int fd, const struct iovec *iov)
+{
+	ssize_t ret, nread = 0, max;
+	char *buf = iov->iov_base;
+
+	assert(iov->iov_len > 1);
+	max = iov->iov_len - 1;
+	for (;;) {
+		ret = read(fd, buf + nread, max - nread);
+		if (ret < 0) {
+			if (errno == EAGAIN || errno == EINTR)
+				continue;
+			ERROR_LOG("read error: %s\n", strerror(errno));
+			return false;
+		}
+		if (ret == 0) {
+			buf[nread] = '\0';
+			DEBUG_LOG("read %zd bytes\n", nread);
+			return true;
+		}
+		nread += ret;
+		if (nread >= max) {
+			ERROR_LOG("cmd output truncated\n");
+			return false;
+		}
+	}
+}
+
+/*
+ * Run a command in a child process and wait for it to terminate.
+ *
+ * If iov is not NULL, stdout of the child is redirected to a pipe and its
+ * output is stored in the buffer described by iov, see fd2buf().
+ *
+ * Returns true if the output (if requested) could be captured and the
+ * child exited with status EXIT_SUCCESS.
+ */
+bool xexec(char * const argv[], const struct iovec *iov)
+{
+	pid_t pid;
+	int pipefd[2] = {-1, -1};
+	unsigned n;
+
+	for (n = 0; argv[n]; n++)
+		DEBUG_LOG("argv[%u]=%s\n", n, argv[n]);
+	if (iov) {
+		if (pipe(pipefd) < 0)
+			die_errno("pipe");
+	}
+	if ((pid = fork()) < 0)
+		die_errno("fork");
+	if (pid > 0) { /* parent */
+		int wstatus;
+		bool success = true;
+		if (iov) {
+			close(pipefd[1]);
+			success = fd2buf(pipefd[0], iov);
+			close(pipefd[0]);
+		}
+		/* An interrupted wait is not an error, just try again. */
+		while (waitpid(pid, &wstatus, 0) < 0) {
+			if (errno != EINTR)
+				die_errno("waitpid");
+		}
+		if (!success)
+			return false;
+		if (!WIFEXITED(wstatus))
+			return false;
+		if (WEXITSTATUS(wstatus) != EXIT_SUCCESS)
+			return false;
+		return true;
+	}
+	/* child */
+	if (pipefd[0] >= 0)
+		close(pipefd[0]);
+	if (pipefd[1] >= 0 && pipefd[1] != STDOUT_FILENO) {
+		if (dup2(pipefd[1], STDOUT_FILENO) < 0)
+			die_errno("dup2()");
+		close(pipefd[1]);
+	}
+	execvp(argv[0], argv);
+	EMERG_LOG("execvp error: %s\n", strerror(errno));
+	_exit(EXIT_FAILURE);
+}
+
+void valid_fd012(void)
+{
+	/* Ensure that file descriptors 0, 1, and 2 are valid. */
+	while (1) {
+		int fd = open("/dev/null", O_RDWR);
+		if (fd < 0)
+			die_errno("open");
+		if (fd > 2) {
+			close(fd);
+			break;
+		}
+	}
+}
+
+/*
+ * Abort unless arg consists of between 1 and 32 characters of the set
+ * [a-zA-Z0-9-].
+ */
+void check_name(const char *arg)
+{
+	size_t m, len;
+	char c;
+
+	len = strlen(arg);
+	if (len == 0)
+		die("empty name");
+	if (len > 32)
+		die("name too long: %s", arg);
+	for (m = 0; m < len; m++) {
+		c = arg[m];
+		/* checked first so that isalnum() only sees ASCII values */
+		if (!isascii(c))
+			goto invalid;
+		if (!isalnum(c) && c != '-')
+			goto invalid;
+	}
+	return;
+invalid:
+	die("invalid character '%c' in name %s", c, arg);
+}
+
+/* allocates two new strings that should be freed by the caller */
+void parse_compound_arg(const char *arg, const char *opt, char **name, char **val)
+{
+	char *copy, *p;
+
+	if (arg[0] == '\0')
+		die_empty_arg(opt);
+	copy = xstrdup(arg);
+	/* Split at the first colon: name to the left, value to the right. */
+	p = strchr(copy, ':');
+	if (!p)
+		die("could not parse argument to --%s", opt);
+	*p = '\0';
+	check_name(copy);
+	*name = copy;
+	p++;
+	*val = xstrdup(p);
+}
+
+/*
+ * Replace the leading "allow " or "deny " of a device access specifier by
+ * the single character 'a' or 'd'. The result must be freed by the caller.
+ * Anything else is a fatal error.
+ */
+char *parse_cgroup_acl(const char *arg)
+{
+	if (!strncmp(arg, "allow ", 6))
+		return msg("a%s", arg + 6);
+	if (!strncmp(arg, "deny ", 5))
+		return msg("d%s", arg + 5);
+	die("invalid cgroup access specifier: %s", arg);
+}
+
+/*
+ * Parse an interface specifier of the form bridge[:hwaddr].
+ *
+ * The bridge name is returned as a newly allocated string. If no hardware
+ * address is given, the six bytes of hwaddr are set to zero. Otherwise the
+ * address must consist of six colon-separated groups of two hex digits.
+ */
+void parse_ifspec(const char *arg, char **bridge, uint8_t *hwaddr)
+{
+	const char *colon = strchr(arg, ':');
+	size_t len;
+	unsigned n, x[6];
+	int consumed = 0;
+
+	if (colon) {
+		len = colon - arg;
+		*bridge = xmalloc(len + 1);
+		memcpy(*bridge, arg, len);
+		(*bridge)[len] = '\0';
+	} else
+		*bridge = xstrdup(arg);
+	check_name(*bridge);
+	if (!colon) {
+		memset(hwaddr, 0, 6);
+		return;
+	}
+	/*
+	 * %n records how many characters were consumed. Checking it makes
+	 * sure that the terminator test below never looks beyond the end of
+	 * a short address such as "1:2:3:4:5:6".
+	 */
+	if (sscanf(colon + 1, "%02x:%02x:%02x:%02x:%02x:%02x%n",
+			x, x + 1, x + 2, x + 3, x + 4, x + 5, &consumed) != 6
+			|| consumed != 6 * 2 + 5)
+		die("invalid hwaddress for ifspec %s", arg);
+	if (colon[1 + consumed] != '\0')
+		die("trailing garbage at the end of ifspec %s", arg);
+	for (n = 0; n < 6; n++)
+		hwaddr[n] = x[n];
+}
+
+/*
+ * Convert a decimal string to a 32 bit unsigned integer. Any problem with
+ * the conversion is a fatal error; opt is only used for the error message.
+ */
+uint32_t atou32(const char *str, const char *opt)
+{
+	char *endptr;
+	long long tmp;
+
+	errno = 0; /* To distinguish success/failure after call */
+	tmp = strtoll(str, &endptr, 10);
+	if (errno == ERANGE && (tmp == LLONG_MAX || tmp == LLONG_MIN))
+		die_range(opt);
+	if (tmp < 0 || tmp > (uint32_t)-1)
+		die_range(opt);
+	/*
+	 * If there were no digits at all, strtoll() stores the original value
+	 * of str in *endptr.
+	 */
+	if (endptr == str)
+		die_empty_arg(opt);
+	/*
+	 * The implementation may also set errno and return 0 in case no
+	 * conversion was performed.
+	 */
+	if (errno != 0 && tmp == 0)
+		die_empty_arg(opt);
+	if (*endptr != '\0') /* Further characters after number */
+		die("--%s: trailing characters after number", opt);
+	return tmp;
+}
+
+/*
+ * Remove all subdirectories of the given directory, depth first. Entries
+ * which are not directories are left alone, and so is the directory itself.
+ *
+ * Returns false if the directory could not be opened or if one of the
+ * subdirectories could not be removed.
+ */
+bool remove_subdirs_recursively(const char *path)
+{
+	DIR *d = opendir(path);
+	struct dirent *entry;
+	int dfd;
+	struct stat stat;
+	bool success = true;
+
+	if (!d) {
+		ERROR_LOG("opendir %s: %m\n", path);
+		return false;
+	}
+	dfd = dirfd(d);
+	assert(dfd >= 0);
+	while ((entry = readdir(d))) {
+		char *subpath;
+		if (!strcmp(entry->d_name, "."))
+			continue;
+		if (!strcmp(entry->d_name, ".."))
+			continue;
+		if (fstatat(dfd, entry->d_name, &stat, 0) == -1) {
+			WARNING_LOG("%s/%s: %m\n", path, entry->d_name);
+			continue;
+		}
+		if (!S_ISDIR(stat.st_mode))
+			continue;
+		subpath = msg("%s/%s", path, entry->d_name);
+		/*
+		 * The return value is not checked: if the recursion fails,
+		 * the directory is not empty and rmdir() below fails, too.
+		 */
+		remove_subdirs_recursively(subpath);
+		DEBUG_LOG("removing %s\n", subpath);
+		if (rmdir(subpath) < 0) {
+			ERROR_LOG("rmdir %s: %m\n", subpath);
+			success = false;
+		}
+		free(subpath);
+		/* Leave via the common exit path so the stream is closed. */
+		if (!success)
+			break;
+	}
+	closedir(d);
+	return success;
+}
+
+void daemonize(const char *logfile)
+{
+	pid_t pid;
+	int nullfd, logfd;
+
+	if ((pid = fork()) < 0)
+		die_errno("fork");
+	if (pid) /* parent exits */
+		exit(EXIT_SUCCESS);
+	valid_fd012();
+	/* become session leader */
+	if (setsid() < 0)
+		die_errno("setsid");
+	if ((nullfd = open("/dev/null", O_RDWR)) < 0)
+		die_errno("open /dev/null");
+	logfile 
= logfile? logfile : "/dev/null"; + if ((logfd = open(logfile, O_WRONLY | O_APPEND | O_CREAT, 0666)) < 0) + die_errno("open %s", logfile); + NOTICE_LOG("subsequent log messages go to %s\n", logfile); + if (dup2(nullfd, STDIN_FILENO) < 0) + die_errno("dup2"); + close(nullfd); + if (dup2(logfd, STDOUT_FILENO) < 0) + die_errno("dup2"); + if (dup2(logfd, STDERR_FILENO) < 0) + die_errno("dup2"); + close(logfd); + if (chdir("/") < 0) + die_errno("chdir"); +} + +static int super_dull_hash(const char *input) +{ + const uint8_t *x = (typeof(x))input; + const unsigned p1 = 16777619, p2 = 2971215073; + unsigned n, m, h, result = 0; + + for (n = 0; n < 4; n++) { + h = p1 * (x[0] + n); + for (m = 1; x[m] != 0; m++) + h = p2 * (h ^ x[m]); + result = (result << 8) | (h % 256); + } + return result >> 1; +} + +/** + * We use a semaphore set with two semaphores. The first semaphore is modified + * in all locking related functions while the second semaphore is modified only + * in try_lock() and aquire_lock(). This allows us to obtain the PID of the + * lock holder by querying the PID that last performed an operation on the + * second semaphore. This is achieved by passing GETPID as the control + * operation to semctl(). 
+ */ + +static bool get_lock(const char *string, pid_t *pid, bool wait) +{ + int semid, ret; + struct sembuf sops[4]; + key_t key = super_dull_hash(string); + bool success; + short sem_flg = SEM_UNDO; + + if (!wait) + sem_flg |= IPC_NOWAIT; + ret = semget(key, 2, IPC_CREAT | 0600); + if (ret < 0) { + ERROR_LOG("semget: %m\n"); + return false; + } + semid = ret; + DEBUG_LOG("key: 0x%0x, semid: %d\n", (unsigned)key, semid); + ret = semctl(semid, 1, GETPID); + if (ret < 0) + return false; + if (pid) + *pid = ret; + sops[0].sem_num = 0; + sops[0].sem_op = 0; + sops[0].sem_flg = sem_flg; + + sops[1].sem_num = 0; + sops[1].sem_op = 1; + sops[1].sem_flg = sem_flg; + + sops[2].sem_num = 1; + sops[2].sem_op = 0; + sops[2].sem_flg = sem_flg; + + sops[3].sem_num = 1; + sops[3].sem_op = 1; + sops[3].sem_flg = sem_flg; + + success = semop(semid, sops, 4) >= 0; + if (!success) + INFO_LOG("semop: %m\n"); + return success; +} + +bool try_lock(const char *string, pid_t *pid) +{ + return get_lock(string, pid, false /* don't wait */); +} + +bool acquire_lock(const char *string) +{ + return get_lock(string, NULL /* don't need pid */, true /* do wait */); +} + +bool release_lock(const char *string) +{ + int semid, ret; + struct sembuf sops[2]; + key_t key = super_dull_hash(string); + bool success; + + ret = semget(key, 2, IPC_CREAT | 0600); + if (ret < 0) { + ERROR_LOG("semget: %m\n"); + return false; + } + semid = ret; + DEBUG_LOG("key: 0x%0x, semid: %d\n", (unsigned)key, semid); + sops[0].sem_num = 0; + sops[0].sem_op = -1; + sops[0].sem_flg = SEM_UNDO; + sops[1].sem_num = 1; + sops[1].sem_op = -1; + sops[1].sem_flg = SEM_UNDO; + success = semop(semid, sops, 2) >= 0; + if (!success) + INFO_LOG("semop: %m\n"); + return success; +} + +bool is_locked(const char *string, pid_t *pid) +{ + int ret, semid; + struct sembuf sops = { + .sem_num = 0, + .sem_op = 0, + .sem_flg = SEM_UNDO | IPC_NOWAIT + }; + key_t key = super_dull_hash(string); + + if (pid) + *pid = 0; + ret = semget(key, 2, 0); 
+ if (ret < 0) + return false; + semid = ret; + DEBUG_LOG("key: 0x%0x, semid: %d\n", (unsigned)key, semid); + if (semop(semid, &sops, 1) >= 0) + return false; + ret = semctl(semid, 1, GETPID); + if (ret < 0) + return false; + if (pid) + *pid = ret; + return true; +} + +bool attach_to_bridge(const char *iface, const char *bridge) +{ + int fd, idx; + struct ifreq ifr; + bool success; + + INFO_LOG("adding interface %s to bridge %s\n", iface, bridge); + if (!(idx = if_nametoindex(iface))) { + ERROR_LOG("no index for %s\n", iface); + return false; + } + if ((fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) { + ERROR_LOG("socket: %m\n"); + return false; + } + strncpy(ifr.ifr_name, bridge, IFNAMSIZ - 1); + ifr.ifr_name[IFNAMSIZ - 1] = '\0'; + ifr.ifr_ifindex = idx; + success = ioctl(fd, SIOCBRADDIF, &ifr) == 0; + if (!success) + ERROR_LOG("interface %s, bridge %s: ioctl SIOCBRADDIF: %m\n", + iface, bridge); + close(fd); + return success; +} + + +#define NLMSG_TAIL(nmsg) \ + ((struct rtattr *) (((void *) (nmsg)) + NLMSG_ALIGN((nmsg)->nlmsg_len))) + +static void addattr_l(struct nlmsghdr *nlh, int type, const void *data, + int alen) +{ + int len = RTA_LENGTH(alen); + struct rtattr *rta; + + rta = NLMSG_TAIL(nlh); + rta->rta_type = type; + rta->rta_len = len; + if (alen > 0) + memcpy(RTA_DATA(rta), data, alen); + nlh->nlmsg_len = NLMSG_ALIGN(nlh->nlmsg_len) + RTA_ALIGN(len); +} + +static struct rtattr *addattr_nest(struct nlmsghdr *n, int type) +{ + struct rtattr *nest = NLMSG_TAIL(n); + addattr_l(n, type, NULL, 0); + return nest; +} + +static void end_nest(struct nlmsghdr *nlh, struct rtattr *attr) +{ + attr->rta_len = (void *)NLMSG_TAIL(nlh) - (void *)attr; +} + +static struct mnl_socket *get_and_bind_netlink_socket(void) +{ + struct mnl_socket *nl = mnl_socket_open(NETLINK_ROUTE); + + if (!nl) { + ERROR_LOG("mnl_socket_open error\n"); + return NULL; + } + if (mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID) < 0) { + ERROR_LOG("mnl_socket_bind\n"); + mnl_socket_close(nl); + return 
NULL; + } + return nl; +} + +static struct nlmsghdr *prepare_netlink_msg_header(char *buf) +{ + struct nlmsghdr *nlh = mnl_nlmsg_put_header(buf); + nlh->nlmsg_flags = NLM_F_REQUEST; + nlh->nlmsg_seq = time(NULL); + return nlh; +} + +bool rename_interface(const char *before, const char *after) +{ + int idx; + struct mnl_socket *nl; + char buf[MNL_SOCKET_BUFFER_SIZE]; + struct nlmsghdr *nlh; + struct ifinfomsg *ifm; + bool success; + + INFO_LOG("%s -> %s\n", before, after); + if (!(idx = if_nametoindex(before))) { + ERROR_LOG("no index for %s\n", before); + return false; + } + if (!(nl = get_and_bind_netlink_socket())) + return false; + + nlh = prepare_netlink_msg_header(buf); + nlh->nlmsg_type = RTM_NEWLINK; + + ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm)); + ifm->ifi_family = AF_UNSPEC; + ifm->ifi_index = idx; + addattr_l(nlh, IFLA_IFNAME, after, strlen(after) + 1); + if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) < 0) { + ERROR_LOG("mnl_socket_sendto failed\n"); + success = false; + goto close; + } + success = true; +close: + mnl_socket_close(nl); + return success; +} + +void pretty_print_hwaddr(const uint8_t *hwaddr, char *result) +{ + sprintf(result, "%02x:%02x:%02x:%02x:%02x:%02x", hwaddr[0], hwaddr[1], + hwaddr[2], hwaddr[3], hwaddr[4], hwaddr[5]); +} + +bool set_hwaddr(const char *iface, const uint8_t *hwaddr) +{ + struct mnl_socket *nl; + char buf[MNL_SOCKET_BUFFER_SIZE]; + struct nlmsghdr *nlh; + struct ifinfomsg *ifm; + bool success; + const uint8_t zero[6] = {0}; + char pretty_hwaddr[18]; + + if (!memcmp(hwaddr, zero, 6)) + return true; /* no hwaddr specified, nothing to do */ + pretty_print_hwaddr(hwaddr, pretty_hwaddr); + INFO_LOG("hardware address of %s: %s\n", iface, pretty_hwaddr); + if (!(nl = get_and_bind_netlink_socket())) + return false; + + nlh = prepare_netlink_msg_header(buf); + nlh->nlmsg_type = RTM_NEWLINK; + + ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm)); + ifm->ifi_family = AF_UNSPEC; + addattr_l(nlh, IFLA_ADDRESS, hwaddr, 
6); + addattr_l(nlh, IFLA_IFNAME, iface, strlen(iface) + 1); + if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) < 0) { + ERROR_LOG("%s: mnl_socket_sendto failed\n", iface); + success = false; + goto close; + } + success = true; +close: + mnl_socket_close(nl); + return success; +} + +bool link_del(const char *iface) +{ + struct mnl_socket *nl; + char buf[MNL_SOCKET_BUFFER_SIZE]; + struct nlmsghdr *nlh; + struct ifinfomsg *ifm; + bool success; + + INFO_LOG("removing interface %s\n", iface); + if (!(nl = get_and_bind_netlink_socket())) + return false; + + nlh = prepare_netlink_msg_header(buf); + nlh->nlmsg_type = RTM_DELLINK; + + ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm)); + ifm->ifi_family = AF_UNSPEC; + ifm->ifi_change = IFF_UP; + ifm->ifi_flags = IFF_UP; + addattr_l(nlh, IFLA_IFNAME, iface, strlen(iface) + 1); + if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) < 0) { + ERROR_LOG("%s: mnl_socket_sendto failed\n", iface); + success = false; + goto close; + } + success = true; +close: + mnl_socket_close(nl); + return success; +} + +bool link_up(const char *iface) +{ + struct mnl_socket *nl; + char buf[MNL_SOCKET_BUFFER_SIZE]; + struct nlmsghdr *nlh; + struct ifinfomsg *ifm; + bool success; + + INFO_LOG("activating interface %s\n", iface); + if (!(nl = get_and_bind_netlink_socket())) + return false; + nlh = prepare_netlink_msg_header(buf); + nlh->nlmsg_type = RTM_NEWLINK; + + ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm)); + ifm->ifi_family = AF_UNSPEC; + ifm->ifi_change = IFF_UP; + ifm->ifi_flags = IFF_UP; + addattr_l(nlh, IFLA_IFNAME, iface, strlen(iface) + 1); + if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) < 0) { + ERROR_LOG("%s: mnl_socket_sendto failed\n", iface); + success = false; + goto close; + } + success = true; +close: + mnl_socket_close(nl); + return success; +} + +#ifndef VETH_INFO_PEER +#define VETH_INFO_PEER 1 +#endif + +bool create_veth_device_pair(const char *name, char *peer) +{ + struct mnl_socket *nl; + char buf[MNL_SOCKET_BUFFER_SIZE]; 
+ struct rtattr *n1, *n2, *n3; + struct nlmsghdr *nlh; + struct ifinfomsg *ifm; + bool success; + + INFO_LOG("new pair: %s <-> %s\n", name, peer); + if (!(nl = get_and_bind_netlink_socket())) + return false; + + nlh = prepare_netlink_msg_header(buf); + nlh->nlmsg_type = RTM_NEWLINK; + nlh->nlmsg_flags |= NLM_F_CREATE | NLM_F_EXCL; + + ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm)); + ifm->ifi_family = AF_UNSPEC; + n1 = addattr_nest(nlh, IFLA_LINKINFO); + addattr_l(nlh, IFLA_INFO_KIND, "veth", 5); + n2 = addattr_nest(nlh, IFLA_INFO_DATA); + n3 = addattr_nest(nlh, VETH_INFO_PEER); + ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm)); + ifm->ifi_family = AF_UNSPEC; + addattr_l(nlh, IFLA_IFNAME, peer, strlen(peer) + 1); + end_nest(nlh, n3); + end_nest(nlh, n2); + end_nest(nlh, n1); + addattr_l(nlh, IFLA_IFNAME, name, strlen(name) + 1); + if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) < 0) { + ERROR_LOG("%s: mnl_socket_sendto\n", name); + success = false; + goto close; + } + success = true; +close: + mnl_socket_close(nl); + return success; +} + +bool set_netns(const char *iface, pid_t pid) +{ + struct mnl_socket *nl; + char buf[MNL_SOCKET_BUFFER_SIZE]; + struct nlmsghdr *nlh; + struct ifinfomsg *ifm; + + INFO_LOG("changing net namespace of interface %s to pid %d\n", + iface, (int)pid); + if (!(nl = get_and_bind_netlink_socket())) + return false; + + nlh = prepare_netlink_msg_header(buf); + nlh->nlmsg_type = RTM_NEWLINK; + + ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm)); + ifm->ifi_family = AF_UNSPEC; + ifm->ifi_change = 0; + ifm->ifi_flags = 0; + addattr_l(nlh, IFLA_NET_NS_PID, &pid, sizeof(pid)); + mnl_attr_put_str(nlh, IFLA_IFNAME, iface); + + if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) < 0) { + ERROR_LOG("%s: mnl_socket_sendto failed\n", iface); + return false; + } + mnl_socket_close(nl); + return true; +} + +#ifndef UNIX_PATH_MAX +#define UNIX_PATH_MAX (sizeof(((struct sockaddr_un *)0)->sun_path)) +#endif + +static bool init_unix_socket(const char 
*socket_path, int *socketfd, + struct sockaddr_un *sau) +{ + int fd; + + *socketfd = -1; + if (strlen(socket_path) + 1 >= UNIX_PATH_MAX) { + ERROR_LOG("socket path to long: %s\n", socket_path); + return false; + } + memset(sau, 0, sizeof(struct sockaddr_un)); + sau->sun_family = PF_UNIX; + sau->sun_path[0] = '\0'; /* use the abstract socket namespace */ + strcpy(sau->sun_path + 1, socket_path); + fd = socket(PF_UNIX, SOCK_STREAM, 0); + if (fd < 0) { + ERROR_LOG("socket: %m\n"); + return false; + } + *socketfd = fd; + return true; +} + +bool listen_on_unix_socket(const char *socket_path, int *result) +{ + struct sockaddr_un sau; + int fd, flags; + bool success = false; + + if (!init_unix_socket(socket_path, &fd, &sau)) + return false; + flags = fcntl(fd, F_GETFL); + if (flags < 0) { + ERROR_LOG("fcntl (F_GETFL): %m\n"); + goto fail; + } + flags = fcntl(fd, F_SETFL, ((long)flags) | O_NONBLOCK); + if (flags < 0) { + ERROR_LOG("fcntl (F_SETFL): %m\n"); + goto fail; + } + if (bind(fd, (struct sockaddr *)&sau, sizeof(sau)) < 0) { + ERROR_LOG("bind: %m\n"); + goto fail; + } + if (listen(fd , 5) < 0) { + ERROR_LOG("listen: %m\n"); + goto fail; + } + *result = fd; + NOTICE_LOG("listening on fd %d\n", fd); + return true; +fail: + close(fd); + return success; +} +/* + * Send a buffer and the credentials of the current process to a socket. + * + * buf must be zero-terminated. + * return the return value of the underlying call to sendmsg(). 
+ */ +static bool send_cred_buffer(int sock, char *buf) +{ + char control[255] __attribute__((__aligned__(8))); + struct msghdr msg; + struct cmsghdr *cmsg; + static struct iovec iov; + struct ucred c; + + /* Response data */ + iov.iov_base = buf; + iov.iov_len = strlen(buf) + 1; + c.pid = getpid(); + c.uid = getuid(); + c.gid = getgid(); + /* compose the message */ + memset(&msg, 0, sizeof(msg)); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = control; + msg.msg_controllen = sizeof(control); + /* attach the ucred struct */ + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_CREDENTIALS; + cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred)); + *(struct ucred *)CMSG_DATA(cmsg) = c; + msg.msg_controllen = cmsg->cmsg_len; + if (sendmsg(sock, &msg, 0) < 0) { + ERROR_LOG("sendmsg: %m\n"); + return false; + } + return true; +} + +static void dispose_fds(int *fds, unsigned num) +{ + int i; + + for (i = 0; i < num; i++) + close(fds[i]); +} + +/* Receive a buffer and the Unix credentials of the sending process. 
*/ +bool recv_cred_buffer(int socketfd, char *buf, size_t size, + int *clientfd, uid_t *uid) +{ + char control[255] __attribute__((__aligned__(8))); + struct msghdr msg; + struct cmsghdr *cmsg; + struct iovec iov; + int yes = 1, cfd, ret; + struct ucred cred; + struct sockaddr_un sau; + socklen_t sizeof_sau = sizeof(sau); + + ret = accept(socketfd, (struct sockaddr *)&sau, &sizeof_sau); + if (ret < 0) { + ERROR_LOG("accept: %m\n"); + return false; + } + cfd = ret; + setsockopt(cfd, SOL_SOCKET, SO_PASSCRED, &yes, sizeof(int)); + memset(&msg, 0, sizeof(msg)); + iov.iov_base = buf; + iov.iov_len = size; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = control; + msg.msg_controllen = sizeof(control); + if (recvmsg(cfd, &msg, 0) < 0) { + ERROR_LOG("recvmsg: %m\n"); + goto fail; + } + cmsg = CMSG_FIRSTHDR(&msg); + while (cmsg) { + if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type + == SCM_CREDENTIALS) { + memcpy(&cred, CMSG_DATA(cmsg), sizeof(struct ucred)); + *uid = cred.uid; + *clientfd = cfd; + return true; + } else + if (cmsg->cmsg_level == SOL_SOCKET + && cmsg->cmsg_type == SCM_RIGHTS) { + dispose_fds((int *)CMSG_DATA(cmsg), + (cmsg->cmsg_len - CMSG_LEN(0)) + / sizeof(int)); + } + cmsg = CMSG_NXTHDR(&msg, cmsg); + } +fail: + close(*clientfd); + *clientfd = -1; + return false; +} + +bool pass_fd(int passfd, int socketfd) +{ + struct msghdr msg = {.msg_iov = NULL}; + struct cmsghdr *cmsg; + char control[255] __attribute__((__aligned__(8))); + struct iovec iov; + char buf[] = "\0OK"; + + iov.iov_base = buf; + iov.iov_len = sizeof(buf); + + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + msg.msg_control = control; + msg.msg_controllen = sizeof(control); + + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + *(int *)CMSG_DATA(cmsg) = passfd; + + /* Sum of the length of all control messages in the buffer */ + msg.msg_controllen = cmsg->cmsg_len; + DEBUG_LOG("passing %s 
and fd %d\n", buf, passfd); + if (sendmsg(socketfd, &msg, 0) < 0) { + ERROR_LOG("sendmsg: %m\n"); + return false; + } + return true; +} + +static bool recv_fd(int socketfd, int *recvfd) +{ + char control[255] __attribute__((__aligned__(8))); + struct msghdr msg = {.msg_iov = NULL}; + struct cmsghdr *cmsg; + struct iovec iov; + char buf[100]; + ssize_t sz = sizeof(buf), ssz; + + *recvfd = -1; + iov.iov_base = buf; + iov.iov_len = sz - 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = control; + msg.msg_controllen = sizeof(control); + memset(buf, 0, sz); + ssz = recvmsg(socketfd, &msg, 0); + if (ssz < 0) { + ERROR_LOG("recvmsg: %m\n"); + return false; + } + buf[ssz] = '\0'; + INFO_LOG("server response: %u (%s)\n", (unsigned)buf[0], buf + 1); + for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) { + if (cmsg->cmsg_level != SOL_SOCKET + || cmsg->cmsg_type != SCM_RIGHTS) + continue; + if ((cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int) != 1) + continue; + *recvfd = *(int *)CMSG_DATA(cmsg); + return true; + } + return false; +} + +int request_fd(const char *socket_path, char *msg, int *result) +{ + struct sockaddr_un sau; + int socketfd, receivefd; + + if (!init_unix_socket(socket_path, &socketfd, &sau)) + die("could not init socket"); + if (connect(socketfd, (struct sockaddr *)&sau, sizeof(sau)) < 0) + die_errno("connect"); + if (!send_cred_buffer(socketfd, msg)) + die("could not send cred buffer"); + if (!recv_fd(socketfd, &receivefd)) + die("did not receive tty fd"); + NOTICE_LOG("received fd %d\n", receivefd); + *result = receivefd; + return socketfd; +} + +bool request_int(const char *socket_path, char *msg, int *result) +{ + struct sockaddr_un sau; + int socketfd; + bool success = false; + char buf[100]; + ssize_t ssz; + + *result = -1; + if (!init_unix_socket(socket_path, &socketfd, &sau)) + return false; + if (connect(socketfd, (struct sockaddr *)&sau, sizeof(sau)) < 0) { + ERROR_LOG("connect: %m\n"); + goto close; + } + if 
(!send_cred_buffer(socketfd, msg)) { + ERROR_LOG("could not send cred msg \"%s\"\n", msg); + goto close; + } + ssz = read(socketfd, buf, sizeof(buf) - 1); + if (ssz < 0) { + ERROR_LOG("did not receive integer: %m\n"); + goto close; + } + if (buf[0] != 0) { + ERROR_LOG("did not receive integer: %s\n", buf + 1); + goto close; + } + if (ssz != sizeof(int) + 1) { + ERROR_LOG("protocol mismatch, server msg: %s\n", buf + 1); + goto close; + } + memcpy(result, buf + 1, sizeof(int)); + DEBUG_LOG("received integer: %d\n", *result); + success = true; +close: + close(socketfd); + return success; +} + +int signal_pipe[2]; + +static void signal_handler(int signum) +{ + uint8_t u = signum; + int save_errno = errno; + assert(signum > 0 && signum < 256); + if (write(signal_pipe[1], &u, 1) < 0) + ERROR_LOG("write to signal pipe: %m\n"); + errno = save_errno; +} + +void init_signal_handling(void) +{ + struct sigaction act; + + if (pipe(signal_pipe) < 0) + die_errno("signal pipe"); + act.sa_handler = signal_handler; + sigemptyset(&act.sa_mask); + act.sa_flags = SA_RESTART; + if (sigaction(SIGINT, &act, NULL) < 0) + die_errno("sigaction"); + if (sigaction(SIGTERM, &act, NULL) < 0) + die_errno("sigaction"); + if (sigaction(SIGCHLD, &act, NULL) < 0) + die_errno("sigaction"); +} + +int next_signal(void) +{ + uint8_t u = 0; +again: + if (read(signal_pipe[0], &u, 1) < 0) { + if (errno != EINTR) + die_errno("read"); + goto again; + } + DEBUG_LOG("process %d received signal %u\n", getpid(), u); + return u; +} diff --git a/version-gen.sh b/version-gen.sh new file mode 100755 index 0000000..5e554ee --- /dev/null +++ b/version-gen.sh @@ -0,0 +1,27 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0-only + +package="$1" +version_file="$2" + +ver='unnamed_version' +# First try git, then gitweb, then default. 
if [ -e '.git' ] || [ -e '../.git' ]; then
	git_ver=$(git describe --abbrev=4 HEAD 2>/dev/null)
	[ -z "$git_ver" ] && git_ver="$ver"
	# update stat information in index to match working tree
	git update-index -q --refresh > /dev/null
	# if there are differences (exit code 1), the working tree is dirty
	git diff-index --quiet HEAD || git_ver=$git_ver-dirty
	ver=$git_ver
else
	# No git metadata. If the name of the current directory has the form
	# <package>-<version> (presumably an unpacked snapshot), the version
	# is what follows the package name. Otherwise keep the default.
	dir=${PWD##*/}
	case "$dir" in
	"$package"-*) ver=${dir#"$package"-} ;;
	esac
fi
# strip the leading "v" of tag names like v0.9.0
ver=${ver#v}

echo "$ver"
[ -z "${version_file}" ] && exit 0
# update version file if necessary
# The file is left alone if its contents are up to date already, so that its
# modification time does not change.
content="const char *${package}_version(void) {return \"$ver\";};"
[ -r "$version_file" ] && echo "$content" | cmp -s - "$version_file" && exit 0
echo "$content" > "$version_file"
-- 2.39.5