--- /dev/null
+micoforia
+micoforia.8
+build
+*.swp
+Makefile.local
--- /dev/null
+# SPDX-License-Identifier: GPL-2.0-only
+# Disable all built-in suffix and pattern rules and built-in variables;
+# every rule used by this build is spelled out below.
+.SUFFIXES:
+MAKEFLAGS += -Rr
+# With -R the built-in default for CC is still reported with origin
+# "default"; replace it by plain "cc" unless the user supplied a compiler.
+ifeq ("$(origin CC)", "default")
+  CC := cc
+endif
+# "make V=1" on the command line shows the full commands. Otherwise each
+# recipe only prints a short "TAG file" line via $(call SAY, ...).
+ifeq ("$(origin V)", "command line")
+  SAY =
+else
+  SAY = @echo '$(strip $(1))'
+endif
+
+# Each recipe runs in a single shell invocation (so "cd" carries over to
+# the following recipe lines) and aborts on the first failing command.
+.ONESHELL:
+.SHELLFLAGS := -ec
+# Several recipes write their target through a shell redirection. Remove
+# such a half-written target when its recipe fails so that it is not
+# mistaken for an up-to-date file on the next run.
+.DELETE_ON_ERROR:
+PREFIX ?= /usr/local
+INSTALL ?= install
+MKDIR_P := mkdir -p
+RM := rm -f
+CHMOD := chmod
+# All generated files except the final executable and the man page live here.
+B := build
+all := micoforia micoforia.8
+all: $(all)
+
+PACKAGE := micoforia
+SLOGAN := Minimal Containers for Instant Availability
+AUTHOR := Andre Noll
+EMAIL := maan@tuebingen.mpg.de
+COPYRIGHT_YEAR := 2019
+URL := http://people.tuebingen.mpg.de/maan/$(PACKAGE)/
+CLONE_URL := git://git.tuebingen.mpg.de/$(PACKAGE)
+GITWEB_URL := http://git.tuebingen.mpg.de/$(PACKAGE).git
+HOME_URL := http://people.tuebingen.mpg.de/maan/
+LICENSE := GNU GPL version 3
+LICENSE_URL := https://www.gnu.org/licenses/gpl-3.0-standalone.html
+LOGLEVELS := LL_DEBUG,LL_INFO,LL_NOTICE,LL_WARNING,LL_ERROR,LL_CRIT,LL_EMERG
+
+units := micoforia util version micoforia.lsg
+deps := $(addprefix $(B)/, $(addsuffix .d, $(units)))
+objs := $(addprefix $(B)/, $(addsuffix .o, $(units)))
+
+ifeq ($(findstring clean, $(MAKECMDGOALS)),)
+ifeq ($(findstring README, $(MAKECMDGOALS)),)
+-include $(deps)
+-include $(B)/config.mak
+endif
+endif
+
+XCPPFLAGS :=
+XCPPFLAGS += -I$(B)
+XCPPFLAGS += -Wunused-macros
+XCPPFLAGS += -DCOPYRIGHT_YEAR='"$(COPYRIGHT_YEAR)"'
+XCPPFLAGS += -DPACKAGE='"$(PACKAGE)"'
+XCPPFLAGS += -DAUTHOR='"$(AUTHOR)"'
+XCPPFLAGS += -DEMAIL='"$(EMAIL)"'
+XCPPFLAGS += -DURL='"$(URL)"'
+XCPPFLAGS += -DCLONE_URL='"$(CLONE_URL)"'
+XCPPFLAGS += -DGITWEB_URL='"$(GITWEB_URL)"'
+XCPPFLAGS += -DHOME_URL='"$(HOME_URL)"'
+XCPPFLAGS += -DGET_VERSION='$(PACKAGE)_version'
+XCPPFLAGS += -DLOGLEVELS='$(LOGLEVELS)'
+XCPPFLAGS += -DBUILD_DATE='"$(build_date)"'
+XCPPFLAGS += -DCC_VERSION='"$(cc_version)"'
+XCPPFLAGS += -DUNAME_RS='"$(uname_rs)"'
+XCPPFLAGS += -DLICENSE='"$(LICENSE)"'
+XCPPFLAGS += -DLICENSE_URL='"$(LICENSE_URL)"'
+
+XCFLAGS :=
+XCFLAGS += -fno-strict-aliasing
+XCFLAGS += -g
+XCFLAGS += -Os
+XCFLAGS += -Wundef -W -Wuninitialized
+XCFLAGS += -Wchar-subscripts
+XCFLAGS += -Werror-implicit-function-declaration
+XCFLAGS += -Wmissing-noreturn
+XCFLAGS += -Wbad-function-cast
+XCFLAGS += -Wredundant-decls
+XCFLAGS += -Wno-sign-compare -Wno-unknown-pragmas
+XCFLAGS += -Wdeclaration-after-statement
+XCFLAGS += -Wformat -Wformat-security -Wmissing-format-attribute
+XCFLAGS += -fsanitize=undefined
+XCFLAGS += -fdata-sections -ffunction-sections
+XCFLAGS += -Wstrict-prototypes
+XCFLAGS += -Wshadow
+XCFLAGS += -Wunused -Wall
+XCFLAGS += -Wformat-signedness
+XCFLAGS += -Wdiscarded-qualifiers
+
+XLDFLAGS := -lubsan -Wl,--gc-sections
+version_file := $(B)/version.c
+GIT_VERSION := $(shell $(MKDIR_P) $(B) && ./version-gen.sh $(PACKAGE) $(version_file))
+
+CC_CMD = $(CC) -c -o $@ $(XCPPFLAGS) $(CPPFLAGS) \
+ $(XCFLAGS) $(CFLAGS) -MMD -MF $(B)/$(*F).d -MT $@
+
+$(objs): m7a.h $(B)/micoforia.lsg.h
+
+$(B):
+ @$(MKDIR_P) $@
+
+# The recipes below change into $(B) on their own line and rely on
+# .ONESHELL so that the subsequent lines run inside the build directory.
+#
+# Generate the config.h template from configure.ac.
+$(B)/config.h.in: configure.ac | $(B)
+ $(call SAY, AH $<)
+ cd $(B)
+ autoheader -f ../configure.ac
+# Generate the configure script (named configure.sh) and make it executable.
+$(B)/configure.sh: configure.ac | $(B)
+ $(call SAY, AC $<)
+ cd $(B)
+ autoconf ../configure.ac > configure.sh
+ $(CHMOD) 755 configure.sh
+# Re-check an existing config.status, or create it by running the
+# configure script with --no-create if there is none yet.
+$(B)/config.status: $(B)/configure.sh | $(B)
+ $(call SAY, SH $<)
+ cd $(B)
+ if test -x config.status; then \
+ ./config.status --quiet --recheck; \
+ else \
+ ./configure.sh --no-create; \
+ fi
+# Instantiate config.mak and config.h. config.mak.in is hard-linked into
+# the build directory before config.status runs. config.h is touched
+# afterwards so that its timestamp is refreshed.
+# NOTE(review): this is an ordinary two-target rule, so the recipe may be
+# executed once per target.
+$(B)/config.mak $(B)/config.h: $(B)/config.status config.mak.in $(B)/config.h.in
+ $(call SAY, CS $@)
+ cd $(B)
+ ln -f ../config.mak.in
+ ./config.status -q
+ test -f config.h && touch config.h
+
+define DESCRIPTION1 :=
+ $(PACKAGE) is a lightweight container implementation for Linux.
+ It consists of a single program which reads a single configuration
+ file that describes all containers. $(PACKAGE) was written with
+ performance and simplicity in mind, and is designed for trusted
+ in-house web application hosting.
+endef
+
+define DESCRIPTION2 :=
+ Like other container frameworks, $(PACKAGE) employs Linux namespaces
+ for isolation and cgroup controllers to limit the resource utilization
+ of the containers. Networking is implemented through bridging and
+ virtual ethernet device pairs. There is built-in support for the cpu,
+ memory, I/O and device controllers. Further customization is possible
+ via startup hooks. For example, the startup hook could activate
+ additional cgroup controllers, make the container enter a different
+ namespace, and mount additional file systems.
+endef
+
+define DESCRIPTION3 :=
+ The micoforia program supports a couple of subcommands. Besides
+ the start subcommand which starts one or more containers, there are
+ subcommands for listing, killing or rebooting containers.
+endef
+
+# dependency on config.mak is because the command below depends on $(M4)
+$(B)/index.html $(B)/micoforia.suite: $(B)/%: %.m4 Makefile $(B)/config.mak
+ $(call SAY, M4 $<)
+ $(M4) -D "AUTHOR=$(AUTHOR)" -D "COPYRIGHT_YEAR=$(COPYRIGHT_YEAR)" \
+ -D "PACKAGE=$(PACKAGE)" \
+ -D "SLOGAN=$(SLOGAN)" \
+ -D "EMAIL=$(EMAIL)" \
+ -D "URL=$(URL)" \
+ -D "CLONE_URL=$(CLONE_URL)" \
+ -D "GITWEB_URL=$(GITWEB_URL)" \
+ -D "HOME_URL=$(HOME_URL)" \
+ -D "LICENSE=$(LICENSE)" \
+ -D "LICENSE_URL=$(LICENSE_URL)" \
+ -D "DESCRIPTION1=$(DESCRIPTION1)" \
+ -D "DESCRIPTION2=$(DESCRIPTION2)" \
+ -D "DESCRIPTION3=$(DESCRIPTION3)" $< > $@
+$(B)/%.lsg.c: $(B)/%.suite
+ $(call SAY, LSGC $<)
+ $(LOPSUBGEN) --gen-c --output-dir $(B) < $<
+$(B)/%.lsg.h: $(B)/%.suite
+ $(call SAY, LSGH $<)
+ $(LOPSUBGEN) --gen-header --output-dir $(B) < $<
+%.8: $(B)/%.suite $(B)/version.c
+ $(call SAY, LSGM $<)
+ $(LOPSUBGEN) --gen-man=$(*F).8 --version-string $(GIT_VERSION) < $<
+
+$(B)/%.o: %.c | $(B)
+ $(call SAY, CC $<)
+ $(CC_CMD) $<
+$(B)/%.o: $(B)/%.c
+ $(call SAY, CC $<)
+ $(CC_CMD) $<
+micoforia: $(objs)
+ $(call SAY, LD $@)
+ $(CC) -o $@ $^ $(XLDFLAGS) $(LDFLAGS) -llopsub -lmnl -lutil -lcap
+
+# Installation. $(datarootdir) and $(sbindir) are not set in this file;
+# the config.mak.in template defines them and $(B)/config.mak is included
+# near the top of this Makefile.
+mandir := $(datarootdir)/man/man8
+INSTALL ?= install
+INSTALL_PROGRAM ?= $(INSTALL) -m 755
+INSTALL_DATA ?= $(INSTALL) -m 644
+# Pass -s to the install program if any goal on the command line contains
+# the string "strip" (i.e. for "make install-strip").
+ifneq ($(findstring strip, $(MAKECMDGOALS)),)
+ strip_option := -s
+endif
+# All destination paths are prefixed with $(DESTDIR).
+install install-strip: all
+ $(MKDIR_P) $(DESTDIR)$(sbindir) $(DESTDIR)$(mandir)
+ $(INSTALL_PROGRAM) $(strip_option) micoforia $(DESTDIR)$(sbindir)
+ $(INSTALL_DATA) micoforia.8 $(DESTDIR)$(mandir)
+
+clean:
+ $(RM) $(B)/*.o $(all)
+distclean: clean
+ $(RM) -r $(B)
+maintainer-clean:
+ git clean -dfqx > /dev/null 2>&1
+
+define README :=
+$(PACKAGE) - $(SLOGAN)
+
+$(DESCRIPTION1)
+
+$(DESCRIPTION2)
+
+$(DESCRIPTION3)
+
+Resources
+~~~~~~~~~
+| web page: $(URL)
+| git clone URL: $(CLONE_URL)
+| gitweb: $(GITWEB_URL)
+| author's home page: $(HOME_URL)
+| Send feedback to: $(AUTHOR) <$(EMAIL)>
+
+License
+~~~~~~~
+Open source, licensed under the $(LICENSE).
+
+Documentation
+~~~~~~~~~~~~~
+See micoforia.suite.m4. Or build the man page with \"make\" and run
+\"man -l micoforia.8\".
+
+Dependencies
+~~~~~~~~~~~~
+This package requires m4, autoconf, gnu make, gcc or clang, and
+lopsub. The configure script checks if all dependencies are installed
+and prints a meaningful error message if one of them is missing.
+
+Building
+~~~~~~~~
+Run \"make\" to build the package with the default settings. Run
+\"./configure -h\" to list configuration options.
+
+Installation
+~~~~~~~~~~~~
+Run \"sudo make install\" to install to /usr/local. To install to
+/somewhere/else, run \"./configure --prefix /somewhere/else && make\"
+first.
+endef
+
+README:
+ @printf '%s\n' "$(README)"
+
+# Keep the generated lopsub files instead of deleting them as intermediates.
+.PRECIOUS: $(B)/%.lsg.c $(B)/%.lsg.h $(B)/%.8
+# Targets which are commands rather than files. install-strip shares its
+# recipe with install and must be listed here as well.
+.PHONY: all clean install install-strip distclean maintainer-clean README
+# Optional local overrides; silently skipped if the file does not exist.
+-include Makefile.local
--- /dev/null
+Run "make README".
--- /dev/null
+# SPDX-License-Identifier: GPL-2.0-only
+
+prefix := @prefix@
+exec_prefix := @exec_prefix@
+
+# These two use prefix and exec_prefix
+sbindir := @sbindir@
+datarootdir := @datarootdir@
+
+LOPSUBGEN := @LOPSUBGEN@
+M4 := @M4@
--- /dev/null
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0-only
+
+set -e
+
+mkdir -p build
+cd build
+autoconf ../configure.ac > configure.sh
+chmod 755 configure.sh
+ln -f ../config.mak.in
+autoheader ../configure.ac
+sh configure.sh "$@"
--- /dev/null
+# SPDX-License-Identifier: GPL-2.0-only
+
+AC_PREREQ([2.61])
+# only for configure -h, see Makefile
+AC_INIT([software], [packages])
+AC_CONFIG_HEADERS([config.h])
+AC_CONFIG_FILES([config.mak])
+AC_USE_SYSTEM_EXTENSIONS
+AC_PROG_CC
+AC_PROG_CPP
+
+dnl REQUIRE_EXECUTABLE(program, error-message)
+dnl Look up the program in PATH and store the result in the variable named
+dnl after the upper-cased program name. Abort configure with the given
+dnl message if that variable ends up empty.
+AC_DEFUN([REQUIRE_EXECUTABLE], [
+ AC_PATH_PROG(m4_toupper([$1]), [$1])
+ test -z "$m4_toupper([$1])" && AC_MSG_ERROR([$2])
+])
+REQUIRE_EXECUTABLE([m4], [m4 is required to build this package])
+
+AC_DEFUN([LOPSUB_NOT_FOUND], [
+The lopsub library is required to build this software, but the checks
+indicate it is not installed on your system. Run the following
+command to download a copy.
+ git clone git://git.tuebingen.mpg.de/lopsub.git
+Install the library, then run this configure script again.
+
+If you installed lopsub at a non-standard location, make sure to set
+PATH, CPPFLAGS and LDFLAGS accordingly. For example:
+
+ pfx=/prefix/where/lopsub/is/installed
+ export PATH=\$pfx/bin:\$PATH
+ export CPPFLAGS=-I\$pfx/include
+ export LDFLAGS=-L\$pfx/lib
+])
+REQUIRE_EXECUTABLE([lopsubgen], [LOPSUB_NOT_FOUND()])
+AC_CHECK_HEADER(lopsub.h, [], [AC_MSG_ERROR([LOPSUB_NOT_FOUND()])])
+AC_CHECK_LIB([lopsub], [lls_merge], [], [AC_MSG_ERROR([LOPSUB_NOT_FOUND()])])
+
+AC_DEFUN([LIBCAP_NOT_FOUND], [the libcap library is required to build dnl
+this software. Package: libcap-dev])
+AC_CHECK_HEADER([sys/capability.h], [], [AC_MSG_ERROR([LIBCAP_NOT_FOUND()])])
+AC_CHECK_LIB([cap], [cap_from_text], [], [AC_MSG_ERROR([LIBCAP_NOT_FOUND()])])
+
+AC_DEFUN([LIBMNL_NOT_FOUND], [the libmnl library is required to build dnl
+this software. Package: libmnl-dev])
+AC_CHECK_HEADER([libmnl/libmnl.h], [], [AC_MSG_ERROR([LIBMNL_NOT_FOUND()])])
+AC_CHECK_LIB([mnl], [mnl_socket_open], [], [AC_MSG_ERROR([LIBMNL_NOT_FOUND()])])
+
+AC_OUTPUT
--- /dev/null
+dnl SPDX-License-Identifier: GPL-2.0-only
+<!DOCTYPE HTML PUBLIC '-//W3C//DTD HTML 4.01 Transitional//EN'
+'http://www.w3.org/TR/html4/loose.dtd'>
+
+<html>
+ <head>
+ <meta
+ http-equiv='Content-Type'
+ content='text/html; charset=utf-8'
+ >
+ <title>PACKAGE()</title>
+ <style type='text/css'>
+ a {
+ color: #003355;
+ }
+ p {
+ font-size: 120%;
+ }
+ </style>
+ </head>
+ <body>
+ <table width="100%">
+ <tr>
+ <td>
+ <h1 align="left">
+ PACKAGE() - SLOGAN()
+ </h1>
+ </td>
+ <td align="right" title="The micoforic dude">
+
+ <img src="micoforia.svg">
+ </td>
+ </tr>
+ </table>
+ <p> DESCRIPTION1() </p>
+ <p> DESCRIPTION2() </p>
+ <p> DESCRIPTION3() </p>
+
+ <h2> Resources </h2>
+ <ul>
+ <li> Clone `URL': CLONE_URL() </li>
+ <li> <a href="GITWEB_URL()">Gitweb</a> </li>
+ <li> The author's <a href="HOME_URL()">home page</a> </li>
+ <li> Send feedback to <a href="mailto:EMAIL()">AUTHOR()</a> </li>
+ </ul>
+
+ <h2> License </h2>
+ Open source, licensed under the <a
+ href="LICENSE_URL()">LICENSE()</a>
+
+ <h2> Documentation </h2>
+ See the manual page for details.
+
+ <h2> Programming Language </h2>
+ Plain C.
+
+ <h2> Dependencies </h2>
+ A working C compiler and a couple of other dependencies,
+ most of which are standard (autoconf, make, m4,
+ libmnl, libcap). The notable exception is the <a
+ href="http://people.tuebingen.mpg.de/maan/lopsub/">lopsub</a> library.
+ </body>
+</html>
--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdbool.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <inttypes.h>
+#include <stdarg.h>
+#include <unistd.h>
+#include <string.h>
+#include <time.h>
+#include <errno.h>
+#include <sys/uio.h>
+#include <pwd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/wait.h>
+#include <limits.h>
+
+#include "config.h"
+
+/* Number of elements of a true array (not a pointer). */
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
+
+/*
+ * Convenience wrappers around the lopsub accessors.
+ *
+ * CMD_PTR() maps a command name to its struct lls_command pointer in
+ * micoforia_suite. OPT_RESULT() fetches the parse result of one option. It
+ * consults the variable "lpr" if the command is the MICOFORIA supercommand
+ * and "sublpr" otherwise, so both identifiers must be in scope wherever
+ * these macros are expanded. The remaining macros build on OPT_RESULT() to
+ * query how often an option was given and to fetch its n-th (or first)
+ * uint32 or string value.
+ */
+#define CMD_PTR(_cname) lls_cmd(LSG_MICOFORIA_CMD_ ## _cname, micoforia_suite)
+#define OPT_RESULT(_cname, _oname) (lls_opt_result(\
+ LSG_MICOFORIA_ ## _cname ## _OPT_ ## _oname, \
+ (CMD_PTR(_cname) == CMD_PTR(MICOFORIA))? lpr : sublpr))
+#define OPT_GIVEN(_cname, _oname) (lls_opt_given(OPT_RESULT(_cname, _oname)))
+#define OPT_UINT32_VAL_N(_n, _cname, _oname) (lls_uint32_val(_n, \
+ OPT_RESULT(_cname, _oname)))
+#define OPT_UINT32_VAL(_cname, _oname) (OPT_UINT32_VAL_N(0, _cname, _oname))
+#define OPT_STRING_VAL_N(_n, _cname, _oname) (lls_string_val(_n, \
+ OPT_RESULT(_cname, _oname)))
+#define OPT_STRING_VAL(_cname, _oname) (OPT_STRING_VAL_N(0, _cname, _oname))
+
+/*
+ * Per-subcommand user data: a pointer to the function which implements
+ * the subcommand. EXPORT_CMD_HANDLER(foo) defines the constant object
+ * lsg_micoforia_com_foo_user_data whose handler is com_foo.
+ */
+struct micoforia_user_data {bool (*handler)(void);};
+#define EXPORT_CMD_HANDLER(_cmd) const struct micoforia_user_data \
+ lsg_micoforia_com_ ## _cmd ## _user_data = { \
+ .handler = com_ ## _cmd \
+ };
+
+
+__attribute__ ((warn_unused_result))
+void *xrealloc(void *p, size_t size);
+
+__attribute__ ((warn_unused_result))
+void *xmalloc(size_t size);
+
+__attribute__ ((warn_unused_result))
+void *xzmalloc(size_t size);
+
+void *xstrdup(const char *s);
+char *xstrcat(char *a, const char *b);
+
+__attribute__ ((format (printf, 1, 2))) __attribute__ ((warn_unused_result))
+char *msg(const char *fmt, ...);
+
+/*
+ * LOGLEVELS is not defined in this header; the Makefile passes it on the
+ * compiler command line as the comma separated list LL_DEBUG, ...,
+ * LL_EMERG. NUM_LOGLEVELS therefore equals the number of log levels.
+ */
+enum loglevels {LOGLEVELS, NUM_LOGLEVELS};
+/* Threshold below which m7a_log() discards messages. */
+extern unsigned loglevel_arg_val;
+
+/* printf-style logging; ll is one of the LL_* constants. */
+__attribute__ ((format (printf, 2, 3)))
+void m7a_log(int ll, const char* fmt,...);
+
+/*
+ * One wrapper per log level. Each prepends the name of the calling function
+ * and a colon to the message. The format must be a string literal since it
+ * is concatenated with the prefix at compile time.
+ */
+#define DEBUG_LOG(f,...) m7a_log(LL_DEBUG, "%s: " f, __FUNCTION__, ## __VA_ARGS__)
+#define INFO_LOG(f,...) m7a_log(LL_INFO, "%s: " f, __FUNCTION__, ## __VA_ARGS__)
+#define NOTICE_LOG(f,...) m7a_log(LL_NOTICE, "%s: " f, __FUNCTION__, ## __VA_ARGS__)
+#define WARNING_LOG(f,...) m7a_log(LL_WARNING, "%s: " f, __FUNCTION__, ## __VA_ARGS__)
+#define ERROR_LOG(f,...) m7a_log(LL_ERROR, "%s: " f, __FUNCTION__, ## __VA_ARGS__)
+#define CRIT_LOG(f,...) m7a_log(LL_CRIT, "%s: " f, __FUNCTION__, ## __VA_ARGS__)
+#define EMERG_LOG(f,...) m7a_log(LL_EMERG, "%s: " f, __FUNCTION__, ## __VA_ARGS__)
+
+__attribute__ ((noreturn))
+__attribute__ ((format (printf, 1, 2)))
+void die(const char *fmt, ...);
+
+__attribute__ ((noreturn))
+__attribute__ ((format (printf, 1, 2)))
+void die_errno(const char *fmt, ...);
+
+__attribute__ ((noreturn))
+void die_empty_arg(const char *opt);
+
+void check_range(uint32_t val, uint32_t min, uint32_t max, const char *opt);
+
+bool xexec(char * const argv[], const struct iovec *iov);
+void valid_fd012(void);
+void check_name(const char *arg);
+void parse_compound_arg(const char *arg, const char *opt, char **name, char **val);
+char *parse_cgroup_acl(const char *arg);
+char *make_hwaddr(const char *name, const char *bridge);
+void parse_ifspec(const char *arg, char **bridge, uint8_t *hwaddr);
+uint32_t atou32(const char *str, const char *opt);
+bool remove_subdirs_recursively(const char *path);
+void daemonize(const char *logfile);
+bool acquire_lock(const char *string);
+bool try_lock(const char *string, pid_t *pid);
+bool release_lock(const char *string);
+bool is_locked(const char *string, pid_t *pid);
+bool attach_to_bridge(const char *iface, const char *bridge);
+bool rename_interface(const char *before, const char *after);
+void pretty_print_hwaddr(const uint8_t *hwaddr, char *result);
+bool set_hwaddr(const char *iface, const uint8_t *hwaddr);
+bool link_del(const char *iface);
+bool link_up(const char *iface);
+bool create_veth_device_pair(const char *name, char *peer);
+bool set_netns(const char *iface, pid_t pid);
+int request_fd(const char *socket_path, char *msg, int *result);
+bool request_int(const char *socket_path, char *msg, int *result);
+bool listen_on_unix_socket(const char *socket_path, int *result);
+bool recv_cred_buffer(int socketfd, char *buf, size_t size,
+ int *clientfd, uid_t *uid);
+bool pass_fd(int passfd, int socketfd);
+
+extern int signal_pipe[2];
+void init_signal_handling(void);
+int next_signal(void);
--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#include "m7a.h"
+
+#include <lopsub.h>
+#include <sys/mman.h>
+#include <sched.h>
+#include <sys/ioctl.h>
+#include <sys/mount.h>
+#include <sys/sysmacros.h>
+#include <pty.h>
+#include <utmp.h>
+#include <sys/socket.h>
+#include <sys/capability.h>
+#include <sys/syscall.h>
+
+#include "micoforia.lsg.h"
+
+static struct lls_parse_result *lpr, *sublpr;
+unsigned loglevel_arg_val = 4;
+
+/*
+ * One network interface specification of a container. Both fields are
+ * filled in by parse_ifspec(), or by check_options() for the default
+ * interface, in which case the hardware address is all zeroes.
+ */
+struct ifspec {
+ char *bridge;
+ uint8_t hwaddr[6];
+};
+
+/*
+ * Static configuration of one container, assembled by check_options()
+ * from the per-container options. String members which were not given
+ * stay NULL and the get_*() helpers fall back to the corresponding default
+ * option.
+ */
+struct container {
+ /* copy of the name under which the container was first mentioned */
+ char *name;
+ char *pre_start_hook;
+ char *pre_exec_hook;
+ char *root_dir;
+ char *init;
+ struct ifspec *ifspec;
+ /* this is never zero, even if no ifspec was given */
+ unsigned num_ifspecs;
+ /* NULL-terminated list, see append_dac_entry() */
+ char **dacl;
+ unsigned num_dac_entries;
+ /* NULL-terminated list, see append_io_max_entry() */
+ char **io_max;
+ unsigned num_io_max_entries;
+ /* ~0U: not given, 0: unlimited */
+ unsigned cpu_cores;
+ unsigned memory_limit;
+ /* ~0U: not given */
+ unsigned init_type;
+ /* capabilities named by the capdrop option, in order of processing */
+ cap_value_t *capdrop;
+ unsigned num_capdrops;
+ /* tty minor numbers; zero is rejected by check_options() */
+ uint32_t *tty;
+ unsigned num_ttys;
+};
+
+static struct container **container;
+static unsigned num_containers;
+
+/*
+ * State of a container which only exists at run time.
+ * NOTE(review): the users of this structure are not visible in this part
+ * of the file; the remarks below are limited to what the declarations show.
+ */
+struct container_runtime {
+ int pipe1[2], pipe2[2]; /* for startup communication */
+ uint32_t *tty;
+ unsigned num_ttys;
+ /* presumably one entry per tty -- TODO confirm */
+ int *master, *slave, *client;
+
+ int init_pid; /* in the parent namespace */
+ char *pts, *root, *dev;
+ int socket_fd;
+};
+
+static char **default_dacl, **default_io_max;
+unsigned num_default_dac_entries, num_default_io_max_entries;
+static cap_value_t *default_capdrop;
+unsigned num_default_capdrops;
+uint32_t *default_tty;
+unsigned num_default_ttys;
+static const struct lls_command *subcmd;
+/*
+ * Print a log message to stderr.
+ *
+ * ll: Severity of the message, one of the LL_* constants. Messages whose
+ * severity is lower than loglevel_arg_val are dropped.
+ * fmt: printf-style format string, followed by its arguments.
+ *
+ * If the active subcommand is "start", the message is prefixed with a time
+ * stamp and the process ID.
+ *
+ * does not allocate memory
+ */
+void m7a_log(int ll, const char* fmt,...)
+{
+	va_list argp;
+
+	if (ll < loglevel_arg_val)
+		return;
+	va_start(argp, fmt);
+	if (subcmd == lls_cmd(LSG_MICOFORIA_CMD_START, micoforia_suite)) {
+		char str[100] = "";
+		struct timespec t = {.tv_sec = 0, .tv_nsec = 0};
+		struct tm *tm;
+		/*
+		 * The call must not be placed inside assert() because the
+		 * whole expression vanishes if NDEBUG is defined, which would
+		 * leave the time stamp uninitialized.
+		 */
+		int ret = clock_gettime(CLOCK_REALTIME, &t);
+
+		assert(ret == 0);
+		(void)ret; /* unused if NDEBUG is defined */
+		tm = localtime(&t.tv_sec);
+		if (tm) /* strftime() must not be called with a NULL pointer */
+			strftime(str, sizeof(str), "%b %d %H:%M:%S", tm);
+		fprintf(stderr, "%s:%04lu ", str,
+			(long unsigned)t.tv_nsec / 1000 / 1000);
+		fprintf(stderr, "(%u) ", (unsigned)getpid());
+	}
+	vfprintf(stderr, fmt, argp);
+	va_end(argp);
+}
+
+/*
+ * Report a lopsub error and terminate.
+ *
+ * lopsub_ret is the negative return value of the failed lopsub call. If
+ * the library supplied an error context, it is printed in front of the
+ * error description and released afterwards.
+ */
+static void die_lopsub(int lopsub_ret, char **errctx)
+{
+	const char *reason = lls_strerror(-lopsub_ret);
+	char *ctx = *errctx;
+
+	if (!ctx)
+		ERROR_LOG("%s\n", reason);
+	else
+		ERROR_LOG("%s: %s\n", ctx, reason);
+	*errctx = NULL;
+	free(ctx);
+	die("lopsub error");
+}
+
+/*
+ * Iterate over the NULL-terminated global container[] array, assigning
+ * each element to _c in turn. The array pointer itself is dereferenced
+ * unconditionally, so it must have been allocated before the macro is
+ * used. The loop ends when the terminating NULL pointer is reached.
+ */
+#define FOR_EACH_CONTAINER(_c) for ( \
+ struct container **_cp = container; \
+ ((_c) = *(_cp)); \
+ (_cp)++, (_c) = *(_cp) \
+)
+
+/* Look up a container by name. Returns NULL if there is no such container. */
+static struct container *get_container(const char *name)
+{
+	struct container **cp;
+
+	for (cp = container; *cp; cp++)
+		if (strcmp((*cp)->name, name) == 0)
+			return *cp;
+	return NULL;
+}
+
+/*
+ * Return the container of the given name. If it does not exist yet, a new
+ * zero-initialized container is appended to the global array, which stays
+ * NULL-terminated. The numeric limits of a new container are set to ~0U,
+ * meaning "not given".
+ */
+static struct container *get_or_append_container(const char *name)
+{
+	struct container *c = get_container(name);
+	unsigned idx;
+
+	if (c)
+		return c;
+	idx = num_containers++;
+	/* one extra slot for the terminating NULL pointer */
+	container = xrealloc(container,
+		(num_containers + 1) * sizeof(struct container *));
+	c = xzmalloc(sizeof(struct container));
+	c->name = xstrdup(name);
+	c->cpu_cores = ~0U;
+	c->memory_limit = ~0U;
+	c->init_type = ~0U;
+	container[idx] = c;
+	container[idx + 1] = NULL;
+	return c;
+}
+
+/*
+ * Determine the tty minor numbers of a container.
+ *
+ * The container's own list takes precedence over the default list. If both
+ * are empty, the result is the single minor number 1. The returned pointer
+ * refers to existing storage and must not be freed by the caller.
+ *
+ * Returns the number of elements stored at *result.
+ */
+static unsigned get_container_ttys(const struct container *c, uint32_t **result)
+{
+	static uint32_t fallback_tty = 1;
+
+	if (c->num_ttys > 0) {
+		*result = c->tty;
+		return c->num_ttys;
+	}
+	if (num_default_ttys == 0) {
+		*result = &fallback_tty;
+		return 1;
+	}
+	*result = default_tty;
+	return num_default_ttys;
+}
+
+/*
+ * Indices into clo_given_counter[]. For each of the listed multi-valued
+ * options, parse_options() stores how often the option was given on the
+ * command line, before the config file is merged in. check_options() uses
+ * these counts to process the values at index >= count before the values
+ * at index < count.
+ * NOTE(review): presumably the latter are the command line values, so that
+ * they end up after the config file values -- confirm against lls_merge().
+ */
+enum clo_given_counter {
+ CLOGC_DEFAULT_CGROUP_DAC,
+ CLOGC_CGROUP_DAC,
+ CLOGC_DEFAULT_IO_MAX,
+ CLOGC_IO_MAX,
+ NUM_CLOGCS
+};
+
+static unsigned clo_given_counter[NUM_CLOGCS];
+
+/*
+ * Convert arg with parse_cgroup_acl() and append the result to the
+ * NULL-terminated string list at *listp, which holds *count entries.
+ * Both the list pointer and the counter are updated.
+ */
+static void append_dac_entry(const char *arg, char ***listp, unsigned *count)
+{
+	char *entry = parse_cgroup_acl(arg);
+	unsigned num = ++*count;
+	char **list = xrealloc(*listp, (num + 1) * sizeof(char *));
+
+	list[num - 1] = entry;
+	list[num] = NULL;
+	*listp = list;
+}
+
+/*
+ * Append a copy of arg to the NULL-terminated string list at *listp,
+ * which holds *count entries. Both the list pointer and the counter are
+ * updated.
+ */
+static void append_io_max_entry(const char *arg, char ***listp, unsigned *count)
+{
+	unsigned num = ++*count;
+	char **list = xrealloc(*listp, (num + 1) * sizeof(char *));
+
+	list[num - 1] = xstrdup(arg);
+	list[num] = NULL;
+	*listp = list;
+}
+
+/*
+ * Validate the option values and build the global container[] array.
+ *
+ * Per-container options have the form of a compound argument which
+ * parse_compound_arg() splits into a container name and a value. The first
+ * mention of a name creates the container. Invalid values terminate the
+ * program.
+ *
+ * Single-valued per-container options are processed from the last given
+ * value to the first, so the value with the lowest index is assigned last
+ * and wins. For the cgroup-dac and io-max lists, the values beyond the
+ * command line count (see clo_given_counter[]) are appended first.
+ *
+ * Finally, each container without a network specification receives one
+ * default interface on the default bridge with an all-zero hardware
+ * address.
+ */
+static void check_options(void)
+{
+	unsigned n, m;
+	const char *arg;
+	char *name, *val;
+	struct container *c;
+	uint32_t u32;
+
+	container = xzmalloc(sizeof(struct container *));
+	/* loop backwards to let command line opts override config file opts */
+	for (n = OPT_GIVEN(MICOFORIA, CONTAINER) - 1; n != ~0U; n--) {
+		arg = OPT_STRING_VAL_N(n, MICOFORIA, CONTAINER);
+		check_name(arg);
+		get_or_append_container(arg);
+	}
+	for (n = OPT_GIVEN(MICOFORIA, PRE_START_HOOK) - 1; n != ~0U; n--) {
+		arg = OPT_STRING_VAL_N(n, MICOFORIA, PRE_START_HOOK);
+		parse_compound_arg(arg, "pre-start-hook", &name, &val);
+		c = get_or_append_container(name);
+		free(name);
+		free(c->pre_start_hook);
+		c->pre_start_hook = val;
+	}
+	for (n = OPT_GIVEN(MICOFORIA, PRE_EXEC_HOOK) - 1; n != ~0U; n--) {
+		arg = OPT_STRING_VAL_N(n, MICOFORIA, PRE_EXEC_HOOK);
+		parse_compound_arg(arg, "pre-exec-hook", &name, &val);
+		c = get_or_append_container(name);
+		free(name);
+		free(c->pre_exec_hook);
+		c->pre_exec_hook = val;
+	}
+	for (n = OPT_GIVEN(MICOFORIA, CAPDROP) - 1; n != ~0U; n--) {
+		cap_value_t cap_val;
+		arg = OPT_STRING_VAL_N(n, MICOFORIA, CAPDROP);
+		parse_compound_arg(arg, "capabilities", &name, &val);
+		c = get_or_append_container(name);
+		if (cap_from_name(val, &cap_val) < 0)
+			die_errno("%s: invalid capability: %s", name, val);
+		c->capdrop = xrealloc(c->capdrop,
+			++c->num_capdrops * sizeof(cap_value_t));
+		c->capdrop[c->num_capdrops - 1] = cap_val;
+		free(name);
+		free(val);
+	}
+	for (n = 0; n < OPT_GIVEN(MICOFORIA, DEFAULT_CAPDROP); n++) {
+		cap_value_t cap_val;
+		arg = OPT_STRING_VAL_N(n, MICOFORIA, DEFAULT_CAPDROP);
+		/*
+		 * This option is not a compound argument, so the offending
+		 * string is arg. At this point val is either uninitialized
+		 * or refers to memory which has already been freed.
+		 */
+		if (cap_from_name(arg, &cap_val) < 0)
+			die_errno("invalid default capability: %s", arg);
+		default_capdrop = xrealloc(default_capdrop,
+			++num_default_capdrops * sizeof(cap_value_t));
+		default_capdrop[num_default_capdrops - 1] = cap_val;
+	}
+	for (n = OPT_GIVEN(MICOFORIA, TTY) - 1; n != ~0U; n--) {
+		uint32_t minor;
+		arg = OPT_STRING_VAL_N(n, MICOFORIA, TTY);
+		parse_compound_arg(arg, "tty", &name, &val);
+		c = get_or_append_container(name);
+		minor = atou32(val, "tty");
+		if (minor == 0)
+			die("can not capture tty0");
+		c->tty = xrealloc(c->tty, ++c->num_ttys * sizeof(uint32_t));
+		c->tty[c->num_ttys - 1] = minor;
+		free(name);
+		free(val);
+	}
+	for (n = 0; n < OPT_GIVEN(MICOFORIA, DEFAULT_TTY); n++) {
+		uint32_t minor = OPT_UINT32_VAL_N(n, MICOFORIA, DEFAULT_TTY);
+		if (minor == 0)
+			die("can not capture tty0");
+		default_tty = xrealloc(default_tty,
+			++num_default_ttys * sizeof(uint32_t));
+		default_tty[num_default_ttys - 1] = minor;
+	}
+
+	for (n = OPT_GIVEN(MICOFORIA, ROOT_DIRECTORY) - 1; n != ~0U ; n--) {
+		arg = OPT_STRING_VAL_N(n, MICOFORIA, ROOT_DIRECTORY);
+		parse_compound_arg(arg, "root-directory", &name, &val);
+		c = get_or_append_container(name);
+		free(name);
+		free(c->root_dir);
+		c->root_dir = val;
+	}
+	u32 = OPT_UINT32_VAL(MICOFORIA, DEFAULT_CPU_CORES);
+	check_range(u32, 0, 65536, "default-cpu-cores");
+	for (n = OPT_GIVEN(MICOFORIA, CPU_CORES) - 1; n != ~0U ; n--) {
+		arg = OPT_STRING_VAL_N(n, MICOFORIA, CPU_CORES);
+		parse_compound_arg(arg, "cpu-cores", &name, &val);
+		c = get_or_append_container(name);
+		free(name);
+		u32 = atou32(val, "cpu-cores");
+		free(val);
+		check_range(u32, 0, 65536, "cpu-cores");
+		c->cpu_cores = u32;
+	}
+	u32 = OPT_UINT32_VAL(MICOFORIA, DEFAULT_MEMORY_LIMIT);
+	check_range(u32, 0, 1024 * 1024, "default-memory-limit");
+	for (n = OPT_GIVEN(MICOFORIA, MEMORY_LIMIT) - 1; n != ~0U ; n--) {
+		arg = OPT_STRING_VAL_N(n, MICOFORIA, MEMORY_LIMIT);
+		parse_compound_arg(arg, "memory-limit", &name, &val);
+		c = get_or_append_container(name);
+		free(name);
+		u32 = atou32(val, "memory-limit");
+		free(val);
+		check_range(u32, 0, 1024 * 1024, "memory-limit");
+		c->memory_limit = u32;
+	}
+	for (n = OPT_GIVEN(MICOFORIA, INIT) - 1; n != ~0U ; n--) {
+		arg = OPT_STRING_VAL_N(n, MICOFORIA, INIT);
+		parse_compound_arg(arg, "init", &name, &val);
+		c = get_or_append_container(name);
+		free(name);
+		free(c->init);
+		c->init = val;
+	}
+	for (n = 0; n < OPT_GIVEN(MICOFORIA, NET); n++) {
+		struct ifspec *ifspec;
+		arg = OPT_STRING_VAL_N(n, MICOFORIA, NET);
+		parse_compound_arg(arg, "net", &name, &val);
+		c = get_or_append_container(name);
+		free(name);
+		c->ifspec = xrealloc(c->ifspec,
+			++c->num_ifspecs * sizeof(struct ifspec));
+		ifspec = c->ifspec + c->num_ifspecs - 1;
+		parse_ifspec(val, &ifspec->bridge, ifspec->hwaddr);
+		free(val);
+	}
+
+	/*
+	 * For the list-valued cgroup options, m is the number of values
+	 * which were given on the command line. The values at index >= m
+	 * are appended first, followed by the values at index < m.
+	 */
+	m = clo_given_counter[CLOGC_DEFAULT_CGROUP_DAC];
+	for (n = m; n < OPT_GIVEN(MICOFORIA, DEFAULT_CGROUP_DAC); n++) {
+		arg = OPT_STRING_VAL_N(n, MICOFORIA, DEFAULT_CGROUP_DAC);
+		append_dac_entry(arg, &default_dacl, &num_default_dac_entries);
+	}
+	for (n = 0; n < m; n++) {
+		arg = OPT_STRING_VAL_N(n, MICOFORIA, DEFAULT_CGROUP_DAC);
+		append_dac_entry(arg, &default_dacl, &num_default_dac_entries);
+	}
+	m = clo_given_counter[CLOGC_CGROUP_DAC];
+	for (n = m; n < OPT_GIVEN(MICOFORIA, CGROUP_DAC); n++) {
+		arg = OPT_STRING_VAL_N(n, MICOFORIA, CGROUP_DAC);
+		parse_compound_arg(arg, "cgroup-dac", &name, &val);
+		c = get_or_append_container(name);
+		free(name);
+		append_dac_entry(val, &c->dacl, &c->num_dac_entries);
+		free(val);
+	}
+	for (n = 0; n < m; n++) {
+		arg = OPT_STRING_VAL_N(n, MICOFORIA, CGROUP_DAC);
+		parse_compound_arg(arg, "cgroup-dac", &name, &val);
+		c = get_or_append_container(name);
+		free(name);
+		append_dac_entry(val, &c->dacl, &c->num_dac_entries);
+		free(val);
+	}
+
+	m = clo_given_counter[CLOGC_DEFAULT_IO_MAX];
+	for (n = m; n < OPT_GIVEN(MICOFORIA, DEFAULT_IO_MAX); n++) {
+		arg = OPT_STRING_VAL_N(n, MICOFORIA, DEFAULT_IO_MAX);
+		append_io_max_entry(arg, &default_io_max, &num_default_io_max_entries);
+	}
+	for (n = 0; n < m; n++) {
+		arg = OPT_STRING_VAL_N(n, MICOFORIA, DEFAULT_IO_MAX);
+		append_io_max_entry(arg, &default_io_max, &num_default_io_max_entries);
+	}
+	m = clo_given_counter[CLOGC_IO_MAX];
+	for (n = m; n < OPT_GIVEN(MICOFORIA, IO_MAX); n++) {
+		arg = OPT_STRING_VAL_N(n, MICOFORIA, IO_MAX);
+		parse_compound_arg(arg, "io-max", &name, &val);
+		c = get_or_append_container(name);
+		free(name);
+		append_io_max_entry(val, &c->io_max, &c->num_io_max_entries);
+		free(val);
+	}
+	for (n = 0; n < m; n++) {
+		arg = OPT_STRING_VAL_N(n, MICOFORIA, IO_MAX);
+		parse_compound_arg(arg, "io-max", &name, &val);
+		c = get_or_append_container(name);
+		free(name);
+		append_io_max_entry(val, &c->io_max, &c->num_io_max_entries);
+		free(val);
+	}
+
+	/* init default c->ifspec[] */
+	FOR_EACH_CONTAINER(c) {
+		if (c->num_ifspecs == 0) {
+			const char *br = OPT_STRING_VAL(MICOFORIA, DEFAULT_BRIDGE);
+			c->num_ifspecs = 1;
+			c->ifspec = xmalloc(sizeof(struct ifspec));
+			c->ifspec[0].bridge = xstrdup(br);
+			memset(c->ifspec[0].hwaddr, 0, 6);
+		}
+	}
+}
+
+/*
+ * Print the list of subcommands to stdout.
+ *
+ * In verbose mode one line per subcommand is printed, containing the name
+ * and the purpose text. Otherwise the names are printed as a comma
+ * separated list, starting a new tab-indented line once the running
+ * column count exceeds 70.
+ */
+static void show_subcommand_summary(bool verbose)
+{
+ int i;
+
+/*
+ * Temporarily define LSG_MICOFORIA_CMD() to stringify its argument so that
+ * LSG_MICOFORIA_SUBCOMMANDS expands to the initializers of a string array.
+ */
+#define LSG_MICOFORIA_CMD(_name) #_name
+ static const char * const subcommand_names[] = {LSG_MICOFORIA_SUBCOMMANDS NULL};
+#undef LSG_MICOFORIA_CMD
+ printf("Available subcommands:\n");
+ if (verbose) {
+ const struct lls_command *cmd;
+ /* NOTE(review): starts at 1, presumably to skip the supercommand */
+ for (i = 1; (cmd = lls_cmd(i, micoforia_suite)); i++) {
+ const char *purpose = lls_purpose(cmd);
+ const char *name = lls_command_name(cmd);
+ printf("%-12s%s\n", name, purpose);
+ }
+ } else {
+ /* n tracks the output column; 8 accounts for the leading tab */
+ unsigned n = 8;
+ printf("\t");
+ for (i = 0; i < LSG_NUM_MICOFORIA_SUBCOMMANDS; i++) {
+ if (i > 0)
+ n += printf(", ");
+ if (n > 70) {
+ printf("\n\t");
+ n = 8;
+ }
+ n += printf("%s", subcommand_names[i]);
+ }
+ printf("\n");
+ }
+}
+
+/*
+ * GET_VERSION is a macro which the Makefile defines on the compiler
+ * command line as <package>_version.
+ */
+const char *GET_VERSION(void);
+/*
+ * Handle the options of the supercommand which print some text and exit.
+ *
+ * --version takes precedence, followed by the detailed help and the short
+ * help. If none of these options was given and there are no non-option
+ * arguments either, the verbose subcommand summary is printed. In all of
+ * these cases the process exits successfully. The function returns only
+ * if a non-option argument was given.
+ */
+static void handle_version_and_help(void)
+{
+ char *help;
+
+ if (OPT_GIVEN(MICOFORIA, VERSION)) {
+ printf(PACKAGE " %s\n"
+ "Copyright (C) " COPYRIGHT_YEAR " " AUTHOR ".\n"
+ "License: " LICENSE " <" LICENSE_URL ">.\n"
+ "This is free software: you are free to change and redistribute it.\n"
+ "There is NO WARRANTY, to the extent permitted by law.\n"
+ "\n"
+ "Web page: " URL "\n"
+ "Clone URL: " CLONE_URL "\n"
+ "Gitweb: " GITWEB_URL "\n"
+ "Author's Home Page: " HOME_URL "\n"
+ "Send feedback to: " AUTHOR " <" EMAIL ">\n"
+ ,
+ GET_VERSION()
+ );
+ exit(EXIT_SUCCESS);
+ }
+ if (OPT_GIVEN(MICOFORIA, DETAILED_HELP))
+ help = lls_long_help(CMD_PTR(MICOFORIA));
+ else if (OPT_GIVEN(MICOFORIA, HELP))
+ help = lls_short_help(CMD_PTR(MICOFORIA));
+ else if (lls_num_inputs(lpr) == 0) {
+ show_subcommand_summary(true /* verbose */);
+ exit(EXIT_SUCCESS);
+ } else
+ return;
+ /* the help text was allocated by lopsub and is owned by us */
+ printf("%s\n", help);
+ free(help);
+ exit(EXIT_SUCCESS);
+}
+
+/*
+ * Return the path of the config file as a string which the caller must
+ * free. This is either the argument of the config file option or
+ * .micoforiarc in the home directory of the calling user. If the user
+ * lookup fails, /root is used as the home directory.
+ */
+static char *get_config_file_path(void)
+{
+	const struct passwd *pw;
+
+	if (OPT_GIVEN(MICOFORIA, CONFIG_FILE))
+		return xstrdup(OPT_STRING_VAL(MICOFORIA, CONFIG_FILE));
+	pw = getpwuid(getuid());
+	if (!pw)
+		return msg("%s/.micoforiarc", "/root");
+	return msg("%s/.micoforiarc", pw->pw_dir);
+}
+
+/*
+ * Parse the command line for the given command, then merge in the options
+ * from the config file, if any.
+ *
+ * On return *lprp refers to the effective parse result. If a config file
+ * was processed this is the merged result, and the parse result of the
+ * command line alone has been freed. Every error terminates the program.
+ *
+ * A missing config file is an error only if its path was given
+ * explicitly. An empty config file is ignored.
+ */
+static void parse_options(int argc, char **argv, const struct lls_command *cmd,
+ struct lls_parse_result **lprp)
+{
+ int ret, fd = -1;
+ char *config_file;
+ struct stat statbuf;
+ void *map;
+ size_t sz;
+ int cf_argc;
+ char **cf_argv, *errctx = NULL;
+ const char *subcmd_name;
+ struct lls_parse_result *merged_lpr, *cf_lpr;
+
+ ret = lls_parse(argc, argv, cmd, lprp, &errctx);
+ if (ret < 0)
+ die_lopsub(ret, &errctx);
+ handle_version_and_help();
+ /*
+ * Remember how often the list-valued options were given on the
+ * command line. These counts are no longer available after the merge
+ * and check_options() needs them.
+ */
+ clo_given_counter[CLOGC_DEFAULT_CGROUP_DAC] = OPT_GIVEN(MICOFORIA,
+ DEFAULT_CGROUP_DAC);
+ clo_given_counter[CLOGC_CGROUP_DAC] = OPT_GIVEN(MICOFORIA, CGROUP_DAC);
+ clo_given_counter[CLOGC_DEFAULT_IO_MAX] =
+ OPT_GIVEN(MICOFORIA, DEFAULT_IO_MAX);
+ clo_given_counter[CLOGC_IO_MAX] = OPT_GIVEN(MICOFORIA, IO_MAX);
+ config_file = get_config_file_path();
+ ret = open(config_file, O_RDONLY);
+ if (ret < 0) {
+ if (errno != ENOENT || OPT_GIVEN(MICOFORIA, CONFIG_FILE))
+ die_errno("can not open config file %s", config_file);
+ /* no config file -- nothing to do */
+ ret = 0;
+ goto success;
+ }
+ fd = ret;
+ ret = fstat(fd, &statbuf);
+ if (ret < 0)
+ die_errno("failed to stat config file %s", config_file);
+ sz = statbuf.st_size;
+ if (sz == 0) { /* config file is empty -- nothing to do */
+ ret = 0;
+ goto success;
+ }
+ map = mmap(NULL, sz, PROT_READ, MAP_PRIVATE, fd, 0);
+ if (map == MAP_FAILED)
+ die_errno("failed to mmap config file %s", config_file);
+ /* a NULL name is passed when parsing for the supercommand */
+ subcmd_name = (cmd == CMD_PTR(MICOFORIA))? NULL : lls_command_name(cmd);
+ ret = lls_convert_config(map, sz, subcmd_name, &cf_argv,
+ &errctx);
+ /* the mapping is unmapped right after the conversion */
+ munmap(map, sz);
+ if (ret < 0) {
+ ERROR_LOG("failed to convert config file %s\n", config_file);
+ die_lopsub(ret, &errctx);
+ }
+ cf_argc = ret;
+ ret = lls_parse(cf_argc, cf_argv, cmd, &cf_lpr, &errctx);
+ lls_free_argv(cf_argv);
+ if (ret < 0)
+ die_lopsub(ret, &errctx);
+ /* command line options override config file options */
+ ret = lls_merge(*lprp, cf_lpr, cmd, &merged_lpr, &errctx);
+ if (ret < 0)
+ die_lopsub(ret, &errctx);
+ lls_free_parse_result(cf_lpr, cmd);
+ lls_free_parse_result(*lprp, cmd);
+ *lprp = merged_lpr;
+success:
+ if (fd >= 0)
+ close(fd);
+ free(config_file);
+}
+
+/*
+ * The pre-start hook of a container, falling back to the default hook if
+ * the container does not define its own. The result must not be freed.
+ */
+static const char *get_pre_start_hook(const struct container *c)
+{
+	const char *hook = c->pre_start_hook;
+
+	if (!hook)
+		hook = OPT_STRING_VAL(MICOFORIA, DEFAULT_PRE_START_HOOK);
+	return hook;
+}
+
+/*
+ * The pre-exec hook of a container, falling back to the default hook if
+ * the container does not define its own. The result must not be freed.
+ */
+static const char *get_pre_exec_hook(const struct container *c)
+{
+	const char *hook = c->pre_exec_hook;
+
+	if (!hook)
+		hook = OPT_STRING_VAL(MICOFORIA, DEFAULT_PRE_EXEC_HOOK);
+	return hook;
+}
+
+/*
+ * Return the root directory of a container as a newly allocated string.
+ * Without an explicit root directory the result is the default root
+ * prefix, a slash and the container name.
+ */
+static char *get_root_dir(const struct container *c)
+{
+	if (!c->root_dir)
+		return msg("%s/%s",
+			OPT_STRING_VAL(MICOFORIA, DEFAULT_ROOT_PREFIX),
+			c->name);
+	return xstrdup(c->root_dir);
+}
+
+/*
+ * Build a space separated list of all interface specifications of a
+ * container. Each element has the form bridge:xx:xx:xx:xx:xx:xx. The
+ * caller must free the result.
+ */
+static char *get_ifspec_string(const struct container *c)
+{
+	char *result = NULL;
+	unsigned i;
+
+	assert(c->num_ifspecs > 0);
+	for (i = 0; i < c->num_ifspecs; i++) {
+		const struct ifspec *spec = &c->ifspec[i];
+		const uint8_t *hw = spec->hwaddr;
+		/* nothing precedes the first element */
+		const char *head = result? result : "";
+		const char *sep = result? " " : "";
+		char *joined = msg("%s%s%s:%02x:%02x:%02x:%02x:%02x:%02x",
+			head, sep, spec->bridge,
+			hw[0], hw[1], hw[2], hw[3], hw[4], hw[5]);
+
+		free(result);
+		result = joined;
+	}
+	return result;
+}
+
+/*
+ * Return the allocated name of the network interface with the given index.
+ *
+ * With a single ifspec the name is the container name, otherwise
+ * "<name>-<bridge>". If peer is true, the suffix "-g" is appended. The
+ * caller frees the result.
+ */
+static char *interface_name(const struct container *c, unsigned idx, bool peer)
+{
+	const char *bridge;
+
+	assert(idx < c->num_ifspecs);
+	if (c->num_ifspecs == 1) {
+		if (peer)
+			return msg("%s-g", c->name);
+		return xstrdup(c->name);
+	}
+	bridge = c->ifspec[idx].bridge;
+	return peer? msg("%s-%s-g", c->name, bridge) :
+		msg("%s-%s", c->name, bridge);
+}
+
+/* Export the root directory of the container as MICOFORIA_ROOT_DIR. */
+static void set_m7a_root_dir_env(const struct container *c)
+{
+	char *dir = get_root_dir(c);
+
+	DEBUG_LOG("root dir: %s\n", dir);
+	setenv("MICOFORIA_ROOT_DIR", dir, 1);
+	free(dir);
+}
+
+/*
+ * Run the pre-start hook of the container through /bin/sh -c.
+ *
+ * The container name, its root directory and its interface specifications
+ * are exported to the hook via MICOFORIA_* environment variables, which are
+ * removed again after the hook has run. Returns the result of xexec().
+ */
+static bool run_pre_start_hook(const struct container *c)
+{
+	char *hook = xstrdup(get_pre_start_hook(c));
+	char *hook_argv[] = {"/bin/sh", "-c", hook, NULL};
+	char *specs;
+	bool ok;
+
+	/* set up the environment seen by the hook */
+	setenv("MICOFORIA_CONTAINER_NAME", c->name, 1);
+	set_m7a_root_dir_env(c);
+	specs = get_ifspec_string(c);
+	DEBUG_LOG("ifspecs: %s\n", specs);
+	setenv("MICOFORIA_IFSPECS", specs, 1);
+	free(specs);
+
+	INFO_LOG("running pre-start hook %s\n", hook);
+	ok = xexec(hook_argv, NULL);
+	free(hook);
+	if (!ok)
+		ERROR_LOG("pre-start hook failed\n");
+
+	/* the variables are only meant for the hook */
+	unsetenv("MICOFORIA_CONTAINER_NAME");
+	unsetenv("MICOFORIA_IFSPECS");
+	unsetenv("MICOFORIA_ROOT_DIR");
+	return ok;
+}
+
+/*
+ * Run the pre-exec hook of the container through /bin/sh -c, with
+ * MICOFORIA_ROOT_DIR set in its environment. A failing hook is fatal.
+ */
+static void run_pre_exec_hook(const struct container *c)
+{
+	char *hook = xstrdup(get_pre_exec_hook(c));
+	char *hook_argv[] = {"/bin/sh", "-c", hook, NULL};
+	bool ok;
+
+	INFO_LOG("/bin/sh -c '%s'\n", hook);
+	set_m7a_root_dir_env(c);
+	ok = xexec(hook_argv, NULL);
+	if (!ok)
+		die("%s: pre-exec hook failed", c->name);
+	free(hook);
+	unsetenv("MICOFORIA_ROOT_DIR");
+}
+
+/*
+ * Write the given text to an existing cgroup control file. Failure to open
+ * the file, as well as an incomplete write, is fatal.
+ */
+static void write_cgroup(const char *path, const char *txt)
+{
+	size_t len = strlen(txt);
+	int fd = open(path, O_WRONLY);
+
+	if (fd < 0)
+		die_errno("open %s", path);
+	if (write(fd, txt, len) != len)
+		die_errno("could not write to %s", path);
+	close(fd);
+}
+
+/*
+ * Determine the device access control list which applies to the container.
+ *
+ * Precedence: entries configured for the container, then the configured
+ * default entries, then the built-in list below. The first character of
+ * each entry selects allow ('a') or deny; the rest is the text written to
+ * the devices cgroup file. Returns the number of entries and stores a
+ * pointer to the (not copied) list in *result.
+ */
+static unsigned get_dacl(const struct container *c, char ***result)
+{
+	static char *builtin[] = {
+		"da", /* deny access to all devices except the ones below */
+		"ac 1:3 rwm", /* null */
+		"ac 1:5 rwm", /* zero */
+		"ac 1:7 rwm", /* full */
+		"ac 1:8 rwm", /* random */
+		"ac 1:9 rwm", /* urandom */
+		"ac 4:* rwm", /* tty?* */
+		"ac 5:0 rwm", /* tty */
+		"ac 5:2 rwm", /* ptmx */
+		"ac 136:* rwm", /* pts */
+	};
+	unsigned count;
+
+	if (c->num_dac_entries > 0) {
+		*result = c->dacl;
+		count = c->num_dac_entries;
+	} else if (num_default_dac_entries > 0) {
+		*result = default_dacl;
+		count = num_default_dac_entries;
+	} else {
+		*result = builtin;
+		count = ARRAY_SIZE(builtin);
+	}
+	return count;
+}
+
+/*
+ * Apply the device access control list of the container and move the
+ * calling process into the container's device cgroup.
+ *
+ * The directories /var/cgroup/micoforia and /var/cgroup/micoforia/<name>
+ * are created if they do not exist. Each entry returned by get_dacl() is
+ * written, without its first character, to devices.allow if the entry
+ * starts with 'a' and to devices.deny otherwise. Finally the pid of the
+ * caller is written to cgroup.procs. All errors are fatal.
+ */
+static void apply_dacl(const struct container *c)
+{
+	char **dacl;
+	unsigned n, num_entries;
+	char *m7a_dir, *container_dir, *allow, *deny, *procs, *txt;
+	int fd, allow_fd, deny_fd;
+	size_t sz;
+
+	m7a_dir = msg("/var/cgroup/micoforia");
+	container_dir = msg("%s/%s", m7a_dir, c->name);
+	allow = msg("%s/devices.allow", container_dir);
+	deny = msg("%s/devices.deny", container_dir);
+	procs = msg("%s/cgroup.procs", container_dir);
+
+	if (mkdir(m7a_dir, 0777) < 0 && errno != EEXIST)
+		die_errno("mkdir %s", m7a_dir);
+	free(m7a_dir);
+	if (mkdir(container_dir, 0777) < 0 && errno != EEXIST)
+		die_errno("mkdir %s", container_dir);
+	free(container_dir);
+	if ((allow_fd = open(allow, O_WRONLY)) < 0)
+		die_errno("open %s", allow);
+	free(allow);
+	if ((deny_fd = open(deny, O_WRONLY)) < 0)
+		die_errno("open %s", deny);
+	free(deny);
+
+	num_entries = get_dacl(c, &dacl);
+	INFO_LOG("applying %u entr%s\n", num_entries, num_entries == 1?
+		"y" : "ies");
+	for (n = 0; n < num_entries; n++) {
+		char *entry = dacl[n];
+		bool is_allow = entry[0] == 'a';
+
+		DEBUG_LOG("dac entry #%u: %s %s\n", n, is_allow?
+			"allow" : "deny", entry + 1);
+		/* one entry per write, terminated by a newline */
+		txt = msg("%s\n", entry + 1);
+		sz = strlen(txt);
+		fd = is_allow? allow_fd : deny_fd;
+		if (write(fd, txt, sz) != sz)
+			die_errno("could not write to cgroup devices.%s file",
+				is_allow? "allow" : "deny");
+		free(txt);
+	}
+	close(allow_fd);
+	close(deny_fd);
+	txt = msg("%u\n", (unsigned)getpid());
+	write_cgroup(procs, txt);
+	free(txt);
+	free(procs); /* the path is no longer needed; it used to be leaked */
+}
+
+/*
+ * Check that both cgroup hierarchies are available and enable the cpu,
+ * memory and io controllers.
+ *
+ * The v1 hierarchy is expected at /var/cgroup, the v2 hierarchy at
+ * /var/cgroup2. The controllers are enabled for the children of the v2 root
+ * and for the children of /var/cgroup2/micoforia, which is created if it
+ * does not exist. All errors are fatal.
+ */
+static void cgroup_init(void)
+{
+	const char controllers[] = "+cpu +memory +io\n";
+	char *m7a_dir, *ctl;
+
+	if (access("/var/cgroup/cgroup.clone_children", F_OK) < 0)
+		die("cgroup v1 not mounted at /var/cgroup/");
+	/* this check concerns the v2 hierarchy, so say so in the message */
+	if (access("/var/cgroup2/cgroup.subtree_control", F_OK) < 0)
+		die("cgroup v2 not mounted at /var/cgroup2/");
+	write_cgroup("/var/cgroup2/cgroup.subtree_control", controllers);
+	m7a_dir = msg("/var/cgroup2/micoforia");
+	if (mkdir(m7a_dir, 0777) < 0 && errno != EEXIST)
+		die_errno("mkdir %s", m7a_dir);
+	ctl = msg("%s/cgroup.subtree_control", m7a_dir);
+	free(m7a_dir);
+	write_cgroup(ctl, controllers);
+	free(ctl);
+}
+
+/*
+ * Create the v2 cgroup /var/cgroup2/micoforia/<name> if it does not exist
+ * and move the calling process into it by writing its pid to cgroup.procs.
+ * All errors are fatal.
+ */
+static void create_cgroup_v2(const struct container *c)
+{
+	char *txt, *ctl, *dir = msg("/var/cgroup2/micoforia/%s", c->name);
+
+	if (mkdir(dir, 0777) < 0 && errno != EEXIST)
+		die_errno("mkdir %s", dir);
+	ctl = msg("%s/cgroup.procs", dir);
+	free(dir);
+	/*
+	 * Format the pid into an allocated string rather than a fixed
+	 * 10 byte buffer, which cannot hold every unsigned value plus the
+	 * newline and the terminating null byte.
+	 */
+	txt = msg("%u\n", (unsigned)getpid());
+	write_cgroup(ctl, txt);
+	free(txt);
+	free(ctl);
+}
+
+/*
+ * Return the CPU core limit of the container. The value ~0U in the container
+ * structure means "not set"; in that case the DEFAULT_CPU_CORES option
+ * applies.
+ */
+static unsigned get_cpu_cores(const struct container *c)
+{
+	if (c->cpu_cores == ~0U)
+		return OPT_UINT32_VAL(MICOFORIA, DEFAULT_CPU_CORES);
+	return c->cpu_cores;
+}
+
+/*
+ * Write the CPU limit of the container to the cpu.max file of its v2
+ * cgroup. A core count of zero means unlimited, in which case nothing is
+ * written. The line written is "<1000000 * cores> 1000000".
+ */
+static void apply_cpu_limit(const struct container *c)
+{
+	const unsigned cores = get_cpu_cores(c);
+	unsigned quota;
+	char *path, *line;
+
+	if (cores == 0) /* unlimited */
+		return;
+	assert(cores != ~0U);
+	INFO_LOG("%u core%s\n", cores, cores == 1? "" : "s");
+	quota = 1000000 * cores;
+	path = msg("/var/cgroup2/micoforia/%s/cpu.max", c->name);
+	line = msg("%u 1000000\n", quota);
+	write_cgroup(path, line);
+	free(path);
+	free(line);
+}
+
+/*
+ * Return the memory limit of the container. The value ~0U in the container
+ * structure means "not set"; in that case the DEFAULT_MEMORY_LIMIT option
+ * applies.
+ */
+static unsigned get_memory_limit(const struct container *c)
+{
+	if (c->memory_limit == ~0U)
+		return OPT_UINT32_VAL(MICOFORIA, DEFAULT_MEMORY_LIMIT);
+	return c->memory_limit;
+}
+
+/*
+ * Write the memory limit of the container, converted from units of 2^30
+ * bytes to bytes, to the memory.high file of its v2 cgroup. A limit of zero
+ * means unlimited, in which case nothing is written.
+ */
+static void apply_memory_limit(const struct container *c)
+{
+	const unsigned gigs = get_memory_limit(c);
+	unsigned long long bytes;
+	char *path, *line;
+
+	if (gigs == 0) /* unlimited */
+		return;
+	assert(gigs != ~0U);
+	INFO_LOG("%uG\n", gigs);
+	bytes = 1024LLU * 1024LLU * 1024LLU * gigs;
+	path = msg("/var/cgroup2/micoforia/%s/memory.high", c->name);
+	line = msg("%llu\n", bytes);
+	write_cgroup(path, line);
+	free(path);
+	free(line);
+}
+
+/*
+ * Determine the I/O limit specifications which apply to the container.
+ *
+ * Container-specific entries take precedence over the configured defaults.
+ * If neither exists, zero is returned and *result is set to NULL. Returns
+ * the number of entries; the list stored in *result is not a copy.
+ */
+static unsigned get_iospecs(const struct container *c, char ***result)
+{
+	unsigned count = c->num_io_max_entries;
+
+	if (count > 0) {
+		/*
+		 * NOTE(review): this hands out the device access control
+		 * list (c->dacl) although the count is num_io_max_entries.
+		 * It looks like a copy-and-paste error and the per-container
+		 * io.max list is probably intended -- confirm against the
+		 * definition of struct container.
+		 */
+		*result = c->dacl;
+		return count;
+	}
+	count = num_default_io_max_entries;
+	*result = count > 0? default_io_max : NULL;
+	return count;
+}
+
+/*
+ * Write each I/O limit specification returned by get_iospecs() to the
+ * io.max file of the container's v2 cgroup, one write per specification.
+ * Nothing is done if there are no specifications.
+ */
+static void apply_io_limit(const struct container *c)
+{
+	char **specs;
+	unsigned k, count = get_iospecs(c, &specs);
+	char *path;
+
+	if (count == 0)
+		return;
+	INFO_LOG("%u entries\n", count);
+	path = msg("/var/cgroup2/micoforia/%s/io.max", c->name);
+	for (k = 0; k < count; k++)
+		write_cgroup(path, specs[k]);
+	free(path);
+}
+
+/*
+ * Call remove_subdirs_recursively() on the container's directory in the v1
+ * hierarchy (/var/cgroup) and then on its directory in the v2 hierarchy
+ * (/var/cgroup2).
+ */
+static void cgroup_cleanup(const struct container *c)
+{
+	char *v1_dir, *v2_dir;
+
+	v1_dir = msg("/var/cgroup/micoforia/%s", c->name);
+	remove_subdirs_recursively(v1_dir);
+	free(v1_dir);
+
+	v2_dir = msg("/var/cgroup2/micoforia/%s", c->name);
+	remove_subdirs_recursively(v2_dir);
+	free(v2_dir);
+}
+
+/*
+ * Bring up the loopback link and create one veth device pair per interface
+ * specification of the container.
+ *
+ * For each pair, the hardware address is set on the peer device, and the
+ * other device is attached to the configured bridge and brought up. Returns
+ * false as soon as one of these steps fails. A failure to bring up "lo"
+ * only results in a warning.
+ */
+static bool setup_network(const struct container *c)
+{
+	unsigned k;
+
+	if (!link_up("lo"))
+		WARNING_LOG("could not set establish loopback link\n");
+	for (k = 0; k < c->num_ifspecs; k++) {
+		char *host = interface_name(c, k, false);
+		char *guest = interface_name(c, k, true);
+		bool ok;
+
+		link_del(host); /* ignore errors */
+		ok = create_veth_device_pair(host, guest)
+			&& set_hwaddr(guest, c->ifspec[k].hwaddr)
+			&& attach_to_bridge(host, c->ifspec[k].bridge)
+			&& link_up(host);
+		free(host);
+		free(guest);
+		if (!ok)
+			return false;
+	}
+	return true;
+}
+
+/*
+ * If fd refers to a terminal, turn off echo, signal generation and
+ * canonical mode, make read(2) return after a single byte without timeout,
+ * and copy the window size of stdin to fd. Errors are logged but otherwise
+ * ignored.
+ */
+static void setup_termios(int fd)
+{
+	struct termios t;
+	struct winsize ws; /* see ioctl_tty(2) */
+
+	if (!isatty(fd))
+		return;
+	if (tcgetattr(fd, &t) != 0) {
+		ERROR_LOG("tcgetattr: %m\n");
+		return;
+	}
+	t.c_cc[VMIN] = 1;
+	t.c_cc[VTIME] = 0;
+	t.c_lflag &= ~(ECHO | ISIG | ICANON);
+	if (tcsetattr(fd, TCSAFLUSH, &t) < 0)
+		ERROR_LOG("tcsetattr: %m\n");
+	if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) < 0)
+		return;
+	ioctl(fd, TIOCSWINSZ, &ws);
+}
+
+/* Describes one character device node to be created in the container. */
+struct device_node_info {
+ /* device numbers, combined with makedev() when the node is created */
+ unsigned major, minor;
+ /* permission bits applied with chmod() after the node was created */
+ mode_t mode;
+ /* file name of the node, relative to the container's dev directory */
+ const char *name;
+};
+
+/*
+ * Create the character device nodes listed below in the dev directory of
+ * the container and set their permissions. A failing mknod(2) is fatal,
+ * while the return value of chmod(2) is not checked.
+ */
+static void create_standard_device_nodes(struct container_runtime *cr)
+{
+	const struct device_node_info devices[] = {
+		{.major = 1, .minor = 3, .mode = 0666, .name = "null"},
+		{.major = 1, .minor = 5, .mode = 0666, .name = "zero"},
+		{.major = 1, .minor = 7, .mode = 0666, .name = "full"},
+		{.major = 1, .minor = 8, .mode = 0666, .name = "random"},
+		{.major = 1, .minor = 9, .mode = 0666, .name = "urandom"},
+		{.major = 4, .minor = 0, .mode = 0620, .name = "tty0"},
+		{.major = 5, .minor = 1, .mode = 0600, .name = "console"},
+		{.major = 5, .minor = 2, .mode = 0666, .name = "ptmx"},
+	};
+	const struct device_node_info *d;
+	const struct device_node_info *end = devices + ARRAY_SIZE(devices);
+
+	for (d = devices; d < end; d++) {
+		char *path = msg("%s/%s", cr->dev, d->name);
+		dev_t devno = makedev(d->major, d->minor);
+
+		if (mknod(path, S_IFCHR, devno) < 0)
+			die_errno("mknod %s", d->name);
+		chmod(path, d->mode);
+		free(path);
+	}
+}
+
+/*
+ * Populate the dev directory of the container.
+ *
+ * A tmpfs is mounted at cr->dev and the standard device nodes are created.
+ * For each forwarded tty, a device node tty<minor> is created, the slave
+ * side of the corresponding pty is put into raw-like mode and bind mounted
+ * over the node. The first slave is also bind mounted over the console
+ * node. All mount and mknod errors are fatal.
+ */
+static void init_console(struct container_runtime *cr)
+{
+	unsigned k;
+	char *console_path;
+
+	if (mount(NULL, cr->dev, "tmpfs", 0, "size=500000,mode=755") < 0)
+		die("mount tmpfs at %s: %m", cr->dev);
+	create_standard_device_nodes(cr);
+	for (k = 0; k < cr->num_ttys; k++) {
+		const int slave_fd = cr->slave[k];
+		char *node = msg("%s/tty%u", cr->dev, cr->tty[k]);
+
+		/* the node may exist already, e.g. tty0 */
+		unlink(node);
+		if (mknod(node, S_IFCHR, makedev(4, cr->tty[k])) < 0)
+			die("mknod %s: %m", node);
+		chmod(node, 0660);
+		setup_termios(slave_fd);
+		INFO_LOG("bind mounting %s -> %s\n", ttyname(slave_fd), node);
+		if (mount(ttyname(slave_fd), node, "none",
+				MS_BIND | MS_PRIVATE, NULL) < 0)
+			die("failed to bind mount %s: %m\n", node);
+		free(node);
+	}
+	console_path = msg("%s/console", cr->dev);
+	if (mount(ttyname(cr->slave[0]), console_path, "none",
+			MS_BIND | MS_PRIVATE, NULL) < 0)
+		die("failed to bind mount %s: %m\n", console_path);
+	free(console_path);
+}
+
+/*
+ * Lazily unmount the bind mounts established by init_console(): one per
+ * forwarded tty plus the console.
+ *
+ * These umounts fail if the container shutdown already umounted the bind
+ * mounted devices. This is not fatal, so log only with low severity.
+ */
+static void shutdown_console(struct container_runtime *cr)
+{
+	unsigned n;
+	char *console;
+
+	for (n = 0; n < cr->num_ttys; n++) {
+		/*
+		 * The mount point must be named like in init_console(),
+		 * i.e., after the minor number of the forwarded tty. A
+		 * hard-coded "tty1" would unmount at most one device and
+		 * leave all others mounted.
+		 */
+		char *tty = msg("%s/tty%u", cr->dev, cr->tty[n]);
+		if (umount2(tty, MNT_DETACH) < 0)
+			DEBUG_LOG("umount %s: %m\n", tty);
+		free(tty);
+	}
+	console = msg("%s/console", cr->dev);
+	if (umount2(console, MNT_DETACH) < 0)
+		DEBUG_LOG("umount %s: %m\n", console);
+	free(console);
+}
+
+/*
+ * Return the allocated string "micoforia/<container_name>", which is used
+ * as the path of the unix socket of the container. The caller frees it.
+ */
+static char *get_socket_path(const char *container_name)
+{
+	char *path = msg("micoforia/%s", container_name);
+
+	return path;
+}
+
+/*
+ * Read and discard up to 1024 bytes from the client. On EOF or read error
+ * the client fd is closed and *client is set to -1 to mark it as invalid.
+ */
+static void dispatch_client(int *client)
+{
+	char scratch[1024];
+	ssize_t got = read(*client, scratch, sizeof(scratch));
+
+	if (got > 0)
+		return;
+	NOTICE_LOG("detaching client on fd %d\n", *client);
+	close(*client);
+	*client = -1;
+}
+
+/*
+ * Handle one request received on the unix socket of the container.
+ *
+ * Three requests are understood: "init_pid", "attach <minor>" and
+ * "force-attach <minor>". Error replies consist of a byte with value 1
+ * followed by an errno-like name (including the terminating null byte). The
+ * reply to "init_pid" is a zero byte followed by the raw bytes of
+ * cr->init_pid. For a successful attach request the master fd of the
+ * requested tty is passed to the client and the client fd is kept open in
+ * cr->client[]. In all other cases the client fd is closed.
+ */
+static void dispatch_socket_request(struct container_runtime *cr)
+{
+ uid_t uid;
+ char buf[32];
+ int cfd;
+ uint32_t minor;
+ unsigned n;
+ bool force;
+
+ /* zeroing buf and reading at most sizeof(buf) - 1 bytes keeps it terminated */
+ memset(buf, 0, sizeof(buf));
+ /* presumably fills in the client fd and the uid of the peer -- TODO confirm */
+ if (!recv_cred_buffer(cr->socket_fd, buf, sizeof(buf) - 1, &cfd, &uid))
+ return;
+ /* only the user who runs this process may talk to us */
+ if (uid != getuid()) {
+ /* this array shadows the name msg within the block */
+ const char msg[] = "\1EACCES";
+ send(cfd, msg, sizeof(msg), MSG_DONTWAIT);
+ NOTICE_LOG("access denied for uid %d\n", (int)uid);
+ goto out;
+ }
+ if (strcmp(buf, "init_pid") == 0) {
+ /* reply: zero byte, followed by sizeof(int) bytes of the pid */
+ buf[0] = '\0';
+ memcpy(buf + 1, &cr->init_pid, sizeof(int));
+ send(cfd, buf, 1 + sizeof(int), MSG_DONTWAIT);
+ goto out;
+ }
+ if (sscanf(buf, "attach %u", &minor) == 1) {
+ force = false;
+ } else if (sscanf(buf, "force-attach %u", &minor) == 1) {
+ force = true;
+ } else {
+ const char msg[] = "\1EINVAL";
+ send(cfd, msg, sizeof(msg), MSG_DONTWAIT);
+ NOTICE_LOG("invalid request: %s\n", buf);
+ goto out;
+ }
+ /* find the index of the forwarded tty with the requested minor number */
+ for (n = 0; n < cr->num_ttys; n++) {
+ INFO_LOG("n: %u, tty[n]: %u\n", n, cr->tty[n]);
+ if (cr->tty[n] == minor)
+ break;
+ }
+ if (n == cr->num_ttys) {
+ const char msg[] = "\1ENOTTY";
+ send(cfd, msg, sizeof(msg), MSG_DONTWAIT);
+ NOTICE_LOG("tty%u is not being forwarded\n", minor);
+ goto out;
+ }
+ /* a client is already attached: kick it out only if forced */
+ if (cr->client[n] >= 0) {
+ if (force) {
+ close(cr->client[n]);
+ cr->client[n] = -1;
+ } else {
+ const char msg[] = "\1EBUSY";
+ send(cfd, msg, sizeof(msg), MSG_DONTWAIT);
+ ERROR_LOG("tty%u is already in use\n", minor);
+ goto out;
+ }
+ }
+ if (!pass_fd(cr->master[n], cfd)) {
+ ERROR_LOG("could not pass master fd\n");
+ goto out;
+ }
+ NOTICE_LOG("attached client on fd %d to tty%u\n", cfd, minor);
+ /* keep the connection: it is watched for EOF to detect the detach */
+ cr->client[n] = cfd;
+ return;
+out:
+ close(cfd);
+}
+
+/*
+ * Read up to 1024 bytes from src and write them to dst with a single
+ * write(2). The read is retried on EINTR. If dst is negative, the data is
+ * discarded. Returns false on EOF, on read or write errors, and on short
+ * writes.
+ */
+static bool copy(int src, int dst)
+{
+	char buf[1024];
+	ssize_t nread, nwritten;
+
+	do
+		nread = read(src, buf, sizeof(buf));
+	while (nread < 0 && errno == EINTR);
+	if (nread < 0)
+		DEBUG_LOG("read from fd %d: %m\n", src);
+	if (nread <= 0)
+		return false;
+	if (dst < 0)
+		return true;
+	nwritten = write(dst, buf, nread);
+	if (nwritten < 0) {
+		DEBUG_LOG("write to fd %d: %m\n", dst);
+		return false;
+	}
+	if (nwritten != nread) {
+		DEBUG_LOG("short write to fd %d\n", dst);
+		return false;
+	}
+	return true;
+}
+
+/*
+ * The function returns only when the process receives SIGCHLD and the child
+ * has terminated. In this case the return value is 0 for success, 1 for
+ * failure, and 2 if the child's exit code indicates a reboot request. Other
+ * signals are pushed down to the child process.
+ *
+ * Besides signals, the loop serves requests which arrive on the unix socket,
+ * watches attached clients for EOF, and reads from the master side of each
+ * pty which has no client attached. Only the output of the first pty is
+ * copied to stdout; the output of the others is discarded. In foreground
+ * mode, stdin is copied to the first pty.
+ */
+static int parent_loop(pid_t pid, const struct container *c,
+		struct container_runtime *cr)
+{
+	unsigned n;
+
+	init_signal_handling();
+	for (;;) {
+		int sig, max_fileno = 0;
+		fd_set fds;
+
+		FD_ZERO(&fds);
+		if (OPT_GIVEN(START, FOREGROUND)) {
+			FD_SET(STDIN_FILENO, &fds);
+			if (STDIN_FILENO > max_fileno)
+				max_fileno = STDIN_FILENO;
+		}
+		FD_SET(signal_pipe[0], &fds);
+		if (signal_pipe[0] > max_fileno)
+			max_fileno = signal_pipe[0];
+		FD_SET(cr->socket_fd, &fds);
+		if (cr->socket_fd > max_fileno)
+			max_fileno = cr->socket_fd;
+		for (n = 0; n < cr->num_ttys; n++) {
+			if (cr->client[n] >= 0) { /* a client is attached */
+				FD_SET(cr->client[n], &fds);
+				if (cr->client[n] > max_fileno)
+					max_fileno = cr->client[n];
+			} else { /* nobody else reads the master side */
+				FD_SET(cr->master[n], &fds);
+				if (cr->master[n] > max_fileno)
+					max_fileno = cr->master[n];
+			}
+		}
+		if (select(max_fileno + 1, &fds, NULL, NULL, NULL) < 0) {
+			if (errno != EINTR)
+				ERROR_LOG("select: %m\n");
+			continue;
+		}
+		do {
+			if (!FD_ISSET(signal_pipe[0], &fds))
+				break;
+			sig = next_signal();
+			if (sig == SIGCHLD) {
+				int wstatus;
+				pid_t ret = waitpid(pid, &wstatus, WNOHANG);
+
+				if (ret < 0) {
+					WARNING_LOG("wait: %m\n");
+					break;
+				}
+				/*
+				 * With WNOHANG, zero means the child has not
+				 * changed state. wstatus is not set in this
+				 * case and must not be looked at.
+				 */
+				if (ret == 0)
+					break;
+				cgroup_cleanup(c);
+				if (!WIFEXITED(wstatus))
+					return 1;
+				if (WEXITSTATUS(wstatus) == 2)
+					return 2;
+				return WEXITSTATUS(wstatus) != EXIT_SUCCESS;
+			}
+			kill(pid, sig);
+		} while (0);
+		if (FD_ISSET(cr->socket_fd, &fds))
+			dispatch_socket_request(cr);
+		for (n = 0; n < cr->num_ttys; n++) {
+			if (cr->client[n] >= 0) {
+				/*
+				 * Explicit parentheses: POSIX does not
+				 * guarantee that FD_ISSET() expands to a
+				 * parenthesized expression.
+				 */
+				if (FD_ISSET(cr->client[n], &fds))
+					dispatch_client(cr->client + n);
+			} else { /* stdout is /dev/null in background mode */
+				if (FD_ISSET(cr->master[n], &fds))
+					copy(cr->master[n], n == 0?
+						STDOUT_FILENO : -1);
+			}
+		}
+		if (OPT_GIVEN(START, FOREGROUND)) {
+			if (FD_ISSET(STDIN_FILENO, &fds))
+				copy(STDIN_FILENO, cr->master[0]);
+		}
+	}
+}
+
+/*
+ * Set net namespace of child and call parent_loop().
+ *
+ * The parent first reads the pid of the container init process from pipe1,
+ * then moves the peer device of each interface into the network namespace
+ * of the child, and finally writes one byte to pipe2 to tell the init
+ * process to proceed.
+ *
+ * The return value follows the convention of parent_loop(): 0 on success, 1
+ * on failure, 2 if a reboot was requested. In particular, 1 is returned if
+ * the handshake with the child fails.
+ */
+static int run_parent(pid_t child_pid, const struct container *c,
+		struct container_runtime *cr)
+{
+	unsigned n;
+	bool success;
+
+	close(cr->pipe1[1]);
+	close(cr->pipe2[0]);
+	if (read(cr->pipe1[0], &cr->init_pid, 4) != 4) {
+		ERROR_LOG("pipe1 read error\n");
+		close(cr->pipe1[0]);
+		close(cr->pipe2[1]);
+		/* not "false": zero would be taken for success by the caller */
+		return 1;
+	}
+	INFO_LOG("received grand child pid: %u\n", (unsigned)cr->init_pid);
+	close(cr->pipe1[0]);
+	for (n = 0; n < c->num_ifspecs; n++) {
+		char *peer = interface_name(c, n, true);
+		success = set_netns(peer, child_pid);
+		free(peer);
+		if (!success) {
+			ERROR_LOG("set_netns error\n");
+			close(cr->pipe2[1]);
+			return 1;
+		}
+	}
+	success = write(cr->pipe2[1], "\0", 1) == 1;
+	close(cr->pipe2[1]);
+	if (!success) {
+		ERROR_LOG("pipe2 write error\n");
+		return 1;
+	}
+	return parent_loop(child_pid, c, cr);
+}
+
+/*
+ * Determine the capabilities to drop for the container.
+ *
+ * Precedence: the list configured for the container, then the list given
+ * through the DEFAULT_CAPDROP option, then the built-in list below. Returns
+ * the number of capabilities; the array stored in *result is not a copy.
+ */
+static unsigned get_capdrops(const struct container *c, cap_value_t **result)
+{
+	static cap_value_t builtin_capdrop[] = {CAP_SYS_MODULE, CAP_SYS_TIME,
+		CAP_SYS_RESOURCE};
+	unsigned count;
+
+	if (c->capdrop) {
+		*result = c->capdrop;
+		count = c->num_capdrops;
+	} else if (OPT_GIVEN(MICOFORIA, DEFAULT_CAPDROP)) {
+		*result = default_capdrop;
+		count = num_default_capdrops;
+	} else {
+		*result = builtin_capdrop;
+		count = ARRAY_SIZE(builtin_capdrop);
+	}
+	return count;
+}
+
+/*
+ * Remove each capability returned by get_capdrops() from the capability
+ * bounding set of the calling process. Failure to drop one is fatal.
+ */
+static void drop_caps(const struct container *c)
+{
+	cap_value_t *caps;
+	unsigned k = 0, count;
+
+	INFO_LOG("lowering bounding set capabilities\n");
+	count = get_capdrops(c, &caps);
+	while (k < count) {
+		const cap_value_t cap = caps[k++];
+		char *name = cap_to_name(cap);
+
+		DEBUG_LOG("dropping %s\n", name);
+		cap_free(name); /* allocated by cap_to_name() */
+		if (cap_drop_bound(cap) < 0)
+			die_errno("cap_drop_bound");
+	}
+}
+
+/*
+ * Main loop of the child process. The pid argument is the pid of the
+ * container init process.
+ *
+ * The function waits for signals. When the init process has terminated, the
+ * console bind mounts are removed and the child exits with status 2 if init
+ * was killed by SIGHUP, and with EXIT_SUCCESS otherwise. On any signal other
+ * than SIGCHLD, SIGINT is forwarded to init as is while everything else is
+ * turned into SIGKILL.
+ */
+__attribute ((noreturn))
+static void child_loop(pid_t pid, struct container_runtime *cr)
+{
+	int wstatus;
+
+	INFO_LOG("parent: %u, child: %u, init: %u\n", (unsigned) getppid(),
+		(unsigned)getpid(), (unsigned)pid);
+	init_signal_handling();
+	setsid();
+
+	for (;;) {
+		int sig, max_fileno = 0;
+		fd_set fds;
+
+		FD_ZERO(&fds);
+		FD_SET(signal_pipe[0], &fds);
+		if (signal_pipe[0] > max_fileno)
+			max_fileno = signal_pipe[0];
+		if (select(max_fileno + 1, &fds, NULL, NULL, NULL) < 0) {
+			if (errno != EINTR)
+				ERROR_LOG("select: %m\n");
+			continue;
+		}
+		if (!FD_ISSET(signal_pipe[0], &fds))
+			continue;
+		sig = next_signal();
+		if (sig == SIGCHLD) {
+			pid_t ret = waitpid(pid, &wstatus, WNOHANG);
+
+			if (ret < 0) {
+				WARNING_LOG("wait: %m\n");
+				continue;
+			}
+			/*
+			 * With WNOHANG, zero means init has not changed
+			 * state. wstatus is not set in this case, so keep
+			 * waiting instead of tearing down the console.
+			 */
+			if (ret == 0)
+				continue;
+			shutdown_console(cr);
+			/*
+			 * See reboot(2): within a pid namespace, a restart
+			 * request makes the kernel kill init with SIGHUP.
+			 */
+			if (WIFSIGNALED(wstatus) &&
+					WTERMSIG(wstatus) == SIGHUP) {
+				NOTICE_LOG("reboot requested\n");
+				exit(2);
+			}
+			NOTICE_LOG("container terminated\n");
+			exit(EXIT_SUCCESS);
+		}
+		NOTICE_LOG("sending signal %d to container init\n", sig);
+		kill(pid, sig == SIGINT? SIGINT : SIGKILL);
+	}
+}
+
+/*
+ * Return the path of the init program of the container: the configured
+ * path if there is one, the value of the DEFAULT_INIT option otherwise. The
+ * result is not a copy.
+ */
+static const char *get_init_path(const struct container *c)
+{
+	if (c->init)
+		return c->init;
+	return OPT_STRING_VAL(MICOFORIA, DEFAULT_INIT);
+}
+
+/*
+ * The child process unshares namespaces, spawns the init process which runs
+ * the pre-exec hook and executes the container init process. This function
+ * never returns, but both the child and the init process exit when the
+ * container terminates. The exit code of the child tells the parent whether
+ * it should restart the container.
+ *
+ * Synchronization with the parent: the child writes the pid of the init
+ * process to pipe1 after it has unshared the net namespace, and the init
+ * process blocks on pipe2 until the parent has moved the network devices
+ * into the new namespace.
+ */
+__attribute ((noreturn))
+static void run_child(const struct container *c, struct container_runtime *cr)
+{
+	unsigned n;
+	char *init, *put_old;
+	char ch;
+	pid_t pid;
+
+	/* these belong to the parent */
+	close(cr->socket_fd);
+	for (n = 0; n < cr->num_ttys; n++)
+		close(cr->master[n]);
+	close(cr->pipe1[0]);
+	close(cr->pipe2[1]);
+	if (unshare(CLONE_NEWNET) < 0)
+		die_errno("unshare net ns\n");
+	if (unshare(CLONE_NEWPID) < 0)
+		die_errno("unshare pid ns\n");
+	/* fork again to become pid 1 in the new pid namespace */
+	if ((pid = fork()) < 0)
+		die_errno("fork");
+	/*
+	 * By writing to pipe1 we tell the parent (a) we've unshared the net
+	 * namespace, and (b) the pid of the init process in the parent
+	 * namespace.
+	 */
+	if (pid > 0) {
+		close(cr->pipe2[0]);
+		if (write(cr->pipe1[1], (const char *)&pid, 4) != 4)
+			die_errno("pipe write error");
+		close(cr->pipe1[1]);
+		child_loop(pid, cr); /* never returns */
+	}
+	/* from here on we are the init process of the container */
+	pid = getpid();
+	DEBUG_LOG("now running as pid %d\n", pid);
+	/* wait until the parent has set up our network namespace */
+	if (read(cr->pipe2[0], &ch, 1) != 1)
+		die_errno("pipe read error");
+	close(cr->pipe1[1]);
+	close(cr->pipe2[0]);
+	if (unshare(CLONE_NEWNS | CLONE_NEWIPC | CLONE_NEWUTS) < 0)
+		die_errno("unshare");
+	mkdir(cr->dev, 0777);
+	init_console(cr);
+	for (n = 0; n < cr->num_ttys; n++)
+		close(cr->slave[n]);
+	INFO_LOG("setting hostname to %s\n", c->name);
+	if (sethostname(c->name, strlen(c->name)) < 0)
+		die_errno("sethostname error");
+	if (chdir(cr->root) < 0)
+		die_errno("chdir %s", cr->root);
+	drop_caps(c);
+	apply_dacl(c);
+	apply_cpu_limit(c);
+	apply_memory_limit(c);
+	apply_io_limit(c);
+	/* inside the container the interfaces are called eth0, eth1, ... */
+	for (n = 0; n < c->num_ifspecs; n++) {
+		char *peer = interface_name(c, n, true);
+		char *renamed = msg("eth%u", n);
+		if (!rename_interface(peer, renamed))
+			die("can not rename %s to %s\n", peer, renamed);
+		free(peer);
+		free(renamed);
+	}
+	run_pre_exec_hook(c);
+	setup_termios(STDIN_FILENO);
+	put_old = msg("%s/mnt", cr->root);
+	/* glibc does not provide a wrapper for pivot_root */
+	if (syscall(SYS_pivot_root, ".", put_old) < 0)
+		die_errno("pivot_root (put_old: %s)", put_old);
+	if (umount2("/mnt", MNT_DETACH) < 0)
+		die_errno("umount %s", put_old);
+	free(put_old);
+	close(STDIN_FILENO);
+	init = xstrdup(get_init_path(c));
+	INFO_LOG("handing over control to container init: %s\n", init);
+	execve(init, (char *[]){init, NULL}, NULL);
+	/*
+	 * Report the path we tried to execute. c->init is NULL if the
+	 * default init path is in use.
+	 */
+	die_errno("failed to exec init process %s", init);
+}
+
+/*
+ * We need three processes, called parent, child, init, because we want one
+ * process run with namespaces unmodified, requiring one fork. After the child
+ * has unshared its PID namespace, it keeps its old PID, so we need to fork
+ * again to get pid 1. The child can not terminate because the parent can not
+ * wait(2) on its grandchild.
+ *
+ * This function sets up the cgroup, the unix socket, the ptys and the
+ * network of the container, runs the pre-start hook and forks the child. If
+ * run_parent() reports a reboot request (return value 2), the ptys are
+ * closed and the whole procedure starts over. Returns true only if
+ * run_parent() returned zero.
+ */
+static bool exec_container(const struct container *c)
+{
+	bool success;
+	pid_t pid;
+	unsigned n;
+	struct container_runtime cr = {0};
+	char *socket_path;
+	int ret;
+
+	create_cgroup_v2(c);
+	socket_path = get_socket_path(c->name);
+	success = listen_on_unix_socket(socket_path, &cr.socket_fd);
+	if (!success)
+		ERROR_LOG("can not listen on unix socket %s\n", socket_path);
+	free(socket_path);
+	/* this is a bool function: "return 1" would report success */
+	if (!success)
+		return false;
+	cr.root = get_root_dir(c);
+	cr.dev = msg("%s/dev", cr.root);
+	cr.pts = realpath("/proc/self/fd/0", NULL);
+	DEBUG_LOG("pts: %s\n", cr.pts);
+	cr.num_ttys = get_container_ttys(c, &cr.tty);
+	cr.master = xmalloc(cr.num_ttys * sizeof(int));
+	cr.slave = xmalloc(cr.num_ttys * sizeof(int));
+	cr.client = xmalloc(cr.num_ttys * sizeof(int));
+	/* no client is attached initially */
+	for (n = 0; n < cr.num_ttys; n++)
+		cr.client[n] = -1;
+reboot:
+	NOTICE_LOG("starting %s\n", c->name);
+	for (n = 0; n < cr.num_ttys; n++) {
+		if (openpty(cr.master + n, cr.slave + n, NULL, NULL, NULL) < 0)
+			die("openpty: %m");
+		DEBUG_LOG("pty (tty%u <-> %s)\n", n, ttyname(cr.slave[n]));
+	}
+	/* mount rw, ignore errors */
+	mount(NULL, cr.root, NULL, MS_REMOUNT, NULL);
+	if (!setup_network(c))
+		return false;
+	if (!run_pre_start_hook(c))
+		return false;
+	if (pipe(cr.pipe1) < 0) /* child -> parent */
+		die_errno("pipe1");
+	if (pipe(cr.pipe2) < 0)
+		die_errno("pipe2"); /* parent -> child */
+	if ((pid = fork()) < 0)
+		die_errno("fork");
+	if (pid == 0)
+		run_child(c, &cr); /* never returns */
+	ret = run_parent(pid, c, &cr);
+	if (ret != 2)
+		return ret == 0;
+	NOTICE_LOG("rebooting\n");
+	/* each incarnation of the container gets fresh ptys */
+	for (n = 0; n < cr.num_ttys; n++) {
+		close(cr.master[n]);
+		close(cr.slave[n]);
+	}
+	goto reboot;
+}
+
+/*
+ * Return the allocated path "<logdir>/<name>", where logdir is the value of
+ * the LOGDIR option. The caller frees the result.
+ */
+static char *get_container_logfile(const char *name)
+{
+	const char *logdir = OPT_STRING_VAL(MICOFORIA, LOGDIR);
+
+	return msg("%s/%s", logdir, name);
+}
+
+/*
+ * Start a single container.
+ *
+ * Returns false if the container is locked already or, in foreground mode,
+ * if stdin and stdout are not both terminals. In background mode the
+ * function forks; the calling process returns true immediately while the
+ * new process daemonizes with the container logfile. The process which
+ * runs the container never returns from this function: it exits with a
+ * status that reflects the return value of exec_container(). In foreground
+ * mode the terminal settings of stdin are restored before exiting.
+ */
+static bool start_container(const struct container *c)
+{
+	struct termios saved_tios;
+	bool foreground, ok;
+	pid_t pid;
+
+	if (is_locked(c->name, &pid)) {
+		ERROR_LOG("%s is locked by pid %u\n", c->name, (unsigned)pid);
+		return false;
+	}
+	foreground = OPT_GIVEN(START, FOREGROUND);
+	if (!foreground) {
+		char *logfile;
+
+		pid = fork();
+		if (pid < 0)
+			die_errno("fork");
+		if (pid > 0)
+			return true;
+		logfile = get_container_logfile(c->name);
+		daemonize(logfile);
+		free(logfile);
+	} else if (!isatty(STDIN_FILENO) || !isatty(STDOUT_FILENO)) {
+		ERROR_LOG("both stdin and stdout must be terminals\n");
+		return false;
+	} else if (tcgetattr(STDIN_FILENO, &saved_tios) < 0) {
+		ERROR_LOG("tcgetattr: %m\n");
+		return false;
+	}
+	if (!try_lock(c->name, &pid))
+		die("%s is locked by pid %u", c->name, (unsigned)pid);
+	ok = exec_container(c);
+	if (foreground && tcsetattr(STDIN_FILENO, TCSAFLUSH, &saved_tios) < 0)
+		ERROR_LOG("tcsetattr: %m\n");
+	exit(ok? EXIT_SUCCESS : EXIT_FAILURE);
+}
+
+/*
+ * Validate the non-option arguments of the start subcommand.
+ *
+ * Without arguments, at least one container must be configured, and exactly
+ * one in foreground mode. With arguments, each one must be the name of a
+ * configured container, and at most one may be given in foreground mode.
+ * Violations are fatal.
+ */
+static void check_container_args(void)
+{
+	const unsigned num_inputs = lls_num_inputs(sublpr);
+	unsigned n;
+
+	if (num_inputs > 0) {
+		if (OPT_GIVEN(START, FOREGROUND) && num_inputs > 1)
+			die("can start only one container in foreground mode");
+		for (n = 0; n < num_inputs; n++) {
+			const char *name = lls_input(n, sublpr);
+
+			if (!get_container(name))
+				die("container not configured: %s", name);
+		}
+		return;
+	}
+	if (num_containers == 0)
+		die("no container configured\n");
+	if (OPT_GIVEN(START, FOREGROUND) && num_containers > 1)
+		die("must specify container for foreground mode");
+}
+
+/*
+ * Iterator state for cai_next(). The index refers to the non-option
+ * arguments of the subcommand if any were given, and to the array of
+ * configured containers otherwise.
+ */
+struct container_arg_iter {
+ /* index of the next argument or container to look at */
+ unsigned idx;
+};
+
+/* Initializer which makes the iteration start at the beginning. */
+#define INITIALIZED_CAI(_cai) {.idx = 0}
+
+/*
+ * Return the next container of the iteration, or NULL if there are no more.
+ *
+ * If no arguments were given to the subcommand, all configured containers
+ * are returned in turn. Otherwise each argument is looked up with
+ * get_container(); arguments which do not name a configured container are
+ * reported and skipped. If skipped is not NULL, *skipped tells whether at
+ * least one argument was skipped during this call.
+ */
+static struct container *cai_next(struct container_arg_iter *cai, bool *skipped)
+{
+	const unsigned num_inputs = lls_num_inputs(sublpr);
+
+	if (skipped)
+		*skipped = false;
+	if (num_inputs == 0)
+		return cai->idx < num_containers? container[cai->idx++] : NULL;
+	while (cai->idx < num_inputs) {
+		const char *name = lls_input(cai->idx, sublpr);
+		struct container *c = get_container(name);
+
+		cai->idx++; /* the argument is consumed either way */
+		if (c)
+			return c;
+		ERROR_LOG("%s: not configured\n", name);
+		if (skipped)
+			*skipped = true;
+	}
+	return NULL;
+}
+
+/*
+ * Call f for each container returned by cai_next(). The iteration does not
+ * stop when f fails. Returns false if any call of f returned false, or if
+ * an argument was skipped before a container was returned.
+ */
+static bool for_each_container_arg(bool (*f)(const struct container *c))
+{
+	struct container_arg_iter cai = INITIALIZED_CAI(cai);
+	bool all_good = true;
+
+	for (;;) {
+		bool skipped, ok;
+		struct container *c = cai_next(&cai, &skipped);
+
+		if (!c)
+			break;
+		ok = f(c);
+		if (!ok || skipped)
+			all_good = false;
+	}
+	return all_good;
+}
+
+/*
+ * Handler of the start subcommand.
+ *
+ * Validates the arguments, initializes the cgroup hierarchies, creates the
+ * log directory if necessary and starts each container given as an
+ * argument, or all configured containers if no argument was given. An empty
+ * log directory name and a failing mkdir(2) are fatal. Returns the result
+ * of for_each_container_arg().
+ */
+static bool com_start(void)
+{
+	const char *logdir = OPT_STRING_VAL(MICOFORIA, LOGDIR);
+
+	check_container_args();
+	/* name the option correctly in the error message (was: "loggir") */
+	if (logdir[0] == '\0')
+		die_empty_arg("logdir");
+	cgroup_init();
+	if (mkdir(logdir, 0777) < 0 && errno != EEXIST)
+		die_errno("mkdir %s", logdir);
+	return for_each_container_arg(start_container);
+}
+EXPORT_CMD_HANDLER(start);
+
+/*
+ * Send the given signal to the process which holds the lock of the
+ * container. Returns false if the container is not locked or if kill(2)
+ * fails.
+ */
+static bool send_signal_to_container(int signum, const struct container *c)
+{
+	pid_t pid;
+
+	if (!is_locked(c->name, &pid)) {
+		INFO_LOG("%s is not running\n", c->name);
+		return false;
+	}
+	DEBUG_LOG("sending signal %d to pid %u\n", signum, (unsigned)pid);
+	if (kill(pid, signum) < 0) {
+		ERROR_LOG("kill %s: %m\n", c->name);
+		return false;
+	}
+	return true;
+}
+
+/*
+ * Replace the environment of the calling process with a minimal one: TERM
+ * is carried over if it was set, while PATH, USER, LOGNAME and HOME are set
+ * to fixed values suitable for root.
+ */
+static void clean_env(void)
+{
+	static const char * const fixed[][2] = {
+		{"PATH", "/root/bin:/usr/local/sbin:/usr/local/bin"
+			":/sbin:/usr/sbin:/bin:/usr/bin"},
+		{"USER", "root"},
+		{"LOGNAME", "root"},
+		{"HOME", "/root"},
+	};
+	char *term = getenv("TERM");
+	unsigned k;
+
+	clearenv();
+	if (term != NULL)
+		setenv("TERM", term, 0);
+	for (k = 0; k < sizeof(fixed) / sizeof(fixed[0]); k++)
+		setenv(fixed[k][0], fixed[k][1], 0);
+}
+
+/*
+ * Send an "init_pid" request to the unix socket of the named container and
+ * store the answer in *result. On failure, false is returned and an error
+ * is logged. *result is set to -1 before the request is sent.
+ */
+static bool request_init_pid(const char *name, int *result)
+{
+	char *path;
+	bool ok;
+
+	*result = -1;
+	path = get_socket_path(name);
+	ok = request_int(path, "init_pid", result);
+	free(path);
+	if (ok)
+		return true;
+	ERROR_LOG("could not determine init pid of %s\n", name);
+	return false;
+}
+
+/*
+ * Initiate the shutdown of a container by running "halt" inside it.
+ *
+ * A child process is forked which asks for the pid of the container init
+ * process and then executes nsenter(1) with that pid as the target. The
+ * calling process does not wait for the child: it returns true as soon as
+ * the fork succeeded. If the container is not running, true is returned
+ * only if no arguments were given to the subcommand.
+ */
+static bool shutdown_container(const struct container *c)
+{
+	char target[20];
+	char *argv[] = {"nsenter", "-w", "-a", "-r", "-t", target, "halt", NULL};
+	pid_t pid;
+
+	if (!is_locked(c->name, NULL)) {
+		if (lls_num_inputs(sublpr) != 0) {
+			ERROR_LOG("container not running: %s\n", c->name);
+			return false;
+		}
+		return true;
+	}
+	pid = fork();
+	if (pid != 0) /* parent, or fork failure */
+		return pid > 0;
+	/* child: must not return to the caller */
+	if (!request_init_pid(c->name, &pid))
+		_exit(EXIT_FAILURE);
+	sprintf(target, "%d", pid);
+	clean_env();
+	execvp(argv[0], argv);
+	_exit(EXIT_FAILURE);
+}
+
+/* A container is considered dead if nobody holds its lock. */
+static bool container_is_dead(const struct container *c)
+{
+	bool locked = is_locked(c->name, NULL);
+
+	return !locked;
+}
+
+/*
+ * Poll until all containers of the iteration are dead.
+ *
+ * The sleep time between two checks starts at 32 milliseconds and doubles
+ * after each unsuccessful check. Returns true once all containers are dead.
+ * Returns false if nanosleep(2) fails or when the sleep time would reach
+ * 20 seconds.
+ */
+static bool wait_for_containers_to_die(void)
+{
+	unsigned ms;
+
+	for (ms = 32; ms < 20000; ms *= 2) {
+		struct timespec ts = {
+			.tv_sec = ms / 1000,
+			.tv_nsec = (ms % 1000) * 1000 * 1000,
+		};
+
+		if (nanosleep(&ts, NULL) < 0)
+			return false;
+		if (for_each_container_arg(container_is_dead))
+			return true;
+	}
+	return false;
+}
+
+/*
+ * Handler of the stop subcommand: initiate the shutdown of each container
+ * and, if the WAIT option was given, wait for the containers to terminate.
+ */
+static bool com_stop(void)
+{
+	if (!for_each_container_arg(shutdown_container))
+		return false;
+	if (OPT_GIVEN(STOP, WAIT))
+		return wait_for_containers_to_die();
+	return true;
+}
+EXPORT_CMD_HANDLER(stop);
+
+/* Request a reboot by sending SIGINT to the process which holds the lock. */
+static bool reboot_container(const struct container *c)
+{
+	const int signum = SIGINT;
+
+	return send_signal_to_container(signum, c);
+}
+
+/* Handler of the reboot subcommand. */
+static bool com_reboot(void)
+{
+	bool ok = for_each_container_arg(reboot_container);
+
+	return ok;
+}
+EXPORT_CMD_HANDLER(reboot);
+
+/* Kill a container by sending SIGUSR1 to the process which holds the lock. */
+static bool kill_container(const struct container *c)
+{
+	const int signum = SIGUSR1;
+
+	return send_signal_to_container(signum, c);
+}
+
+/*
+ * Handler of the kill subcommand: signal each container and, if the WAIT
+ * option was given, wait for the containers to terminate.
+ */
+static bool com_kill(void)
+{
+	if (!for_each_container_arg(kill_container))
+		return false;
+	if (OPT_GIVEN(KILL, WAIT))
+		return wait_for_containers_to_die();
+	return true;
+}
+EXPORT_CMD_HANDLER(kill);
+
+/*
+ * Print the effective configuration of a container to stdout: hooks, root
+ * directory, init path, network interfaces, device access control entries,
+ * I/O specifications, CPU and memory limits, dropped capabilities and
+ * forwarded ttys. Values which are not set for the container are shown with
+ * the defaults that apply.
+ */
+static void list_container_verbose(const struct container *c)
+{
+	char *root;
+	unsigned n, N;
+	char **word_list;
+	cap_value_t *capdrop;
+	uint32_t *tty;
+	char cores_str[25] = "unlimited";
+	unsigned cores = get_cpu_cores(c);
+
+	printf("%s:\n", c->name);
+	printf("\tpre-start hook: %s\n", get_pre_start_hook(c));
+	printf("\tpre-exec hook: %s\n", get_pre_exec_hook(c));
+	root = get_root_dir(c);
+	printf("\troot dir: %s\n", root);
+	free(root);
+	printf("\tinit path: %s\n", get_init_path(c));
+	for (n = 0; n < c->num_ifspecs; n++) {
+		char pretty_hwaddr[18];
+		char *iface = interface_name(c, n, false);
+		pretty_print_hwaddr(c->ifspec[n].hwaddr, pretty_hwaddr);
+		printf("\tinterface #%u: %s (%s)\n", n, iface, pretty_hwaddr);
+		free(iface);
+	}
+	N = get_dacl(c, &word_list);
+	for (n = 0; n < N; n++)
+		printf("\tdac entry #%u: %s %s\n", n, word_list[n][0] == 'a'?
+			"allow" : "deny", word_list[n] + 1);
+	N = get_iospecs(c, &word_list);
+	for (n = 0; n < N; n++)
+		printf("\tiospec #%u: %s\n", n, word_list[n]);
+	/* zero cores means no limit, so keep the preset text in this case */
+	if (cores > 0)
+		sprintf(cores_str, "%u", cores);
+	printf("\tCPU core limit: %s\n", cores_str);
+	printf("\tmemory limit: %uG\n", get_memory_limit(c));
+	N = get_capdrops(c, &capdrop);
+	for (n = 0; n < N; n++) {
+		/* cap_to_name() allocates; release with cap_free() */
+		char *name = cap_to_name(capdrop[n]);
+		printf("\tcapdrop #%u: %s\n", n, name);
+		cap_free(name);
+	}
+	N = get_container_ttys(c, &tty);
+	for (n = 0; n < N; n++)
+		printf("\ttty #%u: %u\n", n, tty[n]);
+}
+
+/*
+ * Handler of the ls subcommand.
+ *
+ * Containers which are not running are listed only if the ALL option was
+ * given; otherwise they are omitted and make the command fail. The output
+ * format depends on the VERBOSE, LONG and QUIET options, in this order of
+ * precedence. Returns false if a container was omitted or if an argument
+ * did not name a configured container.
+ */
+static bool com_ls(void)
+{
+	struct container_arg_iter cai = INITIALIZED_CAI(cai);
+	struct container *c;
+	bool skipped, ok = true;
+
+	for (;;) {
+		pid_t pid;
+
+		c = cai_next(&cai, &skipped);
+		if (!c)
+			break;
+		if (skipped)
+			ok = false;
+		if (!is_locked(c->name, &pid)) {
+			if (!OPT_GIVEN(LS, ALL)) {
+				ok = false;
+				continue;
+			}
+			pid = 0; /* not running */
+		}
+		if (OPT_GIVEN(LS, VERBOSE)) {
+			list_container_verbose(c);
+		} else if (OPT_GIVEN(LS, LONG)) {
+			if (pid > 0)
+				printf("%u\t", (unsigned)pid);
+			else
+				printf("-\t");
+			printf("%u\t", get_cpu_cores(c));
+			printf("%uG\t", get_memory_limit(c));
+			printf("%s\n", c->name);
+		} else if (!OPT_GIVEN(LS, QUIET)) {
+			printf("%s\n", c->name);
+		}
+	}
+	if (skipped) /* needed if the last given container arg is invalid */
+		ok = false;
+	return ok;
+}
+EXPORT_CMD_HANDLER(ls);
+
+/*
+ * Run "pstree -anp <pid>" for a running container.
+ *
+ * With the ALL option, pid is the pid of the process which holds the lock
+ * of the container, otherwise the pid of the container init process. If the
+ * container is not running, true is returned only if no arguments were
+ * given to the subcommand.
+ */
+static bool list_container_processes(const struct container *c)
+{
+	int pid;
+	char pidstr[20];
+	char *argv[] = {"pstree", "-anp", pidstr, NULL};
+
+	if (!is_locked(c->name, &pid)) {
+		if (lls_num_inputs(sublpr) == 0)
+			return true;
+		ERROR_LOG("container \"%s\" is not running\n", c->name);
+		return false;
+	}
+	if (!OPT_GIVEN(PS, ALL)) {
+		if (!request_init_pid(c->name, &pid))
+			return false;
+	}
+	sprintf(pidstr, "%d", pid);
+	return xexec(argv, NULL);
+}
+
+/* Handler of the ps subcommand. */
+static bool com_ps(void)
+{
+	bool ok = for_each_container_arg(list_container_processes);
+
+	return ok;
+}
+EXPORT_CMD_HANDLER(ps);
+
+/*
+ * Handler of the attach subcommand.
+ *
+ * Exactly one argument, the name of a running container, is required, and
+ * both stdin and stdout must be terminals. The function sends an "attach"
+ * or "force-attach" request for the tty given by the TTY option to the unix
+ * socket of the container and then shuffles bytes between the local
+ * terminal and the received master fd. The loop ends when the socket
+ * becomes readable, on read or write errors, or when the user types CTRL+a
+ * followed by q. The terminal settings of stdin are restored at the end.
+ */
+static bool com_attach(void)
+{
+ char *errctx;
+ const char *arg;
+ pid_t pid;
+ char *socket_path;
+ int master, ret, socket_fd;
+ bool have_escape = false;
+ struct termios tios;
+ uint32_t minor = OPT_UINT32_VAL(ATTACH, TTY);
+ char *rq;
+
+ if (!isatty(STDIN_FILENO) || !isatty(STDOUT_FILENO)) {
+ ERROR_LOG("both stdin and stdout must be terminals\n");
+ return false;
+ }
+ /* save the terminal settings so they can be restored on exit */
+ if (tcgetattr(STDIN_FILENO, &tios) < 0)
+ die_errno("tcgetattr");
+ ret = lls_check_arg_count(sublpr, 1, 1, &errctx);
+ if (ret < 0)
+ die_lopsub(ret, &errctx);
+ arg = lls_input(0, sublpr);
+ /* the pid stored here is not used afterwards */
+ if (!is_locked(arg, &pid)) {
+ ERROR_LOG("container not running: %s\n", arg);
+ return false;
+ }
+ socket_path = get_socket_path(arg);
+ if (OPT_GIVEN(ATTACH, FORCE))
+ rq = msg("force-attach %u", minor);
+ else
+ rq = msg("attach %u", minor);
+ /*
+ * NOTE(review): the result of request_fd() is used without any check.
+ * Unless request_fd() terminates the process on errors, a failed
+ * request would be reported as a successful attach below -- confirm.
+ */
+ socket_fd = request_fd(socket_path, rq, &master);
+ free(rq);
+ free(socket_path);
+ INFO_LOG("Attached to /dev/tty%u of container %s\n", minor, arg);
+ NOTICE_LOG("Type CTRL+a q to quit\n");
+ setup_termios(STDIN_FILENO);
+ setup_termios(master);
+ for (;;) {
+ int max_fileno = 0;
+ fd_set fds;
+ FD_ZERO(&fds);
+ FD_SET(STDIN_FILENO, &fds);
+ if (STDIN_FILENO > max_fileno)
+ max_fileno = STDIN_FILENO;
+ FD_SET(master, &fds);
+ if (master > max_fileno)
+ max_fileno = master;
+ FD_SET(socket_fd, &fds);
+ if (socket_fd > max_fileno)
+ max_fileno = socket_fd;
+ if (select(max_fileno + 1, &fds, NULL, NULL, NULL) < 0) {
+ if (errno != EINTR)
+ ERROR_LOG("select: %m\n");
+ continue;
+ }
+ /* any activity on the socket ends the session */
+ if (FD_ISSET(socket_fd, &fds))
+ break;
+ if (FD_ISSET(STDIN_FILENO, &fds)) {
+ char c;
+ if (read(STDIN_FILENO, &c, 1) <= 0)
+ break;
+ /*
+ * Byte value 1 is CTRL+a. The first one is swallowed and arms
+ * the escape. have_escape is never reset, so from then on
+ * any 'q' quits, and further CTRL+a bytes are forwarded.
+ */
+ if (c == 1 && !have_escape)
+ have_escape = true;
+ else if (c == 'q' && have_escape)
+ break;
+ else if (write(master, &c, 1) != 1)
+ break;
+ }
+ if (FD_ISSET(master, &fds)) {
+ if (!copy(master, STDOUT_FILENO))
+ break;
+ }
+ }
+ if (tcsetattr(STDIN_FILENO, TCSAFLUSH, &tios) < 0)
+ ERROR_LOG("tcsetattr: %m\n");
+ printf("\n");
+ /*
+ * NOTE(review): false is returned unconditionally, even after a regular
+ * detach. Neither master nor socket_fd is closed here.
+ */
+ return false;
+}
+EXPORT_CMD_HANDLER(attach);
+
+/*
+ * The "help" subcommand: without argument, print the subcommand summary.
+ * With a subcommand name as argument, print the short help of that
+ * subcommand, or its long help if --long was given.
+ */
+static bool com_help(void)
+{
+	const struct lls_command *cmd;
+	const char *name;
+	char *errctx, *text;
+	int ret = lls_check_arg_count(sublpr, 0, 1, &errctx);
+
+	if (ret < 0)
+		die_lopsub(ret, &errctx);
+	if (lls_num_inputs(sublpr) == 0) {
+		show_subcommand_summary(OPT_GIVEN(HELP, LONG));
+		return true;
+	}
+	name = lls_input(0, sublpr);
+	ret = lls_lookup_subcmd(name, micoforia_suite, &errctx);
+	if (ret < 0)
+		die_lopsub(ret, &errctx);
+	cmd = lls_cmd(ret, micoforia_suite);
+	text = (OPT_GIVEN(HELP, LONG))? lls_long_help(cmd) : lls_short_help(cmd);
+	printf("%s\n", text);
+	free(text);
+	return true;
+}
+EXPORT_CMD_HANDLER(help);
+
+/*
+ * The "configtest" subcommand: the handler itself only reports success,
+ * it performs no checks of its own.
+ */
+static bool com_configtest(void)
+{
+	puts("Syntax Ok");
+	return true;
+}
+EXPORT_CMD_HANDLER(configtest);
+
+/*
+ * The "edit" subcommand: run an editor on the configuration file.
+ *
+ * The editor is taken from the EDITOR environment variable. If the
+ * variable is unset or set to the empty string, vi is executed instead.
+ * Treating the empty string like "unset" avoids an attempt to execute a
+ * program with an empty name.
+ *
+ * Returns the result of xexec().
+ */
+static bool com_edit(void)
+{
+	char *ed = getenv("EDITOR"); /* must not be freed */
+	char *conf = get_config_file_path();
+	char *argv[] = {(ed && *ed)? ed : "vi", conf, NULL};
+	bool success = xexec(argv, NULL);
+
+	free(conf);
+	return success;
+}
+EXPORT_CMD_HANDLER(edit);
+
+/*
+ * The "enter" subcommand: execute nsenter with the pid obtained from
+ * request_init_pid() as the argument to -t.
+ *
+ * The first non-option argument is the container name. All further
+ * arguments, if any, form the command which is appended to the nsenter
+ * command line. If there are none, "login -f root" is appended instead.
+ */
+static bool com_enter(void)
+{
+	char *nsenter_args[] = {"nsenter", "-w", "-a", "-r", "-t"};
+	char *dflt_cmd[] = {"login", "-f", "root"};
+	const unsigned nna = ARRAY_SIZE(nsenter_args); /* num nsenter args */
+	unsigned ni = lls_num_inputs(sublpr);
+	bool have_cmd = ni > 1;
+	/* number of words appended after the pid */
+	unsigned nea = have_cmd? ni - 1 : ARRAY_SIZE(dflt_cmd);
+	unsigned k, idx = 0;
+	char pidstr[20];
+	char **argv;
+	const char *name;
+	char *errctx;
+	bool ok;
+	int ret, pid;
+
+	ret = lls_check_arg_count(sublpr, 1, INT_MAX, &errctx);
+	if (ret < 0)
+		die_lopsub(ret, &errctx);
+	name = lls_input(0, sublpr);
+	if (!is_locked(name, &pid)) {
+		ERROR_LOG("container not running: %s\n", name);
+		return false;
+	}
+	if (!request_init_pid(name, &pid))
+		return false;
+	/* Two extra slots: the argument to -t and the terminating NULL. */
+	argv = xmalloc((nna + nea + 2) * sizeof(char *));
+	for (k = 0; k < nna; k++)
+		argv[idx++] = nsenter_args[k];
+	sprintf(pidstr, "%d", pid);
+	argv[idx++] = pidstr;
+	for (k = 0; k < nea; k++) {
+		if (have_cmd)
+			argv[idx++] = (char *)lls_input(k + 1, sublpr);
+		else
+			argv[idx++] = dflt_cmd[k];
+	}
+	argv[idx] = NULL;
+	clean_env();
+	ok = xexec(argv, NULL);
+	free(argv);
+	return ok;
+}
+EXPORT_CMD_HANDLER(enter);
+
+/*
+ * The "log" subcommand: show the log file of the given container.
+ *
+ * The file is shown with less if both stdin and stdout are terminals,
+ * and with cat otherwise. Exactly one container name must be given.
+ */
+static bool com_log(void)
+{
+	char *errctx, *logfile;
+	char *argv[3];
+	bool ok;
+	int ret;
+
+	if (isatty(STDIN_FILENO) && isatty(STDOUT_FILENO))
+		argv[0] = "less";
+	else
+		argv[0] = "cat";
+	argv[2] = NULL;
+	ret = lls_check_arg_count(sublpr, 1, 1, &errctx);
+	if (ret < 0)
+		die_lopsub(ret, &errctx);
+	logfile = get_container_logfile(lls_input(0, sublpr));
+	argv[1] = logfile; /* the file to show */
+	ok = xexec(argv, NULL);
+	free(logfile);
+	return ok;
+}
+EXPORT_CMD_HANDLER(log);
+
+/*
+ * Program entry point.
+ *
+ * Parses the global options, looks up the subcommand named by the first
+ * non-option argument, parses the options of the subcommand and calls its
+ * handler. The exit code is EXIT_SUCCESS if and only if the handler
+ * returned true.
+ *
+ * If no subcommand is given, the list of available subcommands is
+ * printed and the program exits successfully, as described in the
+ * introduction of the subcommand section of the manual page.
+ */
+int main(int argc, char *argv[])
+{
+	int ret;
+	char *errctx;
+	const struct micoforia_user_data *ud;
+	unsigned num_inputs;
+
+	valid_fd012();
+	parse_options(argc, argv, CMD_PTR(MICOFORIA), &lpr);
+	loglevel_arg_val = OPT_UINT32_VAL(MICOFORIA, LOGLEVEL);
+	check_options();
+	num_inputs = lls_num_inputs(lpr);
+	if (num_inputs == 0) {
+		/*
+		 * Without this check argv[argc], which is NULL, would be
+		 * passed to the subcommand lookup below.
+		 */
+		show_subcommand_summary(false);
+		exit(EXIT_SUCCESS);
+	}
+	ret = lls_lookup_subcmd(argv[argc - num_inputs], micoforia_suite, &errctx);
+	if (ret < 0)
+		die_lopsub(ret, &errctx);
+	subcmd = lls_cmd(ret, micoforia_suite);
+	/* The subcommand name acts as argv[0] for the second parser run. */
+	parse_options(num_inputs, argv + argc - num_inputs, subcmd, &sublpr);
+	ud = lls_user_data(subcmd);
+	exit(ud->handler()? EXIT_SUCCESS : EXIT_FAILURE);
+}
--- /dev/null
+# SPDX-License-Identifier: GPL-2.0-only
+[suite micoforia]
+ caption = Subcommands
+ mansect = 8
+ manual_title = System Manager's Manual
+[supercommand micoforia]
+ [description]
+ DESCRIPTION1()
+
+ DESCRIPTION2()
+
+ DESCRIPTION3()
+
+ In addition to global options which apply to all subcommands, each
+ subcommand has its own set of options. The usual "--" separator must
+ be used to separate global options from subcommand specific options.
+
+ [/description]
+ synopsis = [global-options...] [--] [<subcommand> [subcommand-options...]]
+ purpose = SLOGAN()
+
+ [option general-options-section]
+ summary = General options
+ flag ignored
+ [option help]
+ summary = print help and exit
+ short_opt = h
+ [option detailed-help]
+ summary = print help, including all details, and exit
+ [option version]
+ summary = print version and exit
+ short_opt = V
+ [option config-file]
+ short_opt = c
+ summary = use alternative config file (default: ~/.mismarc)
+ typestr = path
+ arg_info = required_arg
+ arg_type = string
+ [help]
+ Options may be given at the command line or in the configuration
+ file. As usual, if an option is given both at the command line and
+ in the configuration file, the command line option takes precedence.
+
+ The config file may contain global options as well as options for
+ any subcommand, but subcommand specific options must be placed in a
+ separate section. See the Examples section of the man page.
+ [/help]
+ [option loglevel]
+ summary = control amount of logging
+ short_opt = l
+ arg_info = required_arg
+ arg_type = string
+ typestr = severity
+ values = {
+ LSGLL_DEBUG = "debug",
+ LSGLL_INFO = "info",
+ LSGLL_NOTICE = "notice",
+ LSGLL_WARNING = "warning",
+ LSGLL_ERROR = "error",
+ LSGLL_CRIT = "crit",
+ LSGLL_EMERG = "emerg"
+ }
+ default_val = warning
+ [help]
+ Log only messages with severity greater than or equal to the given
+ value. Possible values:
+
+ debug: produces really noisy output.
+ info: still noisy, but won't fill up the disk quickly.
+ notice: indicates normal, but significant event.
+ warning: unexpected events that can be handled.
+ error: unhandled error condition.
+ crit: system might be unreliable.
+ emerg: last message before exit.
+ [/help]
+
+ [option general-options-section]
+ summary = Global Container Options
+ flag ignored
+ [help]
+ The options in this section apply to all containers. Most of them
+ have a per-container counterpart which can be specified to override
+ the global default.
+ [/help]
+ [option default-root-prefix]
+ summary = path to the parent directory of the container root file systems
+ typestr = directory
+ arg_info = required_arg
+ arg_type = string
+ default_val = /var/lib/micoforia
+ [help]
+ For containers which do not specify their own root directory the path
+ to the container root is derived from the argument of this option by
+ appending a slash and the container name.
+ [/help]
+ [option logdir]
+ summary = directory which contains the container log files
+ arg_info = required_arg
+ arg_type = string
+ typestr = directory
+ default_val = /var/log/micoforia
+ [help]
+ The log messages of each container are written to a dedicated
+ logfile. This option controls in which directory these files are
+ written (start subcommand) or expected (log subcommand).
+
+ Nothing is written to the logfile if the container is started in
+ foreground mode.
+ [/help]
+ [option default-pre-start-hook]
+ summary = command to be executed before the container starts
+ typestr = command
+ arg_info = required_arg
+ arg_type = string
+ default_val = true
+ [help]
+ This hook is run early during container startup. All veth device
+ pairs have been created, but no namespace or cgroup operations have
+ been performed at this point.
+
+ If the root file system of the container must be prepared, this is the
+ right place to perform this task. Unlike the pre exec hook described
+ below, this hook is only called once.
+
+ The following environment variables are set: MICOFORIA_CONTAINER_NAME,
+ MICOFORIA_IFSPECS, MICOFORIA_ROOT_DIR.
+ [/help]
+ [option default-pre-exec-hook]
+ summary = command to be executed before /sbin/init is executed
+ typestr = command
+ arg_info = required_arg
+ arg_type = string
+ default_val = true
+ [help]
+ This runs with all namespaces already unshared and cgroup settings
+ applied but before the root directory is switched to the container
+ root. The hostname has already been changed to the container name
+ and the network interfaces have been renamed to eth0, eth1, etc.
+
+ This is the right place to perform additional cgroup or namespace
+ operations. When the container is rebooted, the pre-exec hook is called
+ again, just before control is handed over to the new init process.
+
+ Only MICOFORIA_ROOT_DIR is set in this hook.
+ [/help]
+ [option default-init]
+ summary = control the handover to the init process of the container
+ typestr = command
+ arg_info = required_arg
+ arg_type = string
+ default_val = /sbin/init
+ [help]
+ This program is executed as the last step of the container startup
+ procedure as pid 1. At this point the root directory of the process
+ has already been changed, so the given argument refers to a path
+ relative to the container root directory.
+ [/help]
+ [option default-bridge]
+ summary = ethernet bridge to use by default
+ typestr = bridge
+ flag multiple
+ arg_info = required_arg
+ arg_type = string
+ default_val = micoforia
+ [help]
+ Applies to all containers which do not specify their own network
+ interface(s) with --net. If this is given multiple times, containers
+ will be equipped with multiple interfaces.
+ [/help]
+ [option default-cgroup-dac]
+ summary = specify which device nodes containers may access/create by default
+ typestr = dacspec
+ flag multiple
+ arg_info = required_arg
+ arg_type = string
+ [help]
+ Applies to all containers which do not specify their own access
+ control lists. May be given multiple times. Each device access control
+ specifier must be of the form {allow|deny} <entry>, where <entry>
+ is a suitable device access control string for the devices.allow or
+ devices.deny file of the cgroup-v1 controller. Order matters.
+
+ If this option is not given, and the corresponding per-container
+ option is not given either, a reasonable default applies which allows
+ access to the most common character devices (/dev/zero, /dev/null,
+ /dev/urandom, etc.) but denies access to most other devices including
+ all block devices.
+
+ Example: allow c 1:5 rwm
+ [/help]
+ [option default-cpu-cores]
+ summary = Number of cores to use by default (zero means unlimited)
+ typestr = num
+ arg_info = required_arg
+ arg_type = uint32
+ default_val = 0
+ [help]
+ The limit is enforced by the cpu cgroup-v2 controller. Note that in
+ contrast to the cpuset controller of cgroup-v1 this controller does not
+ restrict the container to a set of admissible CPUs. Instead, it limits
+ the number of CPU cycles per time unit for the processes in the cgroup.
+ [/help]
+ [option default-memory-limit]
+ summary = Memory usage throttle limit (zero means no limit)
+ typestr = gigabytes
+ arg_info = required_arg
+ arg_type = uint32
+ default_val = 0
+ [help]
+ The value specified here is written to the cgroup-v2 memory.high
+ control file of all containers which do not specify their own limit.
+ [/help]
+ [option default-io-max]
+ summary = I/O limit (zero means no limit)
+ flag multiple
+ typestr = iospec
+ arg_info = required_arg
+ arg_type = string
+ [help]
+ The I/O specifier argument must be a valid string for the io.max file
+ of the cgroup-v2 controller. For example, the string "1:5 rbps=1024"
+ limits the read I/O rate for the /dev/zero device to 1K per second.
+ [/help]
+ [option default-capdrop]
+ summary = Capabilities to drop by default
+ typestr = capspec
+ flag multiple
+ arg_info = required_arg
+ arg_type = string
+ [help]
+ The capability specifier argument is the text representation of a
+ capability, like CAP_SYS_MODULE. All given capabilities will be dropped
+ from the bounding set of the container init process, hence from all
+ processes of the container. If this option is not given, and no
+ per-container capabilities to drop are given either, CAP_SYS_MODULE,
+ CAP_SYS_TIME, and CAP_SYS_RESOURCE are dropped.
+
+ See capabilities(7) for the list of capabilities and their meaning.
+ [/help]
+ [option default-tty]
+ summary = Minor number of a tty device to capture by default
+ typestr = minor
+ flag multiple
+ arg_info = required_arg
+ arg_type = uint32
+ [help]
+ Normally the container's init process starts at least one "getty"
+ login session on a tty port /dev/ttyX, where X is the minor device
+ ID. This option lets you capture these login sessions and forward them
+ to another micoforia process executing the "attach" subcommand. For
+ each time the option is given, the device with the given minor device
+ number is captured.
+
+ If this is not given, /dev/tty1 will be captured.
+ [/help]
+ [option general-options-section]
+ summary = Per-Container Options
+ flag ignored
+ [help]
+ These override the global container options above. Most of them take
+ a compound argument of the form <name:value>, where the first part
+ is the name of the container to which the option should be applied.
+
+ Unless noted otherwise, if both a global option and the corresponding
+ per-container option are given, the per-container option takes
+ precedence.
+ [/help]
+ [option container]
+ summary = name of the container
+ flag multiple
+ typestr = name
+ arg_info = required_arg
+ arg_type = string
+ [help]
+ Used for the hostname, the name of the veth interfaces and the name of
+ the cgroup directory. The name may only contain characters of the set
+ [a-zA-Z0-9-] and the length must not exceed 32 characters.
+
+ This does not need to be given if one of the compound options below
+ is given instead.
+ [/help]
+ [option pre-start-hook]
+ summary = See --default-pre-start-hook
+ flag multiple
+ typestr = name:command
+ arg_info = required_arg
+ arg_type = string
+ [option pre-exec-hook]
+ summary = See --default-pre-exec-hook
+ flag multiple
+ typestr = name:command
+ arg_info = required_arg
+ arg_type = string
+ [option init]
+ summary = See --default-init
+ typestr = name:command
+ flag multiple
+ arg_info = required_arg
+ arg_type = string
+ [option net]
+ summary = Equip the container with a non-default network interface
+ flag multiple
+ typestr = name:ifspec
+ arg_info = required_arg
+ arg_type = string
+ [help]
+ The interface specifier is of the form bridge[:hwaddr]. If no hardware
+ address is given, a random address will be used. See --default-bridge.
+
+ Unlike the other compound options of this section, this option is
+ cumulative in that multiple options with the same container name do
+ not override each other but accumulate, resulting in a container with
+ multiple network interfaces.
+ [/help]
+ [option root-directory]
+ summary = Path to the container root directory. See --default-root-prefix.
+ flag multiple
+ typestr = name:path
+ arg_info = required_arg
+ arg_type = string
+ [help]
+ [/help]
+ [option cgroup-dac]
+ summary = See --default-cgroup-dac
+ typestr = name:dacspec
+ flag multiple
+ arg_info = required_arg
+ arg_type = string
+ [option cpu-cores]
+ summary = See --default-cpu-cores
+ typestr = name:num
+ flag multiple
+ arg_info = required_arg
+ arg_type = string
+ [option memory-limit]
+ summary = See --default-memory-limit
+ typestr = name:gigabytes
+ flag multiple
+ arg_info = required_arg
+ arg_type = string
+ [option io-max]
+ summary = See --default-io-max
+ flag multiple
+ typestr = name:iospec
+ arg_info = required_arg
+ arg_type = string
+ [option capdrop]
+ summary = See --default-capdrop
+ flag multiple
+ typestr = name:capspec
+ arg_info = required_arg
+ arg_type = string
+ [option tty]
+ summary = See --default-tty
+ typestr = name:minor
+ flag multiple
+ arg_info = required_arg
+ arg_type = string
+
+[introduction]
+ micoforia supports the subcommands described below. If no subcommand
+ is given, the list of available subcommands is shown and the program
+ terminates successfully without performing any further action.
+[/introduction]
+
+[subcommand start]
+ purpose = start one or more containers
+ non-opts-name = [<name>...]
+ [description]
+ If no container is given, all configured containers are started.
+ [/description]
+ [option foreground]
+ short_opt = F
+ summary = do not run as background daemon
+ [help]
+ Normally, the process detaches from the console and continues to run
+ in the background. When this option is given, only a single container
+ can be started, and this container will run with its /dev/console
+ device redirected to the local tty, making the container startup
+ messages visible on the local tty.
+
+ Moreover, stdin is forwarded to the first configured tty device
+ (/dev/tty1 by default) of the container, and anything received from
+ the other end of the forwarding is dumped to stdout. This allows for
+ logins on the "local" console of the container, provided the container
+ starts a getty process which listens on the tty device.
+ [/help]
+[subcommand stop]
+ purpose = shutdown one or more containers
+ non-opts-name = [<name>...]
+ [description]
+ This subcommand works by executing halt(8) in container context.
+ If no container is given, halt(8) is executed in all configured
+ container contexts.
+ [/description]
+ [option wait]
+ short_opt = w
+ summary = wait until all containers have terminated
+ [help]
+ Without --wait the micoforia process which executes the stop
+ subcommand exits after spawning one halt(8) process per container
+ to be stopped. If --wait is given, the subcommand waits until all
+ containers have terminated or the timeout expires. This is handy for
+ system shutdown scripts which are supposed to terminate all running
+ containers.
+ [/help]
+ [closing]
+ If --wait is not given, the subcommand exits successfully if and only
+ if all signals were sent successfully. With --wait the subcommand
+ exits successfully if, additionally, all signalled processes have
+ terminated before the timeout expires.
+ [/closing]
+
+[subcommand reboot]
+ purpose = reboot containers
+ non-opts-name = [<name>...]
+ [description]
+ Containers are rebooted and killed by sending a signal to a micoforia
+ process which executes the start subcommand.
+ [/description]
+[subcommand kill]
+ purpose = force containers to terminate
+ non-opts-name = [<name>...]
+ [description]
+ This works like the reboot subcommand, but a different signal is used
+ to notify the container.
+ [/description]
+ [option wait]
+ short_opt = w
+ summary = wait until all signalled containers have terminated
+ [help]
+ Without --wait the micoforia process which executes the kill subcommand
+ exits right after the underlying kill(2) system call returns. At this
+ point the signalled process might still be alive although SIGKILL
+ was sent. If --wait is given, the process waits until the signalled
+ processes have terminated or the timeout expires.
+ [/help]
+[subcommand ls]
+ purpose = list containers
+ non-opts-name = [<name>...]
+ [description]
+ Several listing modes are available. By default, only the running
+ containers are listed. If no container name is given, all configured
+ containers are taken into account.
+
+ [/description]
+ [option all]
+ short_opt = a
+ summary = Also list containers which are not running
+ [option quiet]
+ short_opt = q
+ summary = Do not print any output
+ [help]
+ For scripts to determine from the exit code whether all of the given
+ containers are running.
+ [/help]
+ [option long]
+ short_opt = l
+ summary = Show also the pid, and the cpu and memory limits
+ [help]
+ This overrides --quiet. That is, if both --quiet and --long are given,
+ the long listing is shown.
+ [/help]
+ [option verbose]
+ short_opt = v
+ summary = Show all container settings, one setting per line
+ [help]
+ This overrides --quiet and --long.
+ [/help]
+ [closing]
+ The subcommand exits successfully if and only if all given/configured
+ containers could be listed. Unless --all is given, it is considered
+ an error if a given container is not running. In particular, when ls
+ is executed with no arguments at all, it exits successfully if and
+ only if all configured containers are running.
+ [/closing]
+[subcommand ps]
+ purpose = print process list of one or more containers
+ non-opts-name = [<name>...]
+ [description]
+ This runs pstree(1). The container init process is always the third
+ process shown. Process IDs refer to the parent PID namespace, which
+ is why the process ID of the container init is not shown as 1.
+ [/description]
+ [option all]
+ short_opt = a
+ summary = also show the two micoforia processes
+[subcommand attach]
+ purpose = map the console of a running container to the local terminal.
+ non-opts-name = <name>
+ [description]
+ It is an error if stdin or stdout is not associated with a terminal device.
+ [/description]
+ [option tty]
+ short_opt = t
+ summary = terminal to connect
+ arg_info = required_arg
+ arg_type = uint32
+ typestr = minor
+ default_val = 1
+ [help]
+ This operation can only succeed if the given tty is forwarded by the
+ container. See --tty and --default-tty.
+ [/help]
+ [option force]
+ short_opt = f
+ summary = don't fail but steal the tty if it is already attached
+[subcommand help]
+ purpose = list available subcommands or print subcommand-specific help
+ non-opts-name = [subcommand]
+ [description]
+ Without any arguments, help prints the list of available
+ subcommands. When called with a subcommand name argument, it prints
+ the help text of the given subcommand.
+ [/description]
+ [option long]
+ short_opt = l
+ summary = show the long help text
+ [help]
+ If the optional argument is supplied, the long help text contains the
+ synopsis, the purpose and the description of the specified subcommand,
+ followed by the option list including summary and help text of each
+ option. Without --long, the short help is shown instead. This omits
+ the description of the subcommand and the option help.
+
+ If no subcommand is supplied but --long is given, the list contains the
+ purpose of each subcommand.
+ [/help]
+
+[subcommand configtest]
+ purpose = run a configuration file syntax test
+ [description]
+ This subcommand checks the command line options and the configuration
+ file for syntactic and semantic correctness. It either reports
+ "Syntax Ok" and exits successfully or prints information about the
+ first error and terminates with exit code 1.
+ [/description]
+
+[subcommand edit]
+ purpose = edit the configuration file
+ [description]
+ The editor to start is derived from the EDITOR environment variable.
+ If this variable is not set, vi is assumed.
+ [/description]
+
+[subcommand enter]
+ purpose = run a command in a container namespace
+ non-opts-name = <name> [<command> [arg...]]
+ [description]
+ This executes the nsenter(1) command to enter the namespaces of
+ the init process of the given container. If no command is given,
+ the login command is run to start a root shell.
+ [/description]
+
+[subcommand log]
+ purpose = show the log file for the given container
+ non-opts-name = <name>
+ [description]
+ This executes cat(1) or less(1), depending on whether or not stdin
+ and stdout are associated with a terminal device.
+ [/description]
+[section Notes]
+.SS The Cgroup File Systems
+ There are two implementations of Linux control groups called
+ .I cgroup-v1
+ and
+ .IR cgroup-v2 .
+ Both come with their own pseudo filesystem.
+ .B micoforia
+ requires both file systems to be mounted at
+ .IR /var/cgroup
+ and
+ .IR /var/cgroup2 .
+ Version 1 cgroups are only used to enforce device access control for
+ the containers, so the cgroup-v1 pseudo filesystem should be mounted
+ with only this controller enabled. See the Examples section below
+ for how to do this. Future versions of
+ .B micoforia
+ might switch to the devices controller of cgroup-v2.
+.SS Container Names
+ The container name is used also for the name of the network device
+ and as a directory name if no explicit root directory is given with
+ --root-directory. Therefore container names must not exceed 32 characters,
+ which must all be alphanumeric or '-'. In particular, whitespace and
+ underscore ('_') are not permitted.
+
+[/section]
+[section Examples]
+ .IP \(bu 2
+ Create a bash alias named
+ .I m7a
+ for
+ .I micoforia
+ which activates debug messages and already includes the double dash
+ to separate global options from subcommand options:
+
+ .RS 6
+ .EX
+ .B alias m7a='micoforia --loglevel debug --'
+ .EE
+ .RE
+ .IP \(bu 2
+ Set up an ethernet bridge named
+ .IR micoforia ,
+ add the physical interface
+ .I eth1
+ to it and give the bridge interface an IP address:
+
+ .RS 6
+ .EX
+ .B brctl addbr micoforia
+ .B ip link set up micoforia
+ .B brctl addif micoforia eth1
+ .B ip a a 192.168.137.1/24 dev micoforia
+ .EE
+ .RE
+ .IP \(bu 2
+ Mount the two cgroup file systems, but only activate the
+ .I devices
+ controller of cgroup-v1:
+
+ .RS 6
+ .EX
+ .B mkdir -p /var/cgroup && mount -t cgroup -o devices cgroup /var/cgroup
+ .B mkdir -p /var/cgroup2 && mount -t cgroup2 cgroup2 /var/cgroup2
+ .EE
+ .RE
+ .IP \(bu 2
+ Entries for
+ .I /etc/fstab
+ to mount the cgroup file systems automatically at boot:
+
+ .RS 6
+ .EX
+ .B none /var/cgroup cgroup devices 0 0
+ .B none /var/cgroup2 cgroup2 defaults 0 0
+ .EE
+ .RE
+ .IP \(bu 2
+ Download a Debian10 root file system to
+ .IR /var/lib/micoforia/debian10 ,
+ set the root password and let micoforia set the hostname
+
+ .RS 6
+ .EX
+ .B debootstrap --variant=minbase buster /var/lib/micoforia/debian10 http://deb.debian.org/debian/
+ .B chroot /var/lib/micoforia/debian10 passwd
+ .B rm -f /var/lib/micoforia/debian10/etc/hostname
+ .EE
+ .RE
+ .IP \(bu 2
+ Download a minimal Ubuntu-18.04 root file system to
+ .IR /var/lib/micoforia/c1 ,
+ set the root password and configure the
+ .I eth0
+ interface, using a static IP address:
+
+ .RS 6
+ .EX
+ .B debootstrap --include openssh-server --include ifupdown bionic /var/lib/micoforia/c1 http://de.archive.ubuntu.com/ubuntu
+ .B chroot /var/lib/micoforia/c1 passwd
+ .B printf 'auto eth0\(rsniface eth0 inet static\(rsnaddress 192.168.137.2/24\(rsn' \
+ >> /var/lib/micoforia/c1/etc/network/interfaces
+ .B echo 'PermitRootLogin yes' >> /var/lib/micoforia/c1/etc/ssh/sshd_config
+ .EE
+ .RE
+ .IP \(bu 2
+ Start the container in foreground mode:
+
+ .RS 6
+ .EX
+ .B micoforia --container c1 -- start --foreground
+ .EE
+ .RE
+ .IP \(bu 2
+ Attach to
+ .I tty1
+ of the running container:
+
+ .RS 6
+ .EX
+ .B m7a attach c1
+ .EE
+ .RE
+ .IP \(bu 2
+ Ask the container to shut down, and wait for the shutdown procedure
+ to complete:
+
+ .RS 6
+ .EX
+ .B m7a stop --wait c1
+ .EE
+ .RE
+ .IP \(bu 2
+ Check whether the container is running:
+
+ .RS 6
+ .EX
+ .B m7a ls --quiet c1 && echo yes || echo no
+ .EE
+ .RE
+ .IP \(bu 2
+ A simple config file:
+
+ .RS 6
+ .EX
+ .B # two global options
+ .B loglevel info
+ .B container c1
+ .B # an option for the "attach" subcommand
+ .B [attach]
+ .B \ \ \ \ tty 2
+ .EE
+ .RE
+
+[/section]
+[section copyright]
+ Written by AUTHOR()
+ .br
+ Copyright (C) COPYRIGHT_YEAR() AUTHOR()
+ .br
+ License: LICENSE()
+ .br
+ This is free software: you are free to change and redistribute it.
+ .br
+ There is NO WARRANTY, to the extent permitted by law.
+ .P
+ Web page:
+ .UR URL()
+ .UE
+ .br
+ Git clone `URL':
+ .UR CLONE_URL()
+ .UE
+ .br
+ Gitweb:
+ .UR GITWEB_URL()
+ .UE
+ .br
+ Author's home page:
+ .UR HOME_URL()
+ .UE
+ .br
+ Report bugs to
+ .MT EMAIL()
+ AUTHOR()
+ .ME
+[/section]
+[section see also]
+ .BR lxc (7),
+ .BR brctl (8),
+ .BR ip (8),
+ .BR pstree (1)
+[/section]
--- /dev/null
+<svg
+ xmlns="http://www.w3.org/2000/svg"
+ xmlns:xlink="http://www.w3.org/1999/xlink"
+ width="90"
+ height="70"
+>
+ <g stroke-width="3" stroke="black" fill="none">
+ <path d="
+ M 5 5
+ l 25 25
+ l 0 30
+ c 10 5 20 5 30 0
+ l 0 -30
+ l 25 -25
+ l -27 20
+ l 0 -10
+ c -8 -13 -16 -13 -24 0
+ l 0 10
+ z
+ "
+ />
+ </g>
+ <ellipse cx="46" cy="22" rx="3" ry="2" fill="none" stroke="black" />
+ <ellipse cx="40" cy="15" rx="2" ry="2" />
+ <ellipse cx="52" cy="15" rx="2" ry="2" />
+</svg>
--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0-only */
+#include "m7a.h"
+
+#include <sys/ipc.h>
+#include <sys/sem.h>
+#include <fcntl.h>
+#include <ctype.h>
+#include <sys/mount.h>
+#include <dirent.h>
+#include <net/if.h>
+#include <linux/sockios.h>
+#include <libmnl/libmnl.h>
+#include <linux/if_link.h>
+#include <linux/rtnetlink.h>
+#include <sys/un.h>
+
+/*
+ * Format the given message, log it with severity LL_EMERG and terminate
+ * the process with EXIT_FAILURE. If the message cannot be formatted,
+ * "OOM" is logged instead.
+ */
+void die(const char *fmt, ...)
+{
+	va_list ap;
+	char *text;
+	int len;
+
+	va_start(ap, fmt);
+	len = vasprintf(&text, fmt, ap);
+	va_end(ap);
+	if (len >= 0)
+		m7a_log(LL_EMERG, "%s\n", text);
+	else /* give up */
+		EMERG_LOG("OOM\n");
+	exit(EXIT_FAILURE);
+}
+
+/*
+ * Like die(), but append a colon and the error string which corresponds
+ * to the value errno had when the function was called.
+ */
+void die_errno(const char *fmt, ...)
+{
+	/* Save errno first, the calls below may change it. */
+	int len, err = errno;
+	va_list ap;
+	char *text;
+
+	va_start(ap, fmt);
+	len = vasprintf(&text, fmt, ap);
+	va_end(ap);
+	if (len >= 0)
+		m7a_log(LL_EMERG, "%s: %s\n", text, strerror(err));
+	else
+		EMERG_LOG("OOM\n");
+	exit(EXIT_FAILURE);
+}
+
+/*
+ * realloc() which never returns NULL.
+ *
+ * The size must be positive. If the allocation fails, "OOM" is logged
+ * and the process is aborted.
+ *
+ * The call to realloc() must not be placed inside assert() because the
+ * whole expression, including the allocation, is compiled out when
+ * NDEBUG is defined.
+ */
+void *xrealloc(void *p, size_t size)
+{
+	void *q;
+
+	assert(size > 0);
+	q = realloc(p, size);
+	if (!q) {
+		EMERG_LOG("OOM\n");
+		abort();
+	}
+	return q;
+}
+
+/* Allocate size bytes through xrealloc(). The memory is not initialized. */
+void *xmalloc(size_t size)
+{
+	void *p = xrealloc(NULL, size);
+
+	return p;
+}
+
+/* Allocate size bytes through xrealloc() and fill them with zeros. */
+void *xzmalloc(size_t size)
+{
+	return memset(xrealloc(NULL, size), 0, size);
+}
+
+/*
+ * Return a copy of the given string which must be freed by the caller.
+ * A NULL pointer is treated like the empty string. The result of
+ * strdup() is checked with assert().
+ */
+void *xstrdup(const char *s)
+{
+	char *copy;
+
+	if (!s)
+		s = "";
+	copy = strdup(s);
+	assert(copy);
+	return copy;
+}
+
+/*
+ * Return a newly allocated string which contains the formatted message.
+ * The caller must free the result.
+ *
+ * The message is first printed into a buffer of 100 bytes. If this is
+ * too small, the buffer is enlarged to exactly the required size and
+ * the message is printed again.
+ *
+ * A negative return value of vsnprintf() indicates an output error.
+ * This case is checked explicitly because comparing the negative int
+ * against the unsigned buffer size would silently convert it to a huge
+ * positive number.
+ */
+char *msg(const char *fmt, ...)
+{
+	size_t size = 100;
+	char *m = xmalloc(size);
+
+	for (;;) {
+		int n;
+		va_list ap;
+
+		/* Try to print in the allocated space. */
+		va_start(ap, fmt);
+		n = vsnprintf(m, size, fmt, ap);
+		va_end(ap);
+		if (n < 0)
+			die("could not format message: %s", fmt);
+		/* If that worked, return the string. */
+		if ((size_t)n < size)
+			return m;
+		/* Else try again with more space. */
+		size = (size_t)n + 1; /* precisely what is needed */
+		m = xrealloc(m, size);
+	}
+}
+
+/*
+ * Append b to a and return the result.
+ *
+ * If both strings are given, a new string is allocated and a is freed.
+ * If a is NULL, a copy of b is returned (the empty string if b is NULL
+ * as well). If only b is NULL, a itself is returned.
+ */
+char *xstrcat(char *a, const char *b)
+{
+	char *joined;
+
+	if (a && b) {
+		joined = msg("%s%s", a, b);
+		free(a);
+		return joined;
+	}
+	return a? a : xstrdup(b);
+}
+
+/* Terminate via die(), complaining that the argument to --opt is empty. */
+void die_empty_arg(const char *opt)
+{
+	die("argument to --%s must not be empty", opt);
+}
+
+/* Terminate via die(), complaining that the argument to --opt is out of range. */
+__attribute__ ((noreturn))
+static void die_range(const char *opt)
+{
+	die("argument to --%s is out of range", opt);
+}
+
+/*
+ * Do nothing if val is contained in the closed interval [min, max].
+ * Otherwise terminate with an error message which mentions the option.
+ */
+void check_range(uint32_t val, uint32_t min, uint32_t max, const char *opt)
+{
+	if (min <= val && val <= max)
+		return;
+	die_range(opt);
+}
+
+/*
+ * Read from fd until end of file and store the data as a zero-terminated
+ * string in the buffer described by iov.
+ *
+ * The buffer must be larger than one byte. One byte is reserved for the
+ * terminating zero. Reads which fail with EAGAIN or EINTR are retried.
+ *
+ * Returns true on success. Returns false on read errors and if the
+ * number of bytes read reaches the capacity of the buffer. In the latter
+ * case the output is considered truncated.
+ */
+bool fd2buf(int fd, const struct iovec *iov)
+{
+	char *buf = iov->iov_base;
+	ssize_t got, total = 0, cap;
+
+	assert(iov->iov_len > 1);
+	cap = iov->iov_len - 1;
+	while (1) {
+		got = read(fd, buf + total, cap - total);
+		if (got > 0) {
+			total += got;
+			if (total < cap)
+				continue;
+			ERROR_LOG("cmd output truncated\n");
+			return false;
+		}
+		if (got == 0) { /* end of file */
+			buf[total] = '\0';
+			DEBUG_LOG("read %zd bytes\n", total);
+			return true;
+		}
+		if (errno == EAGAIN || errno == EINTR)
+			continue;
+		ERROR_LOG("read error: %s\n", strerror(errno));
+		return false;
+	}
+}
+
+/*
+ * Run a command in a child process and wait for it to terminate.
+ *
+ * argv is the NULL-terminated argument vector, passed to execvp(). If iov is
+ * not NULL, the standard output of the command is redirected to a pipe and
+ * stored via fd2buf() in the buffer described by iov.
+ *
+ * Returns true if the output (if requested) could be captured and the child
+ * exited normally with status EXIT_SUCCESS. Failure to create the pipe, to
+ * fork, or to wait for the child is fatal.
+ */
+bool xexec(char * const argv[], const struct iovec *iov)
+{
+	pid_t pid;
+	int pipefd[2] = {-1, -1};
+	unsigned n;
+
+	for (n = 0; argv[n]; n++)
+		DEBUG_LOG("argv[%u]=%s\n", n, argv[n]);
+	if (iov && pipe(pipefd) < 0)
+		die_errno("pipe");
+	if ((pid = fork()) < 0)
+		die_errno("fork");
+	if (pid > 0) { /* parent */
+		int wstatus;
+		bool success = true;
+
+		if (iov) {
+			close(pipefd[1]);
+			success = fd2buf(pipefd[0], iov);
+			close(pipefd[0]);
+		}
+		/* An interrupted wait is not an error, just try again. */
+		while (waitpid(pid, &wstatus, 0) < 0) {
+			if (errno != EINTR)
+				die_errno("waitpid");
+		}
+		return success && WIFEXITED(wstatus)
+			&& WEXITSTATUS(wstatus) == EXIT_SUCCESS;
+	}
+	/* child */
+	if (pipefd[0] >= 0)
+		close(pipefd[0]);
+	if (pipefd[1] >= 0 && pipefd[1] != STDOUT_FILENO) {
+		if (dup2(pipefd[1], STDOUT_FILENO) < 0) {
+			/*
+			 * Use _exit() rather than die_errno() here: exit()
+			 * would flush stdio buffers inherited from the
+			 * parent and run its atexit handlers.
+			 */
+			EMERG_LOG("dup2(): %s\n", strerror(errno));
+			_exit(EXIT_FAILURE);
+		}
+		close(pipefd[1]);
+	}
+	execvp(argv[0], argv);
+	EMERG_LOG("execvp error: %s\n", strerror(errno));
+	_exit(EXIT_FAILURE);
+}
+
+/*
+ * Ensure that file descriptors 0, 1, and 2 are valid.
+ *
+ * /dev/null is opened repeatedly until the returned descriptor is greater
+ * than two. That last descriptor is closed again, the others stay open.
+ */
+void valid_fd012(void)
+{
+	int fd;
+
+	do {
+		fd = open("/dev/null", O_RDWR);
+		if (fd < 0)
+			die_errno("open");
+	} while (fd <= 2);
+	close(fd);
+}
+
+/*
+ * Terminate unless arg is a valid name.
+ *
+ * A valid name is between 1 and 32 characters long and consists only of
+ * ASCII letters, digits, and the dash character.
+ */
+void check_name(const char *arg)
+{
+	const size_t len = strlen(arg);
+	size_t pos;
+
+	if (len == 0)
+		die("empty name");
+	if (len > 32)
+		die("name too long: %s", arg);
+	for (pos = 0; pos < len; pos++) {
+		char ch = arg[pos];
+		/* isalnum() is only consulted for ASCII characters */
+		bool ok = isascii(ch) && (isalnum(ch) || ch == '-');
+
+		if (!ok)
+			die("invalid character '%c' in name %s", ch, arg);
+	}
+}
+
+/*
+ * Split an argument of the form "name:value" at the first colon.
+ *
+ * The part before the colon must pass check_name(). On return *name and *val
+ * point to two newly allocated strings that should be freed by the caller.
+ * An empty argument, a missing colon, or an invalid name is fatal; opt is
+ * only used for the error message.
+ */
+void parse_compound_arg(const char *arg, const char *opt, char **name, char **val)
+{
+	char *dup, *sep;
+
+	if (*arg == '\0')
+		die_empty_arg(opt);
+	dup = xstrdup(arg);
+	sep = strchr(dup, ':');
+	if (sep == NULL)
+		die("could not parse argument to --%s", opt);
+	*sep = '\0'; /* dup now ends where the name ends */
+	check_name(dup);
+	*val = xstrdup(sep + 1);
+	*name = dup;
+}
+
+/*
+ * Translate a cgroup access specifier which starts with "allow " or "deny "
+ * to a newly allocated string in which that keyword (including the space) is
+ * replaced by the single character 'a' or 'd', respectively. Any other
+ * argument is fatal.
+ */
+char *parse_cgroup_acl(const char *arg)
+{
+	static const char allow[] = "allow ", deny[] = "deny ";
+
+	if (strncmp(arg, allow, sizeof(allow) - 1) == 0)
+		return msg("a%s", arg + sizeof(allow) - 1);
+	if (strncmp(arg, deny, sizeof(deny) - 1) == 0)
+		return msg("d%s", arg + sizeof(deny) - 1);
+	die("invalid cgroup access specifier: %s", arg);
+}
+
+/*
+ * Parse an interface specifier of the form "bridge" or
+ * "bridge:xx:xx:xx:xx:xx:xx".
+ *
+ * The bridge name is stored as a newly allocated string in *bridge and must
+ * pass check_name(). The six bytes of the hardware address are stored in
+ * hwaddr. If no address was given, hwaddr is filled with zeros. Malformed
+ * specifiers are fatal.
+ */
+void parse_ifspec(const char *arg, char **bridge, uint8_t *hwaddr)
+{
+	const char *colon = strchr(arg, ':');
+	const size_t hwlen = 6 * 2 + 5; /* strlen("xx:xx:xx:xx:xx:xx") */
+	size_t len;
+	unsigned n, x[6];
+
+	if (colon) {
+		len = colon - arg;
+		*bridge = xmalloc(len + 1);
+		memcpy(*bridge, arg, len);
+		(*bridge)[len] = '\0';
+	} else
+		*bridge = xstrdup(arg);
+	check_name(*bridge);
+	if (!colon) {
+		memset(hwaddr, 0, 6);
+		return;
+	}
+	if (sscanf(colon + 1, "%02x:%02x:%02x:%02x:%02x:%02x",
+			x, x + 1, x + 2, x + 3, x + 4, x + 5) != 6)
+		die("invalid hwaddress for ifspec %s", arg);
+	/*
+	 * sscanf() also accepts single-digit bytes, so the address may be
+	 * shorter than hwlen. Check the length explicitly rather than looking
+	 * at a fixed offset, which might be located beyond the end of the
+	 * string.
+	 */
+	len = strlen(colon + 1);
+	if (len < hwlen)
+		die("invalid hwaddress for ifspec %s", arg);
+	if (len > hwlen)
+		die("trailing garbage at the end of ifspec %s", arg);
+	for (n = 0; n < 6; n++)
+		hwaddr[n] = x[n];
+}
+
+/*
+ * Convert a decimal string to an unsigned 32 bit integer.
+ *
+ * Terminates with a range error if the value is negative or does not fit
+ * into 32 bits, with an "empty argument" error if no conversion could be
+ * performed, and with a "trailing characters" error if the number is
+ * followed by anything else. opt is only used for the error messages.
+ */
+uint32_t atou32(const char *str, const char *opt)
+{
+	char *end;
+	long long num;
+	bool overflow, no_conversion;
+
+	errno = 0; /* To distinguish success/failure after call */
+	num = strtoll(str, &end, 10);
+	overflow = errno == ERANGE && (num == LLONG_MAX || num == LLONG_MIN);
+	if (overflow || num < 0 || num > (uint32_t)-1)
+		die_range(opt);
+	/*
+	 * If there were no digits at all, strtoll() stores the original value
+	 * of str in *endptr. The implementation may also set errno and return
+	 * 0 in this case.
+	 */
+	no_conversion = end == str || (errno != 0 && num == 0);
+	if (no_conversion)
+		die_empty_arg(opt);
+	if (*end != '\0') /* Further characters after number */
+		die("--%s: trailing characters after number", opt);
+	return num;
+}
+
+/*
+ * Remove all directories below path, depth first. The directory path itself
+ * and all non-directory entries are left alone.
+ *
+ * Entries which can not be examined are skipped with a warning. Symbolic
+ * links are never followed, so nothing outside of the tree is touched.
+ *
+ * Returns false if path could not be opened or if a subdirectory could not
+ * be removed. In the latter case the function stops at the first failure.
+ */
+bool remove_subdirs_recursively(const char *path)
+{
+	DIR *d = opendir(path);
+	struct dirent *entry;
+	int dfd;
+	bool success = true;
+
+	if (!d) {
+		ERROR_LOG("opendir %s: %m\n", path);
+		return false;
+	}
+	dfd = dirfd(d);
+	assert(dfd >= 0);
+	while (success && (entry = readdir(d))) {
+		struct stat st;
+		char *subpath;
+
+		if (!strcmp(entry->d_name, "."))
+			continue;
+		if (!strcmp(entry->d_name, ".."))
+			continue;
+		/* A symlink to a directory is not a subdirectory. */
+		if (fstatat(dfd, entry->d_name, &st, AT_SYMLINK_NOFOLLOW) == -1) {
+			WARNING_LOG("%s/%s: %m\n", path, entry->d_name);
+			continue;
+		}
+		if (!S_ISDIR(st.st_mode))
+			continue;
+		subpath = msg("%s/%s", path, entry->d_name);
+		/*
+		 * The return value is not checked because rmdir() fails
+		 * anyway if the subdirectory could not be emptied.
+		 */
+		remove_subdirs_recursively(subpath);
+		DEBUG_LOG("removing %s\n", subpath);
+		if (rmdir(subpath) < 0) {
+			ERROR_LOG("rmdir %s: %m\n", subpath);
+			success = false; /* leave the loop, but clean up */
+		}
+		free(subpath);
+	}
+	closedir(d);
+	return success;
+}
+
+/*
+ * Detach from the calling process and redirect the standard streams.
+ *
+ * The parent exits successfully right after the fork. The child makes sure
+ * that fds 0, 1 and 2 are valid, becomes a session leader, redirects stdin
+ * to /dev/null and both stdout and stderr to logfile (opened in append mode,
+ * /dev/null if logfile is NULL), and finally changes to the root directory.
+ * All errors are fatal.
+ */
+void daemonize(const char *logfile)
+{
+	const char *target = logfile? logfile : "/dev/null";
+	int nullfd, logfd;
+	pid_t child = fork();
+
+	if (child < 0)
+		die_errno("fork");
+	if (child != 0) /* parent exits */
+		exit(EXIT_SUCCESS);
+	valid_fd012();
+	/* become session leader */
+	if (setsid() < 0)
+		die_errno("setsid");
+	nullfd = open("/dev/null", O_RDWR);
+	if (nullfd < 0)
+		die_errno("open /dev/null");
+	logfd = open(target, O_WRONLY | O_APPEND | O_CREAT, 0666);
+	if (logfd < 0)
+		die_errno("open %s", target);
+	/* the last message which is written before the redirection */
+	NOTICE_LOG("subsequent log messages go to %s\n", target);
+	if (dup2(nullfd, STDIN_FILENO) < 0)
+		die_errno("dup2");
+	close(nullfd);
+	if (dup2(logfd, STDOUT_FILENO) < 0)
+		die_errno("dup2");
+	if (dup2(logfd, STDERR_FILENO) < 0)
+		die_errno("dup2");
+	close(logfd);
+	if (chdir("/") < 0)
+		die_errno("chdir");
+}
+
+/*
+ * Compute a non-negative integer hash value of a null-terminated string.
+ *
+ * Four rounds are performed, each contributing the low byte of its result to
+ * the 32 bit value. The final right shift clears the most significant bit so
+ * that the return value is never negative.
+ */
+static int super_dull_hash(const char *input)
+{
+	const uint8_t *x = (typeof(x))input;
+	const unsigned p1 = 16777619, p2 = 2971215073;
+	unsigned n, m, h, result = 0;
+
+	for (n = 0; n < 4; n++) {
+		h = p1 * (x[0] + n);
+		/*
+		 * The loop starts at the second character. For the empty
+		 * string this would already be beyond the terminating null
+		 * byte, hence the additional check of x[0].
+		 */
+		for (m = 1; x[0] != 0 && x[m] != 0; m++)
+			h = p2 * (h ^ x[m]);
+		result = (result << 8) | (h % 256);
+	}
+	return result >> 1;
+}
+
+/**
+ * We use a semaphore set with two semaphores. The first semaphore is modified
+ * in all locking related functions while the second semaphore is modified only
+ * in try_lock() and acquire_lock(). This allows us to obtain the PID of the
+ * lock holder by querying the PID that last performed an operation on the
+ * second semaphore. This is achieved by passing GETPID as the control
+ * operation to semctl().
+ */
+
+/*
+ * Common implementation of try_lock() and acquire_lock().
+ *
+ * The key of the semaphore set is the hash value of string. The set is
+ * created if it does not exist. If pid is not NULL, the PID reported by
+ * GETPID for the second semaphore is stored there before the lock is
+ * attempted. The lock is taken by a single semop() call which, for each of
+ * the two semaphores, waits until the value is zero and then increments it.
+ * If wait is false, IPC_NOWAIT is set so that the call fails rather than
+ * blocks.
+ *
+ * Returns true if the lock was obtained.
+ */
+static bool get_lock(const char *string, pid_t *pid, bool wait)
+{
+	const key_t key = super_dull_hash(string);
+	const short flags = wait? SEM_UNDO : SEM_UNDO | IPC_NOWAIT;
+	struct sembuf sops[4];
+	int semid, holder, i;
+
+	semid = semget(key, 2, IPC_CREAT | 0600);
+	if (semid < 0) {
+		ERROR_LOG("semget: %m\n");
+		return false;
+	}
+	DEBUG_LOG("key: 0x%0x, semid: %d\n", (unsigned)key, semid);
+	holder = semctl(semid, 1, GETPID);
+	if (holder < 0)
+		return false;
+	if (pid)
+		*pid = holder;
+	/*
+	 * Operations 0 and 1 act on the first semaphore, 2 and 3 on the
+	 * second. Even operations wait for zero, odd operations increment.
+	 */
+	for (i = 0; i < 4; i++) {
+		sops[i].sem_num = i / 2;
+		sops[i].sem_op = i % 2;
+		sops[i].sem_flg = flags;
+	}
+	if (semop(semid, sops, 4) < 0) {
+		INFO_LOG("semop: %m\n");
+		return false;
+	}
+	return true;
+}
+
+/*
+ * Attempt to obtain the lock identified by string without blocking. See
+ * get_lock() for the meaning of pid and the return value.
+ */
+bool try_lock(const char *string, pid_t *pid)
+{
+	const bool wait = false;
+
+	return get_lock(string, pid, wait);
+}
+
+/*
+ * Obtain the lock identified by string, waiting if necessary. The PID
+ * reported by get_lock() is not needed here.
+ */
+bool acquire_lock(const char *string)
+{
+	const bool wait = true;
+
+	return get_lock(string, NULL, wait);
+}
+
+/*
+ * Release the lock identified by string by decrementing both semaphores of
+ * the set in a single semop() call. The set is created if it does not exist.
+ *
+ * Returns true if the operation succeeded.
+ */
+bool release_lock(const char *string)
+{
+	struct sembuf sops[2] = {
+		{.sem_num = 0, .sem_op = -1, .sem_flg = SEM_UNDO},
+		{.sem_num = 1, .sem_op = -1, .sem_flg = SEM_UNDO},
+	};
+	const key_t key = super_dull_hash(string);
+	int semid = semget(key, 2, IPC_CREAT | 0600);
+
+	if (semid < 0) {
+		ERROR_LOG("semget: %m\n");
+		return false;
+	}
+	DEBUG_LOG("key: 0x%0x, semid: %d\n", (unsigned)key, semid);
+	if (semop(semid, sops, 2) < 0) {
+		INFO_LOG("semop: %m\n");
+		return false;
+	}
+	return true;
+}
+
+/*
+ * Check whether the lock identified by string is currently held.
+ *
+ * The check is a non-blocking wait-for-zero operation on the first
+ * semaphore. If it succeeds, or if the semaphore set does not exist, the
+ * lock is considered free. Otherwise, if pid is not NULL, the PID reported
+ * by GETPID for the second semaphore is stored there.
+ *
+ * Whenever false is returned and pid is not NULL, *pid is zero.
+ */
+bool is_locked(const char *string, pid_t *pid)
+{
+	const key_t key = super_dull_hash(string);
+	struct sembuf probe;
+	int semid, holder;
+
+	if (pid)
+		*pid = 0;
+	semid = semget(key, 2, 0);
+	if (semid < 0)
+		return false;
+	DEBUG_LOG("key: 0x%0x, semid: %d\n", (unsigned)key, semid);
+	probe.sem_num = 0;
+	probe.sem_op = 0;
+	probe.sem_flg = SEM_UNDO | IPC_NOWAIT;
+	if (semop(semid, &probe, 1) >= 0)
+		return false; /* value is zero: nobody holds the lock */
+	holder = semctl(semid, 1, GETPID);
+	if (holder < 0)
+		return false;
+	if (pid)
+		*pid = holder;
+	return true;
+}
+
+/*
+ * Add the network interface iface to the given bridge by means of the
+ * SIOCBRADDIF ioctl. Returns true on success.
+ */
+bool attach_to_bridge(const char *iface, const char *bridge)
+{
+	struct ifreq ifr;
+	int idx, sock;
+	bool added;
+
+	INFO_LOG("adding interface %s to bridge %s\n", iface, bridge);
+	idx = if_nametoindex(iface);
+	if (idx == 0) {
+		ERROR_LOG("no index for %s\n", iface);
+		return false;
+	}
+	/* the socket is only needed as a handle for the ioctl */
+	sock = socket(AF_INET, SOCK_STREAM, 0);
+	if (sock < 0) {
+		ERROR_LOG("socket: %m\n");
+		return false;
+	}
+	strncpy(ifr.ifr_name, bridge, IFNAMSIZ - 1);
+	ifr.ifr_name[IFNAMSIZ - 1] = '\0';
+	ifr.ifr_ifindex = idx;
+	added = ioctl(sock, SIOCBRADDIF, &ifr) == 0;
+	if (!added)
+		ERROR_LOG("interface %s, bridge %s: ioctl SIOCBRADDIF: %m\n",
+			iface, bridge);
+	close(sock);
+	return added;
+}
+
+
+/* The address right behind the (aligned) end of a netlink message. */
+#define NLMSG_TAIL(nmsg) \
+	((struct rtattr *) (((void *) (nmsg)) + NLMSG_ALIGN((nmsg)->nlmsg_len)))
+
+/*
+ * Append a routing attribute of the given type to a netlink message.
+ *
+ * alen bytes are copied from data into the payload of the new attribute, and
+ * the length field of the message is increased accordingly. There is no
+ * check that the buffer which contains the message is large enough.
+ */
+static void addattr_l(struct nlmsghdr *nlh, int type, const void *data,
+		int alen)
+{
+	struct rtattr *attr = NLMSG_TAIL(nlh);
+	const int attr_len = RTA_LENGTH(alen);
+
+	attr->rta_len = attr_len;
+	attr->rta_type = type;
+	if (alen > 0)
+		memcpy(RTA_DATA(attr), data, alen);
+	nlh->nlmsg_len = NLMSG_ALIGN(nlh->nlmsg_len) + RTA_ALIGN(attr_len);
+}
+
+/*
+ * Start a nested attribute: append an attribute of the given type with empty
+ * payload and return a pointer to it. The pointer is supposed to be passed
+ * to end_nest() once all inner attributes have been added.
+ */
+static struct rtattr *addattr_nest(struct nlmsghdr *n, int type)
+{
+	struct rtattr *start = NLMSG_TAIL(n);
+
+	addattr_l(n, type, NULL, 0);
+	return start;
+}
+
+/*
+ * Close a nested attribute by setting its length to the distance between
+ * its header and the current end of the message.
+ */
+static void end_nest(struct nlmsghdr *nlh, struct rtattr *attr)
+{
+	void *tail = NLMSG_TAIL(nlh);
+
+	attr->rta_len = tail - (void *)attr;
+}
+
+/*
+ * Open a NETLINK_ROUTE socket and bind it with an automatically chosen port
+ * ID. Returns NULL on errors, in which case no socket is left open.
+ */
+static struct mnl_socket *get_and_bind_netlink_socket(void)
+{
+	struct mnl_socket *sock = mnl_socket_open(NETLINK_ROUTE);
+
+	if (sock == NULL) {
+		ERROR_LOG("mnl_socket_open error\n");
+		return NULL;
+	}
+	if (mnl_socket_bind(sock, 0, MNL_SOCKET_AUTOPID) >= 0)
+		return sock;
+	ERROR_LOG("mnl_socket_bind\n");
+	mnl_socket_close(sock);
+	return NULL;
+}
+
+/*
+ * Place a netlink message header at the start of buf. The header is marked
+ * as a request and the current time serves as the sequence number.
+ */
+static struct nlmsghdr *prepare_netlink_msg_header(char *buf)
+{
+	struct nlmsghdr *hdr;
+
+	hdr = mnl_nlmsg_put_header(buf);
+	hdr->nlmsg_seq = time(NULL);
+	hdr->nlmsg_flags = NLM_F_REQUEST;
+	return hdr;
+}
+
+/*
+ * Rename a network interface by sending a RTM_NEWLINK request which contains
+ * the index of the interface and its new name.
+ *
+ * Returns true if the request could be sent. The function does not wait for
+ * a reply.
+ */
+bool rename_interface(const char *before, const char *after)
+{
+	char buf[MNL_SOCKET_BUFFER_SIZE];
+	struct mnl_socket *nl;
+	struct nlmsghdr *nlh;
+	struct ifinfomsg *ifm;
+	bool sent;
+	int idx;
+
+	INFO_LOG("%s -> %s\n", before, after);
+	idx = if_nametoindex(before);
+	if (idx == 0) {
+		ERROR_LOG("no index for %s\n", before);
+		return false;
+	}
+	nl = get_and_bind_netlink_socket();
+	if (!nl)
+		return false;
+
+	nlh = prepare_netlink_msg_header(buf);
+	nlh->nlmsg_type = RTM_NEWLINK;
+
+	ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
+	ifm->ifi_family = AF_UNSPEC;
+	ifm->ifi_index = idx;
+	addattr_l(nlh, IFLA_IFNAME, after, strlen(after) + 1);
+	sent = mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) >= 0;
+	if (!sent)
+		ERROR_LOG("mnl_socket_sendto failed\n");
+	mnl_socket_close(nl);
+	return sent;
+}
+
+/*
+ * Write the six bytes of hwaddr as colon-separated two-digit lower case hex
+ * numbers to result. The result buffer must have room for 18 bytes: 17
+ * characters plus the terminating null byte.
+ */
+void pretty_print_hwaddr(const uint8_t *hwaddr, char *result)
+{
+	const uint8_t *b = hwaddr;
+
+	sprintf(result, "%02x:%02x:%02x:%02x:%02x:%02x",
+		b[0], b[1], b[2], b[3], b[4], b[5]);
+}
+
+/*
+ * Set the hardware address of a network interface by sending a RTM_NEWLINK
+ * request which carries the address and the interface name.
+ *
+ * An address which consists of six zero bytes means that no address was
+ * specified. In this case nothing is sent and the function returns true.
+ * Otherwise the return value tells whether the request could be sent.
+ */
+bool set_hwaddr(const char *iface, const uint8_t *hwaddr)
+{
+	static const uint8_t unset[6]; /* all zeros */
+	char buf[MNL_SOCKET_BUFFER_SIZE], text[18];
+	struct mnl_socket *nl;
+	struct nlmsghdr *nlh;
+	struct ifinfomsg *ifm;
+	bool sent;
+
+	if (memcmp(hwaddr, unset, sizeof(unset)) == 0)
+		return true; /* no hwaddr specified, nothing to do */
+	pretty_print_hwaddr(hwaddr, text);
+	INFO_LOG("hardware address of %s: %s\n", iface, text);
+	nl = get_and_bind_netlink_socket();
+	if (!nl)
+		return false;
+
+	nlh = prepare_netlink_msg_header(buf);
+	nlh->nlmsg_type = RTM_NEWLINK;
+
+	ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
+	ifm->ifi_family = AF_UNSPEC;
+	addattr_l(nlh, IFLA_ADDRESS, hwaddr, 6);
+	addattr_l(nlh, IFLA_IFNAME, iface, strlen(iface) + 1);
+	sent = mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) >= 0;
+	if (!sent)
+		ERROR_LOG("%s: mnl_socket_sendto failed\n", iface);
+	mnl_socket_close(nl);
+	return sent;
+}
+
+/*
+ * Remove a network interface by sending a RTM_DELLINK request for the given
+ * interface name. Returns true if the request could be sent.
+ */
+bool link_del(const char *iface)
+{
+	char buf[MNL_SOCKET_BUFFER_SIZE];
+	struct mnl_socket *nl;
+	struct nlmsghdr *nlh;
+	struct ifinfomsg *ifm;
+	bool sent;
+
+	INFO_LOG("removing interface %s\n", iface);
+	nl = get_and_bind_netlink_socket();
+	if (!nl)
+		return false;
+
+	nlh = prepare_netlink_msg_header(buf);
+	nlh->nlmsg_type = RTM_DELLINK;
+
+	ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
+	ifm->ifi_family = AF_UNSPEC;
+	ifm->ifi_flags = IFF_UP;
+	ifm->ifi_change = IFF_UP;
+	addattr_l(nlh, IFLA_IFNAME, iface, strlen(iface) + 1);
+	sent = mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) >= 0;
+	if (!sent)
+		ERROR_LOG("%s: mnl_socket_sendto failed\n", iface);
+	mnl_socket_close(nl);
+	return sent;
+}
+
+/*
+ * Activate a network interface by sending a RTM_NEWLINK request which sets
+ * the IFF_UP flag. Returns true if the request could be sent.
+ */
+bool link_up(const char *iface)
+{
+	char buf[MNL_SOCKET_BUFFER_SIZE];
+	struct mnl_socket *nl;
+	struct nlmsghdr *nlh;
+	struct ifinfomsg *ifm;
+	bool sent;
+
+	INFO_LOG("activating interface %s\n", iface);
+	nl = get_and_bind_netlink_socket();
+	if (!nl)
+		return false;
+	nlh = prepare_netlink_msg_header(buf);
+	nlh->nlmsg_type = RTM_NEWLINK;
+
+	ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
+	ifm->ifi_family = AF_UNSPEC;
+	/* ifi_change selects the flag to modify, ifi_flags is its new value */
+	ifm->ifi_flags = IFF_UP;
+	ifm->ifi_change = IFF_UP;
+	addattr_l(nlh, IFLA_IFNAME, iface, strlen(iface) + 1);
+	sent = mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) >= 0;
+	if (!sent)
+		ERROR_LOG("%s: mnl_socket_sendto failed\n", iface);
+	mnl_socket_close(nl);
+	return sent;
+}
+
+/* Fallback in case the included headers do not define this constant. */
+#ifndef VETH_INFO_PEER
+#define VETH_INFO_PEER 1
+#endif
+
+/*
+ * Create a pair of connected virtual ethernet devices called name and peer.
+ *
+ * The RTM_NEWLINK request contains three levels of nested attributes:
+ * IFLA_LINKINFO holds the kind ("veth") and IFLA_INFO_DATA, which in turn
+ * holds VETH_INFO_PEER with a second ifinfomsg header and the name of the
+ * peer device. The name of the first device is added at the top level.
+ *
+ * Returns true if the request could be sent.
+ */
+bool create_veth_device_pair(const char *name, char *peer)
+{
+	char buf[MNL_SOCKET_BUFFER_SIZE];
+	struct rtattr *linkinfo, *infodata, *peerinfo;
+	struct mnl_socket *nl;
+	struct nlmsghdr *nlh;
+	struct ifinfomsg *ifm;
+	bool sent;
+
+	INFO_LOG("new pair: %s <-> %s\n", name, peer);
+	nl = get_and_bind_netlink_socket();
+	if (!nl)
+		return false;
+
+	nlh = prepare_netlink_msg_header(buf);
+	nlh->nlmsg_type = RTM_NEWLINK;
+	nlh->nlmsg_flags |= NLM_F_CREATE | NLM_F_EXCL;
+
+	ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
+	ifm->ifi_family = AF_UNSPEC;
+	linkinfo = addattr_nest(nlh, IFLA_LINKINFO);
+	addattr_l(nlh, IFLA_INFO_KIND, "veth", 5);
+	infodata = addattr_nest(nlh, IFLA_INFO_DATA);
+	peerinfo = addattr_nest(nlh, VETH_INFO_PEER);
+	/* the description of the peer starts with its own ifinfomsg */
+	ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
+	ifm->ifi_family = AF_UNSPEC;
+	addattr_l(nlh, IFLA_IFNAME, peer, strlen(peer) + 1);
+	end_nest(nlh, peerinfo);
+	end_nest(nlh, infodata);
+	end_nest(nlh, linkinfo);
+	addattr_l(nlh, IFLA_IFNAME, name, strlen(name) + 1);
+	sent = mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) >= 0;
+	if (!sent)
+		ERROR_LOG("%s: mnl_socket_sendto\n", name);
+	mnl_socket_close(nl);
+	return sent;
+}
+
+/*
+ * Move a network interface to the network namespace of the process with the
+ * given PID by sending a RTM_NEWLINK request with an IFLA_NET_NS_PID
+ * attribute.
+ *
+ * Returns true if the request could be sent. The netlink socket is closed in
+ * any case.
+ */
+bool set_netns(const char *iface, pid_t pid)
+{
+	struct mnl_socket *nl;
+	char buf[MNL_SOCKET_BUFFER_SIZE];
+	struct nlmsghdr *nlh;
+	struct ifinfomsg *ifm;
+	bool success;
+
+	INFO_LOG("changing net namespace of interface %s to pid %d\n",
+		iface, (int)pid);
+	if (!(nl = get_and_bind_netlink_socket()))
+		return false;
+
+	nlh = prepare_netlink_msg_header(buf);
+	nlh->nlmsg_type = RTM_NEWLINK;
+
+	ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
+	ifm->ifi_family = AF_UNSPEC;
+	ifm->ifi_change = 0;
+	ifm->ifi_flags = 0;
+	addattr_l(nlh, IFLA_NET_NS_PID, &pid, sizeof(pid));
+	mnl_attr_put_str(nlh, IFLA_IFNAME, iface);
+
+	success = mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) >= 0;
+	if (!success)
+		ERROR_LOG("%s: mnl_socket_sendto failed\n", iface);
+	/* close the socket on the error path as well to avoid a leak */
+	mnl_socket_close(nl);
+	return success;
+}
+
+/* The size of the sun_path member of struct sockaddr_un. */
+#ifndef UNIX_PATH_MAX
+#define UNIX_PATH_MAX (sizeof(((struct sockaddr_un *)0)->sun_path))
+#endif
+
+/*
+ * Initialize the socket address structure sau for socket_path in the
+ * abstract socket namespace and create a stream socket.
+ *
+ * The path is stored behind a leading null byte, so one byte less than usual
+ * is available for it. On success the new socket is returned in *socketfd,
+ * otherwise *socketfd is -1 and the function returns false.
+ */
+static bool init_unix_socket(const char *socket_path, int *socketfd,
+		struct sockaddr_un *sau)
+{
+	const size_t len = strlen(socket_path);
+	int sock;
+
+	*socketfd = -1;
+	if (len + 1 >= UNIX_PATH_MAX) {
+		ERROR_LOG("socket path to long: %s\n", socket_path);
+		return false;
+	}
+	memset(sau, 0, sizeof(struct sockaddr_un));
+	sau->sun_family = PF_UNIX;
+	sau->sun_path[0] = '\0'; /* use the abstract socket namespace */
+	strcpy(sau->sun_path + 1, socket_path);
+	sock = socket(PF_UNIX, SOCK_STREAM, 0);
+	if (sock < 0) {
+		ERROR_LOG("socket: %m\n");
+		return false;
+	}
+	*socketfd = sock;
+	return true;
+}
+
+/*
+ * Create a non-blocking stream socket, bind it to socket_path in the
+ * abstract namespace and start listening with a backlog of five.
+ *
+ * On success the listening socket is returned in *result. On errors the
+ * socket is closed, *result is left untouched, and false is returned.
+ */
+bool listen_on_unix_socket(const char *socket_path, int *result)
+{
+	struct sockaddr_un sau;
+	int sock, flags;
+
+	if (!init_unix_socket(socket_path, &sock, &sau))
+		return false;
+	flags = fcntl(sock, F_GETFL);
+	if (flags < 0) {
+		ERROR_LOG("fcntl (F_GETFL): %m\n");
+		goto fail;
+	}
+	flags = fcntl(sock, F_SETFL, ((long)flags) | O_NONBLOCK);
+	if (flags < 0) {
+		ERROR_LOG("fcntl (F_SETFL): %m\n");
+		goto fail;
+	}
+	if (bind(sock, (struct sockaddr *)&sau, sizeof(sau)) < 0) {
+		ERROR_LOG("bind: %m\n");
+		goto fail;
+	}
+	if (listen(sock , 5) < 0) {
+		ERROR_LOG("listen: %m\n");
+		goto fail;
+	}
+	*result = sock;
+	NOTICE_LOG("listening on fd %d\n", sock);
+	return true;
+fail:
+	close(sock);
+	return false;
+}
+/*
+ * Send a buffer and the credentials of the current process to a socket.
+ *
+ * buf must be zero-terminated. The terminating zero byte is sent as well.
+ * The PID, UID and GID of the calling process are attached as an
+ * SCM_CREDENTIALS control message.
+ *
+ * Returns true if sendmsg() succeeded, false otherwise.
+ */
+static bool send_cred_buffer(int sock, char *buf)
+{
+	char control[255] __attribute__((__aligned__(8)));
+	struct msghdr msg;
+	struct cmsghdr *cmsg;
+	/* automatic storage: a static iovec would make this non-reentrant */
+	struct iovec iov;
+	struct ucred c;
+
+	/* Response data */
+	iov.iov_base = buf;
+	iov.iov_len = strlen(buf) + 1;
+	c.pid = getpid();
+	c.uid = getuid();
+	c.gid = getgid();
+	/* compose the message */
+	memset(&msg, 0, sizeof(msg));
+	msg.msg_iov = &iov;
+	msg.msg_iovlen = 1;
+	msg.msg_control = control;
+	msg.msg_controllen = sizeof(control);
+	/* attach the ucred struct */
+	cmsg = CMSG_FIRSTHDR(&msg);
+	cmsg->cmsg_level = SOL_SOCKET;
+	cmsg->cmsg_type = SCM_CREDENTIALS;
+	cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
+	*(struct ucred *)CMSG_DATA(cmsg) = c;
+	/* only one control message is sent */
+	msg.msg_controllen = cmsg->cmsg_len;
+	if (sendmsg(sock, &msg, 0) < 0) {
+		ERROR_LOG("sendmsg: %m\n");
+		return false;
+	}
+	return true;
+}
+
+/* Close all num file descriptors of the array fds. */
+static void dispose_fds(int *fds, unsigned num)
+{
+	unsigned idx = 0;
+
+	while (idx < num)
+		close(fds[idx++]);
+}
+
+/*
+ * Receive a buffer and the Unix credentials of the sending process.
+ *
+ * A connection is accepted on the listening socket socketfd and a single
+ * message of at most size bytes is received into buf. File descriptors which
+ * were passed along with the message are closed.
+ *
+ * If the message carries credentials, the UID of the sender is stored in
+ * *uid, the file descriptor of the accepted connection in *clientfd, and
+ * the function returns true. If accept() fails, false is returned and
+ * *clientfd is not modified. On all other errors, and if no credentials were
+ * received, the connection is closed, *clientfd is set to -1 and false is
+ * returned.
+ */
+bool recv_cred_buffer(int socketfd, char *buf, size_t size,
+		int *clientfd, uid_t *uid)
+{
+	char control[255] __attribute__((__aligned__(8)));
+	struct msghdr msg;
+	struct cmsghdr *cmsg;
+	struct iovec iov;
+	int yes = 1, cfd;
+	struct ucred cred;
+	struct sockaddr_un sau;
+	socklen_t sizeof_sau = sizeof(sau);
+
+	cfd = accept(socketfd, (struct sockaddr *)&sau, &sizeof_sau);
+	if (cfd < 0) {
+		ERROR_LOG("accept: %m\n");
+		return false;
+	}
+	setsockopt(cfd, SOL_SOCKET, SO_PASSCRED, &yes, sizeof(int));
+	memset(&msg, 0, sizeof(msg));
+	iov.iov_base = buf;
+	iov.iov_len = size;
+	msg.msg_iov = &iov;
+	msg.msg_iovlen = 1;
+	msg.msg_control = control;
+	msg.msg_controllen = sizeof(control);
+	if (recvmsg(cfd, &msg, 0) < 0) {
+		ERROR_LOG("recvmsg: %m\n");
+		goto fail;
+	}
+	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
+		if (cmsg->cmsg_level != SOL_SOCKET)
+			continue;
+		if (cmsg->cmsg_type == SCM_CREDENTIALS) {
+			memcpy(&cred, CMSG_DATA(cmsg), sizeof(struct ucred));
+			*uid = cred.uid;
+			*clientfd = cfd;
+			return true;
+		}
+		if (cmsg->cmsg_type == SCM_RIGHTS) /* we don't want these */
+			dispose_fds((int *)CMSG_DATA(cmsg),
+				(cmsg->cmsg_len - CMSG_LEN(0))
+					/ sizeof(int));
+	}
+fail:
+	/*
+	 * Close the accepted connection. *clientfd has not been assigned at
+	 * this point, so it must not be used as the descriptor to close.
+	 */
+	close(cfd);
+	*clientfd = -1;
+	return false;
+}
+
+/*
+ * Send the file descriptor passfd over the Unix socket socketfd.
+ *
+ * The descriptor travels as an SCM_RIGHTS control message. The data part of
+ * the message consists of a zero byte followed by the string "OK",
+ * including its terminating null byte.
+ *
+ * Returns true if sendmsg() succeeded.
+ */
+bool pass_fd(int passfd, int socketfd)
+{
+	struct msghdr msg = {.msg_iov = NULL};
+	struct cmsghdr *cmsg;
+	char control[255] __attribute__((__aligned__(8)));
+	struct iovec iov;
+	char buf[] = "\0OK";
+
+	iov.iov_base = buf;
+	iov.iov_len = sizeof(buf);
+
+	msg.msg_iov = &iov;
+	msg.msg_iovlen = 1;
+
+	msg.msg_control = control;
+	msg.msg_controllen = sizeof(control);
+
+	cmsg = CMSG_FIRSTHDR(&msg);
+	cmsg->cmsg_level = SOL_SOCKET;
+	cmsg->cmsg_type = SCM_RIGHTS;
+	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+	*(int *)CMSG_DATA(cmsg) = passfd;
+
+	/* Sum of the length of all control messages in the buffer */
+	msg.msg_controllen = cmsg->cmsg_len;
+	/* skip the leading zero byte, or the text would print as empty */
+	DEBUG_LOG("passing %s and fd %d\n", buf + 1, passfd);
+	if (sendmsg(socketfd, &msg, 0) < 0) {
+		ERROR_LOG("sendmsg: %m\n");
+		return false;
+	}
+	return true;
+}
+
+/*
+ * Receive a message from socketfd and extract the file descriptor which was
+ * passed along with it.
+ *
+ * The first byte of the data part is logged as a number, the rest as a
+ * string. The first SCM_RIGHTS control message which contains exactly one
+ * file descriptor wins. In this case the descriptor is stored in *recvfd and
+ * true is returned. Otherwise *recvfd is -1 and the return value is false.
+ */
+static bool recv_fd(int socketfd, int *recvfd)
+{
+	char control[255] __attribute__((__aligned__(8)));
+	char buf[100];
+	struct iovec iov = {.iov_base = buf, .iov_len = sizeof(buf) - 1};
+	struct msghdr msg = {
+		.msg_iov = &iov,
+		.msg_iovlen = 1,
+		.msg_control = control,
+		.msg_controllen = sizeof(control),
+	};
+	struct cmsghdr *cmsg;
+	ssize_t got;
+
+	*recvfd = -1;
+	memset(buf, 0, sizeof(buf));
+	got = recvmsg(socketfd, &msg, 0);
+	if (got < 0) {
+		ERROR_LOG("recvmsg: %m\n");
+		return false;
+	}
+	buf[got] = '\0'; /* fits because iov_len is sizeof(buf) - 1 */
+	INFO_LOG("server response: %u (%s)\n", (unsigned)buf[0], buf + 1);
+	cmsg = CMSG_FIRSTHDR(&msg);
+	while (cmsg) {
+		bool rights = cmsg->cmsg_level == SOL_SOCKET
+			&& cmsg->cmsg_type == SCM_RIGHTS;
+
+		if (rights && (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int) == 1) {
+			*recvfd = *(int *)CMSG_DATA(cmsg);
+			return true;
+		}
+		cmsg = CMSG_NXTHDR(&msg, cmsg);
+	}
+	return false;
+}
+
+/*
+ * Connect to the abstract Unix socket socket_path, send msg together with
+ * the credentials of the calling process, and wait for a file descriptor.
+ *
+ * The received descriptor is stored in *result. The return value is the
+ * socket of the connection, which is left open. All errors are fatal.
+ */
+int request_fd(const char *socket_path, char *msg, int *result)
+{
+	struct sockaddr_un sau;
+	int sock, fd;
+
+	if (!init_unix_socket(socket_path, &sock, &sau))
+		die("could not init socket");
+	if (connect(sock, (struct sockaddr *)&sau, sizeof(sau)) < 0)
+		die_errno("connect");
+	if (!send_cred_buffer(sock, msg))
+		die("could not send cred buffer");
+	if (!recv_fd(sock, &fd))
+		die("did not receive tty fd");
+	NOTICE_LOG("received fd %d\n", fd);
+	*result = fd;
+	return sock;
+}
+
+/*
+ * Connect to the abstract Unix socket socket_path, send msg together with
+ * the credentials of the calling process, and read an integer reply.
+ *
+ * A valid reply consists of a zero byte followed by the sizeof(int) bytes of
+ * the integer. If the first byte is not zero, the rest of the reply is
+ * logged as an error message.
+ *
+ * On success the integer is stored in *result and true is returned.
+ * Otherwise *result is -1 and the function returns false. The connection is
+ * closed in any case.
+ */
+bool request_int(const char *socket_path, char *msg, int *result)
+{
+	struct sockaddr_un sau;
+	int socketfd;
+	bool success = false;
+	char buf[100];
+	ssize_t ssz;
+
+	*result = -1;
+	if (!init_unix_socket(socket_path, &socketfd, &sau))
+		return false;
+	if (connect(socketfd, (struct sockaddr *)&sau, sizeof(sau)) < 0) {
+		ERROR_LOG("connect: %m\n");
+		goto close;
+	}
+	if (!send_cred_buffer(socketfd, msg)) {
+		ERROR_LOG("could not send cred msg \"%s\"\n", msg);
+		goto close;
+	}
+	ssz = read(socketfd, buf, sizeof(buf) - 1);
+	if (ssz < 0) {
+		ERROR_LOG("did not receive integer: %m\n");
+		goto close;
+	}
+	if (ssz == 0) { /* nothing was read, buf holds no valid data */
+		ERROR_LOG("did not receive integer: connection closed\n");
+		goto close;
+	}
+	/* Terminate the reply because it is printed as a string below. */
+	buf[ssz] = '\0';
+	if (buf[0] != 0) {
+		ERROR_LOG("did not receive integer: %s\n", buf + 1);
+		goto close;
+	}
+	if (ssz != sizeof(int) + 1) {
+		ERROR_LOG("protocol mismatch, server msg: %s\n", buf + 1);
+		goto close;
+	}
+	memcpy(result, buf + 1, sizeof(int));
+	DEBUG_LOG("received integer: %d\n", *result);
+	success = true;
+close:
+	close(socketfd);
+	return success;
+}
+
+/*
+ * The signal handler writes to signal_pipe[1], next_signal() reads from
+ * signal_pipe[0].
+ */
+int signal_pipe[2];
+
+/*
+ * Write the number of the received signal as a single byte to the signal
+ * pipe. The value of errno is preserved.
+ */
+static void signal_handler(int signum)
+{
+	static const char errmsg[] = "write to signal pipe failed\n";
+	uint8_t u = signum;
+	int save_errno = errno;
+
+	assert(signum > 0 && signum < 256);
+	if (write(signal_pipe[1], &u, 1) < 0) {
+		/*
+		 * The logging macros are not async-signal-safe, so they must
+		 * not be called from a signal handler. Report the problem
+		 * with a plain write(2) to stderr instead. This is best
+		 * effort, there is nothing we can do if it fails as well.
+		 */
+		ssize_t ignored = write(STDERR_FILENO, errmsg,
+			sizeof(errmsg) - 1);
+		(void)ignored;
+	}
+	errno = save_errno;
+}
+
+/*
+ * Create the signal pipe and install signal_handler() for SIGINT, SIGTERM
+ * and SIGCHLD, in this order. The handlers are installed with SA_RESTART
+ * and an empty signal mask. All errors are fatal.
+ */
+void init_signal_handling(void)
+{
+	static const int signals[] = {SIGINT, SIGTERM, SIGCHLD};
+	struct sigaction act;
+	unsigned n;
+
+	if (pipe(signal_pipe) < 0)
+		die_errno("signal pipe");
+	act.sa_handler = signal_handler;
+	sigemptyset(&act.sa_mask);
+	act.sa_flags = SA_RESTART;
+	for (n = 0; n < sizeof(signals) / sizeof(signals[0]); n++) {
+		if (sigaction(signals[n], &act, NULL) < 0)
+			die_errno("sigaction");
+	}
+}
+
+/*
+ * Read one byte from the signal pipe and return it as the signal number.
+ *
+ * Interrupted reads are restarted, all other read errors are fatal. If
+ * read() returns zero, the return value is zero as well.
+ */
+int next_signal(void)
+{
+	uint8_t signum = 0;
+
+	while (read(signal_pipe[0], &signum, 1) < 0) {
+		if (errno != EINTR)
+			die_errno("read");
+	}
+	DEBUG_LOG("process %d received signal %u\n", getpid(), signum);
+	return signum;
+}
--- /dev/null
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0-only
+
+package="$1"
+version_file="$2"
+
+ver='unnamed_version'
+# First try git, then gitweb, then default.
+if [ -e '.git' -o -e '../.git' ]; then
+ git_ver=$(git describe --abbrev=4 HEAD 2>/dev/null)
+ [ -z "$git_ver" ] && git_ver="$ver"
+ # update stat information in index to match working tree
+ git update-index -q --refresh > /dev/null
+ # if there are differences (exit code 1), the working tree is dirty
+ git diff-index --quiet HEAD || git_ver=$git_ver-dirty
+ ver=$git_ver
+elif [ "${PWD%%-*}" = $package- ]; then
+ ver=${PWD##*/$package-}
+fi
+ver=${ver#v}
+
+echo "$ver"
+[ -z "${version_file}" ] && exit 0
+# update version file if necessary
+content="const char *${package}_version(void) {return \"$ver\";};"
+[ -r "$version_file" ] && echo "$content" | cmp -s - $version_file && exit 0
+echo "$content" > $version_file