]> git.tue.mpg.de Git - micoforia.git/commitdiff
Initial commit.
authorAndre Noll <maan@tuebingen.mpg.de>
Fri, 13 Dec 2019 14:04:27 +0000 (15:04 +0100)
committerAndre Noll <maan@tuebingen.mpg.de>
Fri, 13 Dec 2019 14:27:31 +0000 (15:27 +0100)
This project was started in late 2018. After 2 weeks the first
feature complete version was ready. During 2019 the repo received
only a moderate number of commits, mostly bug fixes, documentation
improvements and the addition of non-essential features.

As of version 0.9.0, the project was made public. All commits that
led to this version have been discarded, so this repository contains
only the final result as a single commit.

13 files changed:
.gitignore [new file with mode: 0644]
Makefile [new file with mode: 0644]
README [new file with mode: 0644]
config.mak.in [new file with mode: 0644]
configure [new file with mode: 0755]
configure.ac [new file with mode: 0644]
index.html.m4 [new file with mode: 0644]
m7a.h [new file with mode: 0644]
micoforia.c [new file with mode: 0644]
micoforia.suite.m4 [new file with mode: 0644]
micoforia.svg [new file with mode: 0644]
util.c [new file with mode: 0644]
version-gen.sh [new file with mode: 0755]

diff --git a/.gitignore b/.gitignore
new file mode 100644 (file)
index 0000000..98c75c0
--- /dev/null
@@ -0,0 +1,5 @@
+micoforia
+micoforia.8
+build
+*.swp
+Makefile.local
diff --git a/Makefile b/Makefile
new file mode 100644 (file)
index 0000000..4441e04
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,253 @@
+# SPDX-License-Identifier: GPL-2.0-only
+.SUFFIXES:
+MAKEFLAGS += -Rr
+ifeq ("$(origin CC)", "default")
+        CC := cc
+endif
+ifeq ("$(origin V)", "command line")
+       SAY =
+else
+       SAY = @echo '$(strip $(1))'
+endif
+
+# Run each recipe in a single shell that aborts on the first failing command.
+.ONESHELL:
+.SHELLFLAGS := -ec
+# Remove the target of a failed recipe. Without this, a recipe which
+# redirects into $@ leaves a truncated file behind that looks up to date
+# on the next run.
+.DELETE_ON_ERROR:
+PREFIX ?= /usr/local
+INSTALL ?= install
+MKDIR_P := mkdir -p
+RM := rm -f
+CHMOD := chmod
+B := build
+all := micoforia micoforia.8
+all: $(all)
+
+PACKAGE := micoforia
+SLOGAN := Minimal Containers for Instant Availability
+AUTHOR := Andre Noll
+EMAIL := maan@tuebingen.mpg.de
+COPYRIGHT_YEAR := 2019
+URL := http://people.tuebingen.mpg.de/maan/$(PACKAGE)/
+CLONE_URL := git://git.tuebingen.mpg.de/$(PACKAGE)
+GITWEB_URL := http://git.tuebingen.mpg.de/$(PACKAGE).git
+HOME_URL := http://people.tuebingen.mpg.de/maan/
+LICENSE := GNU GPL version 3
+LICENSE_URL := https://www.gnu.org/licenses/gpl-3.0-standalone.html
+LOGLEVELS := LL_DEBUG,LL_INFO,LL_NOTICE,LL_WARNING,LL_ERROR,LL_CRIT,LL_EMERG
+
+units := micoforia util version micoforia.lsg
+deps := $(addprefix $(B)/, $(addsuffix .d, $(units)))
+objs := $(addprefix $(B)/, $(addsuffix .o, $(units)))
+
+ifeq ($(findstring clean, $(MAKECMDGOALS)),)
+ifeq ($(findstring README, $(MAKECMDGOALS)),)
+-include $(deps)
+-include $(B)/config.mak
+endif
+endif
+
+XCPPFLAGS :=
+XCPPFLAGS += -I$(B)
+XCPPFLAGS += -Wunused-macros
+XCPPFLAGS += -DCOPYRIGHT_YEAR='"$(COPYRIGHT_YEAR)"'
+XCPPFLAGS += -DPACKAGE='"$(PACKAGE)"'
+XCPPFLAGS += -DAUTHOR='"$(AUTHOR)"'
+XCPPFLAGS += -DEMAIL='"$(EMAIL)"'
+XCPPFLAGS += -DURL='"$(URL)"'
+XCPPFLAGS += -DCLONE_URL='"$(CLONE_URL)"'
+XCPPFLAGS += -DGITWEB_URL='"$(GITWEB_URL)"'
+XCPPFLAGS += -DHOME_URL='"$(HOME_URL)"'
+XCPPFLAGS += -DGET_VERSION='$(PACKAGE)_version'
+XCPPFLAGS += -DLOGLEVELS='$(LOGLEVELS)'
+XCPPFLAGS += -DBUILD_DATE='"$(build_date)"'
+XCPPFLAGS += -DCC_VERSION='"$(cc_version)"'
+XCPPFLAGS += -DUNAME_RS='"$(uname_rs)"'
+XCPPFLAGS += -DLICENSE='"$(LICENSE)"'
+XCPPFLAGS += -DLICENSE_URL='"$(LICENSE_URL)"'
+
+XCFLAGS :=
+XCFLAGS += -fno-strict-aliasing
+XCFLAGS += -g
+XCFLAGS += -Os
+XCFLAGS += -Wundef -W -Wuninitialized
+XCFLAGS += -Wchar-subscripts
+XCFLAGS += -Werror-implicit-function-declaration
+XCFLAGS += -Wmissing-noreturn
+XCFLAGS += -Wbad-function-cast
+XCFLAGS += -Wredundant-decls
+XCFLAGS += -Wno-sign-compare -Wno-unknown-pragmas
+XCFLAGS += -Wdeclaration-after-statement
+XCFLAGS += -Wformat -Wformat-security -Wmissing-format-attribute
+XCFLAGS += -fsanitize=undefined
+XCFLAGS += -fdata-sections -ffunction-sections
+XCFLAGS += -Wstrict-prototypes
+XCFLAGS += -Wshadow
+XCFLAGS += -Wunused -Wall
+XCFLAGS += -Wformat-signedness
+XCFLAGS += -Wdiscarded-qualifiers
+
+XLDFLAGS := -lubsan -Wl,--gc-sections
+version_file := $(B)/version.c
+GIT_VERSION := $(shell $(MKDIR_P) $(B) && ./version-gen.sh $(PACKAGE) $(version_file))
+
+CC_CMD = $(CC) -c -o $@ $(XCPPFLAGS) $(CPPFLAGS) \
+       $(XCFLAGS) $(CFLAGS) -MMD -MF $(B)/$(*F).d -MT $@
+
+$(objs): m7a.h $(B)/micoforia.lsg.h
+
+$(B):
+       @$(MKDIR_P) $@
+
+$(B)/config.h.in: configure.ac | $(B)
+       $(call SAY, AH $<)
+       cd $(B)
+       autoheader -f ../configure.ac
+$(B)/configure.sh: configure.ac | $(B)
+       $(call SAY, AC $<)
+       cd $(B)
+       autoconf ../configure.ac > configure.sh
+       $(CHMOD) 755 configure.sh
+$(B)/config.status: $(B)/configure.sh | $(B)
+       $(call SAY, SH $<)
+       cd $(B)
+       if test -x config.status; then \
+               ./config.status --quiet --recheck; \
+       else \
+               ./configure.sh --no-create; \
+       fi
+$(B)/config.mak $(B)/config.h: $(B)/config.status config.mak.in $(B)/config.h.in
+       $(call SAY, CS $@)
+       cd $(B)
+       ln -f ../config.mak.in
+       ./config.status -q
+       test -f config.h && touch config.h
+
+define DESCRIPTION1 :=
+       $(PACKAGE) is a lightweight container implementation for Linux.
+       It consists of a single program which reads a single configuration
+       file that describes all containers. $(PACKAGE) was written with
+       performance and simplicity in mind, and is designed for trusted
+       in-house web application hosting.
+endef
+
+define DESCRIPTION2 :=
+       Like other container frameworks, $(PACKAGE) employs Linux namespaces
+       for isolation and cgroup controllers to limit the resource utilization
+       of the containers. Networking is implemented through bridging and
+       virtual ethernet device pairs. There is built-in support for the cpu,
+       memory, I/O and device controllers. Further customization is possible
+       via startup hooks. For example, the startup hook could activate
+       additional cgroup controllers, make the container enter a different
+       namespace, and mount additional file systems.
+endef
+
+define DESCRIPTION3 :=
+       The micoforia program supports a couple of subcommands. Besides
+       the start subcommand which starts one or more containers, there are
+       subcommands for listing, killing or rebooting containers.
+endef
+
+# config.mak is a prerequisite because it defines $(M4), which the recipe below runs
+$(B)/index.html $(B)/micoforia.suite: $(B)/%: %.m4 Makefile $(B)/config.mak
+       $(call SAY, M4 $<)
+       $(M4) -D "AUTHOR=$(AUTHOR)" -D "COPYRIGHT_YEAR=$(COPYRIGHT_YEAR)" \
+               -D "PACKAGE=$(PACKAGE)" \
+               -D "SLOGAN=$(SLOGAN)" \
+               -D "EMAIL=$(EMAIL)" \
+               -D "URL=$(URL)" \
+               -D "CLONE_URL=$(CLONE_URL)" \
+               -D "GITWEB_URL=$(GITWEB_URL)" \
+               -D "HOME_URL=$(HOME_URL)" \
+               -D "LICENSE=$(LICENSE)" \
+               -D "LICENSE_URL=$(LICENSE_URL)" \
+               -D "DESCRIPTION1=$(DESCRIPTION1)" \
+               -D "DESCRIPTION2=$(DESCRIPTION2)" \
+               -D "DESCRIPTION3=$(DESCRIPTION3)" $< > $@
+$(B)/%.lsg.c: $(B)/%.suite
+       $(call SAY, LSGC $<)
+       $(LOPSUBGEN) --gen-c --output-dir $(B) < $<
+$(B)/%.lsg.h: $(B)/%.suite
+       $(call SAY, LSGH $<)
+       $(LOPSUBGEN) --gen-header --output-dir $(B) < $<
+%.8: $(B)/%.suite $(B)/version.c
+       $(call SAY, LSGM $<)
+       $(LOPSUBGEN) --gen-man=$(*F).8 --version-string $(GIT_VERSION) < $<
+
+$(B)/%.o: %.c | $(B)
+       $(call SAY, CC $<)
+       $(CC_CMD) $<
+$(B)/%.o: $(B)/%.c
+       $(call SAY, CC $<)
+       $(CC_CMD) $<
+micoforia: $(objs)
+       $(call SAY, LD $@)
+       $(CC) -o $@ $^ $(XLDFLAGS) $(LDFLAGS) -llopsub -lmnl -lutil -lcap
+
+# Installation. sbindir and datarootdir come from $(B)/config.mak, INSTALL
+# is defined near the top of this file.
+mandir := $(datarootdir)/man/man8
+INSTALL_PROGRAM ?= $(INSTALL) -m 755
+INSTALL_DATA ?= $(INSTALL) -m 644
+# install-strip differs from install only in that the binary is stripped.
+# A target-specific variable is used rather than a check of MAKECMDGOALS
+# so that the option is also set when install-strip is built as a
+# prerequisite of another target, and not set for unrelated goals whose
+# name merely contains "strip".
+install-strip: strip_option := -s
+install install-strip: all
+       $(MKDIR_P) $(DESTDIR)$(sbindir) $(DESTDIR)$(mandir)
+       $(INSTALL_PROGRAM) $(strip_option) micoforia $(DESTDIR)$(sbindir)
+       $(INSTALL_DATA) micoforia.8 $(DESTDIR)$(mandir)
+
+clean:
+       $(RM) $(B)/*.o $(all)
+distclean: clean
+       $(RM) -r $(B)
+maintainer-clean:
+       git clean -dfqx > /dev/null 2>&1
+
+define README :=
+$(PACKAGE) -  $(SLOGAN)
+
+$(DESCRIPTION1)
+
+$(DESCRIPTION2)
+
+$(DESCRIPTION3)
+
+Resources
+~~~~~~~~~
+|      web page: $(URL)
+|      git clone URL: $(CLONE_URL)
+|      gitweb: $(GITWEB_URL)
+|      author's home page: $(HOME_URL)
+|      Send feedback to: $(AUTHOR) <$(EMAIL)>
+
+License
+~~~~~~~
+Open source, licensed under the $(LICENSE).
+
+Documentation
+~~~~~~~~~~~~~
+See micoforia.suite.m4. Or build the man page with \"make\" and run
+\"man -l micoforia.8\".
+
+Dependencies
+~~~~~~~~~~~~
+This package requires m4, autoconf, gnu make, gcc or clang, and
+lopsub. The configure script checks if all dependencies are installed
+and prints a meaningful error message if one of them is missing.
+
+Building
+~~~~~~~~
+Run \"make\" to build the package with the default settings. Run
+\"./configure -h\" to list configuration options.
+
+Installation
+~~~~~~~~~~~~
+Run \"sudo make install\" to install to /usr/local. To install to
+/somewhere/else, run \"./configure --prefix /somewhere/else && make\"
+first.
+endef
+
+README:
+       @printf '%s\n' "$(README)"
+
+.PRECIOUS: $(B)/%.lsg.c $(B)/%.lsg.h $(B)/%.8
+# Every target which does not name a file it creates, including install-strip.
+.PHONY: all clean install install-strip distclean maintainer-clean README
+-include Makefile.local
diff --git a/README b/README
new file mode 100644 (file)
index 0000000..52a1fd7
--- /dev/null
+++ b/README
@@ -0,0 +1 @@
+Run "make README".
diff --git a/config.mak.in b/config.mak.in
new file mode 100644 (file)
index 0000000..ee258b4
--- /dev/null
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+prefix := @prefix@
+exec_prefix := @exec_prefix@
+
+# These two use prefix and exec_prefix
+sbindir := @sbindir@
+datarootdir := @datarootdir@
+
+LOPSUBGEN := @LOPSUBGEN@
+M4 := @M4@
diff --git a/configure b/configure
new file mode 100755 (executable)
index 0000000..ad2ec3f
--- /dev/null
+++ b/configure
@@ -0,0 +1,12 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0-only
+
+set -e
+
+mkdir -p build
+cd build
+autoconf ../configure.ac > configure.sh
+chmod 755 configure.sh
+ln -f ../config.mak.in
+autoheader ../configure.ac
+sh configure.sh "$@"
diff --git a/configure.ac b/configure.ac
new file mode 100644 (file)
index 0000000..e29968f
--- /dev/null
@@ -0,0 +1,47 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+AC_PREREQ([2.61])
+# only for configure -h, see Makefile
+AC_INIT([software], [packages])
+AC_CONFIG_HEADERS([config.h])
+AC_CONFIG_FILES([config.mak])
+AC_USE_SYSTEM_EXTENSIONS
+AC_PROG_CC
+AC_PROG_CPP
+
+AC_DEFUN([REQUIRE_EXECUTABLE], [
+       AC_PATH_PROG(m4_toupper([$1]), [$1])
+       test -z "$m4_toupper([$1])" && AC_MSG_ERROR([$2])
+])
+REQUIRE_EXECUTABLE([m4], [m4 is required to build this package])
+
+AC_DEFUN([LOPSUB_NOT_FOUND], [
+The lopsub library is required to build this software, but the checks
+indicate it is not installed on your system.  Run the following
+command to download a copy.
+       git clone git://git.tuebingen.mpg.de/lopsub.git
+Install the library, then run this configure script again.
+
+If you installed lopsub at a non-standard location, make sure to set
+PATH, CPPFLAGS and LDFLAGS accordingly. For example:
+
+       pfx=/prefix/where/lopsub/is/installed
+       export PATH=\$pfx/bin:\$PATH
+       export CPPFLAGS=-I\$pfx/include
+       export LDFLAGS=-L\$pfx/lib
+])
+REQUIRE_EXECUTABLE([lopsubgen], [LOPSUB_NOT_FOUND()])
+AC_CHECK_HEADER(lopsub.h, [], [AC_MSG_ERROR([LOPSUB_NOT_FOUND()])])
+AC_CHECK_LIB([lopsub], [lls_merge], [], [AC_MSG_ERROR([LOPSUB_NOT_FOUND()])])
+
+AC_DEFUN([LIBCAP_NOT_FOUND], [the libcap library is required to build dnl
+this software. Package: libcap-dev])
+AC_CHECK_HEADER([sys/capability.h], [], [AC_MSG_ERROR([LIBCAP_NOT_FOUND()])])
+AC_CHECK_LIB([cap], [cap_from_text], [], [AC_MSG_ERROR([LIBCAP_NOT_FOUND()])])
+
+AC_DEFUN([LIBMNL_NOT_FOUND], [the libmnl library is required to build dnl
+this software. Package: libmnl-dev])
+AC_CHECK_HEADER([libmnl/libmnl.h], [], [AC_MSG_ERROR([LIBMNL_NOT_FOUND()])])
+AC_CHECK_LIB([mnl], [mnl_socket_open], [], [AC_MSG_ERROR([LIBMNL_NOT_FOUND()])])
+
+AC_OUTPUT
diff --git a/index.html.m4 b/index.html.m4
new file mode 100644 (file)
index 0000000..a0d8ecc
--- /dev/null
@@ -0,0 +1,64 @@
+dnl SPDX-License-Identifier: GPL-2.0-only
+<!DOCTYPE HTML PUBLIC '-//W3C//DTD HTML 4.01 Transitional//EN'
+'http://www.w3.org/TR/html4/loose.dtd'>
+
+<html>
+       <head>
+dnl The charset is a parameter of the content value and attributes are not separated by semicolons.
+               <meta
+                       http-equiv='Content-Type'
+                       content='text/html; charset=utf-8'
+               >
+               <title>PACKAGE()</title>
+               <style type='text/css'>
+                       a {
+                               color: #003355;
+                       }
+                       p {
+                               font-size: 120%;
+                       }
+               </style>
+       </head>
+       <body>
+               <table width="100%">
+                       <tr>
+                               <td>
+                                       <h1 align="left">
+                                                PACKAGE() - SLOGAN()
+                                       </h1>
+                               </td>
+                               <td align="right" title="The micoforic dude">
+
+                                       <img src="micoforia.svg">
+                               </td>
+                       </tr>
+               </table>
+               <p> DESCRIPTION1() </p>
+               <p> DESCRIPTION2() </p>
+               <p> DESCRIPTION3() </p>
+
+               <h2> Resources </h2>
+               <ul>
+                       <li> Clone `URL': CLONE_URL() </li>
+                       <li> <a href="GITWEB_URL()">Gitweb</a> </li>
+                       <li> The author's <a href="HOME_URL()">home page</a> </li>
+                       <li> Send feedback to <a href="mailto:EMAIL()">AUTHOR()</a> </li>
+               </ul>
+
+               <h2> License </h2>
+               Open source, licensed under the <a
+               href="LICENSE_URL()">LICENSE()</a>
+
+               <h2> Documentation </h2>
+               See the manual page for details.
+
+               <h2> Programming Language </h2>
+               Plain C.
+
+               <h2> Dependencies </h2>
+               A working C compiler and a couple of other dependencies,
+               most of which are standard (autoconf, make, m4,
+               libmnl, libcap). The notable exception is the <a
+               href="http://people.tuebingen.mpg.de/maan/lopsub/">lopsub</a> library.
+       </body>
+</html>
diff --git a/m7a.h b/m7a.h
new file mode 100644 (file)
index 0000000..77c3cad
--- /dev/null
+++ b/m7a.h
@@ -0,0 +1,118 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdbool.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <inttypes.h>
+#include <stdarg.h>
+#include <unistd.h>
+#include <string.h>
+#include <time.h>
+#include <errno.h>
+#include <sys/uio.h>
+#include <pwd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/wait.h>
+#include <limits.h>
+
+#include "config.h"
+
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) /* number of elements of an array object */
+
+#define CMD_PTR(_cname) lls_cmd(LSG_MICOFORIA_CMD_ ## _cname, micoforia_suite) /* lls_cmd() handle of command _cname in micoforia_suite */
+#define OPT_RESULT(_cname, _oname) (lls_opt_result(\
+       LSG_MICOFORIA_ ## _cname ## _OPT_ ## _oname, \
+       (CMD_PTR(_cname) == CMD_PTR(MICOFORIA))? lpr : sublpr)) /* option _oname of command _cname, taken from lpr for MICOFORIA and from sublpr otherwise; both names must be in scope at the point of use */
+#define OPT_GIVEN(_cname, _oname) (lls_opt_given(OPT_RESULT(_cname, _oname))) /* lls_opt_given() of the option; used as the loop bound over its values */
+#define OPT_UINT32_VAL_N(_n, _cname, _oname) (lls_uint32_val(_n, \
+               OPT_RESULT(_cname, _oname))) /* value number _n of a uint32 option */
+#define OPT_UINT32_VAL(_cname, _oname) (OPT_UINT32_VAL_N(0, _cname, _oname)) /* value number 0 of a uint32 option */
+#define OPT_STRING_VAL_N(_n, _cname, _oname) (lls_string_val(_n, \
+       OPT_RESULT(_cname, _oname))) /* value number _n of a string option */
+#define OPT_STRING_VAL(_cname, _oname) (OPT_STRING_VAL_N(0, _cname, _oname)) /* value number 0 of a string option */
+
+struct micoforia_user_data {bool (*handler)(void);};
+#define EXPORT_CMD_HANDLER(_cmd) const struct micoforia_user_data \
+       lsg_micoforia_com_ ## _cmd ## _user_data = { \
+               .handler = com_ ## _cmd \
+       };
+
+
+__attribute__ ((warn_unused_result))
+void *xrealloc(void *p, size_t size);
+
+__attribute__ ((warn_unused_result))
+void *xmalloc(size_t size);
+
+__attribute__ ((warn_unused_result))
+void *xzmalloc(size_t size);
+
+void *xstrdup(const char *s);
+char *xstrcat(char *a, const char *b);
+
+__attribute__ ((format (printf, 1, 2))) __attribute__ ((warn_unused_result))
+char *msg(const char *fmt, ...);
+
+enum loglevels {LOGLEVELS, NUM_LOGLEVELS};
+extern unsigned loglevel_arg_val;
+
+__attribute__ ((format (printf, 2, 3)))
+void m7a_log(int ll, const char* fmt,...);
+
+#define DEBUG_LOG(f,...) m7a_log(LL_DEBUG, "%s: " f, __FUNCTION__, ## __VA_ARGS__)
+#define INFO_LOG(f,...) m7a_log(LL_INFO, "%s: " f, __FUNCTION__, ## __VA_ARGS__)
+#define NOTICE_LOG(f,...) m7a_log(LL_NOTICE, "%s: " f, __FUNCTION__, ## __VA_ARGS__)
+#define WARNING_LOG(f,...) m7a_log(LL_WARNING, "%s: " f, __FUNCTION__, ##  __VA_ARGS__)
+#define ERROR_LOG(f,...) m7a_log(LL_ERROR, "%s: " f, __FUNCTION__, ## __VA_ARGS__)
+#define CRIT_LOG(f,...) m7a_log(LL_CRIT, "%s: " f, __FUNCTION__, ## __VA_ARGS__)
+#define EMERG_LOG(f,...) m7a_log(LL_EMERG, "%s: " f, __FUNCTION__, ## __VA_ARGS__)
+
+__attribute__ ((noreturn))
+__attribute__ ((format (printf, 1, 2)))
+void die(const char *fmt, ...);
+
+__attribute__ ((noreturn))
+__attribute__ ((format (printf, 1, 2)))
+void die_errno(const char *fmt, ...);
+
+__attribute__ ((noreturn))
+void die_empty_arg(const char *opt);
+
+void check_range(uint32_t val, uint32_t min, uint32_t max, const char *opt);
+
+bool xexec(char * const argv[], const struct iovec *iov);
+void valid_fd012(void);
+void check_name(const char *arg);
+void parse_compound_arg(const char *arg, const char *opt, char **name, char **val);
+char *parse_cgroup_acl(const char *arg);
+char *make_hwaddr(const char *name, const char *bridge);
+void parse_ifspec(const char *arg, char **bridge, uint8_t *hwaddr);
+uint32_t atou32(const char *str, const char *opt);
+bool remove_subdirs_recursively(const char *path);
+void daemonize(const char *logfile);
+bool acquire_lock(const char *string);
+bool try_lock(const char *string, pid_t *pid);
+bool release_lock(const char *string);
+bool is_locked(const char *string, pid_t *pid);
+bool attach_to_bridge(const char *iface, const char *bridge);
+bool rename_interface(const char *before, const char *after);
+void pretty_print_hwaddr(const uint8_t *hwaddr, char *result);
+bool set_hwaddr(const char *iface, const uint8_t *hwaddr);
+bool link_del(const char *iface);
+bool link_up(const char *iface);
+bool create_veth_device_pair(const char *name, char *peer);
+bool set_netns(const char *iface, pid_t pid);
+int request_fd(const char *socket_path, char *msg, int *result);
+bool request_int(const char *socket_path, char *msg, int *result);
+bool listen_on_unix_socket(const char *socket_path, int *result);
+bool recv_cred_buffer(int socketfd, char *buf, size_t size,
+               int *clientfd, uid_t *uid);
+bool pass_fd(int passfd, int socketfd);
+
+extern int signal_pipe[2];
+void init_signal_handling(void);
+int next_signal(void);
diff --git a/micoforia.c b/micoforia.c
new file mode 100644 (file)
index 0000000..4d267ec
--- /dev/null
@@ -0,0 +1,1996 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#include "m7a.h"
+
+#include <lopsub.h>
+#include <sys/mman.h>
+#include <sched.h>
+#include <sys/ioctl.h>
+#include <sys/mount.h>
+#include <sys/sysmacros.h>
+#include <pty.h>
+#include <utmp.h>
+#include <sys/socket.h>
+#include <sys/capability.h>
+#include <sys/syscall.h>
+
+#include "micoforia.lsg.h"
+
+static struct lls_parse_result *lpr, *sublpr;
+unsigned loglevel_arg_val = 4;
+
+struct ifspec { /* one network interface specification of a container */
+       char *bridge; /* filled in by parse_ifspec(), or a copy of the DEFAULT_BRIDGE option value */
+       uint8_t hwaddr[6]; /* hardware address bytes; all zero in the default ifspec set up by check_options() */
+};
+
+struct container { /* per-container configuration, filled in by check_options() */
+       char *name; /* lookup key for get_container() */
+       char *pre_start_hook; /* value part of the PRE_START_HOOK option */
+       char *pre_exec_hook; /* value part of the PRE_EXEC_HOOK option */
+       char *root_dir; /* value part of the ROOT_DIRECTORY option */
+       char *init; /* value part of the INIT option */
+       struct ifspec *ifspec; /* array of num_ifspecs entries */
+       /* this is never zero, even if no ifspec was given */
+       unsigned num_ifspecs;
+       char **dacl; /* NULL-terminated list built by append_dac_entry() */
+       unsigned num_dac_entries;
+       char **io_max; /* NULL-terminated list built by append_io_max_entry() */
+       unsigned num_io_max_entries;
+       /* ~0U: not given, 0: unlimited */
+       unsigned cpu_cores;
+       unsigned memory_limit;
+       /* ~0U: not given */
+       unsigned init_type;
+       cap_value_t *capdrop; /* array of num_capdrops values obtained from cap_from_name() */
+       unsigned num_capdrops;
+       uint32_t *tty; /* array of num_ttys tty numbers, none of which is zero */
+       unsigned num_ttys;
+};
+
+static struct container **container;
+static unsigned num_containers;
+
+struct container_runtime {
+       int pipe1[2], pipe2[2]; /* for startup communication */
+       uint32_t *tty;
+       unsigned num_ttys;
+       int *master, *slave, *client;
+
+       int init_pid; /* in the parent namespace */
+       char *pts, *root, *dev;
+       int socket_fd;
+};
+
+static char **default_dacl, **default_io_max;
+unsigned num_default_dac_entries, num_default_io_max_entries;
+static cap_value_t *default_capdrop;
+unsigned num_default_capdrops;
+uint32_t *default_tty;
+unsigned num_default_ttys;
+static const struct lls_command *subcmd;
+/*
+ * Print a log message to stderr if its severity is at least loglevel_arg_val.
+ *
+ * ll: One of the LL_* constants.
+ * fmt: printf-style format string, followed by its arguments.
+ *
+ * When the start subcommand is running, each message is prefixed with a
+ * time stamp and the pid of the calling process.
+ *
+ * does not allocate memory
+ */
+void m7a_log(int ll, const char* fmt,...)
+{
+       va_list argp;
+
+       if (ll < loglevel_arg_val)
+               return;
+       va_start(argp, fmt);
+       if (subcmd == lls_cmd(LSG_MICOFORIA_CMD_START, micoforia_suite)) {
+               char str[100];
+               struct timespec t;
+               struct tm *tm;
+               int ret;
+               /*
+                * The call must not be part of the assert() expression:
+                * with NDEBUG defined the expression is not evaluated and
+                * t would be used uninitialized.
+                */
+               ret = clock_gettime(CLOCK_REALTIME, &t);
+               assert(ret == 0);
+               (void)ret; /* avoid an unused warning with NDEBUG */
+               tm = localtime(&t.tv_sec);
+               /* localtime() may fail, do not hand NULL to strftime() */
+               if (!tm || strftime(str, sizeof(str), "%b %d %H:%M:%S", tm) == 0)
+                       str[0] = '\0';
+               fprintf(stderr, "%s:%04lu ", str,
+                       (long unsigned)t.tv_nsec / 1000 / 1000);
+               fprintf(stderr, "(%u) ", (unsigned)getpid());
+       }
+       vfprintf(stderr, fmt, argp);
+       va_end(argp);
+}
+
+/*
+ * Log the lopsub error which corresponds to the (negative) return value,
+ * preceded by the error context if there is one, release the context and
+ * terminate via die().
+ */
+static void die_lopsub(int lopsub_ret, char **errctx)
+{
+       const char *reason = lls_strerror(-lopsub_ret);
+       char *ctx = *errctx;
+
+       if (ctx == NULL)
+               ERROR_LOG("%s\n", reason);
+       else
+               ERROR_LOG("%s: %s\n", ctx, reason);
+       *errctx = NULL;
+       free(ctx);
+       die("lopsub error");
+}
+
+#define FOR_EACH_CONTAINER(_c) for ( \
+       struct container **_cp = container; \
+       ((_c) = *(_cp)); \
+       (_cp)++, (_c) = *(_cp) \
+) /* walks the global container array, assigning each element to _c, and stops at the first NULL element; the array itself must not be NULL */
+
+/*
+ * Look up a container by name in the NULL-terminated container array.
+ * Returns NULL if no container of that name has been registered.
+ */
+static struct container *get_container(const char *name)
+{
+       struct container **cp;
+
+       for (cp = container; *cp != NULL; cp++)
+               if (strcmp((*cp)->name, name) == 0)
+                       return *cp;
+       return NULL;
+}
+
+/*
+ * Return the container of the given name. If it does not exist yet, a new
+ * one is appended to the container array, with the numeric settings marked
+ * as "not given". The array stays NULL-terminated.
+ */
+static struct container *get_or_append_container(const char *name)
+{
+       struct container *found = get_container(name), *fresh;
+       unsigned slot;
+
+       if (found != NULL)
+               return found;
+       slot = num_containers++;
+       /* one extra element for the terminating NULL pointer */
+       container = xrealloc(container,
+               (num_containers + 1) * sizeof(*container));
+       fresh = xzmalloc(sizeof(*fresh));
+       container[slot] = fresh;
+       fresh->name = xstrdup(name);
+       /* ~0U means: not given */
+       fresh->cpu_cores = fresh->memory_limit = fresh->init_type = ~0U;
+       container[slot + 1] = NULL;
+       return fresh;
+}
+
+/*
+ * Determine the ttys of a container. The per-container list takes
+ * precedence over the default list. If both are empty, the single tty 1
+ * is used. Returns the number of entries and stores a pointer to the first
+ * one in *result. The result must not be freed.
+ */
+static unsigned get_container_ttys(const struct container *c, uint32_t **result)
+{
+       static uint32_t fallback = {1};
+
+       if (c->num_ttys == 0 && num_default_ttys == 0) {
+               *result = &fallback;
+               return 1;
+       }
+       if (c->num_ttys == 0) {
+               *result = default_tty;
+               return num_default_ttys;
+       }
+       *result = c->tty;
+       return c->num_ttys;
+}
+
+enum clo_given_counter { /* indices into clo_given_counter[], one per list-valued option */
+       CLOGC_DEFAULT_CGROUP_DAC,
+       CLOGC_CGROUP_DAC,
+       CLOGC_DEFAULT_IO_MAX,
+       CLOGC_IO_MAX,
+       NUM_CLOGCS
+};
+
+static unsigned clo_given_counter[NUM_CLOGCS]; /* split index m per option: check_options() appends values [m, given) before values [0, m); presumably m is the number of values given on the command line - TODO confirm against the code which sets it */
+
+/*
+ * Run the argument through parse_cgroup_acl() and append the result to the
+ * NULL-terminated list *listp, which contains *count entries. Both the
+ * list pointer and the counter are updated.
+ */
+static void append_dac_entry(const char *arg, char ***listp, unsigned *count)
+{
+       char *acl = parse_cgroup_acl(arg);
+       unsigned idx = *count;
+       /* room for the new entry and the terminating NULL pointer */
+       char **list = xrealloc(*listp, (idx + 2) * sizeof(*list));
+
+       list[idx] = acl;
+       list[idx + 1] = NULL;
+       *listp = list;
+       *count = idx + 1;
+}
+
+/*
+ * Append a copy of the argument to the NULL-terminated list *listp, which
+ * contains *count entries. Both the list pointer and the counter are
+ * updated.
+ */
+static void append_io_max_entry(const char *arg, char ***listp, unsigned *count)
+{
+       unsigned idx = *count;
+       /* room for the new entry and the terminating NULL pointer */
+       char **list = xrealloc(*listp, (idx + 2) * sizeof(*list));
+
+       list[idx] = xstrdup(arg);
+       list[idx + 1] = NULL;
+       *listp = list;
+       *count = idx + 1;
+}
+
+/*
+ * Validate the option values stored in lpr and translate them into the
+ * container array and the default_* lists.
+ *
+ * Most per-container options take a compound argument which
+ * parse_compound_arg() splits into a container name and a value. The
+ * container is created on first reference. Invalid values terminate the
+ * program through die(), die_errno() or the check functions.
+ *
+ * Single-valued settings are processed from the last given value to the
+ * first, each value replacing the previous one, so the first given value
+ * is the one which remains. List-valued cgroup settings are appended in
+ * two passes, split at the index stored in clo_given_counter[].
+ *
+ * On return every container has at least one ifspec.
+ */
+static void check_options(void)
+{
+       unsigned n, m;
+       const char *arg;
+       char *name, *val;
+       struct container *c;
+       uint32_t u32;
+
+       /* start with an empty list, i.e. only the terminating NULL pointer */
+       container = xzmalloc(sizeof(struct container *));
+       /* loop backwards to let command line opts override config file opts */
+       for (n = OPT_GIVEN(MICOFORIA, CONTAINER) - 1; n != ~0U; n--) {
+               arg = OPT_STRING_VAL_N(n, MICOFORIA, CONTAINER);
+               check_name(arg);
+               get_or_append_container(arg);
+       }
+       for (n = OPT_GIVEN(MICOFORIA, PRE_START_HOOK) - 1; n != ~0U; n--) {
+               arg = OPT_STRING_VAL_N(n, MICOFORIA, PRE_START_HOOK);
+               parse_compound_arg(arg, "pre-start-hook", &name, &val);
+               c = get_or_append_container(name);
+               free(name);
+               free(c->pre_start_hook);
+               c->pre_start_hook = val;
+       }
+       for (n = OPT_GIVEN(MICOFORIA, PRE_EXEC_HOOK) - 1; n != ~0U; n--) {
+               arg = OPT_STRING_VAL_N(n, MICOFORIA, PRE_EXEC_HOOK);
+               parse_compound_arg(arg, "pre-exec-hook", &name, &val);
+               c = get_or_append_container(name);
+               free(name);
+               free(c->pre_exec_hook);
+               c->pre_exec_hook = val;
+       }
+       for (n = OPT_GIVEN(MICOFORIA, CAPDROP) - 1; n != ~0U; n--) {
+               cap_value_t cap_val;
+               arg = OPT_STRING_VAL_N(n, MICOFORIA, CAPDROP);
+               parse_compound_arg(arg, "capabilities", &name, &val);
+               c = get_or_append_container(name);
+               if (cap_from_name(val, &cap_val) < 0)
+                       die_errno("%s: invalid capability: %s", name, val);
+               c->capdrop = xrealloc(c->capdrop,
+                       ++c->num_capdrops * sizeof(cap_value_t));
+               c->capdrop[c->num_capdrops - 1] = cap_val;
+               free(name);
+               free(val);
+       }
+       for (n = 0; n < OPT_GIVEN(MICOFORIA, DEFAULT_CAPDROP); n++) {
+               cap_value_t cap_val;
+               arg = OPT_STRING_VAL_N(n, MICOFORIA, DEFAULT_CAPDROP);
+               /*
+                * This option has no compound argument. Report arg, not
+                * val, which is uninitialized or already freed here.
+                */
+               if (cap_from_name(arg, &cap_val) < 0)
+                       die_errno("invalid default capability: %s", arg);
+               default_capdrop = xrealloc(default_capdrop,
+                       ++num_default_capdrops * sizeof(cap_value_t));
+               default_capdrop[num_default_capdrops - 1] = cap_val;
+       }
+       for (n = OPT_GIVEN(MICOFORIA, TTY) - 1; n != ~0U; n--) {
+               uint32_t minor;
+               arg = OPT_STRING_VAL_N(n, MICOFORIA, TTY);
+               parse_compound_arg(arg, "tty", &name, &val);
+               c = get_or_append_container(name);
+               minor = atou32(val, "tty");
+               if (minor == 0)
+                       die("can not capture tty0");
+               c->tty = xrealloc(c->tty, ++c->num_ttys * sizeof(uint32_t));
+               c->tty[c->num_ttys - 1] = minor;
+               free(name);
+               free(val);
+       }
+       for (n = 0; n < OPT_GIVEN(MICOFORIA, DEFAULT_TTY); n++) {
+               uint32_t minor = OPT_UINT32_VAL_N(n, MICOFORIA, DEFAULT_TTY);
+               if (minor == 0)
+                       die("can not capture tty0");
+               default_tty = xrealloc(default_tty,
+                       ++num_default_ttys * sizeof(uint32_t));
+               default_tty[num_default_ttys - 1] = minor;
+       }
+
+       for (n = OPT_GIVEN(MICOFORIA, ROOT_DIRECTORY) - 1; n != ~0U ; n--) {
+               arg = OPT_STRING_VAL_N(n, MICOFORIA, ROOT_DIRECTORY);
+               parse_compound_arg(arg, "root-directory", &name, &val);
+               c = get_or_append_container(name);
+               free(name);
+               free(c->root_dir);
+               c->root_dir = val;
+       }
+       /* the defaults are only range-checked here, they are not stored */
+       u32 = OPT_UINT32_VAL(MICOFORIA, DEFAULT_CPU_CORES);
+       check_range(u32, 0, 65536, "default-cpu-cores");
+       for (n = OPT_GIVEN(MICOFORIA, CPU_CORES) - 1; n != ~0U ; n--) {
+               arg = OPT_STRING_VAL_N(n, MICOFORIA, CPU_CORES);
+               parse_compound_arg(arg, "cpu-cores", &name, &val);
+               c = get_or_append_container(name);
+               free(name);
+               u32 = atou32(val, "cpu-cores");
+               free(val);
+               check_range(u32, 0, 65536, "cpu-cores");
+               c->cpu_cores = u32;
+       }
+       u32 = OPT_UINT32_VAL(MICOFORIA, DEFAULT_MEMORY_LIMIT);
+       check_range(u32, 0, 1024 * 1024, "default-memory-limit");
+       for (n = OPT_GIVEN(MICOFORIA, MEMORY_LIMIT) - 1; n != ~0U ; n--) {
+               arg = OPT_STRING_VAL_N(n, MICOFORIA, MEMORY_LIMIT);
+               parse_compound_arg(arg, "memory-limit", &name, &val);
+               c = get_or_append_container(name);
+               free(name);
+               u32 = atou32(val, "memory-limit");
+               free(val);
+               check_range(u32, 0, 1024 * 1024, "memory-limit");
+               c->memory_limit = u32;
+       }
+       for (n = OPT_GIVEN(MICOFORIA, INIT) - 1; n != ~0U ; n--) {
+               arg = OPT_STRING_VAL_N(n, MICOFORIA, INIT);
+               parse_compound_arg(arg, "init", &name, &val);
+               c = get_or_append_container(name);
+               free(name);
+               free(c->init);
+               c->init = val;
+       }
+       for (n = 0; n < OPT_GIVEN(MICOFORIA, NET); n++) {
+               struct ifspec *ifspec;
+               arg = OPT_STRING_VAL_N(n, MICOFORIA, NET);
+               parse_compound_arg(arg, "net", &name, &val);
+               c = get_or_append_container(name);
+               free(name);
+               c->ifspec = xrealloc(c->ifspec,
+                       ++c->num_ifspecs * sizeof(struct ifspec));
+               ifspec = c->ifspec + c->num_ifspecs - 1;
+               parse_ifspec(val, &ifspec->bridge, ifspec->hwaddr);
+               free(val);
+       }
+
+       /*
+        * The list-valued cgroup options below are appended in two passes:
+        * first the values from index m on, then the values below m.
+        */
+       m = clo_given_counter[CLOGC_DEFAULT_CGROUP_DAC];
+       for (n = m; n < OPT_GIVEN(MICOFORIA, DEFAULT_CGROUP_DAC); n++) {
+               arg = OPT_STRING_VAL_N(n, MICOFORIA, DEFAULT_CGROUP_DAC);
+               append_dac_entry(arg, &default_dacl, &num_default_dac_entries);
+       }
+       for (n = 0; n < m; n++) {
+               arg = OPT_STRING_VAL_N(n, MICOFORIA, DEFAULT_CGROUP_DAC);
+               append_dac_entry(arg, &default_dacl, &num_default_dac_entries);
+       }
+       m = clo_given_counter[CLOGC_CGROUP_DAC];
+       for (n = m; n < OPT_GIVEN(MICOFORIA, CGROUP_DAC); n++) {
+               arg = OPT_STRING_VAL_N(n, MICOFORIA, CGROUP_DAC);
+               parse_compound_arg(arg, "cgroup-dac", &name, &val);
+               c = get_or_append_container(name);
+               free(name);
+               append_dac_entry(val, &c->dacl, &c->num_dac_entries);
+               free(val);
+       }
+       for (n = 0; n < m; n++) {
+               arg = OPT_STRING_VAL_N(n, MICOFORIA, CGROUP_DAC);
+               parse_compound_arg(arg, "cgroup-dac", &name, &val);
+               c = get_or_append_container(name);
+               free(name);
+               append_dac_entry(val, &c->dacl, &c->num_dac_entries);
+               free(val);
+       }
+
+       m = clo_given_counter[CLOGC_DEFAULT_IO_MAX];
+       for (n = m; n < OPT_GIVEN(MICOFORIA, DEFAULT_IO_MAX); n++) {
+               arg = OPT_STRING_VAL_N(n, MICOFORIA, DEFAULT_IO_MAX);
+               append_io_max_entry(arg, &default_io_max, &num_default_io_max_entries);
+       }
+       for (n = 0; n < m; n++) {
+               arg = OPT_STRING_VAL_N(n, MICOFORIA, DEFAULT_IO_MAX);
+               append_io_max_entry(arg, &default_io_max, &num_default_io_max_entries);
+       }
+       m = clo_given_counter[CLOGC_IO_MAX];
+       for (n = m; n < OPT_GIVEN(MICOFORIA, IO_MAX); n++) {
+               arg = OPT_STRING_VAL_N(n, MICOFORIA, IO_MAX);
+               parse_compound_arg(arg, "io-max", &name, &val);
+               c = get_or_append_container(name);
+               free(name);
+               append_io_max_entry(val, &c->io_max, &c->num_io_max_entries);
+               free(val);
+       }
+       for (n = 0; n < m; n++) {
+               arg = OPT_STRING_VAL_N(n, MICOFORIA, IO_MAX);
+               parse_compound_arg(arg, "io-max", &name, &val);
+               c = get_or_append_container(name);
+               free(name);
+               append_io_max_entry(val, &c->io_max, &c->num_io_max_entries);
+               free(val);
+       }
+
+       /* init default c->ifspec[] */
+       FOR_EACH_CONTAINER(c) {
+               if (c->num_ifspecs == 0) {
+                       const char *br = OPT_STRING_VAL(MICOFORIA, DEFAULT_BRIDGE);
+                       c->num_ifspecs = 1;
+                       c->ifspec = xmalloc(sizeof(struct ifspec));
+                       c->ifspec[0].bridge = xstrdup(br);
+                       /* all zero: no hardware address was specified */
+                       memset(c->ifspec[0].hwaddr, 0, 6);
+               }
+       }
+}
+
+/*
+ * Print the list of available subcommands to stdout.
+ *
+ * In verbose mode, one line per subcommand is printed which contains the
+ * name and the purpose text of the subcommand. Otherwise only the names are
+ * printed as a comma separated list which is wrapped whenever the running
+ * column count exceeds 70.
+ */
+static void show_subcommand_summary(bool verbose)
+{
+#define LSG_MICOFORIA_CMD(_name) #_name
+       static const char * const subcommand_names[] = {LSG_MICOFORIA_SUBCOMMANDS NULL};
+#undef LSG_MICOFORIA_CMD
+       const struct lls_command *cmd;
+       unsigned column;
+       int k;
+
+       printf("Available subcommands:\n");
+       if (verbose) {
+               /* The iteration starts at command number one. */
+               for (k = 1; (cmd = lls_cmd(k, micoforia_suite)) != NULL; k++) {
+                       const char *purpose = lls_purpose(cmd);
+                       const char *name = lls_command_name(cmd);
+                       printf("%-12s%s\n", name, purpose);
+               }
+               return;
+       }
+       column = 8; /* the leading tab is accounted as eight columns */
+       printf("\t");
+       for (k = 0; k < LSG_NUM_MICOFORIA_SUBCOMMANDS; k++) {
+               if (k > 0)
+                       column += printf(", ");
+               if (column > 70) { /* start a new, tab-indented line */
+                       printf("\n\t");
+                       column = 8;
+               }
+               column += printf("%s", subcommand_names[k]);
+       }
+       printf("\n");
+}
+
+const char *GET_VERSION(void);
+
+/*
+ * Deal with the options and situations which make the program print some
+ * text and exit successfully: --version, --detailed-help, --help, and the
+ * case where no non-option argument was given (in which case the verbose
+ * subcommand summary is shown). If none of these applies, the function
+ * returns without printing anything.
+ */
+static void handle_version_and_help(void)
+{
+       char *help;
+
+       if (OPT_GIVEN(MICOFORIA, VERSION)) {
+               printf(PACKAGE " %s\n"
+                       "Copyright (C) " COPYRIGHT_YEAR " " AUTHOR ".\n"
+                       "License: " LICENSE " <" LICENSE_URL ">.\n"
+                       "This is free software: you are free to change and redistribute it.\n"
+                       "There is NO WARRANTY, to the extent permitted by law.\n"
+                       "\n"
+                       "Web page: " URL "\n"
+                       "Clone URL: " CLONE_URL "\n"
+                       "Gitweb: " GITWEB_URL "\n"
+                       "Author's Home Page: " HOME_URL "\n"
+                       "Send feedback to: " AUTHOR " <" EMAIL ">\n"
+                       ,
+                       GET_VERSION()
+               );
+               exit(EXIT_SUCCESS);
+       }
+       /* The detailed help takes precedence over the short help. */
+       if (OPT_GIVEN(MICOFORIA, DETAILED_HELP)) {
+               help = lls_long_help(CMD_PTR(MICOFORIA));
+       } else if (OPT_GIVEN(MICOFORIA, HELP)) {
+               help = lls_short_help(CMD_PTR(MICOFORIA));
+       } else {
+               if (lls_num_inputs(lpr) != 0)
+                       return; /* a subcommand was given, carry on */
+               show_subcommand_summary(true /* verbose */);
+               exit(EXIT_SUCCESS);
+       }
+       printf("%s\n", help);
+       free(help);
+       exit(EXIT_SUCCESS);
+}
+
+/*
+ * Return the path of the config file as a string which must be freed by the
+ * caller. The argument to the config-file option is used if that option was
+ * given. Otherwise the path is .micoforiarc in the home directory found in
+ * the password database entry of the calling user, or in /root if there is
+ * no such entry.
+ */
+static char *get_config_file_path(void)
+{
+       const struct passwd *entry;
+
+       if (OPT_GIVEN(MICOFORIA, CONFIG_FILE))
+               return xstrdup(OPT_STRING_VAL(MICOFORIA, CONFIG_FILE));
+       entry = getpwuid(getuid());
+       if (!entry)
+               return msg("%s/.micoforiarc", "/root");
+       return msg("%s/.micoforiarc", entry->pw_dir);
+}
+
+/*
+ * Parse the command line and merge in the options of the config file.
+ *
+ * The argument vector given by argc and argv is parsed for the command cmd
+ * and the parse result is stored in *lprp. If the config file exists and is
+ * not empty, its contents are converted to an argument vector for the same
+ * command, parsed, and merged with the command line parse result. In this
+ * case both partial parse results are freed and *lprp is replaced by the
+ * merged parse result.
+ *
+ * Errors are reported through die_errno() and die_lopsub().
+ */
+static void parse_options(int argc, char **argv, const struct lls_command *cmd,
+               struct lls_parse_result **lprp)
+{
+       int ret, fd = -1;
+       char *config_file;
+       struct stat statbuf;
+       void *map;
+       size_t sz;
+       int cf_argc;
+       char **cf_argv, *errctx = NULL;
+       const char *subcmd_name;
+       struct lls_parse_result *merged_lpr, *cf_lpr;
+
+       ret = lls_parse(argc, argv, cmd, lprp, &errctx);
+       if (ret < 0)
+               die_lopsub(ret, &errctx);
+       handle_version_and_help();
+       /*
+        * Remember how many times these options were given on the command
+        * line. This has to happen before the config file options are merged
+        * in because the merged parse result no longer tells them apart.
+        */
+       clo_given_counter[CLOGC_DEFAULT_CGROUP_DAC] = OPT_GIVEN(MICOFORIA,
+               DEFAULT_CGROUP_DAC);
+       clo_given_counter[CLOGC_CGROUP_DAC] = OPT_GIVEN(MICOFORIA, CGROUP_DAC);
+       clo_given_counter[CLOGC_DEFAULT_IO_MAX] =
+               OPT_GIVEN(MICOFORIA, DEFAULT_IO_MAX);
+       clo_given_counter[CLOGC_IO_MAX] = OPT_GIVEN(MICOFORIA, IO_MAX);
+       config_file = get_config_file_path();
+       ret = open(config_file, O_RDONLY);
+       if (ret < 0) {
+               /*
+                * A missing config file is only an error if the path was
+                * specified explicitly with the config-file option.
+                */
+               if (errno != ENOENT || OPT_GIVEN(MICOFORIA, CONFIG_FILE))
+                       die_errno("can not open config file %s", config_file);
+               /* no config file -- nothing to do */
+               ret = 0;
+               goto success;
+       }
+       fd = ret;
+       ret = fstat(fd, &statbuf);
+       if (ret < 0)
+               die_errno("failed to stat config file %s", config_file);
+       sz = statbuf.st_size;
+       if (sz == 0) { /* config file is empty -- nothing to do */
+               ret = 0;
+               goto success;
+       }
+       map = mmap(NULL, sz, PROT_READ, MAP_PRIVATE, fd, 0);
+       if (map == MAP_FAILED)
+               die_errno("failed to mmap config file %s", config_file);
+       /* NULL is passed as the subcommand name for the micoforia command itself */
+       subcmd_name = (cmd == CMD_PTR(MICOFORIA))? NULL : lls_command_name(cmd);
+       ret = lls_convert_config(map, sz, subcmd_name, &cf_argv,
+               &errctx);
+       /* the mapping is no longer needed once the argument vector exists */
+       munmap(map, sz);
+       if (ret < 0) {
+               ERROR_LOG("failed to convert config file %s\n", config_file);
+               die_lopsub(ret, &errctx);
+       }
+       /* a non-negative return value is the length of cf_argv */
+       cf_argc = ret;
+       ret = lls_parse(cf_argc, cf_argv, cmd, &cf_lpr, &errctx);
+       lls_free_argv(cf_argv);
+       if (ret < 0)
+               die_lopsub(ret, &errctx);
+       /* command line options override config file options */
+       ret = lls_merge(*lprp, cf_lpr, cmd, &merged_lpr, &errctx);
+       if (ret < 0)
+               die_lopsub(ret, &errctx);
+       lls_free_parse_result(cf_lpr, cmd);
+       lls_free_parse_result(*lprp, cmd);
+       *lprp = merged_lpr;
+success:
+       /* fd is still -1 if the config file could not be opened */
+       if (fd >= 0)
+               close(fd);
+       free(config_file);
+}
+
+/*
+ * Return the pre-start hook of the container, falling back to the value of
+ * the default-pre-start-hook option if the container has no hook of its own.
+ */
+static const char *get_pre_start_hook(const struct container *c)
+{
+       return c->pre_start_hook? c->pre_start_hook :
+               OPT_STRING_VAL(MICOFORIA, DEFAULT_PRE_START_HOOK);
+}
+
+/*
+ * Return the pre-exec hook of the container, falling back to the value of
+ * the default-pre-exec-hook option if the container has no hook of its own.
+ */
+static const char *get_pre_exec_hook(const struct container *c)
+{
+       return c->pre_exec_hook? c->pre_exec_hook :
+               OPT_STRING_VAL(MICOFORIA, DEFAULT_PRE_EXEC_HOOK);
+}
+
+/*
+ * Return the root directory of the container as a string which must be
+ * freed by the caller. If the container has no root directory of its own,
+ * the result is the container name below the default root prefix.
+ */
+static char *get_root_dir(const struct container *c)
+{
+       const char *prefix;
+
+       if (c->root_dir != NULL)
+               return xstrdup(c->root_dir);
+       prefix = OPT_STRING_VAL(MICOFORIA, DEFAULT_ROOT_PREFIX);
+       return msg("%s/%s", prefix, c->name);
+}
+
+/*
+ * Build a space separated list which contains one word of the form
+ * bridge:xx:xx:xx:xx:xx:xx for each interface specification of the
+ * container. The result must be freed by the caller.
+ */
+static char *get_ifspec_string(const struct container *c)
+{
+       char *result = NULL;
+       unsigned k;
+
+       assert(c->num_ifspecs > 0);
+       for (k = 0; k < c->num_ifspecs; k++) {
+               uint8_t *hw = c->ifspec[k].hwaddr;
+               char *longer;
+
+               /* Prepend what we have so far, plus a separating space. */
+               longer = msg("%s%s%s:%02x:%02x:%02x:%02x:%02x:%02x",
+                       result? result : "",
+                       result? " " : "",
+                       c->ifspec[k].bridge,
+                       hw[0], hw[1], hw[2], hw[3], hw[4], hw[5]
+               );
+               free(result);
+               result = longer;
+       }
+       return result;
+}
+
+/*
+ * Return the name of one end of the network device pair for the interface
+ * specification with the given index. The peer end carries the suffix "-g".
+ * If the container has more than one interface specification, the bridge
+ * name becomes part of the interface name. The result must be freed by the
+ * caller.
+ */
+static char *interface_name(const struct container *c, unsigned idx, bool peer)
+{
+       assert(idx < c->num_ifspecs);
+       if (c->num_ifspecs != 1) {
+               const char *bridge = c->ifspec[idx].bridge;
+
+               if (peer)
+                       return msg("%s-%s-g", c->name, bridge);
+               return msg("%s-%s", c->name, bridge);
+       }
+       if (peer)
+               return msg("%s-g", c->name);
+       return xstrdup(c->name);
+}
+
+/* Export the root directory of the container as MICOFORIA_ROOT_DIR. */
+static void set_m7a_root_dir_env(const struct container *c)
+{
+       char *dir = get_root_dir(c);
+
+       DEBUG_LOG("root dir: %s\n", dir);
+       setenv("MICOFORIA_ROOT_DIR", dir, 1);
+       free(dir);
+}
+
+/*
+ * Run the pre-start hook of the container through /bin/sh -c.
+ *
+ * The environment variables MICOFORIA_CONTAINER_NAME, MICOFORIA_ROOT_DIR
+ * and MICOFORIA_IFSPECS are set for the duration of the hook and are removed
+ * again afterwards. The return value is the result of xexec().
+ */
+static bool run_pre_start_hook(const struct container *c)
+{
+       char *hook = xstrdup(get_pre_start_hook(c));
+       char *args[] = {"/bin/sh", "-c", hook, NULL};
+       char *specs;
+       bool ok;
+
+       setenv("MICOFORIA_CONTAINER_NAME", c->name, 1);
+       set_m7a_root_dir_env(c);
+       specs = get_ifspec_string(c);
+       DEBUG_LOG("ifspecs: %s\n", specs);
+       setenv("MICOFORIA_IFSPECS", specs, 1);
+       free(specs);
+
+       INFO_LOG("running pre-start hook %s\n", hook);
+       ok = xexec(args, NULL);
+       free(hook);
+       if (!ok)
+               ERROR_LOG("pre-start hook failed\n");
+       unsetenv("MICOFORIA_CONTAINER_NAME");
+       unsetenv("MICOFORIA_IFSPECS");
+       unsetenv("MICOFORIA_ROOT_DIR");
+       return ok;
+}
+
+/*
+ * Run the pre-exec hook of the container through /bin/sh -c, with
+ * MICOFORIA_ROOT_DIR set in the environment for the duration of the hook.
+ * A failing hook is reported through die().
+ */
+static void run_pre_exec_hook(const struct container *c)
+{
+       char *hook = xstrdup(get_pre_exec_hook(c));
+       char *args[] = {"/bin/sh", "-c", hook, NULL};
+       bool ok;
+
+       INFO_LOG("/bin/sh -c '%s'\n", hook);
+       set_m7a_root_dir_env(c);
+       ok = xexec(args, NULL);
+       if (!ok)
+               die("%s: pre-exec hook failed", c->name);
+       free(hook);
+       unsetenv("MICOFORIA_ROOT_DIR");
+}
+
+/*
+ * Open the existing file at the given path for writing and write the string
+ * txt to it with a single write(2) call. Failure to open the file and
+ * incomplete writes are reported through die_errno().
+ */
+static void write_cgroup(const char *path, const char *txt)
+{
+       size_t len = strlen(txt);
+       int fd = open(path, O_WRONLY);
+
+       if (fd < 0)
+               die_errno("open %s", path);
+       if (write(fd, txt, len) != len)
+               die_errno("could not write to %s", path);
+       close(fd);
+}
+
+/*
+ * Look up the device access control list which applies to the container.
+ *
+ * The list of the container is preferred over the global default list. If
+ * both are empty, the built-in list below is used. A pointer to the list is
+ * stored in *result and the number of entries is returned.
+ */
+static unsigned get_dacl(const struct container *c, char ***result)
+{
+       static char *dflt[] = {
+               "da", /* deny access to all devices except the ones below */
+               "ac 1:3 rwm", /* null */
+               "ac 1:5 rwm", /* zero */
+               "ac 1:7 rwm", /* full */
+               "ac 1:8 rwm", /* random */
+               "ac 1:9 rwm", /* urandom */
+               "ac 4:* rwm", /* tty?* */
+               "ac 5:0 rwm", /* tty */
+               "ac 5:2 rwm", /* ptmx */
+               "ac 136:* rwm", /* pts */
+       };
+       unsigned count;
+
+       if (c->num_dac_entries > 0) {
+               *result = c->dacl;
+               count = c->num_dac_entries;
+       } else if (num_default_dac_entries > 0) {
+               *result = default_dacl;
+               count = num_default_dac_entries;
+       } else {
+               *result = dflt;
+               count = ARRAY_SIZE(dflt);
+       }
+       return count;
+}
+
+/*
+ * Create the cgroup of the container below /var/cgroup/micoforia, write the
+ * device access control list to it and move the calling process into the
+ * new cgroup.
+ *
+ * The first character of each list entry selects the file to write to:
+ * entries which start with 'a' go to devices.allow, all others go to
+ * devices.deny. The remainder of the entry, followed by a newline, is the
+ * text which is written. Errors are reported through die_errno().
+ */
+static void apply_dacl(const struct container *c)
+{
+       char **dacl;
+       unsigned n, num_entries;
+       char *m7a_dir, *container_dir, *allow, *deny, *procs, *txt;
+       int fd, allow_fd, deny_fd;
+       size_t sz;
+
+       m7a_dir = msg("/var/cgroup/micoforia");
+       container_dir = msg("%s/%s", m7a_dir, c->name);
+       allow = msg("%s/devices.allow", container_dir);
+       deny = msg("%s/devices.deny", container_dir);
+       procs = msg("%s/cgroup.procs", container_dir);
+
+       /* Both directories may exist already. */
+       if (mkdir(m7a_dir, 0777) < 0 && errno != EEXIST)
+               die_errno("mkdir %s", m7a_dir);
+       free(m7a_dir);
+       if (mkdir(container_dir, 0777) < 0 && errno != EEXIST)
+               die_errno("mkdir %s", container_dir);
+       free(container_dir);
+       if ((allow_fd = open(allow, O_WRONLY)) < 0)
+               die_errno("open %s", allow);
+       free(allow);
+       if ((deny_fd = open(deny, O_WRONLY)) < 0)
+               die_errno("open %s", deny);
+       free(deny);
+
+       num_entries = get_dacl(c, &dacl);
+       INFO_LOG("applying %u entr%s\n", num_entries, num_entries == 1?
+               "y" : "ies");
+       for (n = 0; n < num_entries; n++) {
+               char *entry = dacl[n];
+               bool is_allow = entry[0] == 'a';
+
+               DEBUG_LOG("dac entry #%u: %s %s\n", n, is_allow?
+                       "allow" : "deny", entry + 1);
+               txt = msg("%s\n", entry + 1);
+               sz = strlen(txt);
+               fd = is_allow? allow_fd : deny_fd;
+               if (write(fd, txt, sz) != sz)
+                       die_errno("could not write to cgroup devices.%s file",
+                               is_allow? "allow" : "deny");
+               free(txt);
+       }
+       close(allow_fd);
+       close(deny_fd);
+       txt = msg("%u\n", (unsigned)getpid());
+       write_cgroup(procs, txt);
+       free(txt);
+       free(procs); /* this string used to be leaked */
+}
+
+/*
+ * Check that both cgroup hierarchies are available and enable the cpu,
+ * memory and io controllers for the micoforia subtree of the v2 hierarchy.
+ *
+ * The v1 hierarchy is expected at /var/cgroup and the v2 hierarchy at
+ * /var/cgroup2. The controllers are enabled in the subtree control file of
+ * the v2 root and of the micoforia directory, which is created if it does
+ * not exist. Errors are reported through die() and die_errno().
+ */
+static void cgroup_init(void)
+{
+       const char controllers[] = "+cpu +memory +io\n";
+       char *m7a_dir, *ctl;
+
+       if (access("/var/cgroup/cgroup.clone_children", F_OK) < 0)
+               die("cgroup v1 not mounted at /var/cgroup/");
+       /* This check is about the v2 hierarchy, so say so in the message. */
+       if (access("/var/cgroup2/cgroup.subtree_control", F_OK) < 0)
+               die("cgroup v2 not mounted at /var/cgroup2/");
+       write_cgroup("/var/cgroup2/cgroup.subtree_control", controllers);
+       m7a_dir = msg("/var/cgroup2/micoforia");
+       if (mkdir(m7a_dir, 0777) < 0 && errno != EEXIST)
+               die_errno("mkdir %s", m7a_dir);
+       ctl = msg("%s/cgroup.subtree_control", m7a_dir);
+       free(m7a_dir);
+       write_cgroup(ctl, controllers);
+       free(ctl);
+}
+
+/*
+ * Create the v2 cgroup of the container below /var/cgroup2/micoforia (it is
+ * not an error if the directory exists already) and move the calling process
+ * into it by writing its PID to the cgroup.procs file.
+ */
+static void create_cgroup_v2(const struct container *c)
+{
+       char *ctl, *txt, *dir = msg("/var/cgroup2/micoforia/%s", c->name);
+
+       if (mkdir(dir, 0777) < 0 && errno != EEXIST)
+               die_errno("mkdir %s", dir);
+       ctl = msg("%s/cgroup.procs", dir);
+       free(dir);
+       /*
+        * Let msg() allocate the text rather than formatting into a fixed
+        * size buffer: ten bytes are too few for a ten digit number plus the
+        * newline and the terminating zero byte.
+        */
+       txt = msg("%u\n", (unsigned)getpid());
+       write_cgroup(ctl, txt);
+       free(txt);
+       free(ctl);
+}
+
+/*
+ * Return the CPU core limit of the container. The value ~0U in c->cpu_cores
+ * means that the container has no limit of its own, in which case the value
+ * of the default-cpu-cores option is returned.
+ */
+static unsigned get_cpu_cores(const struct container *c)
+{
+       if (c->cpu_cores == ~0U)
+               return OPT_UINT32_VAL(MICOFORIA, DEFAULT_CPU_CORES);
+       return c->cpu_cores;
+}
+
+/*
+ * Write the CPU limit of the container to the cpu.max file of its v2 cgroup.
+ *
+ * The limit is written as a quota of one million per core and a period of
+ * one million. Nothing is written if the core count is zero, which means
+ * unlimited.
+ */
+static void apply_cpu_limit(const struct container *c)
+{
+       char *str, *ctl;
+       unsigned cores = get_cpu_cores(c);
+
+       if (cores == 0) /* unlimited */
+               return;
+       assert(cores != ~0U);
+       INFO_LOG("%u core%s\n", cores, cores == 1? "" : "s");
+       ctl = msg("/var/cgroup2/micoforia/%s/cpu.max", c->name);
+       /* Compute the quota in 64 bit so that large core counts do not wrap. */
+       str = msg("%llu 1000000\n", 1000000LLU * cores);
+       write_cgroup(ctl, str);
+       free(ctl);
+       free(str);
+}
+
+/*
+ * Return the memory limit of the container. The value ~0U in c->memory_limit
+ * means that the container has no limit of its own, in which case the value
+ * of the default-memory-limit option is returned.
+ */
+static unsigned get_memory_limit(const struct container *c)
+{
+       if (c->memory_limit == ~0U)
+               return OPT_UINT32_VAL(MICOFORIA, DEFAULT_MEMORY_LIMIT);
+       return c->memory_limit;
+}
+
+/*
+ * Write the memory limit of the container to the memory.high file of its v2
+ * cgroup. The limit is multiplied by 1024^3 before it is written. Nothing is
+ * written if the limit is zero, which means unlimited.
+ */
+static void apply_memory_limit(const struct container *c)
+{
+       const unsigned long long gigabyte = 1024LLU * 1024LLU * 1024LLU;
+       unsigned limit = get_memory_limit(c);
+       char *path, *value;
+
+       if (limit == 0) /* unlimited */
+               return;
+       assert(limit != ~0U);
+       INFO_LOG("%uG\n", limit);
+       path = msg("/var/cgroup2/micoforia/%s/memory.high", c->name);
+       value = msg("%llu\n", gigabyte * limit);
+       write_cgroup(path, value);
+       free(path);
+       free(value);
+}
+
+/*
+ * Look up the I/O limit specifications which apply to the container.
+ *
+ * The entries of the container are preferred over the global default
+ * entries. A pointer to the selected array is stored in *result and the
+ * number of entries is returned. If there are no entries at all, *result is
+ * set to NULL and the function returns zero.
+ */
+static unsigned get_iospecs(const struct container *c, char ***result)
+{
+       if (c->num_io_max_entries > 0) {
+               /* The I/O limits, not the device access control list. */
+               *result = c->io_max;
+               return c->num_io_max_entries;
+       }
+       if (num_default_io_max_entries > 0) {
+               *result = default_io_max;
+               return num_default_io_max_entries;
+       }
+       *result = NULL;
+       return 0;
+}
+
+/*
+ * Write each I/O limit specification which applies to the container to the
+ * io.max file of its v2 cgroup, one write per specification. Nothing is done
+ * if there are no specifications.
+ */
+static void apply_io_limit(const struct container *c)
+{
+       char **specs;
+       unsigned count = get_iospecs(c, &specs);
+       unsigned k = 0;
+       char *path;
+
+       if (count == 0)
+               return;
+       INFO_LOG("%u entries\n", count);
+       path = msg("/var/cgroup2/micoforia/%s/io.max", c->name);
+       while (k < count)
+               write_cgroup(path, specs[k++]);
+       free(path);
+}
+
+/*
+ * Call remove_subdirs_recursively() on the cgroup directory of the
+ * container, first in the hierarchy at /var/cgroup, then in the one at
+ * /var/cgroup2.
+ */
+static void cgroup_cleanup(const struct container *c)
+{
+       char *v1 = msg("/var/cgroup/micoforia/%s", c->name);
+       char *v2 = msg("/var/cgroup2/micoforia/%s", c->name);
+
+       remove_subdirs_recursively(v1);
+       free(v1);
+       remove_subdirs_recursively(v2);
+       free(v2);
+}
+
+/*
+ * Bring up the loopback device and create one network device pair for each
+ * interface specification of the container.
+ *
+ * For each pair, the hardware address is assigned to the peer device while
+ * the other device is attached to the bridge and brought up. A failure to
+ * bring up the loopback device only results in a warning. The function
+ * returns false as soon as one of the other steps fails.
+ */
+static bool setup_network(const struct container *c)
+{
+       unsigned k;
+
+       if (!link_up("lo"))
+               WARNING_LOG("could not set establish loopback link\n");
+       for (k = 0; k < c->num_ifspecs; k++) {
+               char *host = interface_name(c, k, false);
+               char *guest = interface_name(c, k, true);
+               bool ok;
+
+               link_del(host); /* ignore errors */
+               /* Each step is only attempted if the previous one worked. */
+               ok = create_veth_device_pair(host, guest)
+                       && set_hwaddr(guest, c->ifspec[k].hwaddr)
+                       && attach_to_bridge(host, c->ifspec[k].bridge)
+                       && link_up(host);
+               free(host);
+               free(guest);
+               if (!ok)
+                       return false;
+       }
+       return true;
+}
+
+/*
+ * Prepare the terminal which is referred to by fd.
+ *
+ * Echoing, signal generating characters and canonical mode are turned off,
+ * and reads are set up to return as soon as one byte is available. Finally
+ * the window size of stdin is copied to the terminal. Nothing is done if fd
+ * does not refer to a terminal.
+ */
+static void setup_termios(int fd)
+{
+       struct termios settings;
+       struct winsize size; /* see ioctl_tty(2) */
+
+       if (!isatty(fd))
+               return;
+       if (tcgetattr(fd, &settings) != 0) {
+               ERROR_LOG("tcgetattr: %m\n");
+               return;
+       }
+       settings.c_lflag &= ~(ECHO | ISIG | ICANON);
+       settings.c_cc[VMIN] = 1;
+       settings.c_cc[VTIME] = 0;
+       if (tcsetattr(fd, TCSAFLUSH, &settings) < 0)
+               ERROR_LOG("tcsetattr: %m\n");
+       /* Without a window size for stdin there is nothing to copy. */
+       if (ioctl(STDIN_FILENO, TIOCGWINSZ, &size) < 0)
+               return;
+       ioctl(fd, TIOCSWINSZ, &size);
+}
+
+/* Describes one character device node to be created in the container. */
+struct device_node_info {
+       /* device numbers, combined with makedev() when the node is created */
+       unsigned major, minor;
+       /* permission bits which are applied with chmod() after creation */
+       mode_t mode;
+       /* file name of the node, relative to the device directory */
+       const char *name;
+};
+
+/*
+ * Create the character device nodes of the table below in the directory
+ * cr->dev and set their permissions. A failing mknod(2) is reported through
+ * die_errno() while the return value of chmod(2) is ignored.
+ */
+static void create_standard_device_nodes(struct container_runtime *cr)
+{
+       const struct device_node_info devices[] = {
+               {.major = 1, .minor = 3, .mode = 0666, .name = "null"},
+               {.major = 1, .minor = 5, .mode = 0666, .name = "zero"},
+               {.major = 1, .minor = 7, .mode = 0666, .name = "full"},
+               {.major = 1, .minor = 8, .mode = 0666, .name = "random"},
+               {.major = 1, .minor = 9, .mode = 0666, .name = "urandom"},
+               {.major = 4, .minor = 0, .mode = 0620, .name = "tty0"},
+               {.major = 5, .minor = 1, .mode = 0600, .name = "console"},
+               {.major = 5, .minor = 2, .mode = 0666, .name = "ptmx"},
+       };
+       const struct device_node_info *d, *end = devices + ARRAY_SIZE(devices);
+
+       for (d = devices; d < end; d++) {
+               dev_t dev = makedev(d->major, d->minor);
+               char *path = msg("%s/%s", cr->dev, d->name);
+
+               if (mknod(path, S_IFCHR, dev) < 0)
+                       die_errno("mknod %s", d->name);
+               chmod(path, d->mode);
+               free(path);
+       }
+}
+
+/*
+ * Set up the device directory of the container.
+ *
+ * A tmpfs is mounted at cr->dev and populated with the standard device
+ * nodes. For each forwarded tty, a device node with major number 4 is
+ * created and the terminal which corresponds to the slave file descriptor
+ * is bind mounted on top of it. The terminal of the first slave is also
+ * bind mounted on the console device. Errors are reported through die().
+ */
+static void init_console(struct container_runtime *cr)
+{
+       const unsigned long bind_flags = MS_BIND | MS_PRIVATE;
+       char *path;
+       unsigned k;
+
+       if (mount(NULL, cr->dev, "tmpfs", 0, "size=500000,mode=755") < 0)
+               die("mount tmpfs at %s: %m", cr->dev);
+       create_standard_device_nodes(cr);
+       for (k = 0; k < cr->num_ttys; k++) {
+               path = msg("%s/tty%u", cr->dev, cr->tty[k]);
+               /* the node might have been created as a standard node */
+               unlink(path);
+               if (mknod(path, S_IFCHR, makedev(4, cr->tty[k])) < 0)
+                       die("mknod %s: %m", path);
+               chmod(path, 0660);
+               setup_termios(cr->slave[k]);
+               INFO_LOG("bind mounting %s -> %s\n", ttyname(cr->slave[k]), path);
+               if (mount(ttyname(cr->slave[k]), path, "none", bind_flags,
+                               NULL) < 0)
+                       die("failed to bind mount %s: %m\n", path);
+               free(path);
+       }
+       path = msg("%s/console", cr->dev);
+       if (mount(ttyname(cr->slave[0]), path, "none", bind_flags, NULL) < 0)
+               die("failed to bind mount %s: %m\n", path);
+       free(path);
+}
+
+/*
+ * Lazily unmount the bind mounts of all forwarded ttys and of the console
+ * device below cr->dev.
+ *
+ * These umounts fail if the container shutdown already umounted the bind
+ * mounted devices. This is not fatal, so log only with low severity.
+ */
+static void shutdown_console(struct container_runtime *cr)
+{
+       unsigned n;
+       char *console;
+
+       for (n = 0; n < cr->num_ttys; n++) {
+               /*
+                * Use the minor number of the nth forwarded tty to build
+                * the path. A fixed tty1 would umount the same path over and
+                * over again and miss all other bind mounts.
+                */
+               char *tty = msg("%s/tty%u", cr->dev, cr->tty[n]);
+               if (umount2(tty, MNT_DETACH) < 0)
+                       DEBUG_LOG("umount %s: %m\n", tty);
+               free(tty);
+       }
+       console = msg("%s/console", cr->dev);
+       if (umount2(console, MNT_DETACH) < 0)
+               DEBUG_LOG("umount %s: %m\n", console);
+       free(console);
+}
+
+/*
+ * Return the socket path for the container with the given name. The result
+ * must be freed by the caller.
+ */
+static char *get_socket_path(const char *container_name)
+{
+       char *path = msg("micoforia/%s", container_name);
+
+       return path;
+}
+
+/*
+ * Read from the client and throw away whatever arrives. If the read returns
+ * end of file or an error, the file descriptor is closed and invalidated by
+ * setting it to -1.
+ */
+static void dispatch_client(int *client)
+{
+       char scratch[1024];
+       ssize_t got = read(*client, scratch, sizeof(scratch));
+
+       if (got > 0)
+               return;
+       NOTICE_LOG("detaching client on fd %d\n", *client);
+       close(*client);
+       *client = -1;
+}
+
+/*
+ * Receive one request on the socket of the container and respond to it.
+ *
+ * Three requests are understood: "init_pid", "attach N" and "force-attach N",
+ * where N is the minor number of a forwarded tty. The reply to init_pid
+ * consists of a zero byte followed by cr->init_pid. For the attach requests,
+ * the master file descriptor of the tty is passed to the client and the
+ * client file descriptor is stored in cr->client[]. All error replies start
+ * with the byte 1, followed by the name of an error code.
+ *
+ * The client file descriptor is closed on return unless the client was
+ * attached to a tty.
+ */
+static void dispatch_socket_request(struct container_runtime *cr)
+{
+       uid_t uid;
+       char buf[32];
+       int cfd;
+       uint32_t minor;
+       unsigned n;
+       bool force;
+
+       /* The receive size is one less than the buffer size, so the zeroed
+        * buffer stays zero-terminated. */
+       memset(buf, 0, sizeof(buf));
+       if (!recv_cred_buffer(cr->socket_fd, buf, sizeof(buf) - 1, &cfd, &uid))
+               return;
+       /* Only requests which come from our own user ID are accepted. */
+       if (uid != getuid()) {
+               const char msg[] = "\1EACCES";
+               /* sizeof() includes the terminating zero byte */
+               send(cfd, msg, sizeof(msg), MSG_DONTWAIT);
+               NOTICE_LOG("access denied for uid %d\n", (int)uid);
+               goto out;
+       }
+       if (strcmp(buf, "init_pid") == 0) {
+               /* success marker, followed by the raw bytes of the PID */
+               buf[0] = '\0';
+               memcpy(buf + 1, &cr->init_pid, sizeof(int));
+               send(cfd, buf, 1 + sizeof(int), MSG_DONTWAIT);
+               goto out;
+       }
+       if (sscanf(buf, "attach %u", &minor) == 1) {
+               force = false;
+       } else if (sscanf(buf, "force-attach %u", &minor) == 1) {
+               force = true;
+       } else {
+               const char msg[] = "\1EINVAL";
+               send(cfd, msg, sizeof(msg), MSG_DONTWAIT);
+               NOTICE_LOG("invalid request: %s\n", buf);
+               goto out;
+       }
+       /* Find the index of the forwarded tty with the requested minor. */
+       for (n = 0; n < cr->num_ttys; n++) {
+               INFO_LOG("n: %u, tty[n]: %u\n", n, cr->tty[n]);
+               if (cr->tty[n] == minor)
+                       break;
+       }
+       if (n == cr->num_ttys) {
+               const char msg[] = "\1ENOTTY";
+               send(cfd, msg, sizeof(msg), MSG_DONTWAIT);
+               NOTICE_LOG("tty%u is not being forwarded\n", minor);
+               goto out;
+       }
+       /*
+        * If another client is attached to this tty already, it is only
+        * kicked out on force-attach requests.
+        */
+       if (cr->client[n] >= 0) {
+               if (force) {
+                       close(cr->client[n]);
+                       cr->client[n] = -1;
+               } else {
+                       const char msg[] = "\1EBUSY";
+                       send(cfd, msg, sizeof(msg), MSG_DONTWAIT);
+                       ERROR_LOG("tty%u is already in use\n", minor);
+                       goto out;
+               }
+       }
+       if (!pass_fd(cr->master[n], cfd)) {
+               ERROR_LOG("could not pass master fd\n");
+               goto out;
+       }
+       NOTICE_LOG("attached client on fd %d to tty%u\n", cfd, minor);
+       /* Keep the connection open to notice when the client goes away. */
+       cr->client[n] = cfd;
+       return;
+out:
+       close(cfd);
+}
+
+/*
+ * Read up to 1024 bytes from src and write them to dst with a single write.
+ * The read is restarted if it gets interrupted by a signal. If dst is
+ * negative, the data is discarded.
+ *
+ * The function returns false on end of file, on read or write errors, and
+ * on short writes. Otherwise it returns true.
+ */
+static bool copy(int src, int dst)
+{
+       char buf[1024];
+       ssize_t nread, nwritten;
+
+       do
+               nread = read(src, buf, sizeof(buf));
+       while (nread < 0 && errno == EINTR);
+       if (nread < 0)
+               DEBUG_LOG("read from fd %d: %m\n", src);
+       if (nread <= 0)
+               return false;
+       if (dst < 0) /* nowhere to write to */
+               return true;
+       nwritten = write(dst, buf, nread);
+       if (nwritten < 0) {
+               DEBUG_LOG("write to fd %d: %m\n", dst);
+               return false;
+       }
+       if (nwritten != nread) {
+               DEBUG_LOG("short write to fd %d\n", dst);
+               return false;
+       }
+       return true;
+}
+
+/*
+ * The main loop of the parent process.
+ *
+ * The loop watches the signal pipe, the request socket, and for each
+ * forwarded tty either the attached client or, if no client is attached,
+ * the master side of the tty. In foreground mode stdin is watched as well
+ * and is copied to the master of the first tty.
+ *
+ * The function returns only when the process receives SIGCHLD and the child
+ * has in fact terminated. In this case the return value is 0 for success, 1
+ * for failure, and 2 if the child's exit code indicates a reboot request.
+ * Other signals are pushed down to the child process.
+ */
+static int parent_loop(pid_t pid, const struct container *c,
+               struct container_runtime *cr)
+{
+       unsigned n;
+
+       init_signal_handling();
+       for (;;) {
+               int sig, max_fileno = 0;
+               fd_set fds;
+
+               FD_ZERO(&fds);
+               if (OPT_GIVEN(START, FOREGROUND)) {
+                       FD_SET(STDIN_FILENO, &fds);
+                       if (STDIN_FILENO > max_fileno)
+                               max_fileno = STDIN_FILENO;
+               }
+               FD_SET(signal_pipe[0], &fds);
+               if (signal_pipe[0] > max_fileno)
+                       max_fileno = signal_pipe[0];
+               FD_SET(cr->socket_fd, &fds);
+               if (cr->socket_fd > max_fileno)
+                       max_fileno = cr->socket_fd;
+               for (n = 0; n < cr->num_ttys; n++) {
+                       if (cr->client[n] >= 0) { /* a client is attached */
+                               FD_SET(cr->client[n], &fds);
+                               if (cr->client[n] > max_fileno)
+                                       max_fileno = cr->client[n];
+                       } else { /* detached, drain the master ourselves */
+                               FD_SET(cr->master[n], &fds);
+                               if (cr->master[n] > max_fileno)
+                                       max_fileno = cr->master[n];
+                       }
+               }
+               if (select(max_fileno + 1, &fds, NULL, NULL, NULL) < 0) {
+                       if (errno != EINTR)
+                               ERROR_LOG("select: %m\n");
+                       continue;
+               }
+               do {
+                       if (!FD_ISSET(signal_pipe[0], &fds))
+                               break;
+                       sig = next_signal();
+                       if (sig == SIGCHLD) {
+                               int wstatus;
+                               pid_t ret = waitpid(pid, &wstatus, WNOHANG);
+                               if (ret < 0) {
+                                       WARNING_LOG("wait: %m\n");
+                                       break;
+                               }
+                               /*
+                                * With WNOHANG, zero means that the child
+                                * has not changed state and wstatus was not
+                                * written, so it must not be examined.
+                                */
+                               if (ret == 0)
+                                       break;
+                               cgroup_cleanup(c);
+                               if (!WIFEXITED(wstatus))
+                                       return 1;
+                               if (WEXITSTATUS(wstatus) == 2)
+                                       return 2;
+                               return WEXITSTATUS(wstatus) != EXIT_SUCCESS;
+                       }
+                       kill(pid, sig);
+               } while (0);
+               if (FD_ISSET(cr->socket_fd, &fds))
+                       dispatch_socket_request(cr);
+               for (n = 0; n < cr->num_ttys; n++) {
+                       if (cr->client[n] >= 0) {
+                               /*
+                                * The condition needs its own parentheses,
+                                * the expansion of FD_ISSET() is not
+                                * guaranteed to provide them.
+                                */
+                               if (FD_ISSET(cr->client[n], &fds))
+                                       dispatch_client(cr->client + n);
+                       } else { /* stdout is /dev/null in background mode */
+                               if (FD_ISSET(cr->master[n], &fds))
+                                       copy(cr->master[n], n == 0?
+                                               STDOUT_FILENO : -1);
+                       }
+               }
+               if (OPT_GIVEN(START, FOREGROUND)) {
+                       if (FD_ISSET(STDIN_FILENO, &fds))
+                               copy(STDIN_FILENO, cr->master[0]);
+               }
+       }
+}
+
+/*
+ * Set net namespace of child and call parent_loop().
+ *
+ * The PID of the init process is read from the first pipe. Then the peer
+ * device of each interface specification is moved to the child by calling
+ * set_netns(), and one byte is written to the second pipe.
+ *
+ * Like parent_loop(), whose return value is passed through, the function
+ * returns 0 for success, 1 for failure and 2 for a reboot request. Errors in
+ * the setup steps above result in a return value of 1. They must not return
+ * false, because that is zero and would be mistaken for success.
+ */
+static int run_parent(pid_t child_pid, const struct container *c,
+               struct container_runtime *cr)
+{
+       unsigned n;
+       bool success;
+
+       close(cr->pipe1[1]);
+       close(cr->pipe2[0]);
+       if (read(cr->pipe1[0], &cr->init_pid, 4) != 4) {
+               ERROR_LOG("pipe1 read error\n");
+               close(cr->pipe1[0]);
+               close(cr->pipe2[1]);
+               return 1;
+       }
+       INFO_LOG("received grand child pid: %u\n", (unsigned)cr->init_pid);
+       close(cr->pipe1[0]);
+       for (n = 0; n < c->num_ifspecs; n++) {
+               char *peer = interface_name(c, n, true);
+               success = set_netns(peer, child_pid);
+               free(peer);
+               if (!success) {
+                       ERROR_LOG("set_netns error\n");
+                       close(cr->pipe2[1]);
+                       return 1;
+               }
+       }
+       success = write(cr->pipe2[1], "\0", 1) == 1;
+       close(cr->pipe2[1]);
+       if (!success) {
+               ERROR_LOG("pipe2 write error\n");
+               return 1;
+       }
+       return parent_loop(child_pid, c, cr);
+}
+
+/*
+ * Look up the capabilities to drop for the container.
+ *
+ * The array of the container is used if it is set. Otherwise the default
+ * array is used if the default-capdrop option was given, and the built-in
+ * array below is the last resort. A pointer to the selected array is stored
+ * in *result and the number of elements is returned.
+ */
+static unsigned get_capdrops(const struct container *c, cap_value_t **result)
+{
+       static cap_value_t builtin_capdrop[] = {CAP_SYS_MODULE, CAP_SYS_TIME,
+               CAP_SYS_RESOURCE};
+       unsigned count;
+
+       if (c->capdrop) {
+               *result = c->capdrop;
+               count = c->num_capdrops;
+       } else if (OPT_GIVEN(MICOFORIA, DEFAULT_CAPDROP)) {
+               *result = default_capdrop;
+               count = num_default_capdrops;
+       } else {
+               *result = builtin_capdrop;
+               count = ARRAY_SIZE(builtin_capdrop);
+       }
+       return count;
+}
+
+/*
+ * Remove each capability which is returned by get_capdrops() from the
+ * capability bounding set of the calling process. The name of each
+ * capability is logged before the capability is dropped, and failures are
+ * reported through die_errno().
+ */
+static void drop_caps(const struct container *c)
+{
+       cap_value_t *cap, *end;
+       unsigned count;
+
+       INFO_LOG("lowering bounding set capabilities\n");
+       count = get_capdrops(c, &cap);
+       for (end = cap + count; cap < end; cap++) {
+               char *name = cap_to_name(*cap);
+
+               DEBUG_LOG("dropping %s\n", name);
+               cap_free(name);
+               if (cap_drop_bound(*cap) < 0)
+                       die_errno("cap_drop_bound");
+       }
+}
+
+/*
+ * The main loop of the child process, which never returns.
+ *
+ * The loop waits for signals to arrive on the signal pipe. When SIGCHLD is
+ * received and the process with the given PID has terminated, the console
+ * is shut down and the process exits. The exit code is 2 if the process was
+ * killed by SIGHUP, which is regarded as a reboot request, and EXIT_SUCCESS
+ * otherwise. For all other signals, SIGINT is passed on as SIGINT while any
+ * other signal results in SIGKILL being sent.
+ */
+__attribute ((noreturn))
+static void child_loop(pid_t pid, struct container_runtime *cr)
+{
+       INFO_LOG("parent: %u, child: %u, init: %u\n", (unsigned) getppid(),
+               (unsigned)getpid(), (unsigned)pid);
+       init_signal_handling();
+       setsid();
+
+       for (;;) {
+               int sig, wstatus;
+               pid_t ret;
+               fd_set fds;
+
+               FD_ZERO(&fds);
+               FD_SET(signal_pipe[0], &fds);
+               if (select(signal_pipe[0] + 1, &fds, NULL, NULL, NULL) < 0) {
+                       if (errno != EINTR)
+                               ERROR_LOG("select: %m\n");
+                       continue;
+               }
+               if (!FD_ISSET(signal_pipe[0], &fds))
+                       continue;
+               sig = next_signal();
+               if (sig != SIGCHLD) {
+                       NOTICE_LOG("sending signal %d to container init\n",
+                               sig);
+                       kill(pid, sig == SIGINT? SIGINT : SIGKILL);
+                       continue;
+               }
+               ret = waitpid(pid, &wstatus, WNOHANG);
+               if (ret < 0) {
+                       WARNING_LOG("wait: %m\n");
+                       continue;
+               }
+               /*
+                * With WNOHANG, zero means that the process has not changed
+                * state and wstatus was not written, so keep waiting.
+                */
+               if (ret == 0)
+                       continue;
+               shutdown_console(cr);
+               if (WIFSIGNALED(wstatus) && WTERMSIG(wstatus) == SIGHUP) {
+                       NOTICE_LOG("reboot requested\n");
+                       exit(2);
+               }
+               NOTICE_LOG("container terminated\n");
+               exit(EXIT_SUCCESS);
+       }
+}
+
+/*
+ * Return the path of the init executable of the given container: the
+ * per-container setting if there is one, the value of the DEFAULT_INIT option
+ * otherwise. The returned string is not a copy.
+ */
+static const char *get_init_path(const struct container *c)
+{
+       if (c->init)
+               return c->init;
+       return OPT_STRING_VAL(MICOFORIA, DEFAULT_INIT);
+}
+
+/*
+ * The child process unshares namespaces, spawns the init process which runs
+ * the pre-exec hook and executes the container init process. This function
+ * never returns, but both the child and the init process exit when the
+ * container terminates. The exit code of the child tells the parent whether
+ * it should restart the container.
+ *
+ * c is the configuration of the container to run. cr contains the runtime
+ * state which was set up by the parent: the unix socket, the pty pairs and
+ * the two pipes which synchronize parent and child.
+ */
+__attribute ((noreturn))
+static void run_child(const struct container *c, struct container_runtime *cr)
+{
+       unsigned n;
+       char *init, *put_old;
+       char ch;
+       pid_t pid;
+
+       /* close the file descriptors which are only used by the parent */
+       close(cr->socket_fd);
+       for (n = 0; n < cr->num_ttys; n++)
+               close(cr->master[n]);
+       close(cr->pipe1[0]);
+       close(cr->pipe2[1]);
+       if (unshare(CLONE_NEWNET) < 0)
+               die_errno("unshare net ns\n");
+       if (unshare(CLONE_NEWPID) < 0)
+               die_errno("unshare pid ns\n");
+       /* fork again to become pid 1 in the new pid namespace */
+       if ((pid = fork()) < 0)
+               die_errno("fork");
+       /*
+        * By writing to pipe1 we tell the parent (a) we've unshared the net
+        * namespace, and (b) the pid of the init process in the parent
+        * namespace.
+        */
+       if (pid > 0) {
+               close(cr->pipe2[0]);
+               if (write(cr->pipe1[1], (const char *)&pid, 4) != 4)
+                       die_errno("pipe write error");
+               close(cr->pipe1[1]);
+               child_loop(pid, cr); /* never returns */
+       }
+       /* from here on we are the init process of the container */
+       pid = getpid();
+       DEBUG_LOG("now running as pid %d\n", pid);
+       /* block until the parent has written its go-ahead byte to pipe2 */
+       if (read(cr->pipe2[0], &ch, 1) != 1)
+               die_errno("pipe read error");
+       close(cr->pipe1[1]);
+       close(cr->pipe2[0]);
+       if (unshare(CLONE_NEWNS | CLONE_NEWIPC | CLONE_NEWUTS) < 0)
+               die_errno("unshare");
+       mkdir(cr->dev, 0777); /* errors are ignored */
+       init_console(cr);
+       for (n = 0; n < cr->num_ttys; n++)
+               close(cr->slave[n]);
+       INFO_LOG("setting hostname to %s\n", c->name);
+       if (sethostname(c->name, strlen(c->name)) < 0)
+               die_errno("sethostname error");
+       if (chdir(cr->root) < 0)
+               die_errno("chdir %s", cr->root);
+       drop_caps(c);
+       apply_dacl(c);
+       apply_cpu_limit(c);
+       apply_memory_limit(c);
+       apply_io_limit(c);
+       /* inside the container the interfaces are called eth0, eth1, ... */
+       for (n = 0; n < c->num_ifspecs; n++) {
+               char *peer = interface_name(c, n, true);
+               char *renamed = msg("eth%u", n);
+               if (!rename_interface(peer, renamed))
+                       die("can not rename %s to %s\n", peer, renamed);
+               free(peer);
+               free(renamed);
+       }
+       run_pre_exec_hook(c);
+       setup_termios(STDIN_FILENO);
+       put_old = msg("%s/mnt", cr->root);
+       /* glibc does not provide a wrapper for pivot_root */
+       if (syscall(SYS_pivot_root, ".", put_old) < 0)
+               die_errno("pivot_root (put_old: %s)", put_old);
+       /* the old root is now visible at /mnt, get rid of it */
+       if (umount2("/mnt", MNT_DETACH) < 0)
+               die_errno("umount %s", put_old);
+       free(put_old);
+       close(STDIN_FILENO);
+       init = xstrdup(get_init_path(c));
+       INFO_LOG("handing over control to container init: %s\n", init);
+       execve(init, (char *[]){init, NULL}, NULL);
+       /*
+        * Report the path we actually tried to execute. c->init is NULL if
+        * the container relies on the default init path.
+        */
+       die_errno("failed to exec init process %s", init);
+}
+
+/*
+ * We need three processes, called parent, child, init, because we want one
+ * process run with namespaces unmodified, requiring one fork. After the child
+ * has unshared its PID namespace, it keeps its old PID, so we need to fork
+ * again to get pid 1. The child can not terminate because the parent can not
+ * wait(2) on its grandchild.
+ *
+ * This function runs in the parent. It sets up the control socket and the
+ * pty pairs, forks the child and restarts the container as long as
+ * run_parent() returns 2. The return value is false if the setup failed or
+ * if run_parent() returned a value other than zero or two.
+ */
+static bool exec_container(const struct container *c)
+{
+       bool success;
+       pid_t pid;
+       unsigned n;
+       struct container_runtime cr = {0};
+       char *socket_path;
+       int ret;
+
+       create_cgroup_v2(c);
+       socket_path = get_socket_path(c->name);
+       success = listen_on_unix_socket(socket_path, &cr.socket_fd);
+       if (!success)
+               ERROR_LOG("can not listen on unix socket %s\n", socket_path);
+       free(socket_path);
+       if (!success)
+               return false; /* was "return 1", which reported success */
+       cr.root = get_root_dir(c);
+       cr.dev = msg("%s/dev", cr.root);
+       cr.pts = realpath("/proc/self/fd/0", NULL);
+       DEBUG_LOG("pts: %s\n", cr.pts);
+       cr.num_ttys = get_container_ttys(c, &cr.tty);
+       cr.master = xmalloc(cr.num_ttys * sizeof(int));
+       cr.slave = xmalloc(cr.num_ttys * sizeof(int));
+       cr.client = xmalloc(cr.num_ttys * sizeof(int));
+       for (n = 0; n < cr.num_ttys; n++)
+               cr.client[n] = -1; /* no client attached */
+reboot:
+       NOTICE_LOG("starting %s\n", c->name);
+       for (n = 0; n < cr.num_ttys; n++) {
+               if (openpty(cr.master + n, cr.slave + n, NULL, NULL, NULL) < 0)
+                       die("openpty: %m");
+               DEBUG_LOG("pty (tty%u <-> %s)\n", n, ttyname(cr.slave[n]));
+       }
+       /* mount rw, ignore errors */
+       mount(NULL, cr.root, NULL, MS_REMOUNT, NULL);
+       if (!setup_network(c))
+               return false;
+       if (!run_pre_start_hook(c))
+               return false;
+       if (pipe(cr.pipe1) < 0) /* child -> parent */
+               die_errno("pipe1");
+       if (pipe(cr.pipe2) < 0)
+               die_errno("pipe2"); /* parent -> child */
+       if ((pid = fork()) < 0)
+               die_errno("fork");
+       if (pid == 0)
+               run_child(c, &cr); /* never returns */
+       ret = run_parent(pid, c, &cr);
+       if (ret != 2) /* anything but a reboot request ends the container */
+               return ret == 0;
+       NOTICE_LOG("rebooting\n");
+       /* fresh pty pairs are opened for the next incarnation */
+       for (n = 0; n < cr.num_ttys; n++) {
+               close(cr.master[n]);
+               close(cr.slave[n]);
+       }
+       goto reboot;
+}
+
+/*
+ * Compose the path of the log file of the named container: the value of the
+ * LOGDIR option, a slash, and the container name. Callers free the result.
+ */
+static char *get_container_logfile(const char *name)
+{
+       const char *logdir = OPT_STRING_VAL(MICOFORIA, LOGDIR);
+
+       return msg("%s/%s", logdir, name);
+}
+
+/*
+ * Start a single container.
+ *
+ * Returns false if the container is already locked or, in foreground mode, if
+ * stdin and stdout are not both terminals or the terminal attributes can not
+ * be read. In background mode the function forks: the calling process returns
+ * true right away while the new process daemonizes. The process which goes on
+ * to run the container never returns from this function. It exits with
+ * EXIT_SUCCESS or EXIT_FAILURE, depending on the result of exec_container().
+ */
+static bool start_container(const struct container *c)
+{
+       pid_t pid;
+       char *logfile;
+       struct termios tios;
+       bool success;
+
+       if (is_locked(c->name, &pid)) {
+               ERROR_LOG("%s is locked by pid %u\n", c->name, (unsigned)pid);
+               return false;
+       }
+       if (OPT_GIVEN(START, FOREGROUND)) {
+               if (!isatty(STDIN_FILENO) || !isatty(STDOUT_FILENO)) {
+                       ERROR_LOG("both stdin and stdout must be terminals\n");
+                       return false;
+               }
+               /* saved so that the settings can be restored at the end */
+               if (tcgetattr(STDIN_FILENO, &tios) < 0) {
+                       ERROR_LOG("tcgetattr: %m\n");
+                       return false;
+               }
+       } else {
+               if ((pid = fork()) < 0)
+                       die_errno("fork");
+               if (pid > 0)
+                       return true;
+               logfile = get_container_logfile(c->name);
+               daemonize(logfile);
+               free(logfile);
+       }
+       /* the lock is taken by the process which runs the container */
+       if (!try_lock(c->name, &pid))
+               die("%s is locked by pid %u", c->name, (unsigned)pid);
+       success = exec_container(c);
+       if (OPT_GIVEN(START, FOREGROUND)) {
+               if (tcsetattr(STDIN_FILENO, TCSAFLUSH, &tios) < 0)
+                       ERROR_LOG("tcsetattr: %m\n");
+       }
+       exit(success? EXIT_SUCCESS : EXIT_FAILURE);
+}
+
+/*
+ * Sanity checks for the arguments of the start subcommand. Dies if no
+ * container is configured, if foreground mode was requested for more than
+ * one container, or if a given name does not refer to a configured container.
+ */
+static void check_container_args(void)
+{
+       unsigned idx;
+       const unsigned num_args = lls_num_inputs(sublpr);
+
+       if (num_args == 0) { /* operate on all configured containers */
+               if (num_containers == 0)
+                       die("no container configured\n");
+               if (OPT_GIVEN(START, FOREGROUND) && num_containers > 1)
+                       die("must specify container for foreground mode");
+               return;
+       }
+       if (OPT_GIVEN(START, FOREGROUND) && num_args > 1)
+               die("can start only one container in foreground mode");
+       for (idx = 0; idx < num_args; idx++) {
+               const char *name = lls_input(idx, sublpr);
+
+               if (get_container(name) == NULL)
+                       die("container not configured: %s", name);
+       }
+}
+
+/* Iterator state for cai_next(). */
+struct container_arg_iter {
+       /*
+        * Index of the next item to look at: an index into the container
+        * array if no arguments were given, an argument index otherwise.
+        */
+       unsigned idx;
+};
+
+/* Initializer which lets the iteration start at the first item. */
+#define INITIALIZED_CAI(_cai) {.idx = 0}
+
+/*
+ * Return the next container of the iteration, or NULL if there are no more.
+ *
+ * If no arguments were given to the subcommand, the iteration covers all
+ * configured containers. Otherwise it covers the containers named by the
+ * arguments. Arguments which do not name a configured container are logged
+ * and skipped. If skipped is not NULL, *skipped is set to true if at least
+ * one argument was skipped during this call, and to false otherwise. Note
+ * that *skipped may be true even if NULL is returned.
+ */
+static struct container *cai_next(struct container_arg_iter *cai, bool *skipped)
+{
+       unsigned num_inputs = lls_num_inputs(sublpr);
+
+       if (skipped)
+               *skipped = false;
+       if (num_inputs == 0) {
+               if (cai->idx >= num_containers)
+                       return NULL;
+               return container[cai->idx++];
+       }
+       for (; cai->idx < num_inputs; cai->idx++) {
+               const char *name = lls_input(cai->idx, sublpr);
+               struct container *c = get_container(name);
+               if (!c) {
+                       ERROR_LOG("%s: not configured\n", name);
+                       if (skipped)
+                               *skipped = true;
+                       continue;
+               }
+               /* returning bypasses the loop increment, so advance here */
+               cai->idx++;
+               return c;
+       }
+       return NULL;
+}
+
+/*
+ * Call the given function for each container of the iteration defined by
+ * cai_next().
+ *
+ * All containers are visited even if a call fails. The return value is true
+ * only if every call of f returned true and no argument had to be skipped
+ * because it does not name a configured container.
+ */
+static bool for_each_container_arg(bool (*f)(const struct container *c))
+{
+       struct container *c;
+       bool success = true;
+       bool skipped;
+       struct container_arg_iter cai = INITIALIZED_CAI(cai);
+
+       while ((c = cai_next(&cai, &skipped)))
+               if (!f(c) || skipped)
+                       success = false;
+       /*
+        * The final call of cai_next() reports skipped arguments, too. This
+        * happens if the last given argument is invalid.
+        */
+       if (skipped)
+               success = false;
+       return success;
+}
+
+/*
+ * Handler of the start subcommand. Validates the arguments, initializes the
+ * cgroup setup, creates the log directory if it does not exist and starts
+ * each selected container. Returns what for_each_container_arg() returns.
+ */
+static bool com_start(void)
+{
+       const char *logdir = OPT_STRING_VAL(MICOFORIA, LOGDIR);
+
+       check_container_args();
+       if (logdir[0] == '\0')
+               die_empty_arg("logdir");
+       cgroup_init();
+       /* an already existing log directory is fine */
+       if (mkdir(logdir, 0777) < 0 && errno != EEXIST)
+               die_errno("mkdir %s", logdir);
+       return for_each_container_arg(start_container);
+}
+EXPORT_CMD_HANDLER(start);
+
+/*
+ * Send the given signal to the process which holds the lock of the container.
+ * Returns false if the container is not locked or if kill(2) failed.
+ */
+static bool send_signal_to_container(int signum, const struct container *c)
+{
+       pid_t owner;
+
+       if (!is_locked(c->name, &owner)) {
+               INFO_LOG("%s is not running\n", c->name);
+               return false;
+       }
+       DEBUG_LOG("sending signal %d to pid %u\n", signum, (unsigned)owner);
+       if (kill(owner, signum) < 0) {
+               ERROR_LOG("kill %s: %m\n", c->name);
+               return false;
+       }
+       return true;
+}
+
+/*
+ * Replace the environment of the calling process by a minimal one. All
+ * variables are removed. TERM is restored if it was set, and PATH, USER,
+ * LOGNAME and HOME are set to fixed values suitable for root.
+ */
+static void clean_env(void)
+{
+       static const char * const fixed[][2] = {
+               {"PATH", "/root/bin:/usr/local/sbin:/usr/local/bin"
+                       ":/sbin:/usr/sbin:/bin:/usr/bin"},
+               {"USER", "root"},
+               {"LOGNAME", "root"},
+               {"HOME", "/root"},
+       };
+       const char *saved_term = getenv("TERM");
+       unsigned i;
+
+       clearenv();
+       if (saved_term != NULL)
+               setenv("TERM", saved_term, 0);
+       for (i = 0; i < sizeof(fixed) / sizeof(fixed[0]); i++)
+               setenv(fixed[i][0], fixed[i][1], 0);
+}
+
+/*
+ * Send an "init_pid" request to the unix socket of the named container and
+ * store the answer in *result. On failure an error is logged and false is
+ * returned. *result is set to -1 before the request is sent.
+ */
+static bool request_init_pid(const char *name, int *result)
+{
+       char *path = get_socket_path(name);
+       bool ok;
+
+       *result = -1;
+       ok = request_int(path, "init_pid", result);
+       free(path);
+       if (ok)
+               return true;
+       ERROR_LOG("could not determine init pid of %s\n", name);
+       return false;
+}
+
+/*
+ * Ask a running container to shut down by executing "halt" inside of it.
+ *
+ * The command is run by a forked process through nsenter, targeting the init
+ * process of the container. The calling process does not wait for the forked
+ * process: it returns true as soon as the fork succeeded. If the container is
+ * not running, the return value is false only if containers were named on the
+ * command line.
+ */
+static bool shutdown_container(const struct container *c)
+{
+       pid_t pid;
+       char str[20];
+       char *argv[] = {"nsenter", "-w", "-a", "-r", "-t", str, "halt", NULL};
+
+       if (!is_locked(c->name, NULL)) {
+               if (lls_num_inputs(sublpr) == 0)
+                       return true;
+               ERROR_LOG("container not running: %s\n", c->name);
+               return false;
+       }
+       pid = fork();
+       if (pid < 0)
+               return false;
+       if (pid > 0)
+               return true;
+       /* forked process: look up the init pid and hand over to nsenter */
+       if (!request_init_pid(c->name, &pid))
+               _exit(EXIT_FAILURE);
+       sprintf(str, "%d", pid); /* str is the argument to -t */
+       clean_env();
+       execvp(argv[0], argv);
+       _exit(EXIT_FAILURE); /* only reached if execvp() failed */
+}
+
+/* A container is considered dead if nobody holds its lock. */
+static bool container_is_dead(const struct container *c)
+{
+       return !is_locked(c->name, NULL);
+}
+
+/*
+ * Poll until all selected containers are dead. The sleep interval starts at
+ * 32 milliseconds and doubles after each unsuccessful check. The function
+ * gives up and returns false once the interval has reached 20 seconds, or if
+ * nanosleep(2) fails.
+ */
+static bool wait_for_containers_to_die(void)
+{
+       unsigned delay_ms;
+
+       for (delay_ms = 32; delay_ms < 20000; delay_ms *= 2) {
+               struct timespec delay = {
+                       .tv_sec = delay_ms / 1000,
+                       .tv_nsec = (delay_ms % 1000) * 1000 * 1000
+               };
+
+               if (nanosleep(&delay, NULL) < 0)
+                       return false;
+               if (for_each_container_arg(container_is_dead))
+                       return true;
+       }
+       return false;
+}
+
+/*
+ * Handler of the stop subcommand. Initiates the shutdown of each selected
+ * container and, if the WAIT option was given, waits for them to terminate.
+ */
+static bool com_stop(void)
+{
+       if (!for_each_container_arg(shutdown_container))
+               return false;
+       return OPT_GIVEN(STOP, WAIT)? wait_for_containers_to_die() : true;
+}
+EXPORT_CMD_HANDLER(stop);
+
+/* Request a reboot by sending SIGINT to the lock holder of the container. */
+static bool reboot_container(const struct container *c)
+{
+       return send_signal_to_container(SIGINT, c);
+}
+
+/* Handler of the reboot subcommand: signal each selected container. */
+static bool com_reboot(void)
+{
+       return for_each_container_arg(reboot_container);
+}
+EXPORT_CMD_HANDLER(reboot);
+
+/*
+ * Send SIGUSR1 to the lock holder of the container.
+ * NOTE(review): presumably the receiver reacts by killing the container
+ * init process -- the receiving side is not part of this section, confirm.
+ */
+static bool kill_container(const struct container *c)
+{
+       return send_signal_to_container(SIGUSR1, c);
+}
+
+/*
+ * Handler of the kill subcommand. Signals each selected container and, if
+ * the WAIT option was given, waits for them to terminate.
+ */
+static bool com_kill(void)
+{
+       bool signalled = for_each_container_arg(kill_container);
+
+       if (signalled && OPT_GIVEN(KILL, WAIT))
+               return wait_for_containers_to_die();
+       return signalled;
+}
+EXPORT_CMD_HANDLER(kill);
+
+/*
+ * Print the effective configuration of a container to stdout: hooks, root
+ * directory, init path, network interfaces, device access entries, io specs,
+ * CPU and memory limits, the capabilities to drop and the ttys.
+ */
+static void list_container_verbose(const struct container *c)
+{
+       char *root;
+       unsigned n, N;
+       char **word_list;
+       cap_value_t *capdrop;
+       uint32_t *tty;
+       char cores_str[25] = "unlimited";
+       unsigned cores = get_cpu_cores(c);
+
+       printf("%s:\n", c->name);
+       printf("\tpre-start hook: %s\n", get_pre_start_hook(c));
+       printf("\tpre-exec hook: %s\n", get_pre_exec_hook(c));
+       root = get_root_dir(c);
+       printf("\troot dir: %s\n", root);
+       free(root);
+       printf("\tinit path: %s\n", get_init_path(c));
+       for (n = 0; n < c->num_ifspecs; n++) {
+               char pretty_hwaddr[18];
+               char *iface = interface_name(c, n, false);
+               pretty_print_hwaddr(c->ifspec[n].hwaddr, pretty_hwaddr);
+               printf("\tinterface #%u: %s (%s)\n", n, iface, pretty_hwaddr);
+               free(iface);
+       }
+       N = get_dacl(c, &word_list);
+       /* the first character of each entry selects allow or deny */
+       for (n = 0; n < N; n++)
+               printf("\tdac entry #%u: %s %s\n", n, word_list[n][0] == 'a'?
+                       "allow" : "deny", word_list[n] + 1);
+       N = get_iospecs(c, &word_list);
+       for (n = 0; n < N; n++)
+               printf("\tiospec #%u: %s\n", n, word_list[n]);
+       /* zero cores means that no limit is configured */
+       if (cores > 0)
+               sprintf(cores_str, "%u", cores);
+       printf("\tCPU core limit: %s\n", cores_str);
+       printf("\tmemory limit: %uG\n", get_memory_limit(c));
+       N = get_capdrops(c, &capdrop);
+       for (n = 0; n < N; n++) {
+               /* cap_to_name() allocates, release the name after use */
+               char *name = cap_to_name(capdrop[n]);
+               printf("\tcapdrop #%u: %s\n", n, name);
+               cap_free(name);
+       }
+       N = get_container_ttys(c, &tty);
+       for (n = 0; n < N; n++)
+               printf("\ttty #%u: %u\n", n, tty[n]);
+}
+
+/*
+ * Handler of the ls subcommand.
+ *
+ * Lists the selected containers. Containers which are not running are only
+ * listed if the ALL option was given. The output format depends on the
+ * VERBOSE, LONG and QUIET options. The return value is false if an argument
+ * does not name a configured container, or if a selected container is not
+ * running and ALL was not given.
+ */
+static bool com_ls(void)
+{
+       struct container *c;
+       bool skipped, success = true;
+       struct container_arg_iter cai = INITIALIZED_CAI(cai);
+
+       while ((c = cai_next(&cai, &skipped))) {
+               pid_t pid;
+               if (skipped)
+                       success = false;
+               if (!is_locked(c->name, &pid)) {
+                       if (!OPT_GIVEN(LS, ALL)) {
+                               success =false;
+                               continue;
+                       }
+                       pid = 0; /* marks a container which is not running */
+               }
+               if (OPT_GIVEN(LS, VERBOSE)) {
+                       list_container_verbose(c);
+                       continue;
+               }
+               if (OPT_GIVEN(LS, LONG)) {
+                       /* columns: pid, cpu cores, memory limit, name */
+                       if (pid > 0)
+                               printf("%u\t", (unsigned)pid);
+                       else
+                               printf("-\t");
+                       printf("%u\t", get_cpu_cores(c));
+                       printf("%uG\t", get_memory_limit(c));
+                       printf("%s\n", c->name);
+                       continue;
+               }
+               if (!OPT_GIVEN(LS, QUIET))
+                       printf("%s\n", c->name);
+       }
+       if (skipped) /* needed if the last given container arg is invalid */
+               success = false;
+       return success;
+}
+EXPORT_CMD_HANDLER(ls);
+
+/*
+ * Run pstree for a running container. With the ALL option of the ps
+ * subcommand the tree starts at the process which holds the container lock,
+ * otherwise at the init process of the container. If the container is not
+ * running, the return value is false only if containers were named on the
+ * command line.
+ */
+static bool list_container_processes(const struct container *c)
+{
+       int target;
+       char pid_str[20];
+       char *argv[] = {"pstree", "-anp", pid_str, NULL};
+
+       if (!is_locked(c->name, &target)) {
+               if (lls_num_inputs(sublpr) != 0) {
+                       ERROR_LOG("container \"%s\" is not running\n", c->name);
+                       return false;
+               }
+               return true;
+       }
+       if (!OPT_GIVEN(PS, ALL)) {
+               if (!request_init_pid(c->name, &target))
+                       return false;
+       }
+       sprintf(pid_str, "%d", target);
+       return xexec(argv, NULL);
+}
+
+/* Handler of the ps subcommand: show the processes of each container. */
+static bool com_ps(void)
+{
+       return for_each_container_arg(list_container_processes);
+}
+EXPORT_CMD_HANDLER(ps);
+
+/*
+ * Handler of the attach subcommand.
+ *
+ * Requests the master side of a container tty through the unix socket of the
+ * container and shuffles data between that file descriptor and the terminal
+ * of the caller. The session ends when the socket becomes readable, on read
+ * or write errors, or when the user types CTRL+a followed by q. CTRL+a
+ * followed by any other character passes that character on to the container.
+ * The terminal settings are restored before the function returns.
+ */
+static bool com_attach(void)
+{
+       char *errctx;
+       const char *arg;
+       pid_t pid;
+       char *socket_path;
+       int master, ret, socket_fd;
+       bool have_escape = false;
+       struct termios tios;
+       uint32_t minor = OPT_UINT32_VAL(ATTACH, TTY);
+       char *rq;
+
+       if (!isatty(STDIN_FILENO) || !isatty(STDOUT_FILENO)) {
+               ERROR_LOG("both stdin and stdout must be terminals\n");
+               return false;
+       }
+       /* saved so that the settings can be restored at the end */
+       if (tcgetattr(STDIN_FILENO, &tios) < 0)
+               die_errno("tcgetattr");
+       ret = lls_check_arg_count(sublpr, 1, 1, &errctx);
+       if (ret < 0)
+               die_lopsub(ret, &errctx);
+       arg = lls_input(0, sublpr);
+       if (!is_locked(arg, &pid)) {
+               ERROR_LOG("container not running: %s\n", arg);
+               return false;
+       }
+       socket_path = get_socket_path(arg);
+       if (OPT_GIVEN(ATTACH, FORCE))
+               rq = msg("force-attach %u", minor);
+       else
+               rq = msg("attach %u", minor);
+       socket_fd = request_fd(socket_path, rq, &master);
+       free(rq);
+       free(socket_path);
+       /* a negative descriptor must never be passed to FD_SET() */
+       if (socket_fd < 0) {
+               ERROR_LOG("could not attach to %s\n", arg);
+               return false;
+       }
+       INFO_LOG("Attached to /dev/tty%u of container %s\n", minor, arg);
+       NOTICE_LOG("Type CTRL+a q to quit\n");
+       setup_termios(STDIN_FILENO);
+       setup_termios(master);
+       for (;;) {
+               int max_fileno = 0;
+               fd_set fds;
+               FD_ZERO(&fds);
+               FD_SET(STDIN_FILENO, &fds);
+               if (STDIN_FILENO > max_fileno)
+                       max_fileno = STDIN_FILENO;
+               FD_SET(master, &fds);
+               if (master > max_fileno)
+                       max_fileno = master;
+               FD_SET(socket_fd, &fds);
+               if (socket_fd > max_fileno)
+                       max_fileno = socket_fd;
+               if (select(max_fileno + 1, &fds, NULL, NULL, NULL) < 0) {
+                       if (errno != EINTR)
+                               ERROR_LOG("select: %m\n");
+                       continue;
+               }
+               if (FD_ISSET(socket_fd, &fds))
+                       break;
+               if (FD_ISSET(STDIN_FILENO, &fds)) {
+                       char c;
+                       if (read(STDIN_FILENO, &c, 1) <= 0)
+                               break;
+                       if (have_escape) {
+                               /*
+                                * The escape only covers the character which
+                                * directly follows CTRL+a. Without the reset
+                                * any later 'q' would end the session.
+                                */
+                               have_escape = false;
+                               if (c == 'q')
+                                       break;
+                               if (write(master, &c, 1) != 1)
+                                       break;
+                       } else if (c == 1) /* CTRL+a */
+                               have_escape = true;
+                       else if (write(master, &c, 1) != 1)
+                               break;
+               }
+               if (FD_ISSET(master, &fds)) {
+                       if (!copy(master, STDOUT_FILENO))
+                               break;
+               }
+       }
+       if (tcsetattr(STDIN_FILENO, TCSAFLUSH, &tios) < 0)
+               ERROR_LOG("tcsetattr: %m\n");
+       printf("\n");
+       /*
+        * NOTE(review): failure is reported even after a regular detach --
+        * confirm that this is intended.
+        */
+       return false;
+}
+EXPORT_CMD_HANDLER(attach);
+
+/*
+ * Handler of the help subcommand. Without argument a summary of all
+ * subcommands is shown. Otherwise the argument is looked up in the suite and
+ * the short or, if the LONG option was given, the long help text of the
+ * matching subcommand is printed.
+ */
+static bool com_help(void)
+{
+       char *errctx, *text;
+       const struct lls_command *cmd;
+       int ret = lls_check_arg_count(sublpr, 0, 1, &errctx);
+
+       if (ret < 0)
+               die_lopsub(ret, &errctx);
+       if (lls_num_inputs(sublpr) == 0) {
+               show_subcommand_summary(OPT_GIVEN(HELP, LONG));
+               return true;
+       }
+       ret = lls_lookup_subcmd(lls_input(0, sublpr), micoforia_suite, &errctx);
+       if (ret < 0)
+               die_lopsub(ret, &errctx);
+       cmd = lls_cmd(ret, micoforia_suite);
+       text = OPT_GIVEN(HELP, LONG)? lls_long_help(cmd) : lls_short_help(cmd);
+       printf("%s\n", text);
+       free(text);
+       return true;
+}
+EXPORT_CMD_HANDLER(help);
+
+/*
+ * Handler of the configtest subcommand. The handler itself performs no
+ * checks, it only prints a confirmation and reports success.
+ * NOTE(review): presumably syntax errors are caught by the option parsing
+ * which runs before any handler is called -- confirm.
+ */
+static bool com_configtest(void)
+{
+       printf("Syntax Ok\n");
+       return true;
+}
+EXPORT_CMD_HANDLER(configtest);
+
+/*
+ * Handler of the edit subcommand. Opens the configuration file in the
+ * editor named by the EDITOR environment variable, or in vi if the variable
+ * is not set.
+ */
+static bool com_edit(void)
+{
+       bool ok;
+       char *argv[3];
+       char *editor = getenv("EDITOR"); /* must not be freed */
+       char *conf = get_config_file_path();
+
+       argv[0] = editor? editor : "vi";
+       argv[1] = conf;
+       argv[2] = NULL;
+       ok = xexec(argv, NULL);
+       free(conf);
+       return ok;
+}
+EXPORT_CMD_HANDLER(edit);
+
+/*
+ * Handler of the enter subcommand.
+ *
+ * Runs a command inside a running container by means of nsenter. The first
+ * argument names the container. Any further arguments make up the command
+ * to execute. If there are none, "login -f root" is executed instead.
+ */
+static bool com_enter(void)
+{
+       char str[20];
+       char **argv;
+       char *nsenter_args[] = {"nsenter", "-w", "-a", "-r", "-t"};
+       const unsigned nna = ARRAY_SIZE(nsenter_args); /* num nsenter args */
+       char *dflt_cmd[] = {"login", "-f", "root"};
+       unsigned n, N, ni = lls_num_inputs(sublpr);
+       unsigned nea = ni > 1? ni - 1 : ARRAY_SIZE(dflt_cmd); /* num extra args */
+       const char *arg;
+       bool success;
+       int ret, pid;
+       char *errctx;
+
+       ret = lls_check_arg_count(sublpr, 1, INT_MAX, &errctx);
+       if (ret < 0)
+               die_lopsub(ret, &errctx);
+       arg = lls_input(0, sublpr);
+       if (!is_locked(arg, &pid)) {
+               ERROR_LOG("container not running: %s\n", arg);
+               return false;
+       }
+       /* nsenter needs the pid of the container init process */
+       if (!request_init_pid(arg, &pid))
+               return false;
+       N = nna + nea + 2; /* +1 for arg to -t and +1 for terminating NULL */
+       argv = xmalloc(N * sizeof(char *));
+       /* layout: nsenter args, init pid, command, NULL */
+       for (n = 0; n < nna; n++)
+               argv[n] = nsenter_args[n];
+       sprintf(str, "%d", pid);
+       argv[nna] = str;
+       for (n = 0; n < nea; n++)
+               argv[nna + 1 + n] = ni > 1? (char *)lls_input(n + 1, sublpr)
+                       : dflt_cmd[n];
+       argv[N - 1] = NULL;
+       clean_env();
+       success = xexec(argv, NULL);
+       free(argv); /* only the array is ours, not the strings */
+       return success;
+}
+EXPORT_CMD_HANDLER(enter);
+
+/*
+ * Handler of the log subcommand. Shows the log file of the container which
+ * is named by the single argument. The file is shown with less if both stdin
+ * and stdout are terminals, and with cat otherwise.
+ */
+static bool com_log(void)
+{
+       char *errctx, *path;
+       char *argv[3] = {NULL, NULL, NULL};
+       bool ok;
+       int ret;
+
+       if (isatty(STDIN_FILENO) && isatty(STDOUT_FILENO))
+               argv[0] = "less";
+       else
+               argv[0] = "cat";
+       ret = lls_check_arg_count(sublpr, 1, 1, &errctx);
+       if (ret < 0)
+               die_lopsub(ret, &errctx);
+       path = get_container_logfile(lls_input(0, sublpr));
+       argv[1] = path;
+       ok = xexec(argv, NULL);
+       free(path);
+       return ok;
+}
+EXPORT_CMD_HANDLER(log);
+
+/*
+ * Parse the global options, look up the subcommand, parse the options of the
+ * subcommand and call its handler. The exit status is EXIT_SUCCESS if the
+ * handler returned true, and EXIT_FAILURE otherwise.
+ */
+int main(int argc, char *argv[])
+{
+       int ret;
+       char *errctx;
+       const struct micoforia_user_data *ud;
+       unsigned num_inputs;
+
+       valid_fd012();
+       parse_options(argc, argv, CMD_PTR(MICOFORIA), &lpr);
+       loglevel_arg_val = OPT_UINT32_VAL(MICOFORIA, LOGLEVEL);
+       check_options();
+       num_inputs = lls_num_inputs(lpr);
+       /* the first non-option argument is the name of the subcommand */
+       ret = lls_lookup_subcmd(argv[argc - num_inputs], micoforia_suite, &errctx);
+       if (ret < 0)
+               die_lopsub(ret, &errctx);
+       subcmd = lls_cmd(ret, micoforia_suite);
+       /* the subcommand and everything after it form a new argument vector */
+       parse_options(num_inputs, argv + argc - num_inputs, subcmd, &sublpr);
+       ud = lls_user_data(subcmd);
+       exit(ud->handler()? EXIT_SUCCESS : EXIT_FAILURE);
+}
diff --git a/micoforia.suite.m4 b/micoforia.suite.m4
new file mode 100644 (file)
index 0000000..697edbc
--- /dev/null
@@ -0,0 +1,754 @@
+# SPDX-License-Identifier: GPL-2.0-only
+[suite micoforia]
+       caption = Subcommands
+       mansect = 8
+       manual_title = System Manager's Manual
+[supercommand micoforia]
+       [description]
+               DESCRIPTION1()
+
+               DESCRIPTION2()
+
+               DESCRIPTION3()
+
+               In addition to global options which apply to all subcommands, each
+               subcommand has its own set of options. The usual "--" separator must
+               be used to separate global options from subcommand specific options.
+
+       [/description]
+       synopsis = [global-options...] [--] [<subcommand> [subcommand-options...]]
+       purpose = SLOGAN()
+
+       [option general-options-section]
+               summary = General options
+               flag ignored
+       [option help]
+               summary = print help and exit
+               short_opt = h
+       [option detailed-help]
+               summary = print help, including all details, and exit
+       [option version]
+               summary = print version and exit
+               short_opt = V
+       [option config-file]
+               short_opt = c
+               summary = use alternative config file (default: ~/.mismarc)
+               typestr = path
+               arg_info = required_arg
+               arg_type = string
+               [help]
+                       Options may be given at the command line or in the configuration
+                       file. As usual, if an option is given both at the command line and
+                       in the configuration file, the command line option takes precedence.
+
+                       The config file may contain global options as well as options for
+                       any subcommand, but subcommand specific options must be placed in a
+                       separate section. See the Examples section of the man page.
+               [/help]
+       [option loglevel]
+               summary = control amount of logging
+               short_opt = l
+               arg_info = required_arg
+               arg_type = string
+               typestr = severity
+               values = {
+                       LSGLL_DEBUG = "debug",
+                       LSGLL_INFO = "info",
+                       LSGLL_NOTICE = "notice",
+                       LSGLL_WARNING = "warning",
+                       LSGLL_ERROR = "error",
+                       LSGLL_CRIT = "crit",
+                       LSGLL_EMERG = "emerg"
+               }
+               default_val = warning
+               [help]
+                       Log only messages with severity greater than or equal to the given
+                       value. Possible values:
+
+                       debug: produces really noisy output.
+                       info: still noisy, but won't fill up the disk quickly.
+                       notice: indicates normal, but significant event.
+                       warning: unexpected events that can be handled.
+                       error: unhandled error condition.
+                       crit: system might be unreliable.
+                       emerg: last message before exit.
+               [/help]
+
+       [option general-options-section]
+               summary = Global Container Options
+               flag ignored
+               [help]
+                       The options in this section apply to all containers. Most of them
+                       have a per-container counterpart which can be specified to override
+                       the global default.
+               [/help]
+       [option default-root-prefix]
+               summary = path to the parent directory of the container root file systems
+               typestr = directory
+               arg_info = required_arg
+               arg_type = string
+               default_val = /var/lib/micoforia
+               [help]
+                       For containers which do not specify their own root directory the path
+                       to the container root is derived from the argument of this option by
+                       appending a slash and the container name.
+               [/help]
+       [option logdir]
+               summary = directory which contains the container log files
+               arg_info = required_arg
+               arg_type = string
+               typestr = directory
+               default_val = /var/log/micoforia
+               [help]
+                       The log messages of each container are written to a dedicated
+                       logfile. This option controls in which directory these files are
+                       written (start subcommand) or expected (log subcommand).
+
+                       Nothing is written to the logfile if the container is started in
+                       foreground mode.
+               [/help]
+       [option default-pre-start-hook]
+               summary = command to be executed before the container starts
+               typestr = command
+               arg_info = required_arg
+               arg_type = string
+               default_val = true
+               [help]
+                       This hook is run early during container startup. All veth device
+                       pairs have been created, but no namespace or cgroup operations have
+                       been performed at this point.
+
+                       If the root file system of the container must be prepared, this is the
+                       right place to perform this task. Unlike the pre exec hook described
+                       below, this hook is only called once.
+
+                       The following environment variables are set: MICOFORIA_CONTAINER_NAME,
+                       MICOFORIA_IFSPECS, MICOFORIA_ROOT_DIR.
+               [/help]
+       [option default-pre-exec-hook]
+               summary = command to be executed before /sbin/init is executed
+               typestr = command
+               arg_info = required_arg
+               arg_type = string
+               default_val = true
+               [help]
+                       This runs with all namespaces already unshared and cgroup settings
+                       applied but before the root directory is switched to the container
+                       root. The hostname has already been changed to the container name
+                       and the network interfaces have been renamed to eth0, eth1, etc.
+
+                       This is the right place to perform additional cgroup or namespace
+                       operations. When the container is rebooted, the pre-exec hook is called
+                       again, just before control is handed over to the new init process.
+
+                       Only MICOFORIA_ROOT_DIR is set in this hook.
+               [/help]
+       [option default-init]
+               summary = control the handover to the init process of the container
+               typestr = command
+               arg_info = required_arg
+               arg_type = string
+               default_val = /sbin/init
+               [help]
+                       This program is executed as the last step of the container startup
+                       procedure as pid 1. At this point the root directory of the process
+                       has already been changed, so the given argument refers to a path
+                       relative to the container root directory.
+               [/help]
+       [option default-bridge]
+               summary = ethernet bridge to use by default
+               typestr = bridge
+               flag multiple
+               arg_info = required_arg
+               arg_type = string
+               default_val = micoforia
+               [help]
+                       Applies to all containers which do not specify their own network
+                       interface(s) with --net. If this is given multiple times, containers
+                       will be equipped with multiple interfaces.
+               [/help]
+       [option default-cgroup-dac]
+               summary = specify which device nodes containers may access/create by default
+               typestr = dacspec
+               flag multiple
+               arg_info = required_arg
+               arg_type = string
+               [help]
+                       Applies to all containers which do not specify their own access
+                       control lists. May be given multiple times. Each device access control
+                       specifier must be of the form {allow|deny} <entry>, where <entry>
+                       is a suitable device access control string for the devices.allow or
+                       devices.deny file of the cgroup-v1 controller. Order matters.
+
+                       If this option is not given, and the corresponding per-container
+                       option is not given either, a reasonable default applies which allows
+                       access to the most common character devices (/dev/zero, /dev/null,
+                       /dev/urandom, etc.) but denies access to most other devices including
+                       all block devices.
+
+                       Example: allow c 1:5 rwm
+               [/help]
+       [option default-cpu-cores]
+               summary = Number of cores to use by default (zero means unlimited)
+               typestr = num
+               arg_info = required_arg
+               arg_type = uint32
+               default_val = 0
+               [help]
+                       The limit is enforced by the cpu cgroup-v2 controller.  Note that in
+                       contrast to the cpuset controller of cgroup-v1 this controller does not
+                       restrict the container to a set of admissible CPUs. Instead, it limits
+                       the number of CPU cycles per time unit for the processes in the cgroup.
+               [/help]
+       [option default-memory-limit]
+               summary = Memory usage throttle limit (zero means no limit)
+               typestr = gigabytes
+               arg_info = required_arg
+               arg_type = uint32
+               default_val = 0
+               [help]
+                       The value specified here is written to the cgroup-v2 memory.high
+                       control file of all containers which do not specify their own limit.
+               [/help]
+       [option default-io-max]
+               summary = I/O limit (zero means no limit)
+               flag multiple
+               typestr = iospec
+               arg_info = required_arg
+               arg_type = string
+               [help]
+                       The I/O specifier argument must be a valid string for the io.max file
+                       of the cgroup-v2 controller. For example, the string "1:5 rbps=1024"
+                       limits the read I/O rate for the /dev/zero device to 1K per second.
+               [/help]
+       [option default-capdrop]
+               summary = Capabilities to drop by default
+               typestr = capspec
+               flag multiple
+               arg_info = required_arg
+               arg_type = string
+               [help]
+                       The capability specifier argument is the text representation of a
+                       capability, like CAP_SYS_MODULE. All given capabilities will be dropped
+                       from the bounding set of the container init process, hence from
+                       all processes of the container. If this option is not given, and no
+                       per-container capabilities to drop are given either, CAP_SYS_MODULE,
+                       CAP_SYS_TIME, and CAP_SYS_RESOURCE are dropped.
+
+                       See capabilities(7) for the list of capabilities and their meaning.
+               [/help]
+       [option default-tty]
+               summary = Minor number of a tty device to capture by default
+               typestr = minor
+               flag multiple
+               arg_info = required_arg
+               arg_type = uint32
+               [help]
+                       Normally the container's init process starts at least one "getty"
+                       login session on a tty port /dev/ttyX, where X is the minor device
+                       ID. This option lets you capture these login sessions and forward them
+                       to another micoforia process executing the "attach" subcommand. For
+                       each time the option is given, the device with the given minor device
+                       number is captured.
+
+                       If this is not given, /dev/tty1 will be captured.
+               [/help]
+       [option general-options-section]
+               summary = Per-Container Options
+               flag ignored
+               [help]
+                       These override the global container options above. Most of them take
+                       a compound argument of the form <name:value>, where the first part
+                       is the name of the container to which the option should be applied.
+
+                       Unless noted otherwise, if both a global option and the corresponding
+                       per-container option are given, the per-container option takes
+                       precedence.
+               [/help]
+       [option container]
+               summary = name of the container
+               flag multiple
+               typestr = name
+               arg_info = required_arg
+               arg_type = string
+               [help]
+                       Used for the hostname, the name of the veth interfaces and the name of
+                       the cgroup directory. The name may only contain characters of the set
+                       [a-zA-Z0-9-] and the length must not exceed 32 characters.
+
+                       This does not need to be given if one of the compound options below
+                       is given instead.
+               [/help]
+       [option pre-start-hook]
+               summary = See --default-pre-start-hook
+               flag multiple
+               typestr = name:command
+               arg_info = required_arg
+               arg_type = string
+       [option pre-exec-hook]
+               summary = See --default-pre-exec-hook
+               flag multiple
+               typestr = name:command
+               arg_info = required_arg
+               arg_type = string
+       [option init]
+               summary = See --default-init
+               typestr = name:command
+               flag multiple
+               arg_info = required_arg
+               arg_type = string
+       [option net]
+               summary = Equip the container with a non-default network interface
+               flag multiple
+               typestr = name:ifspec
+               arg_info = required_arg
+               arg_type = string
+               [help]
+                       The interface specifier is of the form bridge[:hwaddr].  If no hardware
+                       address is given, a random address will be used.  See --default-bridge.
+
+                       Unlike the other compound options of this section, this option is
+                       cumulative in that multiple options with the same container name do
+                       not override each other but accumulate, resulting in a container with
+                       multiple network interfaces.
+               [/help]
+       [option root-directory]
+               summary = Path to the container root directory. See --default-root-prefix.
+               flag multiple
+               typestr = name:path
+               arg_info = required_arg
+               arg_type = string
+               [help]
+               [/help]
+       [option cgroup-dac]
+               summary = See --default-cgroup-dac
+               typestr = name:dacspec
+               flag multiple
+               arg_info = required_arg
+               arg_type = string
+       [option cpu-cores]
+               summary = See --default-cpu-cores
+               typestr = name:num
+               flag multiple
+               arg_info = required_arg
+               arg_type = string
+       [option memory-limit]
+               summary = See --default-memory-limit
+               typestr = name:gigabytes
+               flag multiple
+               arg_info = required_arg
+               arg_type = string
+       [option io-max]
+               summary = See --default-io-max
+               flag multiple
+               typestr = name:iospec
+               arg_info = required_arg
+               arg_type = string
+       [option capdrop]
+               summary = See --default-capdrop
+               flag multiple
+               typestr = name:capspec
+               arg_info = required_arg
+               arg_type = string
+       [option tty]
+               summary = See --default-tty
+               typestr = name:minor
+               flag multiple
+               arg_info = required_arg
+               arg_type = string
+
+[introduction]
+       micoforia supports the subcommands described below. If no subcommand
+       is given, the list of available subcommands is shown and the program
+       terminates successfully without performing any further action.
+[/introduction]
+
+[subcommand start]
+       purpose = start one or more containers
+       non-opts-name = [<name>...]
+       [description]
+               If no container is given, all configured containers are started.
+       [/description]
+       [option foreground]
+               short_opt = F
+               summary = do not run as background daemon
+               [help]
+                       Normally, the process detaches from the console and continues to run
+                       in the background. When this option is given, only a single container
+                       can be started, and this container will run with its /dev/console
+                       device redirected to the local tty, making the container startup
+                       messages visible on the local tty.
+
+                       Moreover, stdin is forwarded to the first configured tty device
+                       (/dev/tty1 by default) of the container, and anything received from
+                       the other end of the forwarding is dumped to stdout. This allows for
+                       logins on the "local" console of the container, provided the container
+                       starts a getty process which listens on the tty device.
+               [/help]
+[subcommand stop]
+       purpose = shutdown one or more containers
+       non-opts-name = [<name>...]
+       [description]
+               This subcommand works by executing halt(8) in container context.
+               If no container is given, halt(8) is executed in all configured
+               container contexts.
+       [/description]
+       [option wait]
+               short_opt = w
+               summary = wait until all containers have terminated
+               [help]
+                       Without --wait the micoforia process which executes the stop
+                       subcommand exits after spawning one halt(8) process per container
+                       to be stopped. If --wait is given, the subcommand waits until all
+                       containers have terminated or the timeout expires. This is handy for
+                       system shutdown scripts which are supposed to terminate all running
+                       containers.
+               [/help]
+       [closing]
+               If --wait is not given, the subcommand exits successfully if and only
+               if all signals were sent successfully. With --wait the subcommand
+               exits successfully if, additionally, all signalled processes have
+               terminated before the timeout expires.
+       [/closing]
+
+[subcommand reboot]
+       purpose = reboot containers
+       non-opts-name = [<name>...]
+       [description]
+               Containers are rebooted and killed by sending a signal to a micoforia
+               process which executes the start subcommand.
+       [/description]
+[subcommand kill]
+       purpose = force containers to terminate
+       non-opts-name = [<name>...]
+       [description]
+               This works like the reboot subcommand, but a different signal is used
+               to notify the container.
+       [/description]
+       [option wait]
+               short_opt = w
+               summary = wait until all signalled containers have terminated
+               [help]
+                       Without --wait the micoforia process which executes the kill subcommand
+                       exits right after the underlying kill(2) system call returns. At this
+                       point the signalled process might still be alive although SIGKILL
+                       was sent. If --wait is given, the process waits until the signalled
+                       processes have terminated or the timeout expires.
+               [/help]
+[subcommand ls]
+       purpose = list containers
+       non-opts-name = [<name>...]
+       [description]
+               Several listing modes are available. By default, only the running
+               containers are listed. If no container name is given, all configured
+               containers are taken into account.
+
+       [/description]
+       [option all]
+               short_opt = a
+               summary = Also list containers which are not running
+       [option quiet]
+               short_opt = q
+               summary = Do not print any output
+               [help]
+                       For scripts to determine from the exit code whether all of the given
+                       containers are running.
+               [/help]
+       [option long]
+               short_opt = l
+               summary = Show also the pid, and the cpu and memory limits
+               [help]
+                       This overrides --quiet. That is, if both --quiet and --long are given,
+                       the long listing is shown.
+               [/help]
+       [option verbose]
+               short_opt = v
+               summary = Show all container settings, one setting per line
+               [help]
+                       This overrides --quiet and --long.
+               [/help]
+       [closing]
+               The subcommand exits successfully if and only if all given/configured
+               containers could be listed. Unless --all is given, it is considered
+               an error if a given container is not running. In particular, when ls
+               is executed with no arguments at all, it exits successfully if and
+               only if all configured containers are running.
+       [/closing]
+[subcommand ps]
+       purpose = print process list of one or more containers
+       non-opts-name = [<name>...]
+       [description]
+               This runs pstree(1). The container init process is always the third
+               process shown. Process IDs refer to the parent PID namespace, which
+               is why the process ID of the container init is not shown as 1.
+       [/description]
+       [option all]
+               short_opt = a
+               summary = also show the two micoforia processes
+[subcommand attach]
+       purpose = map the console of a running container to the local terminal.
+       non-opts-name = [<name>...]
+       [description]
+               It is an error if stdin is not associated with a terminal device.
+       [/description]
+       [option tty]
+               short_opt = t
+               summary = terminal to connect
+               arg_info = required_arg
+               arg_type = uint32
+               typestr = minor
+               default_val = 1
+               [help]
+                       This operation can only succeed if the given tty is forwarded by the
+                       container. See --tty and --default-tty.
+               [/help]
+       [option force]
+               short_opt = f
+               summary = don't fail but steal the tty if it is already attached
+[subcommand help]
+       purpose = list available subcommands or print subcommand-specific help
+       non-opts-name = [subcommand]
+       [description]
+               Without any arguments, help prints the list of available
+               subcommands. When called with a subcommand name argument, it prints
+               the help text of the given subcommand.
+       [/description]
+       [option long]
+               short_opt = l
+               summary = show the long help text
+               [help]
+                       If the optional argument is supplied, the long help text contains the
+                       synopsis, the purpose and the description of the specified subcommand,
+                       followed by the option list including summary and help text of each
+                       option. Without --long, the short help is shown instead. This omits
+                       the description of the subcommand and the option help.
+
+                       If no subcommand is supplied but --long is given, the list contains the
+                       purpose of each subcommand.
+               [/help]
+
+[subcommand configtest]
+       purpose = run a configuration file syntax test
+       [description]
+               This subcommand checks the command line options and the configuration
+               file for syntactic and semantic correctness. It either reports
+               "Syntax Ok" and exits successfully or prints information about the
+               first error and terminates with exit code 1.
+       [/description]
+
+[subcommand edit]
+       purpose = edit the configuration file
+       [description]
+               The editor to start is derived from the EDITOR environment variable.
+               If this variable is not set, vi is assumed.
+       [/description]
+
+[subcommand enter]
+       purpose = run a command in a container namespace
+       non-opts-name = <name> [<command> [arg...]]
+       [description]
+               This executes the nsenter(1) command to enter the namespaces of
+               the init process of the given container. If no command is given,
+               the login command is run to start a root shell.
+       [/description]
+
+[subcommand log]
+       purpose = show the log file for the given container
+       non-opts-name = <name>
+       [description]
+               This executes cat(1) or less(1), depending on whether or not stdin
+               and stdout are associated with a terminal device.
+       [/description]
+[section Notes]
+.SS The Cgroup File Systems
+       There are two implementations of Linux control groups called
+       .I cgroup-v1
+       and
+       .IR cgroup-v2 .
+       Both come with their own pseudo filesystem.
+       .B micoforia
+       requires both file systems to be mounted at
+       .IR /var/cgroup
+       and
+       .IR /var/cgroup2 .
+       Version 1 cgroups are only used to enforce device access control for
+       the containers, so the cgroup-v1 pseudo filesystem should be mounted
+       with only this controller enabled. See the Examples section below
+       for how to do this.  Future versions of
+       .B micoforia
+       might switch to the devices controller of cgroup-v2.
+.SS Container Names
+       The container name is used also for the name of the network device
+       and as a directory name if no explicit root directory is given with
+       --root-directory. Therefore container names must not exceed 32 characters,
+       which must all be alphanumeric or '-'. In particular, whitespace and
+       underscore ('_') are not permitted.
+
+[/section]
+[section Examples]
+       .IP \(bu 2
+       Create a bash alias named
+       .I m7a
+       for
+       .I micoforia
+       which activates debug messages and already includes the double dash
+       to separate global options from subcommand options:
+
+       .RS 6
+       .EX
+               .B alias m7a='micoforia --loglevel debug --'
+       .EE
+       .RE
+       .IP \(bu 2
+       Set up an ethernet bridge named
+       .IR micoforia ,
+       add the physical interface
+       .I eth1
+       to it and give the bridge interface an IP address:
+
+       .RS 6
+       .EX
+               .B brctl addbr micoforia
+               .B ip link set up micoforia
+               .B brctl addif micoforia eth1
+               .B ip a a 192.168.137.1/24 dev micoforia
+       .EE
+       .RE
+       .IP \(bu 2
+       Mount the two cgroup file systems, but only activate the
+       .I devices
+       controller of cgroup-v1:
+
+       .RS 6
+       .EX
+               .B mkdir -p /var/cgroup && mount -t cgroup -o devices cgroup /var/cgroup
+               .B mkdir -p /var/cgroup2 && mount -t cgroup2 cgroup2 /var/cgroup2
+       .EE
+       .RE
+       .IP \(bu 2
+       Entries for
+       .I /etc/fstab
+       to mount the cgroup file systems automatically at boot:
+
+       .RS 6
+       .EX
+               .B none /var/cgroup cgroup devices 0 0
+               .B none /var/cgroup2 cgroup2 defaults 0 0
+       .EE
+       .RE
+       .IP \(bu 2
+       Download a Debian10 root file system to
+       .IR /var/lib/micoforia/debian10 ,
+       set the root password and let micoforia set the hostname
+
+       .RS 6
+       .EX
+               .B debootstrap --variant=minbase buster /var/lib/micoforia/debian10 http://deb.debian.org/debian/
+               .B chroot /var/lib/micoforia/debian10 passwd
+               .B rm -f /var/lib/micoforia/debian10/etc/hostname
+       .EE
+       .RE
+       .IP \(bu 2
+       Download a minimal Ubuntu-18.04 root file system to
+       .IR /var/lib/micoforia/c1 ,
+       set the root password and configure the
+       .I eth0
+       interface, using a static IP address:
+
+       .RS 6
+       .EX
+               .B debootstrap --include openssh-server --include ifupdown bionic /var/lib/micoforia/c1 http://de.archive.ubuntu.com/ubuntu
+               .B chroot /var/lib/micoforia/c1 passwd
+               .B printf 'auto eth0\(rsniface eth0 inet static\(rsnaddress 192.168.137.2/24\(rsn' \
+                >> /var/lib/micoforia/c1/etc/network/interfaces
+               .B echo 'PermitRootLogin yes' >> /var/lib/micoforia/c1/etc/ssh/sshd_config
+       .EE
+       .RE
+       .IP \(bu 2
+       Start the container in foreground mode:
+
+       .RS 6
+       .EX
+               .B micoforia --container c1 -- start --foreground
+       .EE
+       .RE
+       .IP \(bu 2
+       Attach to
+       .I tty1
+       of the running container:
+
+       .RS 6
+       .EX
+               .B m7a attach c1
+       .EE
+       .RE
+       .IP \(bu 2
+       Ask the container to shut down, and wait for the shutdown procedure
+       to complete:
+
+       .RS 6
+       .EX
+               .B m7a stop --wait c1
+       .EE
+       .RE
+       .IP \(bu 2
+       Check whether the container is running:
+
+       .RS 6
+       .EX
+               .B m7a ls --quiet c1 && echo yes || echo no
+       .EE
+       .RE
+       .IP \(bu 2
+       A simple config file:
+
+       .RS 6
+       .EX
+               .B # two global options
+               .B loglevel info
+               .B container c1
+               .B # an option for the "attach" subcommand
+               .B [attach]
+               .B \ \ \ \ tty 2
+       .EE
+       .RE
+
+[/section]
+[section copyright]
+       Written by AUTHOR()
+       .br
+       Copyright (C) COPYRIGHT_YEAR() AUTHOR()
+       .br
+       License: LICENSE()
+       .br
+       This is free software: you are free to change and redistribute it.
+       .br
+       There is NO WARRANTY, to the extent permitted by law.
+       .P
+       Web page:
+       .UR URL()
+       .UE
+       .br
+       Git clone `URL':
+       .UR CLONE_URL()
+       .UE
+       .br
+       Gitweb:
+       .UR GITWEB_URL()
+       .UE
+       .br
+       Author's home page:
+       .UR HOME_URL()
+       .UE
+       .br
+       Report bugs to
+       .MT EMAIL()
+       AUTHOR()
+       .ME
+[/section]
+[section see also]
+       .BR lxc (7),
+       .BR brctl (8),
+       .BR ip (8),
+       .BR pstree (1)
+[/section]
diff --git a/micoforia.svg b/micoforia.svg
new file mode 100644 (file)
index 0000000..4c7a3f9
--- /dev/null
@@ -0,0 +1,26 @@
+<svg
+       xmlns="http://www.w3.org/2000/svg"
+       xmlns:xlink="http://www.w3.org/1999/xlink"
+       width="90"
+       height="70"
+>
+       <g stroke-width="3" stroke="black" fill="none">
+       <path d="
+               M 5 5
+               l 25 25
+               l 0 30
+               c 10 5 20 5 30 0
+               l 0 -30
+               l 25 -25
+               l -27 20
+               l 0 -10
+               c -8 -13 -16 -13 -24 0
+               l 0 10
+               z
+       "
+       />
+       </g>
+       <ellipse cx="46" cy="22" rx="3" ry="2" fill="none" stroke="black" />
+       <ellipse cx="40" cy="15" rx="2" ry="2" />
+       <ellipse cx="52" cy="15" rx="2" ry="2" />
+</svg>
diff --git a/util.c b/util.c
new file mode 100644 (file)
index 0000000..ebe5b1e
--- /dev/null
+++ b/util.c
@@ -0,0 +1,1142 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#include "m7a.h"
+
+#include <sys/ipc.h>
+#include <sys/sem.h>
+#include <fcntl.h>
+#include <ctype.h>
+#include <sys/mount.h>
+#include <dirent.h>
+#include <net/if.h>
+#include <linux/sockios.h>
+#include <libmnl/libmnl.h>
+#include <linux/if_link.h>
+#include <linux/rtnetlink.h>
+#include <sys/un.h>
+
+/*
+ * Log a printf-style message at emergency level and terminate the process
+ * with EXIT_FAILURE. This function does not return.
+ */
+void die(const char *fmt, ...)
+{
+       va_list ap;
+       char *text;
+       int len;
+
+       va_start(ap, fmt);
+       len = vasprintf(&text, fmt, ap);
+       va_end(ap);
+       if (len >= 0) {
+               m7a_log(LL_EMERG, "%s\n", text);
+       } else { /* could not even format the message, give up */
+               EMERG_LOG("OOM\n");
+       }
+       exit(EXIT_FAILURE);
+}
+
+/*
+ * Like die(), but append the description of the errno value that was current
+ * when the function was entered. This function does not return.
+ */
+void die_errno(const char *fmt, ...)
+{
+       const int err = errno; /* saved before any library call can change it */
+       va_list ap;
+       char *text;
+       int len;
+
+       va_start(ap, fmt);
+       len = vasprintf(&text, fmt, ap);
+       va_end(ap);
+       if (len >= 0) {
+               m7a_log(LL_EMERG, "%s: %s\n", text, strerror(err));
+       } else {
+               EMERG_LOG("OOM\n");
+       }
+       exit(EXIT_FAILURE);
+}
+
+/*
+ * realloc() wrapper which never returns NULL.
+ *
+ * size must be positive. If the allocation fails, the process is aborted.
+ */
+void *xrealloc(void *p, size_t size)
+{
+       void *q;
+
+       assert(size > 0);
+       /*
+        * The call to realloc() must not be part of the assert() expression:
+        * with NDEBUG defined the expression is not evaluated at all, so no
+        * memory would be allocated and the unmodified pointer returned.
+        */
+       q = realloc(p, size);
+       assert(q);
+       if (!q) /* fatal also when assertions are compiled out */
+               abort();
+       return q;
+}
+
+/* Allocate size bytes through xrealloc(), which also handles failure. */
+void *xmalloc(size_t size)
+{
+       void *fresh = xrealloc(NULL, size);
+
+       return fresh;
+}
+
+/* Like xmalloc(), but the returned buffer is filled with zeros. */
+void *xzmalloc(size_t size)
+{
+       /* memset() returns its first argument */
+       return memset(xrealloc(NULL, size), 0, size);
+}
+
+/*
+ * Duplicate a string. A NULL argument is treated like the empty string.
+ * The result is checked with assert() and must be freed by the caller.
+ */
+void *xstrdup(const char *s)
+{
+       const char *src = s;
+       char *copy;
+
+       if (!src)
+               src = "";
+       copy = strdup(src);
+       assert(copy);
+       return copy;
+}
+
+/*
+ * Return a freshly allocated string built from a printf-style format.
+ *
+ * The first attempt uses a 100 byte buffer. If the output does not fit, the
+ * buffer is resized to exactly the required size and the formatting is
+ * repeated. The caller must free the result.
+ */
+char *msg(const char *fmt, ...)
+{
+       size_t size = 100;
+       char *m = xmalloc(size);
+
+       for (;;) {
+               va_list ap;
+               int n;
+
+               /* Try to print in the allocated space. */
+               va_start(ap, fmt);
+               n = vsnprintf(m, size, fmt, ap);
+               va_end(ap);
+               /*
+                * A negative return value indicates an output error. It must
+                * be caught before the comparison below, which would convert
+                * it to a huge unsigned number and end up requesting a zero
+                * sized buffer.
+                */
+               if (n < 0)
+                       die_errno("vsnprintf");
+               /* If that worked, return the string. */
+               if ((size_t)n < size)
+                       return m;
+               /* Else try again with more space. */
+               size = (size_t)n + 1; /* precisely what is needed */
+               m = xrealloc(m, size);
+       }
+}
+
+/*
+ * Append b to the heap-allocated string a.
+ *
+ * If both are given, a new string is returned and a is freed. If a is NULL,
+ * a copy of b is returned. If only b is NULL, a is returned unchanged.
+ */
+char *xstrcat(char *a, const char *b)
+{
+       char *joined;
+
+       if (a && b) {
+               joined = msg("%s%s", a, b);
+               free(a);
+               return joined;
+       }
+       return a? a : xstrdup(b);
+}
+
+/* Abort via die() because the argument to option --opt is empty. */
+void die_empty_arg(const char *opt)
+{
+       const char *name = opt;
+
+       die("argument to --%s must not be empty", name);
+}
+
+/* Abort via die() because the argument to option --opt is out of range. */
+__attribute__ ((noreturn))
+static void die_range(const char *opt)
+{
+       const char *name = opt;
+
+       die("argument to --%s is out of range", name);
+}
+
+/* Abort unless min <= val <= max. opt names the option for the message. */
+void check_range(uint32_t val, uint32_t min, uint32_t max, const char *opt)
+{
+       if (val >= min && val <= max)
+               return;
+       die_range(opt);
+}
+
+/*
+ * Read from fd until end of file and store the data as a zero-terminated
+ * string in the buffer described by iov.
+ *
+ * Returns false on read errors, and also if the buffer got filled up before
+ * end of file was seen. EAGAIN and EINTR are retried.
+ */
+bool fd2buf(int fd, const struct iovec *iov)
+{
+       char *dst = iov->iov_base;
+       ssize_t got, total = 0, capacity;
+
+       assert(iov->iov_len > 1);
+       capacity = iov->iov_len - 1; /* reserve one byte for the terminator */
+       while (1) {
+               got = read(fd, dst + total, capacity - total);
+               if (got > 0) {
+                       total += got;
+                       if (total < capacity)
+                               continue;
+                       ERROR_LOG("cmd output truncated\n");
+                       return false;
+               }
+               if (got == 0) { /* end of file */
+                       dst[total] = '\0';
+                       DEBUG_LOG("read %zd bytes\n", total);
+                       return true;
+               }
+               if (errno == EAGAIN || errno == EINTR)
+                       continue;
+               ERROR_LOG("read error: %s\n", strerror(errno));
+               return false;
+       }
+}
+
+/*
+ * Run a command in a child process and wait for it to terminate.
+ *
+ * argv is passed unmodified to execvp(). If iov is not NULL, the standard
+ * output of the child is redirected to a pipe and the parent collects it into
+ * iov by calling fd2buf().
+ *
+ * Returns true only if the output (if requested) could be read completely and
+ * the child exited normally with status EXIT_SUCCESS. Failures of pipe(),
+ * fork() and waitpid() are fatal.
+ */
+bool xexec(char * const argv[], const struct iovec *iov)
+{
+       pid_t pid;
+       int pipefd[2] = {-1, -1};
+       unsigned n;
+
+       for (n = 0; argv[n]; n++)
+               DEBUG_LOG("argv[%u]=%s\n", n, argv[n]);
+       if (iov) {
+               if (pipe(pipefd) < 0)
+                       die_errno("pipe");
+       }
+       if ((pid = fork()) < 0)
+               die_errno("fork");
+       if (pid > 0) { /* parent */
+               int wstatus;
+               bool success = true;
+               if (iov) {
+                       /* close the write end so that fd2buf() can see EOF */
+                       close(pipefd[1]);
+                       success = fd2buf(pipefd[0], iov);
+                       close(pipefd[0]);
+               }
+               /* reap the child even if reading its output failed */
+               if (waitpid(pid, &wstatus, 0) < 0)
+                       die_errno("waitp");
+               if (!success)
+                       return false;
+               if (!WIFEXITED(wstatus))
+                       return false;
+               if (WEXITSTATUS(wstatus) != EXIT_SUCCESS)
+                       return false;
+               return true;
+       }
+       /* child: pipefd[] is still {-1, -1} if no output was requested */
+       if (pipefd[0] >= 0)
+               close(pipefd[0]);
+       if (pipefd[1] >= 0 && pipefd[1] != STDOUT_FILENO) {
+               if (dup2(pipefd[1], STDOUT_FILENO) < 0)
+                       die_errno("dup2()");
+               close(pipefd[1]);
+       }
+       execvp(argv[0], argv);
+       /* only reached if execvp() failed */
+       EMERG_LOG("execvp error: %s\n", strerror(errno));
+       _exit(EXIT_FAILURE);
+}
+
+/*
+ * Ensure that file descriptors 0, 1, and 2 are valid.
+ *
+ * /dev/null is opened repeatedly until a descriptor greater than two is
+ * returned. That last descriptor is closed again, the others stay open.
+ */
+void valid_fd012(void)
+{
+       int fd;
+
+       do {
+               fd = open("/dev/null", O_RDWR);
+               if (fd < 0)
+                       die_errno("open");
+       } while (fd <= 2);
+       close(fd);
+}
+
+/*
+ * Abort unless arg is a valid name: between 1 and 32 characters, each of
+ * which is an ASCII letter, a digit, or a dash.
+ */
+void check_name(const char *arg)
+{
+       const size_t len = strlen(arg);
+       const char *p;
+
+       if (len == 0)
+               die("empty name");
+       if (len > 32)
+               die("name too long: %s", arg);
+       for (p = arg; *p; p++) {
+               if (isascii(*p) && (isalnum(*p) || *p == '-'))
+                       continue;
+               die("invalid character '%c' in name %s", *p, arg);
+       }
+}
+
+/*
+ * Split an argument of the form "name:value" at the first colon.
+ *
+ * The name part is validated with check_name(). Both *name and *val are
+ * newly allocated strings that should be freed by the caller. An empty
+ * argument or a missing colon is fatal; opt is used for the error message.
+ */
+void parse_compound_arg(const char *arg, const char *opt, char **name, char **val)
+{
+       char *dup, *sep;
+
+       if (!*arg)
+               die_empty_arg(opt);
+       dup = xstrdup(arg);
+       sep = strchr(dup, ':');
+       if (!sep)
+               die("could not parse argument to --%s", opt);
+       *sep = '\0'; /* dup now holds only the name part */
+       check_name(dup);
+       *val = xstrdup(sep + 1);
+       *name = dup;
+}
+
+/*
+ * Translate "allow <spec>" to "a<spec>" and "deny <spec>" to "d<spec>".
+ * The result is allocated with msg(). Any other prefix is fatal.
+ */
+char *parse_cgroup_acl(const char *arg)
+{
+       static const char allow[] = "allow ", deny[] = "deny ";
+       const size_t allow_len = sizeof(allow) - 1, deny_len = sizeof(deny) - 1;
+
+       if (strncmp(arg, allow, allow_len) == 0)
+               return msg("a%s", arg + allow_len);
+       if (strncmp(arg, deny, deny_len) == 0)
+               return msg("d%s", arg + deny_len);
+       die("invalid cgroup access specifier: %s", arg);
+}
+
+/*
+ * Parse an interface specifier of the form "bridge[:xx:xx:xx:xx:xx:xx]".
+ *
+ * *bridge receives a newly allocated copy of the bridge name, which is
+ * validated with check_name(). The six bytes at hwaddr receive the hardware
+ * address, or all zeros if the specifier contains no colon. Malformed
+ * specifiers are fatal.
+ */
+void parse_ifspec(const char *arg, char **bridge, uint8_t *hwaddr)
+{
+       const char *colon = strchr(arg, ':');
+       size_t len;
+       unsigned n, x[6];
+       int consumed = -1;
+
+       if (colon) {
+               len = colon - arg;
+               *bridge = xmalloc(len + 1);
+               memcpy(*bridge, arg, len);
+               (*bridge)[len] = '\0';
+       } else
+               *bridge = xstrdup(arg);
+       check_name(*bridge);
+       if (!colon) {
+               memset(hwaddr, 0, 6);
+               return;
+       }
+       /*
+        * %n stores the number of characters consumed so far. It does not
+        * count as a conversion, so the return value is still six on success.
+        */
+       if (sscanf(colon + 1, "%02x:%02x:%02x:%02x:%02x:%02x%n",
+               x, x + 1, x + 2, x + 3, x + 4, x + 5, &consumed) != 6)
+               die("invalid hwaddress for ifspec %s", arg);
+       /*
+        * Check the character right behind the parsed address instead of
+        * a fixed offset: since sscanf() also accepts one-digit octets, the
+        * string may be shorter than 17 characters and a fixed offset would
+        * read beyond the end of the argument.
+        */
+       if (consumed < 0 || colon[1 + consumed] != '\0')
+               die("trailing garbage at the end of ifspec %s", arg);
+       for (n = 0; n < 6; n++)
+               hwaddr[n] = x[n];
+}
+
+/*
+ * Convert a decimal string to an unsigned 32 bit integer.
+ *
+ * All errors are fatal: values which do not fit into an uint32_t, strings
+ * without any digits, and trailing characters after the number. opt is the
+ * name of the option which is printed as part of the error message.
+ */
+uint32_t atou32(const char *str, const char *opt)
+{
+       long long num;
+       char *end;
+
+       errno = 0; /* strtoll() does not reset errno on success */
+       num = strtoll(str, &end, 10);
+       if (errno == ERANGE && (num == LLONG_MAX || num == LLONG_MIN))
+               die_range(opt);
+       if (num < 0 || num > (uint32_t)-1)
+               die_range(opt);
+       /*
+        * Either there were no digits, in which case strtoll() stores str in
+        * end, or the implementation has set errno and returned zero because
+        * no conversion was performed.
+        */
+       if (end == str || (errno != 0 && num == 0))
+               die_empty_arg(opt);
+       if (*end) /* Further characters after number */
+               die("--%s: trailing characters after number", opt);
+       return num;
+}
+
+/*
+ * Remove all subdirectories below path, depth first.
+ *
+ * Only directories are removed, and path itself is kept. Entries which can
+ * not be examined with fstatat() are skipped with a warning. The function
+ * stops at the first directory which can not be removed and returns false in
+ * this case. The result of the recursive call is not checked, but if it
+ * left anything behind, the subsequent rmdir() fails.
+ */
+bool remove_subdirs_recursively(const char *path)
+{
+       DIR *d = opendir(path);
+       struct dirent *entry;
+       int dfd;
+       struct stat stat;
+       bool success = true;
+
+       if (!d) {
+               ERROR_LOG("opendir %s: %m\n", path);
+               return false;
+       }
+       dfd = dirfd(d);
+       assert(dfd >= 0);
+       while (success && (entry = readdir(d))) {
+               char *subpath;
+               if (!strcmp(entry->d_name, "."))
+                       continue;
+               if (!strcmp(entry->d_name, ".."))
+                       continue;
+               if (fstatat(dfd, entry->d_name, &stat, 0) == -1) {
+                       WARNING_LOG("%s/%s: %m\n", path, entry->d_name);
+                       continue;
+               }
+               if (!S_ISDIR(stat.st_mode))
+                       continue;
+               subpath = msg("%s/%s", path, entry->d_name);
+               remove_subdirs_recursively(subpath);
+               DEBUG_LOG("removing %s\n", subpath);
+               if (rmdir(subpath) < 0) {
+                       ERROR_LOG("rmdir %s: %m\n", subpath);
+                       success = false; /* leave the loop, but clean up */
+               }
+               free(subpath);
+       }
+       /* Release the directory handle on the failure path as well. */
+       closedir(d);
+       return success;
+}
+
+/*
+ * Detach from the calling process and run in the background.
+ *
+ * The parent exits successfully right after the fork. The child becomes
+ * a session leader, redirects stdin to /dev/null and both stdout and stderr
+ * to the log file (or to /dev/null if logfile is NULL), and changes its
+ * working directory to the root directory. All errors are fatal.
+ */
+void daemonize(const char *logfile)
+{
+       pid_t child;
+       int devnull, out;
+
+       child = fork();
+       if (child < 0)
+               die_errno("fork");
+       if (child > 0) /* parent exits */
+               exit(EXIT_SUCCESS);
+       valid_fd012();
+       /* become session leader */
+       if (setsid() < 0)
+               die_errno("setsid");
+       devnull = open("/dev/null", O_RDWR);
+       if (devnull < 0)
+               die_errno("open /dev/null");
+       if (!logfile)
+               logfile = "/dev/null";
+       out = open(logfile, O_WRONLY | O_APPEND | O_CREAT, 0666);
+       if (out < 0)
+               die_errno("open %s", logfile);
+       NOTICE_LOG("subsequent log messages go to %s\n", logfile);
+       if (dup2(devnull, STDIN_FILENO) < 0)
+               die_errno("dup2");
+       close(devnull);
+       if (dup2(out, STDOUT_FILENO) < 0)
+               die_errno("dup2");
+       if (dup2(out, STDERR_FILENO) < 0)
+               die_errno("dup2");
+       close(out);
+       if (chdir("/") < 0)
+               die_errno("chdir");
+}
+
+/*
+ * Map a string to a non-negative integer.
+ *
+ * Four rounds are performed, each of which contributes one byte to the
+ * result. The rounds differ only in the offset which is added to the first
+ * character. The final right shift clears the most significant bit so that
+ * the return value is never negative.
+ */
+static int super_dull_hash(const char *input)
+{
+       const uint8_t *x = (typeof(x))input;
+       const unsigned p1 = 16777619, p2 = 2971215073u;
+       unsigned n, m, h, result = 0;
+
+       for (n = 0; n < 4; n++) {
+               h = p1 * (x[0] + n);
+               /*
+                * The inner loop starts at the second character. For the
+                * empty string there is no second character, so x[1] must
+                * not be looked at.
+                */
+               for (m = 1; x[0] != 0 && x[m] != 0; m++)
+                       h = p2 * (h ^ x[m]);
+               result = (result << 8) | (h % 256);
+       }
+       return result >> 1;
+}
+
+/**
+ * We use a semaphore set with two semaphores. The first semaphore is modified
+ * in all locking related functions while the second semaphore is modified only
+ * in try_lock() and acquire_lock(). This allows us to obtain the PID of the
+ * lock holder by querying the PID that last performed an operation on the
+ * second semaphore. This is achieved by passing GETPID as the control
+ * operation to semctl().
+ */
+
+/*
+ * Common implementation of try_lock() and acquire_lock().
+ *
+ * The semaphore key is derived from string with super_dull_hash(). The
+ * semaphore set is created if it does not exist yet. If pid is not NULL, it
+ * receives the value which semctl(GETPID) reports for the second semaphore
+ * before the lock is attempted. If wait is false, IPC_NOWAIT is set for all
+ * operations.
+ *
+ * Returns true if the semop() call succeeded.
+ */
+static bool get_lock(const char *string, pid_t *pid, bool wait)
+{
+       int semid, ret;
+       struct sembuf sops[4];
+       key_t key = super_dull_hash(string);
+       bool success;
+       short sem_flg = SEM_UNDO;
+
+       if (!wait)
+               sem_flg |= IPC_NOWAIT;
+       ret = semget(key, 2, IPC_CREAT | 0600);
+       if (ret < 0) {
+               ERROR_LOG("semget: %m\n");
+               return false;
+       }
+       semid = ret;
+       DEBUG_LOG("key: 0x%0x, semid: %d\n", (unsigned)key, semid);
+       ret = semctl(semid, 1, GETPID);
+       if (ret < 0)
+               return false;
+       if (pid)
+               *pid = ret;
+       /* first semaphore: wait for zero, then increment */
+       sops[0].sem_num = 0;
+       sops[0].sem_op = 0;
+       sops[0].sem_flg = sem_flg;
+
+       sops[1].sem_num = 0;
+       sops[1].sem_op = 1;
+       sops[1].sem_flg = sem_flg;
+
+       /* second semaphore: same pair of operations */
+       sops[2].sem_num = 1;
+       sops[2].sem_op = 0;
+       sops[2].sem_flg = sem_flg;
+
+       sops[3].sem_num = 1;
+       sops[3].sem_op = 1;
+       sops[3].sem_flg = sem_flg;
+
+       /* all four operations are submitted in a single semop() call */
+       success = semop(semid, sops, 4) >= 0;
+       if (!success)
+               INFO_LOG("semop: %m\n");
+       return success;
+}
+
+/*
+ * Attempt to get the lock identified by string without blocking. See
+ * get_lock() for the meaning of pid and the return value.
+ */
+bool try_lock(const char *string, pid_t *pid)
+{
+       const bool wait = false;
+
+       return get_lock(string, pid, wait);
+}
+
+/*
+ * Get the lock identified by string through get_lock(), with waiting enabled
+ * and without asking for the PID.
+ */
+bool acquire_lock(const char *string)
+{
+       const bool wait = true;
+       pid_t *no_pid = NULL;
+
+       return get_lock(string, no_pid, wait);
+}
+
+/*
+ * Release the lock identified by string by decrementing both semaphores of
+ * the set. Returns true if the semop() call succeeded.
+ */
+bool release_lock(const char *string)
+{
+       const key_t key = super_dull_hash(string);
+       struct sembuf sops[2] = {
+               {.sem_num = 0, .sem_op = -1, .sem_flg = SEM_UNDO},
+               {.sem_num = 1, .sem_op = -1, .sem_flg = SEM_UNDO},
+       };
+       int semid;
+
+       semid = semget(key, 2, IPC_CREAT | 0600);
+       if (semid < 0) {
+               ERROR_LOG("semget: %m\n");
+               return false;
+       }
+       DEBUG_LOG("key: 0x%0x, semid: %d\n", (unsigned)key, semid);
+       if (semop(semid, sops, 2) >= 0)
+               return true;
+       INFO_LOG("semop: %m\n");
+       return false;
+}
+
+/*
+ * Check whether the lock identified by string is currently held.
+ *
+ * Returns true if the semaphore set exists and a non-blocking wait-for-zero
+ * operation on the first semaphore fails. In this case *pid, if pid is not
+ * NULL, is set to the value of semctl(GETPID) for the second semaphore. In
+ * all other cases *pid is set to zero.
+ */
+bool is_locked(const char *string, pid_t *pid)
+{
+       const key_t key = super_dull_hash(string);
+       struct sembuf probe = {
+               .sem_num = 0,
+               .sem_op = 0,
+               .sem_flg = SEM_UNDO | IPC_NOWAIT
+       };
+       int semid, holder;
+
+       if (pid)
+               *pid = 0;
+       semid = semget(key, 2, 0);
+       if (semid < 0) /* no such semaphore set */
+               return false;
+       DEBUG_LOG("key: 0x%0x, semid: %d\n", (unsigned)key, semid);
+       if (semop(semid, &probe, 1) >= 0)
+               return false;
+       holder = semctl(semid, 1, GETPID);
+       if (holder < 0)
+               return false;
+       if (pid)
+               *pid = holder;
+       return true;
+}
+
+/*
+ * Add the network interface iface to the given bridge by means of the
+ * SIOCBRADDIF ioctl. Returns false if the interface index can not be
+ * determined, if no socket could be created, or if the ioctl fails.
+ */
+bool attach_to_bridge(const char *iface, const char *bridge)
+{
+       struct ifreq ifr;
+       int sock, idx;
+       bool success;
+
+       INFO_LOG("adding interface %s to bridge %s\n", iface, bridge);
+       idx = if_nametoindex(iface);
+       if (idx == 0) {
+               ERROR_LOG("no index for %s\n", iface);
+               return false;
+       }
+       sock = socket(AF_INET, SOCK_STREAM, 0);
+       if (sock < 0) {
+               ERROR_LOG("socket: %m\n");
+               return false;
+       }
+       /* the request names the bridge and carries the interface index */
+       strncpy(ifr.ifr_name, bridge, IFNAMSIZ - 1);
+       ifr.ifr_name[IFNAMSIZ - 1] = '\0';
+       ifr.ifr_ifindex = idx;
+       success = ioctl(sock, SIOCBRADDIF, &ifr) == 0;
+       if (!success)
+               ERROR_LOG("interface %s, bridge %s: ioctl SIOCBRADDIF: %m\n",
+                       iface, bridge);
+       close(sock);
+       return success;
+}
+
+
+/*
+ * Pointer to the first byte behind the netlink message, i.e., the location
+ * where the next attribute is going to be stored. Relies on arithmetic on
+ * void pointers.
+ */
+#define NLMSG_TAIL(nmsg) \
+       ((struct rtattr *) (((void *) (nmsg)) + NLMSG_ALIGN((nmsg)->nlmsg_len)))
+
+/*
+ * Append an attribute of the given type with alen bytes of payload to the
+ * netlink message and update the message length accordingly. No check is
+ * performed whether the buffer is large enough.
+ */
+static void addattr_l(struct nlmsghdr *nlh, int type, const void *data,
+               int alen)
+{
+       struct rtattr *attr = NLMSG_TAIL(nlh);
+       const int total = RTA_LENGTH(alen);
+
+       attr->rta_type = type;
+       attr->rta_len = total;
+       if (alen > 0)
+               memcpy(RTA_DATA(attr), data, alen);
+       nlh->nlmsg_len = NLMSG_ALIGN(nlh->nlmsg_len) + RTA_ALIGN(total);
+}
+
+/*
+ * Open a nested attribute: append an attribute without payload and return
+ * a pointer to it. The caller is expected to pass this pointer to end_nest()
+ * after the nested attributes have been added.
+ */
+static struct rtattr *addattr_nest(struct nlmsghdr *n, int type)
+{
+       struct rtattr *start;
+
+       start = NLMSG_TAIL(n); /* must be taken before the message grows */
+       addattr_l(n, type, NULL, 0);
+       return start;
+}
+
+/*
+ * Close a nested attribute: set its length to the distance between its start
+ * and the current end of the message.
+ */
+static void end_nest(struct nlmsghdr *nlh, struct rtattr *attr)
+{
+       void *tail = NLMSG_TAIL(nlh), *head = attr;
+
+       attr->rta_len = tail - head;
+}
+
+/*
+ * Open a NETLINK_ROUTE socket and bind it. Returns NULL on errors, in which
+ * case no socket is left open.
+ */
+static struct mnl_socket *get_and_bind_netlink_socket(void)
+{
+       struct mnl_socket *sock;
+
+       sock = mnl_socket_open(NETLINK_ROUTE);
+       if (sock == NULL) {
+               ERROR_LOG("mnl_socket_open error\n");
+               return NULL;
+       }
+       if (mnl_socket_bind(sock, 0, MNL_SOCKET_AUTOPID) >= 0)
+               return sock;
+       ERROR_LOG("mnl_socket_bind\n");
+       mnl_socket_close(sock);
+       return NULL;
+}
+
+/*
+ * Initialize a netlink message header at the beginning of buf. The message
+ * is flagged as a request and the current time is used as sequence number.
+ */
+static struct nlmsghdr *prepare_netlink_msg_header(char *buf)
+{
+       struct nlmsghdr *hdr;
+
+       hdr = mnl_nlmsg_put_header(buf);
+       hdr->nlmsg_seq = time(NULL);
+       hdr->nlmsg_flags = NLM_F_REQUEST;
+       return hdr;
+}
+
+/*
+ * Send an RTM_NEWLINK request which sets the name of the interface currently
+ * called before to after. Returns true if the request could be sent. No
+ * reply is read from the socket.
+ */
+bool rename_interface(const char *before, const char *after)
+{
+       char buf[MNL_SOCKET_BUFFER_SIZE];
+       struct mnl_socket *nl;
+       struct nlmsghdr *nlh;
+       struct ifinfomsg *ifm;
+       bool success;
+       int idx;
+
+       INFO_LOG("%s -> %s\n", before, after);
+       idx = if_nametoindex(before);
+       if (idx == 0) {
+               ERROR_LOG("no index for %s\n", before);
+               return false;
+       }
+       nl = get_and_bind_netlink_socket();
+       if (!nl)
+               return false;
+
+       nlh = prepare_netlink_msg_header(buf);
+       nlh->nlmsg_type = RTM_NEWLINK;
+
+       /* the interface is identified by index, the attribute is the new name */
+       ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
+       ifm->ifi_family = AF_UNSPEC;
+       ifm->ifi_index = idx;
+       addattr_l(nlh, IFLA_IFNAME, after, strlen(after) + 1);
+       success = mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) >= 0;
+       if (!success)
+               ERROR_LOG("mnl_socket_sendto failed\n");
+       mnl_socket_close(nl);
+       return success;
+}
+
+/*
+ * Write the six bytes at hwaddr in the usual colon separated lower case hex
+ * notation to result, which must provide room for at least 18 bytes.
+ */
+void pretty_print_hwaddr(const uint8_t *hwaddr, char *result)
+{
+       const uint8_t *b = hwaddr;
+
+       sprintf(result, "%02x:%02x:%02x:%02x:%02x:%02x",
+               b[0], b[1], b[2], b[3], b[4], b[5]);
+}
+
+/*
+ * Send an RTM_NEWLINK request which sets the hardware address of iface.
+ *
+ * An address of all zeros means that no address was specified. In this case
+ * nothing is sent and the function returns true. Otherwise the return value
+ * tells whether the request could be sent.
+ */
+bool set_hwaddr(const char *iface, const uint8_t *hwaddr)
+{
+       static const uint8_t unset[6];
+       char buf[MNL_SOCKET_BUFFER_SIZE], pretty_hwaddr[18];
+       struct mnl_socket *nl;
+       struct nlmsghdr *nlh;
+       struct ifinfomsg *ifm;
+       bool success;
+
+       if (memcmp(hwaddr, unset, 6) == 0)
+               return true; /* no hwaddr specified, nothing to do */
+       pretty_print_hwaddr(hwaddr, pretty_hwaddr);
+       INFO_LOG("hardware address of %s: %s\n", iface, pretty_hwaddr);
+       nl = get_and_bind_netlink_socket();
+       if (!nl)
+               return false;
+
+       nlh = prepare_netlink_msg_header(buf);
+       nlh->nlmsg_type = RTM_NEWLINK;
+
+       ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
+       ifm->ifi_family = AF_UNSPEC;
+       addattr_l(nlh, IFLA_ADDRESS, hwaddr, 6);
+       addattr_l(nlh, IFLA_IFNAME, iface, strlen(iface) + 1);
+       success = mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) >= 0;
+       if (!success)
+               ERROR_LOG("%s: mnl_socket_sendto failed\n", iface);
+       mnl_socket_close(nl);
+       return success;
+}
+
+/*
+ * Send an RTM_DELLINK request for the interface named iface. Returns true
+ * if the request could be sent.
+ */
+bool link_del(const char *iface)
+{
+       char buf[MNL_SOCKET_BUFFER_SIZE];
+       struct mnl_socket *nl;
+       struct nlmsghdr *nlh;
+       struct ifinfomsg *ifm;
+       bool success;
+
+       INFO_LOG("removing interface %s\n", iface);
+       nl = get_and_bind_netlink_socket();
+       if (!nl)
+               return false;
+
+       nlh = prepare_netlink_msg_header(buf);
+       nlh->nlmsg_type = RTM_DELLINK;
+
+       ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
+       ifm->ifi_family = AF_UNSPEC;
+       ifm->ifi_flags = IFF_UP;
+       ifm->ifi_change = IFF_UP;
+       addattr_l(nlh, IFLA_IFNAME, iface, strlen(iface) + 1);
+       success = mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) >= 0;
+       if (!success)
+               ERROR_LOG("%s: mnl_socket_sendto failed\n", iface);
+       mnl_socket_close(nl);
+       return success;
+}
+
+/*
+ * Send an RTM_NEWLINK request which sets the IFF_UP flag of the interface
+ * named iface. Returns true if the request could be sent.
+ */
+bool link_up(const char *iface)
+{
+       char buf[MNL_SOCKET_BUFFER_SIZE];
+       struct mnl_socket *nl;
+       struct nlmsghdr *nlh;
+       struct ifinfomsg *ifm;
+       bool success;
+
+       INFO_LOG("activating interface %s\n", iface);
+       nl = get_and_bind_netlink_socket();
+       if (!nl)
+               return false;
+       nlh = prepare_netlink_msg_header(buf);
+       nlh->nlmsg_type = RTM_NEWLINK;
+
+       /* ifi_change selects the flag bits to modify, ifi_flags their value */
+       ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
+       ifm->ifi_family = AF_UNSPEC;
+       ifm->ifi_flags = IFF_UP;
+       ifm->ifi_change = IFF_UP;
+       addattr_l(nlh, IFLA_IFNAME, iface, strlen(iface) + 1);
+       success = mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) >= 0;
+       if (!success)
+               ERROR_LOG("%s: mnl_socket_sendto failed\n", iface);
+       mnl_socket_close(nl);
+       return success;
+}
+
+#ifndef VETH_INFO_PEER
+#define VETH_INFO_PEER 1
+#endif
+
+/*
+ * Send an RTM_NEWLINK request which creates a pair of virtual ethernet
+ * devices called name and peer.
+ *
+ * The attributes of the message are laid out as follows:
+ *
+ *     IFLA_LINKINFO
+ *             IFLA_INFO_KIND "veth"
+ *             IFLA_INFO_DATA
+ *                     VETH_INFO_PEER
+ *                             struct ifinfomsg
+ *                             IFLA_IFNAME peer
+ *     IFLA_IFNAME name
+ *
+ * Returns true if the request could be sent. No reply is read.
+ */
+bool create_veth_device_pair(const char *name, char *peer)
+{
+       struct mnl_socket *nl;
+       char buf[MNL_SOCKET_BUFFER_SIZE];
+       struct rtattr *n1, *n2, *n3;
+       struct nlmsghdr *nlh;
+       struct ifinfomsg *ifm;
+       bool success;
+
+       INFO_LOG("new pair: %s <-> %s\n", name, peer);
+       if (!(nl = get_and_bind_netlink_socket()))
+               return false;
+
+       nlh = prepare_netlink_msg_header(buf);
+       nlh->nlmsg_type = RTM_NEWLINK;
+       nlh->nlmsg_flags |= NLM_F_CREATE | NLM_F_EXCL;
+
+       ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
+       ifm->ifi_family = AF_UNSPEC;
+       n1 = addattr_nest(nlh, IFLA_LINKINFO);
+       addattr_l(nlh, IFLA_INFO_KIND, "veth", 5);
+       n2 = addattr_nest(nlh, IFLA_INFO_DATA);
+       n3 = addattr_nest(nlh, VETH_INFO_PEER);
+       /* the peer attribute contains a second ifinfomsg header */
+       ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
+       ifm->ifi_family = AF_UNSPEC;
+       addattr_l(nlh, IFLA_IFNAME, peer, strlen(peer) + 1);
+       /* nests are closed in reverse order, innermost first */
+       end_nest(nlh, n3);
+       end_nest(nlh, n2);
+       end_nest(nlh, n1);
+       addattr_l(nlh, IFLA_IFNAME, name, strlen(name) + 1);
+       if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) < 0) {
+               ERROR_LOG("%s: mnl_socket_sendto\n", name);
+               success = false;
+               goto close;
+       }
+       success = true;
+close:
+       mnl_socket_close(nl);
+       return success;
+}
+
+/*
+ * Send an RTM_NEWLINK request which moves the interface named iface to the
+ * network namespace of the process with the given pid. Returns true if the
+ * request could be sent. No reply is read from the socket.
+ */
+bool set_netns(const char *iface, pid_t pid)
+{
+       struct mnl_socket *nl;
+       char buf[MNL_SOCKET_BUFFER_SIZE];
+       struct nlmsghdr *nlh;
+       struct ifinfomsg *ifm;
+       bool success;
+
+       INFO_LOG("changing net namespace of interface %s to pid %d\n",
+               iface, (int)pid);
+       if (!(nl = get_and_bind_netlink_socket()))
+               return false;
+
+       nlh = prepare_netlink_msg_header(buf);
+       nlh->nlmsg_type = RTM_NEWLINK;
+
+       ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
+       ifm->ifi_family = AF_UNSPEC;
+       ifm->ifi_change = 0;
+       ifm->ifi_flags = 0;
+       addattr_l(nlh, IFLA_NET_NS_PID, &pid, sizeof(pid));
+       mnl_attr_put_str(nlh, IFLA_IFNAME, iface);
+
+       success = mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) >= 0;
+       if (!success)
+               ERROR_LOG("%s: mnl_socket_sendto failed\n", iface);
+       /* The socket must be closed on the error path, too. */
+       mnl_socket_close(nl);
+       return success;
+}
+
+#ifndef UNIX_PATH_MAX
+#define UNIX_PATH_MAX (sizeof(((struct sockaddr_un *)0)->sun_path))
+#endif
+
+/*
+ * Create a stream socket of the Unix domain and initialize the address
+ * structure for the given path.
+ *
+ * The first byte of sun_path is zero, so the address refers to the abstract
+ * socket namespace rather than to a path in the file system. On success the
+ * new descriptor is stored in *socketfd. On errors *socketfd is set to -1
+ * and false is returned.
+ */
+static bool init_unix_socket(const char *socket_path, int *socketfd,
+               struct sockaddr_un *sau)
+{
+       int fd;
+
+       *socketfd = -1;
+       /* room is needed for the leading and the terminating zero byte */
+       if (strlen(socket_path) + 1 >= UNIX_PATH_MAX) {
+               ERROR_LOG("socket path too long: %s\n", socket_path);
+               return false;
+       }
+       memset(sau, 0, sizeof(struct sockaddr_un));
+       sau->sun_family = AF_UNIX; /* address family, same value as PF_UNIX */
+       sau->sun_path[0] = '\0'; /* use the abstract socket namespace */
+       strcpy(sau->sun_path + 1, socket_path);
+       fd = socket(PF_UNIX, SOCK_STREAM, 0);
+       if (fd < 0) {
+               ERROR_LOG("socket: %m\n");
+               return false;
+       }
+       *socketfd = fd;
+       return true;
+}
+
+/*
+ * Create a non-blocking socket which listens on the given abstract socket
+ * address. On success the descriptor is stored in *result. On errors the
+ * socket is closed and *result is not touched.
+ */
+bool listen_on_unix_socket(const char *socket_path, int *result)
+{
+       struct sockaddr_un sau;
+       int fd, flags;
+
+       if (!init_unix_socket(socket_path, &fd, &sau))
+               return false;
+       flags = fcntl(fd, F_GETFL);
+       if (flags < 0) {
+               ERROR_LOG("fcntl (F_GETFL): %m\n");
+       } else if (fcntl(fd, F_SETFL, ((long)flags) | O_NONBLOCK) < 0) {
+               ERROR_LOG("fcntl (F_SETFL): %m\n");
+       } else if (bind(fd, (struct sockaddr *)&sau, sizeof(sau)) < 0) {
+               ERROR_LOG("bind: %m\n");
+       } else if (listen(fd , 5) < 0) {
+               ERROR_LOG("listen: %m\n");
+       } else {
+               *result = fd;
+               NOTICE_LOG("listening on fd %d\n", fd);
+               return true;
+       }
+       /* one of the steps above failed */
+       close(fd);
+       return false;
+}
+/*
+ * Send a buffer and the credentials of the current process to a socket.
+ *
+ * buf must be zero-terminated. The terminating zero byte is sent as well.
+ * The credentials (PID, UID, GID) are attached as a SCM_CREDENTIALS control
+ * message.
+ *
+ * Returns true on success, false if sendmsg() failed.
+ */
+static bool send_cred_buffer(int sock, char *buf)
+{
+       char control[255] __attribute__((__aligned__(8)));
+       struct msghdr msg;
+       struct cmsghdr *cmsg;
+       /* Only used during this call, so there is no reason to make it static. */
+       struct iovec iov;
+       struct ucred c;
+
+       /* Response data */
+       iov.iov_base = buf;
+       iov.iov_len = strlen(buf) + 1;
+       c.pid = getpid();
+       c.uid = getuid();
+       c.gid = getgid();
+       /* compose the message */
+       memset(&msg, 0, sizeof(msg));
+       msg.msg_iov = &iov;
+       msg.msg_iovlen = 1;
+       msg.msg_control = control;
+       msg.msg_controllen = sizeof(control);
+       /* attach the ucred struct */
+       cmsg = CMSG_FIRSTHDR(&msg);
+       cmsg->cmsg_level = SOL_SOCKET;
+       cmsg->cmsg_type = SCM_CREDENTIALS;
+       cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
+       *(struct ucred *)CMSG_DATA(cmsg) = c;
+       /* only one control message is sent, shrink the length accordingly */
+       msg.msg_controllen = cmsg->cmsg_len;
+       if (sendmsg(sock, &msg, 0) < 0) {
+               ERROR_LOG("sendmsg: %m\n");
+               return false;
+       }
+       return true;
+}
+
+/* Close the first num file descriptors of the given array. */
+static void dispose_fds(int *fds, unsigned num)
+{
+       int *fd;
+
+       for (fd = fds; fd < fds + num; fd++)
+               close(*fd);
+}
+
+/*
+ * Receive a buffer and the Unix credentials of the sending process.
+ *
+ * A connection is accepted on socketfd and one message of at most size bytes
+ * is received into buf. If the message carries a SCM_CREDENTIALS control
+ * message, the UID of the sender is stored in *uid, the descriptor of the
+ * accepted connection in *clientfd, and true is returned. File descriptors
+ * received before the credentials were seen are closed.
+ *
+ * On errors the connection is closed, *clientfd is set to -1 and false is
+ * returned. If accept() fails, *clientfd is not touched.
+ *
+ * NOTE(review): buf is not zero-terminated here. Presumably callers rely on
+ * the sender including the terminating zero byte -- confirm.
+ */
+bool recv_cred_buffer(int socketfd, char *buf, size_t size,
+               int *clientfd, uid_t *uid)
+{
+       char control[255] __attribute__((__aligned__(8)));
+       struct msghdr msg;
+       struct cmsghdr *cmsg;
+       struct iovec iov;
+       int yes = 1, cfd, ret;
+       struct ucred cred;
+       struct sockaddr_un sau;
+       socklen_t sizeof_sau = sizeof(sau);
+
+       ret = accept(socketfd, (struct sockaddr *)&sau, &sizeof_sau);
+       if (ret < 0) {
+               ERROR_LOG("accept: %m\n");
+               return false;
+       }
+       cfd = ret;
+       /* ask the kernel to attach the credentials of the sender */
+       setsockopt(cfd, SOL_SOCKET, SO_PASSCRED, &yes, sizeof(int));
+       memset(&msg, 0, sizeof(msg));
+       iov.iov_base = buf;
+       iov.iov_len = size;
+       msg.msg_iov = &iov;
+       msg.msg_iovlen = 1;
+       msg.msg_control = control;
+       msg.msg_controllen = sizeof(control);
+       if (recvmsg(cfd, &msg, 0) < 0) {
+               ERROR_LOG("recvmsg: %m\n");
+               goto fail;
+       }
+       cmsg = CMSG_FIRSTHDR(&msg);
+       while (cmsg) {
+               if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type
+                               == SCM_CREDENTIALS) {
+                       memcpy(&cred, CMSG_DATA(cmsg), sizeof(struct ucred));
+                       *uid = cred.uid;
+                       *clientfd = cfd;
+                       return true;
+               } else
+                       if (cmsg->cmsg_level == SOL_SOCKET
+                                       && cmsg->cmsg_type == SCM_RIGHTS) {
+                               dispose_fds((int *)CMSG_DATA(cmsg),
+                                       (cmsg->cmsg_len - CMSG_LEN(0))
+                                       / sizeof(int));
+                       }
+               cmsg = CMSG_NXTHDR(&msg, cmsg);
+       }
+fail:
+       /*
+        * Close the accepted connection. *clientfd has not been assigned on
+        * this path, so it must not be used as the descriptor to close.
+        */
+       close(cfd);
+       *clientfd = -1;
+       return false;
+}
+
+/*
+ * Send the file descriptor passfd over the Unix socket socketfd.
+ *
+ * The descriptor travels as a SCM_RIGHTS control message. The regular
+ * payload consists of a zero byte followed by the string "OK". Returns
+ * false if sendmsg() failed.
+ */
+bool pass_fd(int passfd, int socketfd)
+{
+       char control[255] __attribute__((__aligned__(8)));
+       char buf[] = "\0OK";
+       struct iovec iov = {.iov_base = buf, .iov_len = sizeof(buf)};
+       struct msghdr msg = {
+               .msg_iov = &iov,
+               .msg_iovlen = 1,
+               .msg_control = control,
+               .msg_controllen = sizeof(control),
+       };
+       struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+
+       cmsg->cmsg_level = SOL_SOCKET;
+       cmsg->cmsg_type = SCM_RIGHTS;
+       cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+       *(int *)CMSG_DATA(cmsg) = passfd;
+
+       /* Sum of the length of all control messages in the buffer */
+       msg.msg_controllen = cmsg->cmsg_len;
+       DEBUG_LOG("passing %s and fd %d\n", buf, passfd);
+       if (sendmsg(socketfd, &msg, 0) >= 0)
+               return true;
+       ERROR_LOG("sendmsg: %m\n");
+       return false;
+}
+
+/*
+ * Receive one message from socketfd and extract the file descriptor which
+ * was passed along with it.
+ *
+ * Returns true and stores the descriptor in *recvfd if the message contains
+ * a SCM_RIGHTS control message which carries exactly one descriptor.
+ * Otherwise false is returned and *recvfd is -1.
+ */
+static bool recv_fd(int socketfd, int *recvfd)
+{
+       char control[255] __attribute__((__aligned__(8)));
+       char buf[100];
+       struct iovec iov = {.iov_base = buf, .iov_len = sizeof(buf) - 1};
+       struct msghdr msg = {
+               .msg_iov = &iov,
+               .msg_iovlen = 1,
+               .msg_control = control,
+               .msg_controllen = sizeof(control),
+       };
+       struct cmsghdr *cmsg;
+       ssize_t received;
+
+       *recvfd = -1;
+       memset(buf, 0, sizeof(buf));
+       received = recvmsg(socketfd, &msg, 0);
+       if (received < 0) {
+               ERROR_LOG("recvmsg: %m\n");
+               return false;
+       }
+       buf[received] = '\0'; /* in bounds: at most sizeof(buf) - 1 bytes */
+       INFO_LOG("server response: %u (%s)\n", (unsigned)buf[0], buf + 1);
+       for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
+               const bool is_rights = cmsg->cmsg_level == SOL_SOCKET
+                       && cmsg->cmsg_type == SCM_RIGHTS;
+
+               if (!is_rights)
+                       continue;
+               if ((cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int) != 1)
+                       continue;
+               *recvfd = *(int *)CMSG_DATA(cmsg);
+               return true;
+       }
+       return false;
+}
+
+/*
+ * Connect to the given abstract socket, send msg together with the
+ * credentials of the calling process and wait for a file descriptor.
+ *
+ * The received descriptor is stored in *result. The return value is the
+ * descriptor of the connection, which is left open. All errors are fatal.
+ */
+int request_fd(const char *socket_path, char *msg, int *result)
+{
+       struct sockaddr_un addr;
+       int conn, passed;
+
+       if (!init_unix_socket(socket_path, &conn, &addr))
+               die("could not init socket");
+       if (connect(conn, (struct sockaddr *)&addr, sizeof(addr)) < 0)
+               die_errno("connect");
+       if (!send_cred_buffer(conn, msg))
+               die("could not send cred buffer");
+       if (!recv_fd(conn, &passed))
+               die("did not receive tty fd");
+       NOTICE_LOG("received fd %d\n", passed);
+       *result = passed;
+       return conn;
+}
+
+/*
+ * Connect to the given abstract socket, send msg together with the
+ * credentials of the calling process and read back an integer.
+ *
+ * The expected reply is a zero byte followed by the bytes of an int. If the
+ * first byte is not zero, the rest of the reply is regarded as an error
+ * message and is logged. On success the integer is stored in *result and
+ * true is returned. On errors *result is -1. The connection is closed in
+ * any case.
+ */
+bool request_int(const char *socket_path, char *msg, int *result)
+{
+       struct sockaddr_un sau;
+       int socketfd;
+       bool success = false;
+       char buf[100];
+       ssize_t ssz;
+
+       *result = -1;
+       if (!init_unix_socket(socket_path, &socketfd, &sau))
+               return false;
+       if (connect(socketfd, (struct sockaddr *)&sau, sizeof(sau)) < 0) {
+               ERROR_LOG("connect: %m\n");
+               goto close;
+       }
+       if (!send_cred_buffer(socketfd, msg)) {
+               ERROR_LOG("could not send cred msg \"%s\"\n", msg);
+               goto close;
+       }
+       ssz = read(socketfd, buf, sizeof(buf) - 1);
+       if (ssz < 0) {
+               ERROR_LOG("did not receive integer: %m\n");
+               goto close;
+       }
+       if (ssz == 0) { /* nothing was read, buf[0] is not initialized */
+               ERROR_LOG("did not receive integer: connection closed\n");
+               goto close;
+       }
+       /*
+        * Terminate the reply so that it can be printed as a string below.
+        * At most sizeof(buf) - 1 bytes were read, so this is in bounds.
+        */
+       buf[ssz] = '\0';
+       if (buf[0] != 0) {
+               ERROR_LOG("did not receive integer: %s\n", buf + 1);
+               goto close;
+       }
+       if (ssz != sizeof(int) + 1) {
+               ERROR_LOG("protocol mismatch, server msg: %s\n", buf + 1);
+               goto close;
+       }
+       memcpy(result, buf + 1, sizeof(int));
+       DEBUG_LOG("received integer: %d\n", *result);
+       success = true;
+close:
+       close(socketfd);
+       return success;
+}
+
+/*
+ * The signal handler writes to signal_pipe[1] and next_signal() reads from
+ * signal_pipe[0]. The pipe is created in init_signal_handling().
+ */
+int signal_pipe[2];
+
+/*
+ * Write the signal number as a single byte to the signal pipe. The value of
+ * errno is preserved.
+ */
+static void signal_handler(int signum)
+{
+       const int saved = errno;
+       uint8_t byte;
+
+       assert(signum > 0 && signum < 256); /* must fit into one byte */
+       byte = signum;
+       if (write(signal_pipe[1], &byte, 1) < 0)
+               ERROR_LOG("write to signal pipe: %m\n");
+       errno = saved;
+}
+
+/*
+ * Create the signal pipe and install signal_handler() for SIGINT, SIGTERM
+ * and SIGCHLD. The handlers are installed with SA_RESTART. All errors are
+ * fatal.
+ */
+void init_signal_handling(void)
+{
+       static const int signals[] = {SIGINT, SIGTERM, SIGCHLD};
+       struct sigaction act;
+       unsigned n;
+
+       if (pipe(signal_pipe) < 0)
+               die_errno("signal pipe");
+       act.sa_handler = signal_handler;
+       sigemptyset(&act.sa_mask);
+       act.sa_flags = SA_RESTART;
+       for (n = 0; n < sizeof(signals) / sizeof(signals[0]); n++)
+               if (sigaction(signals[n], &act, NULL) < 0)
+                       die_errno("sigaction");
+}
+
+/*
+ * Read one byte from the signal pipe and return it as the signal number.
+ *
+ * Reads which fail with EINTR are retried, other read errors are fatal. If
+ * read() returns zero, the return value is zero.
+ */
+int next_signal(void)
+{
+       uint8_t signum = 0;
+
+       while (read(signal_pipe[0], &signum, 1) < 0) {
+               if (errno != EINTR)
+                       die_errno("read");
+       }
+       DEBUG_LOG("process %d received signal %u\n", getpid(), signum);
+       return signum;
+}
diff --git a/version-gen.sh b/version-gen.sh
new file mode 100755 (executable)
index 0000000..5e554ee
--- /dev/null
@@ -0,0 +1,27 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0-only
+
+package="$1"
+version_file="$2"
+
+ver='unnamed_version'
+# First try git, then gitweb, then default.
+if [ -e '.git' -o -e '../.git' ]; then
+       git_ver=$(git describe --abbrev=4 HEAD 2>/dev/null)
+       [ -z "$git_ver" ] && git_ver="$ver"
+       # update stat information in index to match working tree
+       git update-index -q --refresh > /dev/null
+       # if there are differences (exit code 1), the working tree is dirty
+       git diff-index --quiet HEAD || git_ver=$git_ver-dirty
+       ver=$git_ver
+elif [ "${PWD%%-*}" = $package- ]; then
+       ver=${PWD##*/$package-}
+fi
+ver=${ver#v}
+
+echo "$ver"
+[ -z "${version_file}" ] && exit 0
+# update version file if necessary
+content="const char *${package}_version(void) {return \"$ver\";};"
+[ -r "$version_file" ] && echo "$content" | cmp -s - $version_file && exit 0
+echo "$content" > $version_file