--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0+ */
+#include "misma.h"
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <lopsub.h>
+#include <sys/mman.h>
+#include <math.h>
+#include <signal.h>
+#include <sys/stat.h>
+#include <sys/sysmacros.h>
+#include <sys/ioctl.h>
+
+#include "misma.lsg.h"
+
+enum interval_type { /* indexes into snapshot_config.interval[] */
+ IT_CREATE, /* time between two snapshot creations */
+ IT_TRIM, /* time between two filesystem trims, zero disables trimming */
+ IT_MAX_AGE, /* NOTE(review): presumably the intended age of the oldest snapshot -- confirm */
+ NUM_INTERVAL_TYPES /* number of interval types, used as array size */
+};
+
+struct snapshot_config { /* settings which exist once per scope */
+ struct percentage_pair thresholds; /* pool utilization limits (data/meta) */
+ unsigned interval[NUM_INTERVAL_TYPES]; /* seconds, indexed by enum interval_type */
+};
+static struct snapshot_config global_config = { /* values used at global scope */
+ .thresholds = {.data = 95, .meta = 95},
+ .interval = {
+ [IT_CREATE] = 6 * 3600, /* six hours */
+ [IT_TRIM] = 0, /* no trimming unless configured */
+ [IT_MAX_AGE] = 86400 * 365 /* 365 days */
+ }
+};
+
+enum event_type {ET_CREATE, ET_CHECK, ET_TRIM, NUM_EVENT_TYPES}; /* kinds of events scheduled by com_run() */
+
+struct volume_group { /* one entry of the global volume group array */
+ char *name; /* heap-allocated copy, see insert_vg() */
+ struct snapshot_config config; /* values set at vg scope */
+};
+static unsigned num_vgs; /* number of entries of volume_group[] */
+static struct volume_group *volume_group; /* num_vgs elements */
+
+/* Look up the name of the volume group with the given ID (no bounds check). */
+static const char *vgname(unsigned vgid)
+{
+	const struct volume_group *vg = volume_group + vgid;
+
+	return vg->name;
+}
+
+/*
+ * Return the ID of the volume group with the given name, or ~0U if no such
+ * volume group has been inserted. A linear scan is good enough here.
+ */
+static unsigned get_vgid(const char *name)
+{
+	unsigned vgid = 0;
+
+	while (vgid < num_vgs) {
+		if (strcmp(name, volume_group[vgid].name) == 0)
+			return vgid;
+		vgid++;
+	}
+	return ~0U;
+}
+
+/*
+ * Append a volume group to the global array unless one of that name exists
+ * already. Returns the ID of the new or the existing entry.
+ */
+static unsigned insert_vg(const char *name)
+{
+	unsigned new_id, found = get_vgid(name);
+	struct volume_group *entry;
+
+	if (found != ~0U)
+		return found;
+	new_id = num_vgs;
+	INFO_LOG("vg #%u: %s\n", new_id, name);
+	num_vgs = new_id + 1;
+	volume_group = xrealloc(volume_group, num_vgs
+		* sizeof(struct volume_group));
+	entry = &volume_group[new_id];
+	memset(entry, 0, sizeof(struct volume_group));
+	entry->name = xstrdup(name);
+	return new_id;
+}
+
+struct thin_pool { /* one entry of the global thin pool array */
+ char *name; /* heap-allocated copy, see insert_pool() */
+ unsigned vgid; /* index into volume_group[] */
+ struct snapshot_config config; /* values set at pool scope */
+ struct percentage_pair utilization; /* refreshed by get_utilization() */
+ enum lvm_scope threshold_scope; /* selects the config whose thresholds apply, see pool_is_full() */
+};
+static unsigned num_pools; /* number of entries of thin_pool[] */
+static struct thin_pool *thin_pool; /* num_pools elements */
+
+/*
+ * Find the thin pool with the given name in the given volume group. Returns
+ * the pool ID, or ~0U if there is no such pool.
+ */
+static unsigned get_poolid(const char *name, const char *vg_name)
+{
+	unsigned id;
+
+	for (id = 0; id < num_pools; id++) {
+		const struct thin_pool *tp = &thin_pool[id];
+
+		if (strcmp(name, tp->name) != 0)
+			continue;
+		if (strcmp(vg_name, vgname(tp->vgid)) != 0)
+			continue;
+		return id;
+	}
+	return ~0U;
+}
+
+/*
+ * Register a thin pool unless it is known already, and return its ID. Dies
+ * if the volume group of the pool has not been inserted beforehand.
+ */
+static unsigned insert_pool(const char *name, const char *vgname)
+{
+	unsigned new_id, found = get_poolid(name, vgname);
+	struct thin_pool *tp;
+
+	if (found != ~0U)
+		return found;
+	new_id = num_pools;
+	INFO_LOG("pool #%u: %s/%s\n", new_id, vgname, name);
+	num_pools = new_id + 1;
+	thin_pool = xrealloc(thin_pool, num_pools * sizeof(struct thin_pool));
+	tp = &thin_pool[new_id];
+	memset(tp, 0, sizeof(struct thin_pool));
+	tp->name = xstrdup(name);
+	tp->vgid = get_vgid(vgname);
+	if (tp->vgid == ~0U)
+		die("invalid vg: %s", vgname);
+	return new_id;
+}
+
+struct snapshot { /* one slot of the snapshot array of an origin */
+ unsigned seq; /* sequence number, zero marks the slot as unused */
+ uint64_t epoch; /* creation time in seconds since the epoch */
+};
+
+struct origin { /* a thin logical volume whose snapshots are managed */
+ char *name; /* heap-allocated copy, see insert_origin() */
+ unsigned vgid; /* index into volume_group[] */
+ unsigned poolid; /* index into thin_pool[] */
+ struct snapshot_config config; /* values set at origin scope */
+ enum lvm_scope iscope[NUM_INTERVAL_TYPES]; /* interval scopes */
+ uint64_t last_event[NUM_EVENT_TYPES]; /* epochs */
+ unsigned last_seq; /* highest sequence number found or created so far */
+ unsigned num_slots; /* number of elements of the snapshot array */
+ struct snapshot *snapshot; /* num_slots elements */
+};
+static unsigned num_origins; /* number of entries of origin[] */
+static struct origin *origin; /* num_origins elements */
+#define FOR_EACH_ORIGIN(_n) for (_n = 0; _n < num_origins; _n++) /* _n must be declared by the caller */
+
+static unsigned check_seconds = 60; /* seconds between two utilization checks */
+
+/*
+ * Return the length in seconds of the given interval type for an origin.
+ *
+ * The value is taken from the configuration of the scope which is recorded
+ * in o->iscope[it]: global, volume group, thin pool, or the origin itself.
+ */
+static unsigned interval_length(enum interval_type it, const struct origin *o)
+{
+	switch (o->iscope[it]) {
+	case LS_GLOBAL: return global_config.interval[it];
+	case LS_VG: return volume_group[o->vgid].config.interval[it];
+	case LS_POOL: return thin_pool[o->poolid].config.interval[it];
+	case LS_ORIGIN: return o->config.interval[it];
+	default: break;
+	}
+	/*
+	 * Not reached for valid scopes. Return the global value rather than
+	 * falling off the end of the function if assertions are compiled out.
+	 */
+	assert(0);
+	return global_config.interval[it];
+}
+
+/*
+ * Find the origin with the given name in the given volume group. Returns the
+ * origin ID, or ~0U if no such origin has been inserted.
+ */
+static unsigned get_oid(const char *name, const char *vg_name)
+{
+	unsigned id;
+
+	FOR_EACH_ORIGIN(id) {
+		const struct origin *cand = &origin[id];
+
+		if (strcmp(name, cand->name) != 0)
+			continue;
+		if (strcmp(vg_name, vgname(cand->vgid)) == 0)
+			return id;
+	}
+	return ~0U;
+}
+
+/*
+ * Append a new origin to the global array and return its ID. The origin must
+ * not exist yet, and both its volume group and its thin pool must have been
+ * inserted already.
+ */
+static unsigned insert_origin(const char *name, const char *vgname,
+		const char *poolname)
+{
+	unsigned existing = get_oid(name, vgname);
+	unsigned new_id = num_origins;
+	struct origin *fresh;
+
+	assert(existing == ~0U);
+	INFO_LOG("origin #%u: %s/%s, pool: %s\n", new_id, vgname, name,
+		poolname);
+	num_origins = new_id + 1;
+	origin = xrealloc(origin, num_origins * sizeof(struct origin));
+	fresh = &origin[new_id];
+	memset(fresh, 0, sizeof(struct origin));
+	fresh->name = xstrdup(name);
+	fresh->vgid = get_vgid(vgname);
+	assert(fresh->vgid != ~0U);
+	fresh->poolid = get_poolid(poolname, vgname);
+	assert(fresh->poolid != ~0U);
+	return new_id;
+}
+
+struct event { /* an entry of the event heap set up by com_run() */
+ enum event_type type; /* what to do when the event is due */
+ uint64_t epoch; /* when the event is due, seconds since the epoch */
+ struct origin *origin; /* NULL for the utilization check event */
+};
+
+/*
+ * Comparison function for the event heap. Note the inverted sense: the
+ * event with the smaller epoch compares as greater.
+ */
+static int event_compare(const void *d1, const void *d2)
+{
+	const struct event *first = d1;
+	const struct event *second = d2;
+
+	if (first->epoch == second->epoch)
+		return 0;
+	return first->epoch < second->epoch? 1 : -1;
+}
+
+static char *config_file; /* path of the config file, set by parse_options() unless set before */
+
+/*
+ * Iterate over all slot numbers of an origin, from the highest down to zero.
+ *
+ * The loop variable _j is declared by the macro. The loop ends when the
+ * unsigned counter wraps around, so nothing is executed if there are no
+ * slots. The origin argument is parenthesized so that pointer expressions
+ * like "origin + n" may be passed.
+ */
+#define FOR_EACH_SLOT_REVERSE(_j, _o) for ( \
+	unsigned _j = (_o)->num_slots - 1; _j != -1U; _j--)
+
+static unsigned loglevel_arg_val = LL_WARNING; /* misma_log() drops messages below this level */
+
+/* lopsub: commands, parse results and accessors for option values */
+static const struct lls_command *subcmd; /* NOTE(review): presumably the subcommand being executed; it is assigned outside of this view */
+static struct lls_parse_result *lpr, *sublpr; /* OPT_RESULT() uses lpr for the MISMA command and sublpr otherwise */
+#define CMD_PTR(_cname) lls_cmd(LSG_MISMA_CMD_ ## _cname, misma_suite)
+#define OPT_RESULT(_cname, _oname) (lls_opt_result(\
+ LSG_MISMA_ ## _cname ## _OPT_ ## _oname, \
+ (CMD_PTR(_cname) == CMD_PTR(MISMA))? lpr : sublpr))
+#define OPT_GIVEN(_cname, _oname) (lls_opt_given(OPT_RESULT(_cname, _oname)))
+#define OPT_UINT32_VAL(_cname, _oname) (lls_uint32_val(0, \
+ OPT_RESULT(_cname, _oname)))
+#define OPT_STRING_VAL_N(_n, _cname, _oname) (lls_string_val(_n, \
+ OPT_RESULT(_cname, _oname)))
+#define OPT_STRING_VAL(_cname, _oname) (OPT_STRING_VAL_N(0, _cname, _oname))
+
+struct misma_user_data {bool (*handler)(void);}; /* per-subcommand data: the function which implements the command */
+#define EXPORT_CMD_HANDLER(_cmd) const struct misma_user_data \
+ lsg_misma_com_ ## _cmd ## _user_data = { \
+ .handler = com_ ## _cmd \
+ };
+
+/*
+ * Print a log message to stderr unless its level is below the configured
+ * log level. For the run subcommand each message is prefixed with a
+ * timestamp. Does not allocate memory.
+ */
+void misma_log(int ll, const char* fmt,...)
+{
+	va_list ap;
+
+	if (ll < loglevel_arg_val)
+		return;
+	if (subcmd == CMD_PTR(RUN)) {
+		char stamp[255] = "";
+		time_t now = time(NULL);
+
+		strftime(stamp, sizeof(stamp), "%b %d %H:%M:%S",
+			localtime(&now));
+		fprintf(stderr, "%s ", stamp);
+	}
+	va_start(ap, fmt);
+	vfprintf(stderr, fmt, ap);
+	va_end(ap);
+}
+static const char *exit_hook;
+
+__attribute__ ((noreturn))
+static void run_exit_hook_and_die(const char *str)
+{
+ char *arg;
+ char *argv[] = {"/bin/sh", "-c", NULL, NULL};
+ const char *tmp;
+
+ if (exit_hook) {
+ /*
+ * Prevent helpers from calling us again via die() or
+ * die_errno(), which would result in a crash due to an endless
+ * call stack.
+ */
+ tmp = exit_hook;
+ exit_hook = NULL;
+ arg = msg("%s '%s'", tmp, str);
+ argv[2] = arg;
+ xexec(argv, NULL);
+ }
+ exit(EXIT_FAILURE);
+}
+
+/*
+ * Log a formatted message at emergency level, run the exit hook (if any)
+ * with the message as its argument, and terminate with failure status.
+ */
+void die(const char *fmt, ...)
+{
+	char *text;
+	va_list ap;
+	int len;
+
+	va_start(ap, fmt);
+	len = vasprintf(&text, fmt, ap);
+	va_end(ap);
+	if (len >= 0) {
+		misma_log(LL_EMERG, "%s\n", text);
+		run_exit_hook_and_die(text);
+	}
+	/* could not format the message: give up without running the hook */
+	EMERG_LOG("OOM\n");
+	exit(EXIT_FAILURE);
+}
+
+/*
+ * Like die(), but the log message additionally contains the description of
+ * the errno value which was current when the function was entered. The exit
+ * hook receives the message without that description.
+ */
+void die_errno(const char *fmt, ...)
+{
+	const int saved = errno; /* the calls below may clobber errno */
+	char *text;
+	va_list ap;
+	int len;
+
+	va_start(ap, fmt);
+	len = vasprintf(&text, fmt, ap);
+	va_end(ap);
+	if (len >= 0) {
+		misma_log(LL_EMERG, "%s: %s\n", text, strerror(saved));
+		run_exit_hook_and_die(text);
+	}
+	EMERG_LOG("OOM\n");
+	exit(EXIT_FAILURE);
+}
+
+/*
+ * Find the first zero bit: return the position of the least significant bit
+ * of v which is not set. The argument must contain at least one zero bit.
+ */
+__attribute__ ((const))
+static uint32_t ffz(uint32_t v)
+{
+	static const uint32_t width[] = {16, 8, 4, 2, 1};
+	uint32_t pos = 0;
+
+	assert(v != (uint32_t)-1);
+	/* binary search: skip each block of low bits which are all set */
+	for (unsigned n = 0; n < sizeof(width) / sizeof(width[0]); n++) {
+		const uint32_t mask = ((uint32_t)1 << width[n]) - 1;
+
+		if ((v & mask) != mask)
+			continue;
+		pos += width[n];
+		v >>= width[n];
+	}
+	return pos;
+}
+
+/* A slot is in use iff its snapshot has a non-zero sequence number. */
+static bool slot_is_used(unsigned slot, const struct origin *o)
+{
+	const struct snapshot *s = o->snapshot + slot;
+
+	return s->seq > 0;
+}
+
+/* Release a slot by resetting the sequence number of its snapshot to zero. */
+static void mark_slot_unused(unsigned slot, struct origin *o)
+{
+	struct snapshot *s = o->snapshot + slot;
+
+	s->seq = 0;
+}
+
+/*
+ * Pick the slot for the snapshot with the given sequence number: the highest
+ * numbered unused slot if there is one. Otherwise the slot number is the
+ * position of the first zero bit of seq modulo (2^num_slots - 1).
+ */
+static unsigned get_slot(unsigned seq, const struct origin *o)
+{
+	unsigned modulus;
+
+	FOR_EACH_SLOT_REVERSE(candidate, o) {
+		if (slot_is_used(candidate, o))
+			continue;
+		return candidate;
+	}
+	/* all slots used */
+	modulus = (1 << o->num_slots) - 1;
+	return ffz(seq % modulus);
+}
+
+/*
+ * Run lvremove on the snapshot stored in the given slot and mark the slot
+ * as unused on success. In dry-run mode only a message is printed and true
+ * is returned.
+ *
+ * We specify --autobackup n to avoid filling up /etc/lvm/archive with tons of
+ * useless backup configurations.
+ */
+static bool remove_snapshot(unsigned sl, struct origin *o, bool dry_run)
+{
+	bool removed = true;
+	char *lv_path = msg("%s/misma-%s.%u", vgname(o->vgid), o->name,
+		o->snapshot[sl].seq);
+	char *argv[] = {
+		"lvremove", "--yes", "--quiet", "--quiet",
+		"--autobackup", "n",
+		lv_path,
+		NULL
+	};
+
+	if (dry_run)
+		printf("dry-run: would remove snapshot %s\n", lv_path);
+	else {
+		NOTICE_LOG("removing snapshot %s\n", lv_path);
+		removed = xexec(argv, NULL);
+		if (removed)
+			mark_slot_unused(sl, o);
+	}
+	free(lv_path);
+	return removed;
+}
+
+/*
+ * Comparison function for sort_slots(): unused slots come first, followed by
+ * the used slots in order of descending sequence number.
+ *
+ * The decision is based solely on the contents of the two elements. The sort
+ * routine is free to pass pointers to copies which live outside of the
+ * snapshot array, so deriving a slot index from an element address is not
+ * reliable. Two unused slots compare as equal so that the ordering is
+ * consistent.
+ */
+static int slot_compare(const void *a, const void *b, void *data)
+{
+	const struct snapshot *s1 = a, *s2 = b;
+	const bool used1 = s1->seq != 0, used2 = s2->seq != 0;
+
+	(void)data; /* the origin is not needed to compare two snapshots */
+	if (!used1 || !used2)
+		return (int)used1 - (int)used2;
+	if (s1->seq < s2->seq)
+		return 1;
+	if (s1->seq > s2->seq)
+		return -1;
+	return 0;
+}
+
+/* Bring the snapshot array of an origin into slot_compare() order. */
+static void sort_slots(struct origin *o)
+{
+	const size_t elem_size = sizeof(struct snapshot);
+
+	qsort_r(o->snapshot, o->num_slots, elem_size, slot_compare, o);
+}
+
+/*
+ * sleazy (adj.): 1640s, "downy, fuzzy," later "flimsy, unsubstantial" (1660s).
+ *
+ * A sleazy snapshot is one whose distance (with respect to creation time) to
+ * its sibling snapshots is minimal.
+ *
+ * The slots are sorted first. Then each used slot gets a score: the time
+ * span between its two neighbours, or ten times the distance to its only
+ * neighbour for the first and the last used slot. The slot with the lowest
+ * score is removed and the slots are sorted again. Returns false if there is
+ * no snapshot or if the removal failed.
+ */
+static bool remove_sleazy_snapshot(struct origin *o, bool dry_run)
+{
+ unsigned sl, victim = 0;
+ uint64_t score = 0;
+ bool have_victim = false;
+ struct snapshot *prev = NULL, *next = NULL;
+
+ sort_slots(o);
+ for (sl = 0; sl < o->num_slots; sl++) /* skip the unused slots at the front */
+ if (slot_is_used(sl, o))
+ break;
+ for (; sl < o->num_slots; prev = o->snapshot + sl, sl++) {
+ uint64_t dist;
+ struct snapshot *s = o->snapshot + sl;
+
+ assert(slot_is_used(sl, o));
+ next = sl == o->num_slots - 1? NULL : s + 1;
+ if (!prev && !next) /* the only snapshot */
+ dist = 1;
+ else if (!prev) /* first used slot: only a successor exists */
+ dist = 10 * (s->epoch - next->epoch);
+ else if (!next) /* last slot: only a predecessor exists */
+ dist = 10 * (prev->epoch - s->epoch);
+ else
+ dist = prev->epoch - next->epoch;
+ DEBUG_LOG("seq %u, slot %u, epoch %" PRIu64 ", score %" PRIu64"\n",
+ s->seq, sl, s->epoch, dist);
+ if (!have_victim || dist < score) { /* ties keep the earlier slot */
+ have_victim = true;
+ victim = sl;
+ score = dist;
+ }
+ }
+ if (!have_victim) {
+ INFO_LOG("no snapshots\n");
+ return false;
+ }
+ NOTICE_LOG("victim: seq %u, slot %u, score %" PRIu64 "\n",
+ o->snapshot[victim].seq, victim, score);
+ if (!remove_snapshot(victim, o, dry_run))
+ return false;
+ sort_slots(o); /* the slot which was freed moves to the front */
+ return true;
+}
+
+static void set_interval(enum interval_type it, const struct time_arg *ta)
+{ /*
+ * Store the number of seconds of a parsed time argument in the
+ * configuration of the scope named by its lvmspec (global, vg, pool or
+ * origin). For non-global scopes, each matching origin whose current scope
+ * for this interval type compares lower is switched to the new scope. Dies
+ * if the lvmspec refers to an unknown vg, pool or origin.
+ */
+ enum lvm_scope scope = ta->lvmspec.scope;
+ unsigned vgid, poolid, oid, n; /* poolid and oid are only set for their own scope */
+
+ if (scope == LS_GLOBAL) {
+ NOTICE_LOG("default interval #%u: %u seconds\n", it,
+ ta->seconds);
+ global_config.interval[it] = ta->seconds;
+ return;
+ }
+ vgid = get_vgid(ta->lvmspec.vg);
+ if (vgid == ~0U)
+ die("invalid vg in lvmspec: %s", ta->lvmspec.vg);
+ switch (scope) {
+ case LS_VG:
+ volume_group[vgid].config.interval[it] = ta->seconds;
+ break;
+ case LS_POOL:
+ poolid = get_poolid(ta->lvmspec.pool, vgname(vgid));
+ if (poolid == ~0U)
+ die("invalid pool in lvmspec: %s", ta->lvmspec.pool);
+ thin_pool[poolid].config.interval[it] = ta->seconds;
+ break;
+ case LS_ORIGIN:
+ oid = get_oid(ta->lvmspec.tlv, vgname(vgid));
+ if (oid == ~0U)
+ die("invalid tlv in lvmspec: %s", ta->lvmspec.tlv);
+ origin[oid].config.interval[it] = ta->seconds;
+ break;
+ default:
+ assert(0);
+ }
+ /*
+ * Narrow the scope of all matching origins for which it is currently
+ * set to a wider scope.
+ */
+ FOR_EACH_ORIGIN(n) {
+ struct origin *o = origin + n;
+ if (o->iscope[it] >= scope)
+ continue; /* already set to more narrow scope */
+ switch (scope) {
+ case LS_ORIGIN:
+ if (n != oid)
+ continue;
+ break;
+ case LS_POOL:
+ if (poolid != o->poolid || vgid != o->vgid)
+ continue;
+ break;
+ case LS_VG:
+ if (vgid != o->vgid)
+ continue;
+ break;
+ default:
+ assert(0);
+ }
+ NOTICE_LOG("interval #%u for %s/%s: %u seconds\n", it,
+ vgname(o->vgid), o->name, ta->seconds);
+ o->iscope[it] = scope; /* interval_length() now reads the config of this scope */
+ }
+}
+
+struct lv_info { /* the fields of one line of lvs output, see parse_lvs_line() */
+ char *vg, *lv, *pool, *origin; /* heap-allocated, released by free_lv_info() */
+ uint64_t time; /* stored as snapshot epoch by init_origins() */
+};
+
+/* Free the four strings of an lv_info structure, but not the structure. */
+static void free_lv_info(struct lv_info *lv)
+{
+	char *members[] = {lv->vg, lv->lv, lv->pool, lv->origin};
+
+	for (unsigned n = 0; n < sizeof(members) / sizeof(members[0]); n++)
+		free(members[n]);
+}
+
+/*
+ * Cut the next comma-terminated field off the string at *pos.
+ *
+ * The comma is overwritten with a NUL byte and *pos is advanced to the
+ * character which follows it. Returns a pointer to the start of the field.
+ * Dies if there is no comma. The full line is only used for the error
+ * message.
+ */
+static char *next_lvs_field(char **pos, const char *line)
+{
+	char *field = *pos, *comma = strchr(field, ',');
+
+	if (!comma)
+		die("cannot parse lvs line: %s", line);
+	*comma = '\0';
+	*pos = comma + 1;
+	return field;
+}
+
+/*
+ * Split one line of the lvs output of init_origins() into its five fields:
+ * vg, lv, pool, origin and time.
+ *
+ * The four strings of the result are allocated and must be released with
+ * free_lv_info(). The lvs output is external input, so malformed lines are
+ * reported via die() rather than assert(). In particular the time field is
+ * parsed unconditionally, not as a side effect of an assertion which is
+ * skipped when NDEBUG is defined.
+ */
+static void parse_lvs_line(const char *line, struct lv_info *result)
+{
+	char *tmp, *p;
+
+	/*
+	 * The first two characters are skipped. NOTE(review): presumably the
+	 * indentation which lvs prints in front of each line -- confirm.
+	 */
+	if (strlen(line) < 2)
+		die("cannot parse lvs line: %s", line);
+	tmp = xstrdup(line);
+	p = tmp + 2;
+	result->vg = xstrdup(next_lvs_field(&p, line));
+	if (result->vg[0] == '\0') /* the vg name must not be empty */
+		die("cannot parse lvs line: %s", line);
+	result->lv = xstrdup(next_lvs_field(&p, line));
+	result->pool = xstrdup(next_lvs_field(&p, line));
+	result->origin = xstrdup(next_lvs_field(&p, line));
+	if (sscanf(p, "%" PRIu64, &result->time) != 1)
+		die("cannot parse lvs line: %s", line);
+	free(tmp);
+}
+
+/*
+ * Populate the global arrays of volume groups, thin pools and origins from
+ * the output of a single lvs command.
+ *
+ * The command selects all origins given via --origin together with their
+ * misma snapshots, sorted with "-O -lv_time". The output is walked three
+ * times: to insert volume groups and pools, to insert the origins, and to
+ * fill the snapshot array of each origin. Dies if --origin was not given, if
+ * an argument is malformed, if lvs fails, or if a given origin does not
+ * exist or is no thin LV.
+ *
+ * Both the select string and the lvs output are released when they are no
+ * longer needed.
+ */
+static void init_origins(void)
+{
+	unsigned n, oid;
+	char *argv[] = {
+		"lvs",
+		"--select", NULL,
+		"--noheading",
+		"--separator", ",",
+		"--readonly",
+		"--unquoted",
+		"-o", "vgname,lvname,pool_lv,origin,lvtime",
+		"-O", "-lv_time",
+		"--config", "report/time_format=%s",
+		NULL
+	};
+	char *buf, *tmp, *line, *select_string = NULL;
+	struct line_iter liter;
+	struct lv_info lv;
+
+	if (OPT_GIVEN(MISMA, ORIGIN) == 0)
+		die("--origin not given");
+
+	/* create argument to --select */
+	for (n = 0; n < OPT_GIVEN(MISMA, ORIGIN); n++) {
+		char *tmp2, *slash;
+		const char *arg = OPT_STRING_VAL_N(n, MISMA, ORIGIN);
+
+		tmp = xstrdup(arg);
+		slash = strchr(tmp, '/');
+		if (!slash || slash == tmp || !slash[1])
+			die("--origin arg must be of the form vg/tlv");
+		*slash = '\0';
+		tmp2 = msg("%s%s (vg_name=%s && (lv_name=%s ||"
+			"(origin=%s && lv_name =~ misma-%s.[0-9]+)))",
+			select_string? select_string : "",
+			select_string? " || " : "" ,
+			tmp, slash + 1, slash + 1, slash + 1
+		);
+		free(tmp);
+		free(select_string);
+		select_string = tmp2;
+	}
+	argv[2] = select_string;
+	if (!xexec(argv, &buf))
+		die("lvs failure");
+	free(select_string); /* only needed for the lvs call */
+	/* each pass iterates over its own copy of the output */
+	tmp = xstrdup(buf);
+	line_iter_init(&liter, tmp);
+	/* insert vgs and pools */
+	while ((line = line_iter_get(&liter))) {
+		parse_lvs_line(line, &lv);
+		DEBUG_LOG("vg: %s, lv: %s, pool: %s, origin: %s, "
+			"time: %" PRIu64"\n",
+			lv.vg, lv.lv, lv.pool, lv.origin, lv.time);
+		if (lv.origin[0] == '\0') { /* origin */
+			insert_vg(lv.vg);
+			if (lv.pool[0] == '\0')
+				die("%s/%s is no thin LV", lv.vg, lv.lv);
+			insert_pool(lv.pool, lv.vg);
+		}
+		free_lv_info(&lv);
+	}
+	free(tmp);
+	tmp = xstrdup(buf);
+	line_iter_init(&liter, tmp);
+	/* insert origins */
+	while ((line = line_iter_get(&liter))) {
+		parse_lvs_line(line, &lv);
+		if (lv.origin[0] == '\0')
+			insert_origin(lv.lv, lv.vg, lv.pool);
+		free_lv_info(&lv);
+	}
+	free(tmp);
+	/* check that all given origins exist */
+	for (n = 0; n < OPT_GIVEN(MISMA, ORIGIN); n++) {
+		const char *arg = OPT_STRING_VAL_N(n, MISMA, ORIGIN);
+		char *slash;
+
+		tmp = xstrdup(arg);
+		slash = strchr(tmp, '/'); /* present, checked above */
+		*slash = '\0';
+		oid = get_oid(slash + 1, tmp);
+		free(tmp);
+		if (oid == ~0U)
+			die("origin %s does not exist", arg);
+	}
+	tmp = xstrdup(buf);
+	line_iter_init(&liter, tmp);
+	/* allocate and init snapshot arrays */
+	while ((line = line_iter_get(&liter))) {
+		char *fmt;
+		struct snapshot *s;
+		struct origin *o;
+
+		parse_lvs_line(line, &lv);
+		if (lv.origin[0] == '\0') { /* no snapshot */
+			free_lv_info(&lv);
+			continue;
+		}
+		oid = get_oid(lv.origin, lv.vg);
+		assert(oid != ~0U);
+		o = origin + oid;
+		o->num_slots++;
+		o->snapshot = xrealloc(o->snapshot, o->num_slots
+			* sizeof(struct snapshot));
+		s = o->snapshot + o->num_slots - 1;
+		/* the sequence number is the suffix of the snapshot name */
+		fmt = msg("misma-%s.%%u", lv.origin);
+		if (sscanf(lv.lv, fmt, &s->seq) != 1)
+			die("parse error: %s", lv.lv);
+		free(fmt);
+		s->epoch = lv.time;
+		if (s->seq > o->last_seq)
+			o->last_seq = s->seq;
+		if (s->epoch > o->last_event[ET_CREATE])
+			o->last_event[ET_CREATE] = s->epoch;
+		free_lv_info(&lv);
+	}
+	free(tmp);
+	free(buf);
+}
+
+/*
+ * Report a lopsub error and die. The error context, if any, is printed in
+ * front of the error string and is freed.
+ */
+static void die_lopsub(int lopsub_ret, char **errctx)
+{
+	const char *reason = lls_strerror(-lopsub_ret);
+	char *ctx = *errctx;
+
+	if (!ctx)
+		ERROR_LOG("%s\n", reason);
+	else
+		ERROR_LOG("%s: %s\n", ctx, reason);
+	free(ctx);
+	*errctx = NULL;
+	die("lopsub error");
+}
+
+static void parse_options(int argc, char **argv, const struct lls_command *cmd,
+ struct lls_parse_result **lprp)
+{ /*
+ * Parse the command line for the given command into *lprp, then merge in
+ * the options of the config file, if there is one. On the first call the
+ * config file path is taken from --config-file or defaults to
+ * $HOME/.mismarc. All errors are fatal, except that a missing default
+ * config file is silently ignored.
+ */
+ int ret, fd = -1;
+ struct stat statbuf;
+ void *map;
+ size_t sz;
+ int cf_argc;
+ char **cf_argv, *errctx = NULL;
+ const char *subcmd_name;
+ struct lls_parse_result *merged_lpr, *cf_lpr;
+
+ ret = lls_parse(argc, argv, cmd, lprp, &errctx);
+ if (ret < 0)
+ die_lopsub(ret, &errctx);
+ if (!config_file) { /* not determined by an earlier call */
+ if (OPT_GIVEN(MISMA, CONFIG_FILE))
+ config_file = xstrdup(OPT_STRING_VAL(MISMA,
+ CONFIG_FILE));
+ else {
+ const char *home = getenv("HOME");
+ if (!home || !*home)
+ die("fatal: HOME is unset or empty");
+ config_file = msg("%s/.mismarc", home);
+ }
+ }
+ ret = open(config_file, O_RDONLY);
+ if (ret < 0) {
+ if (errno != ENOENT || OPT_GIVEN(MISMA, CONFIG_FILE)) /* an explicitly given file must exist */
+ die_errno("can not open config file %s", config_file);
+ /* no config file -- nothing to do */
+ ret = 0;
+ goto success;
+ }
+ fd = ret;
+ ret = fstat(fd, &statbuf);
+ if (ret < 0)
+ die_errno("failed to stat config file %s", config_file);
+ sz = statbuf.st_size;
+ if (sz == 0) { /* config file is empty -- nothing to do */
+ ret = 0;
+ goto success;
+ }
+ map = mmap(NULL, sz, PROT_READ, MAP_PRIVATE, fd, 0);
+ if (map == MAP_FAILED)
+ die_errno("failed to mmap config file %s", config_file);
+ subcmd_name = (cmd == CMD_PTR(MISMA))? NULL : lls_command_name(cmd);
+ ret = lls_convert_config(map, sz, subcmd_name, &cf_argv,
+ &errctx);
+ munmap(map, sz); /* the mapping is not used after the conversion */
+ if (ret < 0) {
+ ERROR_LOG("failed to convert config file %s\n", config_file);
+ die_lopsub(ret, &errctx);
+ }
+ cf_argc = ret;
+ ret = lls_parse(cf_argc, cf_argv, cmd, &cf_lpr, &errctx);
+ lls_free_argv(cf_argv);
+ if (ret < 0)
+ die_lopsub(ret, &errctx);
+ /* command line options override config file options */
+ ret = lls_merge(*lprp, cf_lpr, cmd, &merged_lpr, &errctx);
+ if (ret < 0)
+ die_lopsub(ret, &errctx);
+ lls_free_parse_result(cf_lpr, cmd);
+ lls_free_parse_result(*lprp, cmd);
+ *lprp = merged_lpr; /* hand the merged result back to the caller */
+success:
+ if (fd >= 0)
+ close(fd);
+}
+
+/*
+ * Query the data and metadata utilization of all known thin pools with a
+ * single lvs command and store the percentages, rounded to the nearest
+ * integer, in the utilization field of each pool. Dies if lvs fails or if
+ * its output cannot be parsed.
+ */
+static void get_utilization(void)
+{
+	char *select_string = NULL, *buf, *line;
+	unsigned n;
+	char *argv[] = {
+		"lvs",
+		"--select", NULL,
+		"--noheading", "--unquoted",
+		"-o", "vgname,lvname,data_percent,metadata_percent",
+		NULL
+	};
+	struct line_iter liter;
+
+	/* one "(vg_name = ... && lv_name = ...)" term per pool */
+	for (n = 0; n < num_pools; n++) {
+		const struct thin_pool *pool = thin_pool + n;
+		char *tmp = msg("%s%s (vg_name = %s && lv_name = %s)",
+			(n == 0)? "" : select_string, (n == 0)? "" : "||",
+			vgname(pool->vgid), pool->name);
+		free(select_string);
+		select_string = tmp;
+	}
+	argv[2] = select_string;
+	if (!xexec(argv, &buf))
+		die("lvs failure");
+	free(select_string);
+	line_iter_init(&liter, buf);
+	while ((line = line_iter_get(&liter))) {
+		struct percentage_pair *u;
+		struct thin_pool *pool;
+		unsigned poolid;
+		float data, meta;
+		/*
+		 * A %s conversion stores at most strlen(line) characters plus
+		 * the terminating NUL byte, so reserve one byte more than the
+		 * line length. This also covers lines which consist of a
+		 * single token.
+		 */
+		size_t size = strlen(line) + 1;
+		char *vg = xmalloc(size), *lv = xmalloc(size);
+
+		if (sscanf(line, "%s %s %f %f", vg, lv, &data, &meta) != 4)
+			die("cannot parse lvs line: %s", line);
+		poolid = get_poolid(lv, vg);
+		free(vg);
+		free(lv);
+		assert(poolid != ~0U);
+		pool = thin_pool + poolid;
+		u = &pool->utilization;
+		u->data = data + 0.5; /* round to nearest */
+		u->meta = meta + 0.5;
+		INFO_LOG("pool %s/%s utilization: %u/%u\n",
+			vgname(pool->vgid), pool->name, u->data, u->meta);
+	}
+	free(buf);
+}
+
+/*
+ * Check whether the data or the metadata utilization of a pool exceeds the
+ * threshold which applies to it. The thresholds are taken from the global,
+ * the vg or the pool configuration, depending on the threshold scope of the
+ * pool. A full pool is reported in the log.
+ */
+static bool pool_is_full(const struct thin_pool *pool)
+{
+	struct percentage_pair limit;
+	const struct percentage_pair used = pool->utilization;
+
+	switch (pool->threshold_scope) {
+	case LS_GLOBAL:
+		limit = global_config.thresholds;
+		break;
+	case LS_VG:
+		limit = volume_group[pool->vgid].config.thresholds;
+		break;
+	default:
+		limit = pool->config.thresholds;
+	}
+	if (used.data <= limit.data && used.meta <= limit.meta)
+		return false;
+	NOTICE_LOG("pool %s/%s utilization: %u/%u, threshold: %u/%u\n",
+		vgname(pool->vgid), pool->name,
+		used.data, used.meta, limit.data, limit.meta);
+	WARNING_LOG("pool %s/%s exceeds utilization thresholds\n",
+		vgname(pool->vgid), pool->name);
+	return true;
+}
+
+/*
+ * Refresh the utilization of all pools and, for each pool which exceeds its
+ * thresholds, remove one sleazy snapshot of every origin in that pool.
+ * Repeat until no pool is full any more or nothing could be removed.
+ */
+static void check_utilization(void)
+{
+	for (;;) {
+		bool any_full = false, any_removed = false;
+		unsigned poolid, oid;
+
+		get_utilization();
+		for (poolid = 0; poolid < num_pools; poolid++) {
+			if (!pool_is_full(thin_pool + poolid))
+				continue;
+			any_full = true;
+			FOR_EACH_ORIGIN(oid) {
+				struct origin *o = origin + oid;
+
+				if (o->poolid != poolid)
+					continue;
+				any_removed |= remove_sleazy_snapshot(o, false);
+			}
+		}
+		if (!any_full)
+			return;
+		if (!any_removed)
+			break;
+	}
+	INFO_LOG("full pool found, but nothing to remove\n");
+}
+
+/*
+ * Create the next thin snapshot of an origin by running lvcreate. The
+ * snapshot is named misma-<origin>.<seq> where seq is one more than the last
+ * sequence number of the origin. In dry-run mode only a message is printed.
+ * Dies if lvcreate fails, so the function returns true whenever it returns.
+ */
+static bool create_snapshot(struct origin *o, bool dry_run)
+{
+	const unsigned next_seq = o->last_seq + 1;
+	char *snap_name = msg("misma-%s.%u", o->name, next_seq);
+	char *origin_path = msg("%s/%s", vgname(o->vgid), o->name);
+	char *argv[] = {
+		"lvcreate", "--type", "thin",
+		"--quiet", "--quiet",
+		"-s",
+		"--autobackup", "n",
+		"-n", snap_name,
+		origin_path,
+		NULL
+	};
+
+	if (dry_run)
+		printf("dry-run: would create snapshot #%u of origin %s\n",
+			next_seq, origin_path);
+	else {
+		NOTICE_LOG("creating snapshot %s/%s\n", vgname(o->vgid),
+			snap_name);
+		if (!xexec(argv, NULL))
+			die("could not create snapshot");
+	}
+	free(snap_name);
+	free(origin_path);
+	return true;
+}
+
+static void signal_handler(int signo) /* installed for SIGINT, SIGTERM and SIGHUP by com_run() */
+{
+ die("caught signal %d, terminating", signo); /* NOTE(review): die() formats, logs and may run the exit hook, which is not async-signal-safe -- confirm this is acceptable */
+}
+
+#ifndef FITRIM
+struct fstrim_range {uint64_t start; uint64_t len; uint64_t minlen;};
+#define FITRIM _IOWR('X', 121, struct fstrim_range)
+#endif
+/*
+ * Discard the unused blocks of the filesystem which lives on an origin.
+ *
+ * The device number of /dev/<vg>/<origin> is looked up in
+ * /proc/self/mountinfo. The mount point of the first matching line is
+ * opened and the FITRIM ioctl is issued for the full range. In dry-run mode
+ * the mount point is printed instead. Returns false, after logging a
+ * warning, if any of these steps fails.
+ */
+static bool trim_filesystem(struct origin *o, bool dry_run)
+{
+	struct stat sb;
+	char *dev;
+	unsigned majo, mino;
+	int fd;
+	char *buf;
+	struct line_iter liter;
+	char *line, *mp = NULL;
+	struct fstrim_range range = {.len = ULLONG_MAX};
+
+	dev = msg("/dev/%s/%s", vgname(o->vgid), o->name);
+	if (stat(dev, &sb) < 0) {
+		WARNING_LOG("stat(%s): %m\n", dev);
+		free(dev);
+		return false;
+	}
+	if ((sb.st_mode & S_IFMT) != S_IFBLK) {
+		WARNING_LOG("not a block device: %s\n", dev);
+		free(dev);
+		return false;
+	}
+	free(dev);
+	majo = major(sb.st_rdev);
+	mino = minor(sb.st_rdev);
+	fd = open("/proc/self/mountinfo", O_RDONLY);
+	if (fd < 0) {
+		WARNING_LOG("open(/proc/self/mountinfo): %m\n");
+		return false;
+	}
+	if (!fd2buf(fd, &buf)) {
+		WARNING_LOG("fd2buf error\n");
+		close(fd);
+		return false;
+	}
+	close(fd);
+	line_iter_init(&liter, buf);
+	/* 13 15 0:5 / /proc */
+	while ((line = line_iter_get(&liter))) {
+		unsigned id, parent, mmajo, mmino;
+		/* one extra byte for the NUL which %s appends */
+		size_t size = strlen(line) + 1;
+		char *mountroot = xmalloc(size), *target = xmalloc(size);
+
+		if (sscanf(line, "%u %u %u:%u %s %s", &id, &parent, &mmajo,
+				&mmino, mountroot, target) != 6) {
+			WARNING_LOG("parse mountinfo line: %s\n", line);
+			free(mountroot);
+			free(target);
+			free(buf); /* do not leak the mountinfo contents */
+			return false;
+		}
+		free(mountroot);
+		if (mmajo == majo && mmino == mino) {
+			mp = target; /* freed below, once we are done with it */
+			break;
+		}
+		free(target);
+	}
+	free(buf);
+	if (!mp) {
+		WARNING_LOG("unable to find mountpoint of origin\n");
+		return false;
+	}
+	if (dry_run) {
+		printf("%s\n", mp);
+		free(mp);
+		return true;
+	}
+	fd = open(mp, O_RDONLY);
+	if (fd < 0) {
+		WARNING_LOG("open(%s): %m\n", mp);
+		free(mp);
+		return false;
+	}
+	if (ioctl(fd, FITRIM, &range)) {
+		WARNING_LOG("ioctl(FITRIM, %s): %m\n", mp);
+		close(fd);
+		free(mp);
+		return false;
+	}
+	close(fd);
+	NOTICE_LOG("trimmed %s\n", mp);
+	free(mp);
+	return true;
+}
+
+static void set_threshold(const struct threshold_arg *ta)
+{ /*
+ * Store the thresholds of a parsed threshold argument in the configuration
+ * of the scope named by its lvmspec (global, vg or pool). For non-global
+ * scopes, each matching pool whose current threshold scope compares lower
+ * is switched to the new scope. Dies if the vg or the pool is unknown.
+ */
+ enum lvm_scope scope = ta->lvmspec.scope;
+ unsigned poolid = 0, vgid;
+
+ if (scope == LS_GLOBAL) {
+ global_config.thresholds = ta->threshold;
+ return;
+ }
+ vgid = get_vgid(ta->lvmspec.vg);
+ if (vgid == ~0U)
+ die("invalid vg in lvmspec: %s", ta->lvmspec.vg);
+ if (scope == LS_VG) {
+ volume_group[vgid].config.thresholds = ta->threshold;
+ } else {
+ assert(scope == LS_POOL); /* thresholds cannot be set per origin */
+ poolid = get_poolid(ta->lvmspec.pool, vgname(vgid));
+ if (poolid == ~0U)
+ die("invalid pool in lvmspec: %s", ta->lvmspec.pool);
+ thin_pool[poolid].config.thresholds = ta->threshold;
+ }
+ /*
+ * Narrow the scope of all matching pools for which it is currently
+ * set to a wider scope.
+ */
+ for (unsigned n = 0; n < num_pools; n++) {
+ struct thin_pool *p = thin_pool + n;
+ if (p->threshold_scope >= scope)
+ continue; /* already set to more narrow scope */
+ if (vgid != p->vgid)
+ continue;
+ if (scope == LS_POOL && poolid != n)
+ continue;
+ NOTICE_LOG("threshold for pool %s/%s: %u/%u\n",
+ vgname(vgid), p->name, ta->threshold.data,
+ ta->threshold.meta);
+ p->threshold_scope = scope; /* pool_is_full() now reads the config of this scope */
+ }
+}
+
+/*
+ * Print an event at debug log level. Events without origin are shown as
+ * utilization events. Also used as the callback of heap_dump().
+ */
+static void log_event(const void *d)
+{
+	const struct event *ev = d;
+	const struct origin *o = ev->origin;
+
+	if (!o) {
+		DEBUG_LOG("(utilization): %" PRIu64 "\n", ev->epoch);
+		return;
+	}
+	DEBUG_LOG("(%s,%u): %" PRIu64 "\n", o->name, ev->type, ev->epoch);
+}
+
+static unsigned check_run_options(void)
+{ /*
+ * Apply the options of the run subcommand (thresholds, check interval,
+ * trim/create intervals, max age) to the configuration, then size the
+ * snapshot array of each origin. Returns the number of events com_run()
+ * has to allocate: one check event, one create event per origin, and one
+ * trim event per origin with non-zero trim interval.
+ */
+ struct time_arg ta;
+ const char *arg;
+ unsigned n, num_events = 0; /* counts the trim events only */
+
+ for (n = 0; n < OPT_GIVEN(RUN, THRESHOLD); n++) {
+ struct threshold_arg tha;
+ arg = OPT_STRING_VAL_N(n, RUN, THRESHOLD);
+ parse_threshold_arg(arg,"--threshold", &tha);
+ set_threshold(&tha);
+ free_lvmspec(&tha.lvmspec);
+ }
+ if (OPT_GIVEN(RUN, CHECK_INTERVAL)) {
+ arg = OPT_STRING_VAL(RUN, CHECK_INTERVAL);
+ check_seconds = parse_timespec(arg, "check-interval");
+ check_range(check_seconds, 10, 86400, "check-interval");
+ }
+ for (n = 0; n < OPT_GIVEN(RUN, TRIM_INTERVAL); n++) {
+ arg = OPT_STRING_VAL_N(n, RUN, TRIM_INTERVAL);
+ parse_time_arg(arg, "--trim-interval", &ta);
+ if (ta.seconds > 0) /* zero is valid and turns trimming off */
+ check_range(ta.seconds, 60, ~0U, "trim-interval");
+ set_interval(IT_TRIM, &ta);
+ free_lvmspec(&ta.lvmspec);
+ }
+ for (n = 0; n < OPT_GIVEN(RUN, CREATE_INTERVAL); n++) {
+ arg = OPT_STRING_VAL_N(n, RUN, CREATE_INTERVAL);
+ parse_time_arg(arg, "--create-interval", &ta);
+ check_range(ta.seconds, 60, 86400 * 365, "create-interval");
+ set_interval(IT_CREATE, &ta);
+ free_lvmspec(&ta.lvmspec);
+ }
+ for (n = 0; n < OPT_GIVEN(RUN, MAX_AGE); n++) {
+ arg = OPT_STRING_VAL_N(n, RUN, MAX_AGE);
+ parse_time_arg(arg, "--max-age", &ta);
+ check_range(ta.seconds, 86400, 86400 * 20 * 365, "max-age");
+ set_interval(IT_MAX_AGE, &ta);
+ free_lvmspec(&ta.lvmspec);
+ }
+ FOR_EACH_ORIGIN(n) {
+ struct origin *o = origin + n;
+ uint32_t ma, cr, max_slots; /* max age, create interval */
+
+ INFO_LOG("found %u snapshots of origin %s/%s\n",
+ o->num_slots, vgname(o->vgid), o->name);
+ /* set number of slots */
+ ma = interval_length(IT_MAX_AGE, o);
+ cr = interval_length(IT_CREATE, o);
+ if (ma / 3 < cr)
+ die("%s/%s: max-age/create ratio too small",
+ vgname(o->vgid), o->name);
+ max_slots = 1 + ceil(log2((double)ma / cr + 1)); /* grows with the logarithm of the ratio */
+ assert(max_slots > 2);
+ assert(max_slots < 30);
+ if (o->num_slots > max_slots)
+ die("%s/%s: too many snapshots", vgname(o->vgid),
+ o->name);
+ if (o->num_slots < max_slots) { /* append zeroed, i.e. unused, slots */
+ unsigned diff = max_slots - o->num_slots;
+ o->snapshot = xrealloc(o->snapshot, max_slots
+ * sizeof(struct snapshot));
+ memset(o->snapshot + o->num_slots, 0,
+ diff * sizeof(struct snapshot));
+ o->num_slots = max_slots;
+ }
+ INFO_LOG("%s/%s: using %u slots\n", vgname(o->vgid), o->name,
+ o->num_slots);
+ if (interval_length(IT_TRIM, o) > 0)
+ num_events++;
+ }
+ return num_events + 1 + num_origins;
+}
+
+/*
+ * Handle a create event for an origin.
+ *
+ * Nothing is created while the pool of the origin exceeds its utilization
+ * thresholds. Otherwise the slot for the new sequence number is determined.
+ * If that slot is in use, its snapshot is removed first. Then the new
+ * snapshot is created and recorded in the slot. Dies if the slot cannot be
+ * freed. The message passed to die() has no trailing newline because die()
+ * appends one for the log and hands the text to the exit hook.
+ */
+static void dispatch_create_event(struct origin *o)
+{
+	unsigned seq, sl;
+	const struct thin_pool *pool;
+	uint64_t now;
+
+	pool = thin_pool + o->poolid;
+	if (pool_is_full(pool)) {
+		WARNING_LOG("%s/%s: creation suspended\n", vgname(o->vgid),
+			o->name);
+		return;
+	}
+	seq = o->last_seq + 1;
+	sl = get_slot(seq, o);
+	if (slot_is_used(sl, o) && !remove_snapshot(sl, o, false))
+		die("%s/%s: unable to free slot", vgname(o->vgid), o->name);
+	now = time(NULL);
+	create_snapshot(o, false);
+	o->snapshot[sl].seq = seq;
+	o->snapshot[sl].epoch = now;
+	o->last_seq = seq;
+	o->last_event[ET_CREATE] = now;
+}
+
+/*
+ * Open /dev/null and store the resulting file descriptor number in the
+ * LVM_ERR_FD environment variable. Returns the file descriptor.
+ *
+ * We leak the fd but that's OK as long as we're only called once.
+ */
+static int silence_lvm(void)
+{
+	char *fd_str;
+	int null_fd = open("/dev/null", O_RDWR);
+
+	if (null_fd < 0)
+		die_errno("open(/dev/null)");
+	fd_str = msg("%d", null_fd);
+	setenv("LVM_ERR_FD", fd_str, true /* overwrite */);
+	free(fd_str);
+	return null_fd;
+}
+
+__attribute__ ((noreturn))
+static bool com_run(void)
+{ /*
+ * The run subcommand. Builds one utilization check event, one create event
+ * per origin and one trim event per origin with non-zero trim interval,
+ * puts them on a heap ordered by due time, optionally daemonizes, installs
+ * the signal handlers, and then dispatches events forever. The function
+ * never returns; the process ends via die() or a signal.
+ */
+ int fd = -1;
+ unsigned n, num_events;
+ struct event **ep;
+ struct event **event; /* At most 2 * num_origins + 1 */
+ struct heap *event_heap;
+ uint64_t now = time(NULL);
+
+ num_events = check_run_options();
+ event = xmalloc(num_events * sizeof(struct event *));
+ ep = event;
+ (*ep) = xmalloc(sizeof(struct event));
+ (*ep)->type = ET_CHECK;
+ (*ep)->origin = NULL;
+ (*ep)->epoch = 0; /* due immediately */
+ log_event(*ep);
+ ep++;
+ FOR_EACH_ORIGIN(n) {
+ struct origin *o = origin + n;
+ (*ep) = xmalloc(sizeof(struct event));
+ (*ep)->type = ET_CREATE;
+ (*ep)->origin = o;
+ (*ep)->epoch = o->last_event[ET_CREATE]
+ + interval_length(IT_CREATE, o); /* one interval after the newest existing snapshot */
+ log_event(*ep);
+ ep++;
+ if (interval_length(IT_TRIM, o) == 0) /* trimming is disabled for this origin */
+ continue;
+ (*ep) = xmalloc(sizeof(struct event));
+ (*ep)->type = ET_TRIM;
+ (*ep)->origin = o;
+ (*ep)->epoch = now + interval_length(IT_TRIM, o);
+ log_event(*ep);
+ ep++;
+ }
+ event_heap = heap_init(&event, num_events, event_compare);
+ if (get_misma_pid(config_file) > 0)
+ die("already running");
+ if (OPT_GIVEN(RUN, DAEMON))
+ fd = daemonize(OPT_STRING_VAL(RUN, LOGFILE));
+ if (!misma_lock(config_file))
+ die("already running");
+ if (signal(SIGINT, &signal_handler) == SIG_ERR)
+ die_errno("signal handler for SIGINT");
+ if (signal(SIGTERM, &signal_handler) == SIG_ERR)
+ die_errno("signal handler for SIGTERM");
+ if (signal(SIGHUP, &signal_handler) == SIG_ERR)
+ die_errno("signal handler for SIGHUP");
+ if (fd >= 0) { /* NOTE(review): presumably tells the process on the other end of the daemonize() fd that startup succeeded -- confirm */
+ if (write(fd, "\0", 1) < 0)
+ die_errno("write");
+ close(fd);
+ }
+ exit_hook = OPT_STRING_VAL(RUN, EXIT_HOOK); /* from now on die() runs the hook */
+ if (OPT_GIVEN(RUN, SUPPRESS_LVM_WARNINGS))
+ silence_lvm();
+ for (;;) {
+ struct event *e = heap_min(event_heap);
+ struct origin *o;
+
+ now = time(NULL);
+ if (e->epoch > now) { /* next event is not due yet */
+ INFO_LOG("sleeping %" PRIu64 " seconds\n",
+ e->epoch - now);
+ sleep(e->epoch - now);
+ continue;
+ }
+ e = heap_extract_min(event_heap);
+ o = e->origin;
+ switch (e->type) {
+ case ET_CHECK:
+ INFO_LOG("next event: check\n");
+ check_utilization();
+ now = time(NULL); /* re-read the clock after the check */
+ e->epoch = now + check_seconds;
+ break;
+ case ET_TRIM:
+ INFO_LOG("next event: trim %s/%s\n",
+ vgname(o->vgid), o->name);
+ trim_filesystem(o, false /* dry-run */);
+ e->origin->last_event[ET_TRIM] = now;
+ e->epoch = now + interval_length(IT_TRIM, o);
+ break;
+ case ET_CREATE:
+ INFO_LOG("next event: create %s/%s\n", vgname(o->vgid),
+ o->name);
+ dispatch_create_event(o);
+ e->epoch = now + interval_length(IT_CREATE, o);
+ break;
+ default: assert(0);
+ }
+ heap_insert(e, event_heap); /* reschedule the event with its new due time */
+ heap_dump(event_heap, log_event);
+ sleep(3);
+ }
+}
+EXPORT_CMD_HANDLER(run);
+
+/*
+ * Format a time span given in seconds for human consumption.
+ *
+ * The largest unit for which the span exceeds two units is used: years (365
+ * days), months (30 days), weeks, days, hours, minutes or seconds. The
+ * number is right-aligned in a field of width three. The caller must
+ * provide a buffer which is large enough for the number and the unit name;
+ * list_snapshots() passes 32 bytes.
+ */
+static void seconds_to_human(int64_t diff, char *buf)
+{
+	const int64_t minute = 60, hour = 3600, day = 86400;
+	const int64_t week = 7 * day, month = 30 * day, year = 365 * day;
+
+	if (diff > 2 * year)
+		sprintf(buf, "%3" PRId64 " years ", diff / year);
+	else if (diff > 2 * month)
+		sprintf(buf, "%3" PRId64 " months ", diff / month);
+	else if (diff > 2 * week)
+		sprintf(buf, "%3" PRId64 " weeks ", diff / week);
+	else if (diff > 2 * day)
+		sprintf(buf, "%3" PRId64 " days ", diff / day);
+	else if (diff > 2 * hour)
+		sprintf(buf, "%3" PRId64 " hours ", diff / hour);
+	else if (diff > 2 * minute)
+		sprintf(buf, "%3" PRId64 " minutes", diff / minute);
+	else
+		sprintf(buf, "%3" PRId64 " second%s", diff, diff == 1? "" : "s");
+}
+
+/*
+ * Tell whether a parsed lvm specifier selects the given origin.
+ *
+ * A global specifier selects everything. All other scopes require the VG
+ * name to be identical. On top of that, an origin scoped specifier compares
+ * the name of the thin LV, and any remaining scope other than LS_VG compares
+ * the name of the thin pool of the origin.
+ */
+static bool origin_matches_lvmspec(const struct origin *o,
+		const struct lvmspec *spec)
+{
+	bool same_vg;
+
+	if (spec->scope == LS_GLOBAL)
+		return true;
+	same_vg = strcmp(spec->vg, vgname(o->vgid)) == 0;
+	if (!same_vg)
+		return false;
+	switch (spec->scope) {
+	case LS_VG:
+		return true;
+	case LS_ORIGIN:
+		return strcmp(spec->tlv, o->name) == 0;
+	default:
+		return strcmp(spec->pool, thin_pool[o->poolid].name) == 0;
+	}
+}
+
+/*
+ * Call a function for each origin selected by the subcommand arguments.
+ *
+ * func: Called with each matching origin and the dry_run flag. Its return
+ * value is not looked at.
+ * dry_run: Passed verbatim to func.
+ *
+ * Each non-option argument of the subcommand is parsed as an lvm specifier.
+ * An origin matches if at least one specifier matches, or if no specifiers
+ * were given at all. Prints "no matches" if specifiers were given but none
+ * of them selected an origin.
+ *
+ * Returns true if func was called at least once.
+ */
+static bool for_each_matching_origin(bool (*func)(struct origin *, bool),
+		bool dry_run)
+{
+	unsigned k, n, num_args = lls_num_inputs(sublpr);
+	struct lvmspec *spec = NULL; /* STFU gcc-12.3.0 */
+	bool match = false;
+
+	if (num_args > 0)
+		spec = xmalloc(num_args * sizeof(*spec));
+	for (k = 0; k < num_args; k++)
+		parse_lvmspec(lls_input(k, sublpr), "create/rm", spec + k);
+	FOR_EACH_ORIGIN(n) {
+		struct origin *o = origin + n;
+		for (k = 0; k < num_args; k++)
+			if (origin_matches_lvmspec(o, spec + k))
+				break;
+		if (num_args == 0 || k < num_args) {
+			func(o, dry_run);
+			match = true;
+		}
+	}
+	/* Release the name strings which parse_lvmspec() has allocated. */
+	for (k = 0; k < num_args; k++)
+		free_lvmspec(spec + k);
+	free(spec);
+	if (!match && num_args > 0)
+		printf("no matches\n");
+	return match;
+}
+
+/*
+ * Print the snapshots of one origin, one line per snapshot.
+ *
+ * o: The origin whose snapshot slots are visited in reverse order.
+ * l_given: Selects the long format: device path, creation time and age.
+ * Otherwise a heading line with the origin name is printed, followed by the
+ * sequence number and the age of each snapshot.
+ *
+ * Always returns true.
+ */
+static bool list_snapshots(struct origin *o, bool l_given)
+{
+	if (!l_given)
+		printf("%s/%s:\n", vgname(o->vgid), o->name);
+	FOR_EACH_SLOT_REVERSE(sl, o) {
+		struct snapshot *snap = &o->snapshot[sl];
+		char text[32]; /* first the time stamp, then the age */
+		time_t when;
+
+		assert(slot_is_used(sl, o));
+		if (!l_given)
+			printf("%8u ", snap->seq);
+		else {
+			struct tm *broken_down;
+
+			printf("/dev/%s/misma-%s.%u\t", vgname(o->vgid),
+				o->name, snap->seq);
+			when = snap->epoch;
+			broken_down = localtime(&when);
+			strftime(text, sizeof(text), "%F %R", broken_down);
+			printf("%s", text);
+		}
+		when = time(NULL);
+		seconds_to_human(when - snap->epoch, text);
+		printf(" %s\n", text);
+	}
+	return true;
+}
+
+/* Handler of the ls subcommand: list the snapshots of all matching origins. */
+static bool com_ls(void)
+{
+	bool long_format = OPT_GIVEN(LS, LONG);
+
+	return for_each_matching_origin(list_snapshots, long_format);
+}
+EXPORT_CMD_HANDLER(ls);
+
+/*
+ * Handler of the create subcommand. The lock is taken first, so the command
+ * dies if misma_lock() reports failure. After that, create_snapshot() is
+ * called for each matching origin.
+ */
+static bool com_create(void)
+{
+	bool dry_run;
+
+	if (!misma_lock(config_file))
+		die("already running");
+	dry_run = OPT_GIVEN(CREATE, DRY_RUN);
+	return for_each_matching_origin(create_snapshot, dry_run);
+}
+EXPORT_CMD_HANDLER(create);
+
+/*
+ * Handler of the rm subcommand. Takes the lock like the create subcommand,
+ * then calls remove_sleazy_snapshot() for each matching origin.
+ */
+static bool com_rm(void)
+{
+	bool dry_run;
+
+	if (!misma_lock(config_file))
+		die("already running");
+	dry_run = OPT_GIVEN(RM, DRY_RUN);
+	return for_each_matching_origin(remove_sleazy_snapshot, dry_run);
+}
+EXPORT_CMD_HANDLER(rm);
+
+/*
+ * Handler of the kill subcommand: signal the process whose PID is reported
+ * by get_misma_pid().
+ *
+ * Without --wait the function returns true as soon as the signal was sent.
+ * With --wait it polls the PID with signal zero, sleeping 32ms before the
+ * first probe and doubling the delay each time for as long as the delay
+ * stays below five seconds. It returns true if a probe fails with ESRCH, and
+ * false if the sleep is interrupted, a probe fails for another reason, or
+ * the process still exists after the last probe.
+ */
+static bool com_kill(void)
+{
+	unsigned sig = OPT_UINT32_VAL(KILL, SIGNAL);
+	pid_t pid = get_misma_pid(config_file);
+	unsigned ms;
+
+	if (pid == 0)
+		die("no misma run process to send signal to");
+	NOTICE_LOG("sending signal %u to pid %d\n", sig, pid);
+	if (kill(pid, sig) < 0)
+		die_errno("kill");
+	if (!OPT_GIVEN(KILL, WAIT))
+		return true;
+	for (ms = 32; ms < 5000; ms *= 2) {
+		struct timespec delay = {
+			.tv_sec = ms / 1000,
+			.tv_nsec = (ms % 1000) * 1000 * 1000
+		};
+
+		if (nanosleep(&delay, NULL) < 0)
+			return false;
+		if (kill(pid, 0) < 0)
+			return errno == ESRCH;
+	}
+	return false;
+}
+EXPORT_CMD_HANDLER(kill);
+
+/*
+ * Table of subcommand names, terminated by NULL. LSG_MISMA_CMD() is defined
+ * as a stringifying macro only while LSG_MISMA_SUBCOMMANDS is expanded.
+ * NOTE(review): the list macro presumably comes from misma.lsg.h and expands
+ * to one "LSG_MISMA_CMD(name)," item per subcommand -- confirm there.
+ */
+#define LSG_MISMA_CMD(_name) #_name
+static const char * const subcommand_names[] = {LSG_MISMA_SUBCOMMANDS NULL};
+#undef LSG_MISMA_CMD
+
+/*
+ * Print the list of subcommands to stdout.
+ *
+ * verbose: If set, one line per subcommand is printed which contains the
+ * name and the purpose text. Otherwise the names are printed as a comma
+ * separated list which starts with a tab and is broken into a new tab
+ * indented line whenever the running column count exceeds 70.
+ */
+static void show_subcommand_summary(bool verbose)
+{
+	unsigned column = 8; /* width accounted for the leading tab */
+	int i;
+
+	printf("Available subcommands:\n");
+	if (verbose) {
+		const struct lls_command *cmd;
+
+		i = 1;
+		while ((cmd = lls_cmd(i, misma_suite))) {
+			const char *purpose = lls_purpose(cmd);
+			const char *name = lls_command_name(cmd);
+
+			printf("%-12s%s\n", name, purpose);
+			i++;
+		}
+		return;
+	}
+	printf("\t");
+	for (i = 0; i < LSG_NUM_MISMA_SUBCOMMANDS; i++) {
+		if (i != 0)
+			column += printf(", ");
+		if (column > 70) {
+			printf("\n\t");
+			column = 8;
+		}
+		column += printf("%s", subcommand_names[i]);
+	}
+	printf("\n");
+}
+
+/*
+ * Handler of the trim subcommand. Takes the lock like the create and rm
+ * subcommands, then calls trim_filesystem() for each matching origin.
+ */
+static bool com_trim(void)
+{
+	bool dry_run;
+
+	if (!misma_lock(config_file))
+		die("already running");
+	dry_run = OPT_GIVEN(TRIM, DRY_RUN);
+	return for_each_matching_origin(trim_filesystem, dry_run);
+}
+EXPORT_CMD_HANDLER(trim);
+
+/*
+ * Handler of the help subcommand.
+ *
+ * At most one argument is accepted. Without an argument the subcommand
+ * summary is shown, where --long selects the verbose form. With an argument,
+ * the short or long (--long) help text of the named subcommand is printed.
+ * Argument count and lookup errors are reported through die_lopsub().
+ *
+ * Always returns true.
+ */
+static bool com_help(void)
+{
+	char *errctx, *text;
+	const struct lls_command *cmd;
+	int ret = lls_check_arg_count(sublpr, 0, 1, &errctx);
+
+	if (ret < 0)
+		die_lopsub(ret, &errctx);
+	if (lls_num_inputs(sublpr) == 0) {
+		show_subcommand_summary(OPT_GIVEN(HELP, LONG));
+		return true;
+	}
+	ret = lls_lookup_subcmd(lls_input(0, sublpr), misma_suite, &errctx);
+	if (ret < 0)
+		die_lopsub(ret, &errctx);
+	cmd = lls_cmd(ret, misma_suite);
+	text = OPT_GIVEN(HELP, LONG)? lls_long_help(cmd) : lls_short_help(cmd);
+	printf("%s\n", text);
+	free(text);
+	return true;
+}
+EXPORT_CMD_HANDLER(help);
+
+/*
+ * Handler of the configtest subcommand. The function itself performs no
+ * checks, it only reports success.
+ */
+static bool com_configtest(void)
+{
+	puts("Syntax Ok");
+	return true;
+}
+EXPORT_CMD_HANDLER(configtest);
+
+/*
+ * Handler of the utilization subcommand: call get_utilization(), then print
+ * one "vg/pool: data%/meta%" line for each known thin pool.
+ *
+ * Always returns true.
+ */
+static bool com_utilization(void)
+{
+	unsigned n = 0;
+
+	get_utilization();
+	while (n < num_pools) {
+		const struct thin_pool *pool = &thin_pool[n++];
+
+		printf("%s/%s: %u%%/%u%%\n",
+			vgname(pool->vgid), pool->name, pool->utilization.data,
+			pool->utilization.meta);
+	}
+	return true;
+}
+EXPORT_CMD_HANDLER(utilization);
+
+const char *GET_VERSION(void);
+/*
+ * Act on the --version, --detailed-help and --help options of the
+ * supercommand. If one of them was given, the corresponding text is printed
+ * and the process exits successfully. --version takes precedence over both
+ * help options and --detailed-help over --help. The function returns only
+ * if none of the three options was given.
+ */
+static void handle_version_and_help(void)
+{
+	bool detailed;
+	char *text;
+
+	if (OPT_GIVEN(MISMA, VERSION)) {
+		printf(PACKAGE " %s\n"
+			"Copyright (C) " COPYRIGHT_YEAR " " AUTHOR ".\n"
+			"License: " LICENSE ": <" LICENSE_URL ">.\n"
+			"This is free software: you are free to change and redistribute it.\n"
+			"There is NO WARRANTY, to the extent permitted by law.\n"
+			"\n"
+			"Web page: " URL "\n"
+			"Clone URL: " CLONE_URL "\n"
+			"Gitweb: " GITWEB_URL "\n"
+			"Author's Home Page: " HOME_URL "\n"
+			"Send feedback to: " AUTHOR " <" EMAIL ">\n"
+			,
+			GET_VERSION()
+		);
+		exit(EXIT_SUCCESS);
+	}
+	detailed = OPT_GIVEN(MISMA, DETAILED_HELP);
+	if (!detailed && !OPT_GIVEN(MISMA, HELP))
+		return;
+	if (detailed)
+		text = lls_long_help(CMD_PTR(MISMA));
+	else
+		text = lls_short_help(CMD_PTR(MISMA));
+	printf("%s\n", text);
+	free(text);
+	exit(EXIT_SUCCESS);
+}
+
+/*
+ * Program entry point.
+ *
+ * Parses the global options, handles --version and the help options, and
+ * prints the verbose subcommand summary if no subcommand was given. Else the
+ * first non-option argument is looked up as the subcommand, the remaining
+ * arguments are parsed as its options, the origins are initialized (skipped
+ * for the help subcommand), and the handler stored in the user data of the
+ * subcommand is run. The exit status reflects the handler's return value.
+ */
+int main(int argc, char **argv)
+{
+	const struct misma_user_data *ud;
+	char **sub_argv;
+	unsigned num_inputs;
+	char *errctx;
+	int ret;
+
+	valid_fd012();
+	parse_options(argc, argv, CMD_PTR(MISMA), &lpr);
+	loglevel_arg_val = OPT_UINT32_VAL(MISMA, LOGLEVEL);
+	handle_version_and_help();
+	num_inputs = lls_num_inputs(lpr);
+	if (num_inputs == 0) {
+		show_subcommand_summary(true /* verbose */);
+		exit(EXIT_SUCCESS);
+	}
+	/* The non-option arguments are the last num_inputs words of argv. */
+	sub_argv = argv + argc - num_inputs;
+	ret = lls_lookup_subcmd(sub_argv[0], misma_suite, &errctx);
+	if (ret < 0)
+		die_lopsub(ret, &errctx);
+	subcmd = lls_cmd(ret, misma_suite);
+	parse_options(num_inputs, sub_argv, subcmd, &sublpr);
+	if (subcmd != CMD_PTR(HELP))
+		init_origins();
+	ud = lls_user_data(subcmd);
+	if (ud->handler())
+		exit(EXIT_SUCCESS);
+	exit(EXIT_FAILURE);
+}
--- /dev/null
+# SPDX-License-Identifier: GPL-2.0+
+[suite misma]
+ caption = Subcommands
+ mansect = 8
+ manual_title = System Manager's Manual
+[supercommand misma]
+ [description]
+ DESCRIPTION1()
+
+ DESCRIPTION2()
+
+ DESCRIPTION3()
+ [/description]
+ synopsis = [global-options...] [--] [<subcommand> [subcommand-options...]]
+ purpose = SLOGAN()
+
+ [option title-text]
+ summary = General options
+ flag ignored
+ [option help]
+ summary = print help and exit
+ short_opt = h
+ [option detailed-help]
+ summary = print help, including all details, and exit
+ [option version]
+ summary = print version and exit
+ short_opt = V
+ [option loglevel]
+ summary = control amount of logging
+ short_opt = l
+ arg_info = required_arg
+ arg_type = string
+ typestr = severity
+ values = {
+ LSGLL_DEBUG = "debug",
+ LSGLL_INFO = "info",
+ LSGLL_NOTICE = "notice",
+ LSGLL_WARNING = "warning",
+ LSGLL_ERROR = "error",
+ LSGLL_CRIT = "crit",
+ LSGLL_EMERG = "emerg"
+ }
+ default_val = warning
+ [help]
+ Log only messages with severity greater than or equal to the given
+ value. Possible values:
+
+ debug: produces really noisy output.
+ info: still noisy, but won't fill up the disk quickly.
+ notice: indicates normal, but significant event.
+ warning: unexpected events that can be handled.
+ error: unhandled error condition.
+ crit: system might be unreliable.
+ emerg: last message before exit.
+ [/help]
+ [option config-file]
+ short_opt = c
+ summary = use alternative config file (default: ~/.mismarc)
+ typestr = path
+ arg_info = required_arg
+ arg_type = string
+ [help]
+ Options may be given at the command line or in the configuration
+ file. As usual, if an option is given both at the command line and
+ in the configuration file, the command line option takes precedence.
+
+ The config file may contain global options as well as options for
+ any subcommand, but subcommand specific options must be placed in a
+ separate section. See the Examples section of the man page.
+ [/help]
+
+ [option title-text]
+ summary = LVM options
+ flag ignored
+ [option origin]
+ summary = the VG and the thin LV to snapshot
+ typestr = vg/tlv
+ arg_info = required_arg
+ arg_type = string
+ flag multiple
+ [help]
+ The named volume group must exist and it must contain the named thin
+ logical volume. This option may be given multiple times where each
+ instance corresponds to one origin to snapshot.
+ [/help]
+[introduction]
+ Misma supports the subcommands described below. If no subcommand is
+ given, the list of available subcommands is shown and the program
+ terminates successfully without performing any further action.
+[/introduction]
+
+[subcommand run]
+ purpose = create and prune snapshots, discard unused blocks
+ [description]
+ This is the main mode of operation. Snapshots are created and pruned
+ periodically, the thin pool utilization is monitored and filesystem
+ trims are scheduled as configured. The subcommand terminates only on
+ fatal errors or after a terminating signal was received.
+ [/description]
+ [option daemon]
+ short_opt = d
+ summary = run as background daemon
+ [help]
+ If this option is given, the process detaches from the console and
+ continues to run in the background.
+ [/help]
+ [option logfile]
+ short_opt = l
+ summary = where to write log output
+ arg_info = required_arg
+ arg_type = string
+ typestr = path
+ default_val = /dev/null
+ [help]
+ This option is only honored if --daemon is given, in which case
+ log messages go to the given file. Otherwise the option is silently
+ ignored and log output is written to stderr.
+ [/help]
+ [option create-interval]
+ summary = Time span between two subsequent snapshots
+ typestr = [lvmspec:]timespec
+ arg_info = required_arg
+ arg_type = string
+ flag multiple
+ default_val = 6h
+ [help]
+ The lvm specifier determines to which origins this instance of the
+ option applies. If no specifier is given, the option applies to all
+ origins. Otherwise the specifier may be in one of the following forms:
+ <vg>: applies to all origins in VG vg, <vg|pool>: applies to all
+ origins in thin pool <pool> of VG vg, or <vg/tlv>: applies to origin
+ tlv of vg only. If more than one specifier matches a particular origin,
+ the narrowest scoped one applies. The order of precedence is therefore
+ <vg/tlv>, <vg|pool>, <vg>, <global>.
+
+ The time specifier is an unsigned integer which is followed by a time
+ unit, a single character of the set {s,m,h,d,y} for seconds, minutes,
+ hours, days, and years.
+ [/help]
+ [option max-age]
+ summary = age of the oldest snapshot to keep
+ typestr = [lvmspec:]timespec
+ arg_info = required_arg
+ arg_type = string
+ flag multiple
+ default_val = 1y
+ [help]
+ See --create-interval for the format of the lvm and time specifiers.
+ [/help]
+ [option check-interval]
+ summary = the time period between two utilization checks
+ typestr = timespec
+ arg_info = required_arg
+ arg_type = string
+ default_val = 1m
+ [help]
+ The utilization of all thin pools which contain at least one thin
+ logical volume specified as an argument to --origin is checked
+ periodically. See --create-interval for the format of the time
+ specifier.
+ [/help]
+ [option threshold]
+ summary = high watermarks for snapshot removal (1-99)
+ typestr = [lvmspec:]data_threshold,meta_threshold
+ arg_info = required_arg
+ arg_type = string
+ flag multiple
+ default_val = 95,95
+ [help]
+ The threshold part of the argument is a comma-separated pair of
+ percentages between 1 and 99, inclusively. If the percentage of used
+ space in the data/metadata logical volume of the thin pool exceeds
+ the corresponding threshold value, forced snapshot removal kicks in
+ to bring back the utilization below the thresholds.
+
+ The format of the lvm specifier is described in the help text of
+ --create-interval. However, since the utilization is a property
+ of the pool, arguments of the form <vg/tlv> make no sense and are
+ therefore rejected.
+ [/help]
+ [option trim-interval]
+ summary = discard unused blocks periodically
+ typestr = [lvmspec:]timespec
+ arg_info = required_arg
+ arg_type = string
+ flag multiple
+ default_val = 0
+ [help]
+ The argument specifies the duration between two successive trims. The
+ default value of zero deactivates this feature.
+
+ Trimming is performed in the same way as for the trim subcommand.
+ Errors related to trimming are logged but are otherwise ignored.
+
+ See --create-interval for the format of the specifiers.
+ [/help]
+ [option exit-hook]
+ summary = command to be executed before exit
+ typestr = command
+ arg_info = required_arg
+ arg_type = string
+ default_val = true
+ [help]
+ One possible application for this hook is to inform the system manager
+ that no more snapshots are going to be created.
+
+ A (quoted) string which describes the error that caused the termination is
+ appended to the given command and the resulting string is passed as a single
+ argument to /bin/sh -c.
+ [/help]
+ [option suppress-lvm-warnings]
+ summary = quieten lvcreate(8) and lvremove(8)
+ [help]
+ Suppress the warnings printed by lvcreate(8) and lvremove(8).
+ [/help]
+[subcommand create]
+ purpose = create a snapshot of each matching origin
+ non-opts-name = [<lvmspec>]...
+ [description]
+ This creates one snapshot of each origin which matches the given lvm
+ specifier, ignoring creation intervals, maximal age and utilization
+ thresholds. If no specifiers are given, all origins are regarded as
+ matching so that one snapshot of each configured origin is created.
+
+ The subcommand fails if another "run", "create", "rm", or "trim" command
+ is currently running.
+ [/description]
+ [option dry-run]
+ short_opt = n
+ summary = just print which snapshot would be created
+[subcommand rm]
+ purpose = remove one snapshot of each matching origin
+ non-opts-name = [<lvmspec>]...
+ [description]
+ The remarks stated in the description of the "create" subcommand apply
+ for this subcommand as well.
+ [/description]
+ [option dry-run]
+ short_opt = n
+ summary = just print which snapshot would get removed
+[subcommand ls]
+ purpose = print the snapshot list of each origin
+ non-opts-name = [<lvmspec>]...
+ [description]
+ The list is sorted by snapshot creation date.
+ [/description]
+ [option long]
+ short_opt = l
+ summary = use long listing format
+ [help]
+ The default output mode lists only the sequence number and the age
+ of each snapshot as human readable text. This option adds additional
+ output.
+ [/help]
+[subcommand kill]
+ purpose = signal another misma process
+ [description]
+ This sends a signal to the misma "run" process.
+ [/description]
+ [option signal]
+ short_opt = s
+ summary = send the given signal rather than SIGTERM
+ typestr = signal_number
+ arg_info = required_arg
+ arg_type = uint32
+ default_val = 15
+ [help]
+ The standard Unix semantics apply if the specified signal number
+ is zero. That is, no signal is actually sent, and the subcommand
+ exits successfully only if a misma "run" process exists.
+ [/help]
+ [option wait]
+ short_opt = w
+ summary = wait until the signalled process has terminated
+ [help]
+ This option is handy for system shutdown scripts which would like
+ to terminate the misma daemon process.
+
+ Without --wait the misma process which executes the kill subcommand
+ exits right after the kill(2) system call returns. At this point the
+ signalled process might still be alive (even if SIGKILL was sent).
+ If --wait is given, the process waits until the signalled process
+ has terminated or the timeout expires.
+
+ If --wait is not given, the kill subcommand exits successfully if
+ and only if the signal was sent (i.e., if there exists another misma
+ process to receive the signal). With --wait it exits successfully
+ if, additionally, the signalled process has terminated before the
+ timeout expires.
+
+ It only makes sense to use this option for signals which terminate
+ the misma process.
+ [/help]
+[subcommand trim]
+ purpose = discard unused blocks of origin LVs
+ non-opts-name = [<lvmspec>]...
+ [description]
+ Each matching origin LV is expected to contain a mounted and writable
+ filesystem. The subcommand is equivalent to running fstrim(8) on
+ the mountpoints of these filesystems. The full block range of each
+ origin LV is taken into account and the default minimal block size for
+ discards is used. This corresponds to the default values of fstrim(8).
+ [/description]
+ [option dry-run]
+ short_opt = n
+ summary = print the mount points, but do not trim
+ [help]
+ In dry-run mode the mount points are determined as usual, but the
+ command exits without starting any trim operation.
+ [/help]
+[subcommand help]
+ purpose = list available subcommands or print subcommand-specific help
+ non-opts-name = [subcommand]
+ [description]
+ Without any arguments, help prints the list of available
+ subcommands. When called with a subcommand name argument, it prints
+ the help text of the given subcommand.
+ [/description]
+ [option long]
+ short_opt = l
+ summary = show the long help text
+ [help]
+ If the optional argument is supplied, the long help text contains the
+ synopsis, the purpose and the description of the specified subcommand,
+ followed by the option list including summary and help text of each
+ option. Without --long, the short help is shown instead. This omits
+ the description of the subcommand and the option help.
+
+ If no subcommand is supplied but --long is given, the list contains the
+ purpose of each subcommand.
+ [/help]
+[subcommand utilization]
+ purpose = show thin pool utilization
+ [description]
+ This prints the percentage of used blocks in the data and metadata
+ logical volumes of each pool.
+ [/description]
+[subcommand configtest]
+ purpose = run a configuration file syntax test
+ [description]
+ This subcommand checks the command line options and the configuration
+ file for syntactic correctness. It either reports "Syntax Ok" and
+ exits successfully or prints information about the first syntax error
+ detected and terminates with exit code 1.
+ [/description]
+
+[section Notes]
+.SS Naming
+ Snapshots created by misma are named
+ .IR misma-origin.seq ,
+ where
+ .I origin
+ is the name of the thin logical volume (i.e., the second component
+ of the argument to
+ .IR --origin )
+ and
+ .I seq
+ is a sequence number.
+.SS Snapshot Replacement Strategy
+ Assume that the arguments
+ to
+ .I --create-interval
+ and
+ .I --max-age
+ correspond to
+ .I d
+ minutes and
+ .I m
+ days, respectively. These two quantities determine the length
+ .I n
+ of a sequence of snapshots such that
+ .IP \(bu 2
+ the first two snapshots are
+ .I d
+ minutes apart,
+ .IP \(bu 2
+ the difference of the creation times between two consecutive snapshots
+ doubles at each step,
+ .IP \(bu 2
+ the first and the last snapshot are at least
+ .I m
+ days apart.
+ .P
+ At startup,
+ .B misma
+ maps each existing snapshot to a slot in an array
+ of length
+ .IR n .
+ When a new snapshot has to be created and not all slots are mapped
+ yet, the new snapshot is mapped to an unmapped slot. If all slots
+ are mapped, an existing snapshot is removed first and its slot is
+ reused. The slot number of the snapshot to be replaced is computed as
+ .B ffz(seq % (2^n - 1)),
+ where
+ .I seq
+ is the sequence number of the new snapshot, and
+ .B ffz(x)
+ is the first zero in
+ the binary representation of
+ .IR x .
+ By properties of the
+ .B ffz()
+ function, the frequency at which a slot gets reused halves at each
+ step: the snapshot in slot 0 gets reused (roughly) every second time,
+ the snapshot in slot 1 every fourth time, and so on.
+.SS Forced Snapshot Removal
+ In addition to the normal snapshot removal which takes place when a
+ slot gets reused as described above, snapshots are
+ .I force-removed
+ when the utilization of a thin pool exceeds its configured
+ thresholds. One snapshot is removed from each affected origin until
+ the utilization drops below the thresholds. If the utilization still
+ exceeds the thresholds after all snapshots have been removed, snapshot
+ creation is suspended.
+ .P
+ Forced removal
+ reliably prevents data and metadata exhaustion if the pool is
+ not overbooked. That is, if the sum of the (virtual) sizes of the
+ non-snapshot logical volumes is smaller than the pool size.
+.SS Trimming
+ The trim operation instructs a mounted filesystem to identify blocks
+ which are currently not in use and to pass this information to the
+ underlying block device driver. For a configured misma origin, this
+ driver is
+ .BR dm-thin ,
+ which keeps track of the used and unused blocks of each thin pool.
+ The blocks which are freed by the trim operation become available
+ for subsequent snapshots.
+
+ A one-shot trim operation is started by invoking the
+ .B trim
+ subcommand while periodic trims may be configured via the
+ .I --trim-interval
+ option of the
+ .B run
+ subcommand.
+
+ Trimming is implemented by issuing the
+ .I FITRIM
+ ioctl on the mount point, which is identical to how the
+ .BR fstrim (8)
+ command works. The mount point is determined from the major and minor
+ device numbers of the block special of the origin by parsing
+ .IR /proc/self/mountinfo .
+.SS Activating and Mounting Snapshots
+ Since thin provisioned snapshots have the
+ .I activation-skip
+ flag set, one must first
+ .I activate
+ the snapshot logical volume to create the corresponding device node.
+
+ Moreover, the XFS filesystem driver refuses to mount a block device
+ which contains a UUID that is identical to the UUID of an already
+ mounted filesystem. To mount a snapshot of an XFS filesystem, one
+ must therefore tell XFS to skip the UUID check.
+
+ See the examples below for suitable command line options for
+ .BR lvchange (8)
+ and
+ .BR mount (8).
+
+ Since logical volumes which contain a mounted filesystem cannot be
+ removed, a thin pool which is not overbooked may still run out of
+ space when one of its snapshot logical volumes is still mounted. It
+ is therefore good practice to activate and mount snapshots only for
+ as long as necessary.
+[/section]
+
+[section Examples]
+ .IP \(bu 2
+ Create a 1T large thin pool named
+ .I tp
+ in the volume group
+ .IR vg :
+
+ .RS 6
+ .EX
+ .B lvcreate \-\-type thin\-pool \-L 1T \-\-poolmetadatasize 16G \-n tp vg
+ .EE
+ .RE
+ .IP \(bu 2
+ Create the thin logical volume
+ .I tlv
+ of virtual size 100G in the thin pool
+ .IR tp :
+
+ .RS 6
+ .EX
+ .B lvcreate \-\-thin \-n tlv \-\-virtualsize 100G \-\-thinpool vg/tp
+ .EE
+ .RE
+ .IP \(bu 2
+ Run
+ .B misma
+ to create snapshots of the logical volume
+ .IR tlv ,
+ using default values:
+
+ .RS 6
+ .EX
+ .B misma \-\-origin vg/tlv run
+ .EE
+ .RE
+ .IP \(bu 2
+ Same as before, but run
+ .B misma
+ as a background daemon to create a snapshot every hour:
+
+ .RS 6
+ .EX
+ .B misma \-\-origin vg/tlv \-\-create-interval 1h \-\- run \-d
+ .EE
+ .RE
+ .IP \(bu 2
+ List all snapshots created so far:
+
+ .RS 6
+ .EX
+ .B misma \-\-origin vg/tlv \-\- ls \-l
+ .EE
+ .RE
+ .IP \(bu 2
+ Run
+ .B lvs
+ to print similar information:
+
+ .RS 6
+ .EX
+ .B vg=vg; o=tlv
+ .B lvs -o 'lv_path,lv_attr,lv_time,origin' \[rs]
+ .B \~ \-S \[dq]vg_name = $vg && origin = $o\[dq] \[rs]
+ .B \~ \-\-config \[dq]report/time_format='%F %R'\[dq]
+ .EE
+ .RE
+ .IP \(bu 2
+ Activate snapshot number 42:
+
+ .RS 6
+ .EX
+ .B lvchange \-\-ignoreactivationskip \-\-activate y vg/misma-tlv.42
+ .EE
+ .RE
+ .IP \(bu 2
+ Mount an active snapshot which contains an XFS filesystem:
+
+ .RS 6
+ .EX
+ .B mount /dev/vg/misma-tlv.42 \-o nouuid /mnt
+ .EE
+ .RE
+ .IP \(bu 2
+ Terminate the
+ .B misma
+ daemon process:
+
+ .RS 6
+ .EX
+ .B misma \-\-origin vg/tlv kill
+ .EE
+ .RE
+ .IP \(bu 2
+ A simple config file:
+
+ .RS 6
+ .EX
+ # global options
+ origin vg/tlv
+ loglevel info
+ # an option for the "run" subcommand
+ [run]
+ logfile /var/log/misma.log
+ .EE
+ .RE
+
+[/section]
+
+[section copyright]
+ Written by AUTHOR()
+ .br
+ Copyright (C) COPYRIGHT_YEAR() AUTHOR()
+ .br
+ License: LICENSE()
+ .br
+ This is free software: you are free to change and redistribute it.
+ .br
+ There is NO WARRANTY, to the extent permitted by law.
+ .P
+ Web page:
+ .UR URL()
+ .UE
+ .br
+ Git clone `URL':
+ .UR CLONE_URL()
+ .UE
+ .br
+ Gitweb:
+ .UR GITWEB_URL()
+ .UE
+ .br
+ Author's home page:
+ .UR HOME_URL()
+ .UE
+ .br
+ Report bugs to
+ .MT EMAIL()
+ AUTHOR()
+ .ME
+[/section]
+[section see also]
+ .BR lvm (8),
+ .BR fstrim (8),
+ .BR lvmthin (7),
+ .BR dss (1)
+[/section]
--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0+ */
+#include "misma.h"
+
+#include <sys/ipc.h>
+#include <sys/sem.h>
+#include <fcntl.h>
+#include <ctype.h>
+
+/*
+ * Change the size of a memory block, terminating the process on failure.
+ *
+ * p: Pointer to resize, or NULL to allocate a new block.
+ * size: New size in bytes, must not be zero.
+ *
+ * Returns the address of the resized block, never NULL.
+ */
+void *xrealloc(void *p, size_t size)
+{
+	void *q;
+
+	assert(size > 0);
+	/*
+	 * The call must not be part of an assert() expression: if NDEBUG is
+	 * defined, the macro discards its argument and nothing would get
+	 * allocated.
+	 */
+	q = realloc(p, size);
+	if (!q) {
+		assert(q); /* prints the usual diagnostic unless NDEBUG is set */
+		abort();
+	}
+	return q;
+}
+
+/*
+ * Allocate size bytes of uninitialized memory by means of xrealloc(), which
+ * also defines what happens on allocation failure.
+ */
+void *xmalloc(size_t size)
+{
+	void *fresh = xrealloc(NULL, size);
+
+	return fresh;
+}
+
+/* Like xmalloc(), but the returned memory is filled with zero bytes. */
+void *xzmalloc(size_t size)
+{
+	/* memset() returns its first argument */
+	return memset(xrealloc(NULL, size), 0, size);
+}
+
+/*
+ * Return a heap allocated copy of a string. A NULL argument is treated like
+ * the empty string. Allocation failure trips an assertion.
+ */
+void *xstrdup(const char *s)
+{
+	char *copy;
+
+	if (!s)
+		s = "";
+	copy = strdup(s);
+	assert(copy);
+	return copy;
+}
+
+/*
+ * Print a formatted message into a freshly allocated buffer.
+ *
+ * fmt: A printf-style format string, followed by the matching arguments.
+ *
+ * The first attempt uses a buffer of 100 bytes. If the output does not fit,
+ * the buffer is resized to exactly the needed size and the conversion is
+ * repeated. Returns the buffer, which the caller must free.
+ */
+char *msg(const char *fmt, ...)
+{
+	size_t size = 100;
+	char *m = xmalloc(size);
+
+	for (;;) {
+		int n;
+		va_list ap;
+
+		/* Try to print in the allocated space. */
+		va_start(ap, fmt);
+		n = vsnprintf(m, size, fmt, ap);
+		va_end(ap);
+		/*
+		 * A negative return value indicates an output error. It must
+		 * be caught before n is compared against the unsigned size,
+		 * where it would be converted to a huge positive number.
+		 */
+		if (n < 0) {
+			assert(n >= 0);
+			abort();
+		}
+		/* If that worked, return the string. */
+		if ((size_t)n < size)
+			return m;
+		/* Else try again with more space. */
+		size = (size_t)n + 1; /* precisely what is needed */
+		m = xrealloc(m, size);
+	}
+}
+
+/*
+ * Read from a file descriptor until end of file.
+ *
+ * fd: The descriptor to read from.
+ * buf: Result pointer. On success it receives the address of an allocated,
+ * NUL-terminated buffer which contains everything that was read. The caller
+ * must free this buffer. On failure it is set to NULL.
+ *
+ * Reads which fail with EAGAIN or EINTR are retried. The buffer starts at
+ * 100 bytes and is doubled whenever it is full.
+ *
+ * Returns true on success, false on read errors.
+ */
+bool fd2buf(int fd, char **buf)
+{
+	ssize_t ret, nread = 0, sz = 100;
+
+	*buf = xmalloc(sz);
+	for (;;) {
+		/* one byte is reserved for the terminating NUL */
+		ret = read(fd, *buf + nread, sz - nread - 1);
+		if (ret < 0) {
+			if (errno == EAGAIN || errno == EINTR)
+				continue;
+			ERROR_LOG("read error: %s\n", strerror(errno));
+			/* do not leak the partially filled buffer */
+			free(*buf);
+			*buf = NULL;
+			return false;
+		}
+		if (ret == 0) {
+			(*buf)[nread] = '\0';
+			return true;
+		}
+		nread += ret;
+		if (nread >= sz - 1) {
+			sz *= 2;
+			*buf = xrealloc(*buf, sz);
+		}
+	}
+}
+
+/*
+ * Run a command in a child process and wait for its termination.
+ *
+ * argv: NULL-terminated argument vector, argv[0] is looked up via execvp().
+ * buf: If not NULL, stdout of the child is redirected to a pipe and its
+ * output is collected with fd2buf(). Otherwise the child inherits stdout.
+ *
+ * Failure to create the pipe or the child, and failure to wait for the
+ * child are fatal. Returns true if the output could be read (if requested)
+ * and the child exited normally with status EXIT_SUCCESS.
+ */
+bool xexec(char * const argv[], char **buf)
+{
+	pid_t pid;
+	int wstatus, pipefd[2] = {-1, -1};
+	bool success = true;
+	unsigned n;
+
+	for (n = 0; argv[n]; n++)
+		DEBUG_LOG("argv[%u]=%s\n", n, argv[n]);
+	if (buf && pipe(pipefd) < 0)
+		die_errno("pipe");
+	if ((pid = fork()) < 0)
+		die_errno("fork");
+	if (pid == 0) { /* child */
+		if (pipefd[0] >= 0)
+			close(pipefd[0]);
+		if (pipefd[1] >= 0 && pipefd[1] != STDOUT_FILENO) {
+			if (dup2(pipefd[1], STDOUT_FILENO) < 0)
+				die_errno("dup2()");
+			close(pipefd[1]);
+		}
+		execvp(argv[0], argv);
+		EMERG_LOG("execvp error: %s\n", strerror(errno));
+		_exit(EXIT_FAILURE);
+	}
+	/* parent */
+	if (buf) {
+		/* close the write end, or reading would never see EOF */
+		close(pipefd[1]);
+		success = fd2buf(pipefd[0], buf);
+		close(pipefd[0]);
+	}
+	/* reap the child even if reading its output has failed */
+	if (waitpid(pid, &wstatus, 0) < 0)
+		die_errno("waitpid");
+	if (!success)
+		return false;
+	if (!WIFEXITED(wstatus))
+		return false;
+	return WEXITSTATUS(wstatus) == EXIT_SUCCESS;
+}
+
+/*
+ * Call die() with a message which says that the argument to the given
+ * option must not be empty. The option name is passed without the leading
+ * dashes. NOTE(review): die() presumably does not return -- confirm.
+ */
+void die_empty_arg(const char *opt)
+{
+ die("argument to --%s must not be empty", opt);
+}
+
+/*
+ * Call die() with a message which says that the argument to the given
+ * option is out of range. The option name is passed without the leading
+ * dashes.
+ */
+void die_range(const char *opt)
+{
+ die("argument to --%s is out of range", opt);
+}
+
+/*
+ * Call die_range() for the given option unless val lies in the closed
+ * interval [min, max].
+ */
+void check_range(uint32_t val, uint32_t min, uint32_t max, const char *opt)
+{
+	bool in_range = min <= val && val <= max;
+
+	if (!in_range)
+		die_range(opt);
+}
+
+/*
+ * Convert a decimal string to an unsigned 32 bit number.
+ *
+ * str: The string to convert.
+ * opt: Option name, only used for error messages.
+ *
+ * Values which do not fit are reported via die_range(), strings without any
+ * digits via die_empty_arg(), and trailing garbage via die().
+ */
+static uint32_t atou32(const char *str, const char *opt)
+{
+	long long num;
+	char *end;
+	int err;
+
+	errno = 0; /* strtoll() reports failure only through errno */
+	num = strtoll(str, &end, 10);
+	err = errno;
+	if (err == ERANGE && (num == LLONG_MAX || num == LLONG_MIN))
+		die_range(opt);
+	if (num < 0 || num > UINT32_MAX)
+		die_range(opt);
+	/* Without any digits, strtoll() stores str itself in end. */
+	if (end == str)
+		die_empty_arg(opt);
+	/* Some implementations flag "no conversion" with errno and zero. */
+	if (err != 0 && num == 0)
+		die_empty_arg(opt);
+	if (*end != '\0') /* Further characters after number */
+		die("--%s: trailing characters after number", opt);
+	return num;
+}
+
+/*
+ * Split an option argument of the form [prefix:]suffix at the first colon.
+ *
+ * arg: The argument to split, must not be empty.
+ * context: Used for error messages.
+ * prefix: Receives a copy of the text before the colon, or NULL if the
+ * argument contains no colon.
+ * suffix: Receives a copy of the text after the colon, or of the whole
+ * argument if there is no colon.
+ *
+ * An empty prefix or an empty suffix next to a colon is an error. Both
+ * results are allocated and must be freed by the caller.
+ */
+static void split_arg(const char *arg, const char *context,
+		char **prefix, char **suffix)
+{
+	char *copy = xstrdup(arg), *sep;
+
+	if (copy[0] == '\0')
+		die_empty_arg(context);
+	sep = strchr(copy, ':');
+	if (sep) {
+		*sep = '\0';
+		if (sep == copy || sep[1] == '\0')
+			die("%s: invalid argument", context);
+		*prefix = xstrdup(copy);
+		*suffix = xstrdup(sep + 1);
+		free(copy);
+		return;
+	}
+	/* no colon: ownership of the copy moves to the caller */
+	*prefix = NULL;
+	*suffix = copy;
+}
+
+/*
+ * Parse an lvm specifier of the form vg/tlv, vg|pool or vg.
+ *
+ * arg: The specifier string.
+ * context: Used for error messages.
+ * result: Receives the scope, a copy of the VG name and, depending on the
+ * scope, a copy of the thin LV name or of the pool name. The other name
+ * field is left untouched.
+ *
+ * A slash is looked for first, then a pipe character. If a separator is
+ * found, the text on both sides of it must be non-empty.
+ */
+void parse_lvmspec(const char *arg, const char *context,
+		struct lvmspec *result)
+{
+	char *copy = xstrdup(arg);
+	char *sep = strchr(copy, '/');
+
+	if (sep) {
+		if (sep == copy || sep[1] == '\0')
+			die("%s: invalid argument", context);
+		*sep = '\0';
+		result->scope = LS_ORIGIN;
+		result->tlv = xstrdup(sep + 1);
+	} else if ((sep = strchr(copy, '|'))) {
+		if (sep == copy || sep[1] == '\0')
+			die("%s: invalid argument", context);
+		*sep = '\0';
+		result->scope = LS_POOL;
+		result->pool = xstrdup(sep + 1);
+	} else
+		result->scope = LS_VG;
+	/* the copy was cut at the separator, so it holds the VG name now */
+	result->vg = xstrdup(copy);
+	free(copy);
+}
+
+/*
+ * Free the strings of an lvm specifier. Which strings exist depends on the
+ * scope: none for the global scope, the VG name for all other scopes, and
+ * the pool or thin LV name for pool and origin scope, respectively. The
+ * structure itself is not freed.
+ */
+void free_lvmspec(struct lvmspec *spec)
+{
+	switch (spec->scope) {
+	case LS_GLOBAL:
+		return;
+	case LS_POOL:
+		free(spec->vg);
+		free(spec->pool);
+		return;
+	case LS_ORIGIN:
+		free(spec->vg);
+		free(spec->tlv);
+		return;
+	default:
+		free(spec->vg);
+		return;
+	}
+}
+
+/*
+ * Parse an argument of the form [lvmspec:]data_threshold,meta_threshold.
+ *
+ * arg: The argument string.
+ * context: Used for error messages.
+ * result: Receives the parsed lvm specifier (global scope if the argument
+ * has no prefix) and the two percentages.
+ *
+ * Origin scoped specifiers are rejected. Both percentages must be numbers
+ * between 1 and 99.
+ */
+void parse_threshold_arg(const char *arg, const char *context,
+		struct threshold_arg *result)
+{
+	char *spec, *pair, *sep;
+	uint32_t percent;
+
+	split_arg(arg, context, &spec, &pair);
+	if (!spec)
+		result->lvmspec.scope = LS_GLOBAL;
+	else {
+		parse_lvmspec(spec, context, &result->lvmspec);
+		if (result->lvmspec.scope == LS_ORIGIN)
+			die("invalid scope for threshold lvmspec");
+	}
+	free(spec);
+	sep = strchr(pair, ',');
+	if (!sep)
+		die("%s: invalid argument", context);
+	*sep = '\0'; /* cut the pair into its two numbers */
+	percent = atou32(pair, context);
+	check_range(percent, 1, 99, context);
+	result->threshold.data = percent;
+	percent = atou32(sep + 1, context);
+	check_range(percent, 1, 99, context);
+	result->threshold.meta = percent;
+	free(pair);
+}
+
+/*
+ * Convert a time specifier to seconds.
+ *
+ * spec: A decimal number which is followed by exactly one unit character:
+ * s, m, h, d or y for seconds, minutes, hours, days and years of 365 days.
+ * context: Used for error messages.
+ *
+ * A missing or unknown unit, characters after the unit, a missing number
+ * and results which exceed 32 bits are fatal errors. Returns the number of
+ * seconds.
+ */
+unsigned parse_timespec(const char *spec, const char *context)
+{
+	char *p, *tmp = xstrdup(spec);
+	uint64_t val, multiplier;
+
+	/* <ctype.h> functions are only defined for unsigned char values */
+	for (p = tmp; isdigit((unsigned char)*p); p++)
+		;
+	if (*p == '\0')
+		die("%s: timespec lacks trailing time unit", context);
+	switch (*p) {
+	case 's': multiplier = 1; break;
+	case 'm': multiplier = 60; break;
+	case 'h': multiplier = 3600; break;
+	case 'd': multiplier = 86400; break;
+	case 'y': multiplier = 365 * 86400; break;
+	default:
+		die("%s: invalid time unit in timespec argument", context);
+	}
+	*p = '\0'; /* tmp contains only the digits from now on */
+	if (p[1])
+		die("%s: trailing characters after time unit", context);
+	/* 64 bit multiplication, so the range check below is reliable */
+	val = atou32(tmp, context) * multiplier;
+	free(tmp);
+	if (val > (uint32_t)-1)
+		die_range(context);
+	return val;
+}
+
+/*
+ * Parse an argument of the form [lvmspec:]timespec.
+ *
+ * arg: The argument string.
+ * context: Used for error messages.
+ * result: Receives the parsed lvm specifier (global scope if the argument
+ * has no prefix) and the time span in seconds.
+ */
+void parse_time_arg(const char *arg, const char *context,
+		struct time_arg *result)
+{
+	char *spec, *span;
+
+	split_arg(arg, context, &spec, &span);
+	if (!spec)
+		result->lvmspec.scope = LS_GLOBAL;
+	else
+		parse_lvmspec(spec, context, &result->lvmspec);
+	free(spec);
+	result->seconds = parse_timespec(span, context);
+	free(span);
+}
+
+/*
+ * Prepare a line iterator for the given text. Both the base pointer and the
+ * current line pointer are set to the start of the text.
+ */
+void line_iter_init(struct line_iter *liter, char *text)
+{
+	liter->base = text;
+	liter->line = text;
+}
+
+/*
+ * Return the next line of the text, or NULL if the iterator is exhausted or
+ * the remaining text is empty.
+ *
+ * The text is modified in place: the newline which ends the returned line
+ * is overwritten with a NUL byte. A last line without newline is returned
+ * as is and exhausts the iterator.
+ */
+char *line_iter_get(struct line_iter *liter)
+{
+	char *start = liter->line, *newline;
+
+	if (!start || start[0] == '\0')
+		return NULL;
+	newline = strchr(start, '\n');
+	if (!newline) {
+		liter->line = NULL;
+		return start;
+	}
+	*newline = '\0';
+	liter->line = newline + 1;
+	return start;
+}
+
+/*
+ * Ensure that file descriptors 0, 1, and 2 are valid.
+ *
+ * /dev/null is opened repeatedly until the returned descriptor is greater
+ * than two. That last descriptor is closed again while any descriptor in
+ * the range 0..2 obtained on the way is deliberately left open.
+ */
+void valid_fd012(void)
+{
+	int fd;
+
+	do {
+		fd = open("/dev/null", O_RDWR);
+		if (fd < 0)
+			die_errno("open");
+	} while (fd <= 2);
+	close(fd);
+}
+
+/*
+ * Fork into the background and redirect the standard streams.
+ *
+ * The parent blocks in read() on a pipe and exits successfully as soon as
+ * one byte arrives. If read() reports end of file or an error, the parent
+ * dies instead. The child becomes a session leader, connects stdin to
+ * /dev/null and both stdout and stderr to the logfile (or to /dev/null if
+ * logfile is NULL), and changes to the root directory.
+ *
+ * Returns the write end of the pipe in the child. Presumably the caller
+ * writes to it once startup has succeeded.
+ */
+int daemonize(const char *logfile)
+{
+	int devnull, log, fds[2];
+	pid_t child;
+
+	if (pipe(fds) < 0)
+		die_errno("pipe");
+	child = fork();
+	if (child < 0)
+		die_errno("fork");
+	if (child > 0) { /* parent exits after reading from the pipe */
+		char byte;
+
+		close(fds[1]);
+		if (read(fds[0], &byte, 1) <= 0)
+			die("child terminated unsuccessfully");
+		exit(EXIT_SUCCESS);
+	}
+	close(fds[0]);
+	/* become session leader */
+	if (setsid() < 0)
+		die_errno("setsid");
+	devnull = open("/dev/null", O_RDWR);
+	if (devnull < 0)
+		die_errno("open /dev/null");
+	if (!logfile)
+		logfile = "/dev/null";
+	log = open(logfile, O_WRONLY | O_APPEND | O_CREAT, 0666);
+	if (log < 0)
+		die_errno("open %s", logfile);
+	/* last message which is written before stdout/stderr are redirected */
+	INFO_LOG("subsequent log messages go to %s\n", logfile);
+	if (dup2(devnull, STDIN_FILENO) < 0)
+		die_errno("dup2");
+	close(devnull);
+	if (dup2(log, STDOUT_FILENO) < 0)
+		die_errno("dup2");
+	if (dup2(log, STDERR_FILENO) < 0)
+		die_errno("dup2");
+	close(log);
+	valid_fd012();
+	if (chdir("/") < 0)
+		die_errno("chdir");
+	return fds[1];
+}
+
+/*
+ * Map a string to a non-negative integer.
+ *
+ * Four rounds are computed, each seeded with the first byte of the input
+ * plus the round number. Each round contributes the low byte of its hash
+ * value to a 32 bit result, which is shifted right by one bit before it is
+ * returned. misma_lock() and get_misma_pid() use the value as the key of
+ * the semaphore set, so the result for non-empty input must not change.
+ */
+static int super_dull_hash(const char *input)
+{
+	const uint8_t *x = (const uint8_t *)input;
+	const unsigned p1 = 16777619, p2 = 2971215073;
+	unsigned n, m, h, result = 0;
+
+	for (n = 0; n < 4; n++) {
+		h = p1 * (x[0] + n);
+		/*
+		 * The first byte went into the seed above. If it is the
+		 * terminator already, the input is empty and x[1] must not
+		 * be read.
+		 */
+		for (m = 1; x[0] != 0 && x[m] != 0; m++)
+			h = p2 * (h ^ x[m]);
+		result = (result << 8) | (h % 256);
+	}
+	return result >> 1;
+}
+
+/**
+ * We use a semaphore set with two semaphores. The first semaphore is modified
+ * in both misma_lock() and get_misma_pid() while the second one is modified
+ * only in misma_lock(). This allows us to obtain the PID of the running misma
+ * process by querying the PID that last performed an operation on the second
+ * semaphore. This is achieved by passing GETPID as the control operation to
+ * semctl().
+ */
+
+/*
+ * Try to acquire the lock which corresponds to the given string.
+ *
+ * The semaphore set is created if it does not exist yet. A single semop()
+ * call then requests, for each of the two semaphores, a wait-for-zero
+ * operation followed by an increment. All operations carry IPC_NOWAIT and
+ * SEM_UNDO.
+ *
+ * Returns true if both semget() and semop() succeeded, false otherwise.
+ */
+bool misma_lock(const char *string)
+{
+	const short flags = SEM_UNDO | IPC_NOWAIT;
+	struct sembuf sops[4] = {
+		{.sem_num = 0, .sem_op = 0, .sem_flg = flags},
+		{.sem_num = 0, .sem_op = 1, .sem_flg = flags},
+		{.sem_num = 1, .sem_op = 0, .sem_flg = flags},
+		{.sem_num = 1, .sem_op = 1, .sem_flg = flags},
+	};
+	key_t key = super_dull_hash(string);
+	int semid = semget(key, 2, IPC_CREAT | 0600);
+
+	if (semid < 0)
+		return false;
+	DEBUG_LOG("key: 0x%0x, semid: %d\n", (unsigned)key, semid);
+	return semop(semid, sops, 4) >= 0;
+}
+
+/*
+ * Look up the PID of the process which holds the lock for the given string.
+ *
+ * Zero is returned if the semaphore set does not exist, if the non-blocking
+ * wait-for-zero operation on the first semaphore succeeds (nobody holds the
+ * lock), or if the GETPID query on the second semaphore fails. In other
+ * words, zero means that misma is not running.
+ */
+pid_t get_misma_pid(const char *string)
+{
+	struct sembuf wait_for_zero = {
+		.sem_num = 0,
+		.sem_op = 0,
+		.sem_flg = SEM_UNDO | IPC_NOWAIT
+	};
+	key_t key = super_dull_hash(string);
+	int semid, pid;
+
+	semid = semget(key, 2, 0);
+	if (semid < 0)
+		return 0;
+	DEBUG_LOG("key: 0x%0x, semid: %d\n", (unsigned)key, semid);
+	if (semop(semid, &wait_for_zero, 1) >= 0)
+		return 0;
+	pid = semctl(semid, 1, GETPID);
+	return pid < 0? 0 : pid;
+}
+
+/*
+ * Simplistic min-heap implementation (see e.g. Cormen et al. Chapter 6).
+ *
+ * The heap does not own a private copy of the element array. It stores the
+ * address of a pointer to the array and reaches the elements through it.
+ * heap_insert() and heap_extract_min() reallocate the array and store the
+ * new location back through that address.
+ */
+struct heap {
+ void ***aa; /* address of the pointer to the element array */
+ unsigned n; /* number of elements currently stored in the array */
+ /*
+  * Ordering callback. heapify() and heap_insert() move an element towards
+  * the root if comparing it against the other element yields a positive
+  * value.
+  */
+ int (*compare)(const void *data1, const void *data2);
+};
+
+/* Index of the parent of the node at idx. Wraps to ~0U for the root. */
+static unsigned heap_parent(unsigned idx)
+{
+	unsigned one_based = idx + 1;
+
+	return (one_based >> 1) - 1;
+}
+
+/* Index of the left child of the node at idx. */
+static unsigned heap_left(unsigned idx)
+{
+	return 2 * idx + 1;
+}
+
+/* Index of the right child of the node at idx. */
+static unsigned heap_right(unsigned idx)
+{
+	return 2 * idx + 2;
+}
+
+/*
+ * Restore the heap property for the subtree rooted at idx.
+ *
+ * The element at idx is exchanged with the child which the compare
+ * function ranks first, and the procedure continues at that child's
+ * position. It stops as soon as neither child yields a positive comparison
+ * result against the current element.
+ */
+static void heapify(struct heap *h, unsigned idx)
+{
+	void **slots = *(h->aa);
+
+	assert(idx < h->n);
+	for (;;) {
+		unsigned left = heap_left(idx), right = heap_right(idx);
+		unsigned top = idx;
+		void *held;
+
+		if (left < h->n && h->compare(slots[left], slots[top]) > 0)
+			top = left;
+		if (right < h->n && h->compare(slots[right], slots[top]) > 0)
+			top = right;
+		if (top == idx)
+			return;
+		/* exchange idx and top, then sift down from there */
+		held = slots[idx];
+		slots[idx] = slots[top];
+		slots[top] = held;
+		idx = top;
+	}
+}
+
+/*
+ * Allocate a heap structure and turn the given array into a heap.
+ *
+ * The aa argument is stored as is. The heap code dereferences it to reach
+ * the element array, so it has to be the address of a pointer to an array
+ * of num_elements pointers. The array is reordered in place by calling
+ * heapify() for every index from num_elements / 2 - 1 down to zero.
+ *
+ * Returns the new heap, allocated with xmalloc().
+ */
+struct heap *heap_init(void *aa, unsigned num_elements,
+		int (*compare)(const void *data1, const void *data2))
+{
+	struct heap *h = xmalloc(sizeof(*h));
+	unsigned j;
+
+	INFO_LOG("creating heap with %u elements\n", num_elements);
+	h->compare = compare;
+	h->n = num_elements;
+	h->aa = aa;
+	/* nodes at index n / 2 and above have no children */
+	j = h->n / 2;
+	while (j-- > 0)
+		heapify(h, j);
+	return h;
+}
+
+/* Return the root element without removing it. The heap must not be empty. */
+void *heap_min(const struct heap *h)
+{
+	void **slots;
+
+	assert(h->n > 0);
+	slots = *(h->aa);
+	return slots[0];
+}
+
+/* Return the number of elements which are currently stored in the heap. */
+unsigned heap_num_elements(const struct heap *h)
+{
+	const unsigned count = h->n;
+
+	return count;
+}
+
+/*
+ * Remove the root element from the heap and return it.
+ *
+ * The last element of the array takes the place of the root, the array is
+ * shrunk by one slot and the heap property is restored with heapify(). The
+ * heap must not be empty; heap_min() asserts this.
+ *
+ * If the extracted element was the only one, the array is left untouched:
+ * a zero-sized reallocation is avoided and heapify() is not called, since
+ * it asserts that its index is smaller than the number of elements. The
+ * allocation is kept so that a subsequent heap_insert() can resize it.
+ */
+void *heap_extract_min(struct heap *h)
+{
+	void *smallest = heap_min(h);
+	void **array = *(h->aa);
+
+	h->n--;
+	if (h->n == 0)
+		return smallest;
+	array[0] = array[h->n];
+	*(h->aa) = xrealloc(array, h->n * sizeof(void *));
+	heapify(h, 0);
+	return smallest;
+}
+
+/*
+ * Add an element to the heap.
+ *
+ * The array is grown by one slot and the new element is stored in the last
+ * position. It is then exchanged with its parent for as long as the compare
+ * function returns a positive value for (element, parent).
+ */
+void heap_insert(void *new_element, struct heap *h)
+{
+	void **slots;
+	unsigned child;
+
+	h->n++;
+	slots = xrealloc(*(h->aa), h->n * sizeof(void *));
+	*(h->aa) = slots;
+	child = h->n - 1;
+	slots[child] = new_element;
+	while (child > 0) {
+		unsigned up = heap_parent(child);
+		void *held;
+
+		if (h->compare(slots[child], slots[up]) <= 0)
+			break;
+		held = slots[child];
+		slots[child] = slots[up];
+		slots[up] = held;
+		child = up;
+	}
+}
+
+/* Call the dumper function for each element, in array order. */
+void heap_dump(const struct heap *h, void (*dumper)(const void *))
+{
+	void **slots = *(h->aa);
+	unsigned k = 0;
+
+	while (k < h->n) {
+		dumper(slots[k]);
+		k++;
+	}
+}