static pid_t create_pid;
/** Whether the pre-create-hook/rsync/post-create-hook is currently stopped. */
static int create_process_stopped;
+/** How many times in a row the rsync command failed. */
+static int num_consecutive_rsync_errors;
/** Process id of current pre-remove/rm/post-remove process. */
static pid_t remove_pid;
/** When the next snapshot is due. */
"reference_snapshot: %s\n"
"snapshot_creation_status: %s\n"
"snapshot_removal_status: %s\n"
+ "num_consecutive_rsync_errors: %d\n"
,
(int) getpid(),
logfile? conf.logfile_arg : "stderr",
name_of_reference_snapshot?
name_of_reference_snapshot : "(none)",
hook_status_description[snapshot_creation_status],
- hook_status_description[snapshot_removal_status]
+ hook_status_description[snapshot_removal_status],
+ num_consecutive_rsync_errors
);
if (create_pid != 0)
fprintf(log,
es = WEXITSTATUS(status);
/*
* Restart rsync on non-fatal errors:
- * 12: Error in rsync protocol data stream
- * 13: Errors with program diagnostics
+ * 24: Partial transfer due to vanished source files
*/
- if (es == 12 || es == 13) {
- DSS_WARNING_LOG(("rsync process %d returned %d -- restarting\n",
- (int)create_pid, es));
+ if (es != 0 && es != 24) {
+ DSS_WARNING_LOG(("rsync exit code %d, error count %d\n",
+ es, ++num_consecutive_rsync_errors));
+ if (num_consecutive_rsync_errors > conf.max_rsync_errors_arg) {
+ ret = -E_TOO_MANY_RSYNC_ERRORS;
+ snapshot_creation_status = HS_READY;
+ goto out;
+ }
+ DSS_WARNING_LOG(("restarting rsync process\n"));
snapshot_creation_status = HS_NEEDS_RESTART;
next_snapshot_time = get_current_time() + 60;
ret = 1;
goto out;
}
- if (es != 0 && es != 23 && es != 24) {
- DSS_ERROR_LOG(("rsync process %d returned %d\n", (int)create_pid, es));
- ret = -E_BAD_EXIT_CODE;
- snapshot_creation_status = HS_READY;
- goto out;
- }
+ num_consecutive_rsync_errors = 0;
ret = rename_incomplete_snapshot(current_snapshot_creation_time);
if (ret < 0)
goto out;
--rsync-option --exclude --rsync-option /proc
"
+option "max-rsync-errors" -
+"Terminate after this many rsync failures"
+int typestr="count"
+default="10"
+optional
+details="
+ Only relevant when operating in --run mode (see above). If
+ the rsync process exits with a fatal error, dss restarts
+ the command in the hope that the problem is transient
+ and subsequent rsync runs succeed. After the given number
+ of consecutive rsync error exits, however, dss gives up,
+ executes the exit hook and terminates. Set this to zero if
+ dss should exit immediately on the first rsync error.
+
+ The only non-fatal error is when rsync exits with code 24. This
+ indicates a partial transfer due to vanished source files
+ and happens frequently when snapshotting a directory which
+ is concurrently being modified.
+"
+
###################
section "Intervals"
###################
DSS_ERROR(SIGNAL_SIG_ERR, "signal() returned SIG_ERR"), \
DSS_ERROR(SIGNAL, "caught terminating signal"), \
DSS_ERROR(BUG, "values of beta might cause dom!"), \
- DSS_ERROR(NOT_RUNNING, "dss not running")
+ DSS_ERROR(NOT_RUNNING, "dss not running"), \
+ DSS_ERROR(TOO_MANY_RSYNC_ERRORS, "too many consecutive rsync errors")
/**
* This is temporarily defined to expand to its first argument (prefixed by