]> xenbits.xensource.com Git - people/royger/xen.git/commitdiff
xenstored: apply a write transaction rate limit
authorIan Jackson <ian.jackson@eu.citrix.com>
Tue, 7 Mar 2017 16:09:12 +0000 (16:09 +0000)
committerIan Jackson <Ian.Jackson@eu.citrix.com>
Tue, 28 Mar 2017 12:07:03 +0000 (13:07 +0100)
This avoids a rogue client being able to stall another client (eg the
toolstack) indefinitely.

This is XSA-206.

Signed-off-by: Ian Jackson <Ian.Jackson@eu.citrix.com>
tools/xenstore/Makefile
tools/xenstore/xenstored_core.c
tools/xenstore/xenstored_core.h
tools/xenstore/xenstored_domain.c
tools/xenstore/xenstored_domain.h
tools/xenstore/xenstored_transaction.c

index c4f9cdeec7d961b84899da434373a4819eec2ded..773d646539f5d66c8d4c4f5b20124a34687d3127 100644 (file)
@@ -34,6 +34,7 @@ XENSTORED_OBJS_$(CONFIG_FreeBSD) = xenstored_posix.o
 XENSTORED_OBJS_$(CONFIG_MiniOS) = xenstored_minios.o
 
 XENSTORED_OBJS += $(XENSTORED_OBJS_y)
+LDLIBS_xenstored += -lrt
 
 ifneq ($(XENSTORE_STATIC_CLIENTS),y)
 LIBXENSTORE := libxenstore.so
@@ -75,7 +76,7 @@ endif
 $(XENSTORED_OBJS): CFLAGS += $(CFLAGS_libxengnttab)
 
 xenstored: $(XENSTORED_OBJS)
-       $(CC) $^ $(LDFLAGS) $(LDLIBS_libxenevtchn) $(LDLIBS_libxengnttab) $(LDLIBS_libxenctrl) $(SOCKET_LIBS) -o $@ $(APPEND_LDFLAGS)
+       $(CC) $^ $(LDFLAGS) $(LDLIBS_libxenevtchn) $(LDLIBS_libxengnttab) $(LDLIBS_libxenctrl) $(LDLIBS_xenstored) $(SOCKET_LIBS) -o $@ $(APPEND_LDFLAGS)
 
 xenstored.a: $(XENSTORED_OBJS)
        $(AR) cr $@ $^
index 5c659d87c3d92f024be4387901ef86c016a3109f..4a0f6349b24caacf9a619e039ba4be3da6b9d727 100644 (file)
@@ -336,6 +336,7 @@ static void initialize_fds(int sock, int *p_sock_pollfd_idx,
                           int *ptimeout)
 {
        struct connection *conn;
+       struct wrl_timestampt now;
 
        if (fds)
                memset(fds, 0, sizeof(struct pollfd) * current_array_size);
@@ -355,8 +356,11 @@ static void initialize_fds(int sock, int *p_sock_pollfd_idx,
                xce_pollfd_idx = set_fd(xenevtchn_fd(xce_handle),
                                        POLLIN|POLLPRI);
 
+       wrl_gettime_now(&now);
+
        list_for_each_entry(conn, &connections, list) {
                if (conn->domain) {
+                       wrl_check_timeout(conn->domain, now, ptimeout);
                        if (domain_can_read(conn) ||
                            (domain_can_write(conn) &&
                             !list_empty(&conn->out_list)))
@@ -450,6 +454,7 @@ static bool write_node(struct connection *conn, struct node *node)
                goto error;
 
        add_change_node(conn, node, false);
+       wrl_apply_debit_direct(conn);
 
        data.dptr = talloc_size(node, data.dsize);
        hdr = (void *)data.dptr;
@@ -907,8 +912,10 @@ static void delete_node_single(struct connection *conn, struct node *node,
                return;
        }
 
-       if (changed)
+       if (changed) {
                add_change_node(conn, node, true);
+               wrl_apply_debit_direct(conn);
+       }
 
        domain_entry_dec(conn, node);
 }
index 92cccb6b8c2511e13adc459a9f30055f3ab8b39e..05808278c33de898f829dd438ab20930fa2e211e 100644 (file)
 /* DEFAULT_BUFFER_SIZE should be large enough for each errno string. */
 #define DEFAULT_BUFFER_SIZE 16
 
+#define MIN(a, b) (((a) < (b))? (a) : (b)) /* NB: evaluates one argument twice */
+
+typedef int32_t wrl_creditt;
+#define WRL_CREDIT_MAX (1000*1000*1000)
+/* ^ satisfies non-overflow condition for wrl_xfer_credit */
+
 struct buffered_data
 {
        struct list_head list;
index 53222808573b4f65417b5a7d89220d4c5d438259..cc2a0cd61bbc2969b7ad27bfbb5532828179ce87 100644 (file)
@@ -21,6 +21,7 @@
 #include <unistd.h>
 #include <stdlib.h>
 #include <stdarg.h>
+#include <time.h>
 
 #include "utils.h"
 #include "talloc.h"
@@ -74,6 +75,10 @@ struct domain
 
        /* number of watch for this domain */
        int nbwatch;
+
+       /* write rate limit */
+       wrl_creditt wrl_credit; /* [ -wrl_config_writecost, +_dburst ] */
+       struct wrl_timestampt wrl_timestamp;
 };
 
 static LIST_HEAD(domains);
@@ -206,6 +211,8 @@ static int destroy_domain(void *_domain)
 
        fire_watches(NULL, domain, "@releaseDomain", false);
 
+       wrl_domain_destroy(domain);
+
        return 0;
 }
 
@@ -253,6 +260,9 @@ void handle_event(void)
 bool domain_can_read(struct connection *conn)
 {
        struct xenstore_domain_interface *intf = conn->domain->interface;
+       /* Refuse an unprivileged domain whose write credit is negative. */
+       if (domain_is_unprivileged(conn) && conn->domain->wrl_credit < 0)
+               return false;
        return (intf->req_cons != intf->req_prod);
 }
 
@@ -289,6 +299,8 @@ static struct domain *new_domain(void *context, unsigned int domid,
        if (!domain->path)
                return NULL;
 
+       wrl_domain_new(domain);
+
        list_add(&domain->list, &domains);
        talloc_set_destructor(domain, destroy_domain);
 
@@ -723,6 +735,209 @@ int domain_watch(struct connection *conn)
                : 0;
 }
 
+static wrl_creditt wrl_config_writecost      = WRL_FACTOR; /* debit per write */
+static wrl_creditt wrl_config_rate           = WRL_RATE   * WRL_FACTOR; /* credit per second */
+static wrl_creditt wrl_config_dburst         = WRL_DBURST * WRL_FACTOR; /* per-domain cap */
+static wrl_creditt wrl_config_gburst         = WRL_GBURST * WRL_FACTOR; /* reserve cap */
+static wrl_creditt wrl_config_newdoms_dburst =
+                                WRL_DBURST * WRL_NEWDOMS * WRL_FACTOR; /* reserve overdraft */
+/* Count of open transactions: ++ on transaction start, -- in its destructor. */
+long wrl_ntransactions;
+/* Number of domains sharing the rate, and the pool their spare credit goes to. */
+static long wrl_ndomains;
+static wrl_creditt wrl_reserve; /* [-wrl_config_newdoms_dburst, +_gburst ] */
+
+void wrl_gettime_now(struct wrl_timestampt *now_wt)
+{
+       /* Sample CLOCK_MONOTONIC into *now_wt at millisecond resolution. */
+       struct timespec mono;
+
+       if (clock_gettime(CLOCK_MONOTONIC, &mono) != 0)
+               barf_perror("Could not find time (clock_gettime failed)");
+
+       /* Sub-millisecond nanoseconds are discarded by the division. */
+       now_wt->sec = mono.tv_sec;
+       now_wt->msec = mono.tv_nsec / 1000000;
+}
+
+/*
+ * Moves as much credit as possible (possibly none) from *debit to
+ * *credit while keeping *debit >= debit_floor and
+ * *credit <= credit_ceil.  If either bound is already violated on
+ * entry, nothing is moved.
+ *
+ * No overflow occurs if every argument has magnitude at most
+ * 0x3fffffff; WRL_CREDIT_MAX (1E9) is within that.  The bound
+ * still holds for the updated values.
+ */
+static void wrl_xfer_credit(wrl_creditt *debit,  wrl_creditt debit_floor,
+                           wrl_creditt *credit, wrl_creditt credit_ceil)
+{
+       wrl_creditt available = *debit - debit_floor;
+       wrl_creditt headroom  = credit_ceil - *credit;
+       wrl_creditt amount = available < headroom ? available : headroom;
+
+       if (amount <= 0)
+               return;
+       *credit += amount;
+       *debit -= amount;
+}
+
+void wrl_domain_new(struct domain *domain)
+{
+       wrl_ndomains++;
+       wrl_gettime_now(&domain->wrl_timestamp);
+       domain->wrl_credit = 0;
+       /* Seed up to DBURST from the reserve, overdrawing it if needed. */
+       wrl_xfer_credit(&wrl_reserve, -wrl_config_newdoms_dburst,
+                       &domain->wrl_credit, wrl_config_dburst);
+}
+
+void wrl_domain_destroy(struct domain *domain)
+{
+       /*
+        * Return the domain's positive credit as of its last update;
+        * credit accrued since then is deliberately not computed.
+        * NOTE(review): ceiling is _dburst, not _gburst - confirm.
+        */
+       wrl_xfer_credit(&domain->wrl_credit, 0,
+                       &wrl_reserve, wrl_config_dburst);
+       wrl_ndomains--;
+}
+
+void wrl_credit_update(struct domain *domain, struct wrl_timestampt now)
+{
+       /*
+        * Accrue   credit += elapsed * RATE / ndoms   for the time since
+        * domain->wrl_timestamp, in saturating integer arithmetic, then
+        * rebalance against the shared reserve.  Only whole milliseconds
+        * earn credit, so repeated tiny additions cannot accumulate
+        * rounding error.
+        */
+       long delta_sec  = now.sec -  domain->wrl_timestamp.sec;
+       long delta_msec = now.msec - domain->wrl_timestamp.msec;
+       long msec;
+       int64_t earned, divisor, total;
+       wrl_creditt surplus = 0;
+
+       /* arbitrary clamp which keeps the arithmetic below in range */
+       if (delta_sec > 1000*1000)
+               delta_sec = 1000*1000;
+       msec = delta_sec * 1000 + delta_msec;
+       if (msec < 0) /* shouldn't happen with CLOCK_MONOTONIC */
+               msec = 0;
+
+       /* 32x32 -> 64 products, neither of which can overflow */
+       earned  = (int64_t)msec * wrl_config_rate;
+       divisor = (int64_t)wrl_ndomains * 1000;
+
+       /* earned / divisor <= 1E6 * wrl_config_rate, which for any
+          reasonable wrl_config_rate is far below 2^64 */
+       total = (int64_t)domain->wrl_credit + earned / divisor;
+       if (total > WRL_CREDIT_MAX)
+               total = WRL_CREDIT_MAX;
+       domain->wrl_credit = total;
+       /* (the DBURST cap on wrl_credit may be exceeded until below) */
+
+       /* a negative balance is topped up towards zero from the reserve */
+       wrl_xfer_credit(&wrl_reserve,        0,
+                       &domain->wrl_credit, 0);
+
+       /* credit above DBURST goes back to the reserve up to GBURST;
+          whatever remains in surplus after that is simply dropped */
+       wrl_xfer_credit(&domain->wrl_credit, wrl_config_dburst,
+                       &surplus,            WRL_CREDIT_MAX);
+       wrl_xfer_credit(&surplus,     0,
+                       &wrl_reserve, wrl_config_gburst);
+
+       domain->wrl_timestamp = now;
+
+       trace("wrl: dom %4d %6ld  msec  %9ld credit   %9ld reserve"
+             "  %9ld discard\n",
+             domain->domid,
+             msec,
+             (long)domain->wrl_credit, (long)wrl_reserve,
+             (long)surplus);
+}
+
+void wrl_check_timeout(struct domain *domain,
+                      struct wrl_timestampt now,
+                      int *ptimeout)
+{
+       /* Brings domain's credit up to date for "now"; if it is still in
+        * debt, lowers *ptimeout (in msec, -1 = none yet) so that the
+        * caller wakes up once the debt has been earned back. */
+       uint64_t num, denom;
+       int wakeup;
+
+       wrl_credit_update(domain, now);
+
+       if (domain->wrl_credit >= 0)
+               return; /* not blocked */
+
+       if (!*ptimeout)
+               /* immediate wakeup already decided: nothing to compute */
+               return;
+
+       /* calculate  wakeup = -credit / (RATE / ndoms), in msec.
+        * Negate and scale in 64 bits: the debt may exceed one write's
+        * cost, so -credit * 1000 is not guaranteed to fit in 32 bits. */
+       num   = (uint64_t)(-(int64_t)domain->wrl_credit) * 1000 * wrl_ndomains;
+       denom = wrl_config_rate;
+
+       wakeup = MIN( num / denom /* uint64_t */, INT_MAX );
+       if (*ptimeout==-1 || wakeup < *ptimeout)
+               *ptimeout = wakeup;
+
+       trace("wrl: domain %u credit=%ld (reserve=%ld) SLEEPING for %d\n",
+             domain->domid,
+             (long)domain->wrl_credit, (long)wrl_reserve,
+             wakeup);
+}
+
+void wrl_apply_debit_actual(struct domain *domain)
+{
+       /* Charges "domain" the cost of one write, after first crediting
+        * it for the time elapsed since its last update. */
+       struct wrl_timestampt stamp;
+
+       if (domain == NULL)
+               return; /* sockets escape the write rate limit */
+
+       wrl_gettime_now(&stamp);
+       wrl_credit_update(domain, stamp);
+       domain->wrl_credit -= wrl_config_writecost;
+
+       trace("wrl: domain %u credit=%ld (reserve=%ld)\n",
+             domain->domid, (long)domain->wrl_credit, (long)wrl_reserve);
+}
+
+void wrl_apply_debit_direct(struct connection *conn)
+{
+       /*
+        * Charges conn's domain for one write made directly, i.e. not
+        * as part of a transaction.  Nothing is charged when:
+        *  - conn is NULL, as some writes are generated internally;
+        *  - conn is inside a transaction, since those writes are
+        *    accounted for when the transaction ends;
+        *  - no transaction is open anywhere, since then we don't
+        *    conflict with anyone.
+        */
+       if (!conn || conn->transaction || !wrl_ntransactions)
+               return;
+
+       wrl_apply_debit_actual(conn->domain);
+}
+
+void wrl_apply_debit_trans_commit(struct connection *conn)
+{
+       /* Charges conn's domain for a commit, but only if some other
+        * transaction is open too: the committing transaction itself
+        * appears in the counter. */
+       if (wrl_ntransactions > 1)
+               wrl_apply_debit_actual(conn->domain);
+}
+
 /*
  * Local variables:
  *  c-file-style: "linux"
index 40e15d13e727e095f84f25f2f3c84533ef6fda98..123ce45767d0626b9076fdf1bf91d56f169e5bc1 100644 (file)
@@ -65,4 +65,29 @@ void domain_watch_inc(struct connection *conn);
 void domain_watch_dec(struct connection *conn);
 int domain_watch(struct connection *conn);
 
+/* Write rate limiting */
+
+#define WRL_FACTOR   1000 /* for fixed-point arithmetic */
+#define WRL_RATE      200 /* writes' worth of credit per second, all domains */
+#define WRL_DBURST     10 /* cap on one domain's credit, in writes */
+#define WRL_GBURST   1000 /* cap on the shared reserve, in writes */
+#define WRL_NEWDOMS     5 /* reserve may go DBURST * this many writes negative */
+/* CLOCK_MONOTONIC reading, as filled in by wrl_gettime_now() */
+struct wrl_timestampt {
+       time_t sec;
+       int msec; /* milliseconds within sec */
+};
+
+extern long wrl_ntransactions;
+
+void wrl_gettime_now(struct wrl_timestampt *now_ts);
+void wrl_domain_new(struct domain *domain);
+void wrl_domain_destroy(struct domain *domain);
+void wrl_credit_update(struct domain *domain, struct wrl_timestampt now);
+void wrl_check_timeout(struct domain *domain,
+                       struct wrl_timestampt now,
+                       int *ptimeout);
+void wrl_apply_debit_direct(struct connection *conn);
+void wrl_apply_debit_trans_commit(struct connection *conn);
+
 #endif /* _XENSTORED_DOMAIN_H */
index 16f25fbf3153c613f7ba558ba2360693b67430bc..a01f8cfe3486d20f0a86df86292e0cc98054fd61 100644 (file)
@@ -139,6 +139,7 @@ static int destroy_transaction(void *_transaction)
 {
        struct transaction *trans = _transaction;
 
+       wrl_ntransactions--;
        trace_destroy(trans, "transaction");
        if (trans->tdb)
                tdb_close(trans->tdb);
@@ -201,6 +202,7 @@ int do_transaction_start(struct connection *conn, struct buffered_data *in)
        talloc_steal(conn, trans);
        talloc_set_destructor(trans, destroy_transaction);
        conn->transaction_started++;
+       wrl_ntransactions++;
 
        snprintf(id_str, sizeof(id_str), "%u", trans->id);
        send_reply(conn, XS_TRANSACTION_START, id_str, strlen(id_str)+1);
@@ -232,6 +234,9 @@ int do_transaction_end(struct connection *conn, struct buffered_data *in)
                /* FIXME: Merge, rather failing on any change. */
                if (trans->generation != generation)
                        return EAGAIN;
+
+               wrl_apply_debit_trans_commit(conn);
+
                if (!replace_tdb(trans->tdb_name, trans->tdb))
                        return errno;
                /* Don't close this: we won! */