]> xenbits.xensource.com Git - people/julieng/freebsd.git/commitdiff
Calculate the correct amount of bytes that are in-flight for a connection as
authorhiren <hiren@FreeBSD.org>
Wed, 28 Oct 2015 22:57:51 +0000 (22:57 +0000)
committerhiren <hiren@FreeBSD.org>
Wed, 28 Oct 2015 22:57:51 +0000 (22:57 +0000)
suggested by RFC 6675.

Currently differnt places in the stack tries to guess this in suboptimal ways.
The main problem is that current calculations don't take sacked bytes into
account. Sacked bytes are the bytes receiver acked via SACK option. This is
suboptimal because it assumes that network has more outstanding (unacked) bytes
than the actual value and thus sends less data by setting congestion window
lower than what's possible which in turn may cause slower recovery from losses.

As an example, one of the current calculations looks something like this:
snd_nxt - snd_fack + sackhint.sack_bytes_rexmit
New proposal from RFC 6675 is:
snd_max - snd_una - sackhint.sacked_bytes + sackhint.sack_bytes_rexmit
which takes sacked bytes into account which is a new addition to the sackhint
struct. Only thing we are missing from RFC 6675 is isLost() i.e. segment being
considered lost and thus adjusting pipe based on that which makes this
calculation a bit on conservative side.

The approach is very simple. We already process each ack with sack info in
tcp_sack_doack() and extract sack blocks/holes out of it. We'd now also track
this new variable sacked_bytes which keeps track of total sacked bytes reported.

One downside to this approach is that we may get incorrect count of sacked_bytes
if the other end decides to drop sack info in the ack because of memory pressure
or some other reasons. But in this (not very likely) case also the pipe
calculation would be conservative which is okay as opposed to being aggressive
in sending packets into the network.

Next step is to use this more accurate pipe estimation to drive congestion
window adjustments.

In collaboration with: rrs
Reviewed by: jason_eggnet dot com, rrs
MFC after: 2 weeks
Sponsored by: Limelight Networks
Differential Revision: https://reviews.freebsd.org/D3971

sys/netinet/tcp_input.c
sys/netinet/tcp_sack.c
sys/netinet/tcp_var.h

index 8f5662033048a2177e067dce0d46358af75300ed..10c3cdca9cdd5fc9eefebbb3157edb379eb0a2c4 100644 (file)
@@ -148,6 +148,11 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(drop_synfin), 0,
     "Drop TCP packets with SYN+FIN set");
 
+VNET_DEFINE(int, tcp_do_rfc6675_pipe) = 0;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_pipe, CTLFLAG_VNET | CTLFLAG_RW,
+    &VNET_NAME(tcp_do_rfc6675_pipe), 0,
+    "Use calculated pipe/in-flight bytes per RFC 6675");
+
 VNET_DEFINE(int, tcp_do_rfc3042) = 1;
 #define        V_tcp_do_rfc3042        VNET(tcp_do_rfc3042)
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_VNET | CTLFLAG_RW,
@@ -2420,6 +2425,12 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
                    ((to.to_flags & TOF_SACK) ||
                     !TAILQ_EMPTY(&tp->snd_holes)))
                        tcp_sack_doack(tp, &to, th->th_ack);
+               else
+                       /*
+                        * Reset the value so that previous (valid) value
+                        * from the last ack with SACK doesn't get used.
+                        */
+                       tp->sackhint.sacked_bytes = 0;
 
                /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
                hhook_run_tcp_est_in(tp, th, &to);
@@ -2483,8 +2494,12 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
                                                 * we have less than 1/2 the original window's
                                                 * worth of data in flight.
                                                 */
-                                               awnd = (tp->snd_nxt - tp->snd_fack) +
-                                                       tp->sackhint.sack_bytes_rexmit;
+                                               if (V_tcp_do_rfc6675_pipe)
+                                                       awnd = tcp_compute_pipe(tp);
+                                               else
+                                                       awnd = (tp->snd_nxt - tp->snd_fack) +
+                                                               tp->sackhint.sack_bytes_rexmit;
+
                                                if (awnd < tp->snd_ssthresh) {
                                                        tp->snd_cwnd += tp->t_maxseg;
                                                        if (tp->snd_cwnd > tp->snd_ssthresh)
@@ -3729,3 +3744,11 @@ tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th)
                tp->snd_cwnd = 0;
        tp->snd_cwnd += tp->t_maxseg;
 }
+
+int
+tcp_compute_pipe(struct tcpcb *tp)
+{
+       return (tp->snd_max - tp->snd_una +
+               tp->sackhint.sack_bytes_rexmit -
+               tp->sackhint.sacked_bytes);
+}
index 223403cd41190a12a5869b0af3bdeeae02f67abf..f9c71c84f0d4021062773b58144436e867330607 100644 (file)
@@ -369,6 +369,7 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack)
         * received new blocks from the other side.
         */
        if (to->to_flags & TOF_SACK) {
+               tp->sackhint.sacked_bytes = 0;  /* reset */
                for (i = 0; i < to->to_nsacks; i++) {
                        bcopy((to->to_sacks + i * TCPOLEN_SACK),
                            &sack, sizeof(sack));
@@ -379,8 +380,11 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack)
                            SEQ_GT(sack.start, th_ack) &&
                            SEQ_LT(sack.start, tp->snd_max) &&
                            SEQ_GT(sack.end, tp->snd_una) &&
-                           SEQ_LEQ(sack.end, tp->snd_max))
+                           SEQ_LEQ(sack.end, tp->snd_max)) {
                                sack_blocks[num_sack_blks++] = sack;
+                               tp->sackhint.sacked_bytes +=
+                                   (sack.end-sack.start);
+                       }
                }
        }
        /*
index 2a2a148f6b7a4c9909634cc0f200140f72020319..a013bda6c67f2ea28cb168edf21c5ef9a4990964 100644 (file)
@@ -74,7 +74,11 @@ struct sackhint {
        tcp_seq         last_sack_ack;  /* Most recent/largest sacked ack */
 
        int             ispare;         /* explicit pad for 64bit alignment */
-       uint64_t        _pad[2];        /* 1 sacked_bytes, 1 TBD */
+       int             sacked_bytes;   /*
+                                        * Total sacked bytes reported by the
+                                        * receiver via sack option
+                                        */
+       uint64_t        _pad[1];        /* TBD */
 };
 
 struct tcptemp {
@@ -653,6 +657,9 @@ VNET_DECLARE(int, tcp_ecn_maxretries);
 VNET_DECLARE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST + 1]);
 #define        V_tcp_hhh               VNET(tcp_hhh)
 
+VNET_DECLARE(int, tcp_do_rfc6675_pipe);
+#define V_tcp_do_rfc6675_pipe  VNET(tcp_do_rfc6675_pipe)
+
 int     tcp_addoptions(struct tcpopt *, u_char *);
 int     tcp_ccalgounload(struct cc_algo *unload_algo);
 struct tcpcb *
@@ -743,6 +750,7 @@ void         tcp_sack_partialack(struct tcpcb *, struct tcphdr *);
 void    tcp_free_sackholes(struct tcpcb *tp);
 int     tcp_newreno(struct tcpcb *, struct tcphdr *);
 u_long  tcp_seq_subtract(u_long, u_long );
+int     tcp_compute_pipe(struct tcpcb *);
 
 void   cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type);