/*
 * cregit-Linux: how code gets into the kernel
 *
 * Release 4.8, file net/ipv4/tcp_input.c
 *
 * Directory: net/ipv4
 */
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              Implementation of the Transmission Control Protocol(TCP).
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Mark Evans, <evansmp@uhura.aston.ac.uk>
 *              Corey Minyard <wf-rch!minyard@relay.EU.net>
 *              Florian La Roche, <flla@stud.uni-sb.de>
 *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *              Linus Torvalds, <torvalds@cs.helsinki.fi>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Matthew Dillon, <dillon@apollo.west.oic.com>
 *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *              Jorge Cwik, <jorge@laser.satlink.net>
 */

/*
 * Changes:
 *              Pedro Roque     :       Fast Retransmit/Recovery.
 *                                      Two receive queues.
 *                                      Retransmit queue handled by TCP.
 *                                      Better retransmit timer handling.
 *                                      New congestion avoidance.
 *                                      Header prediction.
 *                                      Variable renaming.
 *
 *              Eric            :       Fast Retransmit.
 *              Randy Scott     :       MSS option defines.
 *              Eric Schenk     :       Fixes to slow start algorithm.
 *              Eric Schenk     :       Yet another double ACK bug.
 *              Eric Schenk     :       Delayed ACK bug fixes.
 *              Eric Schenk     :       Floyd style fast retrans war avoidance.
 *              David S. Miller :       Don't allow zero congestion window.
 *              Eric Schenk     :       Fix retransmitter so that it sends
 *                                      next packet on ack of previous packet.
 *              Andi Kleen      :       Moved open_request checking here
 *                                      and process RSTs for open_requests.
 *              Andi Kleen      :       Better prune_queue, and other fixes.
 *              Andrey Savochkin:       Fix RTT measurements in the presence of
 *                                      timestamps.
 *              Andrey Savochkin:       Check sequence numbers correctly when
 *                                      removing SACKs due to in sequence incoming
 *                                      data segments.
 *              Andi Kleen:             Make sure we never ack data there is not
 *                                      enough room for. Also make this condition
 *                                      a fatal error if it might still happen.
 *              Andi Kleen:             Add tcp_measure_rcv_mss to make
 *                                      connections with MSS<min(MTU,ann. MSS)
 *                                      work without delayed acks.
 *              Andi Kleen:             Process packets with PSH set in the
 *                                      fast path.
 *              J Hadi Salim:           ECN support
 *              Andrei Gurtov,
 *              Pasi Sarolahti,
 *              Panu Kuhlberg:          Experimental audit of TCP (re)transmission
 *                                      engine. Lots of bugs are found.
 *              Pasi Sarolahti:         F-RTO for dealing with spurious RTOs
 */


#define pr_fmt(fmt) "TCP: " fmt

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/sysctl.h>
#include <linux/kernel.h>
#include <linux/prefetch.h>
#include <net/dst.h>
#include <net/tcp.h>
#include <net/inet_common.h>
#include <linux/ipsec.h>
#include <asm/unaligned.h>
#include <linux/errqueue.h>


/* File-scope TCP tunables. The sysctl_ prefix suggests they are exposed
 * through sysctl; the registration code is not in this part of the file,
 * so the exact knob names should be confirmed there.
 */
int sysctl_tcp_timestamps __read_mostly = 1;

int sysctl_tcp_window_scaling __read_mostly = 1;

int sysctl_tcp_sack __read_mostly = 1;

int sysctl_tcp_fack __read_mostly = 1;

int sysctl_tcp_max_reordering __read_mostly = 300;

int sysctl_tcp_dsack __read_mostly = 1;

/* Used as a shift count in tcp_init_buffer_space(): when non-zero,
 * maxwin >> sysctl_tcp_app_win is kept out of window_clamp.
 */
int sysctl_tcp_app_win __read_mostly = 31;

int sysctl_tcp_adv_win_scale __read_mostly = 1;

EXPORT_SYMBOL(sysctl_tcp_adv_win_scale);

/* rfc5961 challenge ack rate limiting */

int sysctl_tcp_challenge_ack_limit = 1000;


int sysctl_tcp_stdurg __read_mostly;

int sysctl_tcp_rfc1337 __read_mostly;

int sysctl_tcp_max_orphans __read_mostly = NR_FILE;

int sysctl_tcp_frto __read_mostly = 2;

int sysctl_tcp_min_rtt_wlen __read_mostly = 300;


int sysctl_tcp_thin_dupack __read_mostly;


/* When non-zero, tcp_fixup_rcvbuf() quadruples its initial estimate and
 * tcp_rcv_space_adjust() is allowed to grow sk_rcvbuf.
 */
int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;

int sysctl_tcp_early_retrans __read_mostly = 3;

int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;


#define FLAG_DATA		0x01 
/* Incoming frame contained data.               */

#define FLAG_WIN_UPDATE		0x02 
/* Incoming ACK was a window update.    */

#define FLAG_DATA_ACKED		0x04 
/* This ACK acknowledged new data.              */

#define FLAG_RETRANS_DATA_ACKED	0x08 
/* "" "" some of which was retransmitted.       */

#define FLAG_SYN_ACKED		0x10 
/* This ACK acknowledged SYN.           */

#define FLAG_DATA_SACKED	0x20 
/* New SACK.                            */

#define FLAG_ECE		0x40 
/* ECE in this ACK                              */

#define FLAG_LOST_RETRANS	0x80 
/* This ACK marks some retransmission lost */

#define FLAG_SLOWPATH		0x100 
/* Do not skip RFC checks for window update.*/

#define FLAG_ORIG_SACK_ACKED	0x200 
/* Never retransmitted data are (s)acked        */

#define FLAG_SND_UNA_ADVANCED	0x400 
/* Snd_una was changed (!= FLAG_DATA_ACKED) */

#define FLAG_DSACKING_ACK	0x800 
/* SACK blocks contained D-SACK info */

#define FLAG_SACK_RENEGING	0x2000 
/* snd_una advanced to a sacked seq */

#define FLAG_UPDATE_TS_RECENT	0x4000 
/* tcp_replace_ts_recent() */


#define FLAG_ACKED		(FLAG_DATA_ACKED|FLAG_SYN_ACKED)

#define FLAG_NOT_DUP		(FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)

#define FLAG_CA_ALERT		(FLAG_DATA_SACKED|FLAG_ECE)

#define FLAG_FORWARD_PROGRESS	(FLAG_ACKED|FLAG_DATA_SACKED)


#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)

#define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))


#define REXMIT_NONE	0 
/* no loss recovery to do */

#define REXMIT_LOST	1 
/* retransmit packets marked lost */

#define REXMIT_NEW	2 
/* FRTO-style transmit of unsent/new packets */

/* Adapt the MSS value used to make delayed ack decision to the
 * real world.
 */

static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb) { struct inet_connection_sock *icsk = inet_csk(sk); const unsigned int lss = icsk->icsk_ack.last_seg_size; unsigned int len; icsk->icsk_ack.last_seg_size = 0; /* skb->len may jitter because of SACKs, even if peer * sends good full-sized frames. */ len = skb_shinfo(skb)->gso_size ? : skb->len; if (len >= icsk->icsk_ack.rcv_mss) { icsk->icsk_ack.rcv_mss = len; } else { /* Otherwise, we make more careful check taking into account, * that SACKs block is variable. * * "len" is invariant segment length, including TCP header. */ len += skb->data - skb_transport_header(skb); if (len >= TCP_MSS_DEFAULT + sizeof(struct tcphdr) || /* If PSH is not set, packet should be * full sized, provided peer TCP is not badly broken. * This observation (if it is correct 8)) allows * to handle super-low mtu links fairly. */ (len >= TCP_MIN_MSS + sizeof(struct tcphdr) && !(tcp_flag_word(tcp_hdr(skb)) & TCP_REMNANT))) { /* Subtract also invariant (if peer is RFC compliant), * tcp header plus fixed timestamp option length. * Resulting "len" is MSS free of SACK jitter. */ len -= tcp_sk(sk)->tcp_header_len; icsk->icsk_ack.last_seg_size = len; if (len == lss) { icsk->icsk_ack.rcv_mss = len; return; } } if (icsk->icsk_ack.pending & ICSK_ACK_PUSHED) icsk->icsk_ack.pending |= ICSK_ACK_PUSHED2; icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; } }

/*
 * Contributors
 *
 * Person | Tokens | Prop | Commits | CommitProp
 * pre-git | 127 | 63.18% | 9 | 56.25%
 * arnaldo carvalho de melo | 45 | 22.39% | 3 | 18.75%
 * alexey kuznetsov | 18 | 8.96% | 1 | 6.25%
 * herbert xu | 8 | 3.98% | 1 | 6.25%
 * linus torvalds | 2 | 1.00% | 1 | 6.25%
 * william allen simpson | 1 | 0.50% | 1 | 6.25%
 * Total | 201 | 100.00% | 16 | 100.00%
 */


/* Raise the quick-ACK budget (icsk_ack.quick) to rcv_wnd / (2 * rcv_mss),
 * using 2 when that quotient is zero and capping at TCP_MAX_QUICKACKS.
 * The budget is never lowered here.
 */
static void tcp_incr_quickack(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	unsigned int nr_acks;

	nr_acks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);
	if (!nr_acks)
		nr_acks = 2;

	if (nr_acks <= icsk->icsk_ack.quick)
		return;

	icsk->icsk_ack.quick = min(nr_acks, TCP_MAX_QUICKACKS);
}

/*
 * Contributors
 *
 * Person | Tokens | Prop | Commits | CommitProp
 * pre-git | 49 | 65.33% | 5 | 62.50%
 * arnaldo carvalho de melo | 22 | 29.33% | 1 | 12.50%
 * linus torvalds | 3 | 4.00% | 1 | 12.50%
 * eric dumazet | 1 | 1.33% | 1 | 12.50%
 * Total | 75 | 100.00% | 8 | 100.00%
 */


/* Switch the socket to quick-ACK behaviour: top up the quick-ACK budget,
 * clear the pingpong flag and reset the delayed-ACK timeout to TCP_ATO_MIN.
 */
static void tcp_enter_quickack_mode(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	tcp_incr_quickack(sk);

	icsk->icsk_ack.ato = TCP_ATO_MIN;
	icsk->icsk_ack.pingpong = 0;
}

/*
 * Contributors
 *
 * Person | Tokens | Prop | Commits | CommitProp
 * pre-git | 24 | 57.14% | 1 | 33.33%
 * arnaldo carvalho de melo | 17 | 40.48% | 1 | 33.33%
 * stephen hemminger | 1 | 2.38% | 1 | 33.33%
 * Total | 42 | 100.00% | 3 | 100.00%
 */

/* Send ACKs quickly, if "quick" count is not exhausted * and the session is not interactive. */
static bool tcp_in_quickack_mode(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); const struct dst_entry *dst = __sk_dst_get(sk); return (dst && dst_metric(dst, RTAX_QUICKACK)) || (icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong); }

/*
 * Contributors
 *
 * Person | Tokens | Prop | Commits | CommitProp
 * jon maxwell | 24 | 40.00% | 1 | 25.00%
 * pre-git | 18 | 30.00% | 1 | 25.00%
 * arnaldo carvalho de melo | 17 | 28.33% | 1 | 25.00%
 * eric dumazet | 1 | 1.67% | 1 | 25.00%
 * Total | 60 | 100.00% | 4 | 100.00%
 */


/* Set TCP_ECN_QUEUE_CWR, but only on connections that have TCP_ECN_OK. */
static void tcp_ecn_queue_cwr(struct tcp_sock *tp)
{
	if (!(tp->ecn_flags & TCP_ECN_OK))
		return;

	tp->ecn_flags |= TCP_ECN_QUEUE_CWR;
}

/*
 * Contributors
 *
 * Person | Tokens | Prop | Commits | CommitProp
 * ilpo jarvinen | 24 | 96.00% | 1 | 50.00%
 * florian westphal | 1 | 4.00% | 1 | 50.00%
 * Total | 25 | 100.00% | 2 | 100.00%
 */


/* Clear TCP_ECN_DEMAND_CWR when the received segment has CWR set. */
static void tcp_ecn_accept_cwr(struct tcp_sock *tp, const struct sk_buff *skb)
{
	if (!tcp_hdr(skb)->cwr)
		return;

	tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
}

/*
 * Contributors
 *
 * Person | Tokens | Prop | Commits | CommitProp
 * ilpo jarvinen | 31 | 93.94% | 1 | 33.33%
 * florian westphal | 1 | 3.03% | 1 | 33.33%
 * eric dumazet | 1 | 3.03% | 1 | 33.33%
 * Total | 33 | 100.00% | 3 | 100.00%
 */


/* Unconditionally clear TCP_ECN_DEMAND_CWR. */
static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
{
	tp->ecn_flags = tp->ecn_flags & ~TCP_ECN_DEMAND_CWR;
}

/*
 * Contributors
 *
 * Person | Tokens | Prop | Commits | CommitProp
 * ilpo jarvinen | 17 | 94.44% | 1 | 50.00%
 * florian westphal | 1 | 5.56% | 1 | 50.00%
 * Total | 18 | 100.00% | 2 | 100.00%
 */


static void __tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb) { switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) { case INET_ECN_NOT_ECT: /* Funny extension: if ECT is not set on a segment, * and we already seen ECT on a previous segment, * it is probably a retransmit. */ if (tp->ecn_flags & TCP_ECN_SEEN) tcp_enter_quickack_mode((struct sock *)tp); break; case INET_ECN_CE: if (tcp_ca_needs_ecn((struct sock *)tp)) tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_IS_CE); if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { /* Better not delay acks, sender can have a very low cwnd */ tcp_enter_quickack_mode((struct sock *)tp); tp->ecn_flags |= TCP_ECN_DEMAND_CWR; } tp->ecn_flags |= TCP_ECN_SEEN; break; default: if (tcp_ca_needs_ecn((struct sock *)tp)) tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_NO_CE); tp->ecn_flags |= TCP_ECN_SEEN; break; } }

/*
 * Contributors
 *
 * Person | Tokens | Prop | Commits | CommitProp
 * florian westphal | 57 | 38.26% | 2 | 33.33%
 * eric dumazet | 55 | 36.91% | 3 | 50.00%
 * ilpo jarvinen | 37 | 24.83% | 1 | 16.67%
 * Total | 149 | 100.00% | 6 | 100.00%
 */


/* Run the ECN codepoint check only on connections that have TCP_ECN_OK. */
static void tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
{
	if (!(tp->ecn_flags & TCP_ECN_OK))
		return;

	__tcp_ecn_check_ce(tp, skb);
}

/*
 * Contributors
 *
 * Person | Tokens | Prop | Commits | CommitProp
 * florian westphal | 32 | 100.00% | 1 | 100.00%
 * Total | 32 | 100.00% | 1 | 100.00%
 */


/* Drop TCP_ECN_OK unless the header in @th has ECE set and CWR clear. */
static void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th)
{
	if (!(tp->ecn_flags & TCP_ECN_OK))
		return;

	/* ECE set with CWR clear: leave TCP_ECN_OK in place. */
	if (th->ece && !th->cwr)
		return;

	tp->ecn_flags &= ~TCP_ECN_OK;
}

/*
 * Contributors
 *
 * Person | Tokens | Prop | Commits | CommitProp
 * ilpo jarvinen | 43 | 95.56% | 1 | 33.33%
 * eric dumazet | 1 | 2.22% | 1 | 33.33%
 * florian westphal | 1 | 2.22% | 1 | 33.33%
 * Total | 45 | 100.00% | 3 | 100.00%
 */


/* Drop TCP_ECN_OK unless the header in @th has both ECE and CWR set. */
static void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th)
{
	if (!(tp->ecn_flags & TCP_ECN_OK))
		return;

	/* Both ECE and CWR set: leave TCP_ECN_OK in place. */
	if (th->ece && th->cwr)
		return;

	tp->ecn_flags &= ~TCP_ECN_OK;
}

/*
 * Contributors
 *
 * Person | Tokens | Prop | Commits | CommitProp
 * ilpo jarvinen | 44 | 95.65% | 1 | 33.33%
 * florian westphal | 1 | 2.17% | 1 | 33.33%
 * eric dumazet | 1 | 2.17% | 1 | 33.33%
 * Total | 46 | 100.00% | 3 | 100.00%
 */


/* Return true when @th carries ECE on a non-SYN segment and the
 * connection has TCP_ECN_OK.
 */
static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th)
{
	return th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK);
}

/*
 * Contributors
 *
 * Person | Tokens | Prop | Commits | CommitProp
 * ilpo jarvinen | 37 | 86.05% | 1 | 25.00%
 * eric dumazet | 5 | 11.63% | 2 | 50.00%
 * florian westphal | 1 | 2.33% | 1 | 25.00%
 * Total | 43 | 100.00% | 4 | 100.00%
 */

/* Buffer size and advertised window tuning. * * 1. Tuning sk->sk_sndbuf, when connection enters established state. */
static void tcp_sndbuf_expand(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); int sndmem, per_mss; u32 nr_segs; /* Worst case is non GSO/TSO : each frame consumes one skb * and skb->head is kmalloced using power of two area of memory */ per_mss = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) + MAX_TCP_HEADER + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); per_mss = roundup_pow_of_two(per_mss) + SKB_DATA_ALIGN(sizeof(struct sk_buff)); nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd); nr_segs = max_t(u32, nr_segs, tp->reordering + 1); /* Fast Recovery (RFC 5681 3.2) : * Cubic needs 1.7 factor, rounded to 2 to include * extra cushion (application might react slowly to POLLOUT) */ sndmem = 2 * nr_segs * per_mss; if (sk->sk_sndbuf < sndmem) sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]); }

/*
 * Contributors
 *
 * Person | Tokens | Prop | Commits | CommitProp
 * eric dumazet | 94 | 70.15% | 2 | 22.22%
 * pre-git | 31 | 23.13% | 3 | 33.33%
 * arnaldo carvalho de melo | 5 | 3.73% | 3 | 33.33%
 * david s. miller | 4 | 2.99% | 1 | 11.11%
 * Total | 134 | 100.00% | 9 | 100.00%
 */

/* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
 *
 * All tcp_full_space() is split to two parts: "network" buffer, allocated
 * forward and advertised in receiver window (tp->rcv_wnd) and
 * "application buffer", required to isolate scheduling/application
 * latencies from network.
 * window_clamp is maximal advertised window. It can be less than
 * tcp_full_space(), in this case tcp_full_space() - window_clamp
 * is reserved for "application" buffer. The less window_clamp is
 * the smoother our behaviour from viewpoint of network, but the lower
 * throughput and the higher sensitivity of the connection to losses. 8)
 *
 * rcv_ssthresh is more strict window_clamp used at "slow start"
 * phase to predict further behaviour of this connection.
 * It is used for two goals:
 * - to enforce header prediction at sender, even when application
 *   requires some significant "application buffer". It is check #1.
 * - to prevent pruning of receive queue because of misprediction
 *   of receiver window. Check #2.
 *
 * The scheme does not work when sender sends good segments opening
 * window and then starts to feed us spaghetti. But it should work
 * in common situations. Otherwise, we have to rely on queue collapsing.
 */

/* Slow part of check#2.
 *
 * Halves both the skb's window-equivalent truesize and the maximum window
 * while rcv_ssthresh is still within the window; returns 2 * rcv_mss as
 * soon as the scaled truesize fits within skb->len, otherwise 0.
 */
static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	/* Optimize this! */
	int overhead = tcp_win_from_space(skb->truesize) >> 1;
	int limit = tcp_win_from_space(sysctl_tcp_rmem[2]) >> 1;

	for (; tp->rcv_ssthresh <= limit; overhead >>= 1, limit >>= 1) {
		if (overhead <= skb->len)
			return 2 * inet_csk(sk)->icsk_ack.rcv_mss;
	}

	return 0;
}

/*
 * Contributors
 *
 * Person | Tokens | Prop | Commits | CommitProp
 * pre-git | 69 | 72.63% | 4 | 50.00%
 * ilpo jarvinen | 10 | 10.53% | 1 | 12.50%
 * arnaldo carvalho de melo | 7 | 7.37% | 1 | 12.50%
 * john heffner | 5 | 5.26% | 1 | 12.50%
 * eric dumazet | 4 | 4.21% | 1 | 12.50%
 * Total | 95 | 100.00% | 8 | 100.00%
 */


static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); /* Check #1 */ if (tp->rcv_ssthresh < tp->window_clamp && (int)tp->rcv_ssthresh < tcp_space(sk) && !tcp_under_memory_pressure(sk)) { int incr; /* Check #2. Increase window, if skb with such overhead * will fit to rcvbuf in future. */ if (tcp_win_from_space(skb->truesize) <= skb->len) incr = 2 * tp->advmss; else incr = __tcp_grow_window(sk, skb); if (incr) { incr = max_t(int, incr, 2 * skb->len); tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, tp->window_clamp); inet_csk(sk)->icsk_ack.quick |= 1; } } }

/*
 * Contributors
 *
 * Person | Tokens | Prop | Commits | CommitProp
 * pre-git | 106 | 74.65% | 6 | 46.15%
 * eric dumazet | 17 | 11.97% | 3 | 23.08%
 * ilpo jarvinen | 10 | 7.04% | 1 | 7.69%
 * arnaldo carvalho de melo | 5 | 3.52% | 1 | 7.69%
 * glauber costa | 3 | 2.11% | 1 | 7.69%
 * linus torvalds | 1 | 0.70% | 1 | 7.69%
 * Total | 142 | 100.00% | 13 | 100.00%
 */

/* 3. Tuning rcvbuf, when connection enters established state. */
static void tcp_fixup_rcvbuf(struct sock *sk) { u32 mss = tcp_sk(sk)->advmss; int rcvmem; rcvmem = 2 * SKB_TRUESIZE(mss + MAX_TCP_HEADER) * tcp_default_init_rwnd(mss); /* Dynamic Right Sizing (DRS) has 2 to 3 RTT latency * Allow enough cushion so that sender is not limited by our window */ if (sysctl_tcp_moderate_rcvbuf) rcvmem <<= 2; if (sk->sk_rcvbuf < rcvmem) sk->sk_rcvbuf = min(rcvmem, sysctl_tcp_rmem[2]); }

/*
 * Contributors
 *
 * Person | Tokens | Prop | Commits | CommitProp
 * pre-git | 36 | 50.70% | 4 | 30.77%
 * eric dumazet | 23 | 32.39% | 4 | 30.77%
 * yuchung cheng | 5 | 7.04% | 1 | 7.69%
 * david s. miller | 3 | 4.23% | 1 | 7.69%
 * arnaldo carvalho de melo | 3 | 4.23% | 2 | 15.38%
 * linus torvalds | 1 | 1.41% | 1 | 7.69%
 * Total | 71 | 100.00% | 13 | 100.00%
 */

/* 4. Try to fixup all. It is made immediately after connection enters * established state. */
void tcp_init_buffer_space(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); int maxwin; if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) tcp_fixup_rcvbuf(sk); if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK)) tcp_sndbuf_expand(sk); tp->rcvq_space.space = tp->rcv_wnd; tp->rcvq_space.time = tcp_time_stamp; tp->rcvq_space.seq = tp->copied_seq; maxwin = tcp_full_space(sk); if (tp->window_clamp >= maxwin) { tp->window_clamp = maxwin; if (sysctl_tcp_app_win && maxwin > 4 * tp->advmss) tp->window_clamp = max(maxwin - (maxwin >> sysctl_tcp_app_win), 4 * tp->advmss); } /* Force reservation of one segment. */ if (sysctl_tcp_app_win && tp->window_clamp > 2 * tp->advmss && tp->window_clamp + tp->advmss > maxwin) tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss); tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp); tp->snd_cwnd_stamp = tcp_time_stamp; }

/*
 * Contributors
 *
 * Person | Tokens | Prop | Commits | CommitProp
 * pre-git | 166 | 81.37% | 8 | 53.33%
 * eric dumazet | 19 | 9.31% | 2 | 13.33%
 * david s. miller | 13 | 6.37% | 2 | 13.33%
 * arnaldo carvalho de melo | 3 | 1.47% | 2 | 13.33%
 * linus torvalds | 3 | 1.47% | 1 | 6.67%
 * Total | 204 | 100.00% | 15 | 100.00%
 */

/* 5. Recalculate window clamp after socket hit its memory bounds. */
static void tcp_clamp_window(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); icsk->icsk_ack.quick = 0; if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && !tcp_under_memory_pressure(sk) && sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) { sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), sysctl_tcp_rmem[2]); } if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss); }

/*
 * Contributors
 *
 * Person | Tokens | Prop | Commits | CommitProp
 * pre-git | 87 | 66.92% | 3 | 23.08%
 * arnaldo carvalho de melo | 20 | 15.38% | 5 | 38.46%
 * glauber costa | 10 | 7.69% | 1 | 7.69%
 * ilpo jarvinen | 8 | 6.15% | 1 | 7.69%
 * linus torvalds | 3 | 2.31% | 1 | 7.69%
 * john heffner | 1 | 0.77% | 1 | 7.69%
 * eric dumazet | 1 | 0.77% | 1 | 7.69%
 * Total | 130 | 100.00% | 13 | 100.00%
 */

/* Initialize RCV_MSS value.
 * RCV_MSS is our guess about the MSS used by the peer.
 * We do not have any direct information about the MSS.
 * It is better to underestimate the RCV_MSS than to overestimate it.
 * Overestimations make us ACK less frequently than needed.
 * Underestimations are easier to detect and fix by tcp_measure_rcv_mss().
 *
 * The guess is the smallest of advmss, mss_cache, rcv_wnd / 2 and
 * TCP_MSS_DEFAULT, but never below TCP_MIN_MSS.
 */
void tcp_initialize_rcv_mss(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	unsigned int mss_guess;

	mss_guess = min_t(unsigned int, tp->advmss, tp->mss_cache);
	mss_guess = min(mss_guess, tp->rcv_wnd / 2);
	mss_guess = min(mss_guess, TCP_MSS_DEFAULT);
	mss_guess = max(mss_guess, TCP_MIN_MSS);

	inet_csk(sk)->icsk_ack.rcv_mss = mss_guess;
}

/*
 * Contributors
 *
 * Person | Tokens | Prop | Commits | CommitProp
 * stephen hemminger | 74 | 97.37% | 1 | 33.33%
 * william allen simpson | 1 | 1.32% | 1 | 33.33%
 * eric dumazet | 1 | 1.32% | 1 | 33.33%
 * Total | 76 | 100.00% | 3 | 100.00%
 */

EXPORT_SYMBOL(tcp_initialize_rcv_mss);

/* Receiver "autotuning" code.
 *
 * The algorithm for RTT estimation w/o timestamps is based on
 * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL.
 * <http://public.lanl.gov/radiant/pubs.html#DRS>
 *
 * More detail on this code can be found at
 * <http://staff.psc.edu/jheffner/>,
 * though this reference is out of date.  A new paper
 * is pending.
 */

/* Fold one RTT @sample into tp->rcv_rtt_est.rtt, which is kept scaled
 * by 8 (see the shifts by 3 below). A zero sample is treated as 1.
 *
 * @win_dep: non-zero keeps only the minimum; zero applies 7/8 smoothing.
 */
static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
{
	u32 est = tp->rcv_rtt_est.rtt;
	long m = sample;

	if (m == 0)
		m = 1;

	if (est == 0) {
		/* No previous measure. */
		est = m << 3;
	} else if (win_dep) {
		/* If we sample in larger samples in the non-timestamp
		 * case, we could grossly overestimate the RTT especially
		 * with chatty applications or bulk transfer apps which
		 * are stalled on filesystem I/O.
		 *
		 * Also, since we are only going for a minimum in the
		 * non-timestamp case, we do not smooth things out
		 * else with timestamps disabled convergence takes too
		 * long.
		 */
		m <<= 3;
		if (m < est)
			est = m;
	} else {
		m -= (est >> 3);
		est += m;
	}

	/* Store only when the estimate actually changed. */
	if (tp->rcv_rtt_est.rtt != est)
		tp->rcv_rtt_est.rtt = est;
}

/*
 * Contributors
 *
 * Person | Tokens | Prop | Commits | CommitProp
 * david s. miller | 104 | 91.23% | 1 | 20.00%
 * neal cardwell | 7 | 6.14% | 1 | 20.00%
 * stephen hemminger | 2 | 1.75% | 2 | 40.00%
 * arnaldo carvalho de melo | 1 | 0.88% | 1 | 20.00%
 * Total | 114 | 100.00% | 5 | 100.00%
 */


/* Window-based receiver RTT sampling: once rcv_nxt has reached the
 * sequence recorded at the start of the measurement, feed the elapsed time
 * to tcp_rcv_rtt_update() (win_dep = 1) and start a new measurement one
 * receive window ahead.
 */
static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
{
	if (tp->rcv_rtt_est.time != 0) {
		/* Measurement in progress and not yet complete. */
		if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq))
			return;

		tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rcv_rtt_est.time, 1);
	}

	/* Begin a new measurement. */
	tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd;
	tp->rcv_rtt_est.time = tcp_time_stamp;
}

/*
 * Contributors
 *
 * Person | Tokens | Prop | Commits | CommitProp
 * david s. miller | 78 | 97.50% | 1 | 33.33%
 * arnaldo carvalho de melo | 1 | 1.25% | 1 | 33.33%
 * neal cardwell | 1 | 1.25% | 1 | 33.33%
 * Total | 80 | 100.00% | 3 | 100.00%
 */


/* Timestamp-based receiver RTT sampling: when a timestamp echo is present
 * and the segment spans at least rcv_mss of sequence space, feed
 * (now - rcv_tsecr) to tcp_rcv_rtt_update() with win_dep = 0.
 */
static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, const struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!tp->rx_opt.rcv_tsecr)
		return;

	if (TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq <
	    inet_csk(sk)->icsk_ack.rcv_mss)
		return;

	tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rx_opt.rcv_tsecr, 0);
}

/*
 * Contributors
 *
 * Person | Tokens | Prop | Commits | CommitProp
 * david s. miller | 54 | 71.05% | 1 | 33.33%
 * arnaldo carvalho de melo | 22 | 28.95% | 2 | 66.67%
 * Total | 76 | 100.00% | 3 | 100.00%
 */

/* * This function should be called every time data is copied to user space. * It calculates the appropriate TCP receive buffer space. */
void tcp_rcv_space_adjust(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); int time; int copied; time = tcp_time_stamp - tp->rcvq_space.time; if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0) return; /* Number of bytes copied to user in last RTT */ copied = tp->copied_seq - tp->rcvq_space.seq; if (copied <= tp->rcvq_space.space) goto new_measure; /* A bit of theory : * copied = bytes received in previous RTT, our base window * To cope with packet losses, we need a 2x factor * To cope with slow start, and sender growing its cwin by 100 % * every RTT, we need a 4x factor, because the ACK we are sending * now is for the next RTT, not the current one : * <prev RTT . ><current RTT .. ><next RTT .... > */ if (sysctl_tcp_moderate_rcvbuf && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { int rcvwin, rcvmem, rcvbuf; /* minimal window to cope with packet losses, assuming * steady state. Add some cushion because of small variations. */ rcvwin = (copied << 1) + 16 * tp->advmss; /* If rate increased by 25%, * assume slow start, rcvwin = 3 * copied * If rate increased by 50%, * assume sender can use 2x growth, rcvwin = 4 * copied */ if (copied >= tp->rcvq_space.space + (tp->rcvq_space.space >> 2)) { if (copied >= tp->rcvq_space.space + (tp->rcvq_space.space >> 1)) rcvwin <<= 1; else rcvwin += (rcvwin >> 1); } rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER); while (tcp_win_from_space(rcvmem) < tp->advmss) rcvmem += 128; rcvbuf = min(rcvwin / tp->advmss * rcvmem, sysctl_tcp_rmem[2]); if (rcvbuf > sk->sk_rcvbuf) { sk->sk_rcvbuf = rcvbuf; /* Make the window clamp follow along. */ tp->window_clamp = rcvwin; } } tp->rcvq_space.space = copied; new_measure: tp->rcvq_space.seq = tp->copied_seq; tp->rcvq_space.time = tcp_time_stamp; }

/*
 * Contributors
 *
 * Person | Tokens | Prop | Commits | CommitProp
 * david s. miller | 168 | 61.31% | 3 | 37.50%
 * eric dumazet | 95 | 34.67% | 2 | 25.00%
 * john heffner | 9 | 3.28% | 1 | 12.50%
 * arnaldo carvalho de melo | 2 | 0.73% | 2 | 25.00%
 * Total | 274 | 100.00% | 8 | 100.00%
 */

/* There is something which you must keep in mind when you analyze the * behavior of the tp->ato delayed ack timeout interval. When a * connection starts up, we want to ack as quickly as possible. The * problem is that "good" TCP's do slow start at the beginning of data * transmission. The means that until we send the first few ACK's the * sender will sit on his end and only queue most of his data, because * he can only send snd_cwnd unacked packets at any given time. For * each ACK we send, he increments snd_cwnd and transmits more of his * queue. -DaveM */
static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); u32 now; inet_csk_schedule_ack(sk); tcp_measure_rcv_mss(sk, skb); tcp_rcv_rtt_measure(tp); now = tcp_time_stamp; if (!icsk->icsk_ack.ato) { /* The _first_ data packet received, initialize * delayed ACK engine. */ tcp_incr_quickack(sk); icsk->icsk_ack.ato = TCP_ATO_MIN; } else { int m = now - icsk->icsk_ack.lrcvtime; if (m <= TCP_ATO_MIN / 2) { /* The fastest case is the first. */ icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + TCP_ATO_MIN / 2; } else if (m < icsk->icsk_ack.ato) { icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + m; if (icsk->icsk_ack.ato > icsk->icsk_rto) icsk->icsk_ack.ato = icsk->icsk_rto; } else if (m > icsk->icsk_rto) { /* Too long gap. Apparently sender failed to * restart window, so that we send ACKs quickly. */ tcp_incr_quickack(sk); sk_mem_reclaim(sk); } } icsk->icsk_ack.lrcvtime = now; tcp_ecn_check_ce(tp, skb); if (skb->len >= 128) tcp_grow_window(sk, skb); }

/*
 * Contributors
 *
 * Person | Tokens | Prop | Commits | CommitProp
 * pre-git | 167 | 71.06% | 11 | 61.11%
 * arnaldo carvalho de melo | 43 | 18.30% | 1 | 5.56%
 * ilpo jarvinen | 10 | 4.26% | 1 | 5.56%
 * linus torvalds | 7 | 2.98% | 1 | 5.56%
 * david s. miller | 5 | 2.13% | 1 | 5.56%
 * stephen hemminger | 1 | 0.43% | 1 | 5.56%
 * florian westphal | 1 | 0.43% | 1 | 5.56%
 * hideo aoki | 1 | 0.43% | 1 | 5.56%
 * Total | 235 | 100.00% | 18 | 100.00%
 */

/* Called to compute a smoothed rtt estimate. The data fed to this * routine either comes from timestamps, or from segments that were * known _not_ to have been retransmitted [see Karn/Partridge * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88 * piece by Van Jacobson. * NOTE: the next three routines used to be one big routine. * To save cycles in the RFC 1323 implementation it was better to break * it up into three procedures. -- erics */
static void tcp_rtt_estimator(struct sock *sk, long mrtt_us) { struct tcp_sock *tp = tcp_sk(sk); long m = mrtt_us; /* RTT */ u32 srtt = tp->srtt_us; /* The following amusing code comes from Jacobson's * article in SIGCOMM '88. Note that rtt and mdev * are scaled versions of rtt and mean deviation. * This is designed to be as fast as possible * m stands for "measurement". * * On a 1990 paper the rto value is changed to: * RTO = rtt + 4 * mdev * * Funny. This algorithm seems to be very broken. * These formulae increase RTO, when it should be decreased, increase * too slowly, when it should be increased quickly, decrease too quickly * etc. I guess in BSD RTO takes ONE value, so that it is absolutely * does not matter how to _calculate_ it. Seems, it was trap * that VJ failed to avoid. 8) */ if (srtt != 0) { m -= (srtt >> 3); /* m is now error in rtt est */ srtt += m; /* rtt = 7/8 rtt + 1/8 new */ if (m < 0) { m = -m; /* m is now abs(error) */ m -= (tp->mdev_us >> 2); /* similar update on mdev */ /* This is similar to one of Eifel findings. * Eifel blocks mdev updates when rtt decreases. * This solution is a bit different: we use finer gain * for mdev in this case (alpha*beta). * Like Eifel it also prevents growth of rto, * but also it limits too fast rto decreases, * happening in pure Eifel. */ if (m > 0) m >>= 3; } else { m -= (tp->mdev_us >> 2); /* similar update on mdev */ } tp->mdev_us += m; /* mdev = 3/4 mdev + 1/4 new */ if (tp->mdev_us > tp->mdev_max_us) { tp->mdev_max_us = tp->mdev_us; if (tp->mdev_max_us > tp->rttvar_us) tp->rttvar_us = tp->mdev_max_us; } if (after(tp->snd_una, tp->rtt_seq)) { if (tp->mdev_max_us < tp->rttvar_us) tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2; tp->rtt_seq = tp->snd_nxt;