Release 4.8 net/ipv4/inet_connection_sock.c
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              Support for INET connection oriented protocols.
 *
 * Authors:     See the TCP sources
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#include <linux/module.h>
#include <linux/jhash.h>

#include <net/inet_connection_sock.h>
#include <net/inet_hashtables.h>
#include <net/inet_timewait_sock.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/tcp_states.h>
#include <net/xfrm.h>
#include <net/tcp.h>
#include <net/sock_reuseport.h>

#ifdef INET_CSK_DEBUG

const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";

EXPORT_SYMBOL(inet_csk_timer_bug_msg);
#endif


void inet_get_local_port_range(struct net *net, int *low, int *high)
{
	unsigned int seq;

	do {
		seq = read_seqbegin(&net->ipv4.ip_local_ports.lock);

		*low = net->ipv4.ip_local_ports.range[0];
		*high = net->ipv4.ip_local_ports.range[1];
	} while (read_seqretry(&net->ipv4.ip_local_ports.lock, seq));
}

Contributors

Person                     Tokens     Prop  Commits  CommitProp
stephen hemminger              50   59.52%        1      20.00%
eric w. biederman              21   25.00%        1      20.00%
eric dumazet                    9   10.71%        2      40.00%
americo wang                    4    4.76%        1      20.00%
Total                          84  100.00%        5     100.00%

EXPORT_SYMBOL(inet_get_local_port_range);
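
The seqlock loop above gives readers a consistent {low, high} snapshot without blocking writers that retune the range. The same range is visible from userspace through the ip_local_port_range sysctl; here is a minimal sketch (the procfs path is the standard location, error handling kept short):

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/net/ipv4/ip_local_port_range", "r");
	int low, high;

	if (!f) {
		perror("open ip_local_port_range");
		return 1;
	}
	if (fscanf(f, "%d %d", &low, &high) != 2) {
		fclose(f);
		return 1;
	}
	fclose(f);
	/* This pair is what inet_get_local_port_range() returns in-kernel. */
	printf("ephemeral port range: [%d, %d]\n", low, high);
	return 0;
}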
int inet_csk_bind_conflict(const struct sock *sk,
			   const struct inet_bind_bucket *tb, bool relax)
{
	struct sock *sk2;
	int reuse = sk->sk_reuse;
	int reuseport = sk->sk_reuseport;
	kuid_t uid = sock_i_uid((struct sock *)sk);

	/*
	 * Unlike other sk lookup places we do not check
	 * for sk_net here, since _all_ the socks listed
	 * in tb->owners list belong to the same net - the
	 * one this bucket belongs to.
	 */

	sk_for_each_bound(sk2, &tb->owners) {
		if (sk != sk2 &&
		    !inet_v6_ipv6only(sk2) &&
		    (!sk->sk_bound_dev_if ||
		     !sk2->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
			if ((!reuse || !sk2->sk_reuse ||
			    sk2->sk_state == TCP_LISTEN) &&
			    (!reuseport || !sk2->sk_reuseport ||
			     rcu_access_pointer(sk->sk_reuseport_cb) ||
			     (sk2->sk_state != TCP_TIME_WAIT &&
			     !uid_eq(uid, sock_i_uid(sk2))))) {
				if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr ||
				    sk2->sk_rcv_saddr == sk->sk_rcv_saddr)
					break;
			}
			if (!relax && reuse && sk2->sk_reuse &&
			    sk2->sk_state != TCP_LISTEN) {
				if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr ||
				    sk2->sk_rcv_saddr == sk->sk_rcv_saddr)
					break;
			}
		}
	}
	return sk2 != NULL;
}

Contributors

Person                     Tokens     Prop  Commits  CommitProp
arnaldo carvalho de melo      100   46.08%        2      20.00%
tom herbert                    51   23.50%        1      10.00%
alex copot                     33   15.21%        1      10.00%
eric dumazet                   22   10.14%        2      20.00%
craig gallek                    7    3.23%        1      10.00%
david s. miller                 2    0.92%        1      10.00%
sasha levin                     1    0.46%        1      10.00%
pavel emelianov                 1    0.46%        1      10.00%
Total                         217  100.00%       10     100.00%

EXPORT_SYMBOL_GPL(inet_csk_bind_conflict);
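
The conflict walk above is what makes SO_REUSEPORT binds succeed: two sockets owned by the same UID, both with sk_reuseport set and no reuseport_cb attached, do not conflict. A hedged userspace sketch of that case (port 8080 is an arbitrary pick):

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

/* Bind a TCP socket to the given port with SO_REUSEPORT; a second
 * socket from the same UID doing the same passes inet_csk_bind_conflict(). */
static int bound_reuseport_socket(uint16_t port)
{
	int one = 1;
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	struct sockaddr_in addr;

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_ANY);
	addr.sin_port = htons(port);

	if (fd < 0 ||
	    setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one)) < 0 ||
	    bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
		return -1;
	return fd;
}

int main(void)
{
	int a = bound_reuseport_socket(8080);
	int b = bound_reuseport_socket(8080);	/* no EADDRINUSE expected */

	printf("first bind: %s, second bind: %s\n",
	       a >= 0 ? "ok" : "failed", b >= 0 ? "ok" : "failed");
	if (a >= 0)
		close(a);
	if (b >= 0)
		close(b);
	return 0;
}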
/* Obtain a reference to a local port for the given sock,
 * if snum is zero it means select any available local port.
 * We try to allocate an odd port (and leave even ports for connect())
 */
int inet_csk_get_port(struct sock *sk, unsigned short snum)
{
	bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
	struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
	int ret = 1, attempts = 5, port = snum;
	int smallest_size = -1, smallest_port;
	struct inet_bind_hashbucket *head;
	struct net *net = sock_net(sk);
	int i, low, high, attempt_half;
	struct inet_bind_bucket *tb;
	kuid_t uid = sock_i_uid(sk);
	u32 remaining, offset;

	if (port) {
have_port:
		head = &hinfo->bhash[inet_bhashfn(net, port,
						  hinfo->bhash_size)];
		spin_lock_bh(&head->lock);
		inet_bind_bucket_for_each(tb, &head->chain)
			if (net_eq(ib_net(tb), net) && tb->port == port)
				goto tb_found;

		goto tb_not_found;
	}
again:
	attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
other_half_scan:
	inet_get_local_port_range(net, &low, &high);
	high++; /* [32768, 60999] -> [32768, 61000[ */
	if (high - low < 4)
		attempt_half = 0;
	if (attempt_half) {
		int half = low + (((high - low) >> 2) << 1);

		if (attempt_half == 1)
			high = half;
		else
			low = half;
	}
	remaining = high - low;
	if (likely(remaining > 1))
		remaining &= ~1U;

	offset = prandom_u32() % remaining;
	/* __inet_hash_connect() favors ports having @low parity
	 * We do the opposite to not pollute connect() users.
	 */
	offset |= 1U;
	smallest_size = -1;
	smallest_port = low; /* avoid compiler warning */

other_parity_scan:
	port = low + offset;
	for (i = 0; i < remaining; i += 2, port += 2) {
		if (unlikely(port >= high))
			port -= remaining;
		if (inet_is_local_reserved_port(net, port))
			continue;
		head = &hinfo->bhash[inet_bhashfn(net, port,
						  hinfo->bhash_size)];
		spin_lock_bh(&head->lock);
		inet_bind_bucket_for_each(tb, &head->chain)
			if (net_eq(ib_net(tb), net) && tb->port == port) {
				if (((tb->fastreuse > 0 && reuse) ||
				     (tb->fastreuseport > 0 &&
				      sk->sk_reuseport &&
				      !rcu_access_pointer(sk->sk_reuseport_cb) &&
				      uid_eq(tb->fastuid, uid))) &&
				    (tb->num_owners < smallest_size || smallest_size == -1)) {
					smallest_size = tb->num_owners;
					smallest_port = port;
				}
				if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false))
					goto tb_found;
				goto next_port;
			}
		goto tb_not_found;
next_port:
		spin_unlock_bh(&head->lock);
		cond_resched();
	}

	if (smallest_size != -1) {
		port = smallest_port;
		goto have_port;
	}
	offset--;
	if (!(offset & 1))
		goto other_parity_scan;

	if (attempt_half == 1) {
		/* OK we now try the upper half of the range */
		attempt_half = 2;
		goto other_half_scan;
	}
	return ret;

tb_not_found:
	tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
				     net, head, port);
	if (!tb)
		goto fail_unlock;
tb_found:
	if (!hlist_empty(&tb->owners)) {
		if (sk->sk_reuse == SK_FORCE_REUSE)
			goto success;

		if (((tb->fastreuse > 0 && reuse) ||
		     (tb->fastreuseport > 0 &&
		      !rcu_access_pointer(sk->sk_reuseport_cb) &&
		      sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
		    smallest_size == -1)
			goto success;
		if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
			if ((reuse ||
			     (tb->fastreuseport > 0 &&
			      sk->sk_reuseport &&
			      !rcu_access_pointer(sk->sk_reuseport_cb) &&
			      uid_eq(tb->fastuid, uid))) &&
			    smallest_size != -1 && --attempts >= 0) {
				spin_unlock_bh(&head->lock);
				goto again;
			}
			goto fail_unlock;
		}
		if (!reuse)
			tb->fastreuse = 0;
		if (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid))
			tb->fastreuseport = 0;
	} else {
		tb->fastreuse = reuse;
		if (sk->sk_reuseport) {
			tb->fastreuseport = 1;
			tb->fastuid = uid;
		} else {
			tb->fastreuseport = 0;
		}
	}
success:
	if (!inet_csk(sk)->icsk_bind_hash)
		inet_bind_hash(sk, tb, port);
	WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
	ret = 0;

fail_unlock:
	spin_unlock_bh(&head->lock);
	return ret;
}

Contributors

Person                     Tokens     Prop  Commits  CommitProp
eric dumazet                  344   39.86%        3      10.34%
arnaldo carvalho de melo      202   23.41%        3      10.34%
tom herbert                   110   12.75%        2       6.90%
evgeniy polyakov               88   10.20%        1       3.45%
pavel emelianov                29    3.36%        4      13.79%
craig gallek                   24    2.78%        2       6.90%
stephen hemminger              21    2.43%        3      10.34%
flavio leitner                 20    2.32%        2       6.90%
americo wang                    8    0.93%        2       6.90%
octavian purdila                4    0.46%        1       3.45%
alex copot                      4    0.46%        1       3.45%
hideaki yoshifuji               3    0.35%        1       3.45%
eric w. biederman               2    0.23%        1       3.45%
ilpo jarvinen                   2    0.23%        1       3.45%
aruna-hewapathirane             1    0.12%        1       3.45%
anton arapov                    1    0.12%        1       3.45%
Total                         863  100.00%       29     100.00%

EXPORT_SYMBOL_GPL(inet_csk_get_port);
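
Passing snum == 0, as bind() with sin_port = 0 does, sends inet_csk_get_port() into the random odd-port search above. A small userspace sketch showing the kernel-chosen port via getsockname():

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	struct sockaddr_in addr;
	socklen_t len = sizeof(addr);

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
	addr.sin_port = 0;	/* snum == 0: kernel searches the local range */

	if (fd < 0 ||
	    bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
	    getsockname(fd, (struct sockaddr *)&addr, &len) < 0) {
		perror("bind");
		return 1;
	}
	/* Per the comment above, bind() favors odd ports and leaves
	 * even ones for connect(). */
	printf("kernel chose port %d\n", ntohs(addr.sin_port));
	close(fd);
	return 0;
}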
/*
 * Wait for an incoming connection, avoid race conditions. This must be called
 * with the socket locked.
 */
static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	DEFINE_WAIT(wait);
	int err;

	/*
	 * True wake-one mechanism for incoming connections: only
	 * one process gets woken up, not the 'whole herd'.
	 * Since we do not 'race & poll' for established sockets
	 * anymore, the common case will execute the loop only once.
	 *
	 * Subtle issue: "add_wait_queue_exclusive()" will be added
	 * after any current non-exclusive waiters, and we know that
	 * it will always _stay_ after any new non-exclusive waiters
	 * because all non-exclusive waiters are added at the
	 * beginning of the wait-queue. As such, it's ok to "drop"
	 * our exclusiveness temporarily when we get woken up without
	 * having to remove and re-insert us on the wait queue.
	 */
	for (;;) {
		prepare_to_wait_exclusive(sk_sleep(sk), &wait,
					  TASK_INTERRUPTIBLE);
		release_sock(sk);
		if (reqsk_queue_empty(&icsk->icsk_accept_queue))
			timeo = schedule_timeout(timeo);
		sched_annotate_sleep();
		lock_sock(sk);
		err = 0;
		if (!reqsk_queue_empty(&icsk->icsk_accept_queue))
			break;
		err = -EINVAL;
		if (sk->sk_state != TCP_LISTEN)
			break;
		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			break;
		err = -EAGAIN;
		if (!timeo)
			break;
	}
	finish_wait(sk_sleep(sk), &wait);
	return err;
}

Contributors

Person                     Tokens     Prop  Commits  CommitProp
arnaldo carvalho de melo      144   94.12%        1      33.33%
eric dumazet                    9    5.88%        2      66.67%
Total                         153  100.00%        3     100.00%

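The timeo handed to inet_csk_wait_for_connect() comes from sock_rcvtimeo() in inet_csk_accept() below, so SO_RCVTIMEO on a listener bounds how long accept() blocks. A minimal sketch (the 2 s timeout is arbitrary):

#include <arpa/inet.h>
#include <errno.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	struct sockaddr_in addr;
	struct timeval tv = { .tv_sec = 2 };	/* becomes 'timeo' in-kernel */

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);

	if (fd < 0 ||
	    bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
	    listen(fd, 16) < 0 ||
	    setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) < 0) {
		perror("setup");
		return 1;
	}
	/* With no pending connection, the wait loop above runs out of
	 * timeo and accept() fails with EAGAIN/EWOULDBLOCK. */
	if (accept(fd, NULL, NULL) < 0 &&
	    (errno == EAGAIN || errno == EWOULDBLOCK))
		printf("accept timed out after 2s\n");
	close(fd);
	return 0;
}
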
/*
 * This will accept the next outstanding connection.
 */
struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
	struct request_sock *req;
	struct sock *newsk;
	int error;

	lock_sock(sk);

	/* We need to make sure that this socket is listening,
	 * and that it has something pending.
	 */
	error = -EINVAL;
	if (sk->sk_state != TCP_LISTEN)
		goto out_err;

	/* Find already established connection */
	if (reqsk_queue_empty(queue)) {
		long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);

		/* If this is a non blocking socket don't sleep */
		error = -EAGAIN;
		if (!timeo)
			goto out_err;

		error = inet_csk_wait_for_connect(sk, timeo);
		if (error)
			goto out_err;
	}
	req = reqsk_queue_remove(queue, sk);
	newsk = req->sk;

	if (sk->sk_protocol == IPPROTO_TCP &&
	    tcp_rsk(req)->tfo_listener) {
		spin_lock_bh(&queue->fastopenq.lock);
		if (tcp_rsk(req)->tfo_listener) {
			/* We are still waiting for the final ACK from 3WHS
			 * so can't free req now. Instead, we set req->sk to
			 * NULL to signify that the child socket is taken
			 * so reqsk_fastopen_remove() will free the req
			 * when 3WHS finishes (or is aborted).
			 */
			req->sk = NULL;
			req = NULL;
		}
		spin_unlock_bh(&queue->fastopenq.lock);
	}
out:
	release_sock(sk);
	if (req)
		reqsk_put(req);
	return newsk;
out_err:
	newsk = NULL;
	req = NULL;
	*err = error;
	goto out;
}

Contributors

Person                     Tokens     Prop  Commits  CommitProp
arnaldo carvalho de melo      143   60.34%        1      12.50%
jerry chu                      76   32.07%        1      12.50%
eric dumazet                   18    7.59%        6      75.00%
Total                         237  100.00%        8     100.00%

EXPORT_SYMBOL(inet_csk_accept);
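
The fastopenq handling in inet_csk_accept() only triggers for TCP Fast Open listeners. A userspace sketch of creating one (queue depth 16 and port 8080 are arbitrary; the net.ipv4.tcp_fastopen sysctl must also enable server-side TFO):

#include <arpa/inet.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	int qlen = 16;	/* max pending TFO requests */
	struct sockaddr_in addr;

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_ANY);
	addr.sin_port = htons(8080);

	if (fd < 0 ||
	    bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
	    setsockopt(fd, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen)) < 0 ||
	    listen(fd, 128) < 0) {
		perror("tfo listener");
		return 1;
	}
	/* Children accept()ed off this listener may still be mid-3WHS;
	 * the kernel defers freeing the req until the final ACK. */
	printf("TFO listener ready on port 8080\n");
	close(fd);
	return 0;
}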
/*
 * Using different timers for retransmit, delayed acks and probes
 * We may wish to use just one timer maintaining a list of expire jiffies
 * to optimize.
 */
void inet_csk_init_xmit_timers(struct sock *sk,
			       void (*retransmit_handler)(unsigned long),
			       void (*delack_handler)(unsigned long),
			       void (*keepalive_handler)(unsigned long))
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	setup_timer(&icsk->icsk_retransmit_timer, retransmit_handler,
			(unsigned long)sk);
	setup_timer(&icsk->icsk_delack_timer, delack_handler,
			(unsigned long)sk);
	setup_timer(&sk->sk_timer, keepalive_handler, (unsigned long)sk);
	icsk->icsk_pending = icsk->icsk_ack.pending = 0;
}

Contributors

Person                     Tokens     Prop  Commits  CommitProp
arnaldo carvalho de melo       88   80.00%        1      50.00%
pavel emelianov                22   20.00%        1      50.00%
Total                         110  100.00%        2     100.00%

EXPORT_SYMBOL(inet_csk_init_xmit_timers);
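
For context, a hedged kernel-side sketch of how these timers are re-armed: sk_reset_timer() pins the socket with a reference before mod_timer(). The handler and wrapper names are hypothetical; the API shown is the 4.8-era unsigned-long timer interface used above:

#include <net/inet_connection_sock.h>
#include <net/sock.h>

/* Hypothetical handler: the 'data' cookie stored by setup_timer() is the
 * socket pointer, mirroring the handlers wired up above. Real handlers
 * (e.g. tcp_write_timer) drop the reference sk_reset_timer() took. */
static void example_retransmit_handler(unsigned long data)
{
	struct sock *sk = (struct sock *)data;

	/* ... retransmission work would go here ... */
	sock_put(sk);
}

/* Hypothetical wrapper: arm the retransmit timer one second out. */
static void example_arm(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	setup_timer(&icsk->icsk_retransmit_timer,
		    example_retransmit_handler, (unsigned long)sk);
	sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + HZ);
}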
void inet_csk_clear_xmit_timers(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	icsk->icsk_pending = icsk->icsk_ack.pending = icsk->icsk_ack.blocked = 0;

	sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
	sk_stop_timer(sk, &icsk->icsk_delack_timer);
	sk_stop_timer(sk, &sk->sk_timer);
}

Contributors

Person                     Tokens     Prop  Commits  CommitProp
arnaldo carvalho de melo       68  100.00%        1     100.00%
Total                          68  100.00%        1     100.00%

EXPORT_SYMBOL(inet_csk_clear_xmit_timers);
void inet_csk_delete_keepalive_timer(struct sock *sk)
{
	sk_stop_timer(sk, &sk->sk_timer);
}

Contributors

Person                     Tokens     Prop  Commits  CommitProp
arnaldo carvalho de melo       20  100.00%        1     100.00%
Total                          20  100.00%        1     100.00%

EXPORT_SYMBOL(inet_csk_delete_keepalive_timer);
void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len)
{
	sk_reset_timer(sk, &sk->sk_timer, jiffies + len);
}

Contributors

Person                     Tokens     Prop  Commits  CommitProp
arnaldo carvalho de melo       28  100.00%        1     100.00%
Total                          28  100.00%        1     100.00%

EXPORT_SYMBOL(inet_csk_reset_keepalive_timer);
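
TCP's keepalive machinery re-arms sk_timer through this helper; from userspace the corresponding knobs are SO_KEEPALIVE plus the TCP_KEEPIDLE/TCP_KEEPINTVL/TCP_KEEPCNT options. A minimal sketch (the interval values are arbitrary):

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	int on = 1, idle = 60, intvl = 10, cnt = 5;

	if (fd < 0 ||
	    setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on)) < 0 ||
	    setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle)) < 0 ||
	    setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl)) < 0 ||
	    setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt)) < 0) {
		perror("keepalive");
		return 1;
	}
	printf("keepalive: probe after %ds idle, every %ds, %d probes\n",
	       idle, intvl, cnt);
	close(fd);
	return 0;
}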
struct dst_entry *inet_csk_route_req(const struct sock *sk,
				     struct flowi4 *fl4,
				     const struct request_sock *req)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct net *net = read_pnet(&ireq->ireq_net);
	struct ip_options_rcu *opt = ireq->opt;
	struct rtable *rt;

	flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark,
			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
			   sk->sk_protocol, inet_sk_flowi_flags(sk),
			   (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
			   ireq->ir_loc_addr, ireq->ir_rmt_port,
			   htons(ireq->ir_num));
	security_req_classify_flow(req, flowi4_to_flowi(fl4));
	rt = ip_route_output_flow(net, fl4, sk);
	if (IS_ERR(rt))
		goto no_route;
	if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway)
		goto route_err;
	return &rt->dst;

route_err:
	ip_rt_put(rt);
no_route:
	__IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
	return NULL;
}

Contributors

Person                     Tokens     Prop  Commits  CommitProp
arnaldo carvalho de melo       83   41.71%        1       5.00%
david s. miller                47   23.62%        4      20.00%
eric dumazet                   38   19.10%        6      30.00%
ilpo jarvinen                  16    8.04%        1       5.00%
venkat yekkirala                6    3.02%        2      10.00%
pavel emelianov                 4    2.01%        2      10.00%
lorenzo colitti                 2    1.01%        1       5.00%
julian anastasov                1    0.50%        1       5.00%
atis elsts                      1    0.50%        1       5.00%
denis v. lunev                  1    0.50%        1       5.00%
Total                         199  100.00%       20     100.00%

EXPORT_SYMBOL_GPL(inet_csk_route_req);
struct dst_entry *inet_csk_route_child_sock(const struct sock *sk,
					    struct sock *newsk,
					    const struct request_sock *req)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct net *net = read_pnet(&ireq->ireq_net);
	struct inet_sock *newinet = inet_sk(newsk);
	struct ip_options_rcu *opt;
	struct flowi4 *fl4;
	struct rtable *rt;

	fl4 = &newinet->cork.fl.u.ip4;

	rcu_read_lock();
	opt = rcu_dereference(newinet->inet_opt);
	flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark,
			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
			   sk->sk_protocol, inet_sk_flowi_flags(sk),
			   (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
			   ireq->ir_loc_addr, ireq->ir_rmt_port,
			   htons(ireq->ir_num));
	security_req_classify_flow(req, flowi4_to_flowi(fl4));
	rt = ip_route_output_flow(net, fl4, sk);
	if (IS_ERR(rt))
		goto no_route;
	if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway)
		goto route_err;
	rcu_read_unlock();
	return &rt->dst;

route_err:
	ip_rt_put(rt);
no_route:
	rcu_read_unlock();
	__IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
	return NULL;
}

Contributors

Person                     Tokens     Prop  Commits  CommitProp
david s. miller               196   81.33%        1      12.50%
eric dumazet                   25   10.37%        4      50.00%
christoph paasch               18    7.47%        1      12.50%
lorenzo colitti                 1    0.41%        1      12.50%
julian anastasov                1    0.41%        1      12.50%
Total                         241  100.00%        8     100.00%

EXPORT_SYMBOL_GPL(inet_csk_route_child_sock);

#if IS_ENABLED(CONFIG_IPV6)
#define AF_INET_FAMILY(fam) ((fam) == AF_INET)
#else
#define AF_INET_FAMILY(fam) true
#endif
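
rskq_defer_accept, consumed by syn_ack_recalc() below, is set via the TCP_DEFER_ACCEPT socket option (seconds are converted to SYN-ACK retransmission rounds). A minimal listener sketch (5 s is arbitrary; listen() on the unbound socket autobinds here for brevity):

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	int secs = 5;	/* converted to SYN-ACK retry rounds in-kernel */

	if (fd < 0 ||
	    setsockopt(fd, IPPROTO_TCP, TCP_DEFER_ACCEPT, &secs, sizeof(secs)) < 0 ||
	    listen(fd, 128) < 0) {
		perror("defer accept");
		return 1;
	}
	/* Connections only surface to accept() once data arrives,
	 * per the acked/num_timeout checks in syn_ack_recalc(). */
	printf("defer-accept listener ready\n");
	close(fd);
	return 0;
}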
/* Decide when to expire the request and when to resend SYN-ACK */
static inline void syn_ack_recalc(struct request_sock *req, const int thresh,
				  const int max_retries,
				  const u8 rskq_defer_accept,
				  int *expire, int *resend)
{
	if (!rskq_defer_accept) {
		*expire = req->num_timeout >= thresh;
		*resend = 1;
		return;
	}
	*expire = req->num_timeout >= thresh &&
		  (!inet_rsk(req)->acked || req->num_timeout >= max_retries);
	/*
	 * Do not resend while waiting for data after ACK,
	 * start to resend on end of deferring period to give
	 * last chance for data or ACK to create established socket.
	 */
	*resend = !inet_rsk(req)->acked ||
		  req->num_timeout >= rskq_defer_accept - 1;
}

Contributors

Person                     Tokens     Prop  Commits  CommitProp
julian anastasov               95   95.96%        1      50.00%
eric dumazet                    4    4.04%        1      50.00%
Total                          99  100.00%        2     100.00%


int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req)
{
	int err = req->rsk_ops->rtx_syn_ack(parent, req);

	if (!err)
		req->num_retrans++;
	return err;
}

Contributors

Person                     Tokens     Prop  Commits  CommitProp
eric dumazet                   43  100.00%        2     100.00%
Total                          43  100.00%        2     100.00%

EXPORT_SYMBOL(inet_rtx_syn_ack);

/* return true if req was found in the ehash table */
static bool reqsk_queue_unlink(struct request_sock_queue *queue,
			       struct request_sock *req)
{
	struct inet_hashinfo *hashinfo = req_to_sk(req)->sk_prot->h.hashinfo;
	bool found = false;

	if (sk_hashed(req_to_sk(req))) {
		spinlock_t *lock = inet_ehash_lockp(hashinfo, req->rsk_hash);

		spin_lock(lock);
		found = __sk_nulls_del_node_init_rcu(req_to_sk(req));
		spin_unlock(lock);
	}
	if (timer_pending(&req->rsk_timer) && del_timer_sync(&req->rsk_timer))
		reqsk_put(req);
	return found;
}

Contributors

Person                     Tokens     Prop  Commits  CommitProp
eric dumazet                  108  100.00%        5     100.00%
Total                         108  100.00%        5     100.00%


void inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req)
{
	if (reqsk_queue_unlink(&inet_csk(sk)->icsk_accept_queue, req)) {
		reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req);
		reqsk_put(req);
	}
}

Contributors

Person                     Tokens     Prop  Commits  CommitProp
eric dumazet                   50  100.00%        1     100.00%
Total                          50  100.00%        1     100.00%

EXPORT_SYMBOL(inet_csk_reqsk_queue_drop);
void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req)
{
	inet_csk_reqsk_queue_drop(sk, req);
	reqsk_put(req);
}

Contributors

Person                     Tokens     Prop  Commits  CommitProp
eric dumazet                   27  100.00%        1     100.00%
Total                          27  100.00%        1     100.00%

EXPORT_SYMBOL(inet_csk_reqsk_queue_drop_and_put);
static void reqsk_timer_handler(unsigned long data)
{
	struct request_sock *req = (struct request_sock *)data;
	struct sock *sk_listener = req->rsk_listener;
	struct net *net = sock_net(sk_listener);
	struct inet_connection_sock *icsk = inet_csk(sk_listener);
	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
	int qlen, expire = 0, resend = 0;
	int max_retries, thresh;
	u8 defer_accept;

	if (sk_state_load(sk_listener) != TCP_LISTEN)
		goto drop;

	max_retries = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_synack_retries;
	thresh = max_retries;
	/* Normally all the openreqs are young and become mature
	 * (i.e. converted to established socket) for first timeout.
	 * If synack was not acknowledged for 1 second, it means
	 * one of the following things: synack was lost, ack was lost,
	 * rtt is high or nobody planned to ack (i.e. synflood).
	 * When server is a bit loaded, queue is populated with old
	 * open requests, reducing effective size of queue.
	 * When server is well loaded, queue size reduces to zero
	 * after several minutes of work. It is not synflood,
	 * it is normal operation. The solution is pruning
	 * too old entries overriding normal timeout, when
	 * situation becomes dangerous.
	 *
	 * Essentially, we reserve half of room for young
	 * embrions; and abort old ones without pity, if old
	 * ones are about to clog our table.
	 */
	qlen = reqsk_queue_len(queue);
	if (