
Release 4.8 net/kcm/kcmsock.c

#include <linux/bpf.h>
#include <linux/errno.h>
#include <linux/errqueue.h>
#include <linux/file.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/poll.h>
#include <linux/rculist.h>
#include <linux/skbuff.h>
#include <linux/socket.h>
#include <linux/uaccess.h>
#include <linux/workqueue.h>
#include <linux/syscalls.h>
#include <net/kcm.h>
#include <net/netns/generic.h>
#include <net/sock.h>
#include <net/tcp.h>
#include <uapi/linux/kcm.h>


unsigned int kcm_net_id;


static struct kmem_cache *kcm_psockp __read_mostly;

static struct kmem_cache *kcm_muxp __read_mostly;

static struct workqueue_struct *kcm_wq;


static inline struct kcm_sock *kcm_sk(const struct sock *sk)
{
	return (struct kcm_sock *)sk;
}

Contributors

Person          Tokens   Prop      Commits   CommitProp
tom herbert         23   100.00%         1      100.00%
Total               23   100.00%         1      100.00%
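
kcm_sk() is a plain pointer cast; it is only sound because struct kcm_sock (declared in include/net/kcm.h) embeds its struct sock as the first member, so both structures start at the same address. A minimal sketch of that embedding pattern, using hypothetical names rather than the kernel's:

struct base { int refcnt; };

struct derived {
	struct base b;	/* must be the first member for the cast to work */
	int extra;
};

static inline struct derived *to_derived(struct base *p)
{
	/* Valid only when p really points into a struct derived */
	return (struct derived *)p;
}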


static inline struct kcm_tx_msg *kcm_tx_msg(struct sk_buff *skb)
{
	return (struct kcm_tx_msg *)skb->cb;
}

Contributors

Person          Tokens   Prop      Commits   CommitProp
tom herbert         24   100.00%         1      100.00%
Total               24   100.00%         1      100.00%


static inline struct kcm_rx_msg *kcm_rx_msg(struct sk_buff *skb)
{
	return (struct kcm_rx_msg *)((void *)skb->cb +
				     offsetof(struct qdisc_skb_cb, data));
}

Contributors

Person          Tokens   Prop      Commits   CommitProp
tom herbert         38   100.00%         1      100.00%
Total               38   100.00%         1      100.00%


static void report_csk_error(struct sock *csk, int err)
{
	csk->sk_err = EPIPE;
	csk->sk_error_report(csk);
}

Contributors

Person          Tokens   Prop      Commits   CommitProp
tom herbert         27   100.00%         1      100.00%
Total               27   100.00%         1      100.00%

/* Callback lock held */
static void kcm_abort_rx_psock(struct kcm_psock *psock, int err,
			       struct sk_buff *skb)
{
	struct sock *csk = psock->sk;

	/* Unrecoverable error in receive */

	del_timer(&psock->rx_msg_timer);

	if (psock->rx_stopped)
		return;

	psock->rx_stopped = 1;
	KCM_STATS_INCR(psock->stats.rx_aborts);

	/* Report an error on the lower socket */
	report_csk_error(csk, err);
}

Contributors

Person          Tokens   Prop      Commits   CommitProp
tom herbert         67   100.00%         3      100.00%
Total               67   100.00%         3      100.00%


static void kcm_abort_tx_psock(struct kcm_psock *psock, int err,
			       bool wakeup_kcm)
{
	struct sock *csk = psock->sk;
	struct kcm_mux *mux = psock->mux;

	/* Unrecoverable error in transmit */

	spin_lock_bh(&mux->lock);

	if (psock->tx_stopped) {
		spin_unlock_bh(&mux->lock);
		return;
	}

	psock->tx_stopped = 1;
	KCM_STATS_INCR(psock->stats.tx_aborts);

	if (!psock->tx_kcm) {
		/* Take off psocks_avail list */
		list_del(&psock->psock_avail_list);
	} else if (wakeup_kcm) {
		/* In this case psock is being aborted while outside of
		 * write_msgs and psock is reserved. Schedule tx_work
		 * to handle the failure there. Need to commit tx_stopped
		 * before queuing work.
		 */
		smp_mb();

		queue_work(kcm_wq, &psock->tx_kcm->tx_work);
	}

	spin_unlock_bh(&mux->lock);

	/* Report error on lower socket */
	report_csk_error(csk, err);
}

Contributors

Person          Tokens   Prop      Commits   CommitProp
tom herbert        133   100.00%         2      100.00%
Total              133   100.00%         2      100.00%

/* RX mux lock held. */
static void kcm_update_rx_mux_stats(struct kcm_mux *mux,
				    struct kcm_psock *psock)
{
	KCM_STATS_ADD(mux->stats.rx_bytes,
		      psock->stats.rx_bytes - psock->saved_rx_bytes);
	mux->stats.rx_msgs +=
		psock->stats.rx_msgs - psock->saved_rx_msgs;
	psock->saved_rx_msgs = psock->stats.rx_msgs;
	psock->saved_rx_bytes = psock->stats.rx_bytes;
}

Contributors

Person          Tokens   Prop      Commits   CommitProp
tom herbert         71   100.00%         1      100.00%
Total               71   100.00%         1      100.00%


static void kcm_update_tx_mux_stats(struct kcm_mux *mux,
				    struct kcm_psock *psock)
{
	KCM_STATS_ADD(mux->stats.tx_bytes,
		      psock->stats.tx_bytes - psock->saved_tx_bytes);
	mux->stats.tx_msgs +=
		psock->stats.tx_msgs - psock->saved_tx_msgs;
	psock->saved_tx_msgs = psock->stats.tx_msgs;
	psock->saved_tx_bytes = psock->stats.tx_bytes;
}

Contributors

Person          Tokens   Prop      Commits   CommitProp
tom herbert         71   100.00%         1      100.00%
Total               71   100.00%         1      100.00%
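
Both update helpers fold per-psock counters into the mux-wide totals by adding only the delta accumulated since the last fold and then advancing the saved_* snapshot, so the same bytes and messages are never counted twice. A minimal sketch of that delta-accounting pattern, with hypothetical names:

struct counter {
	unsigned long total;	/* monotonically increasing per-source count */
	unsigned long saved;	/* value at the time of the last fold */
};

static void fold_delta(unsigned long *agg_total, struct counter *c)
{
	*agg_total += c->total - c->saved;	/* add only what is new */
	c->saved = c->total;			/* remember the fold point */
}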

static int kcm_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);

/* KCM is ready to receive messages on its queue -- either the KCM is new or
 * has become unblocked after being blocked on full socket buffer. Queue any
 * pending ready messages on a psock. RX mux lock held.
 */
static void kcm_rcv_ready(struct kcm_sock *kcm)
{
	struct kcm_mux *mux = kcm->mux;
	struct kcm_psock *psock;
	struct sk_buff *skb;

	if (unlikely(kcm->rx_wait || kcm->rx_psock || kcm->rx_disabled))
		return;

	while (unlikely((skb = __skb_dequeue(&mux->rx_hold_queue)))) {
		if (kcm_queue_rcv_skb(&kcm->sk, skb)) {
			/* Assuming buffer limit has been reached */
			skb_queue_head(&mux->rx_hold_queue, skb);
			WARN_ON(!sk_rmem_alloc_get(&kcm->sk));
			return;
		}
	}

	while (!list_empty(&mux->psocks_ready)) {
		psock = list_first_entry(&mux->psocks_ready, struct kcm_psock,
					 psock_ready_list);

		if (kcm_queue_rcv_skb(&kcm->sk, psock->ready_rx_msg)) {
			/* Assuming buffer limit has been reached */
			WARN_ON(!sk_rmem_alloc_get(&kcm->sk));
			return;
		}

		/* Consumed the ready message on the psock. Schedule rx_work to
		 * get more messages.
		 */
		list_del(&psock->psock_ready_list);
		psock->ready_rx_msg = NULL;

		/* Commit clearing of ready_rx_msg for queuing work */
		smp_mb();

		queue_work(kcm_wq, &psock->rx_work);
	}

	/* Buffer limit is okay now, add to ready list */
	list_add_tail(&kcm->wait_rx_list,
		      &kcm->mux->kcm_rx_waiters);
	kcm->rx_wait = true;
}

Contributors

Person          Tokens   Prop      Commits   CommitProp
tom herbert        214   100.00%         1      100.00%
Total              214   100.00%         1      100.00%


static void kcm_rfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct kcm_sock *kcm = kcm_sk(sk);
	struct kcm_mux *mux = kcm->mux;
	unsigned int len = skb->truesize;

	sk_mem_uncharge(sk, len);
	atomic_sub(len, &sk->sk_rmem_alloc);

	/* For reading rx_wait and rx_psock without holding lock */
	smp_mb__after_atomic();

	if (!kcm->rx_wait && !kcm->rx_psock &&
	    sk_rmem_alloc_get(sk) < sk->sk_rcvlowat) {
		spin_lock_bh(&mux->rx_lock);
		kcm_rcv_ready(kcm);
		spin_unlock_bh(&mux->rx_lock);
	}
}

Contributors

Person          Tokens   Prop      Commits   CommitProp
tom herbert        112   100.00%         1      100.00%
Total              112   100.00%         1      100.00%


static int kcm_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	struct sk_buff_head *list = &sk->sk_receive_queue;

	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
		return -ENOMEM;

	if (!sk_rmem_schedule(sk, skb, skb->truesize))
		return -ENOBUFS;

	skb->dev = NULL;

	skb_orphan(skb);
	skb->sk = sk;
	skb->destructor = kcm_rfree;
	atomic_add(skb->truesize, &sk->sk_rmem_alloc);
	sk_mem_charge(sk, skb->truesize);

	skb_queue_tail(list, skb);

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk);

	return 0;
}

Contributors

Person          Tokens   Prop      Commits   CommitProp
tom herbert        133   100.00%         1      100.00%
Total              133   100.00%         1      100.00%
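
Every skb that reaches this queue is one complete, already-delimited message, which is what gives a KCM socket its datagram-like semantics over TCP: a single recvmsg() returns at most one record. A hedged userspace sketch; creating the AF_KCM socket and attaching TCP sockets via the SIOCKCMATTACH ioctl is omitted, and fd is assumed to be such a socket:

#include <sys/socket.h>
#include <sys/types.h>

/* Hedged sketch: read one framed message from an already configured KCM
 * socket 'fd' (AF_KCM setup and SIOCKCMATTACH attach are not shown).
 */
static ssize_t read_one_kcm_msg(int fd, char *buf, size_t len)
{
	/* One recv() returns at most one complete message as delimited
	 * by the BPF program attached to the mux.
	 */
	return recv(fd, buf, len, 0);
}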

/* Requeue received messages for a kcm socket to other kcm sockets. This is
 * called when a kcm socket is receive disabled.
 * RX mux lock held.
 */
static void requeue_rx_msgs(struct kcm_mux *mux, struct sk_buff_head *head)
{
	struct sk_buff *skb;
	struct kcm_sock *kcm;

	while ((skb = __skb_dequeue(head))) {
		/* Reset destructor to avoid calling kcm_rcv_ready */
		skb->destructor = sock_rfree;
		skb_orphan(skb);
try_again:
		if (list_empty(&mux->kcm_rx_waiters)) {
			skb_queue_tail(&mux->rx_hold_queue, skb);
			continue;
		}

		kcm = list_first_entry(&mux->kcm_rx_waiters,
				       struct kcm_sock, wait_rx_list);

		if (kcm_queue_rcv_skb(&kcm->sk, skb)) {
			/* Should mean socket buffer full */
			list_del(&kcm->wait_rx_list);
			kcm->rx_wait = false;

			/* Commit rx_wait to read in kcm_free */
			smp_wmb();

			goto try_again;
		}
	}
}

Contributors

Person          Tokens   Prop      Commits   CommitProp
tom herbert        127   100.00%         1      100.00%
Total              127   100.00%         1      100.00%

/* Lower sock lock held */
static struct kcm_sock *reserve_rx_kcm(struct kcm_psock *psock,
				       struct sk_buff *head)
{
	struct kcm_mux *mux = psock->mux;
	struct kcm_sock *kcm;

	WARN_ON(psock->ready_rx_msg);

	if (psock->rx_kcm)
		return psock->rx_kcm;

	spin_lock_bh(&mux->rx_lock);

	if (psock->rx_kcm) {
		spin_unlock_bh(&mux->rx_lock);
		return psock->rx_kcm;
	}

	kcm_update_rx_mux_stats(mux, psock);

	if (list_empty(&mux->kcm_rx_waiters)) {
		psock->ready_rx_msg = head;
		list_add_tail(&psock->psock_ready_list,
			      &mux->psocks_ready);
		spin_unlock_bh(&mux->rx_lock);
		return NULL;
	}

	kcm = list_first_entry(&mux->kcm_rx_waiters,
			       struct kcm_sock, wait_rx_list);
	list_del(&kcm->wait_rx_list);
	kcm->rx_wait = false;

	psock->rx_kcm = kcm;
	kcm->rx_psock = psock;

	spin_unlock_bh(&mux->rx_lock);

	return kcm;
}

Contributors

Person          Tokens   Prop      Commits   CommitProp
tom herbert        180   100.00%         2      100.00%
Total              180   100.00%         2      100.00%

static void kcm_done(struct kcm_sock *kcm);
static void kcm_done_work(struct work_struct *w)
{
	kcm_done(container_of(w, struct kcm_sock, done_work));
}

Contributors

Person          Tokens   Prop      Commits   CommitProp
tom herbert         24   100.00%         1      100.00%
Total               24   100.00%         1      100.00%

/* Lower sock held */
static void unreserve_rx_kcm(struct kcm_psock *psock,
			     bool rcv_ready)
{
	struct kcm_sock *kcm = psock->rx_kcm;
	struct kcm_mux *mux = psock->mux;

	if (!kcm)
		return;

	spin_lock_bh(&mux->rx_lock);

	psock->rx_kcm = NULL;
	kcm->rx_psock = NULL;

	/* Commit kcm->rx_psock before sk_rmem_alloc_get to sync with
	 * kcm_rfree
	 */
	smp_mb();

	if (unlikely(kcm->done)) {
		spin_unlock_bh(&mux->rx_lock);

		/* Need to run kcm_done in a task since we need to acquire
		 * callback locks which may already be held here.
		 */
		INIT_WORK(&kcm->done_work, kcm_done_work);
		schedule_work(&kcm->done_work);
		return;
	}

	if (unlikely(kcm->rx_disabled)) {
		requeue_rx_msgs(mux, &kcm->sk.sk_receive_queue);
	} else if (rcv_ready || unlikely(!sk_rmem_alloc_get(&kcm->sk))) {
		/* Check for degenerative race with rx_wait that all
		 * data was dequeued (accounted for in kcm_rfree).
		 */
		kcm_rcv_ready(kcm);
	}

	spin_unlock_bh(&mux->rx_lock);
}

Contributors

Person          Tokens   Prop      Commits   CommitProp
tom herbert        157   100.00%         1      100.00%
Total              157   100.00%         1      100.00%


static void kcm_start_rx_timer(struct kcm_psock *psock)
{
	if (psock->sk->sk_rcvtimeo)
		mod_timer(&psock->rx_msg_timer, psock->sk->sk_rcvtimeo);
}

Contributors

Person          Tokens   Prop      Commits   CommitProp
tom herbert         33   100.00%         1      100.00%
Total               33   100.00%         1      100.00%

/* Macro to invoke filter function. */
#define KCM_RUN_FILTER(prog, ctx) \
	(*prog->bpf_func)(ctx, prog->insnsi)

/* Lower socket lock held */
static int kcm_tcp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
			unsigned int orig_offset, size_t orig_len)
{
	struct kcm_psock *psock = (struct kcm_psock *)desc->arg.data;
	struct kcm_rx_msg *rxm;
	struct kcm_sock *kcm;
	struct sk_buff *head, *skb;
	size_t eaten = 0, cand_len;
	ssize_t extra;
	int err;
	bool cloned_orig = false;

	if (psock->ready_rx_msg)
		return 0;

	head = psock->rx_skb_head;
	if (head) {
		/* Message already in progress */

		rxm = kcm_rx_msg(head);
		if (unlikely(rxm->early_eaten)) {
			/* Already some number of bytes on the receive sock
			 * data saved in rx_skb_head, just indicate they
			 * are consumed.
			 */
			eaten = orig_len <= rxm->early_eaten ?
				orig_len : rxm->early_eaten;
			rxm->early_eaten -= eaten;

			return eaten;
		}

		if (unlikely(orig_offset)) {
			/* Getting data with a non-zero offset when a message is
			 * in progress is not expected. If it does happen, we
			 * need to clone and pull since we can't deal with
			 * offsets in the skbs for a message except in the head.
			 */
			orig_skb = skb_clone(orig_skb, GFP_ATOMIC);
			if (!orig_skb) {
				KCM_STATS_INCR(psock->stats.rx_mem_fail);
				desc->error = -ENOMEM;
				return 0;
			}
			if (!pskb_pull(orig_skb, orig_offset)) {
				KCM_STATS_INCR(psock->stats.rx_mem_fail);
				kfree_skb(orig_skb);
				desc->error = -ENOMEM;
				return 0;
			}
			cloned_orig = true;
			orig_offset = 0;
		}

		if (!psock->rx_skb_nextp) {
			/* We are going to append to the frags_list of head.
			 * Need to unshare the frag_list.
			 */
			err = skb_unclone(head, GFP_ATOMIC);
			if (err) {
				KCM_STATS_INCR(psock->stats.rx_mem_fail);
				desc->error = err;
				return 0;
			}

			if (unlikely(skb_shinfo(head)->frag_list)) {
				/* We can't append to an sk_buff that already
				 * has a frag_list. We create a new head, point
				 * the frag_list of that to the old head, and
				 * then are able to use the old head->next for
				 * appending to the message.
				 */
				if (WARN_ON(head->next)) {
					desc->error = -EINVAL;
					return 0;
				}

				skb = alloc_skb(0, GFP_ATOMIC);
				if (!skb) {
					KCM_STATS_INCR(psock->stats.rx_mem_fail);
					desc->error = -ENOMEM;
					return 0;
				}
				skb->len = head->len;
				skb->data_len = head->len;
				skb->truesize = head->truesize;
				*kcm_rx_msg(skb) = *kcm_rx_msg(head);
				psock->rx_skb_nextp = &head->next;
				skb_shinfo(skb)->frag_list = head;
				psock->rx_skb_head = skb;
				head = skb;
			} else {
				psock->rx_skb_nextp =
				    &skb_shinfo(head)->frag_list;
			}
		}
	}

	while (eaten < orig_len) {
		/* Always clone since we will consume something */
		skb = skb_clone(orig_skb, GFP_ATOMIC);
		if (!skb) {
			KCM_STATS_INCR(psock->stats.rx_mem_fail);
			desc->error = -ENOMEM;
			break;
		}

		cand_len = orig_len - eaten;

		head = psock->rx_skb_head;
		if (!head) {
			head = skb;
			psock->rx_skb_head = head;
			/* Will set rx_skb_nextp on next packet if needed */
			psock->rx_skb_nextp = NULL;
			rxm = kcm_rx_msg(head);
			memset(rxm, 0, sizeof(*rxm));
			rxm->offset = orig_offset + eaten;
		} else {
			/* Unclone since we may be appending to an skb that we
			 * already share a frag_list with.
			 */
			err = skb_unclone(skb, GFP_ATOMIC);
			if (err) {
				KCM_STATS_INCR(psock->stats.rx_mem_fail);
				desc->error = err;
				break;
			}

			rxm = kcm_rx_msg(head);
			*psock->rx_skb_nextp = skb;
			psock->rx_skb_nextp = &skb->next;
			head->data_len += skb->len;
			head->len += skb->len;
			head->truesize += skb->truesize;
		}

		if (!rxm->full_len) {
			ssize_t len;

			len = KCM_RUN_FILTER(psock->bpf_prog, head);

			if (!len) {
				/* Need more header to determine length */
				if (!rxm->accum_len) {
					/* Start RX timer for new message */
					kcm_start_rx_timer(psock);
				}
				rxm->accum_len += cand_len;
				eaten += cand_len;
				KCM_STATS_INCR(psock->stats.rx_need_more_hdr);
				WARN_ON(eaten != orig_len);
				break;
			} else if (len > psock->sk->sk_rcvbuf) {
				/* Message length exceeds maximum allowed */
				KCM_STATS_INCR(psock->stats.rx_msg_too_big);
				desc->error = -EMSGSIZE;
				psock->rx_skb_head = NULL;
				kcm_abort_rx_psock(psock, EMSGSIZE, head);
				break;
			} else if (len <= (ssize_t)head->len -
					  skb->len - rxm->offset) {
				/* Length must be into new skb (and also
				 * greater than zero)
				 */
				KCM_STATS_INCR(psock->stats.rx_bad_hdr_len);
				desc->error = -EPROTO;
				psock->rx_skb_head = NULL;
				kcm_abort_rx_psock(psock, EPROTO, head);
				break;
			}

			rxm->full_len = len;
		}

		extra = (ssize_t)(rxm->accum_len + cand_len) - rxm->full_len;

		if (extra < 0) {
			/* Message not complete yet. */
			if (rxm->full_len - rxm->accum_len >
			    tcp_inq(psock->sk)) {
				/* Don't have the whole message in the socket
				 * buffer. Set psock->rx_need_bytes to wait for
				 * the rest of the message. Also, set "early
				 * eaten" since we've already buffered the skb
				 * but don't consume yet per tcp_read_sock.
				 */

				if (!rxm->accum_len) {
					/* Start RX timer for new message */
					kcm_start_rx_timer(psock);
				}

				psock->rx_need_bytes = rxm->full_len -
						       rxm->accum_len;
				rxm->accum_len += cand_len;
				rxm->early_eaten = cand_len;
				KCM_STATS_ADD(psock->stats.rx_bytes, cand_len);
				desc->count = 0; /* Stop reading socket */
				break;
			}
			rxm->accum_len += cand_len;
			eaten += cand_len;
			WARN_ON(eaten != orig_len);
			break;
		}

		/* Positive extra indicates more bytes than needed for the
		 * message
		 */

		WARN_ON(extra > cand_len);
		eaten += (cand_len - extra);

		/* Hurray, we have a new message! */
		del_timer(&psock->rx_msg_timer);
		psock->rx_skb_head = NULL;
		KCM_STATS_INCR(psock->stats.rx_msgs);

try_queue:
		kcm = reserve_rx_kcm(psock, head);
		if (!kcm) {
			/* Unable to reserve a KCM, message is held in psock. */
			break;
		}

		if (kcm_queue_rcv_skb(&kcm->sk, head)) {
			/* Should mean socket buffer full */
			unreserve_rx_kcm(psock, false);
			goto try_queue;
		}
	}

	if (cloned_orig)
		kfree_skb(orig_skb);

	KCM_STATS_ADD(psock->stats.rx_bytes, eaten);

	return eaten;
}

Contributors

Person          Tokens   Prop      Commits   CommitProp
tom herbert       1018   100.00%         4      100.00%
Total             1018   100.00%         4      100.00%
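
KCM_RUN_FILTER hands the accumulated skb to the BPF program that was attached to the mux (via the SIOCKCMATTACH ioctl); the program must return the full length of the message being parsed, and kcm_tcp_recv treats a return of 0 as "need more header bytes". A hedged sketch of such a framing program for a hypothetical protocol whose records begin with a 2-byte big-endian payload length; load_half() is the accessor macro used by the kernel's samples/bpf programs and is an assumption here, not something defined in this file:

int kcm_parse_len(struct __sk_buff *skb)
{
	/* Total message length = 2-byte length header + payload.
	 * Returning 0 would tell KCM to wait for more header bytes.
	 */
	return load_half(skb, 0) + 2;
}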

/* Called with lock held on lower socket */
static int psock_tcp_read_sock(struct kcm_psock *psock)
{
	read_descriptor_t desc;

	desc.arg.data = psock;
	desc.error = 0;
	desc.count = 1; /* give more than one skb per call */

	/* sk should be locked here, so okay to do tcp_read_sock */
	tcp_read_sock(psock->sk, &desc, kcm_tcp_recv);

	unreserve_rx_kcm(psock, true);

	return desc.error;
}

Contributors

Person          Tokens   Prop      Commits   CommitProp
tom herbert         60   100.00%         1      100.00%
Total               60   100.00%         1      100.00%

/* Lower sock lock held */
static void psock_tcp_data_ready(struct sock *sk)
{
	struct kcm_psock *psock;

	read_lock_bh(&sk->sk_callback_lock);

	psock = (struct kcm_psock *)sk->sk_user_data;
	if (unlikely(!psock || psock->rx_stopped))
		goto out;

	if (psock->ready_rx_msg)
		goto out;

	if (psock->rx_need_bytes) {
		if (tcp_inq(sk) >= psock->rx_need_bytes)
			psock->rx_need_bytes = 0;
		else
			goto out;
	}

	if (psock_tcp_read_sock(psock) == -ENOMEM)
		queue_delayed_work(kcm_wq, &psock->rx_delayed_work, 0);

out:
	read_unlock_bh(&sk->sk_callback_lock);
}

Contributors

Person          Tokens   Prop      Commits   CommitProp
tom herbert        120   100.00%         2      100.00%
Total              120   100.00%         2      100.00%
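
psock_tcp_data_ready() runs in place of TCP's normal sk_data_ready callback once a TCP socket has been attached to a mux, which is why it looks up the psock through sk_user_data under sk_callback_lock. The attach path is not part of this excerpt; a hedged sketch of how that rewiring presumably looks (the save_* field names are taken from struct kcm_psock in include/net/kcm.h, but treat the whole function as an assumption, not the real kcm_attach() code):

static void kcm_hook_lower_sock(struct kcm_psock *psock, struct sock *csk)
{
	write_lock_bh(&csk->sk_callback_lock);

	/* Remember the original callbacks so they can be restored on detach */
	psock->save_data_ready   = csk->sk_data_ready;
	psock->save_write_space  = csk->sk_write_space;
	psock->save_state_change = csk->sk_state_change;

	/* Point the lower socket at the psock and the KCM callbacks */
	csk->sk_user_data    = psock;
	csk->sk_data_ready   = psock_tcp_data_ready;
	csk->sk_write_space  = psock_tcp_write_space;
	csk->sk_state_change = psock_tcp_state_change;

	write_unlock_bh(&csk->sk_callback_lock);
}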


static void do_psock_rx_work(struct kcm_psock *psock)
{
	read_descriptor_t rd_desc;
	struct sock *csk = psock->sk;

	/* We need the read lock to synchronize with psock_tcp_data_ready. We
	 * need the socket lock for calling tcp_read_sock.
	 */
	lock_sock(csk);
	read_lock_bh(&csk->sk_callback_lock);

	if (unlikely(csk->sk_user_data != psock))
		goto out;

	if (unlikely(psock->rx_stopped))
		goto out;

	if (psock->ready_rx_msg)
		goto out;

	rd_desc.arg.data = psock;

	if (psock_tcp_read_sock(psock) == -ENOMEM)
		queue_delayed_work(kcm_wq, &psock->rx_delayed_work, 0);

out:
	read_unlock_bh(&csk->sk_callback_lock);
	release_sock(csk);
}

Contributors

Person          Tokens   Prop      Commits   CommitProp
tom herbert        117   100.00%         1      100.00%
Total              117   100.00%         1      100.00%


static void psock_rx_work(struct work_struct *w)
{
	do_psock_rx_work(container_of(w, struct kcm_psock, rx_work));
}

Contributors

Person          Tokens   Prop      Commits   CommitProp
tom herbert         24   100.00%         1      100.00%
Total               24   100.00%         1      100.00%


static void psock_rx_delayed_work(struct work_struct *w)
{
	do_psock_rx_work(container_of(w, struct kcm_psock,
				      rx_delayed_work.work));
}

Contributors

Person          Tokens   Prop      Commits   CommitProp
tom herbert         26   100.00%         1      100.00%
Total               26   100.00%         1      100.00%


static void psock_tcp_state_change(struct sock *sk)
{
	/* TCP only does a POLLIN for a half close. Do a POLLHUP here
	 * since application will normally not poll with POLLIN
	 * on the TCP sockets.
	 */

	report_csk_error(sk, EPIPE);
}

Contributors

Person          Tokens   Prop      Commits   CommitProp
tom herbert         19   100.00%         1      100.00%
Total               19   100.00%         1      100.00%


static void psock_tcp_write_space(struct sock *sk)
{
	struct kcm_psock *psock;
	struct kcm_mux *mux;
	struct kcm_sock *kcm;

	read_lock_bh(&sk->sk_callback_lock);

	psock = (struct kcm_psock *)sk->sk_user_data;
	if (unlikely(!psock