cregit-Linux how code gets into the kernel

Release 4.11 net/kcm/kcmsock.c

Directory: net/kcm
/*
 * Kernel Connection Multiplexor
 *
 * Copyright (c) 2016 Tom Herbert <tom@herbertland.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2
 * as published by the Free Software Foundation.
 */

#include <linux/bpf.h>
#include <linux/errno.h>
#include <linux/errqueue.h>
#include <linux/file.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/poll.h>
#include <linux/rculist.h>
#include <linux/skbuff.h>
#include <linux/socket.h>
#include <linux/uaccess.h>
#include <linux/workqueue.h>
#include <linux/syscalls.h>
#include <linux/sched/signal.h>

#include <net/kcm.h>
#include <net/netns/generic.h>
#include <net/sock.h>
#include <uapi/linux/kcm.h>


unsigned int kcm_net_id;


static struct kmem_cache *kcm_psockp __read_mostly;

static struct kmem_cache *kcm_muxp __read_mostly;

static struct workqueue_struct *kcm_wq;


static inline struct kcm_sock *kcm_sk(const struct sock *sk) { return (struct kcm_sock *)sk; }

Contributors

PersonTokensPropCommitsCommitProp
Tom Herbert23100.00%1100.00%
Total23100.00%1100.00%


static inline struct kcm_tx_msg *kcm_tx_msg(struct sk_buff *skb) { return (struct kcm_tx_msg *)skb->cb; }

Contributors

PersonTokensPropCommitsCommitProp
Tom Herbert24100.00%1100.00%
Total24100.00%1100.00%


static void report_csk_error(struct sock *csk, int err) { csk->sk_err = EPIPE; csk->sk_error_report(csk); }

Contributors

PersonTokensPropCommitsCommitProp
Tom Herbert27100.00%1100.00%
Total27100.00%1100.00%


static void kcm_abort_tx_psock(struct kcm_psock *psock, int err, bool wakeup_kcm) { struct sock *csk = psock->sk; struct kcm_mux *mux = psock->mux; /* Unrecoverable error in transmit */ spin_lock_bh(&mux->lock); if (psock->tx_stopped) { spin_unlock_bh(&mux->lock); return; } psock->tx_stopped = 1; KCM_STATS_INCR(psock->stats.tx_aborts); if (!psock->tx_kcm) { /* Take off psocks_avail list */ list_del(&psock->psock_avail_list); } else if (wakeup_kcm) { /* In this case psock is being aborted while outside of * write_msgs and psock is reserved. Schedule tx_work * to handle the failure there. Need to commit tx_stopped * before queuing work. */ smp_mb(); queue_work(kcm_wq, &psock->tx_kcm->tx_work); } spin_unlock_bh(&mux->lock); /* Report error on lower socket */ report_csk_error(csk, err); }

Contributors

PersonTokensPropCommitsCommitProp
Tom Herbert133100.00%2100.00%
Total133100.00%2100.00%

/* RX mux lock held. */
static void kcm_update_rx_mux_stats(struct kcm_mux *mux, struct kcm_psock *psock) { STRP_STATS_ADD(mux->stats.rx_bytes, psock->strp.stats.rx_bytes - psock->saved_rx_bytes); mux->stats.rx_msgs += psock->strp.stats.rx_msgs - psock->saved_rx_msgs; psock->saved_rx_msgs = psock->strp.stats.rx_msgs; psock->saved_rx_bytes = psock->strp.stats.rx_bytes; }

Contributors

PersonTokensPropCommitsCommitProp
Tom Herbert79100.00%2100.00%
Total79100.00%2100.00%


static void kcm_update_tx_mux_stats(struct kcm_mux *mux, struct kcm_psock *psock) { KCM_STATS_ADD(mux->stats.tx_bytes, psock->stats.tx_bytes - psock->saved_tx_bytes); mux->stats.tx_msgs += psock->stats.tx_msgs - psock->saved_tx_msgs; psock->saved_tx_msgs = psock->stats.tx_msgs; psock->saved_tx_bytes = psock->stats.tx_bytes; }

Contributors

PersonTokensPropCommitsCommitProp
Tom Herbert71100.00%1100.00%
Total71100.00%1100.00%

static int kcm_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); /* KCM is ready to receive messages on its queue-- either the KCM is new or * has become unblocked after being blocked on full socket buffer. Queue any * pending ready messages on a psock. RX mux lock held. */
static void kcm_rcv_ready(struct kcm_sock *kcm) { struct kcm_mux *mux = kcm->mux; struct kcm_psock *psock; struct sk_buff *skb; if (unlikely(kcm->rx_wait || kcm->rx_psock || kcm->rx_disabled)) return; while (unlikely((skb = __skb_dequeue(&mux->rx_hold_queue)))) { if (kcm_queue_rcv_skb(&kcm->sk, skb)) { /* Assuming buffer limit has been reached */ skb_queue_head(&mux->rx_hold_queue, skb); WARN_ON(!sk_rmem_alloc_get(&kcm->sk)); return; } } while (!list_empty(&mux->psocks_ready)) { psock = list_first_entry(&mux->psocks_ready, struct kcm_psock, psock_ready_list); if (kcm_queue_rcv_skb(&kcm->sk, psock->ready_rx_msg)) { /* Assuming buffer limit has been reached */ WARN_ON(!sk_rmem_alloc_get(&kcm->sk)); return; } /* Consumed the ready message on the psock. Schedule rx_work to * get more messages. */ list_del(&psock->psock_ready_list); psock->ready_rx_msg = NULL; /* Commit clearing of ready_rx_msg for queuing work */ smp_mb(); strp_unpause(&psock->strp); strp_check_rcv(&psock->strp); } /* Buffer limit is okay now, add to ready list */ list_add_tail(&kcm->wait_rx_list, &kcm->mux->kcm_rx_waiters); kcm->rx_wait = true; }

Contributors

PersonTokensPropCommitsCommitProp
Tom Herbert220100.00%2100.00%
Total220100.00%2100.00%


static void kcm_rfree(struct sk_buff *skb) { struct sock *sk = skb->sk; struct kcm_sock *kcm = kcm_sk(sk); struct kcm_mux *mux = kcm->mux; unsigned int len = skb->truesize; sk_mem_uncharge(sk, len); atomic_sub(len, &sk->sk_rmem_alloc); /* For reading rx_wait and rx_psock without holding lock */ smp_mb__after_atomic(); if (!kcm->rx_wait && !kcm->rx_psock && sk_rmem_alloc_get(sk) < sk->sk_rcvlowat) { spin_lock_bh(&mux->rx_lock); kcm_rcv_ready(kcm); spin_unlock_bh(&mux->rx_lock); } }

Contributors

PersonTokensPropCommitsCommitProp
Tom Herbert112100.00%1100.00%
Total112100.00%1100.00%


static int kcm_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { struct sk_buff_head *list = &sk->sk_receive_queue; if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) return -ENOMEM; if (!sk_rmem_schedule(sk, skb, skb->truesize)) return -ENOBUFS; skb->dev = NULL; skb_orphan(skb); skb->sk = sk; skb->destructor = kcm_rfree; atomic_add(skb->truesize, &sk->sk_rmem_alloc); sk_mem_charge(sk, skb->truesize); skb_queue_tail(list, skb); if (!sock_flag(sk, SOCK_DEAD)) sk->sk_data_ready(sk); return 0; }

Contributors

PersonTokensPropCommitsCommitProp
Tom Herbert133100.00%1100.00%
Total133100.00%1100.00%

/* Requeue received messages for a kcm socket to other kcm sockets. This is * called with a kcm socket is receive disabled. * RX mux lock held. */
static void requeue_rx_msgs(struct kcm_mux *mux, struct sk_buff_head *head) { struct sk_buff *skb; struct kcm_sock *kcm; while ((skb = __skb_dequeue(head))) { /* Reset destructor to avoid calling kcm_rcv_ready */ skb->destructor = sock_rfree; skb_orphan(skb); try_again: if (list_empty(&mux->kcm_rx_waiters)) { skb_queue_tail(&mux->rx_hold_queue, skb); continue; } kcm = list_first_entry(&mux->kcm_rx_waiters, struct kcm_sock, wait_rx_list); if (kcm_queue_rcv_skb(&kcm->sk, skb)) { /* Should mean socket buffer full */ list_del(&kcm->wait_rx_list); kcm->rx_wait = false; /* Commit rx_wait to read in kcm_free */ smp_wmb(); goto try_again; } } }

Contributors

PersonTokensPropCommitsCommitProp
Tom Herbert127100.00%1100.00%
Total127100.00%1100.00%

/* Lower sock lock held */
static struct kcm_sock *reserve_rx_kcm(struct kcm_psock *psock, struct sk_buff *head) { struct kcm_mux *mux = psock->mux; struct kcm_sock *kcm; WARN_ON(psock->ready_rx_msg); if (psock->rx_kcm) return psock->rx_kcm; spin_lock_bh(&mux->rx_lock); if (psock->rx_kcm) { spin_unlock_bh(&mux->rx_lock); return psock->rx_kcm; } kcm_update_rx_mux_stats(mux, psock); if (list_empty(&mux->kcm_rx_waiters)) { psock->ready_rx_msg = head; strp_pause(&psock->strp); list_add_tail(&psock->psock_ready_list, &mux->psocks_ready); spin_unlock_bh(&mux->rx_lock); return NULL; } kcm = list_first_entry(&mux->kcm_rx_waiters, struct kcm_sock, wait_rx_list); list_del(&kcm->wait_rx_list); kcm->rx_wait = false; psock->rx_kcm = kcm; kcm->rx_psock = psock; spin_unlock_bh(&mux->rx_lock); return kcm; }

Contributors

PersonTokensPropCommitsCommitProp
Tom Herbert188100.00%3100.00%
Total188100.00%3100.00%

static void kcm_done(struct kcm_sock *kcm);
static void kcm_done_work(struct work_struct *w) { kcm_done(container_of(w, struct kcm_sock, done_work)); }

Contributors

PersonTokensPropCommitsCommitProp
Tom Herbert24100.00%1100.00%
Total24100.00%1100.00%

/* Lower sock held */
static void unreserve_rx_kcm(struct kcm_psock *psock, bool rcv_ready) { struct kcm_sock *kcm = psock->rx_kcm; struct kcm_mux *mux = psock->mux; if (!kcm) return; spin_lock_bh(&mux->rx_lock); psock->rx_kcm = NULL; kcm->rx_psock = NULL; /* Commit kcm->rx_psock before sk_rmem_alloc_get to sync with * kcm_rfree */ smp_mb(); if (unlikely(kcm->done)) { spin_unlock_bh(&mux->rx_lock); /* Need to run kcm_done in a task since we need to qcquire * callback locks which may already be held here. */ INIT_WORK(&kcm->done_work, kcm_done_work); schedule_work(&kcm->done_work); return; } if (unlikely(kcm->rx_disabled)) { requeue_rx_msgs(mux, &kcm->sk.sk_receive_queue); } else if (rcv_ready || unlikely(!sk_rmem_alloc_get(&kcm->sk))) { /* Check for degenerative race with rx_wait that all * data was dequeued (accounted for in kcm_rfree). */ kcm_rcv_ready(kcm); } spin_unlock_bh(&mux->rx_lock); }

Contributors

PersonTokensPropCommitsCommitProp
Tom Herbert157100.00%1100.00%
Total157100.00%1100.00%

/* Lower sock lock held */
static void psock_data_ready(struct sock *sk) { struct kcm_psock *psock; read_lock_bh(&sk->sk_callback_lock); psock = (struct kcm_psock *)sk->sk_user_data; if (likely(psock)) strp_data_ready(&psock->strp); read_unlock_bh(&sk->sk_callback_lock); }

Contributors

PersonTokensPropCommitsCommitProp
Tom Herbert58100.00%3100.00%
Total58100.00%3100.00%

/* Called with lower sock held */
static void kcm_rcv_strparser(struct strparser *strp, struct sk_buff *skb) { struct kcm_psock *psock = container_of(strp, struct kcm_psock, strp); struct kcm_sock *kcm; try_queue: kcm = reserve_rx_kcm(psock, skb); if (!kcm) { /* Unable to reserve a KCM, message is held in psock and strp * is paused. */ return; } if (kcm_queue_rcv_skb(&kcm->sk, skb)) { /* Should mean socket buffer full */ unreserve_rx_kcm(psock, false); goto try_queue; } }

Contributors

PersonTokensPropCommitsCommitProp
Tom Herbert81100.00%3100.00%
Total81100.00%3100.00%


static int kcm_parse_func_strparser(struct strparser *strp, struct sk_buff *skb) { struct kcm_psock *psock = container_of(strp, struct kcm_psock, strp); struct bpf_prog *prog = psock->bpf_prog; return (*prog->bpf_func)(skb, prog->insnsi); }

Contributors

PersonTokensPropCommitsCommitProp
Tom Herbert55100.00%2100.00%
Total55100.00%2100.00%


static int kcm_read_sock_done(struct strparser *strp, int err) { struct kcm_psock *psock = container_of(strp, struct kcm_psock, strp); unreserve_rx_kcm(psock, true); return err; }

Contributors

PersonTokensPropCommitsCommitProp
Tom Herbert39100.00%2100.00%
Total39100.00%2100.00%


static void psock_state_change(struct sock *sk) { /* TCP only does a POLLIN for a half close. Do a POLLHUP here * since application will normally not poll with POLLIN * on the TCP sockets. */ report_csk_error(sk, EPIPE); }

Contributors

PersonTokensPropCommitsCommitProp
Tom Herbert19100.00%2100.00%
Total19100.00%2100.00%


static void psock_write_space(struct sock *sk) { struct kcm_psock *psock; struct kcm_mux *mux; struct kcm_sock *kcm; read_lock_bh(&sk->sk_callback_lock); psock = (struct kcm_psock *)sk->sk_user_data; if (unlikely(!psock)) goto out; mux = psock->mux; spin_lock_bh(&mux->lock); /* Check if the socket is reserved so someone is waiting for sending. */ kcm = psock->tx_kcm; if (kcm && !unlikely(kcm->tx_stopped)) queue_work(kcm_wq, &kcm->tx_work); spin_unlock_bh(&mux->lock); out: read_unlock_bh(&sk->sk_callback_lock); }

Contributors

PersonTokensPropCommitsCommitProp
Tom Herbert117100.00%3100.00%
Total117100.00%3100.00%

static void unreserve_psock(struct kcm_sock *kcm); /* kcm sock is locked. */
static struct kcm_psock *reserve_psock(struct kcm_sock *kcm) { struct kcm_mux *mux = kcm->mux; struct kcm_psock *psock; psock = kcm->tx_psock; smp_rmb(); /* Must read tx_psock before tx_wait */ if (psock) { WARN_ON(kcm->tx_wait); if (unlikely(psock->tx_stopped)) unreserve_psock(kcm); else return kcm->tx_psock; } spin_lock_bh(&mux->lock); /* Check again under lock to see if psock was reserved for this * psock via psock_unreserve. */ psock = kcm->tx_psock; if (unlikely(psock)) { WARN_ON(kcm->tx_wait); spin_unlock_bh(&mux->lock); return kcm->tx_psock; } if (!list_empty(&mux->psocks_avail)) { psock = list_first_entry(&mux->psocks_avail, struct kcm_psock, psock_avail_list); list_del(&psock->psock_avail_list); if (kcm->tx_wait) { list_del(&kcm->wait_psock_list); kcm->tx_wait = false; } kcm->tx_psock = psock; psock->tx_kcm = kcm; KCM_STATS_INCR(psock->stats.reserved); } else if (!kcm->tx_wait) { list_add_tail(&kcm->wait_psock_list, &mux->kcm_tx_waiters); kcm->tx_wait = true; } spin_unlock_bh(&mux->lock); return psock; }

Contributors

PersonTokensPropCommitsCommitProp
Tom Herbert233100.00%2100.00%
Total233100.00%2100.00%

/* mux lock held */
static void psock_now_avail(struct kcm_psock *psock) { struct kcm_mux *mux = psock->mux; struct kcm_sock *kcm; if (list_empty(&mux->kcm_tx_waiters)) { list_add_tail(&psock->psock_avail_list, &mux->psocks_avail); } else { kcm = list_first_entry(&mux->kcm_tx_waiters, struct kcm_sock, wait_psock_list); list_del(&kcm->wait_psock_list); kcm->tx_wait = false; psock->tx_kcm = kcm; /* Commit before changing tx_psock since that is read in * reserve_psock before queuing work. */ smp_mb(); kcm->tx_psock = psock; KCM_STATS_INCR(psock->stats.reserved); queue_work(kcm_wq, &kcm->tx_work); } }

Contributors

PersonTokensPropCommitsCommitProp
Tom Herbert117100.00%2100.00%
Total117100.00%2100.00%

/* kcm sock is locked. */
static void unreserve_psock(struct kcm_sock *kcm) { struct kcm_psock *psock; struct kcm_mux *mux = kcm->mux; spin_lock_bh(&mux->lock); psock = kcm->tx_psock; if (WARN_ON(!psock)) { spin_unlock_bh(&mux->lock); return; } smp_rmb(); /* Read tx_psock before tx_wait */ kcm_update_tx_mux_stats(mux, psock); WARN_ON(kcm->tx_wait); kcm->tx_psock = NULL; psock->tx_kcm = NULL; KCM_STATS_INCR(psock->stats.unreserved); if (unlikely(psock->tx_stopped)) { if (psock->done) { /* Deferred free */ list_del(&psock->psock_list); mux->psocks_cnt--; sock_put(psock->sk); fput(psock->sk->sk_socket->file); kmem_cache_free(kcm_psockp, psock); } /* Don't put back on available list */ spin_unlock_bh(&mux->lock); return; } psock_now_avail(psock); spin_unlock_bh(&mux->lock); }

Contributors

PersonTokensPropCommitsCommitProp
Tom Herbert178100.00%2100.00%
Total178100.00%2100.00%


static void kcm_report_tx_retry(struct kcm_sock *kcm) { struct kcm_mux *mux = kcm->mux; spin_lock_bh(&mux->lock); KCM_STATS_INCR(mux->stats.tx_retries); spin_unlock_bh(&mux->lock); }

Contributors

PersonTokensPropCommitsCommitProp
Tom Herbert45100.00%1100.00%
Total45100.00%1100.00%

/* Write any messages ready on the kcm socket. Called with kcm sock lock * held. Return bytes actually sent or error. */
static int kcm_write_msgs(struct kcm_sock *kcm) { struct sock *sk = &kcm->sk; struct kcm_psock *psock; struct sk_buff *skb, *head; struct kcm_tx_msg *txm; unsigned short fragidx, frag_offset; unsigned int sent, total_sent = 0; int ret = 0; kcm->tx_wait_more = false; psock = kcm->tx_psock; if (unlikely(psock && psock->tx_stopped)) { /* A reserved psock was aborted asynchronously. Unreserve * it and we'll retry the message. */ unreserve_psock(kcm); kcm_report_tx_retry(kcm); if (skb_queue_empty(&sk->sk_write_queue)) return 0; kcm_tx_msg(skb_peek(&sk->sk_write_queue))->sent = 0; } else if (skb_queue_empty(&sk->sk_write_queue)) { return 0; } head = skb_peek(&sk->sk_write_queue); txm = kcm_tx_msg(head); if (txm->sent) { /* Send of first skbuff in queue already in progress */ if (WARN_ON(!psock)) { ret = -EINVAL; goto out; } sent = txm->sent; frag_offset = txm->frag_offset; fragidx = txm->fragidx; skb = txm->frag_skb; goto do_frag; } try_again: psock = reserve_psock(kcm); if (!psock) goto out; do { skb = head; txm = kcm_tx_msg(head); sent = 0; do_frag_list: if (WARN_ON(!skb_shinfo(skb)->nr_frags)) { ret = -EINVAL; goto out; } for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags; fragidx++) { skb_frag_t *frag; frag_offset = 0; do_frag: frag = &skb_shinfo(skb)->frags[fragidx]; if (WARN_ON(!frag->size)) { ret = -EINVAL; goto out; } ret = kernel_sendpage(psock->sk->sk_socket, frag->page.p, frag->page_offset + frag_offset, frag->size - frag_offset, MSG_DONTWAIT); if (ret <= 0) { if (ret == -EAGAIN) { /* Save state to try again when there's * write space on the socket */ txm->sent = sent; txm->frag_offset = frag_offset; txm->fragidx = fragidx; txm->frag_skb = skb; ret = 0; goto out; } /* Hard failure in sending message, abort this * psock since it has lost framing * synchonization and retry sending the * message from the beginning. */ kcm_abort_tx_psock(psock, ret ? -ret : EPIPE, true); unreserve_psock(kcm); txm->sent = 0; kcm_report_tx_retry(kcm); ret = 0; goto try_again; } sent += ret; frag_offset += ret; KCM_STATS_ADD(psock->stats.tx_bytes, ret); if (frag_offset < frag->size) { /* Not finished with this frag */ goto do_frag; } } if (skb == head) { if (skb_has_frag_list(skb)) { skb = skb_shinfo(skb)->frag_list; goto do_frag_list; } } else if (skb->next) { skb = skb->next; goto do_frag_list; } /* Successfully sent the whole packet, account for it. */ skb_dequeue(&sk->sk_write_queue); kfree_skb(head); sk->sk_wmem_queued -= sent; total_sent += sent; KCM_STATS_INCR(psock->stats.tx_msgs); } while ((head = skb_peek(&sk->sk_write_queue))); out: if (!head) { /* Done with all queued messages. */ WARN_ON(!skb_queue_empty(&sk->sk_write_queue)); unreserve_psock(kcm); } /* Check if write space is available */ sk->sk_write_space(sk); return total_sent ? : ret; }

Contributors

PersonTokensPropCommitsCommitProp
Tom Herbert619100.00%2100.00%
Total619100.00%2100.00%


static void kcm_tx_work(struct work_struct *w) { struct kcm_sock *kcm = container_of(w, struct kcm_sock, tx_work); struct sock *sk = &kcm->sk; int err; lock_sock(sk); /* Primarily for SOCK_DGRAM sockets, also handle asynchronous tx * aborts */ err = kcm_write_msgs(kcm); if (err < 0) { /* Hard failure in write, report error on KCM socket */ pr_warn("KCM: Hard failure on kcm_write_msgs %d\n", err); report_csk_error(&kcm->sk, -err); goto out; } /* Primarily for SOCK_SEQPACKET sockets */ if (likely(sk->sk_socket) && test_bit(SOCK_NOSPACE, &sk