Contributors: 3
Author Tokens Token Proportion Commits Commit Proportion
Martin KaFai Lau 1370 99.35% 3 60.00%
Daniel Borkmann 5 0.36% 1 20.00%
Andrii Nakryiko 4 0.29% 1 20.00%
Total 1379 5


// SPDX-License-Identifier: GPL-2.0
// Copyright (c) 2022 Meta

#include <stddef.h>
#include <stdint.h>
#include <stdbool.h>
#include <linux/bpf.h>
#include <linux/stddef.h>
#include <linux/pkt_cls.h>
#include <linux/if_ether.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

/* veth_src --- veth_src_fwd --- veth_dst_fwd --- veth_dst
 *           |                                 |
 *  ns_src   |              ns_fwd             |   ns_dst
 *
 * ns_src and ns_dst: ENDHOST namespace
 *            ns_fwd: Forwarding namespace
 */

/* Turn an integer __sk_buff context field (e.g. data, data_end) into a
 * pointer. The whole expansion is parenthesized so that it stays a single
 * operand when used next to sizeof or other operators.
 */
#define ctx_ptr(field)		((void *)(long)(field))

/* IPv4 source addresses that skb_get_type() compares against iph->saddr
 * to decide which end host sent a packet. Stored in network byte order.
 */
#define ip4_src			__bpf_htonl(0xac100164) /* 172.16.1.100 */
#define ip4_dst			__bpf_htonl(0xac100264) /* 172.16.2.100 */

/* IPv6 counterparts, written as 16-byte initializer lists:
 * ip6_src is ::1:dead:beef:cafe
 */
#define ip6_src			{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
				  0x00, 0x01, 0xde, 0xad, 0xbe, 0xef, 0xca, 0xfe }
/* ip6_dst is ::2:dead:beef:cafe */
#define ip6_dst			{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
				  0x00, 0x02, 0xde, 0xad, 0xbe, 0xef, 0xca, 0xfe }

/* True when two struct in6_addr values hold the same address, compared
 * as four 32-bit words. Arguments are parenthesized so that expressions
 * such as *ptr can be passed. Each argument is expanded four times, so
 * pass only side-effect-free expressions.
 */
#define v6_equal(a, b)		((a).s6_addr32[0] == (b).s6_addr32[0] && \
				 (a).s6_addr32[1] == (b).s6_addr32[1] && \
				 (a).s6_addr32[2] == (b).s6_addr32[2] && \
				 (a).s6_addr32[3] == (b).s6_addr32[3])

/* Interface indexes passed to bpf_redirect_neigh() by
 * ingress_fwdns_prio101(): packets sent by ns_src are redirected to
 * IFINDEX_DST and packets sent by ns_dst to IFINDEX_SRC.
 * NOTE(review): no value is assigned in this file; presumably the
 * userspace loader fills them in before load -- confirm against the caller.
 */
volatile const __u32 IFINDEX_SRC;
volatile const __u32 IFINDEX_DST;

/* Marker values written into skb->tstamp by one hook and checked by the
 * hooks further down the path:
 *   EGRESS_ENDHOST_MAGIC: written by egress_host()
 *   INGRESS_FWDNS_MAGIC:  written by ingress_fwdns_prio101()
 *   EGRESS_FWDNS_MAGIC:   written by egress_fwdns_prio101()
 */
#define EGRESS_ENDHOST_MAGIC	0x0b9fbeef
#define INGRESS_FWDNS_MAGIC	0x1b9fbeef
#define EGRESS_FWDNS_MAGIC	0x2b9fbeef

/* Second index into dtimes[][] and errs[][]: one slot per tc program that
 * inspects the timestamp, plus one for bpf_skb_set_tstamp() results.
 */
enum {
	INGRESS_FWDNS_P100,	/* ingress_fwdns_prio100() */
	INGRESS_FWDNS_P101,	/* ingress_fwdns_prio101() */
	EGRESS_FWDNS_P100,	/* egress_fwdns_prio100() */
	EGRESS_FWDNS_P101,	/* egress_fwdns_prio101() */
	INGRESS_ENDHOST,	/* ingress_host() */
	EGRESS_ENDHOST,		/* egress_host() */
	SET_DTIME,		/* unexpected bpf_skb_set_tstamp() return */
	__MAX_CNT,		/* number of slots, not a slot itself */
};

/* Values of the global 'test' and first index into dtimes[][] and errs[][].
 * The order matters:
 *  - bpf_fwd() is true for every value below TCP_IP4_RT_FWD
 *  - skb_get_type() matches traffic on port 50000 + test
 */
enum {
	TCP_IP6_CLEAR_DTIME,	/* the ns_fwd prio100 programs zero skb->tstamp */
	TCP_IP4,
	TCP_IP6,
	UDP_IP4,
	UDP_IP6,
	TCP_IP4_RT_FWD,
	TCP_IP6_RT_FWD,
	UDP_IP4_RT_FWD,
	UDP_IP6_RT_FWD,
	UKN_TEST,		/* row used when 'test' is out of range */
	__NR_TESTS,		/* number of rows, not a test itself */
};

/* Sender namespace as determined by skb_get_type() from the source
 * address. Values start at 1 so that 0 can mean "neither end host".
 */
enum {
	SRC_NS = 1,
	DST_NS,
};

/* Per-test counters, indexed [test][hook]:
 *  dtimes: incremented via inc_dtimes() when a hook sees the expected
 *          timestamp
 *  errs:   incremented via inc_errs() when a hook sees something unexpected
 */
__u32 dtimes[__NR_TESTS][__MAX_CNT] = {};
__u32 errs[__NR_TESTS][__MAX_CNT] = {};
/* Currently running test (one of the TCP_... / UDP_... enum values).
 * NOTE(review): never written in this file; presumably set from userspace.
 */
__u32 test = 0;

/* Bump the dtimes counter @idx of the running test. An out-of-range
 * 'test' value is accounted in the UKN_TEST row instead.
 */
static void inc_dtimes(__u32 idx)
{
	__u32 row = UKN_TEST;

	if (test < __NR_TESTS)
		row = test;

	dtimes[row][idx]++;
}

/* Bump the errs counter @idx of the running test. An out-of-range
 * 'test' value is accounted in the UKN_TEST row instead.
 */
static void inc_errs(__u32 idx)
{
	__u32 row = UKN_TEST;

	if (test < __NR_TESTS)
		row = test;

	errs[row][idx]++;
}

/* Return the lowest byte of @type, which is where skb_get_type() stores
 * the IP protocol number.
 */
static int skb_proto(int type)
{
	const int proto_mask = 0xff;

	return type & proto_mask;
}

/* Return the second byte of @type, which is where skb_get_type() stores
 * the sender namespace (SRC_NS or DST_NS).
 */
static int skb_ns(int type)
{
	int upper = type >> 8;

	return upper & 0xff;
}

static bool fwdns_clear_dtime(void)
{
	return test == TCP_IP6_CLEAR_DTIME;
}

/* True for every test enumerated before TCP_IP4_RT_FWD. For those,
 * ingress_fwdns_prio101() forwards with bpf_redirect_neigh(); for the
 * remaining tests it returns TC_ACT_OK instead.
 */
static bool bpf_fwd(void)
{
	if (test >= TCP_IP4_RT_FWD)
		return false;

	return true;
}

static __u8 get_proto(void)
{
	switch (test) {
	case UDP_IP4:
	case UDP_IP6:
	case UDP_IP4_RT_FWD:
	case UDP_IP6_RT_FWD:
		return IPPROTO_UDP;
	default:
		return IPPROTO_TCP;
	}
}

/* Classify @skb as test traffic or not.
 *
 * Return value, with the action the callers take on it:
 * -1: parse error: TC_ACT_SHOT
 *  0: not testing traffic: TC_ACT_OK
 * >0: first byte is the inet_proto, second byte has the netns
 *     of the sender
 *
 * Use skb_proto() and skb_ns() to decode a positive return value.
 */
static int skb_get_type(struct __sk_buff *skb)
{
	/* Port used on the ns_dst side; it differs per test (50000 + test). */
	__u16 dst_ns_port = __bpf_htons(50000 + test);
	void *data_end = ctx_ptr(skb->data_end);
	void *data = ctx_ptr(skb->data);
	__u8 inet_proto = 0, ns = 0;
	struct ipv6hdr *ip6h;
	__u16 sport, dport;
	struct iphdr *iph;
	struct tcphdr *th;
	struct udphdr *uh;
	void *trans;

	/* Find the sender namespace from the source address and locate the
	 * transport header. Every header is bounds-checked against data_end
	 * before it is read.
	 */
	switch (skb->protocol) {
	case __bpf_htons(ETH_P_IP):
		iph = data + sizeof(struct ethhdr);
		if (iph + 1 > data_end)
			return -1;
		if (iph->saddr == ip4_src)
			ns = SRC_NS;
		else if (iph->saddr == ip4_dst)
			ns = DST_NS;
		inet_proto = iph->protocol;
		/* Transport header is taken right after the fixed-size
		 * iphdr; IPv4 options are not accounted for.
		 */
		trans = iph + 1;
		break;
	case __bpf_htons(ETH_P_IPV6):
		ip6h = data + sizeof(struct ethhdr);
		if (ip6h + 1 > data_end)
			return -1;
		if (v6_equal(ip6h->saddr, (struct in6_addr){{ip6_src}}))
			ns = SRC_NS;
		else if (v6_equal(ip6h->saddr, (struct in6_addr){{ip6_dst}}))
			ns = DST_NS;
		/* nexthdr is used as-is; extension headers are not walked. */
		inet_proto = ip6h->nexthdr;
		trans = ip6h + 1;
		break;
	default:
		/* Neither IPv4 nor IPv6: not test traffic. */
		return 0;
	}

	/* skb is not from src_ns or dst_ns.
	 * skb is not the testing IPPROTO.
	 */
	if (!ns || inet_proto != get_proto())
		return 0;

	/* Read the ports of the transport header. */
	switch (inet_proto) {
	case IPPROTO_TCP:
		th = trans;
		if (th + 1 > data_end)
			return -1;
		sport = th->source;
		dport = th->dest;
		break;
	case IPPROTO_UDP:
		uh = trans;
		if (uh + 1 > data_end)
			return -1;
		sport = uh->source;
		dport = uh->dest;
		break;
	default:
		return 0;
	}

	/* The skb is the testing traffic: either sent by ns_src towards
	 * dst_ns_port, or sent by ns_dst from dst_ns_port.
	 */
	if ((ns == SRC_NS && dport == dst_ns_port) ||
	    (ns == DST_NS && sport == dst_ns_port))
		return (ns << 8 | inet_proto);

	return 0;
}

/* Program naming format: direction@iface@netns
 *
 * egress@veth_(src|dst)@ns_(src|dst)
 *
 * Checks the timestamp of test traffic leaving an end host and then
 * stamps it with EGRESS_ENDHOST_MAGIC:
 *  - TCP must carry a non-zero tstamp of type BPF_SKB_TSTAMP_DELIVERY_MONO
 *  - anything else must carry a non-zero tstamp of type
 *    BPF_SKB_TSTAMP_UNSPEC
 * The result is recorded in the EGRESS_ENDHOST slot of dtimes/errs.
 */
SEC("tc")
int egress_host(struct __sk_buff *skb)
{
	__u8 want_tstamp_type;
	int type;

	type = skb_get_type(skb);
	switch (type) {
	case -1:
		return TC_ACT_SHOT;
	case 0:
		return TC_ACT_OK;
	default:
		break;
	}

	if (skb_proto(type) == IPPROTO_TCP)
		want_tstamp_type = BPF_SKB_TSTAMP_DELIVERY_MONO;
	else
		want_tstamp_type = BPF_SKB_TSTAMP_UNSPEC;

	/* skb->tstamp is only looked at once the type matched. */
	if (skb->tstamp_type != want_tstamp_type || !skb->tstamp)
		inc_errs(EGRESS_ENDHOST);
	else
		inc_dtimes(EGRESS_ENDHOST);

	skb->tstamp = EGRESS_ENDHOST_MAGIC;

	return TC_ACT_OK;
}

/* ingress@veth_(src|dst)@ns_(src|dst)
 *
 * Test traffic arriving at an end host must still carry the value that
 * egress_fwdns_prio101() wrote (EGRESS_FWDNS_MAGIC) with type
 * BPF_SKB_TSTAMP_DELIVERY_MONO. The result is recorded in the
 * INGRESS_ENDHOST slot of dtimes/errs.
 */
SEC("tc")
int ingress_host(struct __sk_buff *skb)
{
	bool dtime_kept;

	switch (skb_get_type(skb)) {
	case -1:
		return TC_ACT_SHOT;
	case 0:
		return TC_ACT_OK;
	default:
		break;
	}

	dtime_kept = skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_MONO &&
		     skb->tstamp == EGRESS_FWDNS_MAGIC;

	if (dtime_kept)
		inc_dtimes(INGRESS_ENDHOST);
	else
		inc_errs(INGRESS_ENDHOST);

	return TC_ACT_OK;
}

/* ingress@veth_(src|dst)_fwd@ns_fwd priority 100
 *
 * This program never reads skb->tstamp_type, so it must not observe the
 * delivery time stamped by egress_host(). For the TCP_IP6_CLEAR_DTIME
 * test it also zeroes skb->tstamp. Test traffic is passed on with
 * TC_ACT_UNSPEC.
 */
SEC("tc")
int ingress_fwdns_prio100(struct __sk_buff *skb)
{
	int type = skb_get_type(skb);

	switch (type) {
	case -1:
		return TC_ACT_SHOT;
	case 0:
		return TC_ACT_OK;
	default:
		break;
	}

	/* delivery_time is only available to the ingress
	 * if the tc-bpf checks the skb->tstamp_type, which
	 * this program does not do.
	 */
	if (skb->tstamp == EGRESS_ENDHOST_MAGIC)
		inc_errs(INGRESS_FWDNS_P100);

	if (fwdns_clear_dtime())
		skb->tstamp = 0;

	return TC_ACT_UNSPEC;
}

/* egress@veth_(src|dst)_fwd@ns_fwd priority 100
 *
 * This program never reads skb->tstamp_type either, yet on egress it
 * must see the value written by ingress_fwdns_prio101(). The result is
 * recorded in the EGRESS_FWDNS_P100 slot. For the TCP_IP6_CLEAR_DTIME
 * test it also zeroes skb->tstamp. Test traffic is passed on with
 * TC_ACT_UNSPEC.
 */
SEC("tc")
int egress_fwdns_prio100(struct __sk_buff *skb)
{
	int type = skb_get_type(skb);

	switch (type) {
	case -1:
		return TC_ACT_SHOT;
	case 0:
		return TC_ACT_OK;
	default:
		break;
	}

	/* delivery_time is always available to egress, even
	 * if the tc-bpf did not use the tstamp_type.
	 */
	if (skb->tstamp != INGRESS_FWDNS_MAGIC)
		inc_errs(EGRESS_FWDNS_P100);
	else
		inc_dtimes(EGRESS_FWDNS_P100);

	if (fwdns_clear_dtime())
		skb->tstamp = 0;

	return TC_ACT_UNSPEC;
}

/* ingress@veth_(src|dst)_fwd@ns_fwd priority 101
 *
 * Unlike the prio100 program, this one reads skb->tstamp_type:
 *  1. Verify the incoming timestamp: EGRESS_ENDHOST_MAGIC for TCP, 0 for
 *     UDP. Any typed timestamp is an error in the TCP_IP6_CLEAR_DTIME
 *     test.
 *  2. Stamp the skb with INGRESS_FWDNS_MAGIC as a mono delivery time.
 *  3. Forward: bpf_redirect_neigh() to the opposite interface when
 *     bpf_fwd() is true, TC_ACT_OK otherwise.
 *
 * Non-test traffic is dropped here since prio100 already returned
 * TC_ACT_OK for it.
 */
SEC("tc")
int ingress_fwdns_prio101(struct __sk_buff *skb)
{
	__u64 want_dtime = EGRESS_ENDHOST_MAGIC;
	__u32 out_ifindex;
	int type;

	type = skb_get_type(skb);
	if (type == 0 || type == -1)
		/* Should have handled in prio100 */
		return TC_ACT_SHOT;

	if (skb_proto(type) == IPPROTO_UDP)
		want_dtime = 0;

	if (!skb->tstamp_type) {
		/* No typed timestamp: only fine when none was expected. */
		if (want_dtime && !fwdns_clear_dtime())
			inc_errs(INGRESS_FWDNS_P101);
	} else if (!fwdns_clear_dtime() &&
		   skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_MONO &&
		   skb->tstamp == want_dtime) {
		inc_dtimes(INGRESS_FWDNS_P101);
	} else {
		inc_errs(INGRESS_FWDNS_P101);
	}

	if (skb->tstamp_type != BPF_SKB_TSTAMP_DELIVERY_MONO) {
		/* Type has to change, so go through the helper. */
		if (bpf_skb_set_tstamp(skb, INGRESS_FWDNS_MAGIC,
				       BPF_SKB_TSTAMP_DELIVERY_MONO))
			inc_errs(SET_DTIME);
		/* Success of this second call is what gets counted as an
		 * error: non-zero tstamp with BPF_SKB_TSTAMP_UNSPEC.
		 */
		if (!bpf_skb_set_tstamp(skb, INGRESS_FWDNS_MAGIC,
					BPF_SKB_TSTAMP_UNSPEC))
			inc_errs(SET_DTIME);
	} else {
		skb->tstamp = INGRESS_FWDNS_MAGIC;
	}

	if (!bpf_fwd())
		return TC_ACT_OK;

	/* Send the packet out of the interface facing the other end host. */
	if (skb_ns(type) == SRC_NS)
		out_ifindex = IFINDEX_DST;
	else
		out_ifindex = IFINDEX_SRC;

	return bpf_redirect_neigh(out_ifindex, NULL, 0, 0);
}

/* egress@veth_(src|dst)_fwd@ns_fwd priority 101
 *
 * Verifies that test traffic leaving ns_fwd carries the mono delivery
 * time written by ingress_fwdns_prio101() (INGRESS_FWDNS_MAGIC), then
 * re-stamps it with EGRESS_FWDNS_MAGIC for ingress_host() to check.
 * In the TCP_IP6_CLEAR_DTIME test no typed timestamp is expected at all.
 *
 * Non-test traffic is dropped here since prio100 already returned
 * TC_ACT_OK for it.
 */
SEC("tc")
int egress_fwdns_prio101(struct __sk_buff *skb)
{
	int skb_type;

	skb_type = skb_get_type(skb);
	if (skb_type == -1 || !skb_type)
		/* Should have handled in prio100 */
		return TC_ACT_SHOT;

	if (skb->tstamp_type) {
		if (fwdns_clear_dtime() ||
		    skb->tstamp_type != BPF_SKB_TSTAMP_DELIVERY_MONO ||
		    skb->tstamp != INGRESS_FWDNS_MAGIC)
			inc_errs(EGRESS_FWDNS_P101);
		else
			inc_dtimes(EGRESS_FWDNS_P101);
	} else {
		/* A missing typed timestamp is only fine when it was cleared. */
		if (!fwdns_clear_dtime())
			inc_errs(EGRESS_FWDNS_P101);
	}

	if (skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_MONO) {
		/* Type is already right: a plain store is enough. */
		skb->tstamp = EGRESS_FWDNS_MAGIC;
	} else {
		/* Type has to change, so go through the helper. */
		if (bpf_skb_set_tstamp(skb, EGRESS_FWDNS_MAGIC,
				       BPF_SKB_TSTAMP_DELIVERY_MONO))
			inc_errs(SET_DTIME);
		/* Success of this second call is what gets counted as an
		 * error: non-zero tstamp with BPF_SKB_TSTAMP_UNSPEC. It
		 * uses this program's own magic so that, should the call
		 * ever succeed, the skb is not left with the ingress
		 * program's marker.
		 */
		if (!bpf_skb_set_tstamp(skb, EGRESS_FWDNS_MAGIC,
					BPF_SKB_TSTAMP_UNSPEC))
			inc_errs(SET_DTIME);
	}

	return TC_ACT_OK;
}

/* License string of this BPF object, emitted into the "license" section. */
char __license[] SEC("license") = "GPL";