cregit-Linux how code gets into the kernel

Release 4.8 net/openvswitch/conntrack.c

Directory: net/openvswitch
 * Copyright (c) 2015 Nicira, Inc.
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * General Public License for more details.

#include <linux/module.h>
#include <linux/openvswitch.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/sctp.h>
#include <net/ip.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_labels.h>
#include <net/netfilter/nf_conntrack_seqadj.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/ipv6/nf_defrag_ipv6.h>

#include <linux/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_core.h>
#include <net/netfilter/nf_nat_l3proto.h>

#include "datapath.h"
#include "conntrack.h"
#include "flow.h"
#include "flow_netlink.h"

struct ovs_ct_len_tbl {
int maxlen;
int minlen;

/* Metadata mark for masked write to conntrack mark */

struct md_mark {
u32 value;
u32 mask;

/* Metadata label for masked write to conntrack label. */

struct md_labels {
struct ovs_key_ct_labels value;
struct ovs_key_ct_labels mask;

enum ovs_ct_nat {
OVS_CT_NAT = 1 << 0,     /* NAT for committed connections only. */
OVS_CT_SRC_NAT = 1 << 1, /* Source NAT for NEW connections. */
OVS_CT_DST_NAT = 1 << 2, /* Destination NAT for NEW connections. */

/* Conntrack action context for execution. */

struct ovs_conntrack_info {
struct nf_conntrack_helper *helper;
struct nf_conntrack_zone zone;
struct nf_conn *ct;
u8 commit : 1;
u8 nat : 3;                 /* enum ovs_ct_nat */
u16 family;
struct md_mark mark;
struct md_labels labels;
struct nf_nat_range range;  /* Only present for SRC NAT and DST NAT. */

static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info);

static u16 key_to_nfproto(const struct sw_flow_key *key) { switch (ntohs(key->eth.type)) { case ETH_P_IP: return NFPROTO_IPV4; case ETH_P_IPV6: return NFPROTO_IPV6; default: return NFPROTO_UNSPEC; } }


joe stringerjoe stringer41100.00%1100.00%

/* Map SKB connection state into the values used by flow definition. */
static u8 ovs_ct_get_state(enum ip_conntrack_info ctinfo) { u8 ct_state = OVS_CS_F_TRACKED; switch (ctinfo) { case IP_CT_ESTABLISHED_REPLY: case IP_CT_RELATED_REPLY: ct_state |= OVS_CS_F_REPLY_DIR; break; default: break; } switch (ctinfo) { case IP_CT_ESTABLISHED: case IP_CT_ESTABLISHED_REPLY: ct_state |= OVS_CS_F_ESTABLISHED; break; case IP_CT_RELATED: case IP_CT_RELATED_REPLY: ct_state |= OVS_CS_F_RELATED; break; case IP_CT_NEW: ct_state |= OVS_CS_F_NEW; break; default: break; } return ct_state; }


joe stringerjoe stringer75100.00%1100.00%

static u32 ovs_ct_get_mark(const struct nf_conn *ct) { #if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) return ct ? ct->mark : 0; #else return 0; #endif }


joe stringerjoe stringer34100.00%1100.00%

static void ovs_ct_get_labels(const struct nf_conn *ct, struct ovs_key_ct_labels *labels) { struct nf_conn_labels *cl = ct ? nf_ct_labels_find(ct) : NULL; if (cl) { size_t len = sizeof(cl->bits); if (len > OVS_CT_LABELS_LEN) len = OVS_CT_LABELS_LEN; else if (len < OVS_CT_LABELS_LEN) memset(labels, 0, OVS_CT_LABELS_LEN); memcpy(labels, cl->bits, len); } else { memset(labels, 0, OVS_CT_LABELS_LEN); } }


joe stringerjoe stringer9396.88%266.67%
florian westphalflorian westphal33.12%133.33%

static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state, const struct nf_conntrack_zone *zone, const struct nf_conn *ct) { key->ct.state = state; key-> = zone->id; key->ct.mark = ovs_ct_get_mark(ct); ovs_ct_get_labels(ct, &key->ct.labels); }


joe stringerjoe stringer67100.00%5100.00%

/* Update 'key' based on skb->nfct. If 'post_ct' is true, then OVS has * previously sent the packet to conntrack via the ct action. If * 'keep_nat_flags' is true, the existing NAT flags retained, else they are * initialized from the connection status. */
static void ovs_ct_update_key(const struct sk_buff *skb, const struct ovs_conntrack_info *info, struct sw_flow_key *key, bool post_ct, bool keep_nat_flags) { const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; enum ip_conntrack_info ctinfo; struct nf_conn *ct; u8 state = 0; ct = nf_ct_get(skb, &ctinfo); if (ct) { state = ovs_ct_get_state(ctinfo); /* All unconfirmed entries are NEW connections. */ if (!nf_ct_is_confirmed(ct)) state |= OVS_CS_F_NEW; /* OVS persists the related flag for the duration of the * connection. */ if (ct->master) state |= OVS_CS_F_RELATED; if (keep_nat_flags) { state |= key->ct.state & OVS_CS_F_NAT_MASK; } else { if (ct->status & IPS_SRC_NAT) state |= OVS_CS_F_SRC_NAT; if (ct->status & IPS_DST_NAT) state |= OVS_CS_F_DST_NAT; } zone = nf_ct_zone(ct); } else if (post_ct) { state = OVS_CS_F_TRACKED | OVS_CS_F_INVALID; if (info) zone = &info->zone; } __ovs_ct_update_key(key, state, zone, ct); }


joe stringerjoe stringer13673.91%466.67%
jarno rajahalmejarno rajahalme4826.09%233.33%

/* This is called to initialize CT key fields possibly coming in from the local * stack. */
void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key) { ovs_ct_update_key(skb, NULL, key, false, false); }


joe stringerjoe stringer2793.10%266.67%
jarno rajahalmejarno rajahalme26.90%133.33%

int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb) { if (nla_put_u32(skb, OVS_KEY_ATTR_CT_STATE, key->ct.state)) return -EMSGSIZE; if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) && nla_put_u16(skb, OVS_KEY_ATTR_CT_ZONE, key-> return -EMSGSIZE; if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) && nla_put_u32(skb, OVS_KEY_ATTR_CT_MARK, key->ct.mark)) return -EMSGSIZE; if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) && nla_put(skb, OVS_KEY_ATTR_CT_LABELS, sizeof(key->ct.labels), &key->ct.labels)) return -EMSGSIZE; return 0; }


joe stringerjoe stringer11999.17%583.33%
valentin rothbergvalentin rothberg10.83%116.67%

static int ovs_ct_set_mark(struct sk_buff *skb, struct sw_flow_key *key, u32 ct_mark, u32 mask) { #if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) enum ip_conntrack_info ctinfo; struct nf_conn *ct; u32 new_mark; /* The connection could be invalid, in which case set_mark is no-op. */ ct = nf_ct_get(skb, &ctinfo); if (!ct) return 0; new_mark = ct_mark | (ct->mark & ~(mask)); if (ct->mark != new_mark) { ct->mark = new_mark; nf_conntrack_event_cache(IPCT_MARK, ct); key->ct.mark = new_mark; } return 0; #else return -ENOTSUPP; #endif }


joe stringerjoe stringer116100.00%3100.00%

static int ovs_ct_set_labels(struct sk_buff *skb, struct sw_flow_key *key, const struct ovs_key_ct_labels *labels, const struct ovs_key_ct_labels *mask) { enum ip_conntrack_info ctinfo; struct nf_conn_labels *cl; struct nf_conn *ct; int err; /* The connection could be invalid, in which case set_label is no-op.*/ ct = nf_ct_get(skb, &ctinfo); if (!ct) return 0; cl = nf_ct_labels_find(ct); if (!cl) { nf_ct_labels_ext_add(ct); cl = nf_ct_labels_find(ct); } if (!cl || sizeof(cl->bits) < OVS_CT_LABELS_LEN) return -ENOSPC; err = nf_connlabels_replace(ct, (u32 *)labels, (u32 *)mask, OVS_CT_LABELS_LEN / sizeof(u32)); if (err) return err; ovs_ct_get_labels(ct, &key->ct.labels); return 0; }


joe stringerjoe stringer15398.08%480.00%
florian westphalflorian westphal31.92%120.00%

/* 'skb' should already be pulled to nh_ofs. */
static int ovs_ct_helper(struct sk_buff *skb, u16 proto) { const struct nf_conntrack_helper *helper; const struct nf_conn_help *help; enum ip_conntrack_info ctinfo; unsigned int protoff; struct nf_conn *ct; int err; ct = nf_ct_get(skb, &ctinfo); if (!ct || ctinfo == IP_CT_RELATED_REPLY) return NF_ACCEPT; help = nfct_help(ct); if (!help) return NF_ACCEPT; helper = rcu_dereference(help->helper); if (!helper) return NF_ACCEPT; switch (proto) { case NFPROTO_IPV4: protoff = ip_hdrlen(skb); break; case NFPROTO_IPV6: { u8 nexthdr = ipv6_hdr(skb)->nexthdr; __be16 frag_off; int ofs; ofs = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, &frag_off); if (ofs < 0 || (frag_off & htons(~0x7)) != 0) { pr_debug("proto header not found\n"); return NF_ACCEPT; } protoff = ofs; break; } default: WARN_ONCE(1, "helper invoked on non-IP family!"); return NF_DROP; } err = helper->help(skb, protoff, ct, ctinfo); if (err != NF_ACCEPT) return err; /* Adjust seqs after helper. This is needed due to some helpers (e.g., * FTP with NAT) adusting the TCP payload size when mangling IP * addresses and/or port numbers in the text-based control connection. */ if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && !nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) return NF_DROP; return NF_ACCEPT; }


joe stringerjoe stringer20782.14%266.67%
jarno rajahalmejarno rajahalme4517.86%133.33%

/* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero * value if 'skb' is freed. */
static int handle_fragments(struct net *net, struct sw_flow_key *key, u16 zone, struct sk_buff *skb) { struct ovs_skb_cb ovs_cb = *OVS_CB(skb); int err; if (key->eth.type == htons(ETH_P_IP)) { enum ip_defrag_users user = IP_DEFRAG_CONNTRACK_IN + zone; memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); err = ip_defrag(net, skb, user); if (err) return err; ovs_cb.mru = IPCB(skb)->frag_max_size; #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) } else if (key->eth.type == htons(ETH_P_IPV6)) { enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone; skb_orphan(skb); memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm)); err = nf_ct_frag6_gather(net, skb, user); if (err) return err; key->ip.proto = ipv6_hdr(skb)->nexthdr; ovs_cb.mru = IP6CB(skb)->frag_max_size; #endif } else { kfree_skb(skb); return -EPFNOSUPPORT; } key->ip.frag = OVS_FRAG_TYPE_NONE; skb_clear_hash(skb); skb->ignore_df = 1; *OVS_CB(skb) = ovs_cb; return 0; }


joe stringerjoe stringer23295.87%457.14%
florian westphalflorian westphal62.48%114.29%
eric w. biedermaneric w. biederman41.65%228.57%

static struct nf_conntrack_expect * ovs_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone, u16 proto, const struct sk_buff *skb) { struct nf_conntrack_tuple tuple; if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), proto, net, &tuple)) return NULL; return __nf_ct_expect_find(net, zone, &tuple); }


joe stringerjoe stringer6496.97%150.00%
eric w. biedermaneric w. biederman23.03%150.00%

/* This replicates logic from nf_conntrack_core.c that is not exported. */
static enum ip_conntrack_info ovs_ct_get_info(const struct nf_conntrack_tuple_hash *h) { const struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) return IP_CT_ESTABLISHED_REPLY; /* Once we've had two way comms, always ESTABLISHED. */ if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) return IP_CT_ESTABLISHED; if (test_bit(IPS_EXPECTED_BIT, &ct->status)) return IP_CT_RELATED; return IP_CT_NEW; }


jarno rajahalmejarno rajahalme70100.00%1100.00%

/* Find an existing connection which this packet belongs to without * re-attributing statistics or modifying the connection state. This allows an * skb->nfct lost due to an upcall to be recovered during actions execution. * * Must be called with rcu_read_lock. * * On success, populates skb->nfct and skb->nfctinfo, and returns the * connection. Returns NULL if there is no existing entry. */
static struct nf_conn * ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone, u8 l3num, struct sk_buff *skb) { struct nf_conntrack_l3proto *l3proto; struct nf_conntrack_l4proto *l4proto; struct nf_conntrack_tuple tuple; struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; unsigned int dataoff; u8 protonum; l3proto = __nf_ct_l3proto_find(l3num); if (l3proto->get_l4proto(skb, skb_network_offset(skb), &dataoff, &protonum) <= 0) { pr_debug("ovs_ct_find_existing: Can't get protonum\n"); return NULL; } l4proto = __nf_ct_l4proto_find(l3num, protonum); if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num, protonum, net, &tuple, l3proto, l4proto)) { pr_debug("ovs_ct_find_existing: Can't get tuple\n"); return NULL; } /* look for tuple match */ h = nf_conntrack_find_get(net, zone, &tuple); if (!h) return NULL; /* Not found. */ ct = nf_ct_tuplehash_to_ctrack(h); skb->nfct = &ct->ct_general; skb->nfctinfo = ovs_ct_get_info(h); return ct; }


jarno rajahalmejarno rajahalme194100.00%2100.00%

/* Determine whether skb->nfct is equal to the result of conntrack lookup. */
static bool skb_nfct_cached(struct net *net, const struct sw_flow_key *key, const struct ovs_conntrack_info *info, struct sk_buff *skb) { enum ip_conntrack_info ctinfo; struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); /* If no ct, check if we have evidence that an existing conntrack entry * might be found for this skb. This happens when we lose a skb->nfct * due to an upcall. If the connection was not confirmed, it is not * cached and needs to be run through conntrack again. */ if (!ct && key->ct.state & OVS_CS_F_TRACKED && !(key->ct.state & OVS_CS_F_INVALID) && key-> == info-> ct = ovs_ct_find_existing(net, &info->zone, info->family, skb); if (!ct) return false; if (!net_eq(net, read_pnet(&ct->ct_net))) return false; if (!nf_ct_zone_equal_any(info->ct, nf_ct_zone(ct))) return false; if (info->helper) { struct nf_conn_help *help; help = nf_ct_ext_find(ct, NF_CT_EXT_HELPER); if (help && rcu_access_pointer(help->helper) != info->helper) return false; } return true; }


joe stringerjoe stringer12867.37%266.67%
jarno rajahalmejarno rajahalme6232.63%133.33%

#ifdef CONFIG_NF_NAT_NEEDED /* Modelled after nf_nat_ipv[46]_fn(). * range is only used for new, uninitialized NAT state. * Returns either NF_ACCEPT or NF_DROP. */
static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, const struct nf_nat_range *range, enum nf_nat_manip_type maniptype) { int hooknum, nh_off, err = NF_ACCEPT; nh_off = skb_network_offset(skb); skb_pull(skb, nh_off); /* See HOOK2MANIP(). */ if (maniptype == NF_NAT_MANIP_SRC) hooknum = NF_INET_LOCAL_IN; /* Source NAT */ else hooknum = NF_INET_LOCAL_OUT; /* Destination NAT */ switch (ctinfo) { case IP_CT_RELATED: case IP_CT_RELATED_REPLY: if (IS_ENABLED(CONFIG_NF_NAT_IPV4) && skb->protocol == htons(ETH_P_IP) && ip_hdr(skb)->protocol == IPPROTO_ICMP) { if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, hooknum)) err = NF_DROP; goto push; } else if (IS_ENABLED(CONFIG_NF_NAT_IPV6) && skb->protocol == htons(ETH_P_IPV6)) { __be16 frag_off; u8 nexthdr = ipv6_hdr(skb)->nexthdr; int hdrlen = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, &frag_off); if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) { if (!nf_nat_icmpv6_reply_translation(skb, ct, ctinfo, hooknum, hdrlen)) err = NF_DROP; goto push; } } /* Non-ICMP, fall thru to initialize if needed. */ case IP_CT_NEW: /* Seen it before? This can happen for loopback, retrans, * or local packets. */ if (!nf_nat_initialized(ct, maniptype)) { /* Initialize according to the NAT action. */ err = (range && range->flags & NF_NAT_RANGE_MAP_IPS) /* Action is set up to establish a new * mapping. */ ? nf_nat_setup_info(ct, range, maniptype) : nf_nat_alloc_null_binding(ct, hooknum); if (err != NF_ACCEPT) goto push; } break; case IP_CT_ESTABLISHED: case IP_CT_ESTABLISHED_REPLY: break; default: err = NF_DROP; goto push; } err = nf_nat_packet(ct, ctinfo, hooknum, skb); push: skb_push(skb, nh_off); return err; }


jarno rajahalmejarno rajahalme25480.63%562.50%
joe stringerjoe stringer5116.19%225.00%
arnd bergmannarnd bergmann103.17%112.50%

static void ovs_nat_update_key(struct sw_flow_key *key, const struct sk_buff *skb, enum nf_nat_manip_type maniptype) { if (maniptype == NF_NAT_MANIP_SRC) { __be16 src; key->ct.state |= OVS_CS_F_SRC_NAT; if (key->eth.type == htons(ETH_P_IP)) key->ipv4.addr.src = ip_hdr(skb)->saddr; else if (key->eth.type == htons(ETH_P_IPV6)) memcpy(&key->ipv6.addr.src, &ipv6_hdr(skb)->saddr, sizeof(key->ipv6.addr.src)); else return; if (key->ip.proto == IPPROTO_UDP) src = udp_hdr(skb)->source; else if (key->ip.proto == IPPROTO_TCP) src = tcp_hdr(skb)->source; else if (key->ip.proto == IPPROTO_SCTP) src = sctp_hdr(skb)->source; else return; key->tp.src = src; } else { __be16 dst; key->ct.state |= OVS_CS_F_DST_NAT; if (key->eth.type == htons(ETH_P_IP)) key->ipv4.addr.dst = ip_hdr(skb)->daddr; else if (key->eth.type == htons(ETH_P_IPV6)) memcpy(&key->ipv6.addr.dst, &ipv6_hdr(skb)->daddr, sizeof(key->ipv6.addr.dst)); else return; if (key->ip.proto == IPPROTO_UDP) dst = udp_hdr(skb)->dest; else if (key->ip.proto == IPPROTO_TCP) dst = tcp_hdr(skb)->dest; else if (key->ip.proto == IPPROTO_SCTP) dst = sctp_hdr(skb)->dest; else return; key->tp.dst = dst; } }


jarno rajahalmejarno rajahalme29586.26%133.33%
joe stringerjoe stringer4713.74%266.67%

/* Returns NF_DROP if the packet should be dropped, NF_ACCEPT otherwise. */
static int ovs_ct_nat(struct net *net, struct sw_flow_key *key, const struct ovs_conntrack_info *info, struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo) { enum nf_nat_manip_type maniptype; int err; if (nf_ct_is_untracked(ct)) { /* A NAT action may only be performed on tracked packets. */ return NF_ACCEPT; } /* Add NAT extension if not confirmed yet. */ if (!nf_ct_is_confirmed(ct) && !nf_ct_nat_ext_add(ct)) return NF_ACCEPT; /* Can't NAT. */ /* Determine NAT type. * Check if the NAT type can be deduced from the tracked connection. * Make sure new expected connections (IP_CT_RELATED) are NATted only * when committing. */ if (info->nat & OVS_CT_NAT && ctinfo != IP_CT_NEW && ct->status & IPS_NAT_MASK && (ctinfo != IP_CT_RELATED || info->commit)) { /* NAT an established or related connection like before. */ if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) /* This is the REPLY direction for a connection * for which NAT was applied in the forward * direction. Do the reverse NAT. */ maniptype = ct->status & IPS_SRC_NAT ? NF_NAT_MANIP_DST : NF_NAT_MANIP_SRC; else maniptype = ct->status & IPS_SRC_NAT ? NF_NAT_MANIP_SRC : NF_NAT_MANIP_DST; } else if (info->nat & OVS_CT_SRC_NAT) { maniptype = NF_NAT_MANIP_SRC; } else if (info->nat & OVS_CT_DST_NAT) { maniptype = NF_NAT_MANIP_DST; } else { return NF_ACCEPT; /* Connection is not NATed. */ } err = ovs_ct_nat_execute(skb,