#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/init.h>
#include <linux/module.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/jhash.h>
#include <linux/delay.h>
#include <linux/time.h>
#include <linux/etherdevice.h>
#include <linux/genetlink.h>
#include <linux/kernel.h>
#include <linux/kthread.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/ethtool.h>
#include <linux/wait.h>
#include <asm/div64.h>
#include <linux/highmem.h>
#include <linux/netfilter_bridge.h>
#include <linux/netfilter_ipv4.h>
#include <linux/inetdevice.h>
#include <linux/list.h>
#include <linux/openvswitch.h>
#include <linux/rculist.h>
#include <linux/dmi.h>
#include <net/genetlink.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>

#include "datapath.h"
#include "flow.h"
#include "flow_table.h"
#include "flow_netlink.h"
#include "vport-internal_dev.h"
#include "vport-netdev.h"

/*
 * NOTE(review): presumably the per-network-namespace generic id used by
 * OVS (net/netns/generic.h is included above); its registration is not
 * visible in this part of the file -- confirm.
 */
int ovs_net_id __read_mostly;


/*
 * Tentative definitions of the generic netlink families, so that code
 * placed before their initialisers can take their address
 * (queue_userspace_packet() passes &dp_packet_genl_family to genlmsg_put()).
 */
static struct genl_family dp_packet_genl_family;

static struct genl_family dp_flow_genl_family;

static struct genl_family dp_datapath_genl_family;

/* Forward declaration; presumably completed by a later definition. */
static const struct nla_policy flow_policy[];

static const struct genl_multicast_group ovs_dp_flow_multicast_group = {

static const struct genl_multicast_group ovs_dp_datapath_multicast_group = {

static const struct genl_multicast_group ovs_dp_vport_multicast_group = {

/*
 * Decide whether a reply/notification message needs to be built.
 *
 * A message is required when the requester set NLM_F_ECHO on its request
 * (OVS userspace does this when it wants the reply), or when
 * genl_has_listeners() reports listeners for @group of @family in the
 * request's network namespace.
 */
static bool ovs_must_notify(struct genl_family *family, struct genl_info *info,
			    unsigned int group)
{
	if (info->nlhdr->nlmsg_flags & NLM_F_ECHO)
		return true;

	return genl_has_listeners(family, genl_info_net(info), group);
}


/*
 * Pass @skb to genl_notify() for @family on behalf of the request described
 * by @info, using group 0 and a GFP_KERNEL allocation context.
 */
static void ovs_notify(struct genl_family *family, struct sk_buff *skb,
		       struct genl_info *info)
{
	genl_notify(family, skb, info, 0, GFP_KERNEL);
}


/**
 * DOC: Locking:
 *
 * All writes, e.g. writes to device state (add/remove datapath, port, set
 * operations on vports, etc.) and writes to other state (flow table
 * modifications, set miscellaneous datapath parameters, etc.), are protected
 * by ovs_lock.
 *
 * Reads are protected by RCU.
 *
 * There are a few special cases (mostly stats) that have their own
 * synchronization but they nest under all of the above and don't interact
 * with each other.
 *
 * The RTNL lock nests inside ovs_mutex.
 */
static DEFINE_MUTEX(ovs_mutex);
/* Acquire ovs_mutex; pairs with ovs_unlock(). */
void ovs_lock(void)
{
	mutex_lock(&ovs_mutex);
}


/* Release ovs_mutex previously taken with ovs_lock(). */
void ovs_unlock(void)
{
	mutex_unlock(&ovs_mutex);
}


/*
 * Report whether ovs_mutex is held, for use in locking assertions.
 *
 * When lock debugging has been switched off at runtime (debug_locks == 0)
 * the answer cannot be trusted, so 1 is returned to keep assertions quiet.
 *
 * NOTE(review): the function is only built under CONFIG_LOCKDEP; callers in
 * this file use it unconditionally, so datapath.h presumably provides a
 * stub for the !CONFIG_LOCKDEP case -- confirm.
 */
#ifdef CONFIG_LOCKDEP
int lockdep_ovsl_is_held(void)
{
	if (debug_locks)
		return lockdep_is_held(&ovs_mutex);
	else
		return 1;
}
EXPORT_SYMBOL_GPL(lockdep_ovsl_is_held);
#endif

static struct vport *new_vport(const struct vport_parms *);
static int queue_gso_packets(struct datapath *dp, struct sk_buff *,
			     const struct sw_flow_key *,
			     const struct dp_upcall_info *,
			     uint32_t cutlen);
static int queue_userspace_packet(struct datapath *dp, struct sk_buff *,
				  const struct sw_flow_key *,
				  const struct dp_upcall_info *,
				  uint32_t cutlen);

/* Must be called with rcu_read_lock. */
/*
 * Find the datapath behind interface index @dp_ifindex in namespace @net.
 *
 * The net_device is looked up with dev_get_by_index_rcu() and mapped to a
 * vport through ovs_internal_dev_get_vport().  Returns that vport's
 * datapath, or NULL when either lookup yields nothing.
 */
static struct datapath *get_dp_rcu(struct net *net, int dp_ifindex)
{
	struct net_device *netdev;
	struct vport *local;

	netdev = dev_get_by_index_rcu(net, dp_ifindex);
	if (!netdev)
		return NULL;

	local = ovs_internal_dev_get_vport(netdev);
	if (!local)
		return NULL;

	return local->dp;
}


/*
 * Look up a datapath by interface index.
 *
 * The lookup itself runs inside a local RCU read-side section, but the
 * returned pointer is only kept valid by the caller's own protection:
 * the caller must hold either ovs_mutex or rcu_read_lock.  A one-time
 * warning is emitted when neither is detected.
 */
static inline struct datapath *get_dp(struct net *net, int dp_ifindex)
{
	struct datapath *found;

	WARN_ON_ONCE(!(rcu_read_lock_held() || lockdep_ovsl_is_held()));

	rcu_read_lock();
	found = get_dp_rcu(net, dp_ifindex);
	rcu_read_unlock();

	return found;
}


/*
 * Return the name of @dp, taken from its OVSP_LOCAL vport.
 *
 * Must be called with rcu_read_lock or ovs_mutex.
 */
const char *ovs_dp_name(const struct datapath *dp)
{
	return ovs_vport_name(ovs_vport_ovsl_rcu(dp, OVSP_LOCAL));
}


/*
 * Return the interface index of the device behind @dp's OVSP_LOCAL vport,
 * or 0 when the datapath has no such vport.  The vport is only dereferenced
 * inside an RCU read-side section taken here.
 */
static int get_dpifindex(const struct datapath *dp)
{
	struct vport *local_port;
	int idx;

	rcu_read_lock();
	local_port = ovs_vport_rcu(dp, OVSP_LOCAL);
	idx = local_port ? local_port->dev->ifindex : 0;
	rcu_read_unlock();

	return idx;
}


/*
 * Release a datapath given its embedded rcu_head: destroy the flow table,
 * then free the per-CPU stats, the port hash array and the datapath itself.
 *
 * NOTE(review): presumably installed as a call_rcu() callback; the call
 * site is not visible here -- confirm.
 */
static void destroy_dp_rcu(struct rcu_head *rcu)
{
	struct datapath *victim = container_of(rcu, struct datapath, rcu);

	ovs_flow_tbl_destroy(&victim->table);
	free_percpu(victim->stats_percpu);
	kfree(victim->ports);
	kfree(victim);
}


/*
 * Return the hash bucket of @dp->ports that @port_no maps to.  The bucket
 * index is @port_no masked with (DP_VPORT_HASH_BUCKETS - 1).
 */
static struct hlist_head *vport_hash_bucket(const struct datapath *dp,
					    u16 port_no)
{
	unsigned int slot = port_no & (DP_VPORT_HASH_BUCKETS - 1);

	return &dp->ports[slot];
}


/*
 * Find the vport numbered @port_no on @dp by walking its hash bucket.
 * Returns NULL when no vport in the bucket carries that number.
 *
 * Called with ovs_mutex or RCU read lock.
 */
struct vport *ovs_lookup_vport(const struct datapath *dp, u16 port_no)
{
	struct hlist_head *bucket = vport_hash_bucket(dp, port_no);
	struct vport *candidate;

	hlist_for_each_entry_rcu(candidate, bucket, dp_hash_node)
		if (candidate->port_no == port_no)
			return candidate;

	return NULL;
}


/*
 * Create a vport from @parms via ovs_vport_add() and, on success, link it
 * into the port hash of parms->dp.  An ERR_PTR from ovs_vport_add() is
 * passed straight back to the caller.
 *
 * Called with ovs_mutex.
 */
static struct vport *new_vport(const struct vport_parms *parms)
{
	struct vport *created = ovs_vport_add(parms);

	if (IS_ERR(created))
		return created;

	hlist_add_head_rcu(&created->dp_hash_node,
			   vport_hash_bucket(parms->dp, created->port_no));
	return created;
}


/*
 * Remove vport @p from its datapath: unlink it from the port hash first so
 * it can no longer be found by lookups, then hand it to ovs_vport_del().
 * ASSERT_OVSL() checks the caller's locking on entry.
 */
void ovs_dp_detach_port(struct vport *p)
{
	ASSERT_OVSL();

	/* Unpublish the port before tearing it down. */
	hlist_del_rcu(&p->dp_hash_node);

	ovs_vport_del(p);
}


/* Must be called with rcu_read_lock. */
void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) { const struct vport *p = OVS_CB(skb)->input_vport; struct datapath *dp = p->dp; struct sw_flow *flow; struct sw_flow_actions *sf_acts; struct dp_stats_percpu *stats; u64 *stats_counter; u32 n_mask_hit; stats = this_cpu_ptr(dp->stats_percpu); /* Look up flow. */ flow = ovs_flow_tbl_lookup_stats(&dp->table, key, &n_mask_hit); if (unlikely(!flow)) { struct dp_upcall_info upcall; int error; memset(&upcall, 0, sizeof(upcall)); upcall.cmd = OVS_PACKET_CMD_MISS; upcall.portid = ovs_vport_find_upcall_portid(p, skb); upcall.mru = OVS_CB(skb)->mru; error = ovs_dp_upcall(dp, skb, key, &upcall, 0); if (unlikely(error)) kfree_skb(skb); else consume_skb(skb); stats_counter = &stats->n_missed; goto out; } ovs_flow_stats_update(flow, key->tp.flags, skb); sf_acts = rcu_dereference(flow->sf_acts); ovs_execute_actions(dp, skb, sf_acts, key); stats_counter = &stats->n_hit; out: /* Update datapath statistics. */ u64_stats_update_begin(&stats->syncp); (*stats_counter)++; stats->n_mask_hit += n_mask_hit; u64_stats_update_end(&stats->syncp); }


int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb, const struct sw_flow_key *key, const struct dp_upcall_info *upcall_info, uint32_t cutlen) { struct dp_stats_percpu *stats; int err; if (upcall_info->portid == 0) { err = -ENOTCONN; goto err; } if (!skb_is_gso(skb)) err = queue_userspace_packet(dp, skb, key, upcall_info, cutlen); else err = queue_gso_packets(dp, skb, key, upcall_info, cutlen); if (err) goto err; return 0; err: stats = this_cpu_ptr(dp->stats_percpu); u64_stats_update_begin(&stats->syncp); stats->n_lost++; u64_stats_update_end(&stats->syncp); return err; }


static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, const struct sw_flow_key *key, const struct dp_upcall_info *upcall_info, uint32_t cutlen) { unsigned short gso_type = skb_shinfo(skb)->gso_type; struct sw_flow_key later_key; struct sk_buff *segs, *nskb; int err; BUILD_BUG_ON(sizeof(*OVS_CB(skb)) > SKB_SGO_CB_OFFSET); segs = __skb_gso_segment(skb, NETIF_F_SG, false); if (IS_ERR(segs)) return PTR_ERR(segs); if (segs == NULL) return -EINVAL; if (gso_type & SKB_GSO_UDP) { /* The initial flow key extracted by ovs_flow_key_extract() * in this case is for a first fragment, so we need to * properly mark later fragments. */ later_key = *key; later_key.ip.frag = OVS_FRAG_TYPE_LATER; } /* Queue all of the segments. */ skb = segs; do { if (gso_type & SKB_GSO_UDP && skb != segs) key = &later_key; err = queue_userspace_packet(dp, skb, key, upcall_info, cutlen); if (err) break; } while ((skb = skb->next)); /* Free all of the segments. */ skb = segs; do { nskb = skb->next; if (err) kfree_skb(skb); else consume_skb(skb); } while ((skb = nskb)); return err; }


/*
 * Compute the Netlink message size needed for an upcall carrying @hdrlen
 * bytes of packet data plus the optional attributes that @upcall_info
 * requests (userdata, egress tunnel key, actions, MRU).
 */
static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info,
			      unsigned int hdrlen)
{
	size_t total;

	/* Attributes that are always accounted for. */
	total = NLMSG_ALIGN(sizeof(struct ovs_header));
	total += nla_total_size(hdrlen);		/* OVS_PACKET_ATTR_PACKET */
	total += nla_total_size(ovs_key_attr_size());	/* OVS_PACKET_ATTR_KEY */
	total += nla_total_size(sizeof(unsigned int));	/* OVS_PACKET_ATTR_LEN */

	/* OVS_PACKET_ATTR_USERDATA */
	if (upcall_info->userdata)
		total += NLA_ALIGN(upcall_info->userdata->nla_len);

	/* OVS_PACKET_ATTR_EGRESS_TUN_KEY */
	if (upcall_info->egress_tun_info)
		total += nla_total_size(ovs_tun_key_attr_size());

	/* OVS_PACKET_ATTR_ACTIONS */
	if (upcall_info->actions_len)
		total += nla_total_size(upcall_info->actions_len);

	/* OVS_PACKET_ATTR_MRU */
	if (upcall_info->mru)
		total += nla_total_size(sizeof(upcall_info->mru));

	return total;
}


/*
 * Zero-pad @skb up to the next NLA_ALIGN boundary, unless the datapath has
 * the OVS_DP_F_UNALIGNED user feature set, in which case nothing is done.
 */
static void pad_packet(struct datapath *dp, struct sk_buff *skb)
{
	size_t pad;

	if (dp->user_features & OVS_DP_F_UNALIGNED)
		return;

	pad = NLA_ALIGN(skb->len) - skb->len;
	if (pad)
		memset(skb_put(skb, pad), 0, pad);
}


static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, const struct sw_flow_key *key, const struct dp_upcall_info *upcall_info, uint32_t cutlen) { struct ovs_header *upcall; struct sk_buff *nskb = NULL; struct sk_buff *user_skb = NULL; /* to be queued to userspace */ struct nlattr *nla; size_t len; unsigned int hlen; int err, dp_ifindex; dp_ifindex = get_dpifindex(dp); if (!dp_ifindex) return -ENODEV; if (skb_vlan_tag_present(skb)) { nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) return -ENOMEM; nskb = __vlan_hwaccel_push_inside(nskb); if (!nskb) return -ENOMEM; skb = nskb; } if (nla_attr_size(skb->len) > USHRT_MAX) { err = -EFBIG; goto out; } /* Complete checksum if needed */ if (skb->ip_summed == CHECKSUM_PARTIAL && (err = skb_checksum_help(skb))) goto out; /* Older versions of OVS user space enforce alignment of the last * Netlink attribute to NLA_ALIGNTO which would require extensive * padding logic. Only perform zerocopy if padding is not required. */ if (dp->user_features & OVS_DP_F_UNALIGNED) hlen = skb_zerocopy_headlen(skb); else hlen = skb->len; len = upcall_msg_size(upcall_info, hlen - cutlen); user_skb = genlmsg_new(len, GFP_ATOMIC); if (!user_skb) { err = -ENOMEM; goto out; } upcall = genlmsg_put(user_skb, 0, 0, &dp_packet_genl_family, 0, upcall_info->cmd); upcall->dp_ifindex = dp_ifindex; err = ovs_nla_put_key(key, key, OVS_PACKET_ATTR_KEY, false, user_skb); BUG_ON(err); if (upcall_info->userdata) __nla_put(user_skb, OVS_PACKET_ATTR_USERDATA, nla_len(upcall_info->userdata), nla_data(upcall_info->userdata)); if (upcall_info->egress_tun_info) { nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_EGRESS_TUN_KEY); err = ovs_nla_put_tunnel_info(user_skb, upcall_info->egress_tun_info); BUG_ON(err); nla_nest_end(user_skb, nla); } if (upcall_info->actions_len) { nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_ACTIONS); err = ovs_nla_put_actions(upcall_info->actions, upcall_info->actions_len, user_skb); if (!err) nla_nest_end(user_skb, nla); else 
nla_nest_cancel(user_skb, nla); } /* Add OVS_PACKET_ATTR_MRU */ if (upcall_info->mru) { if (nla_put_u16(user_skb, OVS_PACKET_ATTR_MRU, upcall_info->mru)) { err = -ENOBUFS; goto out; } pad_packet(dp, user_skb); } /* Add OVS_PACKET_ATTR_LEN when packet is truncated */ if (cutlen > 0) { if (nla_put_u32(user_skb, OVS_PACKET_ATTR_LEN, skb->len)) { err = -ENOBUFS; goto out; } pad_packet(dp, user_skb); } /* Only reserve room for attribute header, packet data is added * in skb_zerocopy() */ if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0))) { err = -ENOBUFS; goto out; } nla->nla_len = nla_attr_size(skb->len - cutlen); err = skb_zerocopy(user_skb, skb, skb->len - cutlen, hlen); if (err) goto out; /* Pad OVS_PACKET_ATTR_PACKET if linear copy was performed */ pad_packet(dp, user_skb); ((struct nlmsghdr *) user_skb->data)->nlmsg_len = user_skb->len; err = genlmsg_unicast(ovs_dp_get_net(dp), user_skb, upcall_info->portid); user_skb = NULL; out: if (err) skb_tx_error(skb); kfree_skb(user_skb); kfree_skb(nskb); return err; }


static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) { struct ovs_header *ovs_header = info->userhdr; struct net *net = sock_net(skb->sk); struct nlattr **a = info->attrs; struct sw_flow_actions *acts; struct sk_buff *packet; struct sw_flow *flow; struct sw_flow_actions *sf_acts; struct datapath *dp; struct ethhdr *eth; struct vport *input_vport; u16 mru = 0; int len; int err; bool log = !a[OVS_PACKET_ATTR_PROBE]; err = -EINVAL; if (!a[OVS_PACKET_ATTR_PACKET] || !a[OVS_PACKET_ATTR_KEY] || !a[OVS_PACKET_ATTR_ACTIONS]) goto err; len = nla_len(a[OVS_PACKET_ATTR_PACKET]); packet = __dev_alloc_skb(NET_IP_ALIGN + len, GFP_KERNEL); err = -ENOMEM; if (!packet) goto err; skb_reserve(packet, NET_IP_ALIGN); nla_memcpy(__skb_put(packet, len), a[OVS_PACKET_ATTR_PACKET], len); skb_reset_mac_header(packet); eth = eth_hdr(packet); /* Normally, setting the skb 'protocol' field would be handled by a * call to eth_type_trans(), but it assumes there's a sending * device, which we may not have. */ if (eth_proto_is_802_3(eth->h_proto)) packet->protocol = eth->h_proto; else packet->protocol = htons(ETH_P_802_2); /* Set packet's mru */ if (a[OVS_PACKET_ATTR_MRU]) { mru = nla_get_u16(a[OVS_PACKET_ATTR_MRU]); packet->ignore_df = 1; } OVS_CB(packet)->mru = mru; /* Build an sw_flow for sending this packet. 
*/ flow = ovs_flow_alloc(); err = PTR_ERR(flow); if (IS_ERR(flow)) goto err_kfree_skb; err = ovs_flow_key_extract_userspace(net, a[OVS_PACKET_ATTR_KEY], packet, &flow->key, log); if (err) goto err_flow_free; err = ovs_nla_copy_actions(net, a[OVS_PACKET_ATTR_ACTIONS], &flow->key, &acts, log); if (err) goto err_flow_free; rcu_assign_pointer(flow->sf_acts, acts); packet->priority = flow->key.phy.priority; packet->mark = flow->key.phy.skb_mark; rcu_read_lock(); dp = get_dp_rcu(net, ovs_header->dp_ifindex); err = -ENODEV; if (!dp) goto err_unlock; input_vport = ovs_vport_rcu(dp, flow->key.phy.in_port); if (!input_vport) input_vport = ovs_vport_rcu(dp, OVSP_LOCAL); if (!input_vport) goto err_unlock; packet->dev = input_vport->dev; OVS_CB(packet)->input_vport = input_vport; sf_acts = rcu_dereference(flow->sf_acts); local_bh_disable(); err = ovs_execute_actions(dp, packet, sf_acts, &flow->key); local_bh_enable(); rcu_read_unlock(); ovs_flow_free(flow, false); return err; err_unlock: rcu_read_unlock(); err_flow_free: ovs_flow_free(flow, false); err_kfree_skb: kfree_skb(packet); err: return err; }


static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = { [OVS_PACKET_ATTR_PACKET] = { .len = ETH_HLEN }, [OVS_PACKET_ATTR_KEY] = { .type = NLA_NESTED }, [OVS_PACKET_ATTR_ACTIONS] = { .type = NLA_NESTED }, [OVS_PACKET_ATTR_PROBE] = { .type = NLA_FLAG }, [OVS_PACKET_ATTR_MRU] = { .type = NLA_U16 }, }; static const struct genl_ops dp_packet_genl_ops[] = { { .cmd = OVS_PACKET_CMD_EXECUTE, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .policy = packet_policy, .doit = ovs_packet_cmd_execute } }; static struct genl_family dp_packet_genl_family = { .id = GENL_ID_GENERATE, .hdrsize = sizeof(struct