cregit-Linux how code gets into the kernel

Release 4.18 net/netfilter/ipvs/ip_vs_mh.c

// SPDX-License-Identifier: GPL-2.0
/* IPVS:        Maglev Hashing scheduling module
 *
 * Authors:     Inju Song <inju.song@navercorp.com>
 *
 */

/* The mh algorithm is to assign a preference list of all the lookup
 * table positions to each destination and populate the table with
 * the most-preferred position of destinations. Then it is to select
 * destination with the hash key of source IP address through looking
 * up a the lookup table.
 *
 * The algorithm is detailed in:
 * [3.4 Consistent Hasing]
https://www.usenix.org/system/files/conference/nsdi16/nsdi16-paper-eisenbud.pdf
 *
 */


#define KMSG_COMPONENT "IPVS"

#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/ip.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>

#include <net/ip_vs.h>

#include <linux/siphash.h>
#include <linux/bitops.h>
#include <linux/gcd.h>


#define IP_VS_SVC_F_SCHED_MH_FALLBACK	IP_VS_SVC_F_SCHED1 
/* MH fallback */

#define IP_VS_SVC_F_SCHED_MH_PORT	IP_VS_SVC_F_SCHED2 
/* MH use port */


struct ip_vs_mh_lookup {
	
struct ip_vs_dest __rcu	*dest;	/* real server (cache) */
};


struct ip_vs_mh_dest_setup {
	
unsigned int	offset; /* starting offset */
	
unsigned int	skip;	/* skip */
	
unsigned int	perm;	/* next_offset */
	
int		turns;	/* weight / gcd() and rshift */
};

/* Available prime numbers for MH table */

static int primes[] = {251, 509, 1021, 2039, 4093,
		       8191, 16381, 32749, 65521, 131071};

/* For IPVS MH entry hash table */
#ifndef CONFIG_IP_VS_MH_TAB_INDEX

#define CONFIG_IP_VS_MH_TAB_INDEX	12
#endif

#define IP_VS_MH_TAB_BITS		(CONFIG_IP_VS_MH_TAB_INDEX / 2)

#define IP_VS_MH_TAB_INDEX		(CONFIG_IP_VS_MH_TAB_INDEX - 8)

#define IP_VS_MH_TAB_SIZE               primes[IP_VS_MH_TAB_INDEX]


struct ip_vs_mh_state {
	
struct rcu_head			rcu_head;
	
struct ip_vs_mh_lookup		*lookup;
	
struct ip_vs_mh_dest_setup	*dest_setup;
	

hsiphash_key_t			hash1, hash2;
	
int				gcd;
	
int				rshift;
};


static inline void generate_hash_secret(hsiphash_key_t *hash1, hsiphash_key_t *hash2) { hash1->key[0] = 2654435761UL; hash1->key[1] = 2654435761UL; hash2->key[0] = 2654446892UL; hash2->key[1] = 2654446892UL; }

Contributors

PersonTokensPropCommitsCommitProp
Inju Song51100.00%1100.00%
Total51100.00%1100.00%

/* Helper function to determine if server is unavailable */
static inline bool is_unavailable(struct ip_vs_dest *dest) { return atomic_read(&dest->weight) <= 0 || dest->flags & IP_VS_DEST_F_OVERLOAD; }

Contributors

PersonTokensPropCommitsCommitProp
Inju Song29100.00%1100.00%
Total29100.00%1100.00%

/* Returns hash value for IPVS MH entry */
static inline unsigned int ip_vs_mh_hashkey(int af, const union nf_inet_addr *addr, __be16 port, hsiphash_key_t *key, unsigned int offset) { unsigned int v; __be32 addr_fold = addr->ip; #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) addr_fold = addr->ip6[0] ^ addr->ip6[1] ^ addr->ip6[2] ^ addr->ip6[3]; #endif v = (offset + ntohs(port) + ntohl(addr_fold)); return hsiphash(&v, sizeof(v), key); }

Contributors

PersonTokensPropCommitsCommitProp
Inju Song110100.00%1100.00%
Total110100.00%1100.00%

/* Reset all the hash buckets of the specified table. */
static void ip_vs_mh_reset(struct ip_vs_mh_state *s) { int i; struct ip_vs_mh_lookup *l; struct ip_vs_dest *dest; l = &s->lookup[0]; for (i = 0; i < IP_VS_MH_TAB_SIZE; i++) { dest = rcu_dereference_protected(l->dest, 1); if (dest) { ip_vs_dest_put(dest); RCU_INIT_POINTER(l->dest, NULL); } l++; } }

Contributors

PersonTokensPropCommitsCommitProp
Inju Song83100.00%1100.00%
Total83100.00%1100.00%


static int ip_vs_mh_permutate(struct ip_vs_mh_state *s, struct ip_vs_service *svc) { struct list_head *p; struct ip_vs_mh_dest_setup *ds; struct ip_vs_dest *dest; int lw; /* If gcd is smaller then 1, number of dests or * all last_weight of dests are zero. So, skip * permutation for the dests. */ if (s->gcd < 1) return 0; /* Set dest_setup for the dests permutation */ p = &svc->destinations; ds = &s->dest_setup[0]; while ((p = p->next) != &svc->destinations) { dest = list_entry(p, struct ip_vs_dest, n_list); ds->offset = ip_vs_mh_hashkey(svc->af, &dest->addr, dest->port, &s->hash1, 0) % IP_VS_MH_TAB_SIZE; ds->skip = ip_vs_mh_hashkey(svc->af, &dest->addr, dest->port, &s->hash2, 0) % (IP_VS_MH_TAB_SIZE - 1) + 1; ds->perm = ds->offset; lw = atomic_read(&dest->last_weight); ds->turns = ((lw / s->gcd) >> s->rshift) ? : (lw != 0); ds++; } return 0; }

Contributors

PersonTokensPropCommitsCommitProp
Inju Song206100.00%1100.00%
Total206100.00%1100.00%


static int ip_vs_mh_populate(struct ip_vs_mh_state *s, struct ip_vs_service *svc) { int n, c, dt_count; unsigned long *table; struct list_head *p; struct ip_vs_mh_dest_setup *ds; struct ip_vs_dest *dest, *new_dest; /* If gcd is smaller then 1, number of dests or * all last_weight of dests are zero. So, skip * the population for the dests and reset lookup table. */ if (s->gcd < 1) { ip_vs_mh_reset(s); return 0; } table = kcalloc(BITS_TO_LONGS(IP_VS_MH_TAB_SIZE), sizeof(unsigned long), GFP_KERNEL); if (!table) return -ENOMEM; p = &svc->destinations; n = 0; dt_count = 0; while (n < IP_VS_MH_TAB_SIZE) { if (p == &svc->destinations) p = p->next; ds = &s->dest_setup[0]; while (p != &svc->destinations) { /* Ignore added server with zero weight */ if (ds->turns < 1) { p = p->next; ds++; continue; } c = ds->perm; while (test_bit(c, table)) { /* Add skip, mod IP_VS_MH_TAB_SIZE */ ds->perm += ds->skip; if (ds->perm >= IP_VS_MH_TAB_SIZE) ds->perm -= IP_VS_MH_TAB_SIZE; c = ds->perm; } __set_bit(c, table); dest = rcu_dereference_protected(s->lookup[c].dest, 1); new_dest = list_entry(p, struct ip_vs_dest, n_list); if (dest != new_dest) { if (dest) ip_vs_dest_put(dest); ip_vs_dest_hold(new_dest); RCU_INIT_POINTER(s->lookup[c].dest, new_dest); } if (++n == IP_VS_MH_TAB_SIZE) goto out; if (++dt_count >= ds->turns) { dt_count = 0; p = p->next; ds++; } } } out: kfree(table); return 0; }

Contributors

PersonTokensPropCommitsCommitProp
Inju Song333100.00%1100.00%
Total333100.00%1100.00%

/* Get ip_vs_dest associated with supplied parameters. */
static inline struct ip_vs_dest * ip_vs_mh_get(struct ip_vs_service *svc, struct ip_vs_mh_state *s, const union nf_inet_addr *addr, __be16 port) { unsigned int hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1, 0) % IP_VS_MH_TAB_SIZE; struct ip_vs_dest *dest = rcu_dereference(s->lookup[hash].dest); return (!dest || is_unavailable(dest)) ? NULL : dest; }

Contributors

PersonTokensPropCommitsCommitProp
Inju Song84100.00%1100.00%
Total84100.00%1100.00%

/* As ip_vs_mh_get, but with fallback if selected server is unavailable */
static inline struct ip_vs_dest * ip_vs_mh_get_fallback(struct ip_vs_service *svc, struct ip_vs_mh_state *s, const union nf_inet_addr *addr, __be16 port) { unsigned int offset, roffset; unsigned int hash, ihash; struct ip_vs_dest *dest; /* First try the dest it's supposed to go to */ ihash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1, 0) % IP_VS_MH_TAB_SIZE; dest = rcu_dereference(s->lookup[ihash].dest); if (!dest) return NULL; if (!is_unavailable(dest)) return dest; IP_VS_DBG_BUF(6, "MH: selected unavailable server %s:%u, reselecting", IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port)); /* If the original dest is unavailable, loop around the table * starting from ihash to find a new dest */ for (offset = 0; offset < IP_VS_MH_TAB_SIZE; offset++) { roffset = (offset + ihash) % IP_VS_MH_TAB_SIZE; hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1, roffset) % IP_VS_MH_TAB_SIZE; dest = rcu_dereference(s->lookup[hash].dest); if (!dest) break; if (!is_unavailable(dest)) return dest; IP_VS_DBG_BUF(6, "MH: selected unavailable server %s:%u (offset %u), reselecting", IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), roffset); } return NULL; }

Contributors

PersonTokensPropCommitsCommitProp
Inju Song237100.00%1100.00%
Total237100.00%1100.00%

/* Assign all the hash buckets of the specified table with the service. */
static int ip_vs_mh_reassign(struct ip_vs_mh_state *s, struct ip_vs_service *svc) { int ret; if (svc->num_dests > IP_VS_MH_TAB_SIZE) return -EINVAL; if (svc->num_dests >= 1) { s->dest_setup = kcalloc(svc->num_dests, sizeof(struct ip_vs_mh_dest_setup), GFP_KERNEL); if (!s->dest_setup) return -ENOMEM; } ip_vs_mh_permutate(s, svc); ret = ip_vs_mh_populate(s, svc); if (ret < 0) goto out; IP_VS_DBG_BUF(6, "MH: reassign lookup table of %s:%u\n", IP_VS_DBG_ADDR(svc->af, &svc->addr), ntohs(svc->port)); out: if (svc->num_dests >= 1) { kfree(s->dest_setup); s->dest_setup = NULL; } return ret; }

Contributors

PersonTokensPropCommitsCommitProp
Inju Song150100.00%1100.00%
Total150100.00%1100.00%


static int ip_vs_mh_gcd_weight(struct ip_vs_service *svc) { struct ip_vs_dest *dest; int weight; int g = 0; list_for_each_entry(dest, &svc->destinations, n_list) { weight = atomic_read(&dest->last_weight); if (weight > 0) { if (g > 0) g = gcd(weight, g); else g = weight; } } return g; }

Contributors

PersonTokensPropCommitsCommitProp
Inju Song75100.00%1100.00%
Total75100.00%1100.00%

/* To avoid assigning huge weight for the MH table, * calculate shift value with gcd. */
static int ip_vs_mh_shift_weight(struct ip_vs_service *svc, int gcd) { struct ip_vs_dest *dest; int new_weight, weight = 0; int mw, shift; /* If gcd is smaller then 1, number of dests or * all last_weight of dests are zero. So, return * shift value as zero. */ if (gcd < 1) return 0; list_for_each_entry(dest, &svc->destinations, n_list) { new_weight = atomic_read(&dest->last_weight); if (new_weight > weight) weight = new_weight; } /* Because gcd is greater than zero, * the maximum weight and gcd are always greater than zero */ mw = weight / gcd; /* shift = occupied bits of weight/gcd - MH highest bits */ shift = fls(mw) - IP_VS_MH_TAB_BITS; return (shift >= 0) ? shift : 0; }

Contributors

PersonTokensPropCommitsCommitProp
Inju Song99100.00%1100.00%
Total99100.00%1100.00%


static void ip_vs_mh_state_free(struct rcu_head *head) { struct ip_vs_mh_state *s; s = container_of(head, struct ip_vs_mh_state, rcu_head); kfree(s->lookup); kfree(s); }

Contributors

PersonTokensPropCommitsCommitProp
Inju Song40100.00%1100.00%
Total40100.00%1100.00%


static int ip_vs_mh_init_svc(struct ip_vs_service *svc) { int ret; struct ip_vs_mh_state *s; /* Allocate the MH table for this service */ s = kzalloc(sizeof(*s), GFP_KERNEL); if (!s) return -ENOMEM; s->lookup = kcalloc(IP_VS_MH_TAB_SIZE, sizeof(struct ip_vs_mh_lookup), GFP_KERNEL); if (!s->lookup) { kfree(s); return -ENOMEM; } generate_hash_secret(&s->hash1, &s->hash2); s->gcd = ip_vs_mh_gcd_weight(svc); s->rshift = ip_vs_mh_shift_weight(svc, s->gcd); IP_VS_DBG(6, "MH lookup table (memory=%zdbytes) allocated for current service\n", sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE); /* Assign the lookup table with current dests */ ret = ip_vs_mh_reassign(s, svc); if (ret < 0) { ip_vs_mh_reset(s); ip_vs_mh_state_free(&s->rcu_head); return ret; } /* No more failures, attach state */ svc->sched_data = s; return 0; }

Contributors

PersonTokensPropCommitsCommitProp
Inju Song171100.00%1100.00%
Total171100.00%1100.00%


static void ip_vs_mh_done_svc(struct ip_vs_service *svc) { struct ip_vs_mh_state *s = svc->sched_data; /* Got to clean up lookup entry here */ ip_vs_mh_reset(s); call_rcu(&s->rcu_head, ip_vs_mh_state_free); IP_VS_DBG(6, "MH lookup table (memory=%zdbytes) released\n", sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE); }

Contributors

PersonTokensPropCommitsCommitProp
Inju Song51100.00%1100.00%
Total51100.00%1100.00%


static int ip_vs_mh_dest_changed(struct ip_vs_service *svc, struct ip_vs_dest *dest) { struct ip_vs_mh_state *s = svc->sched_data; s->gcd = ip_vs_mh_gcd_weight(svc); s->rshift = ip_vs_mh_shift_weight(svc, s->gcd); /* Assign the lookup table with the updated service */ return ip_vs_mh_reassign(s, svc); }

Contributors

PersonTokensPropCommitsCommitProp
Inju Song56100.00%1100.00%
Total56100.00%1100.00%

/* Helper function to get port number */
static inline __be16 ip_vs_mh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph) { __be16 _ports[2], *ports; /* At this point we know that we have a valid packet of some kind. * Because ICMP packets are only guaranteed to have the first 8 * bytes, let's just grab the ports. Fortunately they're in the * same position for all three of the protocols we care about. */ switch (iph->protocol) { case IPPROTO_TCP: case IPPROTO_UDP: case IPPROTO_SCTP: ports = skb_header_pointer(skb, iph->len, sizeof(_ports), &_ports); if (unlikely(!ports)) return 0; if (likely(!ip_vs_iph_inverse(iph))) return ports[0]; else return ports[1]; default: return 0; } }

Contributors

PersonTokensPropCommitsCommitProp
Inju Song103100.00%1100.00%
Total103100.00%1100.00%

/* Maglev Hashing scheduling */
static struct ip_vs_dest * ip_vs_mh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, struct ip_vs_iphdr *iph) { struct ip_vs_dest *dest; struct ip_vs_mh_state *s; __be16 port = 0; const union nf_inet_addr *hash_addr; hash_addr = ip_vs_iph_inverse(iph) ? &iph->daddr : &iph->saddr; IP_VS_DBG(6, "%s : Scheduling...\n", __func__); if (svc->flags & IP_VS_SVC_F_SCHED_MH_PORT) port = ip_vs_mh_get_port(skb, iph); s = (struct ip_vs_mh_state *)svc->sched_data; if (svc->flags & IP_VS_SVC_F_SCHED_MH_FALLBACK) dest = ip_vs_mh_get_fallback(svc, s, hash_addr, port); else dest = ip_vs_mh_get(svc, s, hash_addr, port); if (!dest) { ip_vs_scheduler_err(svc, "no destination available"); return NULL; } IP_VS_DBG_BUF(6, "MH: source IP address %s:%u --> server %s:%u\n", IP_VS_DBG_ADDR(svc->af, hash_addr), ntohs(port), IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port)); return dest; }

Contributors

PersonTokensPropCommitsCommitProp
Inju Song194100.00%1100.00%
Total194100.00%1100.00%

/* IPVS MH Scheduler structure */ static struct ip_vs_scheduler ip_vs_mh_scheduler = { .name = "mh", .refcnt = ATOMIC_INIT(0), .module = THIS_MODULE, .n_list = LIST_HEAD_INIT(ip_vs_mh_scheduler.n_list), .init_service = ip_vs_mh_init_svc, .done_service = ip_vs_mh_done_svc, .add_dest = ip_vs_mh_dest_changed, .del_dest = ip_vs_mh_dest_changed, .upd_dest = ip_vs_mh_dest_changed, .schedule = ip_vs_mh_schedule, };
static int __init ip_vs_mh_init(void) { return register_ip_vs_scheduler(&ip_vs_mh_scheduler); }

Contributors

PersonTokensPropCommitsCommitProp
Inju Song16100.00%1100.00%
Total16100.00%1100.00%


static void __exit ip_vs_mh_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_mh_scheduler); rcu_barrier(); }

Contributors

PersonTokensPropCommitsCommitProp
Inju Song18100.00%1100.00%
Total18100.00%1100.00%

module_init(ip_vs_mh_init); module_exit(ip_vs_mh_cleanup); MODULE_DESCRIPTION("Maglev hashing ipvs scheduler"); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Inju Song <inju.song@navercorp.com>");

Overall Contributors

PersonTokensPropCommitsCommitProp
Inju Song2373100.00%1100.00%
Total2373100.00%1100.00%
Information contained on this website is for historical information purposes only and does not indicate or represent copyright ownership.
Created with cregit.