ia64/xen-unstable

view tools/vnet/vnet-module/varp.c @ 6946:e703abaf6e3d

Add behaviour to the remove methods to remove the transaction's path itself. This allows us to write Remove(path) to remove the specified path rather than having to slice the path ourselves.
author emellor@ewan
date Sun Sep 18 14:42:13 2005 +0100 (2005-09-18)
parents 3233e7ecfa9f
children 06d84bf87159
line source
1 /*
2 * Copyright (C) 2004 Mike Wray <mike.wray@hp.com>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by the
6 * Free Software Foundation; either version 2 of the License, or (at your
7 * option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
11 * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * for more details.
13 *
14 * You should have received a copy of the GNU General Public License along
15 * with this program; if not, write to the Free software Foundation, Inc.,
16 * 59 Temple Place, suite 330, Boston, MA 02111-1307 USA
17 *
18 */
20 #include <linux/config.h>
21 #include <linux/kernel.h>
22 #include <linux/module.h>
23 #include <linux/init.h>
24 #include <linux/string.h>
25 #include <linux/version.h>
27 #include <linux/net.h>
28 #include <linux/in.h>
29 #include <linux/inet.h>
30 #include <linux/netdevice.h>
31 #include <linux/udp.h>
33 #include <net/ip.h>
34 #include <net/protocol.h>
35 #include <net/route.h>
36 #include <linux/skbuff.h>
37 #include <linux/spinlock.h>
38 #include <asm/semaphore.h>
40 #include <tunnel.h>
41 #include <vnet.h>
42 #include <vif.h>
43 #include <if_varp.h>
44 #include <varp.h>
45 #include <vnet.h>
47 #include "allocate.h"
48 #include "hash_table.h"
49 #include "sys_net.h"
50 #include "sys_string.h"
51 #include "skb_util.h"
53 #define MODULE_NAME "VARP"
54 #define DEBUG 1
55 #undef DEBUG
56 #include "debug.h"
58 /** @file VARP: Virtual ARP.
59 *
60 * Handles virtual ARP requests for vnet/vmac.
61 */
63 /*
65 Varp uses UDP on port 1798.
67 on domain up: ?
68 send varp.announce { id, vmac, vnet, coa } for each vif
69 that hasn't been announced before, or that has changed.
70 install vif entries in local table.
72 on varp.announce{ id, vmac, vnet, coa }:
73 update VARP entry for vmac x vnet if have one, reset ttl.
75 on varp.request { id, vmac, vnet }:
76 if have a vif for the requested vmac/vnet,
77 reply with varp.announce{ id, vmac, vnet, coa }
79 on timer:
80 traverse VARP table, flush old entries.
82 on probe timer:
83 probe again if not out of tries.
84 if out of tries invalidate entry.
86 */
/** How long a varp entry stays fresh after its last update (in jiffies). */
#define VARP_ENTRY_TTL (60*HZ)

/** Upper limit on the number of probes sent for one entry. */
#define VARP_PROBE_MAX 5

/** Delay between successive probes (in jiffies). */
#define VARP_PROBE_INTERVAL (3*HZ)

/** Largest number of skbs allowed to wait on one varp entry. */
#define VARP_QUEUE_MAX 16

/** Bucket count for the varp hash table (must be prime). */
#define VARP_TABLE_BUCKETS 3001
/** Values stored in VarpEntry.state. */
enum {
    VARP_STATE_INCOMPLETE = 1,
    VARP_STATE_REACHABLE  = 2,
    VARP_STATE_FAILED     = 3
};

/** Bits stored in VarpEntry.flags. */
enum {
    VARP_FLAG_PROBING   = 1,
    VARP_FLAG_PERMANENT = 2,
};
116 /** Key for varp entries. */
117 typedef struct VarpKey {
118 /** Vnet id (network order). */
119 VnetId vnet;
120 /** Virtual MAC address. */
121 Vmac vmac;
122 } VarpKey;
124 /** An entry in the varp cache. */
125 typedef struct VarpEntry {
126 /** Key for the entry. */
127 VarpKey key;
128 /** Care-of address for the key. */
129 VarpAddr addr;
130 /** Last-updated timestamp. */
131 unsigned long timestamp;
132 /** State. */
133 short state;
134 /** Flags. */
135 short flags;
136 /** Reference count. */
137 atomic_t refcount;
138 /** Lock. */
139 rwlock_t lock;
140 /** How many probes have been made. */
141 atomic_t probes;
142 /** Probe timer. */
143 struct timer_list timer;
144 void (*error)(struct VarpEntry *ventry, struct sk_buff *skb);
145 /** Outbound skb queue. */
146 struct sk_buff_head queue;
147 /** Maximum size of the queue. */
148 int queue_max;
149 } VarpEntry;
151 /** The varp cache. Varp entries indexed by VarpKey. */
152 typedef struct VarpTable {
154 HashTable *table;
156 /** Sweep timer. */
157 struct timer_list timer;
159 /** Lock. Need to use a semaphore instead of a spinlock because
160 * some operations under the varp table lock can schedule - and
161 * you mustn't hold a spinlock when scheduling.
162 */
163 struct semaphore lock;
165 } VarpTable;
167 /** The varp cache. */
168 static VarpTable *varp_table = NULL;
170 /** Module parameter for the multicast address. */
171 static char *varp_mcaddr = NULL;
173 /** Multicast address (network order). */
174 u32 varp_mcast_addr = 0;
176 /** UDP port (network order). */
177 u16 varp_port = 0;
179 char *varp_device = "xen-br0";
/* Table locking. The lock is a semaphore, so "read" and "write" locking are
 * the same operation; 'flags' is only zeroed to keep the call sites uniform
 * with the irq-saving entry lock below.
 */
#define VarpTable_read_lock(z, flags)    do{ (flags) = 0; down(&(z)->lock); } while(0)
#define VarpTable_read_unlock(z, flags)  do{ (flags) = 0; up(&(z)->lock); } while(0)
#define VarpTable_write_lock(z, flags)   do{ (flags) = 0; down(&(z)->lock); } while(0)
#define VarpTable_write_unlock(z, flags) do{ (flags) = 0; up(&(z)->lock); } while(0)

/* Entry locking: write lock on the entry's rwlock with irq state saved in 'flags'. */
#define VarpEntry_lock(ventry, flags)    write_lock_irqsave(&(ventry)->lock, (flags))
#define VarpEntry_unlock(ventry, flags)  write_unlock_irqrestore(&(ventry)->lock, (flags))
189 void VarpTable_sweep(VarpTable *z, int all);
190 void VarpTable_flush(VarpTable *z);
191 void VarpTable_print(VarpTable *z);
193 #include "./varp_util.c"
/** Dump the varp cache.
 * Does nothing unless DEBUG is defined at compile time.
 */
void varp_dprint(void)
{
#ifdef DEBUG
    VarpTable_print(varp_table);
#endif
}
203 /** Flush the varp cache.
204 */
205 void varp_flush(void){
206 VarpTable_flush(varp_table);
207 }
209 static int device_ucast_addr(const char *device, uint32_t *addr)
210 {
211 int err;
212 struct net_device *dev = NULL;
214 err = vnet_get_device(device, &dev);
215 if(err) goto exit;
216 err = vnet_get_device_address(dev, addr);
217 exit:
218 if(err){
219 *addr = 0;
220 }
221 return err;
222 }
224 /** Get the unicast address of the varp device.
225 */
226 int varp_ucast_addr(uint32_t *addr)
227 {
228 int err = -ENODEV;
229 const char *devices[] = { varp_device, "eth0", "eth1", "eth2", NULL };
230 const char **p;
231 for(p = devices; err && *p; p++){
232 err = device_ucast_addr(*p, addr);
233 }
234 return err;
235 }
237 /** Print varp info and the varp cache.
238 */
239 void varp_print(void){
240 uint32_t addr = 0;
241 varp_ucast_addr(&addr);
243 printk(KERN_INFO "=== VARP ===============================================================\n");
244 printk(KERN_INFO "varp_device %s\n", varp_device);
245 printk(KERN_INFO "varp_mcast_addr " IPFMT "\n", NIPQUAD(varp_mcast_addr));
246 printk(KERN_INFO "varp_ucast_addr " IPFMT "\n", NIPQUAD(addr));
247 printk(KERN_INFO "varp_port %d\n", ntohs(varp_port));
248 vnet_print();
249 vif_print();
250 VarpTable_print(varp_table);
251 printk(KERN_INFO "========================================================================\n");
252 }
254 /** Lookup a network device by name.
255 *
256 * @param name device name
257 * @param dev return parameter for the device
258 * @return 0 on success, error code otherwise
259 */
260 int vnet_get_device(const char *name, struct net_device **dev){
261 int err = 0;
262 *dev = dev_get_by_name(name);
263 if(!*dev){
264 err = -ENETDOWN;
265 }
266 return err;
267 }
269 /** Get the source address from a device.
270 *
271 * @param dev device
272 * @param addr return parameter for address
273 * @return 0 on success, error code otherwise
274 */
275 int vnet_get_device_address(struct net_device *dev, u32 *addr){
276 int err = 0;
277 struct in_device *in_dev;
279 in_dev = in_dev_get(dev);
280 if(!in_dev){
281 err = -ENODEV;
282 goto exit;
283 }
284 *addr = in_dev->ifa_list->ifa_address;
285 in_dev_put(in_dev);
286 exit:
287 return err;
288 }
290 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
292 static inline int addr_route(u32 daddr, struct rtable **prt){
293 int err = 0;
294 struct flowi fl = {
295 .nl_u = {
296 .ip4_u = {
297 .daddr = daddr,
298 }
299 }
300 };
302 err = ip_route_output_key(prt, &fl);
303 return err;
304 }
306 #else
308 static inline int addr_route(u32 daddr, struct rtable **prt){
309 int err = 0;
310 struct rt_key key = { .dst = daddr };
311 err = ip_route_output_key(prt, &key);
312 return err;
313 }
315 #endif
/* Fallback for kernels that do not provide LL_RESERVED_SPACE:
 * the device's hard header length rounded down to a multiple of
 * HH_DATA_MOD, plus one further HH_DATA_MOD.
 * The argument is parenthesised so any pointer expression may be passed.
 */
#ifndef LL_RESERVED_SPACE
#define HH_DATA_MOD 16
#define LL_RESERVED_SPACE(dev) \
    ((((dev)->hard_header_len) & ~(HH_DATA_MOD - 1)) + HH_DATA_MOD)
#endif
323 /** Send a varp protocol message.
324 *
325 * @param opcode varp opcode (host order)
326 * @param dev device (may be null)
327 * @param skb skb being replied to (may be null)
328 * @param vnet vnet id (in network order)
329 * @param vmac vmac (in network order)
330 * @return 0 on success, error code otherwise
331 */
332 int varp_send(u16 opcode, struct net_device *dev, struct sk_buff *skbin,
333 VnetId *vnet, Vmac *vmac){
334 int err = 0;
335 int link_n = 0;
336 int ip_n = sizeof(struct iphdr);
337 int udp_n = sizeof(struct udphdr);
338 int varp_n = sizeof(VarpHdr);
339 struct sk_buff *skbout = NULL;
340 struct in_device *in_dev = NULL;
341 VarpHdr *varph = NULL;
342 u8 macbuf[6] = {};
343 u8 *smac, *dmac = macbuf;
344 u32 saddr, daddr;
345 u16 sport, dport;
346 #if defined(DEBUG)
347 char vnetbuf[VNET_ID_BUF];
348 #endif
350 dprintf("> opcode=%d vnet= %s vmac=" MACFMT "\n",
351 opcode, VnetId_ntoa(vnet, vnetbuf), MAC6TUPLE(vmac->mac));
353 dport = varp_port;
354 if(skbin){
355 daddr = skbin->nh.iph->saddr;
356 dmac = eth_hdr(skbin)->h_source;
357 sport = skbin->h.uh->dest;
358 } else {
359 if(MULTICAST(varp_mcast_addr)){
360 daddr = varp_mcast_addr;
361 ip_eth_mc_map(daddr, dmac);
362 } else {
363 daddr = INADDR_BROADCAST;
364 }
365 sport = varp_port;
366 }
368 if(!dev){
369 struct rtable *rt = NULL;
370 err = addr_route(daddr, &rt);
371 if(err) goto exit;
372 dev = rt->u.dst.dev;
373 }
375 in_dev = in_dev_get(dev);
376 if(!in_dev){
377 err = -ENODEV;
378 goto exit;
379 }
380 link_n = LL_RESERVED_SPACE(dev);
381 saddr = in_dev->ifa_list->ifa_address;
382 smac = dev->dev_addr;
383 if(daddr == INADDR_BROADCAST){
384 daddr = in_dev->ifa_list->ifa_broadcast;
385 dmac = dev->broadcast;
386 }
387 in_dev_put(in_dev);
389 dprintf("> dev=%s\n", dev->name);
390 dprintf("> smac=" MACFMT " dmac=" MACFMT "\n", MAC6TUPLE(smac), MAC6TUPLE(dmac));
391 dprintf("> saddr=" IPFMT " daddr=" IPFMT "\n", NIPQUAD(saddr), NIPQUAD(daddr));
392 dprintf("> sport=%u dport=%u\n", ntohs(sport), ntohs(dport));
394 skbout = alloc_skb(link_n + ip_n + udp_n + varp_n, GFP_ATOMIC);
395 if (!skbout){
396 err = -ENOMEM;
397 goto exit;
398 }
399 skbout->dev = dev;
400 skb_reserve(skbout, link_n);
401 skbout->protocol = htons(ETH_P_IP);
403 // Device header. Pushes device header on front of skb.
404 if (dev->hard_header){
405 err = dev->hard_header(skbout, dev, ETH_P_IP, dmac, smac, skbout->len);
406 if(err < 0) goto exit;
407 skbout->mac.raw = skbout->data;
408 }
410 // IP header.
411 skbout->nh.raw = skb_put(skbout, ip_n);
412 skbout->nh.iph->version = 4;
413 skbout->nh.iph->ihl = ip_n / 4;
414 skbout->nh.iph->tos = 0;
415 skbout->nh.iph->tot_len = htons(ip_n + udp_n + varp_n);
416 skbout->nh.iph->id = 0;
417 skbout->nh.iph->frag_off = 0;
418 skbout->nh.iph->ttl = 64;
419 skbout->nh.iph->protocol = IPPROTO_UDP;
420 skbout->nh.iph->saddr = saddr;
421 skbout->nh.iph->daddr = daddr;
422 skbout->nh.iph->check = 0;
424 // UDP header.
425 skbout->h.raw = skb_put(skbout, udp_n);
426 skbout->h.uh->source = sport;
427 skbout->h.uh->dest = dport;
428 skbout->h.uh->len = htons(udp_n + varp_n);
429 skbout->h.uh->check = 0;
431 // Varp header.
432 varph = (void*)skb_put(skbout, varp_n);
433 *varph = (VarpHdr){};
434 varph->hdr.id = htons(VARP_ID);
435 varph->hdr.opcode = htons(opcode);
436 varph->vnet = *vnet;
437 varph->vmac = *vmac;
438 varph->addr.family = AF_INET;
439 varph->addr.u.ip4.s_addr = saddr;
441 err = skb_xmit(skbout);
443 exit:
444 if(err && skbout) kfree_skb(skbout);
445 dprintf("< err=%d\n", err);
446 return err;
447 }
449 /** Send a varp request for the vnet and destination mac of a packet.
450 *
451 * @param skb packet
452 * @param vnet vnet (in network order)
453 * @return 0 on success, error code otherwise
454 */
455 int varp_solicit(struct sk_buff *skb, VnetId *vnet){
456 int err = 0;
457 err = varp_send(VARP_OP_REQUEST, NULL, NULL,
458 vnet, (Vmac*)eth_hdr(skb)->h_dest);
459 return err;
460 }
462 /* Test some flags.
463 *
464 * @param z varp entry
465 * @param flags to test
466 * @return nonzero if flags set
467 */
468 int VarpEntry_get_flags(VarpEntry *z, int flags){
469 return z->flags & flags;
470 }
472 /** Set some flags.
473 *
474 * @param z varp entry
475 * @param flags to set
476 * @param set set flags on if nonzero, off if zero
477 * @return new flags value
478 */
479 int VarpEntry_set_flags(VarpEntry *z, int flags, int set){
480 if(set){
481 z->flags |= flags;
482 } else {
483 z->flags &= ~flags;
484 }
485 return z->flags;
486 }
488 /** Print a varp entry.
489 *
490 * @param ventry varp entry
491 */
492 void VarpEntry_print(VarpEntry *ventry){
493 if(ventry){
494 char *state, *flags;
495 char vnetbuf[VNET_ID_BUF];
496 char addrbuf[VARP_ADDR_BUF];
498 switch(ventry->state){
499 case VARP_STATE_INCOMPLETE: state = "INC"; break;
500 case VARP_STATE_REACHABLE: state = "RCH"; break;
501 case VARP_STATE_FAILED: state = "FLD"; break;
502 default: state = "UNK"; break;
503 }
504 flags = (VarpEntry_get_flags(ventry, VARP_FLAG_PROBING) ? "P" : " ");
506 printk(KERN_INFO "VENTRY(%p ref=%1d %s %s vnet=%s vmac=" MACFMT
507 " addr=%s q=%3d t=%lu)\n",
508 ventry,
509 atomic_read(&ventry->refcount),
510 state, flags,
511 VnetId_ntoa(&ventry->key.vnet, vnetbuf),
512 MAC6TUPLE(ventry->key.vmac.mac),
513 VarpAddr_ntoa(&ventry->addr, addrbuf),
514 skb_queue_len(&ventry->queue),
515 ventry->timestamp);
516 } else {
517 printk("VENTRY: Null!\n");
518 }
519 }
521 /** Free a varp entry.
522 *
523 * @param z varp entry
524 */
525 void VarpEntry_free(VarpEntry *z){
526 if(!z) return;
527 deallocate(z);
528 }
530 /** Increment reference count.
531 *
532 * @param z varp entry (may be null)
533 */
534 void VarpEntry_incref(VarpEntry *z){
535 if(!z) return;
536 atomic_inc(&z->refcount);
537 }
539 /** Decrement reference count, freeing if zero.
540 *
541 * @param z varp entry (may be null)
542 */
543 void VarpEntry_decref(VarpEntry *z){
544 if(!z) return;
545 if(atomic_dec_and_test(&z->refcount)){
546 VarpEntry_free(z);
547 }
548 }
550 /** Call the error handler.
551 *
552 * @param ventry varp entry
553 */
554 void VarpEntry_error(VarpEntry *ventry){
555 struct sk_buff *skb;
556 skb = skb_peek(&ventry->queue);
557 if(!skb) return;
558 if(ventry->error) ventry->error(ventry, skb);
559 skb_queue_purge(&ventry->queue);
560 }
562 /** Schedule the varp entry timer.
563 * Must increment the reference count before doing
564 * this the first time, so the ventry won't be freed
565 * before the timer goes off.
566 *
567 * @param ventry varp entry
568 */
569 void VarpEntry_schedule(VarpEntry *ventry){
570 unsigned long now = jiffies;
571 ventry->timer.expires = now + VARP_PROBE_INTERVAL;
572 add_timer(&ventry->timer);
573 }
575 /** Function called when a varp entry timer goes off.
576 * If the entry is still incomplete, carries on probing.
577 * Otherwise stops probing.
578 *
579 * @param arg ventry
580 */
581 static void varp_timer_fn(unsigned long arg){
582 unsigned long flags;
583 VarpEntry *ventry = (VarpEntry *)arg;
584 struct sk_buff *skb = NULL;
585 int locked = 0, probing = 0;
587 dprintf(">\n"); //VarpEntry_print(ventry);
588 VarpEntry_lock(ventry, flags);
589 locked = 1;
590 if(ventry->state == VARP_STATE_REACHABLE){
591 // Do nothing.
592 } else {
593 // Probe if haven't run out of tries, otherwise fail.
594 if(atomic_read(&ventry->probes) < VARP_PROBE_MAX){
595 probing = 1;
596 VarpEntry_schedule(ventry);
597 skb = skb_peek(&ventry->queue);
598 if(skb){
599 dprintf("> skbs in queue - solicit\n");
600 atomic_inc(&ventry->probes);
601 VarpEntry_unlock(ventry, flags);
602 locked = 0;
603 varp_solicit(skb, &ventry->key.vnet);
604 } else {
605 dprintf("> empty queue.\n");
606 }
607 } else {
608 dprintf("> Out of probes: FAILED\n");
609 VarpEntry_error(ventry);
610 ventry->state = VARP_STATE_FAILED;
611 }
612 }
613 VarpEntry_set_flags(ventry, VARP_FLAG_PROBING, probing);
614 if(locked) VarpEntry_unlock(ventry, flags);
615 if(!probing) VarpEntry_decref(ventry);
616 dprintf("<\n");
617 }
619 /** Default error function for varp entries.
620 *
621 * @param ventry varp entry
622 * @param skb packet dropped because of error
623 */
624 static void varp_error_fn(VarpEntry *ventry, struct sk_buff *skb){
625 }
627 /** Create a varp entry. Initializes the internal state.
628 *
629 * @param vnet vnet id
630 * @param vmac virtual MAC address (copied)
631 * @return ventry or null
632 */
633 VarpEntry * VarpEntry_new(VnetId *vnet, Vmac *vmac){
634 VarpEntry *z = ALLOCATE(VarpEntry);
635 if(z){
636 unsigned long now = jiffies;
638 atomic_set(&z->refcount, 1);
639 z->lock = RW_LOCK_UNLOCKED;
640 z->state = VARP_STATE_INCOMPLETE;
641 z->queue_max = VARP_QUEUE_MAX;
642 skb_queue_head_init(&z->queue);
643 init_timer(&z->timer);
644 z->timer.data = (unsigned long)z;
645 z->timer.function = varp_timer_fn;
646 z->timestamp = now;
647 z->error = varp_error_fn;
649 z->key.vnet = *vnet;
650 z->key.vmac = *vmac;
651 }
652 return z;
653 }
655 /** Hash function for keys in the varp cache.
656 * Hashes the vnet id and mac.
657 *
658 * @param k key (VarpKey)
659 * @return hashcode
660 */
661 Hashcode varp_key_hash_fn(void *k){
662 VarpKey *key = k;
663 Hashcode h = 0;
664 h = VnetId_hash(h, &key->vnet);
665 h = Vmac_hash(h, &key->vmac);
666 return h;
667 }
669 /** Test equality for keys in the varp cache.
670 * Compares vnet and mac.
671 *
672 * @param k1 key to compare (VarpKey)
673 * @param k2 key to compare (VarpKey)
674 * @return 1 if equal, 0 otherwise
675 */
676 int varp_key_equal_fn(void *k1, void *k2){
677 VarpKey *key1 = k1;
678 VarpKey *key2 = k2;
679 return (VnetId_eq(&key1->vnet, &key2->vnet) &&
680 Vmac_eq(&key1->vmac, &key2->vmac));
681 }
683 /** Free an entry in the varp cache.
684 *
685 * @param table containing table
686 * @param entry entry to free
687 */
688 static void varp_entry_free_fn(HashTable *table, HTEntry *entry){
689 VarpEntry *ventry;
690 if(!entry) return;
691 ventry = entry->value;
692 if(ventry) VarpEntry_decref(ventry);
693 HTEntry_free(entry);
694 }
696 /** Free the whole varp cache.
697 * Dangerous.
698 *
699 * @param z varp cache
700 */
701 void VarpTable_free(VarpTable *z){
702 unsigned long flags;
703 if(!z) return;
704 VarpTable_write_lock(z, flags);
705 del_timer(&z->timer);
706 z->timer.data = 0;
707 if(z->table) HashTable_free(z->table);
708 VarpTable_write_unlock(z, flags);
709 deallocate(z);
710 }
712 /** Schedule the varp table timer.
713 *
714 * @param z varp table
715 */
716 void VarpTable_schedule(VarpTable *z){
717 unsigned long now = jiffies;
718 z->timer.expires = now + VARP_ENTRY_TTL;
719 add_timer(&z->timer);
720 }
722 /** Function called when the varp table timer goes off.
723 * Sweeps old varp cache entries and reschedules itself.
724 *
725 * @param arg varp table
726 */
727 static void varp_table_timer_fn(unsigned long arg){
728 VarpTable *z = (VarpTable *)arg;
729 if(z){
730 VarpTable_sweep(z, 0);
731 VarpTable_schedule(z);
732 }
733 }
735 /** Print a varp table.
736 *
737 * @param z table
738 */
739 void VarpTable_print(VarpTable *z){
740 HashTable_for_decl(entry);
741 VarpEntry *ventry;
742 unsigned long flags, vflags;
744 VarpTable_read_lock(z, flags);
745 HashTable_for_each(entry, varp_table->table){
746 ventry = entry->value;
747 VarpEntry_lock(ventry, vflags);
748 VarpEntry_print(ventry);
749 VarpEntry_unlock(ventry, vflags);
750 }
751 VarpTable_read_unlock(z, flags);
752 }
754 /** Create a varp table.
755 *
756 * @return new table or null
757 */
758 VarpTable * VarpTable_new(void){
759 int err = -ENOMEM;
760 VarpTable *z = NULL;
762 z = ALLOCATE(VarpTable);
763 if(!z) goto exit;
764 z->table = HashTable_new(VARP_TABLE_BUCKETS);
765 if(!z->table) goto exit;
766 z->table->key_equal_fn = varp_key_equal_fn;
767 z->table->key_hash_fn = varp_key_hash_fn;
768 z->table->entry_free_fn = varp_entry_free_fn;
769 init_MUTEX(&z->lock);
770 init_timer(&z->timer);
771 z->timer.data = (unsigned long)z;
772 z->timer.function = varp_table_timer_fn;
773 VarpTable_schedule(z);
774 err = 0;
775 exit:
776 if(err){
777 VarpTable_free(z);
778 z = NULL;
779 }
780 return z;
781 }
783 /** Add a new entry to the varp table.
784 *
785 * @param z table
786 * @param vnet vnet id
787 * @param vmac virtual MAC address (copied)
788 * @return new entry or null
789 */
790 VarpEntry * VarpTable_add(VarpTable *z, VnetId *vnet, Vmac *vmac){
791 int err = -ENOMEM;
792 VarpEntry *ventry;
793 HTEntry *entry;
794 unsigned long flags;
796 ventry = VarpEntry_new(vnet, vmac);
797 if(!ventry) goto exit;
798 VarpTable_write_lock(z, flags);
799 entry = HashTable_add(z->table, ventry, ventry);
800 VarpTable_write_unlock(z, flags);
801 if(!entry) goto exit;
802 VarpEntry_incref(ventry);
803 err = 0;
804 exit:
805 if(err){
806 VarpEntry_free(ventry);
807 ventry = NULL;
808 }
809 return ventry;
810 }
812 /** Remove an entry from the varp table.
813 *
814 * @param z table
815 * @param ventry entry to remove
816 * @return removed count
817 */
818 int VarpTable_remove(VarpTable *z, VarpEntry *ventry){
819 return HashTable_remove(z->table, ventry);
820 }
822 /** Lookup an entry in the varp table.
823 *
824 * @param z table
825 * @param vnet vnet id
826 * @param vmac virtual MAC addres
827 * @return entry found or null
828 */
829 VarpEntry * VarpTable_lookup(VarpTable *z, VnetId *vnet, Vmac *vmac){
830 unsigned long flags;
831 VarpKey key = { .vnet = *vnet, .vmac = *vmac };
832 VarpEntry *ventry;
833 VarpTable_read_lock(z, flags);
834 ventry = HashTable_get(z->table, &key);
835 if(ventry) VarpEntry_incref(ventry);
836 VarpTable_read_unlock(z, flags);
837 return ventry;
838 }
840 /** Handle output for a reachable ventry.
841 * Send the skb using the tunnel to the care-of address.
842 * Assumes the ventry lock is held.
843 *
844 * @param ventry varp entry
845 * @param skb skb to send
846 * @return 0 on success, error code otherwise
847 */
848 int VarpEntry_send(VarpEntry *ventry, struct sk_buff *skb){
849 int err = 0;
850 unsigned long flags = 0;
851 VarpAddr addr;
853 dprintf("> skb=%p\n", skb);
854 addr = ventry->addr;
855 VarpEntry_unlock(ventry, flags);
856 err = vnet_tunnel_send(&ventry->key.vnet, &addr, skb);
857 VarpEntry_lock(ventry, flags);
858 dprintf("< err=%d\n", err);
859 return err;
860 }
862 /** Handle output for a non-reachable ventry. Send messages to complete it.
863 * If the entry is still incomplete, queue the skb, otherwise
864 * send it. If the queue is full, dequeue and free an old skb to
865 * make room for the new one.
866 * Assumes the ventry lock is held.
867 *
868 * @param ventry varp entry
869 * @param skb skb to send
870 * @return 0 on success, error code otherwise
871 */
872 int VarpEntry_resolve(VarpEntry *ventry, struct sk_buff *skb){
873 int err = 0;
874 unsigned long flags = 0;
876 dprintf("> skb=%p\n", skb);
877 ventry->state = VARP_STATE_INCOMPLETE;
878 atomic_set(&ventry->probes, 1);
879 if(!VarpEntry_get_flags(ventry, VARP_FLAG_PROBING)){
880 VarpEntry_set_flags(ventry, VARP_FLAG_PROBING, 1);
881 VarpEntry_incref(ventry);
882 VarpEntry_schedule(ventry);
883 }
884 VarpEntry_unlock(ventry, flags);
885 varp_solicit(skb, &ventry->key.vnet);
886 VarpEntry_lock(ventry, flags);
888 if(ventry->state == VARP_STATE_INCOMPLETE){
889 if(skb_queue_len(&ventry->queue) >= ventry->queue_max){
890 struct sk_buff *oldskb;
891 oldskb = ventry->queue.next;
892 __skb_unlink(oldskb, &ventry->queue);
893 dprintf("> dropping skb=%p\n", oldskb);
894 kfree_skb(oldskb);
895 }
896 __skb_queue_tail(&ventry->queue, skb);
897 } else {
898 err = VarpEntry_send(ventry, skb);
899 }
900 dprintf("< err=%d\n", err);
901 return err;
902 }
904 /** Handle output for a ventry. Resolves the ventry
905 * if necessary.
906 *
907 * @param ventry varp entry
908 * @param skb skb to send
909 * @return 0 on success, error code otherwise
910 */
911 int VarpEntry_output(VarpEntry *ventry, struct sk_buff *skb){
912 int err = 0;
914 switch(ventry->state){
915 case VARP_STATE_REACHABLE:
916 err = VarpEntry_send(ventry, skb);
917 break;
918 default:
919 err = VarpEntry_resolve(ventry, skb);
920 break;
921 }
922 return err;
923 }
925 /** Process the output queue for a ventry. Sends the queued skbs if
926 * the ventry is reachable, otherwise drops them.
927 *
928 * @param ventry varp entry
929 */
930 void VarpEntry_process_queue(VarpEntry *ventry){
931 struct sk_buff *skb;
932 for( ; ; ){
933 if(ventry->state != VARP_STATE_REACHABLE) break;
934 skb = __skb_dequeue(&ventry->queue);
935 if(!skb) break;
936 VarpEntry_output(ventry, skb);
937 }
938 skb_queue_purge(&ventry->queue);
939 }
941 /** Update a ventry. Sets the address and state to those given
942 * and sets the timestamp to 'now'.
943 *
944 * @param ventry varp entry
945 * @param addr care-of address
946 * @param state state
947 * @return 0 on success, error code otherwise
948 */
949 int VarpEntry_update(VarpEntry *ventry, VarpAddr *addr, int state){
950 int err = 0;
951 unsigned long now = jiffies;
952 unsigned long flags;
954 dprintf("> addr=" IPFMT " state=%d\n", NIPQUAD(addr), state);
955 VarpEntry_lock(ventry, flags);
956 if(VarpEntry_get_flags(ventry, VARP_FLAG_PERMANENT)) goto exit;
957 ventry->addr = *addr;
958 ventry->timestamp = now;
959 ventry->state = state;
960 VarpEntry_process_queue(ventry);
961 exit:
962 VarpEntry_unlock(ventry, flags);
963 dprintf("< err=%d\n", err);
964 return err;
965 }
967 int VarpTable_update(VarpTable *z, VnetId *vnet, Vmac *vmac, VarpAddr *addr,
968 int state, int force){
969 int err = 0;
970 VarpEntry *ventry;
971 #ifdef DEBUG
972 char vnetbuf[VNET_ID_BUF];
973 char addrbuf[VARP_ADDR_BUF];
974 #endif
976 dprintf("> vnet=%s mac=" MACFMT " addr=%s state=%d force=%d\n",
977 VnetId_ntoa(vnet, vnetbuf),
978 MAC6TUPLE(vmac->mac),
979 VarpAddr_ntoa(addr, addrbuf),
980 state,
981 force);
982 ventry = VarpTable_lookup(z, vnet, vmac);
983 if(force && !ventry){
984 dprintf("> No entry, adding\n");
985 ventry = VarpTable_add(z, vnet, vmac);
986 }
987 if(ventry){
988 dprintf("> Updating\n");
989 err = VarpEntry_update(ventry, addr, state);
990 VarpEntry_decref(ventry);
991 } else {
992 dprintf("> No entry found\n");
993 err = -ENOENT;
994 }
995 dprintf("< err=%d\n", err);
996 return err;
997 }
999 /** Update the ventry corresponding to the given varp header.
1001 * @param z table
1002 * @param varph varp header
1003 * @param state state
1004 * @return 0 on success, -ENOENT if no entry found
1005 */
1006 int VarpTable_update_entry(VarpTable *z, VarpHdr *varph, int state){
1007 return VarpTable_update(z, &varph->vnet, &varph->vmac, &varph->addr, state, 0);
1010 int varp_update(VnetId *vnet, unsigned char *vmac, VarpAddr *addr){
1011 if(!varp_table){
1012 return -ENOSYS;
1014 return VarpTable_update(varp_table, vnet, (Vmac*)vmac, addr,
1015 VARP_STATE_REACHABLE, 1);
1018 /** Put old varp entries into the incomplete state.
1019 * Permanent entries are not changed.
1020 * If 'all' is non-zero, all non-permanent entries
1021 * are put into the incomplete state, regardless of age.
1023 * @param z table
1024 * @param all reset all entries if non-zero
1025 */
1026 void VarpTable_sweep(VarpTable *z, int all){
1027 HashTable_for_decl(entry);
1028 VarpEntry *ventry;
1029 unsigned long now = jiffies;
1030 unsigned long old = now - VARP_ENTRY_TTL;
1031 unsigned long flags, vflags;
1033 VarpTable_read_lock(z, flags);
1034 HashTable_for_each(entry, varp_table->table){
1035 ventry = entry->value;
1036 VarpEntry_lock(ventry, vflags);
1037 if(!VarpEntry_get_flags(ventry, VARP_FLAG_PERMANENT) &&
1038 (all || (ventry->timestamp < old))){
1039 VarpEntry_process_queue(ventry);
1040 ventry->state = VARP_STATE_INCOMPLETE;
1042 VarpEntry_unlock(ventry, vflags);
1044 VarpTable_read_unlock(z, flags);
1047 /** Flush the varp table.
1048 * Remove old unreachable varp entries with empty queues.
1049 * Permanent entries are not removed.
1051 * @param z table
1052 */
1053 void VarpTable_flush(VarpTable *z){
1054 HashTable_for_decl(entry);
1055 VarpEntry *ventry;
1056 unsigned long now = jiffies;
1057 unsigned long old = now - VARP_ENTRY_TTL;
1058 unsigned long flags, vflags;
1059 int flush;
1061 VarpTable_write_lock(z, flags);
1062 HashTable_for_each(entry, varp_table->table){
1063 ventry = entry->value;
1064 VarpEntry_lock(ventry, vflags);
1065 flush = (!VarpEntry_get_flags(ventry, VARP_FLAG_PERMANENT) &&
1066 (ventry->timestamp < old) &&
1067 (ventry->state != VARP_STATE_REACHABLE) &&
1068 (skb_queue_len(&ventry->queue) == 0));
1069 VarpEntry_unlock(ventry, vflags);
1070 if(flush){
1071 VarpTable_remove(z, ventry);
1074 VarpTable_write_unlock(z, flags);
1077 /** Handle a varp request. Look for a vif with the requested
1078 * vnet and vmac. If find one, reply with the vnet, vmac and our
1079 * address. Otherwise do nothing.
1081 * @param skb incoming message
1082 * @param varph varp message
1083 * @return 0 if ok, -ENOENT if no matching vif, or error code
1084 */
1085 int varp_handle_request(struct sk_buff *skb, VarpHdr *varph){
1086 int err = -ENOENT;
1087 VnetId *vnet;
1088 Vmac *vmac;
1089 Vif *vif = NULL;
1091 dprintf(">\n");
1092 vnet = &varph->vnet;
1093 vmac = &varph->vmac;
1094 if(vif_lookup(vnet, vmac, &vif)) goto exit;
1095 varp_send(VARP_OP_ANNOUNCE, skb->dev, skb, vnet, vmac);
1096 vif_decref(vif);
1097 exit:
1098 dprintf("< err=%d\n", err);
1099 return err;
1102 /** Announce the vnet and vmac of a vif (gratuitous varp).
1104 * @param dev device to send on (may be null)
1105 * @param vif vif
1106 * @return 0 on success, error code otherwise
1107 */
1108 int varp_announce_vif(struct net_device *dev, Vif *vif){
1109 int err = 0;
1110 dprintf(">\n");
1111 if(!varp_table){
1112 err = -ENOSYS;
1113 goto exit;
1115 err = varp_send(VARP_OP_ANNOUNCE, dev, NULL, &vif->vnet, &vif->vmac);
1116 exit:
1117 dprintf("< err=%d\n", err);
1118 return err;
1121 /** Handle a varp announce message.
1122 * Update the matching ventry if we have one.
1124 * @param skb incoming message
1125 * @param varp message
1126 * @return 0 if OK, -ENOENT if no matching entry
1127 */
1128 int varp_handle_announce(struct sk_buff *skb, VarpHdr *varph){
1129 int err = 0;
1131 dprintf(">\n");
1132 err = VarpTable_update_entry(varp_table, varph, VARP_STATE_REACHABLE);
1133 dprintf("< err=%d\n", err);
1134 return err;
1137 /** Handle an incoming varp message.
1139 * @param skb incoming message
1140 * @return 0 if OK, error code otherwise
1141 */
1142 int varp_handle_message(struct sk_buff *skb){
1143 // Assume h. nh set, skb->data point after udp hdr (at varphdr).
1144 int err = -EINVAL, mine = 0;
1145 VarpHdr *varph = (void*)(skb->h.uh + 1);
1147 dprintf(">\n");
1148 if(!varp_table){
1149 err = -ENOSYS;
1150 goto exit;
1152 if(MULTICAST(skb->nh.iph->daddr) &&
1153 (skb->nh.iph->daddr != varp_mcast_addr)){
1154 // Ignore multicast packets not addressed to us.
1155 err = 0;
1156 dprintf("> Ignoring daddr=" IPFMT " mcaddr=" IPFMT "\n",
1157 NIPQUAD(skb->nh.iph->daddr), NIPQUAD(varp_mcast_addr));
1158 goto exit;
1160 if(skb->len < sizeof(*varph)){
1161 wprintf("> Varp msg too short: %d < %d\n", skb->len, sizeof(*varph));
1162 goto exit;
1164 mine = 1;
1165 if(varph->hdr.id != htons(VARP_ID)){
1166 // It's not varp at all - ignore it.
1167 wprintf("> Invalid varp id: %d, expected %d \n",
1168 ntohs(varph->hdr.id),
1169 VARP_ID);
1170 goto exit;
1172 #ifdef DEBUG
1174 char vnetbuf[VNET_ID_BUF];
1175 char addrbuf[VARP_ADDR_BUF];
1176 dprintf("> saddr=" IPFMT " daddr=" IPFMT "\n",
1177 NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr));
1178 dprintf("> sport=%u dport=%u\n", ntohs(skb->h.uh->source), ntohs(skb->h.uh->dest));
1179 dprintf("> opcode=%d vnet=%s vmac=" MACFMT " addr=%s\n",
1180 ntohs(varph->hdr.opcode),
1181 VnetId_ntoa(&varph->vnet, vnetbuf),
1182 MAC6TUPLE(varph->vmac.mac),
1183 VarpAddr_ntoa(&varph->addr, addrbuf));
1184 varp_dprint();
1186 #endif
1187 switch(ntohs(varph->hdr.opcode)){
1188 case VARP_OP_REQUEST:
1189 err = varp_handle_request(skb, varph);
1190 break;
1191 case VARP_OP_ANNOUNCE:
1192 err = varp_handle_announce(skb, varph);
1193 break;
1194 default:
1195 wprintf("> Unknown opcode: %d \n", ntohs(varph->hdr.opcode));
1196 break;
1198 exit:
1199 if(mine) err = 1;
1200 dprintf("< err=%d\n", err);
1201 return err;
1204 /** Send an outgoing packet on the appropriate vnet tunnel.
1206 * @param skb outgoing message
1207 * @param vnet vnet (network order)
1208 * @return 0 on success, error code otherwise
1209 */
1210 int varp_output(struct sk_buff *skb, VnetId *vnet){
1211 int err = 0;
1212 unsigned char *mac = NULL;
1213 Vmac *vmac = NULL;
1214 VarpEntry *ventry = NULL;
1216 dprintf(">\n");
1217 if(!varp_table){
1218 err = -ENOSYS;
1219 goto exit;
1221 if(!skb->mac.raw){
1222 wprintf("> No ethhdr in skb!\n");
1223 err = -EINVAL;
1224 goto exit;
1226 mac = eth_hdr(skb)->h_dest;
1227 vmac = (Vmac*)mac;
1228 if(mac_is_multicast(mac)){
1229 VarpAddr addr = {};
1230 addr.family = AF_INET;
1231 addr.u.ip4.s_addr = varp_mcast_addr;
1232 err = vnet_tunnel_send(vnet, &addr, skb);
1233 } else {
1234 ventry = VarpTable_lookup(varp_table, vnet, vmac);
1235 if(!ventry){
1236 ventry = VarpTable_add(varp_table, vnet, vmac);
1238 if(ventry){
1239 unsigned long flags;
1240 VarpEntry_lock(ventry, flags);
1241 err = VarpEntry_output(ventry, skb);
1242 VarpEntry_unlock(ventry, flags);
1243 VarpEntry_decref(ventry);
1244 } else {
1245 err = -ENOMEM;
1248 exit:
1249 dprintf("< err=%d\n", err);
1250 return err;
1253 /** Set the varp multicast address (after initialization).
1255 * @param addr address (network order)
1256 * @return 0 on success, error code otherwise
1257 */
1258 int varp_set_mcast_addr(uint32_t addr){
1259 int err = 0;
1260 varp_close();
1261 varp_mcast_addr = addr;
1262 err = varp_open(varp_mcast_addr, varp_port);
1263 return err;
1266 /** Initialize the varp multicast address from a module parameter.
1268 * @param s address in IPv4 notation
1269 * @return 0 on success, error code otherwise
1270 */
1271 static void varp_init_mcast_addr(char *s){
1272 unsigned long v = 0;
1274 dprintf("> %s\n", s);
1275 if(s && (get_inet_addr(s, &v) >= 0)){
1276 varp_mcast_addr = (u32)v;
1277 } else {
1278 varp_mcast_addr = htonl(VARP_MCAST_ADDR);
1282 /** Initialize the varp cache.
1284 * @return 0 on success, error code otherwise
1285 */
1286 int varp_init(void){
1287 int err = 0;
1289 dprintf(">\n");
1290 varp_table = VarpTable_new();
1291 if(!varp_table){
1292 err = -ENOMEM;
1293 goto exit;
1295 varp_init_mcast_addr(varp_mcaddr);
1296 varp_port = htons(VARP_PORT);
1298 err = varp_open(varp_mcast_addr, varp_port);
1299 exit:
1300 dprintf("< err=%d\n", err);
1301 return err;
1304 /** Close the varp cache.
1305 */
1306 void varp_exit(void){
1307 dprintf(">\n");
1308 varp_close();
1309 if(varp_table){
1310 VarpTable *z = varp_table;
1311 varp_table = NULL;
1312 VarpTable_free(z);
1314 dprintf("<\n");
1317 MODULE_PARM(varp_mcaddr, "s");
1318 MODULE_PARM_DESC(varp_mcaddr, "VARP multicast address");
1320 MODULE_PARM(varp_device, "s");
1321 MODULE_PARM_DESC(varp_device, "VARP network device");