ia64/xen-unstable

view tools/vnet/vnet-module/varp.c @ 8740:3d7ea7972b39

Update patches for linux 2.6.15.

Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk>
author cl349@firebug.cl.cam.ac.uk
date Thu Feb 02 17:16:00 2006 +0000 (2006-02-02)
parents 18eb059ae471
children 71b0f00f6344
line source
1 /*
2 * Copyright (C) 2004 Mike Wray <mike.wray@hp.com>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by the
6 * Free Software Foundation; either version 2 of the License, or (at your
7 * option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
11 * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * for more details.
13 *
14 * You should have received a copy of the GNU General Public License along
15 * with this program; if not, write to the Free software Foundation, Inc.,
16 * 59 Temple Place, suite 330, Boston, MA 02111-1307 USA
17 *
18 */
20 #include <linux/config.h>
21 #include <linux/kernel.h>
22 #include <linux/module.h>
23 #include <linux/init.h>
24 #include <linux/string.h>
25 #include <linux/version.h>
27 #include <linux/net.h>
28 #include <linux/in.h>
29 #include <linux/inet.h>
30 #include <linux/netdevice.h>
31 #include <linux/udp.h>
33 #include <net/ip.h>
34 #include <net/protocol.h>
35 #include <net/route.h>
36 #include <linux/skbuff.h>
37 #include <linux/spinlock.h>
38 #include <asm/semaphore.h>
40 #include <tunnel.h>
41 #include <vnet.h>
42 #include <vif.h>
43 #include <if_varp.h>
44 #include <varp.h>
45 #include <vnet.h>
47 #include "allocate.h"
48 #include "hash_table.h"
49 #include "sys_net.h"
50 #include "sys_string.h"
51 #include "skb_util.h"
53 #define MODULE_NAME "VARP"
54 #define DEBUG 1
55 #undef DEBUG
56 #include "debug.h"
58 /** @file VARP: Virtual ARP.
59 *
60 * Handles virtual ARP requests for vnet/vmac.
61 */
/*
 * Varp uses UDP on port 1798.
 *
 * on domain up: ?
 *   send varp.announce { id, vmac, vnet, coa } for each vif
 *   that has not been announced before, or that has changed.
 *   install vif entries in the local table.
 *
 * on varp.announce { id, vmac, vnet, coa }:
 *   update the VARP entry for vmac x vnet if we have one, reset its ttl.
 *
 * on varp.request { id, vmac, vnet }:
 *   if we have a vif for the requested vmac/vnet,
 *   reply with varp.announce { id, vmac, vnet, coa }
 *
 * on timer:
 *   traverse the VARP table, flush old entries.
 *
 * on probe timer:
 *   probe again if not out of tries.
 *   if out of tries, invalidate the entry.
 */
/** Lifetime of a varp entry, in jiffies. */
#define VARP_ENTRY_TTL      (60*HZ)

/** Upper bound on the number of probes sent for one entry. */
#define VARP_PROBE_MAX      5

/** Delay between successive probes, in jiffies. */
#define VARP_PROBE_INTERVAL (3*HZ)

/** Upper bound on the number of skbs queued on one entry. */
#define VARP_QUEUE_MAX      16

/** Bucket count of the varp table (must be prime). */
#define VARP_TABLE_BUCKETS  3001
/** States of a varp entry. */
enum {
    /** Care-of address not (or no longer) resolved. */
    VARP_STATE_INCOMPLETE = 1,
    /** Care-of address is known. */
    VARP_STATE_REACHABLE  = 2,
    /** Probing ran out of tries. */
    VARP_STATE_FAILED     = 3
};
/** Flag bits of a varp entry. */
enum {
    /** The probe timer is active for the entry. */
    VARP_FLAG_PROBING   = 1,
    /** The entry is never updated, swept or flushed. */
    VARP_FLAG_PERMANENT = 2,
};
116 /** Key for varp entries. */
117 typedef struct VarpKey {
118 /** Vnet id (network order). */
119 VnetId vnet;
120 /** Virtual MAC address. */
121 Vmac vmac;
122 } VarpKey;
124 /** An entry in the varp cache. */
125 typedef struct VarpEntry {
126 /** Key for the entry. */
127 VarpKey key;
128 /** Care-of address for the key. */
129 VarpAddr addr;
130 /** Last-updated timestamp. */
131 unsigned long timestamp;
132 /** State. */
133 short state;
134 /** Flags. */
135 short flags;
136 /** Reference count. */
137 atomic_t refcount;
138 /** Lock. */
139 rwlock_t lock;
140 /** How many probes have been made. */
141 atomic_t probes;
142 /** Probe timer. */
143 struct timer_list timer;
144 void (*error)(struct VarpEntry *ventry, struct sk_buff *skb);
145 /** Outbound skb queue. */
146 struct sk_buff_head queue;
147 /** Maximum size of the queue. */
148 int queue_max;
149 } VarpEntry;
151 /** The varp cache. Varp entries indexed by VarpKey. */
152 typedef struct VarpTable {
154 HashTable *table;
156 /** Sweep timer. */
157 struct timer_list timer;
159 /** Lock. Need to use a semaphore instead of a spinlock because
160 * some operations under the varp table lock can schedule - and
161 * you mustn't hold a spinlock when scheduling.
162 */
163 struct semaphore lock;
165 } VarpTable;
167 /** The varp cache. */
168 static VarpTable *varp_table = NULL;
170 /** Module parameter for the multicast address. */
171 static char *varp_mcaddr = NULL;
173 /** Multicast address (network order). */
174 u32 varp_mcast_addr = 0;
176 /** UDP port (network order). */
177 u16 varp_port = 0;
179 char *varp_device = "xenbr0";
/* Table locking. The table lock is a semaphore, so the read and write
 * variants are the same operation, and 'flags' is only zeroed to keep
 * a spinlock-like calling convention.
 */
#define VarpTable_read_lock(z, flags)    do{ (flags) = 0; down(&(z)->lock); } while(0)
#define VarpTable_read_unlock(z, flags)  do{ (flags) = 0; up(&(z)->lock); } while(0)
#define VarpTable_write_lock(z, flags)   do{ (flags) = 0; down(&(z)->lock); } while(0)
#define VarpTable_write_unlock(z, flags) do{ (flags) = 0; up(&(z)->lock); } while(0)

/* Entry locking: takes the entry's rwlock for writing with interrupts
 * disabled, saving/restoring the interrupt state in 'flags'.
 */
#define VarpEntry_lock(ventry, flags)    write_lock_irqsave(&(ventry)->lock, (flags))
#define VarpEntry_unlock(ventry, flags)  write_unlock_irqrestore(&(ventry)->lock, (flags))
189 void VarpTable_sweep(VarpTable *z, int all);
190 void VarpTable_flush(VarpTable *z);
191 void VarpTable_print(VarpTable *z);
193 #include "./varp_util.c"
/** Dump the varp cache, but only in builds with DEBUG defined;
 * otherwise this is a no-op.
 */
void varp_dprint(void)
{
#ifdef DEBUG
    VarpTable_print(varp_table);
#endif
}
203 /** Flush the varp cache.
204 */
205 void varp_flush(void){
206 VarpTable_flush(varp_table);
207 }
209 static int device_ucast_addr(const char *device, uint32_t *addr)
210 {
211 int err;
212 struct net_device *dev = NULL;
214 err = vnet_get_device(device, &dev);
215 if(err) goto exit;
216 err = vnet_get_device_address(dev, addr);
217 exit:
218 if(err){
219 *addr = 0;
220 }
221 return err;
222 }
224 /** Get the unicast address of the varp device.
225 */
226 int varp_ucast_addr(uint32_t *addr)
227 {
228 int err = -ENODEV;
229 const char *devices[] = { varp_device, "eth0", "eth1", "eth2", NULL };
230 const char **p;
231 for(p = devices; err && *p; p++){
232 err = device_ucast_addr(*p, addr);
233 }
234 return err;
235 }
237 /** Print varp info and the varp cache.
238 */
239 void varp_print(void){
240 uint32_t addr = 0;
241 varp_ucast_addr(&addr);
243 printk(KERN_INFO "=== VARP ===============================================================\n");
244 printk(KERN_INFO "varp_device %s\n", varp_device);
245 printk(KERN_INFO "varp_mcast_addr " IPFMT "\n", NIPQUAD(varp_mcast_addr));
246 printk(KERN_INFO "varp_ucast_addr " IPFMT "\n", NIPQUAD(addr));
247 printk(KERN_INFO "varp_port %d\n", ntohs(varp_port));
248 vnet_print();
249 vif_print();
250 VarpTable_print(varp_table);
251 printk(KERN_INFO "========================================================================\n");
252 }
254 /** Lookup a network device by name.
255 *
256 * @param name device name
257 * @param dev return parameter for the device
258 * @return 0 on success, error code otherwise
259 */
260 int vnet_get_device(const char *name, struct net_device **dev){
261 int err = 0;
262 *dev = dev_get_by_name(name);
263 if(!*dev){
264 err = -ENETDOWN;
265 }
266 return err;
267 }
269 /** Get the source address from a device.
270 *
271 * @param dev device
272 * @param addr return parameter for address
273 * @return 0 on success, error code otherwise
274 */
275 int vnet_get_device_address(struct net_device *dev, u32 *addr){
276 int err = 0;
277 struct in_device *in_dev;
279 in_dev = in_dev_get(dev);
280 if(!in_dev){
281 err = -ENODEV;
282 goto exit;
283 }
284 *addr = in_dev->ifa_list->ifa_address;
285 in_dev_put(in_dev);
286 exit:
287 return err;
288 }
290 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
292 static inline int addr_route(u32 daddr, struct rtable **prt){
293 int err = 0;
294 struct flowi fl = {
295 .nl_u = {
296 .ip4_u = {
297 .daddr = daddr,
298 }
299 }
300 };
302 err = ip_route_output_key(prt, &fl);
303 return err;
304 }
306 #else
308 static inline int addr_route(u32 daddr, struct rtable **prt){
309 int err = 0;
310 struct rt_key key = { .dst = daddr };
311 err = ip_route_output_key(prt, &key);
312 return err;
313 }
315 #endif
/* Fallback for kernels that do not provide LL_RESERVED_SPACE: the
 * device's hard header length rounded down to a multiple of
 * HH_DATA_MOD, plus HH_DATA_MOD. The argument is parenthesized so that
 * any pointer expression can be passed.
 */
#ifndef LL_RESERVED_SPACE
#define HH_DATA_MOD 16
#define LL_RESERVED_SPACE(dev) \
    ((((dev)->hard_header_len) & ~(HH_DATA_MOD - 1)) + HH_DATA_MOD)
#endif
323 /** Send a varp protocol message.
324 *
325 * @param opcode varp opcode (host order)
326 * @param dev device (may be null)
327 * @param skb skb being replied to (may be null)
328 * @param vnet vnet id (in network order)
329 * @param vmac vmac (in network order)
330 * @return 0 on success, error code otherwise
331 */
332 int varp_send(u16 opcode, struct net_device *dev, struct sk_buff *skbin,
333 VnetId *vnet, Vmac *vmac){
334 int err = 0;
335 int link_n = 0;
336 int ip_n = sizeof(struct iphdr);
337 int udp_n = sizeof(struct udphdr);
338 int varp_n = sizeof(VarpHdr);
339 struct sk_buff *skbout = NULL;
340 struct in_device *in_dev = NULL;
341 VarpHdr *varph = NULL;
342 u8 macbuf[6] = {};
343 u8 *smac, *dmac = macbuf;
344 u32 saddr, daddr;
345 u16 sport, dport;
346 #if defined(DEBUG)
347 char vnetbuf[VNET_ID_BUF];
348 #endif
350 dprintf("> opcode=%d vnet= %s vmac=" MACFMT "\n",
351 opcode, VnetId_ntoa(vnet, vnetbuf), MAC6TUPLE(vmac->mac));
353 dport = varp_port;
354 if(skbin){
355 daddr = skbin->nh.iph->saddr;
356 dmac = eth_hdr(skbin)->h_source;
357 sport = skbin->h.uh->dest;
358 } else {
359 if(MULTICAST(varp_mcast_addr)){
360 daddr = varp_mcast_addr;
361 ip_eth_mc_map(daddr, dmac);
362 } else {
363 daddr = INADDR_BROADCAST;
364 }
365 sport = varp_port;
366 }
368 if(!dev){
369 struct rtable *rt = NULL;
370 err = addr_route(daddr, &rt);
371 if(err) goto exit;
372 dev = rt->u.dst.dev;
373 }
375 in_dev = in_dev_get(dev);
376 if(!in_dev){
377 err = -ENODEV;
378 goto exit;
379 }
380 link_n = LL_RESERVED_SPACE(dev);
381 saddr = in_dev->ifa_list->ifa_address;
382 smac = dev->dev_addr;
383 if(daddr == INADDR_BROADCAST){
384 daddr = in_dev->ifa_list->ifa_broadcast;
385 dmac = dev->broadcast;
386 }
387 in_dev_put(in_dev);
389 dprintf("> dev=%s\n", dev->name);
390 dprintf("> smac=" MACFMT " dmac=" MACFMT "\n", MAC6TUPLE(smac), MAC6TUPLE(dmac));
391 dprintf("> saddr=" IPFMT " daddr=" IPFMT "\n", NIPQUAD(saddr), NIPQUAD(daddr));
392 dprintf("> sport=%u dport=%u\n", ntohs(sport), ntohs(dport));
394 skbout = alloc_skb(link_n + ip_n + udp_n + varp_n, GFP_ATOMIC);
395 if (!skbout){
396 err = -ENOMEM;
397 goto exit;
398 }
399 skbout->dev = dev;
400 skb_reserve(skbout, link_n);
401 skbout->protocol = htons(ETH_P_IP);
403 // Device header. Pushes device header on front of skb.
404 if (dev->hard_header){
405 err = dev->hard_header(skbout, dev, ETH_P_IP, dmac, smac, skbout->len);
406 if(err < 0) goto exit;
407 skbout->mac.raw = skbout->data;
408 }
410 // IP header.
411 skbout->nh.raw = skb_put(skbout, ip_n);
412 skbout->nh.iph->version = 4;
413 skbout->nh.iph->ihl = ip_n / 4;
414 skbout->nh.iph->tos = 0;
415 skbout->nh.iph->tot_len = htons(ip_n + udp_n + varp_n);
416 skbout->nh.iph->id = 0;
417 skbout->nh.iph->frag_off = 0;
418 skbout->nh.iph->ttl = 64;
419 skbout->nh.iph->protocol = IPPROTO_UDP;
420 skbout->nh.iph->saddr = saddr;
421 skbout->nh.iph->daddr = daddr;
422 skbout->nh.iph->check = 0;
424 // UDP header.
425 skbout->h.raw = skb_put(skbout, udp_n);
426 skbout->h.uh->source = sport;
427 skbout->h.uh->dest = dport;
428 skbout->h.uh->len = htons(udp_n + varp_n);
429 skbout->h.uh->check = 0;
431 // Varp header.
432 varph = (void*)skb_put(skbout, varp_n);
433 *varph = (VarpHdr){};
434 varph->hdr.id = htons(VARP_ID);
435 varph->hdr.opcode = htons(opcode);
436 varph->vnet = *vnet;
437 varph->vmac = *vmac;
438 varph->addr.family = AF_INET;
439 varph->addr.u.ip4.s_addr = saddr;
441 err = skb_xmit(skbout);
443 exit:
444 if(err && skbout) kfree_skb(skbout);
445 dprintf("< err=%d\n", err);
446 return err;
447 }
449 /** Send a varp request for the vnet and destination mac of a packet.
450 *
451 * @param skb packet
452 * @param vnet vnet (in network order)
453 * @return 0 on success, error code otherwise
454 */
455 int varp_solicit(struct sk_buff *skb, VnetId *vnet){
456 int err = 0;
457 err = varp_send(VARP_OP_REQUEST, NULL, NULL,
458 vnet, (Vmac*)eth_hdr(skb)->h_dest);
459 return err;
460 }
462 /* Test some flags.
463 *
464 * @param z varp entry
465 * @param flags to test
466 * @return nonzero if flags set
467 */
468 int VarpEntry_get_flags(VarpEntry *z, int flags){
469 return z->flags & flags;
470 }
472 /** Set some flags.
473 *
474 * @param z varp entry
475 * @param flags to set
476 * @param set set flags on if nonzero, off if zero
477 * @return new flags value
478 */
479 int VarpEntry_set_flags(VarpEntry *z, int flags, int set){
480 if(set){
481 z->flags |= flags;
482 } else {
483 z->flags &= ~flags;
484 }
485 return z->flags;
486 }
488 /** Print a varp entry.
489 *
490 * @param ventry varp entry
491 */
492 void VarpEntry_print(VarpEntry *ventry){
493 if(ventry){
494 char *state, *flags;
495 char vnetbuf[VNET_ID_BUF];
496 char addrbuf[VARP_ADDR_BUF];
498 switch(ventry->state){
499 case VARP_STATE_INCOMPLETE: state = "INC"; break;
500 case VARP_STATE_REACHABLE: state = "RCH"; break;
501 case VARP_STATE_FAILED: state = "FLD"; break;
502 default: state = "UNK"; break;
503 }
504 flags = (VarpEntry_get_flags(ventry, VARP_FLAG_PROBING) ? "P" : " ");
506 printk(KERN_INFO "VENTRY(%p ref=%1d %s %s vnet=%s vmac=" MACFMT
507 " addr=%s q=%3d t=%lu)\n",
508 ventry,
509 atomic_read(&ventry->refcount),
510 state, flags,
511 VnetId_ntoa(&ventry->key.vnet, vnetbuf),
512 MAC6TUPLE(ventry->key.vmac.mac),
513 VarpAddr_ntoa(&ventry->addr, addrbuf),
514 skb_queue_len(&ventry->queue),
515 ventry->timestamp);
516 } else {
517 printk("VENTRY: Null!\n");
518 }
519 }
521 /** Free a varp entry.
522 *
523 * @param z varp entry
524 */
525 void VarpEntry_free(VarpEntry *z){
526 if(!z) return;
527 deallocate(z);
528 }
530 /** Increment reference count.
531 *
532 * @param z varp entry (may be null)
533 */
534 void VarpEntry_incref(VarpEntry *z){
535 if(!z) return;
536 atomic_inc(&z->refcount);
537 }
539 /** Decrement reference count, freeing if zero.
540 *
541 * @param z varp entry (may be null)
542 */
543 void VarpEntry_decref(VarpEntry *z){
544 if(!z) return;
545 if(atomic_dec_and_test(&z->refcount)){
546 VarpEntry_free(z);
547 }
548 }
550 /** Call the error handler.
551 *
552 * @param ventry varp entry
553 */
554 void VarpEntry_error(VarpEntry *ventry){
555 struct sk_buff *skb;
556 skb = skb_peek(&ventry->queue);
557 if(!skb) return;
558 if(ventry->error) ventry->error(ventry, skb);
559 skb_queue_purge(&ventry->queue);
560 }
562 /** Schedule the varp entry timer.
563 * Must increment the reference count before doing
564 * this the first time, so the ventry won't be freed
565 * before the timer goes off.
566 *
567 * @param ventry varp entry
568 */
569 void VarpEntry_schedule(VarpEntry *ventry){
570 unsigned long now = jiffies;
571 ventry->timer.expires = now + VARP_PROBE_INTERVAL;
572 add_timer(&ventry->timer);
573 }
575 /** Function called when a varp entry timer goes off.
576 * If the entry is still incomplete, carries on probing.
577 * Otherwise stops probing.
578 *
579 * @param arg ventry
580 */
581 static void varp_timer_fn(unsigned long arg){
582 unsigned long flags;
583 VarpEntry *ventry = (VarpEntry *)arg;
584 struct sk_buff *skb = NULL;
585 int locked = 0, probing = 0;
587 dprintf(">\n"); //VarpEntry_print(ventry);
588 VarpEntry_lock(ventry, flags);
589 locked = 1;
590 if(ventry->state == VARP_STATE_REACHABLE){
591 // Do nothing.
592 } else {
593 // Probe if haven't run out of tries, otherwise fail.
594 if(atomic_read(&ventry->probes) < VARP_PROBE_MAX){
595 probing = 1;
596 VarpEntry_schedule(ventry);
597 skb = skb_peek(&ventry->queue);
598 if(skb){
599 dprintf("> skbs in queue - solicit\n");
600 atomic_inc(&ventry->probes);
601 VarpEntry_unlock(ventry, flags);
602 locked = 0;
603 varp_solicit(skb, &ventry->key.vnet);
604 } else {
605 dprintf("> empty queue.\n");
606 }
607 } else {
608 dprintf("> Out of probes: FAILED\n");
609 VarpEntry_error(ventry);
610 ventry->state = VARP_STATE_FAILED;
611 }
612 }
613 VarpEntry_set_flags(ventry, VARP_FLAG_PROBING, probing);
614 if(locked) VarpEntry_unlock(ventry, flags);
615 if(!probing) VarpEntry_decref(ventry);
616 dprintf("<\n");
617 }
619 /** Default error function for varp entries.
620 *
621 * @param ventry varp entry
622 * @param skb packet dropped because of error
623 */
624 static void varp_error_fn(VarpEntry *ventry, struct sk_buff *skb){
625 }
627 /** Create a varp entry. Initializes the internal state.
628 *
629 * @param vnet vnet id
630 * @param vmac virtual MAC address (copied)
631 * @return ventry or null
632 */
633 VarpEntry * VarpEntry_new(VnetId *vnet, Vmac *vmac){
634 VarpEntry *z = ALLOCATE(VarpEntry);
635 if(z){
636 unsigned long now = jiffies;
638 atomic_set(&z->refcount, 1);
639 z->lock = RW_LOCK_UNLOCKED;
640 z->state = VARP_STATE_INCOMPLETE;
641 z->queue_max = VARP_QUEUE_MAX;
642 skb_queue_head_init(&z->queue);
643 init_timer(&z->timer);
644 z->timer.data = (unsigned long)z;
645 z->timer.function = varp_timer_fn;
646 z->timestamp = now;
647 z->error = varp_error_fn;
649 z->key.vnet = *vnet;
650 z->key.vmac = *vmac;
651 }
652 return z;
653 }
655 /** Hash function for keys in the varp cache.
656 * Hashes the vnet id and mac.
657 *
658 * @param k key (VarpKey)
659 * @return hashcode
660 */
661 Hashcode varp_key_hash_fn(void *k){
662 VarpKey *key = k;
663 Hashcode h = 0;
664 h = VnetId_hash(h, &key->vnet);
665 h = Vmac_hash(h, &key->vmac);
666 return h;
667 }
669 /** Test equality for keys in the varp cache.
670 * Compares vnet and mac.
671 *
672 * @param k1 key to compare (VarpKey)
673 * @param k2 key to compare (VarpKey)
674 * @return 1 if equal, 0 otherwise
675 */
676 int varp_key_equal_fn(void *k1, void *k2){
677 VarpKey *key1 = k1;
678 VarpKey *key2 = k2;
679 return (VnetId_eq(&key1->vnet, &key2->vnet) &&
680 Vmac_eq(&key1->vmac, &key2->vmac));
681 }
683 /** Free an entry in the varp cache.
684 *
685 * @param table containing table
686 * @param entry entry to free
687 */
688 static void varp_entry_free_fn(HashTable *table, HTEntry *entry){
689 VarpEntry *ventry;
690 if(!entry) return;
691 ventry = entry->value;
692 if(ventry) VarpEntry_decref(ventry);
693 HTEntry_free(entry);
694 }
696 /** Free the whole varp cache.
697 * Dangerous.
698 *
699 * @param z varp cache
700 */
701 void VarpTable_free(VarpTable *z){
702 unsigned long flags;
703 if(!z) return;
704 VarpTable_write_lock(z, flags);
705 del_timer(&z->timer);
706 z->timer.data = 0;
707 if(z->table) HashTable_free(z->table);
708 VarpTable_write_unlock(z, flags);
709 deallocate(z);
710 }
712 /** Schedule the varp table timer.
713 *
714 * @param z varp table
715 */
716 void VarpTable_schedule(VarpTable *z){
717 unsigned long now = jiffies;
718 z->timer.expires = now + VARP_ENTRY_TTL;
719 add_timer(&z->timer);
720 }
722 /** Function called when the varp table timer goes off.
723 * Sweeps old varp cache entries and reschedules itself.
724 *
725 * @param arg varp table
726 */
727 static void varp_table_timer_fn(unsigned long arg){
728 VarpTable *z = (VarpTable *)arg;
729 if(z){
730 VarpTable_sweep(z, 0);
731 VarpTable_schedule(z);
732 }
733 }
735 /** Print a varp table.
736 *
737 * @param z table
738 */
739 void VarpTable_print(VarpTable *z){
740 HashTable_for_decl(entry);
741 VarpEntry *ventry;
742 unsigned long flags, vflags;
744 VarpTable_read_lock(z, flags);
745 HashTable_for_each(entry, varp_table->table){
746 ventry = entry->value;
747 VarpEntry_lock(ventry, vflags);
748 VarpEntry_print(ventry);
749 VarpEntry_unlock(ventry, vflags);
750 }
751 VarpTable_read_unlock(z, flags);
752 }
754 /** Create a varp table.
755 *
756 * @return new table or null
757 */
758 VarpTable * VarpTable_new(void){
759 int err = -ENOMEM;
760 VarpTable *z = NULL;
762 z = ALLOCATE(VarpTable);
763 if(!z) goto exit;
764 z->table = HashTable_new(VARP_TABLE_BUCKETS);
765 if(!z->table) goto exit;
766 z->table->key_equal_fn = varp_key_equal_fn;
767 z->table->key_hash_fn = varp_key_hash_fn;
768 z->table->entry_free_fn = varp_entry_free_fn;
769 init_MUTEX(&z->lock);
770 init_timer(&z->timer);
771 z->timer.data = (unsigned long)z;
772 z->timer.function = varp_table_timer_fn;
773 VarpTable_schedule(z);
774 err = 0;
775 exit:
776 if(err){
777 VarpTable_free(z);
778 z = NULL;
779 }
780 return z;
781 }
783 /** Add a new entry to the varp table.
784 *
785 * @param z table
786 * @param vnet vnet id
787 * @param vmac virtual MAC address (copied)
788 * @return new entry or null
789 */
790 VarpEntry * VarpTable_add(VarpTable *z, VnetId *vnet, Vmac *vmac){
791 int err = -ENOMEM;
792 VarpEntry *ventry;
793 HTEntry *entry;
794 unsigned long flags;
796 ventry = VarpEntry_new(vnet, vmac);
797 if(!ventry) goto exit;
798 VarpTable_write_lock(z, flags);
799 entry = HashTable_add(z->table, ventry, ventry);
800 VarpTable_write_unlock(z, flags);
801 if(!entry) goto exit;
802 VarpEntry_incref(ventry);
803 err = 0;
804 exit:
805 if(err){
806 VarpEntry_free(ventry);
807 ventry = NULL;
808 }
809 return ventry;
810 }
812 /** Remove an entry from the varp table.
813 *
814 * @param z table
815 * @param ventry entry to remove
816 * @return removed count
817 */
818 int VarpTable_remove(VarpTable *z, VarpEntry *ventry){
819 return HashTable_remove(z->table, ventry);
820 }
822 /** Lookup an entry in the varp table.
823 *
824 * @param z table
825 * @param vnet vnet id
826 * @param vmac virtual MAC addres
827 * @return entry found or null
828 */
829 VarpEntry * VarpTable_lookup(VarpTable *z, VnetId *vnet, Vmac *vmac){
830 unsigned long flags;
831 VarpKey key = { .vnet = *vnet, .vmac = *vmac };
832 VarpEntry *ventry;
833 VarpTable_read_lock(z, flags);
834 ventry = HashTable_get(z->table, &key);
835 if(ventry) VarpEntry_incref(ventry);
836 VarpTable_read_unlock(z, flags);
837 return ventry;
838 }
840 /** Handle output for a reachable ventry.
841 * Send the skb using the tunnel to the care-of address.
842 * Assumes the ventry lock is held.
843 *
844 * @param ventry varp entry
845 * @param skb skb to send
846 * @return 0 on success, error code otherwise
847 */
848 int VarpEntry_send(VarpEntry *ventry, struct sk_buff *skb){
849 int err = 0;
850 unsigned long flags = 0;
851 VarpAddr addr;
853 dprintf("> skb=%p\n", skb);
854 addr = ventry->addr;
855 VarpEntry_unlock(ventry, flags);
856 err = vnet_tunnel_send(&ventry->key.vnet, &addr, skb);
857 VarpEntry_lock(ventry, flags);
858 dprintf("< err=%d\n", err);
859 return err;
860 }
862 /** Handle output for a non-reachable ventry. Send messages to complete it.
863 * If the entry is still incomplete, queue the skb, otherwise
864 * send it. If the queue is full, dequeue and free an old skb to
865 * make room for the new one.
866 * Assumes the ventry lock is held.
867 *
868 * @param ventry varp entry
869 * @param skb skb to send
870 * @return 0 on success, error code otherwise
871 */
872 int VarpEntry_resolve(VarpEntry *ventry, struct sk_buff *skb){
873 int err = 0;
874 unsigned long flags = 0;
876 dprintf("> skb=%p\n", skb);
877 ventry->state = VARP_STATE_INCOMPLETE;
878 atomic_set(&ventry->probes, 1);
879 if(!VarpEntry_get_flags(ventry, VARP_FLAG_PROBING)){
880 VarpEntry_set_flags(ventry, VARP_FLAG_PROBING, 1);
881 VarpEntry_incref(ventry);
882 VarpEntry_schedule(ventry);
883 }
884 VarpEntry_unlock(ventry, flags);
885 varp_solicit(skb, &ventry->key.vnet);
886 VarpEntry_lock(ventry, flags);
888 if(ventry->state == VARP_STATE_INCOMPLETE){
889 if(skb_queue_len(&ventry->queue) >= ventry->queue_max){
890 struct sk_buff *oldskb;
891 oldskb = ventry->queue.next;
892 __skb_unlink(oldskb, &ventry->queue);
893 dprintf("> dropping skb=%p\n", oldskb);
894 kfree_skb(oldskb);
895 }
896 __skb_queue_tail(&ventry->queue, skb);
897 } else {
898 err = VarpEntry_send(ventry, skb);
899 }
900 dprintf("< err=%d\n", err);
901 return err;
902 }
904 /** Handle output for a ventry. Resolves the ventry
905 * if necessary.
906 *
907 * @param ventry varp entry
908 * @param skb skb to send
909 * @return 0 on success, error code otherwise
910 */
911 int VarpEntry_output(VarpEntry *ventry, struct sk_buff *skb){
912 int err = 0;
914 switch(ventry->state){
915 case VARP_STATE_REACHABLE:
916 err = VarpEntry_send(ventry, skb);
917 break;
918 default:
919 err = VarpEntry_resolve(ventry, skb);
920 break;
921 }
922 return err;
923 }
925 /** Process the output queue for a ventry. Sends the queued skbs if
926 * the ventry is reachable, otherwise drops them.
927 *
928 * @param ventry varp entry
929 */
930 void VarpEntry_process_queue(VarpEntry *ventry){
931 struct sk_buff *skb;
932 for( ; ; ){
933 if(ventry->state != VARP_STATE_REACHABLE) break;
934 skb = __skb_dequeue(&ventry->queue);
935 if(!skb) break;
936 VarpEntry_output(ventry, skb);
937 }
938 skb_queue_purge(&ventry->queue);
939 }
941 /** Update a ventry. Sets the address and state to those given
942 * and sets the timestamp to 'now'.
943 *
944 * @param ventry varp entry
945 * @param addr care-of address
946 * @param state state
947 * @return 0 on success, error code otherwise
948 */
949 int VarpEntry_update(VarpEntry *ventry, VarpAddr *addr, int state){
950 int err = 0;
951 unsigned long now = jiffies;
952 unsigned long flags;
954 dprintf("> addr=" IPFMT " state=%d\n", NIPQUAD(addr), state);
955 VarpEntry_lock(ventry, flags);
956 if(VarpEntry_get_flags(ventry, VARP_FLAG_PERMANENT)) goto exit;
957 ventry->addr = *addr;
958 ventry->timestamp = now;
959 ventry->state = state;
960 VarpEntry_process_queue(ventry);
961 exit:
962 VarpEntry_unlock(ventry, flags);
963 dprintf("< err=%d\n", err);
964 return err;
965 }
967 int VarpTable_update(VarpTable *z, VnetId *vnet, Vmac *vmac, VarpAddr *addr,
968 int state, int force){
969 int err = 0;
970 VarpEntry *ventry;
971 #ifdef DEBUG
972 char vnetbuf[VNET_ID_BUF];
973 char addrbuf[VARP_ADDR_BUF];
974 #endif
976 dprintf("> vnet=%s mac=" MACFMT " addr=%s state=%d force=%d\n",
977 VnetId_ntoa(vnet, vnetbuf),
978 MAC6TUPLE(vmac->mac),
979 VarpAddr_ntoa(addr, addrbuf),
980 state,
981 force);
982 ventry = VarpTable_lookup(z, vnet, vmac);
983 if(force && !ventry){
984 dprintf("> No entry, adding\n");
985 ventry = VarpTable_add(z, vnet, vmac);
986 }
987 if(ventry){
988 dprintf("> Updating\n");
989 err = VarpEntry_update(ventry, addr, state);
990 VarpEntry_decref(ventry);
991 } else {
992 dprintf("> No entry found\n");
993 err = -ENOENT;
994 }
995 dprintf("< err=%d\n", err);
996 return err;
997 }
999 /** Update the ventry corresponding to the given varp header.
1001 * @param z table
1002 * @param varph varp header
1003 * @param state state
1004 * @return 0 on success, -ENOENT if no entry found
1005 */
1006 int VarpTable_update_entry(VarpTable *z, VarpHdr *varph, int state){
1007 return VarpTable_update(z, &varph->vnet, &varph->vmac, &varph->addr, state, 0);
1010 int varp_update(VnetId *vnet, unsigned char *vmac, VarpAddr *addr){
1011 if(!varp_table){
1012 return -ENOSYS;
1014 return VarpTable_update(varp_table, vnet, (Vmac*)vmac, addr,
1015 VARP_STATE_REACHABLE, 1);
1018 /** Put old varp entries into the incomplete state.
1019 * Permanent entries are not changed.
1020 * If 'all' is non-zero, all non-permanent entries
1021 * are put into the incomplete state, regardless of age.
1023 * @param z table
1024 * @param all reset all entries if non-zero
1025 */
1026 void VarpTable_sweep(VarpTable *z, int all){
1027 HashTable_for_decl(entry);
1028 VarpEntry *ventry;
1029 unsigned long now = jiffies;
1030 unsigned long old = now - VARP_ENTRY_TTL;
1031 unsigned long flags, vflags;
1033 VarpTable_read_lock(z, flags);
1034 HashTable_for_each(entry, varp_table->table){
1035 ventry = entry->value;
1036 VarpEntry_lock(ventry, vflags);
1037 if(!VarpEntry_get_flags(ventry, VARP_FLAG_PERMANENT) &&
1038 (all || (ventry->timestamp < old))){
1039 VarpEntry_process_queue(ventry);
1040 ventry->state = VARP_STATE_INCOMPLETE;
1042 VarpEntry_unlock(ventry, vflags);
1044 VarpTable_read_unlock(z, flags);
1047 /** Flush the varp table.
1048 * Remove old unreachable varp entries with empty queues.
1049 * Permanent entries are not removed.
1051 * @param z table
1052 */
1053 void VarpTable_flush(VarpTable *z){
1054 HashTable_for_decl(entry);
1055 VarpEntry *ventry;
1056 unsigned long now = jiffies;
1057 unsigned long old = now - VARP_ENTRY_TTL;
1058 unsigned long flags, vflags;
1059 int flush;
1061 VarpTable_write_lock(z, flags);
1062 HashTable_for_each(entry, varp_table->table){
1063 ventry = entry->value;
1064 VarpEntry_lock(ventry, vflags);
1065 flush = (!VarpEntry_get_flags(ventry, VARP_FLAG_PERMANENT) &&
1066 (ventry->timestamp < old) &&
1067 (ventry->state != VARP_STATE_REACHABLE) &&
1068 (skb_queue_len(&ventry->queue) == 0));
1069 VarpEntry_unlock(ventry, vflags);
1070 if(flush){
1071 VarpTable_remove(z, ventry);
1074 VarpTable_write_unlock(z, flags);
1077 /** Handle a varp request. Look for a vif with the requested
1078 * vnet and vmac. If find one, reply with the vnet, vmac and our
1079 * address. Otherwise do nothing.
1081 * @param skb incoming message
1082 * @param varph varp message
1083 * @return 0 if ok, -ENOENT if no matching vif, or error code
1084 */
1085 int varp_handle_request(struct sk_buff *skb, VarpHdr *varph){
1086 int err = -ENOENT;
1087 VnetId *vnet;
1088 Vmac *vmac;
1089 Vif *vif = NULL;
1091 dprintf(">\n");
1092 vnet = &varph->vnet;
1093 vmac = &varph->vmac;
1094 if(vif_lookup(vnet, vmac, &vif)) goto exit;
1095 varp_send(VARP_OP_ANNOUNCE, skb->dev, skb, vnet, vmac);
1096 vif_decref(vif);
1097 exit:
1098 dprintf("< err=%d\n", err);
1099 return err;
1102 /** Announce the vnet and vmac of a vif (gratuitous varp).
1104 * @param dev device to send on (may be null)
1105 * @param vif vif
1106 * @return 0 on success, error code otherwise
1107 */
1108 int varp_announce_vif(struct net_device *dev, Vif *vif){
1109 int err = 0;
1110 dprintf(">\n");
1111 if(!varp_table){
1112 err = -ENOSYS;
1113 goto exit;
1115 err = varp_send(VARP_OP_ANNOUNCE, dev, NULL, &vif->vnet, &vif->vmac);
1116 exit:
1117 dprintf("< err=%d\n", err);
1118 return err;
1121 /** Handle a varp announce message.
1122 * Update the matching ventry if we have one.
1124 * @param skb incoming message
1125 * @param varp message
1126 * @return 0 if OK, -ENOENT if no matching entry
1127 */
1128 int varp_handle_announce(struct sk_buff *skb, VarpHdr *varph){
1129 int err = 0;
1131 dprintf(">\n");
1132 err = VarpTable_update_entry(varp_table, varph, VARP_STATE_REACHABLE);
1133 dprintf("< err=%d\n", err);
1134 return err;
1137 /** Handle an incoming varp message.
1139 * @param skb incoming message
1140 * @return 0 if OK, error code otherwise
1141 */
1142 int varp_handle_message(struct sk_buff *skb){
1143 // Assume h. nh set, skb->data point after udp hdr (at varphdr).
1144 int err = -EINVAL, mine = 0;
1145 VarpHdr *varph = (void*)(skb->h.uh + 1);
1147 dprintf(">\n");
1148 if(!varp_table){
1149 err = -ENOSYS;
1150 goto exit;
1152 if(MULTICAST(skb->nh.iph->daddr) &&
1153 (skb->nh.iph->daddr != varp_mcast_addr)){
1154 // Ignore multicast packets not addressed to us.
1155 err = 0;
1156 dprintf("> Ignoring daddr=" IPFMT " mcaddr=" IPFMT "\n",
1157 NIPQUAD(skb->nh.iph->daddr), NIPQUAD(varp_mcast_addr));
1158 goto exit;
1160 if(skb->len < sizeof(*varph)){
1161 wprintf("> Varp msg too short: %d < %d\n", skb->len, sizeof(*varph));
1162 goto exit;
1164 mine = 1;
1165 if(varph->hdr.id != htons(VARP_ID)){
1166 // It's not varp at all - ignore it.
1167 wprintf("> Invalid varp id: %d, expected %d \n",
1168 ntohs(varph->hdr.id),
1169 VARP_ID);
1170 goto exit;
1172 #ifdef DEBUG
1174 char vnetbuf[VNET_ID_BUF];
1175 char addrbuf[VARP_ADDR_BUF];
1176 dprintf("> saddr=" IPFMT " daddr=" IPFMT "\n",
1177 NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr));
1178 dprintf("> sport=%u dport=%u\n", ntohs(skb->h.uh->source), ntohs(skb->h.uh->dest));
1179 dprintf("> opcode=%d vnet=%s vmac=" MACFMT " addr=%s\n",
1180 ntohs(varph->hdr.opcode),
1181 VnetId_ntoa(&varph->vnet, vnetbuf),
1182 MAC6TUPLE(varph->vmac.mac),
1183 VarpAddr_ntoa(&varph->addr, addrbuf));
1184 varp_dprint();
1186 #endif
1187 switch(ntohs(varph->hdr.opcode)){
1188 case VARP_OP_REQUEST:
1189 err = varp_handle_request(skb, varph);
1190 break;
1191 case VARP_OP_ANNOUNCE:
1192 err = varp_handle_announce(skb, varph);
1193 break;
1194 default:
1195 wprintf("> Unknown opcode: %d \n", ntohs(varph->hdr.opcode));
1196 break;
1198 exit:
1199 if(mine) err = 1;
1200 dprintf("< err=%d\n", err);
1201 return err;
1204 /** Send an outgoing packet on the appropriate vnet tunnel.
1206 * @param skb outgoing message
1207 * @param vnet vnet (network order)
1208 * @return 0 on success, error code otherwise
1209 */
1210 int varp_output(struct sk_buff *skb, VnetId *vnet){
1211 int err = 0;
1212 unsigned char *mac = NULL;
1213 Vmac *vmac = NULL;
1214 VarpEntry *ventry = NULL;
1216 dprintf(">\n");
1217 if(!varp_table){
1218 err = -ENOSYS;
1219 goto exit;
1221 if(!skb->mac.raw){
1222 wprintf("> No ethhdr in skb!\n");
1223 err = -EINVAL;
1224 goto exit;
1226 mac = eth_hdr(skb)->h_dest;
1227 vmac = (Vmac*)mac;
1228 if(mac_is_multicast(mac)){
1229 VarpAddr addr = {};
1230 addr.family = AF_INET;
1231 addr.u.ip4.s_addr = varp_mcast_addr;
1232 err = vnet_tunnel_send(vnet, &addr, skb);
1233 } else {
1234 ventry = VarpTable_lookup(varp_table, vnet, vmac);
1235 if(!ventry){
1236 ventry = VarpTable_add(varp_table, vnet, vmac);
1238 if(ventry){
1239 unsigned long flags;
1240 VarpEntry_lock(ventry, flags);
1241 err = VarpEntry_output(ventry, skb);
1242 VarpEntry_unlock(ventry, flags);
1243 VarpEntry_decref(ventry);
1244 } else {
1245 err = -ENOMEM;
1248 exit:
1249 dprintf("< err=%d\n", err);
1250 return err;
1253 /** Set the varp multicast address (after initialization).
1255 * @param addr address (network order)
1256 * @return 0 on success, error code otherwise
1257 */
1258 int varp_set_mcast_addr(uint32_t addr){
1259 int err = 0;
1260 varp_close();
1261 varp_mcast_addr = addr;
1262 err = varp_open(varp_mcast_addr, varp_port);
1263 return err;
1266 /** Initialize the varp multicast address from a module parameter.
1268 * @param s address in IPv4 notation
1269 * @return 0 on success, error code otherwise
1270 */
1271 static void varp_init_mcast_addr(char *s){
1272 unsigned long v = 0;
1274 dprintf("> %s\n", s);
1275 if(s && (get_inet_addr(s, &v) >= 0)){
1276 varp_mcast_addr = (u32)v;
1277 } else {
1278 varp_mcast_addr = htonl(VARP_MCAST_ADDR);
1282 /** Initialize the varp cache.
1284 * @return 0 on success, error code otherwise
1285 */
1286 int varp_init(void){
1287 int err = 0;
1289 dprintf(">\n");
1290 varp_table = VarpTable_new();
1291 if(!varp_table){
1292 err = -ENOMEM;
1293 goto exit;
1295 varp_init_mcast_addr(varp_mcaddr);
1296 varp_port = htons(VARP_PORT);
1298 err = varp_open(varp_mcast_addr, varp_port);
1299 exit:
1300 dprintf("< err=%d\n", err);
1301 return err;
1304 /** Close the varp cache.
1305 */
1306 void varp_exit(void){
1307 dprintf(">\n");
1308 varp_close();
1309 if(varp_table){
1310 VarpTable *z = varp_table;
1311 varp_table = NULL;
1312 VarpTable_free(z);
1314 dprintf("<\n");
1317 MODULE_PARM(varp_mcaddr, "s");
1318 MODULE_PARM_DESC(varp_mcaddr, "VARP multicast address");
1320 MODULE_PARM(varp_device, "s");
1321 MODULE_PARM_DESC(varp_device, "VARP network device");