ia64/linux-2.6.18-xen.hg

drivers/net/chelsio/sge.c @ 897:329ea0ccb344

balloon: try harder to balloon up under memory pressure.

Currently if the balloon driver is unable to increase the guest's
reservation it assumes the failure was due to reaching its full
allocation, gives up on the ballooning operation and records the limit
it reached as the "hard limit". The driver will not try again until
the target is set again (even to the same value).

However it is possible that ballooning has in fact failed due to
memory pressure in the host and therefore it is desirable to keep
attempting to reach the target in case memory becomes available. The
most likely scenario is that some guests are ballooning down while
others are ballooning up and therefore there is temporary memory
pressure while things stabilise. You would not expect a well behaved
toolstack to ask a domain to balloon to more than its allocation nor
would you expect it to deliberately over-commit memory by setting
balloon targets which exceed the total host memory.

This patch drops the concept of a hard limit and causes the balloon
driver to retry increasing the reservation on a timer in the same
manner as when decreasing the reservation.

Also if we partially succeed in increasing the reservation
(i.e. receive fewer pages than we asked for) then we may as well keep
those pages rather than returning them to Xen.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
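
For illustration only (the balloon driver itself is not part of the file below, and the identifiers in this sketch are editorial placeholders rather than the real symbols), the behaviour described above amounts to keeping whatever pages were granted and re-arming a timer instead of latching a hard limit:

	/* sketch: grow toward the target, retrying on a timer if the host is short */
	static void balloon_retry_grow(unsigned long nr_wanted)
	{
		long granted = try_increase_reservation(nr_wanted);	/* may be partial */

		if (granted > 0)
			credit_pages_to_guest(granted);			/* keep partial success */
		if (granted < (long)nr_wanted)
			mod_timer(&balloon_timer, jiffies + HZ);	/* try again later */
	}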
author Keir Fraser <keir.fraser@citrix.com>
date Fri Jun 05 14:01:20 2009 +0100 (2009-06-05)
parents 831230e53067
line source
1 /*****************************************************************************
2 * *
3 * File: sge.c *
4 * $Revision: 1.26 $ *
5 * $Date: 2005/06/21 18:29:48 $ *
6 * Description: *
7 * DMA engine. *
8 * part of the Chelsio 10Gb Ethernet Driver. *
9 * *
10 * This program is free software; you can redistribute it and/or modify *
11 * it under the terms of the GNU General Public License, version 2, as *
12 * published by the Free Software Foundation. *
13 * *
14 * You should have received a copy of the GNU General Public License along *
15 * with this program; if not, write to the Free Software Foundation, Inc., *
16 * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
17 * *
18 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED *
19 * WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF *
20 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. *
21 * *
22 * http://www.chelsio.com *
23 * *
24 * Copyright (c) 2003 - 2005 Chelsio Communications, Inc. *
25 * All rights reserved. *
26 * *
27 * Maintainers: maintainers@chelsio.com *
28 * *
29 * Authors: Dimitrios Michailidis <dm@chelsio.com> *
30 * Tina Yang <tainay@chelsio.com> *
31 * Felix Marti <felix@chelsio.com> *
32 * Scott Bardone <sbardone@chelsio.com> *
33 * Kurt Ottaway <kottaway@chelsio.com> *
34 * Frank DiMambro <frank@chelsio.com> *
35 * *
36 * History: *
37 * *
38 ****************************************************************************/
40 #include "common.h"
42 #include <linux/types.h>
43 #include <linux/errno.h>
44 #include <linux/pci.h>
45 #include <linux/netdevice.h>
46 #include <linux/etherdevice.h>
47 #include <linux/if_vlan.h>
48 #include <linux/skbuff.h>
49 #include <linux/init.h>
50 #include <linux/mm.h>
51 #include <linux/ip.h>
52 #include <linux/in.h>
53 #include <linux/if_arp.h>
55 #include "cpl5_cmd.h"
56 #include "sge.h"
57 #include "regs.h"
58 #include "espi.h"
61 #ifdef NETIF_F_TSO
62 #include <linux/tcp.h>
63 #endif
65 #define SGE_CMDQ_N 2
66 #define SGE_FREELQ_N 2
67 #define SGE_CMDQ0_E_N 1024
68 #define SGE_CMDQ1_E_N 128
69 #define SGE_FREEL_SIZE 4096
70 #define SGE_JUMBO_FREEL_SIZE 512
71 #define SGE_FREEL_REFILL_THRESH 16
72 #define SGE_RESPQ_E_N 1024
73 #define SGE_INTRTIMER_NRES 1000
74 #define SGE_RX_COPY_THRES 256
75 #define SGE_RX_SM_BUF_SIZE 1536
77 # define SGE_RX_DROP_THRES 2
79 #define SGE_RESPQ_REPLENISH_THRES (SGE_RESPQ_E_N / 4)
81 /*
82 * Period of the TX buffer reclaim timer. This timer does not need to run
83 * frequently as TX buffers are usually reclaimed by new TX packets.
84 */
85 #define TX_RECLAIM_PERIOD (HZ / 4)
87 #ifndef NET_IP_ALIGN
88 # define NET_IP_ALIGN 2
89 #endif
91 #define M_CMD_LEN 0x7fffffff
92 #define V_CMD_LEN(v) (v)
93 #define G_CMD_LEN(v) ((v) & M_CMD_LEN)
94 #define V_CMD_GEN1(v) ((v) << 31)
95 #define V_CMD_GEN2(v) (v)
96 #define F_CMD_DATAVALID (1 << 1)
97 #define F_CMD_SOP (1 << 2)
98 #define V_CMD_EOP(v) ((v) << 3)
100 /*
101 * Command queue, receive buffer list, and response queue descriptors.
102 */
103 #if defined(__BIG_ENDIAN_BITFIELD)
104 struct cmdQ_e {
105 u32 addr_lo;
106 u32 len_gen;
107 u32 flags;
108 u32 addr_hi;
109 };
111 struct freelQ_e {
112 u32 addr_lo;
113 u32 len_gen;
114 u32 gen2;
115 u32 addr_hi;
116 };
118 struct respQ_e {
119 u32 Qsleeping : 4;
120 u32 Cmdq1CreditReturn : 5;
121 u32 Cmdq1DmaComplete : 5;
122 u32 Cmdq0CreditReturn : 5;
123 u32 Cmdq0DmaComplete : 5;
124 u32 FreelistQid : 2;
125 u32 CreditValid : 1;
126 u32 DataValid : 1;
127 u32 Offload : 1;
128 u32 Eop : 1;
129 u32 Sop : 1;
130 u32 GenerationBit : 1;
131 u32 BufferLength;
132 };
133 #elif defined(__LITTLE_ENDIAN_BITFIELD)
134 struct cmdQ_e {
135 u32 len_gen;
136 u32 addr_lo;
137 u32 addr_hi;
138 u32 flags;
139 };
141 struct freelQ_e {
142 u32 len_gen;
143 u32 addr_lo;
144 u32 addr_hi;
145 u32 gen2;
146 };
148 struct respQ_e {
149 u32 BufferLength;
150 u32 GenerationBit : 1;
151 u32 Sop : 1;
152 u32 Eop : 1;
153 u32 Offload : 1;
154 u32 DataValid : 1;
155 u32 CreditValid : 1;
156 u32 FreelistQid : 2;
157 u32 Cmdq0DmaComplete : 5;
158 u32 Cmdq0CreditReturn : 5;
159 u32 Cmdq1DmaComplete : 5;
160 u32 Cmdq1CreditReturn : 5;
161 u32 Qsleeping : 4;
162 } ;
163 #endif
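/*
 * [Editor's illustration -- not part of sge.c.] The descriptors above use a
 * generation bit instead of an explicit "valid" flag: the ring owner flips
 * the generation it considers current each time its index wraps, so an entry
 * is fresh only while its generation field matches the current genbit. A
 * minimal consumer over such a ring could look like this; my_entry/my_ring
 * are placeholder names, not driver symbols.
 */
#if 0	/* example only */
struct my_entry {
	u32 gen:1;
	u32 data:31;
};

struct my_ring {
	struct my_entry *entries;
	unsigned int size, cidx, genbit;
};

static void my_ring_poll(struct my_ring *r)
{
	struct my_entry *e = &r->entries[r->cidx];

	while (e->gen == r->genbit) {		/* entry written since last pass */
		/* ... consume e->data ... */
		if (++r->cidx == r->size) {	/* wrap: expected generation flips */
			r->cidx = 0;
			r->genbit ^= 1;
		}
		e = &r->entries[r->cidx];
	}
}
#endif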
165 /*
166 * SW Context Command and Freelist Queue Descriptors
167 */
168 struct cmdQ_ce {
169 struct sk_buff *skb;
170 DECLARE_PCI_UNMAP_ADDR(dma_addr);
171 DECLARE_PCI_UNMAP_LEN(dma_len);
172 };
174 struct freelQ_ce {
175 struct sk_buff *skb;
176 DECLARE_PCI_UNMAP_ADDR(dma_addr);
177 DECLARE_PCI_UNMAP_LEN(dma_len);
178 };
180 /*
181 * SW command, freelist and response rings
182 */
183 struct cmdQ {
184 unsigned long status; /* HW DMA fetch status */
185 unsigned int in_use; /* # of in-use command descriptors */
186 unsigned int size; /* # of descriptors */
187 unsigned int processed; /* total # of descs HW has processed */
188 unsigned int cleaned; /* total # of descs SW has reclaimed */
189 unsigned int stop_thres; /* SW TX queue suspend threshold */
190 u16 pidx; /* producer index (SW) */
191 u16 cidx; /* consumer index (HW) */
192 u8 genbit; /* current generation (=valid) bit */
193 u8 sop; /* is next entry start of packet? */
194 struct cmdQ_e *entries; /* HW command descriptor Q */
195 struct cmdQ_ce *centries; /* SW command context descriptor Q */
196 spinlock_t lock; /* Lock to protect cmdQ enqueuing */
197 dma_addr_t dma_addr; /* DMA addr HW command descriptor Q */
198 };
200 struct freelQ {
201 unsigned int credits; /* # of available RX buffers */
202 unsigned int size; /* free list capacity */
203 u16 pidx; /* producer index (SW) */
204 u16 cidx; /* consumer index (HW) */
205 u16 rx_buffer_size; /* Buffer size on this free list */
206 u16 dma_offset; /* DMA offset to align IP headers */
207 u16 recycleq_idx; /* skb recycle q to use */
208 u8 genbit; /* current generation (=valid) bit */
209 struct freelQ_e *entries; /* HW freelist descriptor Q */
210 struct freelQ_ce *centries; /* SW freelist context descriptor Q */
211 dma_addr_t dma_addr; /* DMA addr HW freelist descriptor Q */
212 };
214 struct respQ {
215 unsigned int credits; /* credits to be returned to SGE */
216 unsigned int size; /* # of response Q descriptors */
217 u16 cidx; /* consumer index (SW) */
218 u8 genbit; /* current generation(=valid) bit */
219 struct respQ_e *entries; /* HW response descriptor Q */
220 dma_addr_t dma_addr; /* DMA addr HW response descriptor Q */
221 };
223 /* Bit flags for cmdQ.status */
224 enum {
225 CMDQ_STAT_RUNNING = 1, /* fetch engine is running */
226 CMDQ_STAT_LAST_PKT_DB = 2 /* last packet rung the doorbell */
227 };
229 /*
230 * Main SGE data structure
231 *
232 * Interrupts are handled by a single CPU and it is likely that on a MP system
233 * the application is migrated to another CPU. In that scenario, we try to
234 * separate the RX (in irq context) and TX state in order to decrease memory
235 * contention.
236 */
237 struct sge {
238 struct adapter *adapter; /* adapter backpointer */
239 struct net_device *netdev; /* netdevice backpointer */
240 struct freelQ freelQ[SGE_FREELQ_N]; /* buffer free lists */
241 struct respQ respQ; /* response Q */
242 unsigned long stopped_tx_queues; /* bitmap of suspended Tx queues */
243 unsigned int rx_pkt_pad; /* RX padding for L2 packets */
244 unsigned int jumbo_fl; /* jumbo freelist Q index */
245 unsigned int intrtimer_nres; /* no-resource interrupt timer */
246 unsigned int fixed_intrtimer;/* non-adaptive interrupt timer */
247 struct timer_list tx_reclaim_timer; /* reclaims TX buffers */
248 struct timer_list espibug_timer;
249 unsigned int espibug_timeout;
250 struct sk_buff *espibug_skb;
251 u32 sge_control; /* shadow value of sge control reg */
252 struct sge_intr_counts stats;
253 struct sge_port_stats port_stats[MAX_NPORTS];
254 struct cmdQ cmdQ[SGE_CMDQ_N] ____cacheline_aligned_in_smp;
255 };
257 /*
258 * PIO to indicate that memory mapped Q contains valid descriptor(s).
259 */
260 static inline void doorbell_pio(struct adapter *adapter, u32 val)
261 {
262 wmb();
263 writel(val, adapter->regs + A_SG_DOORBELL);
264 }
266 /*
267 * Frees all RX buffers on the freelist Q. The caller must make sure that
268 * the SGE is turned off before calling this function.
269 */
270 static void free_freelQ_buffers(struct pci_dev *pdev, struct freelQ *q)
271 {
272 unsigned int cidx = q->cidx;
274 while (q->credits--) {
275 struct freelQ_ce *ce = &q->centries[cidx];
277 pci_unmap_single(pdev, pci_unmap_addr(ce, dma_addr),
278 pci_unmap_len(ce, dma_len),
279 PCI_DMA_FROMDEVICE);
280 dev_kfree_skb(ce->skb);
281 ce->skb = NULL;
282 if (++cidx == q->size)
283 cidx = 0;
284 }
285 }
287 /*
288 * Free RX free list and response queue resources.
289 */
290 static void free_rx_resources(struct sge *sge)
291 {
292 struct pci_dev *pdev = sge->adapter->pdev;
293 unsigned int size, i;
295 if (sge->respQ.entries) {
296 size = sizeof(struct respQ_e) * sge->respQ.size;
297 pci_free_consistent(pdev, size, sge->respQ.entries,
298 sge->respQ.dma_addr);
299 }
301 for (i = 0; i < SGE_FREELQ_N; i++) {
302 struct freelQ *q = &sge->freelQ[i];
304 if (q->centries) {
305 free_freelQ_buffers(pdev, q);
306 kfree(q->centries);
307 }
308 if (q->entries) {
309 size = sizeof(struct freelQ_e) * q->size;
310 pci_free_consistent(pdev, size, q->entries,
311 q->dma_addr);
312 }
313 }
314 }
316 /*
317 * Allocates basic RX resources, consisting of memory mapped freelist Qs and a
318 * response queue.
319 */
320 static int alloc_rx_resources(struct sge *sge, struct sge_params *p)
321 {
322 struct pci_dev *pdev = sge->adapter->pdev;
323 unsigned int size, i;
325 for (i = 0; i < SGE_FREELQ_N; i++) {
326 struct freelQ *q = &sge->freelQ[i];
328 q->genbit = 1;
329 q->size = p->freelQ_size[i];
330 q->dma_offset = sge->rx_pkt_pad ? 0 : NET_IP_ALIGN;
331 size = sizeof(struct freelQ_e) * q->size;
332 q->entries = (struct freelQ_e *)
333 pci_alloc_consistent(pdev, size, &q->dma_addr);
334 if (!q->entries)
335 goto err_no_mem;
336 memset(q->entries, 0, size);
337 size = sizeof(struct freelQ_ce) * q->size;
338 q->centries = kmalloc(size, GFP_KERNEL);
339 if (!q->centries)
340 goto err_no_mem;
341 memset(q->centries, 0, size);
342 }
344 /*
345 * Calculate the buffer sizes for the two free lists. FL0 accommodates
346 * regular sized Ethernet frames, FL1 is sized not to exceed 16K,
347 * including all the sk_buff overhead.
348 *
349 * Note: For T2 FL0 and FL1 are reversed.
350 */
351 sge->freelQ[!sge->jumbo_fl].rx_buffer_size = SGE_RX_SM_BUF_SIZE +
352 sizeof(struct cpl_rx_data) +
353 sge->freelQ[!sge->jumbo_fl].dma_offset;
354 sge->freelQ[sge->jumbo_fl].rx_buffer_size = (16 * 1024) -
355 SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
357 /*
358 * Setup which skb recycle Q should be used when recycling buffers from
359 * each free list.
360 */
361 sge->freelQ[!sge->jumbo_fl].recycleq_idx = 0;
362 sge->freelQ[sge->jumbo_fl].recycleq_idx = 1;
364 sge->respQ.genbit = 1;
365 sge->respQ.size = SGE_RESPQ_E_N;
366 sge->respQ.credits = 0;
367 size = sizeof(struct respQ_e) * sge->respQ.size;
368 sge->respQ.entries = (struct respQ_e *)
369 pci_alloc_consistent(pdev, size, &sge->respQ.dma_addr);
370 if (!sge->respQ.entries)
371 goto err_no_mem;
372 memset(sge->respQ.entries, 0, size);
373 return 0;
375 err_no_mem:
376 free_rx_resources(sge);
377 return -ENOMEM;
378 }
380 /*
381 * Reclaims n TX descriptors and frees the buffers associated with them.
382 */
383 static void free_cmdQ_buffers(struct sge *sge, struct cmdQ *q, unsigned int n)
384 {
385 struct cmdQ_ce *ce;
386 struct pci_dev *pdev = sge->adapter->pdev;
387 unsigned int cidx = q->cidx;
389 q->in_use -= n;
390 ce = &q->centries[cidx];
391 while (n--) {
392 if (q->sop)
393 pci_unmap_single(pdev, pci_unmap_addr(ce, dma_addr),
394 pci_unmap_len(ce, dma_len),
395 PCI_DMA_TODEVICE);
396 else
397 pci_unmap_page(pdev, pci_unmap_addr(ce, dma_addr),
398 pci_unmap_len(ce, dma_len),
399 PCI_DMA_TODEVICE);
400 q->sop = 0;
401 if (ce->skb) {
402 dev_kfree_skb(ce->skb);
403 q->sop = 1;
404 }
405 ce++;
406 if (++cidx == q->size) {
407 cidx = 0;
408 ce = q->centries;
409 }
410 }
411 q->cidx = cidx;
412 }
414 /*
415 * Free TX resources.
416 *
417 * Assumes that SGE is stopped and all interrupts are disabled.
418 */
419 static void free_tx_resources(struct sge *sge)
420 {
421 struct pci_dev *pdev = sge->adapter->pdev;
422 unsigned int size, i;
424 for (i = 0; i < SGE_CMDQ_N; i++) {
425 struct cmdQ *q = &sge->cmdQ[i];
427 if (q->centries) {
428 if (q->in_use)
429 free_cmdQ_buffers(sge, q, q->in_use);
430 kfree(q->centries);
431 }
432 if (q->entries) {
433 size = sizeof(struct cmdQ_e) * q->size;
434 pci_free_consistent(pdev, size, q->entries,
435 q->dma_addr);
436 }
437 }
438 }
440 /*
441 * Allocates basic TX resources, consisting of memory mapped command Qs.
442 */
443 static int alloc_tx_resources(struct sge *sge, struct sge_params *p)
444 {
445 struct pci_dev *pdev = sge->adapter->pdev;
446 unsigned int size, i;
448 for (i = 0; i < SGE_CMDQ_N; i++) {
449 struct cmdQ *q = &sge->cmdQ[i];
451 q->genbit = 1;
452 q->sop = 1;
453 q->size = p->cmdQ_size[i];
454 q->in_use = 0;
455 q->status = 0;
456 q->processed = q->cleaned = 0;
457 q->stop_thres = 0;
458 spin_lock_init(&q->lock);
459 size = sizeof(struct cmdQ_e) * q->size;
460 q->entries = (struct cmdQ_e *)
461 pci_alloc_consistent(pdev, size, &q->dma_addr);
462 if (!q->entries)
463 goto err_no_mem;
464 memset(q->entries, 0, size);
465 size = sizeof(struct cmdQ_ce) * q->size;
466 q->centries = kmalloc(size, GFP_KERNEL);
467 if (!q->centries)
468 goto err_no_mem;
469 memset(q->centries, 0, size);
470 }
472 /*
473 * CommandQ 0 handles Ethernet and TOE packets, while queue 1 is TOE
474 * only. For queue 0 set the stop threshold so we can handle one more
475 * packet from each port, plus reserve an additional 24 entries for
476 * Ethernet packets only. Queue 1 never suspends nor do we reserve
477 * space for Ethernet packets.
478 */
479 sge->cmdQ[0].stop_thres = sge->adapter->params.nports *
480 (MAX_SKB_FRAGS + 1);
481 return 0;
483 err_no_mem:
484 free_tx_resources(sge);
485 return -ENOMEM;
486 }
488 static inline void setup_ring_params(struct adapter *adapter, u64 addr,
489 u32 size, int base_reg_lo,
490 int base_reg_hi, int size_reg)
491 {
492 writel((u32)addr, adapter->regs + base_reg_lo);
493 writel(addr >> 32, adapter->regs + base_reg_hi);
494 writel(size, adapter->regs + size_reg);
495 }
497 /*
498 * Enable/disable VLAN acceleration.
499 */
500 void t1_set_vlan_accel(struct adapter *adapter, int on_off)
501 {
502 struct sge *sge = adapter->sge;
504 sge->sge_control &= ~F_VLAN_XTRACT;
505 if (on_off)
506 sge->sge_control |= F_VLAN_XTRACT;
507 if (adapter->open_device_map) {
508 writel(sge->sge_control, adapter->regs + A_SG_CONTROL);
509 readl(adapter->regs + A_SG_CONTROL); /* flush */
510 }
511 }
513 /*
514 * Programs the various SGE registers. The engine is not yet enabled, but
515 * sge->sge_control is set up and ready to go.
516 */
517 static void configure_sge(struct sge *sge, struct sge_params *p)
518 {
519 struct adapter *ap = sge->adapter;
521 writel(0, ap->regs + A_SG_CONTROL);
522 setup_ring_params(ap, sge->cmdQ[0].dma_addr, sge->cmdQ[0].size,
523 A_SG_CMD0BASELWR, A_SG_CMD0BASEUPR, A_SG_CMD0SIZE);
524 setup_ring_params(ap, sge->cmdQ[1].dma_addr, sge->cmdQ[1].size,
525 A_SG_CMD1BASELWR, A_SG_CMD1BASEUPR, A_SG_CMD1SIZE);
526 setup_ring_params(ap, sge->freelQ[0].dma_addr,
527 sge->freelQ[0].size, A_SG_FL0BASELWR,
528 A_SG_FL0BASEUPR, A_SG_FL0SIZE);
529 setup_ring_params(ap, sge->freelQ[1].dma_addr,
530 sge->freelQ[1].size, A_SG_FL1BASELWR,
531 A_SG_FL1BASEUPR, A_SG_FL1SIZE);
533 /* The threshold comparison uses <. */
534 writel(SGE_RX_SM_BUF_SIZE + 1, ap->regs + A_SG_FLTHRESHOLD);
536 setup_ring_params(ap, sge->respQ.dma_addr, sge->respQ.size,
537 A_SG_RSPBASELWR, A_SG_RSPBASEUPR, A_SG_RSPSIZE);
538 writel((u32)sge->respQ.size - 1, ap->regs + A_SG_RSPQUEUECREDIT);
540 sge->sge_control = F_CMDQ0_ENABLE | F_CMDQ1_ENABLE | F_FL0_ENABLE |
541 F_FL1_ENABLE | F_CPL_ENABLE | F_RESPONSE_QUEUE_ENABLE |
542 V_CMDQ_PRIORITY(2) | F_DISABLE_CMDQ1_GTS | F_ISCSI_COALESCE |
543 F_DISABLE_FL0_GTS | F_DISABLE_FL1_GTS |
544 V_RX_PKT_OFFSET(sge->rx_pkt_pad);
546 #if defined(__BIG_ENDIAN_BITFIELD)
547 sge->sge_control |= F_ENABLE_BIG_ENDIAN;
548 #endif
550 /* Initialize no-resource timer */
551 sge->intrtimer_nres = SGE_INTRTIMER_NRES * core_ticks_per_usec(ap);
553 t1_sge_set_coalesce_params(sge, p);
554 }
556 /*
557 * Return the payload capacity of the jumbo free-list buffers.
558 */
559 static inline unsigned int jumbo_payload_capacity(const struct sge *sge)
560 {
561 return sge->freelQ[sge->jumbo_fl].rx_buffer_size -
562 sge->freelQ[sge->jumbo_fl].dma_offset -
563 sizeof(struct cpl_rx_data);
564 }
566 /*
567 * Frees all SGE related resources and the sge structure itself
568 */
569 void t1_sge_destroy(struct sge *sge)
570 {
571 if (sge->espibug_skb)
572 kfree_skb(sge->espibug_skb);
574 free_tx_resources(sge);
575 free_rx_resources(sge);
576 kfree(sge);
577 }
579 /*
580 * Allocates new RX buffers on the freelist Q (and tracks them on the freelist
581 * context Q) until the Q is full or alloc_skb fails.
582 *
583 * It is possible that the generation bits already match, indicating that the
584 * buffer is already valid and nothing needs to be done. This happens when we
585 * copied a received buffer into a new sk_buff during the interrupt processing.
586 *
587 * If the SGE doesn't automatically align packets properly (!sge->rx_pkt_pad),
588 * we specify an RX_OFFSET in order to make sure that the IP header is 4B
589 * aligned.
590 */
591 static void refill_free_list(struct sge *sge, struct freelQ *q)
592 {
593 struct pci_dev *pdev = sge->adapter->pdev;
594 struct freelQ_ce *ce = &q->centries[q->pidx];
595 struct freelQ_e *e = &q->entries[q->pidx];
596 unsigned int dma_len = q->rx_buffer_size - q->dma_offset;
599 while (q->credits < q->size) {
600 struct sk_buff *skb;
601 dma_addr_t mapping;
603 skb = alloc_skb(q->rx_buffer_size, GFP_ATOMIC);
604 if (!skb)
605 break;
607 skb_reserve(skb, q->dma_offset);
608 mapping = pci_map_single(pdev, skb->data, dma_len,
609 PCI_DMA_FROMDEVICE);
610 ce->skb = skb;
611 pci_unmap_addr_set(ce, dma_addr, mapping);
612 pci_unmap_len_set(ce, dma_len, dma_len);
613 e->addr_lo = (u32)mapping;
614 e->addr_hi = (u64)mapping >> 32;
615 e->len_gen = V_CMD_LEN(dma_len) | V_CMD_GEN1(q->genbit);
616 wmb();
617 e->gen2 = V_CMD_GEN2(q->genbit);
619 e++;
620 ce++;
621 if (++q->pidx == q->size) {
622 q->pidx = 0;
623 q->genbit ^= 1;
624 ce = q->centries;
625 e = q->entries;
626 }
627 q->credits++;
628 }
630 }
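/*
 * [Editor's illustration -- not part of sge.c.] Note the ordering used by
 * refill_free_list() above: the address and len_gen words are written first,
 * then wmb(), and only then gen2. The hardware treats a descriptor as valid
 * once both generation fields carry the current genbit, so the barrier
 * guarantees it can never observe a half-written descriptor. The publish
 * step in isolation (this helper does not exist in the driver):
 */
#if 0	/* example only */
static void publish_freelQ_desc(struct freelQ_e *e, dma_addr_t mapping,
				unsigned int len, unsigned int genbit)
{
	e->addr_lo = (u32)mapping;
	e->addr_hi = (u64)mapping >> 32;
	e->len_gen = V_CMD_LEN(len) | V_CMD_GEN1(genbit);
	wmb();			/* descriptor body before the gen2 "valid" mark */
	e->gen2 = V_CMD_GEN2(genbit);
}
#endif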
632 /*
633 * Calls refill_free_list for both free lists. If we cannot fill at least 1/4
634 * of both rings, we go into 'few interrupt mode' in order to give the system
635 * time to free up resources.
636 */
637 static void freelQs_empty(struct sge *sge)
638 {
639 struct adapter *adapter = sge->adapter;
640 u32 irq_reg = readl(adapter->regs + A_SG_INT_ENABLE);
641 u32 irqholdoff_reg;
643 refill_free_list(sge, &sge->freelQ[0]);
644 refill_free_list(sge, &sge->freelQ[1]);
646 if (sge->freelQ[0].credits > (sge->freelQ[0].size >> 2) &&
647 sge->freelQ[1].credits > (sge->freelQ[1].size >> 2)) {
648 irq_reg |= F_FL_EXHAUSTED;
649 irqholdoff_reg = sge->fixed_intrtimer;
650 } else {
651 /* Clear the F_FL_EXHAUSTED interrupts for now */
652 irq_reg &= ~F_FL_EXHAUSTED;
653 irqholdoff_reg = sge->intrtimer_nres;
654 }
655 writel(irqholdoff_reg, adapter->regs + A_SG_INTRTIMER);
656 writel(irq_reg, adapter->regs + A_SG_INT_ENABLE);
658 /* We reenable the Qs to force a freelist GTS interrupt later */
659 doorbell_pio(adapter, F_FL0_ENABLE | F_FL1_ENABLE);
660 }
662 #define SGE_PL_INTR_MASK (F_PL_INTR_SGE_ERR | F_PL_INTR_SGE_DATA)
663 #define SGE_INT_FATAL (F_RESPQ_OVERFLOW | F_PACKET_TOO_BIG | F_PACKET_MISMATCH)
664 #define SGE_INT_ENABLE (F_RESPQ_EXHAUSTED | F_RESPQ_OVERFLOW | \
665 F_FL_EXHAUSTED | F_PACKET_TOO_BIG | F_PACKET_MISMATCH)
667 /*
668 * Disable SGE Interrupts
669 */
670 void t1_sge_intr_disable(struct sge *sge)
671 {
672 u32 val = readl(sge->adapter->regs + A_PL_ENABLE);
674 writel(val & ~SGE_PL_INTR_MASK, sge->adapter->regs + A_PL_ENABLE);
675 writel(0, sge->adapter->regs + A_SG_INT_ENABLE);
676 }
678 /*
679 * Enable SGE interrupts.
680 */
681 void t1_sge_intr_enable(struct sge *sge)
682 {
683 u32 en = SGE_INT_ENABLE;
684 u32 val = readl(sge->adapter->regs + A_PL_ENABLE);
686 if (sge->adapter->flags & TSO_CAPABLE)
687 en &= ~F_PACKET_TOO_BIG;
688 writel(en, sge->adapter->regs + A_SG_INT_ENABLE);
689 writel(val | SGE_PL_INTR_MASK, sge->adapter->regs + A_PL_ENABLE);
690 }
692 /*
693 * Clear SGE interrupts.
694 */
695 void t1_sge_intr_clear(struct sge *sge)
696 {
697 writel(SGE_PL_INTR_MASK, sge->adapter->regs + A_PL_CAUSE);
698 writel(0xffffffff, sge->adapter->regs + A_SG_INT_CAUSE);
699 }
701 /*
702 * SGE 'Error' interrupt handler
703 */
704 int t1_sge_intr_error_handler(struct sge *sge)
705 {
706 struct adapter *adapter = sge->adapter;
707 u32 cause = readl(adapter->regs + A_SG_INT_CAUSE);
709 if (adapter->flags & TSO_CAPABLE)
710 cause &= ~F_PACKET_TOO_BIG;
711 if (cause & F_RESPQ_EXHAUSTED)
712 sge->stats.respQ_empty++;
713 if (cause & F_RESPQ_OVERFLOW) {
714 sge->stats.respQ_overflow++;
715 CH_ALERT("%s: SGE response queue overflow\n",
716 adapter->name);
717 }
718 if (cause & F_FL_EXHAUSTED) {
719 sge->stats.freelistQ_empty++;
720 freelQs_empty(sge);
721 }
722 if (cause & F_PACKET_TOO_BIG) {
723 sge->stats.pkt_too_big++;
724 CH_ALERT("%s: SGE max packet size exceeded\n",
725 adapter->name);
726 }
727 if (cause & F_PACKET_MISMATCH) {
728 sge->stats.pkt_mismatch++;
729 CH_ALERT("%s: SGE packet mismatch\n", adapter->name);
730 }
731 if (cause & SGE_INT_FATAL)
732 t1_fatal_err(adapter);
734 writel(cause, adapter->regs + A_SG_INT_CAUSE);
735 return 0;
736 }
738 const struct sge_intr_counts *t1_sge_get_intr_counts(struct sge *sge)
739 {
740 return &sge->stats;
741 }
743 const struct sge_port_stats *t1_sge_get_port_stats(struct sge *sge, int port)
744 {
745 return &sge->port_stats[port];
746 }
748 /**
749 * recycle_fl_buf - recycle a free list buffer
750 * @fl: the free list
751 * @idx: index of buffer to recycle
752 *
753 * Recycles the specified buffer on the given free list by adding it at
754 * the next available slot on the list.
755 */
756 static void recycle_fl_buf(struct freelQ *fl, int idx)
757 {
758 struct freelQ_e *from = &fl->entries[idx];
759 struct freelQ_e *to = &fl->entries[fl->pidx];
761 fl->centries[fl->pidx] = fl->centries[idx];
762 to->addr_lo = from->addr_lo;
763 to->addr_hi = from->addr_hi;
764 to->len_gen = G_CMD_LEN(from->len_gen) | V_CMD_GEN1(fl->genbit);
765 wmb();
766 to->gen2 = V_CMD_GEN2(fl->genbit);
767 fl->credits++;
769 if (++fl->pidx == fl->size) {
770 fl->pidx = 0;
771 fl->genbit ^= 1;
772 }
773 }
775 /**
776 * get_packet - return the next ingress packet buffer
777 * @pdev: the PCI device that received the packet
778 * @fl: the SGE free list holding the packet
779 * @len: the actual packet length, excluding any SGE padding
780 * @dma_pad: padding at beginning of buffer left by SGE DMA
781 * @skb_pad: padding to be used if the packet is copied
782 * @copy_thres: length threshold under which a packet should be copied
783 * @drop_thres: # of remaining buffers before we start dropping packets
784 *
785 * Get the next packet from a free list and complete setup of the
786 * sk_buff. If the packet is small we make a copy and recycle the
787 * original buffer, otherwise we use the original buffer itself. If a
788 * positive drop threshold is supplied packets are dropped and their
789 * buffers recycled if (a) the number of remaining buffers is under the
790 * threshold and the packet is too big to copy, or (b) the packet should
791 * be copied but there is no memory for the copy.
792 */
793 static inline struct sk_buff *get_packet(struct pci_dev *pdev,
794 struct freelQ *fl, unsigned int len,
795 int dma_pad, int skb_pad,
796 unsigned int copy_thres,
797 unsigned int drop_thres)
798 {
799 struct sk_buff *skb;
800 struct freelQ_ce *ce = &fl->centries[fl->cidx];
802 if (len < copy_thres) {
803 skb = alloc_skb(len + skb_pad, GFP_ATOMIC);
804 if (likely(skb != NULL)) {
805 skb_reserve(skb, skb_pad);
806 skb_put(skb, len);
807 pci_dma_sync_single_for_cpu(pdev,
808 pci_unmap_addr(ce, dma_addr),
809 pci_unmap_len(ce, dma_len),
810 PCI_DMA_FROMDEVICE);
811 memcpy(skb->data, ce->skb->data + dma_pad, len);
812 pci_dma_sync_single_for_device(pdev,
813 pci_unmap_addr(ce, dma_addr),
814 pci_unmap_len(ce, dma_len),
815 PCI_DMA_FROMDEVICE);
816 } else if (!drop_thres)
817 goto use_orig_buf;
819 recycle_fl_buf(fl, fl->cidx);
820 return skb;
821 }
823 if (fl->credits < drop_thres) {
824 recycle_fl_buf(fl, fl->cidx);
825 return NULL;
826 }
828 use_orig_buf:
829 pci_unmap_single(pdev, pci_unmap_addr(ce, dma_addr),
830 pci_unmap_len(ce, dma_len), PCI_DMA_FROMDEVICE);
831 skb = ce->skb;
832 skb_reserve(skb, dma_pad);
833 skb_put(skb, len);
834 return skb;
835 }
837 /**
838 * unexpected_offload - handle an unexpected offload packet
839 * @adapter: the adapter
840 * @fl: the free list that received the packet
841 *
842 * Called when we receive an unexpected offload packet (e.g., the TOE
843 * function is disabled or the card is a NIC). Prints a message and
844 * recycles the buffer.
845 */
846 static void unexpected_offload(struct adapter *adapter, struct freelQ *fl)
847 {
848 struct freelQ_ce *ce = &fl->centries[fl->cidx];
849 struct sk_buff *skb = ce->skb;
851 pci_dma_sync_single_for_cpu(adapter->pdev, pci_unmap_addr(ce, dma_addr),
852 pci_unmap_len(ce, dma_len), PCI_DMA_FROMDEVICE);
853 CH_ERR("%s: unexpected offload packet, cmd %u\n",
854 adapter->name, *skb->data);
855 recycle_fl_buf(fl, fl->cidx);
856 }
858 /*
859 * Write the command descriptors to transmit the given skb starting at
860 * descriptor pidx with the given generation.
861 */
862 static inline void write_tx_descs(struct adapter *adapter, struct sk_buff *skb,
863 unsigned int pidx, unsigned int gen,
864 struct cmdQ *q)
865 {
866 dma_addr_t mapping;
867 struct cmdQ_e *e, *e1;
868 struct cmdQ_ce *ce;
869 unsigned int i, flags, nfrags = skb_shinfo(skb)->nr_frags;
871 mapping = pci_map_single(adapter->pdev, skb->data,
872 skb->len - skb->data_len, PCI_DMA_TODEVICE);
873 ce = &q->centries[pidx];
874 ce->skb = NULL;
875 pci_unmap_addr_set(ce, dma_addr, mapping);
876 pci_unmap_len_set(ce, dma_len, skb->len - skb->data_len);
878 flags = F_CMD_DATAVALID | F_CMD_SOP | V_CMD_EOP(nfrags == 0) |
879 V_CMD_GEN2(gen);
880 e = &q->entries[pidx];
881 e->addr_lo = (u32)mapping;
882 e->addr_hi = (u64)mapping >> 32;
883 e->len_gen = V_CMD_LEN(skb->len - skb->data_len) | V_CMD_GEN1(gen);
884 for (e1 = e, i = 0; nfrags--; i++) {
885 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
887 ce++;
888 e1++;
889 if (++pidx == q->size) {
890 pidx = 0;
891 gen ^= 1;
892 ce = q->centries;
893 e1 = q->entries;
894 }
896 mapping = pci_map_page(adapter->pdev, frag->page,
897 frag->page_offset, frag->size,
898 PCI_DMA_TODEVICE);
899 ce->skb = NULL;
900 pci_unmap_addr_set(ce, dma_addr, mapping);
901 pci_unmap_len_set(ce, dma_len, frag->size);
903 e1->addr_lo = (u32)mapping;
904 e1->addr_hi = (u64)mapping >> 32;
905 e1->len_gen = V_CMD_LEN(frag->size) | V_CMD_GEN1(gen);
906 e1->flags = F_CMD_DATAVALID | V_CMD_EOP(nfrags == 0) |
907 V_CMD_GEN2(gen);
908 }
910 ce->skb = skb;
911 wmb();
912 e->flags = flags;
913 }
915 /*
916 * Clean up completed Tx buffers.
917 */
918 static inline void reclaim_completed_tx(struct sge *sge, struct cmdQ *q)
919 {
920 unsigned int reclaim = q->processed - q->cleaned;
922 if (reclaim) {
923 free_cmdQ_buffers(sge, q, reclaim);
924 q->cleaned += reclaim;
925 }
926 }
928 #ifndef SET_ETHTOOL_OPS
929 # define __netif_rx_complete(dev) netif_rx_complete(dev)
930 #endif
932 /*
933 * We cannot use the standard netif_rx_schedule_prep() because we have multiple
934 * ports plus the TOE all multiplexing onto a single response queue, therefore
935 * accepting new responses cannot depend on the state of any particular port.
936 * So define our own equivalent that omits the netif_running() test.
937 */
938 static inline int napi_schedule_prep(struct net_device *dev)
939 {
940 return !test_and_set_bit(__LINK_STATE_RX_SCHED, &dev->state);
941 }
944 /**
945 * sge_rx - process an ingress ethernet packet
946 * @sge: the sge structure
947 * @fl: the free list that contains the packet buffer
948 * @len: the packet length
949 *
950 * Process an ingress Ethernet packet and deliver it to the stack.
951 */
952 static int sge_rx(struct sge *sge, struct freelQ *fl, unsigned int len)
953 {
954 struct sk_buff *skb;
955 struct cpl_rx_pkt *p;
956 struct adapter *adapter = sge->adapter;
958 sge->stats.ethernet_pkts++;
959 skb = get_packet(adapter->pdev, fl, len - sge->rx_pkt_pad,
960 sge->rx_pkt_pad, 2, SGE_RX_COPY_THRES,
961 SGE_RX_DROP_THRES);
962 if (!skb) {
963 sge->port_stats[0].rx_drops++; /* charge only port 0 for now */
964 return 0;
965 }
967 p = (struct cpl_rx_pkt *)skb->data;
968 skb_pull(skb, sizeof(*p));
969 skb->dev = adapter->port[p->iff].dev;
970 skb->dev->last_rx = jiffies;
971 skb->protocol = eth_type_trans(skb, skb->dev);
972 if ((adapter->flags & RX_CSUM_ENABLED) && p->csum == 0xffff &&
973 skb->protocol == htons(ETH_P_IP) &&
974 (skb->data[9] == IPPROTO_TCP || skb->data[9] == IPPROTO_UDP)) {
975 sge->port_stats[p->iff].rx_cso_good++;
976 skb->ip_summed = CHECKSUM_UNNECESSARY;
977 } else
978 skb->ip_summed = CHECKSUM_NONE;
980 if (unlikely(adapter->vlan_grp && p->vlan_valid)) {
981 sge->port_stats[p->iff].vlan_xtract++;
982 if (adapter->params.sge.polling)
983 vlan_hwaccel_receive_skb(skb, adapter->vlan_grp,
984 ntohs(p->vlan));
985 else
986 vlan_hwaccel_rx(skb, adapter->vlan_grp,
987 ntohs(p->vlan));
988 } else if (adapter->params.sge.polling)
989 netif_receive_skb(skb);
990 else
991 netif_rx(skb);
992 return 0;
993 }
995 /*
996 * Returns true if a command queue has enough available descriptors that
997 * we can resume Tx operation after temporarily disabling its packet queue.
998 */
999 static inline int enough_free_Tx_descs(const struct cmdQ *q)
1000 {
1001 unsigned int r = q->processed - q->cleaned;
1003 return q->in_use - r < (q->size >> 1);
1004 }
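/*
 * [Editor's illustration -- not part of sge.c.] In the test above, in_use
 * counts descriptors handed over and not yet reclaimed by software, while
 * processed - cleaned is the subset the hardware has already finished with;
 * their difference is what the hardware still genuinely holds. For example,
 * with size = 1024, in_use = 600, processed = 900 and cleaned = 700, the
 * queue really holds 600 - 200 = 400 entries, which is below size/2 = 512,
 * so the Tx path may be restarted.
 */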
1006 /*
1007 * Called when sufficient space has become available in the SGE command queues
1008 * after the Tx packet schedulers have been suspended to restart the Tx path.
1009 */
1010 static void restart_tx_queues(struct sge *sge)
1011 {
1012 struct adapter *adap = sge->adapter;
1014 if (enough_free_Tx_descs(&sge->cmdQ[0])) {
1015 int i;
1017 for_each_port(adap, i) {
1018 struct net_device *nd = adap->port[i].dev;
1020 if (test_and_clear_bit(nd->if_port,
1021 &sge->stopped_tx_queues) &&
1022 netif_running(nd)) {
1023 sge->stats.cmdQ_restarted[2]++;
1024 netif_wake_queue(nd);
1025 }
1026 }
1027 }
1028 }
1030 /*
1031 * update_tx_info is called from the interrupt handler/NAPI to return cmdQ0
1032 * information.
1033 */
1034 static unsigned int update_tx_info(struct adapter *adapter,
1035 unsigned int flags,
1036 unsigned int pr0)
1037 {
1038 struct sge *sge = adapter->sge;
1039 struct cmdQ *cmdq = &sge->cmdQ[0];
1041 cmdq->processed += pr0;
1043 if (flags & F_CMDQ0_ENABLE) {
1044 clear_bit(CMDQ_STAT_RUNNING, &cmdq->status);
1046 if (cmdq->cleaned + cmdq->in_use != cmdq->processed &&
1047 !test_and_set_bit(CMDQ_STAT_LAST_PKT_DB, &cmdq->status)) {
1048 set_bit(CMDQ_STAT_RUNNING, &cmdq->status);
1049 writel(F_CMDQ0_ENABLE, adapter->regs + A_SG_DOORBELL);
1050 }
1051 flags &= ~F_CMDQ0_ENABLE;
1052 }
1054 if (unlikely(sge->stopped_tx_queues != 0))
1055 restart_tx_queues(sge);
1057 return flags;
1058 }
1060 /*
1061 * Process SGE responses, up to the supplied budget. Returns the number of
1062 * responses processed. A negative budget is effectively unlimited.
1063 */
1064 static int process_responses(struct adapter *adapter, int budget)
1065 {
1066 struct sge *sge = adapter->sge;
1067 struct respQ *q = &sge->respQ;
1068 struct respQ_e *e = &q->entries[q->cidx];
1069 int budget_left = budget;
1070 unsigned int flags = 0;
1071 unsigned int cmdq_processed[SGE_CMDQ_N] = {0, 0};
1074 while (likely(budget_left && e->GenerationBit == q->genbit)) {
1075 flags |= e->Qsleeping;
1077 cmdq_processed[0] += e->Cmdq0CreditReturn;
1078 cmdq_processed[1] += e->Cmdq1CreditReturn;
1080 /* We batch updates to the TX side to avoid cacheline
1081 * ping-pong of TX state information on MP where the sender
1082 * might run on a different CPU than this function...
1083 */
1084 if (unlikely(flags & F_CMDQ0_ENABLE || cmdq_processed[0] > 64)) {
1085 flags = update_tx_info(adapter, flags, cmdq_processed[0]);
1086 cmdq_processed[0] = 0;
1087 }
1088 if (unlikely(cmdq_processed[1] > 16)) {
1089 sge->cmdQ[1].processed += cmdq_processed[1];
1090 cmdq_processed[1] = 0;
1091 }
1092 if (likely(e->DataValid)) {
1093 struct freelQ *fl = &sge->freelQ[e->FreelistQid];
1095 BUG_ON(!e->Sop || !e->Eop);
1096 if (unlikely(e->Offload))
1097 unexpected_offload(adapter, fl);
1098 else
1099 sge_rx(sge, fl, e->BufferLength);
1101 /*
1102 * Note: this depends on each packet consuming a
1103 * single free-list buffer; cf. the BUG above.
1104 */
1105 if (++fl->cidx == fl->size)
1106 fl->cidx = 0;
1107 if (unlikely(--fl->credits <
1108 fl->size - SGE_FREEL_REFILL_THRESH))
1109 refill_free_list(sge, fl);
1110 } else
1111 sge->stats.pure_rsps++;
1113 e++;
1114 if (unlikely(++q->cidx == q->size)) {
1115 q->cidx = 0;
1116 q->genbit ^= 1;
1117 e = q->entries;
1118 }
1119 prefetch(e);
1121 if (++q->credits > SGE_RESPQ_REPLENISH_THRES) {
1122 writel(q->credits, adapter->regs + A_SG_RSPQUEUECREDIT);
1123 q->credits = 0;
1124 }
1125 --budget_left;
1126 }
1128 flags = update_tx_info(adapter, flags, cmdq_processed[0]);
1129 sge->cmdQ[1].processed += cmdq_processed[1];
1131 budget -= budget_left;
1132 return budget;
1133 }
1135 /*
1136 * A simpler version of process_responses() that handles only pure (i.e.,
1137 * non-data-carrying) responses. Such responses are too lightweight to justify
1138 * calling a softirq when using NAPI, so we handle them specially in hard
1139 * interrupt context. The function is called with a pointer to a response,
1140 * which the caller must ensure is a valid pure response. Returns 1 if it
1141 * encounters a valid data-carrying response, 0 otherwise.
1142 */
1143 static int process_pure_responses(struct adapter *adapter, struct respQ_e *e)
1144 {
1145 struct sge *sge = adapter->sge;
1146 struct respQ *q = &sge->respQ;
1147 unsigned int flags = 0;
1148 unsigned int cmdq_processed[SGE_CMDQ_N] = {0, 0};
1150 do {
1151 flags |= e->Qsleeping;
1153 cmdq_processed[0] += e->Cmdq0CreditReturn;
1154 cmdq_processed[1] += e->Cmdq1CreditReturn;
1156 e++;
1157 if (unlikely(++q->cidx == q->size)) {
1158 q->cidx = 0;
1159 q->genbit ^= 1;
1160 e = q->entries;
1161 }
1162 prefetch(e);
1164 if (++q->credits > SGE_RESPQ_REPLENISH_THRES) {
1165 writel(q->credits, adapter->regs + A_SG_RSPQUEUECREDIT);
1166 q->credits = 0;
1167 }
1168 sge->stats.pure_rsps++;
1169 } while (e->GenerationBit == q->genbit && !e->DataValid);
1171 flags = update_tx_info(adapter, flags, cmdq_processed[0]);
1172 sge->cmdQ[1].processed += cmdq_processed[1];
1174 return e->GenerationBit == q->genbit;
1175 }
1177 /*
1178 * Handler for new data events when using NAPI. This does not need any locking
1179 * or protection from interrupts as data interrupts are off at this point and
1180 * other adapter interrupts do not interfere.
1181 */
1182 static int t1_poll(struct net_device *dev, int *budget)
1183 {
1184 struct adapter *adapter = dev->priv;
1185 int effective_budget = min(*budget, dev->quota);
1187 int work_done = process_responses(adapter, effective_budget);
1188 *budget -= work_done;
1189 dev->quota -= work_done;
1191 if (work_done >= effective_budget)
1192 return 1;
1194 __netif_rx_complete(dev);
1196 /*
1197 * Because we don't atomically flush the following write it is
1198 * possible that in very rare cases it can reach the device in a way
1199 * that races with a new response being written plus an error interrupt
1200 * causing the NAPI interrupt handler below to return unhandled status
1201 * to the OS. To protect against this would require flushing the write
1202 * and doing both the write and the flush with interrupts off. Way too
1203 * expensive and unjustifiable given the rarity of the race.
1204 */
1205 writel(adapter->sge->respQ.cidx, adapter->regs + A_SG_SLEEPING);
1206 return 0;
1207 }
1209 /*
1210 * Returns true if the device is already scheduled for polling.
1211 */
1212 static inline int napi_is_scheduled(struct net_device *dev)
1213 {
1214 return test_bit(__LINK_STATE_RX_SCHED, &dev->state);
1215 }
1217 /*
1218 * NAPI version of the main interrupt handler.
1219 */
1220 static irqreturn_t t1_interrupt_napi(int irq, void *data, struct pt_regs *regs)
1221 {
1222 int handled;
1223 struct adapter *adapter = data;
1224 struct sge *sge = adapter->sge;
1225 struct respQ *q = &adapter->sge->respQ;
1227 /*
1228 * Clear the SGE_DATA interrupt first thing. Normally the NAPI
1229 * handler has control of the response queue and the interrupt handler
1230 * can look at the queue reliably only once it knows NAPI is off.
1231 * We can't wait that long to clear the SGE_DATA interrupt because we
1232 * could race with t1_poll rearming the SGE interrupt, so we need to
1233 * clear the interrupt speculatively and really early on.
1234 */
1235 writel(F_PL_INTR_SGE_DATA, adapter->regs + A_PL_CAUSE);
1237 spin_lock(&adapter->async_lock);
1238 if (!napi_is_scheduled(sge->netdev)) {
1239 struct respQ_e *e = &q->entries[q->cidx];
1241 if (e->GenerationBit == q->genbit) {
1242 if (e->DataValid ||
1243 process_pure_responses(adapter, e)) {
1244 if (likely(napi_schedule_prep(sge->netdev)))
1245 __netif_rx_schedule(sge->netdev);
1246 else
1247 printk(KERN_CRIT
1248 "NAPI schedule failure!\n");
1249 } else
1250 writel(q->cidx, adapter->regs + A_SG_SLEEPING);
1251 handled = 1;
1252 goto unlock;
1253 } else
1254 writel(q->cidx, adapter->regs + A_SG_SLEEPING);
1255 } else
1256 if (readl(adapter->regs + A_PL_CAUSE) & F_PL_INTR_SGE_DATA)
1257 printk(KERN_ERR "data interrupt while NAPI running\n");
1259 handled = t1_slow_intr_handler(adapter);
1260 if (!handled)
1261 sge->stats.unhandled_irqs++;
1262 unlock:
1263 spin_unlock(&adapter->async_lock);
1264 return IRQ_RETVAL(handled != 0);
1265 }
1267 /*
1268 * Main interrupt handler, optimized assuming that we took a 'DATA'
1269 * interrupt.
1271 * 1. Clear the interrupt
1272 * 2. Loop while we find valid descriptors and process them; accumulate
1273 * information that can be processed after the loop
1274 * 3. Tell the SGE at which index we stopped processing descriptors
1275 * 4. Bookkeeping; free TX buffers, ring doorbell if there are any
1276 * outstanding TX buffers waiting, replenish RX buffers, potentially
1277 * reenable upper layers if they were turned off due to lack of TX
1278 * resources which are available again.
1279 * 5. If we took an interrupt, but no valid respQ descriptors were found, we
1280 * let the slow_intr_handler run and do error handling.
1281 */
1282 static irqreturn_t t1_interrupt(int irq, void *cookie, struct pt_regs *regs)
1283 {
1284 int work_done;
1285 struct respQ_e *e;
1286 struct adapter *adapter = cookie;
1287 struct respQ *Q = &adapter->sge->respQ;
1289 spin_lock(&adapter->async_lock);
1290 e = &Q->entries[Q->cidx];
1291 prefetch(e);
1293 writel(F_PL_INTR_SGE_DATA, adapter->regs + A_PL_CAUSE);
1295 if (likely(e->GenerationBit == Q->genbit))
1296 work_done = process_responses(adapter, -1);
1297 else
1298 work_done = t1_slow_intr_handler(adapter);
1300 /*
1301 * The unconditional clearing of the PL_CAUSE above may have raced
1302 * with DMA completion and the corresponding generation of a response
1303 * to cause us to miss the resulting data interrupt. The next write
1304 * is also unconditional to recover the missed interrupt and render
1305 * this race harmless.
1306 */
1307 writel(Q->cidx, adapter->regs + A_SG_SLEEPING);
1309 if (!work_done)
1310 adapter->sge->stats.unhandled_irqs++;
1311 spin_unlock(&adapter->async_lock);
1312 return IRQ_RETVAL(work_done != 0);
1313 }
1315 intr_handler_t t1_select_intr_handler(adapter_t *adapter)
1316 {
1317 return adapter->params.sge.polling ? t1_interrupt_napi : t1_interrupt;
1318 }
1320 /*
1321 * Enqueues the sk_buff onto the cmdQ[qid] and has hardware fetch it.
1323 * The code figures out how many entries the sk_buff will require in the
1324 * cmdQ and updates the cmdQ data structure with the state once the enqueue
1325 * has completed. Then, it doesn't access the global structure anymore, but
1326 * uses the corresponding fields on the stack. In conjunction with a spinlock
1327 * around that code, we can make the function reentrant without holding the
1328 * lock when we actually enqueue (which might be expensive, especially on
1329 * architectures with IO MMUs).
1331 * This runs with softirqs disabled.
1332 */
1333 static int t1_sge_tx(struct sk_buff *skb, struct adapter *adapter,
1334 unsigned int qid, struct net_device *dev)
1335 {
1336 struct sge *sge = adapter->sge;
1337 struct cmdQ *q = &sge->cmdQ[qid];
1338 unsigned int credits, pidx, genbit, count;
1340 spin_lock(&q->lock);
1341 reclaim_completed_tx(sge, q);
1343 pidx = q->pidx;
1344 credits = q->size - q->in_use;
1345 count = 1 + skb_shinfo(skb)->nr_frags;
1347 { /* Ethernet packet */
1348 if (unlikely(credits < count)) {
1349 netif_stop_queue(dev);
1350 set_bit(dev->if_port, &sge->stopped_tx_queues);
1351 sge->stats.cmdQ_full[2]++;
1352 spin_unlock(&q->lock);
1353 if (!netif_queue_stopped(dev))
1354 CH_ERR("%s: Tx ring full while queue awake!\n",
1355 adapter->name);
1356 return NETDEV_TX_BUSY;
1357 }
1358 if (unlikely(credits - count < q->stop_thres)) {
1359 sge->stats.cmdQ_full[2]++;
1360 netif_stop_queue(dev);
1361 set_bit(dev->if_port, &sge->stopped_tx_queues);
1362 }
1363 }
1364 q->in_use += count;
1365 genbit = q->genbit;
1366 q->pidx += count;
1367 if (q->pidx >= q->size) {
1368 q->pidx -= q->size;
1369 q->genbit ^= 1;
1370 }
1371 spin_unlock(&q->lock);
1373 write_tx_descs(adapter, skb, pidx, genbit, q);
1375 /*
1376 * We always ring the doorbell for cmdQ1. For cmdQ0, we only ring
1377 * the doorbell if the Q is asleep. There is a natural race, where
1378 * the hardware is going to sleep just after we checked, however,
1379 * then the interrupt handler will detect the outstanding TX packet
1380 * and ring the doorbell for us.
1381 */
1382 if (qid)
1383 doorbell_pio(adapter, F_CMDQ1_ENABLE);
1384 else {
1385 clear_bit(CMDQ_STAT_LAST_PKT_DB, &q->status);
1386 if (test_and_set_bit(CMDQ_STAT_RUNNING, &q->status) == 0) {
1387 set_bit(CMDQ_STAT_LAST_PKT_DB, &q->status);
1388 writel(F_CMDQ0_ENABLE, adapter->regs + A_SG_DOORBELL);
1389 }
1390 }
1391 return NETDEV_TX_OK;
1392 }
1394 #define MK_ETH_TYPE_MSS(type, mss) (((mss) & 0x3FFF) | ((type) << 14))
1396 /*
1397 * eth_hdr_len - return the length of an Ethernet header
1398 * @data: pointer to the start of the Ethernet header
1400 * Returns the length of an Ethernet header, including optional VLAN tag.
1401 */
1402 static inline int eth_hdr_len(const void *data)
1403 {
1404 const struct ethhdr *e = data;
1406 return e->h_proto == htons(ETH_P_8021Q) ? VLAN_ETH_HLEN : ETH_HLEN;
1407 }
1409 /*
1410 * Adds the CPL header to the sk_buff and passes it to t1_sge_tx.
1411 */
1412 int t1_start_xmit(struct sk_buff *skb, struct net_device *dev)
1413 {
1414 struct adapter *adapter = dev->priv;
1415 struct sge_port_stats *st = &adapter->sge->port_stats[dev->if_port];
1416 struct sge *sge = adapter->sge;
1417 struct cpl_tx_pkt *cpl;
1419 #ifdef NETIF_F_TSO
1420 if (skb_is_gso(skb)) {
1421 int eth_type;
1422 struct cpl_tx_pkt_lso *hdr;
1424 st->tso++;
1426 eth_type = skb->nh.raw - skb->data == ETH_HLEN ?
1427 CPL_ETH_II : CPL_ETH_II_VLAN;
1429 hdr = (struct cpl_tx_pkt_lso *)skb_push(skb, sizeof(*hdr));
1430 hdr->opcode = CPL_TX_PKT_LSO;
1431 hdr->ip_csum_dis = hdr->l4_csum_dis = 0;
1432 hdr->ip_hdr_words = skb->nh.iph->ihl;
1433 hdr->tcp_hdr_words = skb->h.th->doff;
1434 hdr->eth_type_mss = htons(MK_ETH_TYPE_MSS(eth_type,
1435 skb_shinfo(skb)->gso_size));
1436 hdr->len = htonl(skb->len - sizeof(*hdr));
1437 cpl = (struct cpl_tx_pkt *)hdr;
1438 sge->stats.tx_lso_pkts++;
1439 } else
1440 #endif
1441 {
1442 /*
1443 * Packets shorter than ETH_HLEN can break the MAC, drop them
1444 * early. Also, we may get oversized packets because some
1445 * parts of the kernel don't handle our unusual hard_header_len
1446 * right, drop those too.
1447 */
1448 if (unlikely(skb->len < ETH_HLEN ||
1449 skb->len > dev->mtu + eth_hdr_len(skb->data))) {
1450 dev_kfree_skb_any(skb);
1451 return NETDEV_TX_OK;
1452 }
1454 /*
1455 * We are using a non-standard hard_header_len and some kernel
1456 * components, such as pktgen, do not handle it right.
1457 * Complain when this happens but try to fix things up.
1458 */
1459 if (unlikely(skb_headroom(skb) <
1460 dev->hard_header_len - ETH_HLEN)) {
1461 struct sk_buff *orig_skb = skb;
1463 if (net_ratelimit())
1464 printk(KERN_ERR "%s: inadequate headroom in "
1465 "Tx packet\n", dev->name);
1466 skb = skb_realloc_headroom(skb, sizeof(*cpl));
1467 dev_kfree_skb_any(orig_skb);
1468 if (!skb)
1469 return NETDEV_TX_OK;
1470 }
1472 if (!(adapter->flags & UDP_CSUM_CAPABLE) &&
1473 skb->ip_summed == CHECKSUM_HW &&
1474 skb->nh.iph->protocol == IPPROTO_UDP)
1475 if (unlikely(skb_checksum_help(skb, 0))) {
1476 dev_kfree_skb_any(skb);
1477 return NETDEV_TX_OK;
1478 }
1480 /* We assume this catches the gratuitous ARP, and we'll use
1481 * it to flush out stuck ESPI packets...
1482 */
1483 if (unlikely(!adapter->sge->espibug_skb)) {
1484 if (skb->protocol == htons(ETH_P_ARP) &&
1485 skb->nh.arph->ar_op == htons(ARPOP_REQUEST)) {
1486 adapter->sge->espibug_skb = skb;
1487 /* We want to re-use this skb later. We
1488 * simply bump the reference count and it
1489 * will not be freed...
1490 */
1491 skb = skb_get(skb);
1492 }
1493 }
1495 cpl = (struct cpl_tx_pkt *)__skb_push(skb, sizeof(*cpl));
1496 cpl->opcode = CPL_TX_PKT;
1497 cpl->ip_csum_dis = 1; /* SW calculates IP csum */
1498 cpl->l4_csum_dis = skb->ip_summed == CHECKSUM_HW ? 0 : 1;
1499 /* the length field isn't used so don't bother setting it */
1501 st->tx_cso += (skb->ip_summed == CHECKSUM_HW);
1502 sge->stats.tx_do_cksum += (skb->ip_summed == CHECKSUM_HW);
1503 sge->stats.tx_reg_pkts++;
1504 }
1505 cpl->iff = dev->if_port;
1507 #if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE)
1508 if (adapter->vlan_grp && vlan_tx_tag_present(skb)) {
1509 cpl->vlan_valid = 1;
1510 cpl->vlan = htons(vlan_tx_tag_get(skb));
1511 st->vlan_insert++;
1512 } else
1513 #endif
1514 cpl->vlan_valid = 0;
1516 dev->trans_start = jiffies;
1517 return t1_sge_tx(skb, adapter, 0, dev);
1518 }
1520 /*
1521 * Callback for the Tx buffer reclaim timer. Runs with softirqs disabled.
1522 */
1523 static void sge_tx_reclaim_cb(unsigned long data)
1524 {
1525 int i;
1526 struct sge *sge = (struct sge *)data;
1528 for (i = 0; i < SGE_CMDQ_N; ++i) {
1529 struct cmdQ *q = &sge->cmdQ[i];
1531 if (!spin_trylock(&q->lock))
1532 continue;
1534 reclaim_completed_tx(sge, q);
1535 if (i == 0 && q->in_use) /* flush pending credits */
1536 writel(F_CMDQ0_ENABLE,
1537 sge->adapter->regs + A_SG_DOORBELL);
1539 spin_unlock(&q->lock);
1540 }
1541 mod_timer(&sge->tx_reclaim_timer, jiffies + TX_RECLAIM_PERIOD);
1542 }
1544 /*
1545 * Propagate changes of the SGE coalescing parameters to the HW.
1546 */
1547 int t1_sge_set_coalesce_params(struct sge *sge, struct sge_params *p)
1548 {
1549 sge->netdev->poll = t1_poll;
1550 sge->fixed_intrtimer = p->rx_coalesce_usecs *
1551 core_ticks_per_usec(sge->adapter);
1552 writel(sge->fixed_intrtimer, sge->adapter->regs + A_SG_INTRTIMER);
1553 return 0;
1554 }
1556 /*
1557 * Allocates both RX and TX resources and configures the SGE. However,
1558 * the hardware is not enabled yet.
1559 */
1560 int t1_sge_configure(struct sge *sge, struct sge_params *p)
1561 {
1562 if (alloc_rx_resources(sge, p))
1563 return -ENOMEM;
1564 if (alloc_tx_resources(sge, p)) {
1565 free_rx_resources(sge);
1566 return -ENOMEM;
1567 }
1568 configure_sge(sge, p);
1570 /*
1571 * Now that we have sized the free lists calculate the payload
1572 * capacity of the large buffers. Other parts of the driver use
1573 * this to set the max offload coalescing size so that RX packets
1574 * do not overflow our large buffers.
1575 */
1576 p->large_buf_capacity = jumbo_payload_capacity(sge);
1577 return 0;
1578 }
1580 /*
1581 * Disables the DMA engine.
1582 */
1583 void t1_sge_stop(struct sge *sge)
1584 {
1585 writel(0, sge->adapter->regs + A_SG_CONTROL);
1586 (void) readl(sge->adapter->regs + A_SG_CONTROL); /* flush */
1587 if (is_T2(sge->adapter))
1588 del_timer_sync(&sge->espibug_timer);
1589 del_timer_sync(&sge->tx_reclaim_timer);
1590 }
1592 /*
1593 * Enables the DMA engine.
1594 */
1595 void t1_sge_start(struct sge *sge)
1596 {
1597 refill_free_list(sge, &sge->freelQ[0]);
1598 refill_free_list(sge, &sge->freelQ[1]);
1600 writel(sge->sge_control, sge->adapter->regs + A_SG_CONTROL);
1601 doorbell_pio(sge->adapter, F_FL0_ENABLE | F_FL1_ENABLE);
1602 (void) readl(sge->adapter->regs + A_SG_CONTROL); /* flush */
1604 mod_timer(&sge->tx_reclaim_timer, jiffies + TX_RECLAIM_PERIOD);
1606 if (is_T2(sge->adapter))
1607 mod_timer(&sge->espibug_timer, jiffies + sge->espibug_timeout);
1608 }
1610 /*
1611 * Callback for the T2 ESPI 'stuck packet feature' workaround
1612 */
1613 static void espibug_workaround(void *data)
1614 {
1615 struct adapter *adapter = (struct adapter *)data;
1616 struct sge *sge = adapter->sge;
1618 if (netif_running(adapter->port[0].dev)) {
1619 struct sk_buff *skb = sge->espibug_skb;
1621 u32 seop = t1_espi_get_mon(adapter, 0x930, 0);
1623 if ((seop & 0xfff0fff) == 0xfff && skb) {
1624 if (!skb->cb[0]) {
1625 u8 ch_mac_addr[ETH_ALEN] =
1626 {0x0, 0x7, 0x43, 0x0, 0x0, 0x0};
1627 memcpy(skb->data + sizeof(struct cpl_tx_pkt),
1628 ch_mac_addr, ETH_ALEN);
1629 memcpy(skb->data + skb->len - 10, ch_mac_addr,
1630 ETH_ALEN);
1631 skb->cb[0] = 0xff;
1632 }
1634 /* bump the reference count to avoid freeing of the
1635 * skb once the DMA has completed.
1636 */
1637 skb = skb_get(skb);
1638 t1_sge_tx(skb, adapter, 0, adapter->port[0].dev);
1639 }
1640 }
1641 mod_timer(&sge->espibug_timer, jiffies + sge->espibug_timeout);
1642 }
1644 /*
1645 * Creates a t1_sge structure and returns suggested resource parameters.
1646 */
1647 struct sge * __devinit t1_sge_create(struct adapter *adapter,
1648 struct sge_params *p)
1649 {
1650 struct sge *sge = kmalloc(sizeof(*sge), GFP_KERNEL);
1652 if (!sge)
1653 return NULL;
1654 memset(sge, 0, sizeof(*sge));
1656 sge->adapter = adapter;
1657 sge->netdev = adapter->port[0].dev;
1658 sge->rx_pkt_pad = t1_is_T1B(adapter) ? 0 : 2;
1659 sge->jumbo_fl = t1_is_T1B(adapter) ? 1 : 0;
1661 init_timer(&sge->tx_reclaim_timer);
1662 sge->tx_reclaim_timer.data = (unsigned long)sge;
1663 sge->tx_reclaim_timer.function = sge_tx_reclaim_cb;
1665 if (is_T2(sge->adapter)) {
1666 init_timer(&sge->espibug_timer);
1667 sge->espibug_timer.function = (void *)&espibug_workaround;
1668 sge->espibug_timer.data = (unsigned long)sge->adapter;
1669 sge->espibug_timeout = 1;
1670 }
1673 p->cmdQ_size[0] = SGE_CMDQ0_E_N;
1674 p->cmdQ_size[1] = SGE_CMDQ1_E_N;
1675 p->freelQ_size[!sge->jumbo_fl] = SGE_FREEL_SIZE;
1676 p->freelQ_size[sge->jumbo_fl] = SGE_JUMBO_FREEL_SIZE;
1677 p->rx_coalesce_usecs = 50;
1678 p->coalesce_enable = 0;
1679 p->sample_interval_usecs = 0;
1680 p->polling = 0;
1682 return sge;
1683 }
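/*
 * [Editor's illustration -- not part of sge.c.] The expected calling order of
 * the exported entry points, as suggested by this file alone (the real caller
 * lives elsewhere in the driver and is not shown here):
 */
#if 0	/* example only */
static int example_sge_bring_up(struct adapter *adapter, struct sge_params *p)
{
	struct sge *sge = t1_sge_create(adapter, p);	/* allocate, set defaults */
	int err;

	if (!sge)
		return -ENOMEM;
	err = t1_sge_configure(sge, p);			/* rings and registers */
	if (err)
		return err;				/* error unwind elided */
	t1_sge_start(sge);				/* enable DMA, arm timers */
	/* ... t1_start_xmit() and the interrupt handlers do the work ... */
	t1_sge_stop(sge);				/* quiesce DMA and timers */
	t1_sge_destroy(sge);				/* free rings and the sge */
	return 0;
}
#endif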