xenbits.xensource.com Git - xen.git/commitdiff
x86/boot: Improve the boot watchdog determination of stuck cpus
author Andrew Cooper <andrew.cooper3@citrix.com>
Tue, 2 Apr 2024 14:20:09 +0000 (16:20 +0200)
committer Jan Beulich <jbeulich@suse.com>
Tue, 2 Apr 2024 14:20:09 +0000 (16:20 +0200)
Right now, check_nmi_watchdog() has two processing loops over all online CPUs
using prev_nmi_count as storage.

Use a cpumask_t instead (1/32nd as much initdata) and have wait_for_nmis()
itself determine whether the CPU it runs on is stuck, rather than requiring
both functions to agree on how many ticks mean stuck.

More importantly though, it means we can use the standard cpumask
infrastructure, including turning this:

  (XEN) Brought up 512 CPUs
  (XEN) Testing NMI watchdog on all CPUs: {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299,300,301,302,303,304,305,306,307,308,309,310,311,312,313,314,315,316,317,318,319,320,321,322,323,324,325,326,327,328,329,330,331,332,333,334,335,336,337,338,339,340,341,342,343,344,345,346,347,348,349,350,351,352,353,354,355,356,357,358,359,360,361,362,363,364,365,366,367,368,369,370,371,372,373,374,375,376,377,378,379,380,381,382,383,384,385,386,387,388,389,390,391,392,393,394,395,396,397,398,399,400,401,402,403,404,405,406,407,408,409,410,411,412,413,414,415,416,417,418,419,420,421,422,423,424,425,426,427,428,429,430,431,432,433,434,435,436,437,438,439,440,441,442,443,444,445,446,447,448,449,450,451,452,453,454,455,456,457,458,459,460,461,462,463,464,465,466,467,468,469,470,471,472,473,474,475,476,477,478,479,480,481,482,483,484,485,486,487,488,489,490,491,492,493,494,495,496,497,498,499,500,501,502,503,504,505,506,507,508,509,510,511} stuck

into the rather more manageable:

  (XEN) Brought up 512 CPUs
  (XEN) Testing NMI watchdog on all CPUs: {0-511} stuck

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
master commit: 9e18f339830c828798aef465556d4029d83476a0
master date: 2024-03-19 18:29:37 +0000

xen/arch/x86/nmi.c

index 7c9591b65e817436ee28ff1a85c20d9823a3827a..dd31034ac8d7d184e4913ef988227e2e34462498 100644 (file)
@@ -150,6 +150,8 @@ int nmi_active;
 
 static void __init cf_check wait_for_nmis(void *p)
 {
+    cpumask_t *stuck_cpus = p;
+    unsigned int cpu = smp_processor_id();
     unsigned int start_count = this_cpu(nmi_count);
     unsigned long ticks = 10 * 1000 * cpu_khz / nmi_hz;
     unsigned long s, e;
@@ -158,42 +160,35 @@ static void __init cf_check wait_for_nmis(void *p)
     do {
         cpu_relax();
         if ( this_cpu(nmi_count) >= start_count + 2 )
-            break;
+            return;
+
         e = rdtsc();
-    } while( e - s < ticks );
+    } while ( e - s < ticks );
+
+    /* Timeout.  Mark ourselves as stuck. */
+    cpumask_set_cpu(cpu, stuck_cpus);
 }
 
 void __init check_nmi_watchdog(void)
 {
-    static unsigned int __initdata prev_nmi_count[NR_CPUS];
-    int cpu;
-    bool ok = true;
+    static cpumask_t __initdata stuck_cpus;
 
     if ( nmi_watchdog == NMI_NONE )
         return;
 
     printk("Testing NMI watchdog on all CPUs:");
 
-    for_each_online_cpu ( cpu )
-        prev_nmi_count[cpu] = per_cpu(nmi_count, cpu);
-
     /*
      * Wait at most 10 ticks for 2 watchdog NMIs on each CPU.
      * Busy-wait on all CPUs: the LAPIC counter that the NMI watchdog
      * uses only runs while the core's not halted
      */
-    on_selected_cpus(&cpu_online_map, wait_for_nmis, NULL, 1);
-
-    for_each_online_cpu ( cpu )
-    {
-        if ( per_cpu(nmi_count, cpu) - prev_nmi_count[cpu] < 2 )
-        {
-            printk(" %d", cpu);
-            ok = false;
-        }
-    }
+    on_selected_cpus(&cpu_online_map, wait_for_nmis, &stuck_cpus, 1);
 
-    printk(" %s\n", ok ? "ok" : "stuck");
+    if ( cpumask_empty(&stuck_cpus) )
+        printk("ok\n");
+    else
+        printk("{%*pbl} stuck\n", CPUMASK_PR(&stuck_cpus));
 
     /*
      * Now that we know it works we can reduce NMI frequency to