ia64/xen-unstable

view tools/xenmon/xenmon.py @ 8609:85d693e6f61a

Arch-specific per-vcpu info should be initialised to zero
when allocating a new vcpu structure, not copied from
CPU0's idle VCPU. Especially now that the idle VCPU itself
is dynamically allocated.

This should fix assertions people have been seeing in
getdomain_info_ctxt() in relation to IOPL in eflags.

Signed-off-by: Keir Fraser <keir@xensource.com>
author kaf24@firebug.cl.cam.ac.uk
date Sat Jan 14 21:26:40 2006 +0100 (2006-01-14)
parents f5dafee503ba
children 0ad422291688
line source
1 #!/usr/bin/env python
3 #####################################################################
4 # xenmon is a front-end for xenbaked.
5 # There is a curses interface for live monitoring. XenMon also allows
6 # logging to a file. For options, run python xenmon.py -h
7 #
8 # Copyright (C) 2005 by Hewlett Packard, Palo Alto and Fort Collins
9 # Authors: Lucy Cherkasova, lucy.cherkasova@hp.com
10 # Rob Gardner, rob.gardner@hp.com
11 # Diwaker Gupta, diwaker.gupta@hp.com
12 #####################################################################
13 # This program is free software; you can redistribute it and/or modify
14 # it under the terms of the GNU General Public License as published by
15 # the Free Software Foundation; under version 2 of the License.
16 #
17 # This program is distributed in the hope that it will be useful,
18 # but WITHOUT ANY WARRANTY; without even the implied warranty of
19 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 # GNU General Public License for more details.
21 #
22 # You should have received a copy of the GNU General Public License
23 # along with this program; if not, write to the Free Software
24 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25 #####################################################################
27 import mmap
28 import struct
29 import os
30 import time
31 import optparse as _o
32 import curses as _c
33 import math
34 import sys
# constants: number of sample slots per cpu and of per-domain slots
NSAMPLES = 100
NDOMAINS = 32

# the struct strings for qos_info
ST_DOM_INFO = "6Q4i32s"
ST_QDATA = str(6*NDOMAINS + 4) + "Q"

# size of mmaped file: NSAMPLES sample records, NDOMAINS domain records
# and one trailing "4i" record
QOS_DATA_SIZE = (
    NSAMPLES * struct.calcsize(ST_QDATA)
    + NDOMAINS * struct.calcsize(ST_DOM_INFO)
    + struct.calcsize("4i")
)

# location of mmaped file, hard coded right now
SHM_FILE = "/tmp/xenq-shm"

# format strings
TOTALS = "".join([" " * 15, "%6.2f%%", " " * 35, "%6.2f%%"])

ALLOCATED = "Allocated"
GOTTEN = "Gotten"
BLOCKED = "Blocked"
WAITED = "Waited"
IOCOUNT = "I/O Count"
EXCOUNT = "Exec Count"

# globals
dom_in_use = []

# our curses screen
stdscr = None

# parsed options
options = None
args = None
# the optparse module is quite smart
# to see help, just run xenmon -h
def setup_cmdline_parser():
    """Build the optparse parser for xenmon's command line.

    Destinations and defaults:
      live        -l/--live sets True, -n/--notlive sets False (default True)
      prefix      -p/--prefix, output file prefix (default "log")
      duration    -t/--time, seconds, int (default 10; 0 = no limit)
      interval    -i/--interval, milliseconds, int (default 1000)
      mspersample --ms_per_sample, int (default 100)

    Returns the configured OptionParser; nothing is parsed here.
    """
    parser = _o.OptionParser()
    parser.add_option("-l", "--live", dest="live", action="store_true",
                      default=True, help = "show the ncurses live monitoring frontend (default)")
    # Both switches share dest "live" and the default registered last wins,
    # so this must be the boolean True (it used to be the string "True").
    parser.add_option("-n", "--notlive", dest="live", action="store_false",
                      default=True, help = "write to file instead of live monitoring")
    parser.add_option("-p", "--prefix", dest="prefix",
                      default = "log", help="prefix to use for output files")
    parser.add_option("-t", "--time", dest="duration",
                      action="store", type="int", default=10,
                      help="stop logging to file after this much time has elapsed (in seconds). set to 0 to keep logging indefinitely")
    parser.add_option("-i", "--interval", dest="interval",
                      action="store", type="int", default=1000,
                      help="interval for logging (in ms)")
    parser.add_option("--ms_per_sample", dest="mspersample",
                      action="store", type="int", default=100,
                      help = "determines how many ms worth of data goes in a sample")
    return parser
# encapsulate information about a domain
class DomainInfo:
    """Running counters for one domain and the statistics derived from them.

    The *_sum / exec_count attributes are plain accumulators that callers
    add to; every *_stats method takes `passed`, the elapsed time the
    counters cover (the code divides it by 10**9 to get a per-second rate).
    """

    def __init__(self):
        self.allocated_sum = 0
        self.gotten_sum = 0
        self.blocked_sum = 0
        self.waited_sum = 0
        self.exec_count = 0
        self.iocount_sum = 0
        self.ffp_samples = []

    def _time_stats(self, counter, events, passed):
        # shared shape of gotten/waited/blocked:
        # [counter per 10**9 of passed, percent of passed, average per event]
        total = float(counter)
        per = 100*total/passed
        avg = 0
        if events > 0:
            avg = total/events
        return [total/(float(passed)/10**9), per, avg]

    def gotten_stats(self, passed):
        """[rate, percent, average per execution] of gotten time."""
        return self._time_stats(self.gotten_sum, self.exec_count, passed)

    def waited_stats(self, passed):
        """[rate, percent, average per execution] of waited time."""
        return self._time_stats(self.waited_sum, self.exec_count, passed)

    def blocked_stats(self, passed):
        """[rate, percent, average per I/O] of blocked time."""
        return self._time_stats(self.blocked_sum, self.iocount_sum, passed)

    def allocated_stats(self, passed):
        """Average allocated time per execution, or 0 with no executions."""
        executions = self.exec_count
        if not executions > 0:
            return 0
        return float(self.allocated_sum)/executions

    def ec_stats(self, passed):
        """Executions per 10**9 units of `passed`."""
        seconds = float(passed)/10**9
        return float(self.exec_count/seconds)

    def io_stats(self, passed):
        """[I/O count per 10**9 of passed, I/O count per execution]."""
        total = float(self.iocount_sum)
        executions = self.exec_count
        avg = 0
        if executions > 0:
            avg = total/executions
        return [total/(float(passed)/10**9), avg]

    def stats(self, passed):
        """All statistics, ordered: gotten, allocated, blocked, waited,
        execution count, I/O."""
        return [
            self.gotten_stats(passed),
            self.allocated_stats(passed),
            self.blocked_stats(passed),
            self.waited_stats(passed),
            self.ec_stats(passed),
            self.io_stats(passed),
        ]
# report values over desired interval
def summarize(startat, endat, duration, samples):
    """Accumulate per-domain counters walking backwards through `samples`.

    startat  -- index of the first sample to read; reduced modulo NSAMPLES,
                so -1 (a caller computing next - 1 with next == 0) means
                the last slot
    endat    -- index at which to stop; that sample itself is not read
    duration -- stop once the summed sample time reaches this value
    samples  -- indexable by sample id; each sample holds six per-domain
                groups of NDOMAINS values (gotten, allocated, waited,
                blocked, exec count, I/O count) followed by the time
                covered at [6*NDOMAINS], lost records at [6*NDOMAINS + 2]
                and ffp at [6*NDOMAINS + 3]

    Only domains flagged in the module-level dom_in_use are accumulated.

    Returns [ldoms, lostinfo, ffpinfo]: ldoms[i] is DomainInfo.stats() for
    an in-use domain and 0 otherwise; lostinfo / ffpinfo are
    [min, sum, max] over the samples read.  Raises ValueError (from min())
    if duration is so small that no sample is read.
    """
    dominfos = {}
    for i in range(0, NDOMAINS):
        dominfos[i] = DomainInfo()

    passed = 1              # to prevent zero division
    # Normalise the start index.  A start of -1 would otherwise read the
    # last slot via negative indexing and then, through the wrap-around
    # branch below, read that same slot a second time.
    curid = startat % NSAMPLES
    lost_samples = []
    ffp_samples = []

    while passed < duration:
        sample = samples[curid]
        for i in range(0, NDOMAINS):
            if dom_in_use[i]:
                info = dominfos[i]
                info.gotten_sum += sample[0*NDOMAINS + i]
                info.allocated_sum += sample[1*NDOMAINS + i]
                info.waited_sum += sample[2*NDOMAINS + i]
                info.blocked_sum += sample[3*NDOMAINS + i]
                info.exec_count += sample[4*NDOMAINS + i]
                info.iocount_sum += sample[5*NDOMAINS + i]

        passed += sample[6*NDOMAINS]
        lost_samples.append(sample[6*NDOMAINS + 2])
        ffp_samples.append(sample[6*NDOMAINS + 3])

        # step to the previous slot, wrapping around the ring
        if curid > 0:
            curid -= 1
        else:
            curid = NSAMPLES - 1
        if curid == endat:
            break

    lostinfo = [min(lost_samples), sum(lost_samples), max(lost_samples)]
    ffpinfo = [min(ffp_samples), sum(ffp_samples), max(ffp_samples)]

    ldoms = []
    for x in range(0, NDOMAINS):
        if dom_in_use[x]:
            ldoms.append(dominfos[x].stats(passed))
        else:
            ldoms.append(0)

    return [ldoms, lostinfo, ffpinfo]
# scale nanoseconds to microseconds, milliseconds or seconds as necessary
def time_scale(ns):
    """Format a nanosecond count with the largest unit that fits.

    Values below 1000 are shown in ns, below 10**6 in us, below 10**9 in
    ms, and anything else in s; always with two decimals.
    """
    value = float(ns)
    scales = (
        (1000, 1, "%4.2f ns"),
        (1000*1000, 10**3, "%4.2f us"),
        (10**9, 10**6, "%4.2f ms"),
    )
    for limit, divisor, fmt in scales:
        if ns < limit:
            return fmt % (value/divisor)
    return "%4.2f s" % (value/10**9)
# paint message on curses screen, but detect screen size errors
def display(scr, row, col, str, attr=0):
    """Write `str` at (row, col) of curses window `scr` with attribute `attr`.

    If curses rejects the write (curses.error, which is what addstr raises
    when the text does not fit the window), the terminal is put back into
    normal mode, a hint with the offending coordinates is printed and the
    program exits with status 1.  Any other exception, KeyboardInterrupt
    included, propagates to the caller instead of being reported as a
    screen-size problem.
    """
    try:
        scr.addstr(row, col, str, attr)
    except _c.error:
        scr.erase()
        _c.nocbreak()
        scr.keypad(0)
        _c.echo()
        _c.endwin()
        print("Your terminal screen is not big enough; Please resize it.")
        print("row=%d, col=%d, str='%s'" % (row, col, str))
        sys.exit(1)
# the live monitoring code
def _show_time_row(scr, row, dom, long_stats, short_stats, unit, label):
    """Paint one row of [total, percent, average] for both summary windows.

    long_stats is painted first, short_stats after it; `unit` is the
    suffix of the average column ("ex" or "io") and `label` is written at
    the end of the row.
    """
    col = 2
    display(scr, row, col, "%d" % dom)
    col += 4
    for stats in (long_stats, short_stats):
        display(scr, row, col, "%s" % time_scale(stats[0]))
        col += 12
        display(scr, row, col, "%3.2f%%" % stats[1])
        col += 12
        display(scr, row, col, "%s/%s" % (time_scale(stats[2]), unit))
        col += 18
    display(scr, row, col, label)


def show_livestats():
    """Run the curses front-end until 'q' is pressed or input is interrupted.

    Each pass re-reads the sample ring, the per-domain records and the
    trailing (next, ncpu, slen, freq) record from the mmaped SHM_FILE for
    the cpu of interest, summarizes over 10**9 and 10 * 10**9, and repaints
    the screen.  'c' cycles the cpu of interest.  getch() waits at most
    1000 ms, which paces the refresh.

    Rebinds the module-level dom_in_use.  The terminal is put back into
    normal mode and the mapping and file are closed on every exit path.
    """
    global dom_in_use

    cpu = 0          # cpu of interest to display data for
    ncpu = 1         # number of cpu's on this platform
    slen = 0         # size of shared data structure, including padding

    # record sizes never change, so compute them once
    qdata_len = struct.calcsize(ST_QDATA)
    dominfo_len = struct.calcsize(ST_DOM_INFO)
    trailer_len = struct.calcsize("4i")

    # mmap (the first chunk of) the file
    shmf = open(SHM_FILE, "r+")
    shm = mmap.mmap(shmf.fileno(), QOS_DATA_SIZE)

    samples = []
    dom_in_use = []

    # initialize curses
    stdscr = _c.initscr()
    try:
        _c.noecho()
        _c.cbreak()

        stdscr.keypad(1)
        stdscr.timeout(1000)
        [maxy, maxx] = stdscr.getmaxyx()

        # display in a loop
        while True:

            for cpuidx in range(0, ncpu):

                # calculate offset in mmap file to start from
                idx = cpuidx * slen

                samples = []
                # Start every pass with an empty list: appending to the
                # list from earlier passes would leave indices
                # 0..NDOMAINS-1 holding the flags read on the first pass.
                dom_in_use = []

                # read in data
                for i in range(0, NSAMPLES):
                    samples.append(struct.unpack(ST_QDATA, shm[idx:idx+qdata_len]))
                    idx += qdata_len

                for i in range(0, NDOMAINS):
                    dom = struct.unpack(ST_DOM_INFO, shm[idx:idx+dominfo_len])
                    # (last_update_time, start_time, runnable_start_time, blocked_start_time,
                    #  ns_since_boot, ns_oncpu_since_boot, runnable_at_last_update,
                    #  runnable, in_use, domid, name) = dom
                    dom_in_use.append(dom[8])
                    idx += dominfo_len

                oldncpu = ncpu
                (nextslot, ncpu, slen, freq) = struct.unpack("4i", shm[idx:idx+trailer_len])
                idx += trailer_len

                # xenbaked tells us how many cpu's it's got, so re-do
                # the mmap if necessary to get multiple cpu data
                if oldncpu != ncpu:
                    bigger = mmap.mmap(shmf.fileno(), ncpu*slen)
                    shm.close()     # release the mapping being replaced
                    shm = bigger

                # if we've just calculated data for the cpu of interest, then
                # stop examining mmap data and start displaying stuff
                if cpuidx == cpu:
                    break

            # calculate starting and ending datapoints; never look at "next" since
            # it represents live data that may be in transition.
            startat = nextslot - 1
            if nextslot + 10 < NSAMPLES:
                endat = nextslot + 10
            else:
                endat = 10

            # get summary over desired interval
            [h1, l1, f1] = summarize(startat, endat, 10**9, samples)
            [h2, l2, f2] = summarize(startat, endat, 10 * 10**9, samples)

            # the actual display code
            row = 0
            display(stdscr, row, 1, "CPU = %d" % cpu, _c.A_STANDOUT)

            display(stdscr, row, 10, "%sLast 10 seconds%sLast 1 second" % (6*' ', 30*' '), _c.A_BOLD)
            row += 1
            display(stdscr, row, 1, "%s" % ((maxx-2)*'='))

            total_h1_cpu = 0
            total_h2_cpu = 0

            for dom in range(0, NDOMAINS):
                if not dom_in_use[dom]:
                    continue

                if h1[dom][0][1] > 0 or dom == NDOMAINS - 1:
                    # display gotten
                    row += 1
                    _show_time_row(stdscr, row, dom, h2[dom][0], h1[dom][0], "ex", "Gotten")

                    # display allocated
                    row += 1
                    col = 2
                    display(stdscr, row, col, "%d" % dom)
                    col += 28
                    display(stdscr, row, col, "%s/ex" % time_scale(h2[dom][1]))
                    col += 42
                    display(stdscr, row, col, "%s/ex" % time_scale(h1[dom][1]))
                    col += 18
                    display(stdscr, row, col, "Allocated")

                    # display blocked
                    row += 1
                    _show_time_row(stdscr, row, dom, h2[dom][2], h1[dom][2], "io", "Blocked")

                    # display waited
                    row += 1
                    _show_time_row(stdscr, row, dom, h2[dom][3], h1[dom][3], "ex", "Waited")

                    # display ex count
                    row += 1
                    col = 2
                    display(stdscr, row, col, "%d" % dom)
                    col += 28
                    display(stdscr, row, col, "%d/s" % h2[dom][4])
                    col += 42
                    display(stdscr, row, col, "%d" % h1[dom][4])
                    col += 18
                    display(stdscr, row, col, "Execution count")

                    # display io count
                    row += 1
                    col = 2
                    display(stdscr, row, col, "%d" % dom)
                    col += 4
                    display(stdscr, row, col, "%d/s" % h2[dom][5][0])
                    col += 24
                    display(stdscr, row, col, "%d/ex" % h2[dom][5][1])
                    col += 18
                    display(stdscr, row, col, "%d" % h1[dom][5][0])
                    col += 24
                    display(stdscr, row, col, "%3.2f/ex" % h1[dom][5][1])
                    col += 18
                    display(stdscr, row, col, "I/O Count")

                # NOTE(review): nesting could not be read off the listing
                # this was restored from; totals are accumulated for every
                # in-use domain, shown or not -- confirm against upstream.
                total_h1_cpu += h1[dom][0][1]
                total_h2_cpu += h2[dom][0][1]

            row += 1
            display(stdscr, row, 2, TOTALS % (total_h2_cpu, total_h1_cpu))
            row += 1

            if l1[1] > 1:
                row += 1
                display(stdscr, row, 2,
                        "\tRecords lost: %d (Min: %d, Max: %d)\t\t\tRecords lost: %d (Min: %d, Max %d)" %
                        (math.ceil(l2[1]), l2[0], l2[2], math.ceil(l1[1]), l1[0], l1[2]), _c.A_BOLD)

            # grab a char from tty input; exit if interrupt hit
            try:
                c = stdscr.getch()
            except KeyboardInterrupt:
                break

            # q = quit
            if c == ord('q'):
                break

            # c = cycle to a new cpu of interest
            if c == ord('c'):
                cpu = (cpu + 1) % ncpu

            stdscr.erase()
    finally:
        # display() has already shut curses down when it exits on a
        # too-small screen; do not tear it down a second time.
        if not _c.isendwin():
            _c.nocbreak()
            stdscr.keypad(0)
            _c.echo()
            _c.endwin()
        shm.close()
        shmf.close()
# simple wrapper that allows log files to be set up without physically
# creating files that are never used; only on the first real write does
# the file get created
class Delayed(file):
    """File-like object that opens its target only on the first write().

    Text passed to delayed_write() is remembered and written out right
    after the file is opened, ahead of the first real write.
    """

    def __init__(self, filename, mode):
        self.opened = 0
        self.delay_data = ""
        self.saved_mode = mode
        self.filename = filename

    def _open_now(self):
        # create the real file and emit the remembered text first
        self.file = open(self.filename, self.saved_mode)
        self.opened = 1
        self.file.write(self.delay_data)

    def delayed_write(self, str):
        """Remember `str`, replacing any text remembered earlier."""
        self.delay_data = str

    def write(self, str):
        """Write `str`, opening the underlying file if that has not happened."""
        if not self.opened:
            self._open_now()
        self.file.write(str)

    def flush(self):
        """Flush the underlying file; nothing to do if it was never opened."""
        if not self.opened:
            return
        self.file.flush()

    def close(self):
        """Close the underlying file; nothing to do if it was never opened."""
        if not self.opened:
            return
        self.file.close()
def writelog():
    """Log per-domain statistics to "<prefix>-dom<N>.log" files.

    Each pass reads, for every cpu, the sample ring, the per-domain records
    and the trailing (next, ncpu, slen, freq) record from the mmaped
    SHM_FILE, summarizes over options.interval milliseconds and appends one
    line per active domain.  Runs until options.duration seconds of
    logged time have accumulated, or indefinitely when duration is 0.

    Reads the module-level options and rebinds dom_in_use.  Log files, the
    mapping and the shared file are closed on every exit path.
    """
    global options
    global dom_in_use

    ncpu = 1        # number of cpu's
    slen = 0        # size of shared structure inc. padding

    # record sizes never change, so compute them once
    qdata_len = struct.calcsize(ST_QDATA)
    dominfo_len = struct.calcsize(ST_DOM_INFO)
    trailer_len = struct.calcsize("4i")

    shmf = open(SHM_FILE, "r+")
    shm = mmap.mmap(shmf.fileno(), QOS_DATA_SIZE)

    interval = 0
    outfiles = {}
    for dom in range(0, NDOMAINS):
        outfiles[dom] = Delayed("%s-dom%d.log" % (options.prefix, dom), 'w')
        outfiles[dom].delayed_write("# passed cpu dom cpu(tot) cpu(%) cpu/ex allocated/ex blocked(tot) blocked(%) blocked/io waited(tot) waited(%) waited/ex ex/s io(tot) io/ex\n")

    try:
        while options.duration == 0 or interval < (options.duration * 1000):
            for cpuidx in range(0, ncpu):

                idx = cpuidx * slen      # offset needed in mmap file

                samples = []
                dom_in_use = []

                for i in range(0, NSAMPLES):
                    samples.append(struct.unpack(ST_QDATA, shm[idx:idx+qdata_len]))
                    idx += qdata_len

                for i in range(0, NDOMAINS):
                    dom = struct.unpack(ST_DOM_INFO, shm[idx:idx+dominfo_len])
                    # (last_update_time, start_time, runnable_start_time, blocked_start_time,
                    #  ns_since_boot, ns_oncpu_since_boot, runnable_at_last_update,
                    #  runnable, in_use, domid, name) = dom
                    dom_in_use.append(dom[8])
                    idx += dominfo_len

                oldncpu = ncpu
                (nextslot, ncpu, slen, freq) = struct.unpack("4i", shm[idx:idx+trailer_len])
                idx += trailer_len

                # remap when the reported cpu count changes
                if oldncpu != ncpu:
                    bigger = mmap.mmap(shmf.fileno(), ncpu*slen)
                    shm.close()     # release the mapping being replaced
                    shm = bigger

                startat = nextslot - 1
                if nextslot + 10 < NSAMPLES:
                    endat = nextslot + 10
                else:
                    endat = 10

                [h1, l1, f1] = summarize(startat, endat, options.interval * 10**6, samples)
                for dom in range(0, NDOMAINS):
                    if not dom_in_use[dom]:
                        continue
                    if h1[dom][0][1] > 0 or dom == NDOMAINS - 1:
                        outfiles[dom].write("%.3f %d %d %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f\n" %
                                            (interval, cpuidx, dom,
                                             h1[dom][0][0], h1[dom][0][1], h1[dom][0][2],
                                             h1[dom][1],
                                             h1[dom][2][0], h1[dom][2][1], h1[dom][2][2],
                                             h1[dom][3][0], h1[dom][3][1], h1[dom][3][2],
                                             h1[dom][4],
                                             h1[dom][5][0], h1[dom][5][1]))
                        outfiles[dom].flush()

            # The logged timestamp advances by options.interval ms per pass,
            # so sleep for that long (this used to be a fixed one second,
            # which is only right for the default interval of 1000).
            interval += options.interval
            time.sleep(options.interval / 1000.0)
    finally:
        for dom in range(0, NDOMAINS):
            outfiles[dom].close()
        shm.close()
        shmf.close()
# start xenbaked
def start_xenbaked():
    """Kill any running xenbaked, launch a new one in the background with
    the configured --ms_per_sample, then wait a second."""
    global options

    os.system("killall -9 xenbaked")
    # assumes that xenbaked is in your path
    launch = "xenbaked --ms_per_sample=%d &" % options.mspersample
    os.system(launch)
    time.sleep(1)
# stop xenbaked
def stop_xenbaked():
    """Send SIGINT to every xenbaked process through killall."""
    command = "killall -s INT xenbaked"
    os.system(command)
def main():
    """Parse the command line, start xenbaked and run the chosen front-end.

    Stores the parsed result in the module-level options / args.  Runs the
    curses view when options.live is set and file logging otherwise.
    xenbaked is stopped on every exit path, including an exception from
    either front-end.
    """
    global options
    global args

    parser = setup_cmdline_parser()
    (options, args) = parser.parse_args()

    start_xenbaked()
    try:
        if options.live:
            show_livestats()
        else:
            try:
                writelog()
            except KeyboardInterrupt:
                # Ctrl-C is the normal way to end an open-ended log run
                print('Quitting.')
            except Exception:
                # still quit without a traceback, but say why logging ended
                sys.stderr.write("xenmon: logging aborted: %s\n" % (sys.exc_info()[1],))
                print('Quitting.')
    finally:
        stop_xenbaked()


if __name__ == "__main__":
    main()