ia64/xen-unstable

view tools/xenmon/xenmon.py @ 10654:222b492cc063

[XENMON] This patch removes the magic number "31" for readability.
The number "31" is the ID of the idle domain.

In detail:
- display the idle domain's ID as "Idle" instead of "31"
- write the idle domain's log to the file "log-idle.log" instead of "log-dom31.log".

Signed-off-by: KUWAMURA Shin'ya <kuwa@jp.fujitsu.com>
author kfraser@localhost.localdomain
date Wed Jul 05 11:31:33 2006 +0100 (2006-07-05)
parents 7b9dacaf3340
children 3cdb93867f81
line source
1 #!/usr/bin/env python
3 #####################################################################
4 # xenmon is a front-end for xenbaked.
5 # There is a curses interface for live monitoring. XenMon also allows
6 # logging to a file. For options, run python xenmon.py -h
7 #
8 # Copyright (C) 2005,2006 by Hewlett Packard, Palo Alto and Fort Collins
9 # Authors: Lucy Cherkasova, lucy.cherkasova@hp.com
10 # Rob Gardner, rob.gardner@hp.com
11 # Diwaker Gupta, diwaker.gupta@hp.com
12 #####################################################################
13 # This program is free software; you can redistribute it and/or modify
14 # it under the terms of the GNU General Public License as published by
15 # the Free Software Foundation; under version 2 of the License.
16 #
17 # This program is distributed in the hope that it will be useful,
18 # but WITHOUT ANY WARRANTY; without even the implied warranty of
19 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 # GNU General Public License for more details.
21 #
22 # You should have received a copy of the GNU General Public License
23 # along with this program; if not, write to the Free Software
24 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25 #####################################################################
27 import mmap
28 import struct
29 import os
30 import time
31 import optparse as _o
32 import curses as _c
33 import math
34 import sys
# constants
NSAMPLES = 100          # sample records read per cpu from the shared file
NDOMAINS = 32           # per-domain slots in every sample record
IDLE_DOMAIN = 31        # idle domain's ID

# the struct strings for qos_info
ST_DOM_INFO = "6Q4i32s"
ST_QDATA = str(6*NDOMAINS + 4) + "Q"

# size of mmaped file: NSAMPLES sample records, then NDOMAINS domain
# records, then one trailing block of four ints
QOS_DATA_SIZE = (NSAMPLES * struct.calcsize(ST_QDATA)
                 + NDOMAINS * struct.calcsize(ST_DOM_INFO)
                 + struct.calcsize("4i"))

# location of mmaped file, hard coded right now
SHM_FILE = "/tmp/xenq-shm"

# format strings
TOTALS = ' '*15 + "%6.2f%%" + ' '*35 + "%6.2f%%"

ALLOCATED = "Allocated"
GOTTEN = "Gotten"
BLOCKED = "Blocked"
WAITED = "Waited"
IOCOUNT = "I/O Count"
EXCOUNT = "Exec Count"

# globals
# per-domain "in use" flags, rebuilt on every read of the shared file
dom_in_use = []

# our curses screen
stdscr = None

# parsed options and leftover positional arguments (filled in by main)
options = None
args = None
# the optparse module is quite smart
# to see help, just run xenmon -h
def setup_cmdline_parser():
    """Build and return the optparse parser for xenmon's command line.

    Every display flag comes as an on/off pair ("--x" / "--nox") writing to
    the same destination.  When several options share a destination, the
    default declared last is the one optparse reports, so both members of a
    pair declare the same default.
    """
    parser = _o.OptionParser()
    parser.add_option("-l", "--live", dest="live", action="store_true",
                      default=True, help = "show the ncurses live monitoring frontend (default)")
    # The default must be the boolean True, not the string "True": this
    # declaration comes last for dest "live", so its default is what
    # options.live holds when neither -l nor -n is given.
    parser.add_option("-n", "--notlive", dest="live", action="store_false",
                      default=True, help = "write to file instead of live monitoring")
    parser.add_option("-p", "--prefix", dest="prefix",
                      default = "log", help="prefix to use for output files")
    parser.add_option("-t", "--time", dest="duration",
                      action="store", type="int", default=10,
                      help="stop logging to file after this much time has elapsed (in seconds). set to 0 to keep logging indefinitely")
    parser.add_option("-i", "--interval", dest="interval",
                      action="store", type="int", default=1000,
                      help="interval for logging (in ms)")
    parser.add_option("--ms_per_sample", dest="mspersample",
                      action="store", type="int", default=100,
                      help = "determines how many ms worth of data goes in a sample")
    parser.add_option("--cpu", dest="cpu", action="store", type="int", default=0,
                      help = "specifies which cpu to display data for")

    # (destination, default, text used in the help message)
    display_toggles = [
        ("allocated", False, "allocated time"),
        ("blocked", True, "blocked time"),
        ("waited", True, "waiting time"),
        ("excount", False, "execution count"),
        ("iocount", False, "I/O count"),
    ]
    for dest, default, what in display_toggles:
        parser.add_option("--" + dest, dest=dest, action="store_true",
                          default=default,
                          help="Display %s for each domain" % what)
        parser.add_option("--no" + dest, dest=dest, action="store_false",
                          default=default,
                          help="Don't display %s for each domain" % what)

    return parser
# encapsulate information about a domain
class DomainInfo:
    """Running counters for one domain plus the statistics derived from them.

    The *_sum and exec_count attributes are accumulated by the caller; every
    *_stats method takes "passed", the amount of time the counters cover
    (divided by 10**9 to obtain per-second rates).
    """

    def __init__(self):
        self.allocated_sum = 0
        self.gotten_sum = 0
        self.blocked_sum = 0
        self.waited_sum = 0
        self.exec_count = 0
        self.iocount_sum = 0
        self.ffp_samples = []

    def _rate_share_avg(self, raw_sum, events, passed):
        # shared arithmetic: [per-second rate, percentage of passed,
        # average per event (0 when there were no events)]
        total = float(raw_sum)
        share = 100*total/passed
        average = 0
        if events > 0:
            average = total/events
        return [total/(float(passed)/10**9), share, average]

    def gotten_stats(self, passed):
        """Return [rate, percent, average per execution] of gotten time."""
        return self._rate_share_avg(self.gotten_sum, self.exec_count, passed)

    def waited_stats(self, passed):
        """Return [rate, percent, average per execution] of waited time."""
        return self._rate_share_avg(self.waited_sum, self.exec_count, passed)

    def blocked_stats(self, passed):
        """Return [rate, percent, average per I/O] of blocked time."""
        return self._rate_share_avg(self.blocked_sum, self.iocount_sum, passed)

    def allocated_stats(self, passed):
        """Return the average allocated time per execution (0 if none)."""
        if self.exec_count <= 0:
            return 0
        return float(self.allocated_sum)/self.exec_count

    def ec_stats(self, passed):
        """Return the execution count as a per-second rate."""
        seconds = float(passed)/10**9
        return float(self.exec_count/seconds)

    def io_stats(self, passed):
        """Return [I/O count rate, average I/O count per execution]."""
        total = float(self.iocount_sum)
        per_exec = 0
        if self.exec_count > 0:
            per_exec = total/self.exec_count
        return [total/(float(passed)/10**9), per_exec]

    def stats(self, passed):
        """Return [gotten, allocated, blocked, waited, exec count, I/O] stats."""
        return [
            self.gotten_stats(passed),
            self.allocated_stats(passed),
            self.blocked_stats(passed),
            self.waited_stats(passed),
            self.ec_stats(passed),
            self.io_stats(passed),
        ]
# report values over desired interval
def summarize(startat, endat, duration, samples):
    """Accumulate per-domain statistics walking backwards through samples.

    startat  -- index of the newest sample to include; callers compute it as
                "next - 1", so it may be -1, which is taken modulo NSAMPLES
    endat    -- index at which the walk stops (that sample is not included)
    duration -- stop once the accumulated time reaches this value
                (presumably nanoseconds: callers pass 10**9 for "1 second")
    samples  -- NSAMPLES unpacked sample tuples; each holds six runs of
                NDOMAINS per-domain counters followed by four trailing values

    Returns [ldoms, lostinfo, ffpinfo]: ldoms has one entry per domain
    (DomainInfo.stats() output, or 0 for a domain not in use); lostinfo and
    ffpinfo are [min, sum, max] over the visited samples.

    Raises ValueError (from min() of an empty list) when no sample is
    visited, i.e. when duration is not greater than 1.
    """
    dominfos = {}
    for i in range(0, NDOMAINS):
        dominfos[i] = DomainInfo()

    passed = 1              # to prevent zero division
    # Normalise the start index into 0..NSAMPLES-1.  Without this a start of
    # -1 reads samples[-1] (the last sample) and the wrap-around below then
    # moves to NSAMPLES - 1, counting that same sample a second time.
    curid = startat % NSAMPLES
    lost_samples = []
    ffp_samples = []

    while passed < duration:
        sample = samples[curid]
        for i in range(0, NDOMAINS):
            if dom_in_use[i]:
                info = dominfos[i]
                info.gotten_sum += sample[0*NDOMAINS + i]
                info.allocated_sum += sample[1*NDOMAINS + i]
                info.waited_sum += sample[2*NDOMAINS + i]
                info.blocked_sum += sample[3*NDOMAINS + i]
                info.exec_count += sample[4*NDOMAINS + i]
                info.iocount_sum += sample[5*NDOMAINS + i]

        # trailing values: [0] time covered by the sample, [2] lost, [3] ffp
        passed += sample[6*NDOMAINS]
        lost_samples.append(sample[6*NDOMAINS + 2])
        ffp_samples.append(sample[6*NDOMAINS + 3])

        # step to the previous sample, wrapping around the buffer
        if curid > 0:
            curid -= 1
        else:
            curid = NSAMPLES - 1
        if curid == endat:
            break

    lostinfo = [min(lost_samples), sum(lost_samples), max(lost_samples)]
    ffpinfo = [min(ffp_samples), sum(ffp_samples), max(ffp_samples)]

    ldoms = []
    for x in range(0, NDOMAINS):
        if dom_in_use[x]:
            ldoms.append(dominfos[x].stats(passed))
        else:
            ldoms.append(0)

    return [ldoms, lostinfo, ffpinfo]
# scale nanoseconds to microseconds, milliseconds or seconds as necessary
def time_scale(ns):
    """Format a nanosecond count with the largest unit that fits (ns/us/ms/s)."""
    # (exclusive upper bound, divisor, format) tried from smallest unit up
    scales = (
        (1000, 1, "%4.2f ns"),
        (1000*1000, 10**3, "%4.2f us"),
        (10**9, 10**6, "%4.2f ms"),
    )
    for upper, divisor, fmt in scales:
        if ns < upper:
            return fmt % (float(ns)/divisor)
    return "%4.2f s" % (float(ns)/10**9)
# paint message on curses screen, but detect screen size errors
def display(scr, row, col, str, attr=0):
    """Write str at (row, col) of scr with attribute attr.

    If the write fails for any reason the failure is reported as a
    too-small terminal: the screen is erased, the curses settings are
    undone, two diagnostic lines are printed and the process exits with
    status 1.
    """
    try:
        scr.addstr(row, col, str, attr)
    except:
        # leave curses mode first so the messages below are readable
        scr.erase()
        _c.nocbreak()
        scr.keypad(0)
        _c.echo()
        _c.endwin()
        where = "row=%d, col=%d, str='%s'" % (row, col, str)
        print("Your terminal screen is not big enough; Please resize it.")
        print(where)
        sys.exit(1)
# display domain id
def display_domain_id(scr, row, col, dom):
    """Paint a domain's ID at (row, col); the idle domain is shown as "Idle"."""
    if dom != IDLE_DOMAIN:
        display(scr, row, col, "%d" % dom)
        return
    # "Idle" is wider than a number, so it starts one column further left
    display(scr, row, col-1, "Idle")
# the live monitoring code
def show_livestats(cpu):
    """Run the curses live monitor until the user quits.

    On every pass the shared file SHM_FILE is re-read, the samples of the cpu
    of interest are summarized over the last 1 and 10 seconds, and the screen
    is repainted.  Keys: 'q' quits, 'c' and 'n' move to the next cpu, 'p' to
    the previous one.

    cpu -- index of the cpu whose data is shown first

    Rebinds the global dom_in_use on every pass.  The terminal settings, the
    mapping and the file are restored/closed even if an exception ends the
    loop.
    """
    global dom_in_use, options

    ncpu = 1         # number of cpu's on this platform
    slen = 0         # size of shared data structure, including padding
    cpu_1sec_usage = 0.0
    cpu_10sec_usage = 0.0
    heartbeat = 1

    # record sizes are loop invariants
    qdata_len = struct.calcsize(ST_QDATA)
    dominfo_len = struct.calcsize(ST_DOM_INFO)
    header_len = struct.calcsize("4i")

    # mmap the (the first chunk of the) file
    shmf = open(SHM_FILE, "r+")
    shm = mmap.mmap(shmf.fileno(), QOS_DATA_SIZE)

    # initialize curses
    stdscr = _c.initscr()

    # paint one row: domain id at column 2, cells as (column, text,
    # attribute) triples, then the row label at column 90
    def put_row(row, dom, cells, label):
        display_domain_id(stdscr, row, 2, dom)
        for col, text, attr in cells:
            display(stdscr, row, col, text, attr)
        display(stdscr, row, 90, label)

    # cells for a [total, percent, average] pair: the 10 second figures
    # (s10) on the left, the 1 second figures (s1) on the right
    def timed_cells(s10, s1, avg_fmt, pct_attr):
        return [
            (6, "%s" % time_scale(s10[0]), 0),
            (18, "%3.2f%%" % s10[1], 0),
            (30, avg_fmt % time_scale(s10[2]), 0),
            (48, "%s" % time_scale(s1[0]), 0),
            (60, "%3.2f%%" % s1[1], pct_attr),
            (72, avg_fmt % time_scale(s1[2]), 0),
        ]

    try:
        _c.noecho()
        _c.cbreak()

        stdscr.keypad(1)
        stdscr.timeout(1000)    # getch() below waits at most 1000 ms
        [maxy, maxx] = stdscr.getmaxyx()

        # display in a loop
        while True:

            cpuidx = 0
            while cpuidx < ncpu:

                # calculate offset in mmap file to start from
                idx = cpuidx * slen

                samples = []
                dom_in_use = []

                # read in data
                for i in range(0, NSAMPLES):
                    samples.append(struct.unpack(ST_QDATA, shm[idx:idx+qdata_len]))
                    idx += qdata_len

                for i in range(0, NDOMAINS):
                    dom = struct.unpack(ST_DOM_INFO, shm[idx:idx+dominfo_len])
                    # (last_update_time, start_time, runnable_start_time, blocked_start_time,
                    #  ns_since_boot, ns_oncpu_since_boot, runnable_at_last_update,
                    #  runnable, in_use, domid, name) = dom
                    dom_in_use.append(dom[8])
                    idx += dominfo_len

                oldncpu = ncpu
                (next_idx, ncpu, slen, freq) = struct.unpack("4i", shm[idx:idx+header_len])

                # xenbaked tells us how many cpu's it's got, so re-do
                # the mmap if necessary to get multiple cpu data
                if oldncpu != ncpu:
                    stale = shm
                    shm = mmap.mmap(shmf.fileno(), ncpu*slen)
                    stale.close()

                # if we've just calculated data for the cpu of interest, then
                # stop examining mmap data and start displaying stuff
                if cpuidx == cpu:
                    break

                cpuidx = cpuidx + 1

            # calculate starting and ending datapoints; never look at "next" since
            # it represents live data that may be in transition.
            startat = next_idx - 1
            if next_idx + 10 < NSAMPLES:
                endat = next_idx + 10
            else:
                endat = 10

            # get summary over desired interval
            [h1, l1, f1] = summarize(startat, endat, 10**9, samples)
            [h2, l2, f2] = summarize(startat, endat, 10 * 10**9, samples)

            # the actual display code; the usage figures in the heading are
            # the ones accumulated during the previous pass
            row = 0
            display(stdscr, row, 1, "CPU = %d" % cpu, _c.A_STANDOUT)

            display(stdscr, row, 10, "%sLast 10 seconds (%3.2f%%)%sLast 1 second (%3.2f%%)" % (6*' ', cpu_10sec_usage, 30*' ', cpu_1sec_usage), _c.A_BOLD)
            row += 1
            display(stdscr, row, 1, "%s" % ((maxx-2)*'='))

            total_h1_cpu = 0
            total_h2_cpu = 0

            cpu_1sec_usage = 0.0
            cpu_10sec_usage = 0.0

            for dom in range(0, NDOMAINS):
                if not dom_in_use[dom]:
                    continue

                s1 = h1[dom]     # statistics over the last 1 second
                s10 = h2[dom]    # statistics over the last 10 seconds

                # domains that got no cpu in the last second are skipped,
                # except the idle domain which is always shown
                if s1[0][1] > 0 or dom == IDLE_DOMAIN:
                    # display gotten
                    row += 1
                    put_row(row, dom,
                            timed_cells(s10[0], s1[0], "%s/ex", _c.A_STANDOUT),
                            "Gotten")

                    # the idle domain does not count towards cpu usage
                    if dom != IDLE_DOMAIN:
                        cpu_10sec_usage += s10[0][1]
                        cpu_1sec_usage = cpu_1sec_usage + s1[0][1]

                    # display allocated
                    if options.allocated:
                        row += 1
                        put_row(row, dom,
                                [(30, "%s/ex" % time_scale(s10[1]), 0),
                                 (72, "%s/ex" % time_scale(s1[1]), 0)],
                                "Allocated")

                    # display blocked
                    if options.blocked:
                        row += 1
                        put_row(row, dom,
                                timed_cells(s10[2], s1[2], "%s/io", 0),
                                "Blocked")

                    # display waited
                    if options.waited:
                        row += 1
                        put_row(row, dom,
                                timed_cells(s10[3], s1[3], "%s/ex", 0),
                                "Waited")

                    # display ex count
                    if options.excount:
                        row += 1
                        put_row(row, dom,
                                [(30, "%d/s" % s10[4], 0),
                                 (72, "%d" % s1[4], 0)],
                                "Execution count")

                    # display io count
                    if options.iocount:
                        row += 1
                        put_row(row, dom,
                                [(6, "%d/s" % s10[5][0], 0),
                                 (30, "%d/ex" % s10[5][1], 0),
                                 (48, "%d" % s1[5][0], 0),
                                 (72, "%3.2f/ex" % s1[5][1], 0)],
                                "I/O Count")

                total_h1_cpu += s1[0][1]
                total_h2_cpu += s10[0][1]

            row += 1
            # the star toggles on and off from one pass to the next
            star = heartbeat * '*'
            heartbeat = 1 - heartbeat
            display(stdscr, row, 1, star)
            display(stdscr, row, 2, TOTALS % (total_h2_cpu, total_h1_cpu))
            row += 1

            if l1[1] > 1:
                row += 1
                display(stdscr, row, 2,
                        "\tRecords lost: %d (Min: %d, Max: %d)\t\t\tRecords lost: %d (Min: %d, Max %d)" %
                        (math.ceil(l2[1]), l2[0], l2[2], math.ceil(l1[1]), l1[0], l1[2]), _c.A_BOLD)

            # grab a char from tty input; exit if interrupt hit
            # (deliberately broad: any failure reading input ends the loop)
            try:
                c = stdscr.getch()
            except:
                break

            # q = quit
            if c == ord('q'):
                break

            # c or n = cycle to the next cpu of interest, p = previous one
            if c in (ord('c'), ord('n')):
                cpu = (cpu + 1) % ncpu
            if c == ord('p'):
                cpu = (cpu - 1) % ncpu

            stdscr.erase()
    finally:
        try:
            _c.nocbreak()
            stdscr.keypad(0)
            _c.echo()
            _c.endwin()
        except _c.error:
            # display() may already have left curses mode before exiting;
            # do not let a second teardown mask the original exit/exception
            pass
        shm.close()
        shmf.close()
# simple class to allow initialization of log files without actually
# physically creating files that are never used; only on the first real
# write does the file get created
class Delayed(object):
    """File-like writer that creates its file only on the first write().

    Text handed to delayed_write() is remembered (the latest call wins) and
    written at the top of the file when the first write() opens it.  If
    write() is never called, no file is created and flush()/close() do
    nothing.
    """

    def __init__(self, filename, mode):
        self.filename = filename
        self.saved_mode = mode
        self.delay_data = ""    # text to emit when the file is first opened
        self.opened = 0
        self.file = None        # underlying file object, set by write()

    def delayed_write(self, str):
        """Remember str to be written just before the first real write."""
        self.delay_data = str

    def write(self, str):
        """Write str, first creating the file and emitting the delayed text."""
        if not self.opened:
            self.file = open(self.filename, self.saved_mode)
            self.opened = 1
            self.file.write(self.delay_data)
        self.file.write(str)

    def flush(self):
        """Flush the underlying file if it has been created."""
        if self.opened:
            self.file.flush()

    def close(self):
        """Close the underlying file if it has been created."""
        if self.opened:
            self.file.close()
def writelog():
    """Log per-domain statistics to one file per domain.

    Repeatedly reads the shared file SHM_FILE, summarizes each cpu's samples
    over options.interval milliseconds and appends one line per active
    domain to "<prefix>-dom<N>.log" ("<prefix>-idle.log" for the idle
    domain).  Runs until options.duration seconds have elapsed, or forever
    when options.duration is 0.

    Rebinds the global dom_in_use on every read.  The log files, the mapping
    and the shared file are closed even if an exception (for example a
    keyboard interrupt) ends the loop.
    """
    global options
    global dom_in_use

    ncpu = 1        # number of cpu's
    slen = 0        # size of shared structure inc. padding

    # record sizes are loop invariants
    qdata_len = struct.calcsize(ST_QDATA)
    dominfo_len = struct.calcsize(ST_DOM_INFO)
    header_len = struct.calcsize("4i")

    shmf = open(SHM_FILE, "r+")
    shm = mmap.mmap(shmf.fileno(), QOS_DATA_SIZE)

    interval = 0    # elapsed time in ms
    curr = last = time.time()
    outfiles = {}
    for dom in range(0, NDOMAINS):
        if dom == IDLE_DOMAIN:
            outfiles[dom] = Delayed("%s-idle.log" % options.prefix, 'w')
        else:
            outfiles[dom] = Delayed("%s-dom%d.log" % (options.prefix, dom), 'w')
        outfiles[dom].delayed_write("# passed cpu dom cpu(tot) cpu(%) cpu/ex allocated/ex blocked(tot) blocked(%) blocked/io waited(tot) waited(%) waited/ex ex/s io(tot) io/ex\n")

    try:
        while options.duration == 0 or interval < (options.duration * 1000):
            cpuidx = 0
            while cpuidx < ncpu:

                idx = cpuidx * slen      # offset needed in mmap file

                samples = []
                dom_in_use = []

                for i in range(0, NSAMPLES):
                    samples.append(struct.unpack(ST_QDATA, shm[idx:idx+qdata_len]))
                    idx += qdata_len

                for i in range(0, NDOMAINS):
                    dom = struct.unpack(ST_DOM_INFO, shm[idx:idx+dominfo_len])
                    # (last_update_time, start_time, runnable_start_time, blocked_start_time,
                    #  ns_since_boot, ns_oncpu_since_boot, runnable_at_last_update,
                    #  runnable, in_use, domid, name) = dom
                    dom_in_use.append(dom[8])
                    idx += dominfo_len

                oldncpu = ncpu
                (next_idx, ncpu, slen, freq) = struct.unpack("4i", shm[idx:idx+header_len])

                # re-do the mmap when the reported cpu count changes so that
                # the data of every cpu is covered
                if oldncpu != ncpu:
                    stale = shm
                    shm = mmap.mmap(shmf.fileno(), ncpu*slen)
                    stale.close()

                startat = next_idx - 1
                if next_idx + 10 < NSAMPLES:
                    endat = next_idx + 10
                else:
                    endat = 10

                [h1, l1, f1] = summarize(startat, endat, options.interval * 10**6, samples)
                for dom in range(0, NDOMAINS):
                    if not dom_in_use[dom]:
                        continue
                    # domains that got no cpu are skipped, except the idle
                    # domain which is always logged
                    if h1[dom][0][1] > 0 or dom == IDLE_DOMAIN:
                        outfiles[dom].write("%.3f %d %d %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f\n" %
                                            (interval, cpuidx, dom,
                                             h1[dom][0][0], h1[dom][0][1], h1[dom][0][2],
                                             h1[dom][1],
                                             h1[dom][2][0], h1[dom][2][1], h1[dom][2][2],
                                             h1[dom][3][0], h1[dom][3][1], h1[dom][3][2],
                                             h1[dom][4],
                                             h1[dom][5][0], h1[dom][5][1]))
                        outfiles[dom].flush()
                curr = time.time()
                interval += (curr - last) * 1000
                last = curr
                cpuidx = cpuidx + 1
            time.sleep(options.interval / 1000.0)
    finally:
        for dom in range(0, NDOMAINS):
            outfiles[dom].close()
        shm.close()
        shmf.close()
# start xenbaked
def start_xenbaked():
    """Kill any running xenbaked, launch a new one in the background, wait 1s."""
    global options

    os.system("killall -9 xenbaked")
    # assumes that xenbaked is in your path
    launch_cmd = "xenbaked --ms_per_sample=%d &" % options.mspersample
    os.system(launch_cmd)
    time.sleep(1)
# stop xenbaked
def stop_xenbaked():
    """Send SIGINT to every running xenbaked process via killall."""
    interrupt_cmd = "killall -s INT xenbaked"
    os.system(interrupt_cmd)
def main():
    """Parse the command line, start xenbaked and run the chosen frontend.

    Runs the curses monitor when options.live is set, the file logger
    otherwise.  xenbaked is stopped again on the way out, even when the
    frontend ends with an exception.
    """
    global options
    global args

    parser = setup_cmdline_parser()
    (options, args) = parser.parse_args()

    start_xenbaked()
    try:
        if options.live:
            show_livestats(options.cpu)
        else:
            try:
                writelog()
            except KeyboardInterrupt:
                print('Quitting.')
            except Exception:
                # still quit quietly as before, but say why logging stopped
                sys.stderr.write("xenmon: logging stopped: %s\n" % (sys.exc_info()[1],))
                print('Quitting.')
    finally:
        stop_xenbaked()

if __name__ == "__main__":
    main()