ia64/xen-unstable

changeset 12625:da51aee40456

[IA64] import perfmon, oprofile related files from linux to linux-sparse

Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>

Updated to 2.6.16.33

Signed-off-by: Alex Williamson <alex.williamson@hp.com>
author awilliam@xenbuild.aw
date Tue Nov 28 11:19:40 2006 -0700 (2006-11-28)
parents 6cfe32a69ac6
children dc614bb5b0e8
files linux-2.6-xen-sparse/arch/ia64/kernel/perfmon.c linux-2.6-xen-sparse/arch/ia64/oprofile/Makefile linux-2.6-xen-sparse/arch/ia64/oprofile/init.c linux-2.6-xen-sparse/arch/ia64/oprofile/perfmon.c
line diff
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/linux-2.6-xen-sparse/arch/ia64/kernel/perfmon.c	Tue Nov 28 11:19:40 2006 -0700
     1.3 @@ -0,0 +1,6852 @@
     1.4 +/*
     1.5 + * This file implements the perfmon-2 subsystem which is used
     1.6 + * to program the IA-64 Performance Monitoring Unit (PMU).
     1.7 + *
     1.8 + * The initial version of perfmon.c was written by
     1.9 + * Ganesh Venkitachalam, IBM Corp.
    1.10 + *
    1.11 + * Then it was modified for perfmon-1.x by Stephane Eranian and
    1.12 + * David Mosberger, Hewlett Packard Co.
    1.13 + *
    1.14 + * Version Perfmon-2.x is a rewrite of perfmon-1.x
    1.15 + * by Stephane Eranian, Hewlett Packard Co.
    1.16 + *
    1.17 + * Copyright (C) 1999-2005  Hewlett Packard Co
    1.18 + *               Stephane Eranian <eranian@hpl.hp.com>
    1.19 + *               David Mosberger-Tang <davidm@hpl.hp.com>
    1.20 + *
    1.21 + * More information about perfmon available at:
    1.22 + * 	http://www.hpl.hp.com/research/linux/perfmon
    1.23 + */
    1.24 +
    1.25 +#include <linux/config.h>
    1.26 +#include <linux/module.h>
    1.27 +#include <linux/kernel.h>
    1.28 +#include <linux/sched.h>
    1.29 +#include <linux/interrupt.h>
    1.30 +#include <linux/smp_lock.h>
    1.31 +#include <linux/proc_fs.h>
    1.32 +#include <linux/seq_file.h>
    1.33 +#include <linux/init.h>
    1.34 +#include <linux/vmalloc.h>
    1.35 +#include <linux/mm.h>
    1.36 +#include <linux/sysctl.h>
    1.37 +#include <linux/list.h>
    1.38 +#include <linux/file.h>
    1.39 +#include <linux/poll.h>
    1.40 +#include <linux/vfs.h>
    1.41 +#include <linux/pagemap.h>
    1.42 +#include <linux/mount.h>
    1.43 +#include <linux/bitops.h>
    1.44 +#include <linux/capability.h>
    1.45 +#include <linux/rcupdate.h>
    1.46 +#include <linux/completion.h>
    1.47 +
    1.48 +#include <asm/errno.h>
    1.49 +#include <asm/intrinsics.h>
    1.50 +#include <asm/page.h>
    1.51 +#include <asm/perfmon.h>
    1.52 +#include <asm/processor.h>
    1.53 +#include <asm/signal.h>
    1.54 +#include <asm/system.h>
    1.55 +#include <asm/uaccess.h>
    1.56 +#include <asm/delay.h>
    1.57 +
    1.58 +#ifdef CONFIG_PERFMON
    1.59 +/*
    1.60 + * perfmon context state
    1.61 + */
    1.62 +#define PFM_CTX_UNLOADED	1	/* context is not loaded onto any task */
    1.63 +#define PFM_CTX_LOADED		2	/* context is loaded onto a task */
    1.64 +#define PFM_CTX_MASKED		3	/* context is loaded but monitoring is masked due to overflow */
    1.65 +#define PFM_CTX_ZOMBIE		4	/* owner of the context is closing it */
    1.66 +
    1.67 +#define PFM_INVALID_ACTIVATION	(~0UL)
    1.68 +
    1.69 +/*
    1.70 + * depth of message queue
    1.71 + */
    1.72 +#define PFM_MAX_MSGS		32
    1.73 +#define PFM_CTXQ_EMPTY(g)	((g)->ctx_msgq_head == (g)->ctx_msgq_tail)
    1.74 +
    1.75 +/*
    1.76 + * type of a PMU register (bitmask).
    1.77 + * bitmask structure:
    1.78 + * 	bit0   : register implemented
    1.79 + * 	bit1   : end marker
    1.80 + * 	bit2-3 : reserved
    1.81 + * 	bit4   : pmc has pmc.pm
    1.82 + * 	bit5   : pmc controls a counter (has pmc.oi), pmd is used as counter
    1.83 + * 	bit6-7 : register type
    1.84 + * 	bit8-31: reserved
    1.85 + */
    1.86 +#define PFM_REG_NOTIMPL		0x0 /* not implemented at all */
    1.87 +#define PFM_REG_IMPL		0x1 /* register implemented */
    1.88 +#define PFM_REG_END		0x2 /* end marker */
    1.89 +#define PFM_REG_MONITOR		(0x1<<4|PFM_REG_IMPL) /* a PMC with a pmc.pm field only */
    1.90 +#define PFM_REG_COUNTING	(0x2<<4|PFM_REG_MONITOR) /* a monitor + pmc.oi + PMD used as a counter */
    1.91 +#define PFM_REG_CONTROL		(0x4<<4|PFM_REG_IMPL) /* PMU control register */
    1.92 +#define	PFM_REG_CONFIG		(0x8<<4|PFM_REG_IMPL) /* configuration register */
    1.93 +#define PFM_REG_BUFFER	 	(0xc<<4|PFM_REG_IMPL) /* PMD used as buffer */
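/*
 * Worked example of the bitmask encoding above: PFM_REG_MONITOR is
 * (0x1<<4)|PFM_REG_IMPL = 0x11, and PFM_REG_COUNTING is
 * (0x2<<4)|PFM_REG_MONITOR = 0x31, i.e. implemented + pmc.pm + counting.
 * This is why the PMD_IS_COUNTING()/PMC_IS_MONITOR() helpers below compare
 * with "== PFM_REG_COUNTING" (all bits present) rather than a plain AND.
 */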
    1.94 +
    1.95 +#define PMC_IS_LAST(i)	(pmu_conf->pmc_desc[i].type & PFM_REG_END)
    1.96 +#define PMD_IS_LAST(i)	(pmu_conf->pmd_desc[i].type & PFM_REG_END)
    1.97 +
    1.98 +#define PMC_OVFL_NOTIFY(ctx, i)	((ctx)->ctx_pmds[i].flags &  PFM_REGFL_OVFL_NOTIFY)
    1.99 +
   1.100 +/* i assumed unsigned */
   1.101 +#define PMC_IS_IMPL(i)	  (i< PMU_MAX_PMCS && (pmu_conf->pmc_desc[i].type & PFM_REG_IMPL))
   1.102 +#define PMD_IS_IMPL(i)	  (i< PMU_MAX_PMDS && (pmu_conf->pmd_desc[i].type & PFM_REG_IMPL))
   1.103 +
   1.104 +/* XXX: these assume that register i is implemented */
   1.105 +#define PMD_IS_COUNTING(i) ((pmu_conf->pmd_desc[i].type & PFM_REG_COUNTING) == PFM_REG_COUNTING)
   1.106 +#define PMC_IS_COUNTING(i) ((pmu_conf->pmc_desc[i].type & PFM_REG_COUNTING) == PFM_REG_COUNTING)
   1.107 +#define PMC_IS_MONITOR(i)  ((pmu_conf->pmc_desc[i].type & PFM_REG_MONITOR)  == PFM_REG_MONITOR)
   1.108 +#define PMC_IS_CONTROL(i)  ((pmu_conf->pmc_desc[i].type & PFM_REG_CONTROL)  == PFM_REG_CONTROL)
   1.109 +
   1.110 +#define PMC_DFL_VAL(i)     pmu_conf->pmc_desc[i].default_value
   1.111 +#define PMC_RSVD_MASK(i)   pmu_conf->pmc_desc[i].reserved_mask
   1.112 +#define PMD_PMD_DEP(i)	   pmu_conf->pmd_desc[i].dep_pmd[0]
   1.113 +#define PMC_PMD_DEP(i)	   pmu_conf->pmc_desc[i].dep_pmd[0]
   1.114 +
   1.115 +#define PFM_NUM_IBRS	  IA64_NUM_DBG_REGS
   1.116 +#define PFM_NUM_DBRS	  IA64_NUM_DBG_REGS
   1.117 +
   1.118 +#define CTX_OVFL_NOBLOCK(c)	((c)->ctx_fl_block == 0)
   1.119 +#define CTX_HAS_SMPL(c)		((c)->ctx_fl_is_sampling)
   1.120 +#define PFM_CTX_TASK(h)		(h)->ctx_task
   1.121 +
   1.122 +#define PMU_PMC_OI		5 /* position of pmc.oi bit */
   1.123 +
   1.124 +/* XXX: does not support more than 64 PMDs */
   1.125 +#define CTX_USED_PMD(ctx, mask) (ctx)->ctx_used_pmds[0] |= (mask)
   1.126 +#define CTX_IS_USED_PMD(ctx, c) (((ctx)->ctx_used_pmds[0] & (1UL << (c))) != 0UL)
   1.127 +
   1.128 +#define CTX_USED_MONITOR(ctx, mask) (ctx)->ctx_used_monitors[0] |= (mask)
   1.129 +
   1.130 +#define CTX_USED_IBR(ctx,n) 	(ctx)->ctx_used_ibrs[(n)>>6] |= 1UL<< ((n) % 64)
   1.131 +#define CTX_USED_DBR(ctx,n) 	(ctx)->ctx_used_dbrs[(n)>>6] |= 1UL<< ((n) % 64)
   1.132 +#define CTX_USES_DBREGS(ctx)	(((pfm_context_t *)(ctx))->ctx_fl_using_dbreg==1)
   1.133 +#define PFM_CODE_RR	0	/* requesting code range restriction */
   1.134 +#define PFM_DATA_RR	1	/* requesting data range restriction */
   1.135 +
   1.136 +#define PFM_CPUINFO_CLEAR(v)	pfm_get_cpu_var(pfm_syst_info) &= ~(v)
   1.137 +#define PFM_CPUINFO_SET(v)	pfm_get_cpu_var(pfm_syst_info) |= (v)
   1.138 +#define PFM_CPUINFO_GET()	pfm_get_cpu_var(pfm_syst_info)
   1.139 +
   1.140 +#define RDEP(x)	(1UL<<(x))
   1.141 +
   1.142 +/*
   1.143 + * context protection macros
   1.144 + * in SMP:
   1.145 + * 	- we need to protect against CPU concurrency (spin_lock)
   1.146 + * 	- we need to protect against PMU overflow interrupts (local_irq_disable)
   1.147 + * in UP:
   1.148 + * 	- we need to protect against PMU overflow interrupts (local_irq_disable)
   1.149 + *
   1.150 + * spin_lock_irqsave()/spin_lock_irqrestore():
   1.151 + * 	in SMP: local_irq_disable + spin_lock
   1.152 + * 	in UP : local_irq_disable
   1.153 + *
   1.154 + * spin_lock()/spin_unlock():
   1.155 + * 	in UP : removed automatically
   1.156 + * 	in SMP: protect against context accesses from other CPU. interrupts
   1.157 + * 	        are not masked. This is useful for the PMU interrupt handler
   1.158 + * 	        because we know we will not get PMU concurrency in that code.
   1.159 + */
   1.160 +#define PROTECT_CTX(c, f) \
   1.161 +	do {  \
   1.162 +		DPRINT(("spinlock_irq_save ctx %p by [%d]\n", c, current->pid)); \
   1.163 +		spin_lock_irqsave(&(c)->ctx_lock, f); \
   1.164 +		DPRINT(("spinlocked ctx %p  by [%d]\n", c, current->pid)); \
   1.165 +	} while(0)
   1.166 +
   1.167 +#define UNPROTECT_CTX(c, f) \
   1.168 +	do { \
   1.169 +		DPRINT(("spinlock_irq_restore ctx %p by [%d]\n", c, current->pid)); \
   1.170 +		spin_unlock_irqrestore(&(c)->ctx_lock, f); \
   1.171 +	} while(0)
   1.172 +
   1.173 +#define PROTECT_CTX_NOPRINT(c, f) \
   1.174 +	do {  \
   1.175 +		spin_lock_irqsave(&(c)->ctx_lock, f); \
   1.176 +	} while(0)
   1.177 +
   1.178 +
   1.179 +#define UNPROTECT_CTX_NOPRINT(c, f) \
   1.180 +	do { \
   1.181 +		spin_unlock_irqrestore(&(c)->ctx_lock, f); \
   1.182 +	} while(0)
   1.183 +
   1.184 +
   1.185 +#define PROTECT_CTX_NOIRQ(c) \
   1.186 +	do {  \
   1.187 +		spin_lock(&(c)->ctx_lock); \
   1.188 +	} while(0)
   1.189 +
   1.190 +#define UNPROTECT_CTX_NOIRQ(c) \
   1.191 +	do { \
   1.192 +		spin_unlock(&(c)->ctx_lock); \
   1.193 +	} while(0)
   1.194 +
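/*
 * Illustrative usage of the protection macros above (a sketch only; the
 * real call sites further down, e.g. pfm_read(), follow this pattern):
 *
 *	unsigned long flags;
 *
 *	PROTECT_CTX(ctx, flags);	// irqs masked + ctx_lock held
 *	// ... inspect or modify ctx state ...
 *	UNPROTECT_CTX(ctx, flags);	// lock dropped, irq state restored
 */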
   1.195 +
   1.196 +#ifdef CONFIG_SMP
   1.197 +
   1.198 +#define GET_ACTIVATION()	pfm_get_cpu_var(pmu_activation_number)
   1.199 +#define INC_ACTIVATION()	pfm_get_cpu_var(pmu_activation_number)++
   1.200 +#define SET_ACTIVATION(c)	(c)->ctx_last_activation = GET_ACTIVATION()
   1.201 +
   1.202 +#else /* !CONFIG_SMP */
   1.203 +#define SET_ACTIVATION(t) 	do {} while(0)
   1.204 +#define GET_ACTIVATION(t) 	do {} while(0)
   1.205 +#define INC_ACTIVATION(t) 	do {} while(0)
   1.206 +#endif /* CONFIG_SMP */
   1.207 +
   1.208 +#define SET_PMU_OWNER(t, c)	do { pfm_get_cpu_var(pmu_owner) = (t); pfm_get_cpu_var(pmu_ctx) = (c); } while(0)
   1.209 +#define GET_PMU_OWNER()		pfm_get_cpu_var(pmu_owner)
   1.210 +#define GET_PMU_CTX()		pfm_get_cpu_var(pmu_ctx)
   1.211 +
   1.212 +#define LOCK_PFS(g)	    	spin_lock_irqsave(&pfm_sessions.pfs_lock, g)
   1.213 +#define UNLOCK_PFS(g)	    	spin_unlock_irqrestore(&pfm_sessions.pfs_lock, g)
   1.214 +
   1.215 +#define PFM_REG_RETFLAG_SET(flags, val)	do { flags &= ~PFM_REG_RETFL_MASK; flags |= (val); } while(0)
   1.216 +
   1.217 +/*
   1.218 + * cmp0 must be the value of pmc0
   1.219 + */
   1.220 +#define PMC0_HAS_OVFL(cmp0)  (cmp0 & ~0x1UL)
   1.221 +
   1.222 +#define PFMFS_MAGIC 0xa0b4d889
   1.223 +
   1.224 +/*
   1.225 + * debugging
   1.226 + */
   1.227 +#define PFM_DEBUGGING 1
   1.228 +#ifdef PFM_DEBUGGING
   1.229 +#define DPRINT(a) \
   1.230 +	do { \
   1.231 +		if (unlikely(pfm_sysctl.debug >0)) { printk("%s.%d: CPU%d [%d] ", __FUNCTION__, __LINE__, smp_processor_id(), current->pid); printk a; } \
   1.232 +	} while (0)
   1.233 +
   1.234 +#define DPRINT_ovfl(a) \
   1.235 +	do { \
   1.236 +		if (unlikely(pfm_sysctl.debug > 0 && pfm_sysctl.debug_ovfl >0)) { printk("%s.%d: CPU%d [%d] ", __FUNCTION__, __LINE__, smp_processor_id(), current->pid); printk a; } \
   1.237 +	} while (0)
   1.238 +#endif
   1.239 +
   1.240 +/*
   1.241 + * 64-bit software counter structure
   1.242 + *
   1.243 + * the next_reset_type is applied to the next call to pfm_reset_regs()
   1.244 + */
   1.245 +typedef struct {
   1.246 +	unsigned long	val;		/* virtual 64bit counter value */
   1.247 +	unsigned long	lval;		/* last reset value */
   1.248 +	unsigned long	long_reset;	/* reset value on sampling overflow */
   1.249 +	unsigned long	short_reset;    /* reset value on overflow */
   1.250 +	unsigned long	reset_pmds[4];  /* which other pmds to reset when this counter overflows */
   1.251 +	unsigned long	smpl_pmds[4];   /* which pmds are accessed when counter overflows */
   1.252 +	unsigned long	seed;		/* seed for random-number generator */
   1.253 +	unsigned long	mask;		/* mask for random-number generator */
   1.254 +	unsigned int 	flags;		/* notify/do not notify */
   1.255 +	unsigned long	eventid;	/* overflow event identifier */
   1.256 +} pfm_counter_t;
   1.257 +
   1.258 +/*
   1.259 + * context flags
   1.260 + */
   1.261 +typedef struct {
   1.262 +	unsigned int block:1;		/* when 1, task will be blocked on user notifications */
   1.263 +	unsigned int system:1;		/* do system wide monitoring */
   1.264 +	unsigned int using_dbreg:1;	/* using range restrictions (debug registers) */
   1.265 +	unsigned int is_sampling:1;	/* true if using a custom format */
   1.266 +	unsigned int excl_idle:1;	/* exclude idle task in system wide session */
   1.267 +	unsigned int going_zombie:1;	/* context is zombie (MASKED+blocking) */
   1.268 +	unsigned int trap_reason:2;	/* reason for going into pfm_handle_work() */
   1.269 +	unsigned int no_msg:1;		/* no message sent on overflow */
   1.270 +	unsigned int can_restart:1;	/* allowed to issue a PFM_RESTART */
   1.271 +	unsigned int reserved:22;
   1.272 +} pfm_context_flags_t;
   1.273 +
   1.274 +#define PFM_TRAP_REASON_NONE		0x0	/* default value */
   1.275 +#define PFM_TRAP_REASON_BLOCK		0x1	/* we need to block on overflow */
   1.276 +#define PFM_TRAP_REASON_RESET		0x2	/* we need to reset PMDs */
   1.277 +
   1.278 +
   1.279 +/*
   1.280 + * perfmon context: encapsulates all the state of a monitoring session
   1.281 + */
   1.282 +
   1.283 +typedef struct pfm_context {
   1.284 +	spinlock_t		ctx_lock;		/* context protection */
   1.285 +
   1.286 +	pfm_context_flags_t	ctx_flags;		/* bitmask of flags  (block reason incl.) */
   1.287 +	unsigned int		ctx_state;		/* state: active/inactive (no bitfield) */
   1.288 +
   1.289 +	struct task_struct 	*ctx_task;		/* task to which context is attached */
   1.290 +
   1.291 +	unsigned long		ctx_ovfl_regs[4];	/* which registers overflowed (notification) */
   1.292 +
   1.293 +	struct completion	ctx_restart_done;  	/* use for blocking notification mode */
   1.294 +
   1.295 +	unsigned long		ctx_used_pmds[4];	/* bitmask of PMD used            */
   1.296 +	unsigned long		ctx_all_pmds[4];	/* bitmask of all accessible PMDs */
   1.297 +	unsigned long		ctx_reload_pmds[4];	/* bitmask of force reload PMD on ctxsw in */
   1.298 +
   1.299 +	unsigned long		ctx_all_pmcs[4];	/* bitmask of all accessible PMCs */
   1.300 +	unsigned long		ctx_reload_pmcs[4];	/* bitmask of force reload PMC on ctxsw in */
   1.301 +	unsigned long		ctx_used_monitors[4];	/* bitmask of monitor PMC being used */
   1.302 +
   1.303 +	unsigned long		ctx_pmcs[IA64_NUM_PMC_REGS];	/*  saved copies of PMC values */
   1.304 +
   1.305 +	unsigned int		ctx_used_ibrs[1];		/* bitmask of used IBR (speedup ctxsw in) */
   1.306 +	unsigned int		ctx_used_dbrs[1];		/* bitmask of used DBR (speedup ctxsw in) */
   1.307 +	unsigned long		ctx_dbrs[IA64_NUM_DBG_REGS];	/* DBR values (cache) when not loaded */
   1.308 +	unsigned long		ctx_ibrs[IA64_NUM_DBG_REGS];	/* IBR values (cache) when not loaded */
   1.309 +
   1.310 +	pfm_counter_t		ctx_pmds[IA64_NUM_PMD_REGS]; /* software state for PMDS */
   1.311 +
   1.312 +	u64			ctx_saved_psr_up;	/* only contains psr.up value */
   1.313 +
   1.314 +	unsigned long		ctx_last_activation;	/* context last activation number for last_cpu */
   1.315 +	unsigned int		ctx_last_cpu;		/* CPU id of current or last CPU used (SMP only) */
   1.316 +	unsigned int		ctx_cpu;		/* cpu to which perfmon is applied (system wide) */
   1.317 +
   1.318 +	int			ctx_fd;			/* file descriptor used by this context */
   1.319 +	pfm_ovfl_arg_t		ctx_ovfl_arg;		/* argument to custom buffer format handler */
   1.320 +
   1.321 +	pfm_buffer_fmt_t	*ctx_buf_fmt;		/* buffer format callbacks */
   1.322 +	void			*ctx_smpl_hdr;		/* points to sampling buffer header kernel vaddr */
   1.323 +	unsigned long		ctx_smpl_size;		/* size of sampling buffer */
   1.324 +	void			*ctx_smpl_vaddr;	/* user level virtual address of smpl buffer */
   1.325 +
   1.326 +	wait_queue_head_t 	ctx_msgq_wait;
   1.327 +	pfm_msg_t		ctx_msgq[PFM_MAX_MSGS];
   1.328 +	int			ctx_msgq_head;
   1.329 +	int			ctx_msgq_tail;
   1.330 +	struct fasync_struct	*ctx_async_queue;
   1.331 +
   1.332 +	wait_queue_head_t 	ctx_zombieq;		/* termination cleanup wait queue */
   1.333 +} pfm_context_t;
   1.334 +
   1.335 +/*
   1.336 + * magic number used to verify that structure is really
   1.337 + * a perfmon context
   1.338 + */
   1.339 +#define PFM_IS_FILE(f)		((f)->f_op == &pfm_file_ops)
   1.340 +
   1.341 +#define PFM_GET_CTX(t)	 	((pfm_context_t *)(t)->thread.pfm_context)
   1.342 +
   1.343 +#ifdef CONFIG_SMP
   1.344 +#define SET_LAST_CPU(ctx, v)	(ctx)->ctx_last_cpu = (v)
   1.345 +#define GET_LAST_CPU(ctx)	(ctx)->ctx_last_cpu
   1.346 +#else
   1.347 +#define SET_LAST_CPU(ctx, v)	do {} while(0)
   1.348 +#define GET_LAST_CPU(ctx)	do {} while(0)
   1.349 +#endif
   1.350 +
   1.351 +
   1.352 +#define ctx_fl_block		ctx_flags.block
   1.353 +#define ctx_fl_system		ctx_flags.system
   1.354 +#define ctx_fl_using_dbreg	ctx_flags.using_dbreg
   1.355 +#define ctx_fl_is_sampling	ctx_flags.is_sampling
   1.356 +#define ctx_fl_excl_idle	ctx_flags.excl_idle
   1.357 +#define ctx_fl_going_zombie	ctx_flags.going_zombie
   1.358 +#define ctx_fl_trap_reason	ctx_flags.trap_reason
   1.359 +#define ctx_fl_no_msg		ctx_flags.no_msg
   1.360 +#define ctx_fl_can_restart	ctx_flags.can_restart
   1.361 +
   1.362 +#define PFM_SET_WORK_PENDING(t, v)	do { (t)->thread.pfm_needs_checking = v; } while(0);
   1.363 +#define PFM_GET_WORK_PENDING(t)		(t)->thread.pfm_needs_checking
   1.364 +
   1.365 +/*
   1.366 + * global information about all sessions
   1.367 + * mostly used to synchronize between system wide and per-process
   1.368 + */
   1.369 +typedef struct {
   1.370 +	spinlock_t		pfs_lock;		   /* lock the structure */
   1.371 +
   1.372 +	unsigned int		pfs_task_sessions;	   /* number of per task sessions */
   1.373 +	unsigned int		pfs_sys_sessions;	   /* number of system wide sessions */
   1.374 +	unsigned int		pfs_sys_use_dbregs;	   /* incremented when a system wide session uses debug regs */
   1.375 +	unsigned int		pfs_ptrace_use_dbregs;	   /* incremented when a process uses debug regs */
   1.376 +	struct task_struct	*pfs_sys_session[NR_CPUS]; /* points to task owning a system-wide session */
   1.377 +} pfm_session_t;
   1.378 +
   1.379 +/*
   1.380 + * information about a PMC or PMD.
   1.381 + * dep_pmd[]: a bitmask of dependent PMD registers
   1.382 + * dep_pmc[]: a bitmask of dependent PMC registers
   1.383 + */
   1.384 +typedef int (*pfm_reg_check_t)(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs);
   1.385 +typedef struct {
   1.386 +	unsigned int		type;
   1.387 +	int			pm_pos;
   1.388 +	unsigned long		default_value;	/* power-on default value */
   1.389 +	unsigned long		reserved_mask;	/* bitmask of reserved bits */
   1.390 +	pfm_reg_check_t		read_check;
   1.391 +	pfm_reg_check_t		write_check;
   1.392 +	unsigned long		dep_pmd[4];
   1.393 +	unsigned long		dep_pmc[4];
   1.394 +} pfm_reg_desc_t;
   1.395 +
   1.396 +/* assume cnum is a valid monitor */
   1.397 +#define PMC_PM(cnum, val)	(((val) >> (pmu_conf->pmc_desc[cnum].pm_pos)) & 0x1)
   1.398 +
   1.399 +/*
   1.400 + * This structure is initialized at boot time and contains
   1.401 + * a description of the PMU main characteristics.
   1.402 + *
   1.403 + * If the probe function is defined, detection is based
   1.404 + * on its return value: 
   1.405 + * 	- 0 means recognized PMU
   1.406 + * 	- anything else means not supported
   1.407 + * When the probe function is not defined, then the pmu_family field
   1.408 + * is used and it must match the host CPU family such that:
   1.409 + * 	- cpu->family & config->pmu_family != 0
   1.410 + */
   1.411 +typedef struct {
   1.412 +	unsigned long  ovfl_val;	/* overflow value for counters */
   1.413 +
   1.414 +	pfm_reg_desc_t *pmc_desc;	/* detailed PMC register dependencies descriptions */
   1.415 +	pfm_reg_desc_t *pmd_desc;	/* detailed PMD register dependencies descriptions */
   1.416 +
   1.417 +	unsigned int   num_pmcs;	/* number of PMCS: computed at init time */
   1.418 +	unsigned int   num_pmds;	/* number of PMDS: computed at init time */
   1.419 +	unsigned long  impl_pmcs[4];	/* bitmask of implemented PMCS */
   1.420 +	unsigned long  impl_pmds[4];	/* bitmask of implemented PMDS */
   1.421 +
   1.422 +	char	      *pmu_name;	/* PMU family name */
   1.423 +	unsigned int  pmu_family;	/* cpuid family pattern used to identify pmu */
   1.424 +	unsigned int  flags;		/* pmu specific flags */
   1.425 +	unsigned int  num_ibrs;		/* number of IBRS: computed at init time */
   1.426 +	unsigned int  num_dbrs;		/* number of DBRS: computed at init time */
   1.427 +	unsigned int  num_counters;	/* PMC/PMD counting pairs : computed at init time */
   1.428 +	int           (*probe)(void);   /* customized probe routine */
   1.429 +	unsigned int  use_rr_dbregs:1;	/* set if debug registers used for range restriction */
   1.430 +} pmu_config_t;
   1.431 +/*
   1.432 + * PMU specific flags
   1.433 + */
   1.434 +#define PFM_PMU_IRQ_RESEND	1	/* PMU needs explicit IRQ resend */
   1.435 +
   1.436 +/*
   1.437 + * debug register related type definitions
   1.438 + */
   1.439 +typedef struct {
   1.440 +	unsigned long ibr_mask:56;
   1.441 +	unsigned long ibr_plm:4;
   1.442 +	unsigned long ibr_ig:3;
   1.443 +	unsigned long ibr_x:1;
   1.444 +} ibr_mask_reg_t;
   1.445 +
   1.446 +typedef struct {
   1.447 +	unsigned long dbr_mask:56;
   1.448 +	unsigned long dbr_plm:4;
   1.449 +	unsigned long dbr_ig:2;
   1.450 +	unsigned long dbr_w:1;
   1.451 +	unsigned long dbr_r:1;
   1.452 +} dbr_mask_reg_t;
   1.453 +
   1.454 +typedef union {
   1.455 +	unsigned long  val;
   1.456 +	ibr_mask_reg_t ibr;
   1.457 +	dbr_mask_reg_t dbr;
   1.458 +} dbreg_t;
   1.459 +
   1.460 +
   1.461 +/*
   1.462 + * perfmon command descriptions
   1.463 + */
   1.464 +typedef struct {
   1.465 +	int		(*cmd_func)(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs);
   1.466 +	char		*cmd_name;
   1.467 +	int		cmd_flags;
   1.468 +	unsigned int	cmd_narg;
   1.469 +	size_t		cmd_argsize;
   1.470 +	int		(*cmd_getsize)(void *arg, size_t *sz);
   1.471 +} pfm_cmd_desc_t;
   1.472 +
   1.473 +#define PFM_CMD_FD		0x01	/* command requires a file descriptor */
   1.474 +#define PFM_CMD_ARG_READ	0x02	/* command must read argument(s) */
   1.475 +#define PFM_CMD_ARG_RW		0x04	/* command must read/write argument(s) */
   1.476 +#define PFM_CMD_STOP		0x08	/* command does not work on zombie context */
   1.477 +
   1.478 +
   1.479 +#define PFM_CMD_NAME(cmd)	pfm_cmd_tab[(cmd)].cmd_name
   1.480 +#define PFM_CMD_READ_ARG(cmd)	(pfm_cmd_tab[(cmd)].cmd_flags & PFM_CMD_ARG_READ)
   1.481 +#define PFM_CMD_RW_ARG(cmd)	(pfm_cmd_tab[(cmd)].cmd_flags & PFM_CMD_ARG_RW)
   1.482 +#define PFM_CMD_USE_FD(cmd)	(pfm_cmd_tab[(cmd)].cmd_flags & PFM_CMD_FD)
   1.483 +#define PFM_CMD_STOPPED(cmd)	(pfm_cmd_tab[(cmd)].cmd_flags & PFM_CMD_STOP)
   1.484 +
   1.485 +#define PFM_CMD_ARG_MANY	-1 /* cannot be zero */
   1.486 +
   1.487 +typedef struct {
   1.488 +	unsigned long pfm_spurious_ovfl_intr_count;	/* keep track of spurious ovfl interrupts */
   1.489 +	unsigned long pfm_replay_ovfl_intr_count;	/* keep track of replayed ovfl interrupts */
   1.490 +	unsigned long pfm_ovfl_intr_count; 		/* keep track of ovfl interrupts */
   1.491 +	unsigned long pfm_ovfl_intr_cycles;		/* cycles spent processing ovfl interrupts */
   1.492 +	unsigned long pfm_ovfl_intr_cycles_min;		/* min cycles spent processing ovfl interrupts */
   1.493 +	unsigned long pfm_ovfl_intr_cycles_max;		/* max cycles spent processing ovfl interrupts */
   1.494 +	unsigned long pfm_smpl_handler_calls;
   1.495 +	unsigned long pfm_smpl_handler_cycles;
   1.496 +	char pad[SMP_CACHE_BYTES] ____cacheline_aligned;
   1.497 +} pfm_stats_t;
   1.498 +
   1.499 +/*
   1.500 + * perfmon internal variables
   1.501 + */
   1.502 +static pfm_stats_t		pfm_stats[NR_CPUS];
   1.503 +static pfm_session_t		pfm_sessions;	/* global sessions information */
   1.504 +
   1.505 +static DEFINE_SPINLOCK(pfm_alt_install_check);
   1.506 +static pfm_intr_handler_desc_t  *pfm_alt_intr_handler;
   1.507 +
   1.508 +static struct proc_dir_entry 	*perfmon_dir;
   1.509 +static pfm_uuid_t		pfm_null_uuid = {0,};
   1.510 +
   1.511 +static spinlock_t		pfm_buffer_fmt_lock;
   1.512 +static LIST_HEAD(pfm_buffer_fmt_list);
   1.513 +
   1.514 +static pmu_config_t		*pmu_conf;
   1.515 +
   1.516 +/* sysctl() controls */
   1.517 +pfm_sysctl_t pfm_sysctl;
   1.518 +EXPORT_SYMBOL(pfm_sysctl);
   1.519 +
   1.520 +static ctl_table pfm_ctl_table[]={
   1.521 +	{1, "debug", &pfm_sysctl.debug, sizeof(int), 0666, NULL, &proc_dointvec, NULL,},
   1.522 +	{2, "debug_ovfl", &pfm_sysctl.debug_ovfl, sizeof(int), 0666, NULL, &proc_dointvec, NULL,},
   1.523 +	{3, "fastctxsw", &pfm_sysctl.fastctxsw, sizeof(int), 0600, NULL, &proc_dointvec, NULL,},
   1.524 +	{4, "expert_mode", &pfm_sysctl.expert_mode, sizeof(int), 0600, NULL, &proc_dointvec, NULL,},
   1.525 +	{ 0, },
   1.526 +};
   1.527 +static ctl_table pfm_sysctl_dir[] = {
   1.528 +	{1, "perfmon", NULL, 0, 0755, pfm_ctl_table, },
   1.529 + 	{0,},
   1.530 +};
   1.531 +static ctl_table pfm_sysctl_root[] = {
   1.532 +	{1, "kernel", NULL, 0, 0755, pfm_sysctl_dir, },
   1.533 + 	{0,},
   1.534 +};
   1.535 +static struct ctl_table_header *pfm_sysctl_header;
   1.536 +
   1.537 +static int pfm_context_unload(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs);
   1.538 +static int pfm_flush(struct file *filp);
   1.539 +
   1.540 +#define pfm_get_cpu_var(v)		__ia64_per_cpu_var(v)
   1.541 +#define pfm_get_cpu_data(a,b)		per_cpu(a, b)
   1.542 +
   1.543 +static inline void
   1.544 +pfm_put_task(struct task_struct *task)
   1.545 +{
   1.546 +	if (task != current) put_task_struct(task);
   1.547 +}
   1.548 +
   1.549 +static inline void
   1.550 +pfm_set_task_notify(struct task_struct *task)
   1.551 +{
   1.552 +	struct thread_info *info;
   1.553 +
   1.554 +	info = (struct thread_info *) ((char *) task + IA64_TASK_SIZE);
   1.555 +	set_bit(TIF_NOTIFY_RESUME, &info->flags);
   1.556 +}
   1.557 +
   1.558 +static inline void
   1.559 +pfm_clear_task_notify(void)
   1.560 +{
   1.561 +	clear_thread_flag(TIF_NOTIFY_RESUME);
   1.562 +}
   1.563 +
   1.564 +static inline void
   1.565 +pfm_reserve_page(unsigned long a)
   1.566 +{
   1.567 +	SetPageReserved(vmalloc_to_page((void *)a));
   1.568 +}
   1.569 +static inline void
   1.570 +pfm_unreserve_page(unsigned long a)
   1.571 +{
   1.572 +	ClearPageReserved(vmalloc_to_page((void*)a));
   1.573 +}
   1.574 +
   1.575 +static inline unsigned long
   1.576 +pfm_protect_ctx_ctxsw(pfm_context_t *x)
   1.577 +{
   1.578 +	spin_lock(&(x)->ctx_lock);
   1.579 +	return 0UL;
   1.580 +}
   1.581 +
   1.582 +static inline void
   1.583 +pfm_unprotect_ctx_ctxsw(pfm_context_t *x, unsigned long f)
   1.584 +{
   1.585 +	spin_unlock(&(x)->ctx_lock);
   1.586 +}
   1.587 +
   1.588 +static inline unsigned int
   1.589 +pfm_do_munmap(struct mm_struct *mm, unsigned long addr, size_t len, int acct)
   1.590 +{
   1.591 +	return do_munmap(mm, addr, len);
   1.592 +}
   1.593 +
   1.594 +static inline unsigned long 
   1.595 +pfm_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, unsigned long exec)
   1.596 +{
   1.597 +	return get_unmapped_area(file, addr, len, pgoff, flags);
   1.598 +}
   1.599 +
   1.600 +
   1.601 +static struct super_block *
   1.602 +pfmfs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data)
   1.603 +{
   1.604 +	return get_sb_pseudo(fs_type, "pfm:", NULL, PFMFS_MAGIC);
   1.605 +}
   1.606 +
   1.607 +static struct file_system_type pfm_fs_type = {
   1.608 +	.name     = "pfmfs",
   1.609 +	.get_sb   = pfmfs_get_sb,
   1.610 +	.kill_sb  = kill_anon_super,
   1.611 +};
   1.612 +
   1.613 +DEFINE_PER_CPU(unsigned long, pfm_syst_info);
   1.614 +DEFINE_PER_CPU(struct task_struct *, pmu_owner);
   1.615 +DEFINE_PER_CPU(pfm_context_t  *, pmu_ctx);
   1.616 +DEFINE_PER_CPU(unsigned long, pmu_activation_number);
   1.617 +EXPORT_PER_CPU_SYMBOL_GPL(pfm_syst_info);
   1.618 +
   1.619 +
   1.620 +/* forward declaration */
   1.621 +static struct file_operations pfm_file_ops;
   1.622 +
   1.623 +/*
   1.624 + * forward declarations
   1.625 + */
   1.626 +#ifndef CONFIG_SMP
   1.627 +static void pfm_lazy_save_regs (struct task_struct *ta);
   1.628 +#endif
   1.629 +
   1.630 +void dump_pmu_state(const char *);
   1.631 +static int pfm_write_ibr_dbr(int mode, pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs);
   1.632 +
   1.633 +#include "perfmon_itanium.h"
   1.634 +#include "perfmon_mckinley.h"
   1.635 +#include "perfmon_montecito.h"
   1.636 +#include "perfmon_generic.h"
   1.637 +
   1.638 +static pmu_config_t *pmu_confs[]={
   1.639 +	&pmu_conf_mont,
   1.640 +	&pmu_conf_mck,
   1.641 +	&pmu_conf_ita,
   1.642 +	&pmu_conf_gen, /* must be last */
   1.643 +	NULL
   1.644 +};
   1.645 +
   1.646 +
   1.647 +static int pfm_end_notify_user(pfm_context_t *ctx);
   1.648 +
   1.649 +static inline void
   1.650 +pfm_clear_psr_pp(void)
   1.651 +{
   1.652 +	ia64_rsm(IA64_PSR_PP);
   1.653 +	ia64_srlz_i();
   1.654 +}
   1.655 +
   1.656 +static inline void
   1.657 +pfm_set_psr_pp(void)
   1.658 +{
   1.659 +	ia64_ssm(IA64_PSR_PP);
   1.660 +	ia64_srlz_i();
   1.661 +}
   1.662 +
   1.663 +static inline void
   1.664 +pfm_clear_psr_up(void)
   1.665 +{
   1.666 +	ia64_rsm(IA64_PSR_UP);
   1.667 +	ia64_srlz_i();
   1.668 +}
   1.669 +
   1.670 +static inline void
   1.671 +pfm_set_psr_up(void)
   1.672 +{
   1.673 +	ia64_ssm(IA64_PSR_UP);
   1.674 +	ia64_srlz_i();
   1.675 +}
   1.676 +
   1.677 +static inline unsigned long
   1.678 +pfm_get_psr(void)
   1.679 +{
   1.680 +	unsigned long tmp;
   1.681 +	tmp = ia64_getreg(_IA64_REG_PSR);
   1.682 +	ia64_srlz_i();
   1.683 +	return tmp;
   1.684 +}
   1.685 +
   1.686 +static inline void
   1.687 +pfm_set_psr_l(unsigned long val)
   1.688 +{
   1.689 +	ia64_setreg(_IA64_REG_PSR_L, val);
   1.690 +	ia64_srlz_i();
   1.691 +}
   1.692 +
   1.693 +static inline void
   1.694 +pfm_freeze_pmu(void)
   1.695 +{
   1.696 +	ia64_set_pmc(0,1UL);
   1.697 +	ia64_srlz_d();
   1.698 +}
   1.699 +
   1.700 +static inline void
   1.701 +pfm_unfreeze_pmu(void)
   1.702 +{
   1.703 +	ia64_set_pmc(0,0UL);
   1.704 +	ia64_srlz_d();
   1.705 +}
   1.706 +
   1.707 +static inline void
   1.708 +pfm_restore_ibrs(unsigned long *ibrs, unsigned int nibrs)
   1.709 +{
   1.710 +	int i;
   1.711 +
   1.712 +	for (i=0; i < nibrs; i++) {
   1.713 +		ia64_set_ibr(i, ibrs[i]);
   1.714 +		ia64_dv_serialize_instruction();
   1.715 +	}
   1.716 +	ia64_srlz_i();
   1.717 +}
   1.718 +
   1.719 +static inline void
   1.720 +pfm_restore_dbrs(unsigned long *dbrs, unsigned int ndbrs)
   1.721 +{
   1.722 +	int i;
   1.723 +
   1.724 +	for (i=0; i < ndbrs; i++) {
   1.725 +		ia64_set_dbr(i, dbrs[i]);
   1.726 +		ia64_dv_serialize_data();
   1.727 +	}
   1.728 +	ia64_srlz_d();
   1.729 +}
   1.730 +
   1.731 +/*
   1.732 + * PMD[i] must be a counter. no check is made
   1.733 + */
   1.734 +static inline unsigned long
   1.735 +pfm_read_soft_counter(pfm_context_t *ctx, int i)
   1.736 +{
   1.737 +	return ctx->ctx_pmds[i].val + (ia64_get_pmd(i) & pmu_conf->ovfl_val);
   1.738 +}
   1.739 +
   1.740 +/*
   1.741 + * PMD[i] must be a counter. no check is made
   1.742 + */
   1.743 +static inline void
   1.744 +pfm_write_soft_counter(pfm_context_t *ctx, int i, unsigned long val)
   1.745 +{
   1.746 +	unsigned long ovfl_val = pmu_conf->ovfl_val;
   1.747 +
   1.748 +	ctx->ctx_pmds[i].val = val  & ~ovfl_val;
   1.749 +	/*
   1.750 +	 * writing to the unimplemented part is ignored, so we do not need to
   1.751 +	 * mask off top part
   1.752 +	 */
   1.753 +	ia64_set_pmd(i, val & ovfl_val);
   1.754 +}
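/*
 * Worked example of the split above (illustrative, assuming a PMU whose
 * counters are 47 bits wide, i.e. ovfl_val == (1UL<<47)-1): writing
 * val = 0x1000000001234 stores the upper bits (0x1000000000000) in
 * ctx->ctx_pmds[i].val and the low 47 bits (0x1234) in the hardware PMD.
 * pfm_read_soft_counter() later adds the two parts back together, so the
 * counter behaves as a full 64-bit value even though the hardware register
 * is narrower.
 */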
   1.755 +
   1.756 +static pfm_msg_t *
   1.757 +pfm_get_new_msg(pfm_context_t *ctx)
   1.758 +{
   1.759 +	int idx, next;
   1.760 +
   1.761 +	next = (ctx->ctx_msgq_tail+1) % PFM_MAX_MSGS;
   1.762 +
   1.763 +	DPRINT(("ctx_fd=%p head=%d tail=%d\n", ctx, ctx->ctx_msgq_head, ctx->ctx_msgq_tail));
   1.764 +	if (next == ctx->ctx_msgq_head) return NULL;
   1.765 +
   1.766 + 	idx = 	ctx->ctx_msgq_tail;
   1.767 +	ctx->ctx_msgq_tail = next;
   1.768 +
   1.769 +	DPRINT(("ctx=%p head=%d tail=%d msg=%d\n", ctx, ctx->ctx_msgq_head, ctx->ctx_msgq_tail, idx));
   1.770 +
   1.771 +	return ctx->ctx_msgq+idx;
   1.772 +}
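/*
 * Note on capacity: with PFM_MAX_MSGS == 32 the queue above holds at most
 * 31 pending messages, because pfm_get_new_msg() refuses to advance the
 * tail onto the head; one slot is sacrificed so that a full queue can be
 * told apart from the PFM_CTXQ_EMPTY() (head == tail) case.
 */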
   1.773 +
   1.774 +static pfm_msg_t *
   1.775 +pfm_get_next_msg(pfm_context_t *ctx)
   1.776 +{
   1.777 +	pfm_msg_t *msg;
   1.778 +
   1.779 +	DPRINT(("ctx=%p head=%d tail=%d\n", ctx, ctx->ctx_msgq_head, ctx->ctx_msgq_tail));
   1.780 +
   1.781 +	if (PFM_CTXQ_EMPTY(ctx)) return NULL;
   1.782 +
   1.783 +	/*
   1.784 +	 * get oldest message
   1.785 +	 */
   1.786 +	msg = ctx->ctx_msgq+ctx->ctx_msgq_head;
   1.787 +
   1.788 +	/*
   1.789 +	 * and move forward
   1.790 +	 */
   1.791 +	ctx->ctx_msgq_head = (ctx->ctx_msgq_head+1) % PFM_MAX_MSGS;
   1.792 +
   1.793 +	DPRINT(("ctx=%p head=%d tail=%d type=%d\n", ctx, ctx->ctx_msgq_head, ctx->ctx_msgq_tail, msg->pfm_gen_msg.msg_type));
   1.794 +
   1.795 +	return msg;
   1.796 +}
   1.797 +
   1.798 +static void
   1.799 +pfm_reset_msgq(pfm_context_t *ctx)
   1.800 +{
   1.801 +	ctx->ctx_msgq_head = ctx->ctx_msgq_tail = 0;
   1.802 +	DPRINT(("ctx=%p msgq reset\n", ctx));
   1.803 +}
   1.804 +
   1.805 +static void *
   1.806 +pfm_rvmalloc(unsigned long size)
   1.807 +{
   1.808 +	void *mem;
   1.809 +	unsigned long addr;
   1.810 +
   1.811 +	size = PAGE_ALIGN(size);
   1.812 +	mem  = vmalloc(size);
   1.813 +	if (mem) {
   1.814 +		//printk("perfmon: CPU%d pfm_rvmalloc(%ld)=%p\n", smp_processor_id(), size, mem);
   1.815 +		memset(mem, 0, size);
   1.816 +		addr = (unsigned long)mem;
   1.817 +		while (size > 0) {
   1.818 +			pfm_reserve_page(addr);
   1.819 +			addr+=PAGE_SIZE;
   1.820 +			size-=PAGE_SIZE;
   1.821 +		}
   1.822 +	}
   1.823 +	return mem;
   1.824 +}
   1.825 +
   1.826 +static void
   1.827 +pfm_rvfree(void *mem, unsigned long size)
   1.828 +{
   1.829 +	unsigned long addr;
   1.830 +
   1.831 +	if (mem) {
   1.832 +		DPRINT(("freeing physical buffer @%p size=%lu\n", mem, size));
   1.833 +		addr = (unsigned long) mem;
   1.834 +		while ((long) size > 0) {
   1.835 +			pfm_unreserve_page(addr);
   1.836 +			addr+=PAGE_SIZE;
   1.837 +			size-=PAGE_SIZE;
   1.838 +		}
   1.839 +		vfree(mem);
   1.840 +	}
   1.841 +	return;
   1.842 +}
   1.843 +
   1.844 +static pfm_context_t *
   1.845 +pfm_context_alloc(void)
   1.846 +{
   1.847 +	pfm_context_t *ctx;
   1.848 +
   1.849 +	/* 
   1.850 +	 * allocate context descriptor 
   1.851 +	 * must be able to free with interrupts disabled
   1.852 +	 */
   1.853 +	ctx = kmalloc(sizeof(pfm_context_t), GFP_KERNEL);
   1.854 +	if (ctx) {
   1.855 +		memset(ctx, 0, sizeof(pfm_context_t));
   1.856 +		DPRINT(("alloc ctx @%p\n", ctx));
   1.857 +	}
   1.858 +	return ctx;
   1.859 +}
   1.860 +
   1.861 +static void
   1.862 +pfm_context_free(pfm_context_t *ctx)
   1.863 +{
   1.864 +	if (ctx) {
   1.865 +		DPRINT(("free ctx @%p\n", ctx));
   1.866 +		kfree(ctx);
   1.867 +	}
   1.868 +}
   1.869 +
   1.870 +static void
   1.871 +pfm_mask_monitoring(struct task_struct *task)
   1.872 +{
   1.873 +	pfm_context_t *ctx = PFM_GET_CTX(task);
   1.874 +	struct thread_struct *th = &task->thread;
   1.875 +	unsigned long mask, val, ovfl_mask;
   1.876 +	int i;
   1.877 +
   1.878 +	DPRINT_ovfl(("masking monitoring for [%d]\n", task->pid));
   1.879 +
   1.880 +	ovfl_mask = pmu_conf->ovfl_val;
   1.881 +	/*
   1.882 +	 * monitoring can only be masked as a result of a valid
   1.883 +	 * counter overflow. In UP, it means that the PMU still
   1.884 +	 * has an owner. Note that the owner can be different
   1.885 +	 * from the current task. However the PMU state belongs
   1.886 +	 * to the owner.
   1.887 +	 * In SMP, a valid overflow only happens when task is
   1.888 +	 * current. Therefore if we come here, we know that
   1.889 +	 * the PMU state belongs to the current task, therefore
   1.890 +	 * we can access the live registers.
   1.891 +	 *
   1.892 +	 * So in both cases, the live register contains the owner's
   1.893 +	 * state. We can ONLY touch the PMU registers and NOT the PSR.
   1.894 +	 *
   1.895 +	 * As a consequence of this call, the thread->pmds[] array
   1.896 +	 * contains stale information which must be ignored
   1.897 +	 * when context is reloaded AND monitoring is active (see
   1.898 +	 * pfm_restart).
   1.899 +	 */
   1.900 +	mask = ctx->ctx_used_pmds[0];
   1.901 +	for (i = 0; mask; i++, mask>>=1) {
   1.902 +		/* skip non used pmds */
   1.903 +		if ((mask & 0x1) == 0) continue;
   1.904 +		val = ia64_get_pmd(i);
   1.905 +
   1.906 +		if (PMD_IS_COUNTING(i)) {
   1.907 +			/*
   1.908 +		 	 * we rebuild the full 64 bit value of the counter
   1.909 +		 	 */
   1.910 +			ctx->ctx_pmds[i].val += (val & ovfl_mask);
   1.911 +		} else {
   1.912 +			ctx->ctx_pmds[i].val = val;
   1.913 +		}
   1.914 +		DPRINT_ovfl(("pmd[%d]=0x%lx hw_pmd=0x%lx\n",
   1.915 +			i,
   1.916 +			ctx->ctx_pmds[i].val,
   1.917 +			val & ovfl_mask));
   1.918 +	}
   1.919 +	/*
   1.920 +	 * mask monitoring by setting the privilege level to 0
   1.921 +	 * we cannot use psr.pp/psr.up for this, it is controlled by
   1.922 +	 * the user
   1.923 +	 *
   1.924 +	 * if task is current, modify actual registers, otherwise modify
   1.925 +	 * thread save state, i.e., what will be restored in pfm_load_regs()
   1.926 +	 */
   1.927 +	mask = ctx->ctx_used_monitors[0] >> PMU_FIRST_COUNTER;
   1.928 +	for(i= PMU_FIRST_COUNTER; mask; i++, mask>>=1) {
   1.929 +		if ((mask & 0x1) == 0UL) continue;
   1.930 +		ia64_set_pmc(i, th->pmcs[i] & ~0xfUL);
   1.931 +		th->pmcs[i] &= ~0xfUL;
   1.932 +		DPRINT_ovfl(("pmc[%d]=0x%lx\n", i, th->pmcs[i]));
   1.933 +	}
   1.934 +	/*
   1.935 +	 * make all of this visible
   1.936 +	 */
   1.937 +	ia64_srlz_d();
   1.938 +}
   1.939 +
   1.940 +/*
   1.941 + * must always be done with task == current
   1.942 + *
   1.943 + * context must be in MASKED state when calling
   1.944 + */
   1.945 +static void
   1.946 +pfm_restore_monitoring(struct task_struct *task)
   1.947 +{
   1.948 +	pfm_context_t *ctx = PFM_GET_CTX(task);
   1.949 +	struct thread_struct *th = &task->thread;
   1.950 +	unsigned long mask, ovfl_mask;
   1.951 +	unsigned long psr, val;
   1.952 +	int i, is_system;
   1.953 +
   1.954 +	is_system = ctx->ctx_fl_system;
   1.955 +	ovfl_mask = pmu_conf->ovfl_val;
   1.956 +
   1.957 +	if (task != current) {
   1.958 +		printk(KERN_ERR "perfmon.%d: invalid task[%d] current[%d]\n", __LINE__, task->pid, current->pid);
   1.959 +		return;
   1.960 +	}
   1.961 +	if (ctx->ctx_state != PFM_CTX_MASKED) {
   1.962 +		printk(KERN_ERR "perfmon.%d: task[%d] current[%d] invalid state=%d\n", __LINE__,
   1.963 +			task->pid, current->pid, ctx->ctx_state);
   1.964 +		return;
   1.965 +	}
   1.966 +	psr = pfm_get_psr();
   1.967 +	/*
   1.968 +	 * monitoring is masked via the PMC.
   1.969 +	 * As we restore their value, we do not want each counter to
   1.970 +	 * restart right away. We stop monitoring using the PSR,
   1.971 +	 * restore the PMC (and PMD) and then re-establish the psr
   1.972 +	 * as it was. Note that there can be no pending overflow at
   1.973 +	 * this point, because monitoring was MASKED.
   1.974 +	 *
   1.975 +	 * system-wide sessions are pinned and self-monitoring
   1.976 +	 */
   1.977 +	if (is_system && (PFM_CPUINFO_GET() & PFM_CPUINFO_DCR_PP)) {
   1.978 +		/* disable dcr pp */
   1.979 +		ia64_setreg(_IA64_REG_CR_DCR, ia64_getreg(_IA64_REG_CR_DCR) & ~IA64_DCR_PP);
   1.980 +		pfm_clear_psr_pp();
   1.981 +	} else {
   1.982 +		pfm_clear_psr_up();
   1.983 +	}
   1.984 +	/*
   1.985 +	 * first, we restore the PMD
   1.986 +	 */
   1.987 +	mask = ctx->ctx_used_pmds[0];
   1.988 +	for (i = 0; mask; i++, mask>>=1) {
   1.989 +		/* skip non used pmds */
   1.990 +		if ((mask & 0x1) == 0) continue;
   1.991 +
   1.992 +		if (PMD_IS_COUNTING(i)) {
   1.993 +			/*
   1.994 +			 * we split the 64bit value according to
   1.995 +			 * counter width
   1.996 +			 */
   1.997 +			val = ctx->ctx_pmds[i].val & ovfl_mask;
   1.998 +			ctx->ctx_pmds[i].val &= ~ovfl_mask;
   1.999 +		} else {
  1.1000 +			val = ctx->ctx_pmds[i].val;
  1.1001 +		}
  1.1002 +		ia64_set_pmd(i, val);
  1.1003 +
  1.1004 +		DPRINT(("pmd[%d]=0x%lx hw_pmd=0x%lx\n",
  1.1005 +			i,
  1.1006 +			ctx->ctx_pmds[i].val,
  1.1007 +			val));
  1.1008 +	}
  1.1009 +	/*
  1.1010 +	 * restore the PMCs
  1.1011 +	 */
  1.1012 +	mask = ctx->ctx_used_monitors[0] >> PMU_FIRST_COUNTER;
  1.1013 +	for(i= PMU_FIRST_COUNTER; mask; i++, mask>>=1) {
  1.1014 +		if ((mask & 0x1) == 0UL) continue;
  1.1015 +		th->pmcs[i] = ctx->ctx_pmcs[i];
  1.1016 +		ia64_set_pmc(i, th->pmcs[i]);
  1.1017 +		DPRINT(("[%d] pmc[%d]=0x%lx\n", task->pid, i, th->pmcs[i]));
  1.1018 +	}
  1.1019 +	ia64_srlz_d();
  1.1020 +
  1.1021 +	/*
  1.1022 +	 * must restore DBR/IBR because could be modified while masked
  1.1023 +	 * XXX: need to optimize 
  1.1024 +	 */
  1.1025 +	if (ctx->ctx_fl_using_dbreg) {
  1.1026 +		pfm_restore_ibrs(ctx->ctx_ibrs, pmu_conf->num_ibrs);
  1.1027 +		pfm_restore_dbrs(ctx->ctx_dbrs, pmu_conf->num_dbrs);
  1.1028 +	}
  1.1029 +
  1.1030 +	/*
  1.1031 +	 * now restore PSR
  1.1032 +	 */
  1.1033 +	if (is_system && (PFM_CPUINFO_GET() & PFM_CPUINFO_DCR_PP)) {
  1.1034 +		/* enable dcr pp */
  1.1035 +		ia64_setreg(_IA64_REG_CR_DCR, ia64_getreg(_IA64_REG_CR_DCR) | IA64_DCR_PP);
  1.1036 +		ia64_srlz_i();
  1.1037 +	}
  1.1038 +	pfm_set_psr_l(psr);
  1.1039 +}
  1.1040 +
  1.1041 +static inline void
  1.1042 +pfm_save_pmds(unsigned long *pmds, unsigned long mask)
  1.1043 +{
  1.1044 +	int i;
  1.1045 +
  1.1046 +	ia64_srlz_d();
  1.1047 +
  1.1048 +	for (i=0; mask; i++, mask>>=1) {
  1.1049 +		if (mask & 0x1) pmds[i] = ia64_get_pmd(i);
  1.1050 +	}
  1.1051 +}
  1.1052 +
  1.1053 +/*
  1.1054 + * reload from thread state (used for ctxsw only)
  1.1055 + */
  1.1056 +static inline void
  1.1057 +pfm_restore_pmds(unsigned long *pmds, unsigned long mask)
  1.1058 +{
  1.1059 +	int i;
  1.1060 +	unsigned long val, ovfl_val = pmu_conf->ovfl_val;
  1.1061 +
  1.1062 +	for (i=0; mask; i++, mask>>=1) {
  1.1063 +		if ((mask & 0x1) == 0) continue;
  1.1064 +		val = PMD_IS_COUNTING(i) ? pmds[i] & ovfl_val : pmds[i];
  1.1065 +		ia64_set_pmd(i, val);
  1.1066 +	}
  1.1067 +	ia64_srlz_d();
  1.1068 +}
  1.1069 +
  1.1070 +/*
  1.1071 + * propagate PMD from context to thread-state
  1.1072 + */
  1.1073 +static inline void
  1.1074 +pfm_copy_pmds(struct task_struct *task, pfm_context_t *ctx)
  1.1075 +{
  1.1076 +	struct thread_struct *thread = &task->thread;
  1.1077 +	unsigned long ovfl_val = pmu_conf->ovfl_val;
  1.1078 +	unsigned long mask = ctx->ctx_all_pmds[0];
  1.1079 +	unsigned long val;
  1.1080 +	int i;
  1.1081 +
  1.1082 +	DPRINT(("mask=0x%lx\n", mask));
  1.1083 +
  1.1084 +	for (i=0; mask; i++, mask>>=1) {
  1.1085 +
  1.1086 +		val = ctx->ctx_pmds[i].val;
  1.1087 +
  1.1088 +		/*
  1.1089 +		 * We break up the 64 bit value into 2 pieces:
  1.1090 +		 * the lower bits go to the machine state in the
  1.1091 +		 * thread (will be reloaded on ctxsw in).
  1.1092 +		 * The upper part stays in the soft-counter.
  1.1093 +		 */
  1.1094 +		if (PMD_IS_COUNTING(i)) {
  1.1095 +			ctx->ctx_pmds[i].val = val & ~ovfl_val;
  1.1096 +			 val &= ovfl_val;
  1.1097 +		}
  1.1098 +		thread->pmds[i] = val;
  1.1099 +
  1.1100 +		DPRINT(("pmd[%d]=0x%lx soft_val=0x%lx\n",
  1.1101 +			i,
  1.1102 +			thread->pmds[i],
  1.1103 +			ctx->ctx_pmds[i].val));
  1.1104 +	}
  1.1105 +}
  1.1106 +
  1.1107 +/*
  1.1108 + * propagate PMC from context to thread-state
  1.1109 + */
  1.1110 +static inline void
  1.1111 +pfm_copy_pmcs(struct task_struct *task, pfm_context_t *ctx)
  1.1112 +{
  1.1113 +	struct thread_struct *thread = &task->thread;
  1.1114 +	unsigned long mask = ctx->ctx_all_pmcs[0];
  1.1115 +	int i;
  1.1116 +
  1.1117 +	DPRINT(("mask=0x%lx\n", mask));
  1.1118 +
  1.1119 +	for (i=0; mask; i++, mask>>=1) {
  1.1120 +		/* masking 0 with ovfl_val yields 0 */
  1.1121 +		thread->pmcs[i] = ctx->ctx_pmcs[i];
  1.1122 +		DPRINT(("pmc[%d]=0x%lx\n", i, thread->pmcs[i]));
  1.1123 +	}
  1.1124 +}
  1.1125 +
  1.1126 +
  1.1127 +
  1.1128 +static inline void
  1.1129 +pfm_restore_pmcs(unsigned long *pmcs, unsigned long mask)
  1.1130 +{
  1.1131 +	int i;
  1.1132 +
  1.1133 +	for (i=0; mask; i++, mask>>=1) {
  1.1134 +		if ((mask & 0x1) == 0) continue;
  1.1135 +		ia64_set_pmc(i, pmcs[i]);
  1.1136 +	}
  1.1137 +	ia64_srlz_d();
  1.1138 +}
  1.1139 +
  1.1140 +static inline int
  1.1141 +pfm_uuid_cmp(pfm_uuid_t a, pfm_uuid_t b)
  1.1142 +{
  1.1143 +	return memcmp(a, b, sizeof(pfm_uuid_t));
  1.1144 +}
  1.1145 +
  1.1146 +static inline int
  1.1147 +pfm_buf_fmt_exit(pfm_buffer_fmt_t *fmt, struct task_struct *task, void *buf, struct pt_regs *regs)
  1.1148 +{
  1.1149 +	int ret = 0;
  1.1150 +	if (fmt->fmt_exit) ret = (*fmt->fmt_exit)(task, buf, regs);
  1.1151 +	return ret;
  1.1152 +}
  1.1153 +
  1.1154 +static inline int
  1.1155 +pfm_buf_fmt_getsize(pfm_buffer_fmt_t *fmt, struct task_struct *task, unsigned int flags, int cpu, void *arg, unsigned long *size)
  1.1156 +{
  1.1157 +	int ret = 0;
  1.1158 +	if (fmt->fmt_getsize) ret = (*fmt->fmt_getsize)(task, flags, cpu, arg, size);
  1.1159 +	return ret;
  1.1160 +}
  1.1161 +
  1.1162 +
  1.1163 +static inline int
  1.1164 +pfm_buf_fmt_validate(pfm_buffer_fmt_t *fmt, struct task_struct *task, unsigned int flags,
  1.1165 +		     int cpu, void *arg)
  1.1166 +{
  1.1167 +	int ret = 0;
  1.1168 +	if (fmt->fmt_validate) ret = (*fmt->fmt_validate)(task, flags, cpu, arg);
  1.1169 +	return ret;
  1.1170 +}
  1.1171 +
  1.1172 +static inline int
  1.1173 +pfm_buf_fmt_init(pfm_buffer_fmt_t *fmt, struct task_struct *task, void *buf, unsigned int flags,
  1.1174 +		     int cpu, void *arg)
  1.1175 +{
  1.1176 +	int ret = 0;
  1.1177 +	if (fmt->fmt_init) ret = (*fmt->fmt_init)(task, buf, flags, cpu, arg);
  1.1178 +	return ret;
  1.1179 +}
  1.1180 +
  1.1181 +static inline int
  1.1182 +pfm_buf_fmt_restart(pfm_buffer_fmt_t *fmt, struct task_struct *task, pfm_ovfl_ctrl_t *ctrl, void *buf, struct pt_regs *regs)
  1.1183 +{
  1.1184 +	int ret = 0;
  1.1185 +	if (fmt->fmt_restart) ret = (*fmt->fmt_restart)(task, ctrl, buf, regs);
  1.1186 +	return ret;
  1.1187 +}
  1.1188 +
  1.1189 +static inline int
  1.1190 +pfm_buf_fmt_restart_active(pfm_buffer_fmt_t *fmt, struct task_struct *task, pfm_ovfl_ctrl_t *ctrl, void *buf, struct pt_regs *regs)
  1.1191 +{
  1.1192 +	int ret = 0;
  1.1193 +	if (fmt->fmt_restart_active) ret = (*fmt->fmt_restart_active)(task, ctrl, buf, regs);
  1.1194 +	return ret;
  1.1195 +}
  1.1196 +
  1.1197 +static pfm_buffer_fmt_t *
  1.1198 +__pfm_find_buffer_fmt(pfm_uuid_t uuid)
  1.1199 +{
  1.1200 +	struct list_head * pos;
  1.1201 +	pfm_buffer_fmt_t * entry;
  1.1202 +
  1.1203 +	list_for_each(pos, &pfm_buffer_fmt_list) {
  1.1204 +		entry = list_entry(pos, pfm_buffer_fmt_t, fmt_list);
  1.1205 +		if (pfm_uuid_cmp(uuid, entry->fmt_uuid) == 0)
  1.1206 +			return entry;
  1.1207 +	}
  1.1208 +	return NULL;
  1.1209 +}
  1.1210 + 
  1.1211 +/*
  1.1212 + * find a buffer format based on its uuid
  1.1213 + */
  1.1214 +static pfm_buffer_fmt_t *
  1.1215 +pfm_find_buffer_fmt(pfm_uuid_t uuid)
  1.1216 +{
  1.1217 +	pfm_buffer_fmt_t * fmt;
  1.1218 +	spin_lock(&pfm_buffer_fmt_lock);
  1.1219 +	fmt = __pfm_find_buffer_fmt(uuid);
  1.1220 +	spin_unlock(&pfm_buffer_fmt_lock);
  1.1221 +	return fmt;
  1.1222 +}
  1.1223 + 
  1.1224 +int
  1.1225 +pfm_register_buffer_fmt(pfm_buffer_fmt_t *fmt)
  1.1226 +{
  1.1227 +	int ret = 0;
  1.1228 +
  1.1229 +	/* some sanity checks */
  1.1230 +	if (fmt == NULL || fmt->fmt_name == NULL) return -EINVAL;
  1.1231 +
  1.1232 +	/* we need at least a handler */
  1.1233 +	if (fmt->fmt_handler == NULL) return -EINVAL;
  1.1234 +
  1.1235 +	/*
  1.1236 +	 * XXX: need check validity of fmt_arg_size
  1.1237 +	 */
  1.1238 +
  1.1239 +	spin_lock(&pfm_buffer_fmt_lock);
  1.1240 +
  1.1241 +	if (__pfm_find_buffer_fmt(fmt->fmt_uuid)) {
  1.1242 +		printk(KERN_ERR "perfmon: duplicate sampling format: %s\n", fmt->fmt_name);
  1.1243 +		ret = -EBUSY;
  1.1244 +		goto out;
  1.1245 +	} 
  1.1246 +	list_add(&fmt->fmt_list, &pfm_buffer_fmt_list);
  1.1247 +	printk(KERN_INFO "perfmon: added sampling format %s\n", fmt->fmt_name);
  1.1248 +
  1.1249 +out:
  1.1250 +	spin_unlock(&pfm_buffer_fmt_lock);
  1.1251 + 	return ret;
  1.1252 +}
  1.1253 +EXPORT_SYMBOL(pfm_register_buffer_fmt);
  1.1254 +
  1.1255 +int
  1.1256 +pfm_unregister_buffer_fmt(pfm_uuid_t uuid)
  1.1257 +{
  1.1258 +	pfm_buffer_fmt_t *fmt;
  1.1259 +	int ret = 0;
  1.1260 +
  1.1261 +	spin_lock(&pfm_buffer_fmt_lock);
  1.1262 +
  1.1263 +	fmt = __pfm_find_buffer_fmt(uuid);
  1.1264 +	if (!fmt) {
  1.1265 +		printk(KERN_ERR "perfmon: cannot unregister format, not found\n");
  1.1266 +		ret = -EINVAL;
  1.1267 +		goto out;
  1.1268 +	}
  1.1269 +	list_del_init(&fmt->fmt_list);
  1.1270 +	printk(KERN_INFO "perfmon: removed sampling format: %s\n", fmt->fmt_name);
  1.1271 +
  1.1272 +out:
  1.1273 +	spin_unlock(&pfm_buffer_fmt_lock);
  1.1274 +	return ret;
  1.1275 +
  1.1276 +}
  1.1277 +EXPORT_SYMBOL(pfm_unregister_buffer_fmt);
  1.1278 +
  1.1279 +extern void update_pal_halt_status(int);
  1.1280 +
  1.1281 +static int
  1.1282 +pfm_reserve_session(struct task_struct *task, int is_syswide, unsigned int cpu)
  1.1283 +{
  1.1284 +	unsigned long flags;
  1.1285 +	/*
  1.1286 +	 * validity checks on cpu_mask have been done upstream
  1.1287 +	 */
  1.1288 +	LOCK_PFS(flags);
  1.1289 +
  1.1290 +	DPRINT(("in sys_sessions=%u task_sessions=%u dbregs=%u syswide=%d cpu=%u\n",
  1.1291 +		pfm_sessions.pfs_sys_sessions,
  1.1292 +		pfm_sessions.pfs_task_sessions,
  1.1293 +		pfm_sessions.pfs_sys_use_dbregs,
  1.1294 +		is_syswide,
  1.1295 +		cpu));
  1.1296 +
  1.1297 +	if (is_syswide) {
  1.1298 +		/*
  1.1299 +		 * cannot mix system wide and per-task sessions
  1.1300 +		 */
  1.1301 +		if (pfm_sessions.pfs_task_sessions > 0UL) {
  1.1302 +			DPRINT(("system wide not possible, %u conflicting task_sessions\n",
  1.1303 +			  	pfm_sessions.pfs_task_sessions));
  1.1304 +			goto abort;
  1.1305 +		}
  1.1306 +
  1.1307 +		if (pfm_sessions.pfs_sys_session[cpu]) goto error_conflict;
  1.1308 +
  1.1309 +		DPRINT(("reserving system wide session on CPU%u currently on CPU%u\n", cpu, smp_processor_id()));
  1.1310 +
  1.1311 +		pfm_sessions.pfs_sys_session[cpu] = task;
  1.1312 +
  1.1313 +		pfm_sessions.pfs_sys_sessions++;
  1.1314 +
  1.1315 +	} else {
  1.1316 +		if (pfm_sessions.pfs_sys_sessions) goto abort;
  1.1317 +		pfm_sessions.pfs_task_sessions++;
  1.1318 +	}
  1.1319 +
  1.1320 +	DPRINT(("out sys_sessions=%u task_sessions=%u dbregs=%u syswide=%d cpu=%u\n",
  1.1321 +		pfm_sessions.pfs_sys_sessions,
  1.1322 +		pfm_sessions.pfs_task_sessions,
  1.1323 +		pfm_sessions.pfs_sys_use_dbregs,
  1.1324 +		is_syswide,
  1.1325 +		cpu));
  1.1326 +
  1.1327 +	/*
  1.1328 +	 * disable default_idle() to go to PAL_HALT
  1.1329 +	 */
  1.1330 +	update_pal_halt_status(0);
  1.1331 +
  1.1332 +	UNLOCK_PFS(flags);
  1.1333 +
  1.1334 +	return 0;
  1.1335 +
  1.1336 +error_conflict:
  1.1337 +	DPRINT(("system wide not possible, conflicting session [%d] on CPU%d\n",
  1.1338 +  		pfm_sessions.pfs_sys_session[cpu]->pid,
  1.1339 +		cpu));
  1.1340 +abort:
  1.1341 +	UNLOCK_PFS(flags);
  1.1342 +
  1.1343 +	return -EBUSY;
  1.1344 +
  1.1345 +}
  1.1346 +
  1.1347 +static int
  1.1348 +pfm_unreserve_session(pfm_context_t *ctx, int is_syswide, unsigned int cpu)
  1.1349 +{
  1.1350 +	unsigned long flags;
  1.1351 +	/*
  1.1352 +	 * validity checks on cpu_mask have been done upstream
  1.1353 +	 */
  1.1354 +	LOCK_PFS(flags);
  1.1355 +
  1.1356 +	DPRINT(("in sys_sessions=%u task_sessions=%u dbregs=%u syswide=%d cpu=%u\n",
  1.1357 +		pfm_sessions.pfs_sys_sessions,
  1.1358 +		pfm_sessions.pfs_task_sessions,
  1.1359 +		pfm_sessions.pfs_sys_use_dbregs,
  1.1360 +		is_syswide,
  1.1361 +		cpu));
  1.1362 +
  1.1363 +
  1.1364 +	if (is_syswide) {
  1.1365 +		pfm_sessions.pfs_sys_session[cpu] = NULL;
  1.1366 +		/*
  1.1367 +		 * would not work with perfmon+more than one bit in cpu_mask
  1.1368 +		 */
  1.1369 +		if (ctx && ctx->ctx_fl_using_dbreg) {
  1.1370 +			if (pfm_sessions.pfs_sys_use_dbregs == 0) {
  1.1371 +				printk(KERN_ERR "perfmon: invalid release for ctx %p sys_use_dbregs=0\n", ctx);
  1.1372 +			} else {
  1.1373 +				pfm_sessions.pfs_sys_use_dbregs--;
  1.1374 +			}
  1.1375 +		}
  1.1376 +		pfm_sessions.pfs_sys_sessions--;
  1.1377 +	} else {
  1.1378 +		pfm_sessions.pfs_task_sessions--;
  1.1379 +	}
  1.1380 +	DPRINT(("out sys_sessions=%u task_sessions=%u dbregs=%u syswide=%d cpu=%u\n",
  1.1381 +		pfm_sessions.pfs_sys_sessions,
  1.1382 +		pfm_sessions.pfs_task_sessions,
  1.1383 +		pfm_sessions.pfs_sys_use_dbregs,
  1.1384 +		is_syswide,
  1.1385 +		cpu));
  1.1386 +
  1.1387 +	/*
  1.1388 +	 * if possible, enable default_idle() to go into PAL_HALT
  1.1389 +	 */
  1.1390 +	if (pfm_sessions.pfs_task_sessions == 0 && pfm_sessions.pfs_sys_sessions == 0)
  1.1391 +		update_pal_halt_status(1);
  1.1392 +
  1.1393 +	UNLOCK_PFS(flags);
  1.1394 +
  1.1395 +	return 0;
  1.1396 +}
  1.1397 +
  1.1398 +/*
  1.1399 + * removes virtual mapping of the sampling buffer.
  1.1400 + * IMPORTANT: cannot be called with interrupts disabled, e.g. inside
  1.1401 + * a PROTECT_CTX() section.
  1.1402 + */
  1.1403 +static int
  1.1404 +pfm_remove_smpl_mapping(struct task_struct *task, void *vaddr, unsigned long size)
  1.1405 +{
  1.1406 +	int r;
  1.1407 +
  1.1408 +	/* sanity checks */
  1.1409 +	if (task->mm == NULL || size == 0UL || vaddr == NULL) {
  1.1410 +		printk(KERN_ERR "perfmon: pfm_remove_smpl_mapping [%d] invalid context mm=%p\n", task->pid, task->mm);
  1.1411 +		return -EINVAL;
  1.1412 +	}
  1.1413 +
  1.1414 +	DPRINT(("smpl_vaddr=%p size=%lu\n", vaddr, size));
  1.1415 +
  1.1416 +	/*
  1.1417 +	 * does the actual unmapping
  1.1418 +	 */
  1.1419 +	down_write(&task->mm->mmap_sem);
  1.1420 +
  1.1421 +	DPRINT(("down_write done smpl_vaddr=%p size=%lu\n", vaddr, size));
  1.1422 +
  1.1423 +	r = pfm_do_munmap(task->mm, (unsigned long)vaddr, size, 0);
  1.1424 +
  1.1425 +	up_write(&task->mm->mmap_sem);
  1.1426 +	if (r !=0) {
  1.1427 +		printk(KERN_ERR "perfmon: [%d] unable to unmap sampling buffer @%p size=%lu\n", task->pid, vaddr, size);
  1.1428 +	}
  1.1429 +
  1.1430 +	DPRINT(("do_unmap(%p, %lu)=%d\n", vaddr, size, r));
  1.1431 +
  1.1432 +	return 0;
  1.1433 +}
  1.1434 +
  1.1435 +/*
  1.1436 + * free actual physical storage used by sampling buffer
  1.1437 + */
  1.1438 +#if 0
  1.1439 +static int
  1.1440 +pfm_free_smpl_buffer(pfm_context_t *ctx)
  1.1441 +{
  1.1442 +	pfm_buffer_fmt_t *fmt;
  1.1443 +
  1.1444 +	if (ctx->ctx_smpl_hdr == NULL) goto invalid_free;
  1.1445 +
  1.1446 +	/*
  1.1447 +	 * we won't use the buffer format anymore
  1.1448 +	 */
  1.1449 +	fmt = ctx->ctx_buf_fmt;
  1.1450 +
  1.1451 +	DPRINT(("sampling buffer @%p size %lu vaddr=%p\n",
  1.1452 +		ctx->ctx_smpl_hdr,
  1.1453 +		ctx->ctx_smpl_size,
  1.1454 +		ctx->ctx_smpl_vaddr));
  1.1455 +
  1.1456 +	pfm_buf_fmt_exit(fmt, current, NULL, NULL);
  1.1457 +
  1.1458 +	/*
  1.1459 +	 * free the buffer
  1.1460 +	 */
  1.1461 +	pfm_rvfree(ctx->ctx_smpl_hdr, ctx->ctx_smpl_size);
  1.1462 +
  1.1463 +	ctx->ctx_smpl_hdr  = NULL;
  1.1464 +	ctx->ctx_smpl_size = 0UL;
  1.1465 +
  1.1466 +	return 0;
  1.1467 +
  1.1468 +invalid_free:
  1.1469 +	printk(KERN_ERR "perfmon: pfm_free_smpl_buffer [%d] no buffer\n", current->pid);
  1.1470 +	return -EINVAL;
  1.1471 +}
  1.1472 +#endif
  1.1473 +
  1.1474 +static inline void
  1.1475 +pfm_exit_smpl_buffer(pfm_buffer_fmt_t *fmt)
  1.1476 +{
  1.1477 +	if (fmt == NULL) return;
  1.1478 +
  1.1479 +	pfm_buf_fmt_exit(fmt, current, NULL, NULL);
  1.1480 +
  1.1481 +}
  1.1482 +
  1.1483 +/*
  1.1484 + * pfmfs should _never_ be mounted by userland - too much of a security hassle,
  1.1485 + * no real gain from having the whole whorehouse mounted. So we don't need
  1.1486 + * any operations on the root directory. However, we need a non-trivial
  1.1487 + * d_name - pfm: will go nicely and kill the special-casing in procfs.
  1.1488 + */
  1.1489 +static struct vfsmount *pfmfs_mnt;
  1.1490 +
  1.1491 +static int __init
  1.1492 +init_pfm_fs(void)
  1.1493 +{
  1.1494 +	int err = register_filesystem(&pfm_fs_type);
  1.1495 +	if (!err) {
  1.1496 +		pfmfs_mnt = kern_mount(&pfm_fs_type);
  1.1497 +		err = PTR_ERR(pfmfs_mnt);
  1.1498 +		if (IS_ERR(pfmfs_mnt))
  1.1499 +			unregister_filesystem(&pfm_fs_type);
  1.1500 +		else
  1.1501 +			err = 0;
  1.1502 +	}
  1.1503 +	return err;
  1.1504 +}
  1.1505 +
  1.1506 +static void __exit
  1.1507 +exit_pfm_fs(void)
  1.1508 +{
  1.1509 +	unregister_filesystem(&pfm_fs_type);
  1.1510 +	mntput(pfmfs_mnt);
  1.1511 +}
  1.1512 +
  1.1513 +static ssize_t
  1.1514 +pfm_read(struct file *filp, char __user *buf, size_t size, loff_t *ppos)
  1.1515 +{
  1.1516 +	pfm_context_t *ctx;
  1.1517 +	pfm_msg_t *msg;
  1.1518 +	ssize_t ret;
  1.1519 +	unsigned long flags;
  1.1520 +  	DECLARE_WAITQUEUE(wait, current);
  1.1521 +	if (PFM_IS_FILE(filp) == 0) {
   1.1522 +		printk(KERN_ERR "perfmon: pfm_read: bad magic [%d]\n", current->pid);
  1.1523 +		return -EINVAL;
  1.1524 +	}
  1.1525 +
  1.1526 +	ctx = (pfm_context_t *)filp->private_data;
  1.1527 +	if (ctx == NULL) {
  1.1528 +		printk(KERN_ERR "perfmon: pfm_read: NULL ctx [%d]\n", current->pid);
  1.1529 +		return -EINVAL;
  1.1530 +	}
  1.1531 +
  1.1532 +	/*
  1.1533 +	 * check even when there is no message
  1.1534 +	 */
  1.1535 +	if (size < sizeof(pfm_msg_t)) {
  1.1536 +		DPRINT(("message is too small ctx=%p (>=%ld)\n", ctx, sizeof(pfm_msg_t)));
  1.1537 +		return -EINVAL;
  1.1538 +	}
  1.1539 +
  1.1540 +	PROTECT_CTX(ctx, flags);
  1.1541 +
  1.1542 +  	/*
  1.1543 +	 * put ourselves on the wait queue
  1.1544 +	 */
  1.1545 +  	add_wait_queue(&ctx->ctx_msgq_wait, &wait);
  1.1546 +
  1.1547 +
  1.1548 +  	for(;;) {
  1.1549 +		/*
  1.1550 +		 * check wait queue
  1.1551 +		 */
  1.1552 +
  1.1553 +  		set_current_state(TASK_INTERRUPTIBLE);
  1.1554 +
  1.1555 +		DPRINT(("head=%d tail=%d\n", ctx->ctx_msgq_head, ctx->ctx_msgq_tail));
  1.1556 +
  1.1557 +		ret = 0;
  1.1558 +		if(PFM_CTXQ_EMPTY(ctx) == 0) break;
  1.1559 +
  1.1560 +		UNPROTECT_CTX(ctx, flags);
  1.1561 +
  1.1562 +		/*
  1.1563 +		 * check non-blocking read
  1.1564 +		 */
  1.1565 +      		ret = -EAGAIN;
  1.1566 +		if(filp->f_flags & O_NONBLOCK) break;
  1.1567 +
  1.1568 +		/*
  1.1569 +		 * check pending signals
  1.1570 +		 */
  1.1571 +		if(signal_pending(current)) {
  1.1572 +			ret = -EINTR;
  1.1573 +			break;
  1.1574 +		}
  1.1575 +      		/*
  1.1576 +		 * no message, so wait
  1.1577 +		 */
  1.1578 +      		schedule();
  1.1579 +
  1.1580 +		PROTECT_CTX(ctx, flags);
  1.1581 +	}
  1.1582 +	DPRINT(("[%d] back to running ret=%ld\n", current->pid, ret));
  1.1583 +  	set_current_state(TASK_RUNNING);
  1.1584 +	remove_wait_queue(&ctx->ctx_msgq_wait, &wait);
  1.1585 +
  1.1586 +	if (ret < 0) goto abort;
  1.1587 +
  1.1588 +	ret = -EINVAL;
  1.1589 +	msg = pfm_get_next_msg(ctx);
  1.1590 +	if (msg == NULL) {
  1.1591 +		printk(KERN_ERR "perfmon: pfm_read no msg for ctx=%p [%d]\n", ctx, current->pid);
  1.1592 +		goto abort_locked;
  1.1593 +	}
  1.1594 +
  1.1595 +	DPRINT(("fd=%d type=%d\n", msg->pfm_gen_msg.msg_ctx_fd, msg->pfm_gen_msg.msg_type));
  1.1596 +
  1.1597 +	ret = -EFAULT;
  1.1598 +  	if(copy_to_user(buf, msg, sizeof(pfm_msg_t)) == 0) ret = sizeof(pfm_msg_t);
  1.1599 +
  1.1600 +abort_locked:
  1.1601 +	UNPROTECT_CTX(ctx, flags);
  1.1602 +abort:
  1.1603 +	return ret;
  1.1604 +}
  1.1605 +
  1.1606 +static ssize_t
  1.1607 +pfm_write(struct file *file, const char __user *ubuf,
  1.1608 +			  size_t size, loff_t *ppos)
  1.1609 +{
  1.1610 +	DPRINT(("pfm_write called\n"));
  1.1611 +	return -EINVAL;
  1.1612 +}
  1.1613 +
  1.1614 +static unsigned int
  1.1615 +pfm_poll(struct file *filp, poll_table * wait)
  1.1616 +{
  1.1617 +	pfm_context_t *ctx;
  1.1618 +	unsigned long flags;
  1.1619 +	unsigned int mask = 0;
  1.1620 +
  1.1621 +	if (PFM_IS_FILE(filp) == 0) {
  1.1622 +		printk(KERN_ERR "perfmon: pfm_poll: bad magic [%d]\n", current->pid);
  1.1623 +		return 0;
  1.1624 +	}
  1.1625 +
  1.1626 +	ctx = (pfm_context_t *)filp->private_data;
  1.1627 +	if (ctx == NULL) {
  1.1628 +		printk(KERN_ERR "perfmon: pfm_poll: NULL ctx [%d]\n", current->pid);
  1.1629 +		return 0;
  1.1630 +	}
  1.1631 +
  1.1632 +
  1.1633 +	DPRINT(("pfm_poll ctx_fd=%d before poll_wait\n", ctx->ctx_fd));
  1.1634 +
  1.1635 +	poll_wait(filp, &ctx->ctx_msgq_wait, wait);
  1.1636 +
  1.1637 +	PROTECT_CTX(ctx, flags);
  1.1638 +
  1.1639 +	if (PFM_CTXQ_EMPTY(ctx) == 0)
  1.1640 +		mask =  POLLIN | POLLRDNORM;
  1.1641 +
  1.1642 +	UNPROTECT_CTX(ctx, flags);
  1.1643 +
  1.1644 +	DPRINT(("pfm_poll ctx_fd=%d mask=0x%x\n", ctx->ctx_fd, mask));
  1.1645 +
  1.1646 +	return mask;
  1.1647 +}
  1.1648 +
  1.1649 +static int
  1.1650 +pfm_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg)
  1.1651 +{
  1.1652 +	DPRINT(("pfm_ioctl called\n"));
  1.1653 +	return -EINVAL;
  1.1654 +}
  1.1655 +
  1.1656 +/*
   1.1657 + * interrupts cannot be masked when coming here
  1.1658 + */
  1.1659 +static inline int
  1.1660 +pfm_do_fasync(int fd, struct file *filp, pfm_context_t *ctx, int on)
  1.1661 +{
  1.1662 +	int ret;
  1.1663 +
  1.1664 +	ret = fasync_helper (fd, filp, on, &ctx->ctx_async_queue);
  1.1665 +
  1.1666 +	DPRINT(("pfm_fasync called by [%d] on ctx_fd=%d on=%d async_queue=%p ret=%d\n",
  1.1667 +		current->pid,
  1.1668 +		fd,
  1.1669 +		on,
  1.1670 +		ctx->ctx_async_queue, ret));
  1.1671 +
  1.1672 +	return ret;
  1.1673 +}
  1.1674 +
  1.1675 +static int
  1.1676 +pfm_fasync(int fd, struct file *filp, int on)
  1.1677 +{
  1.1678 +	pfm_context_t *ctx;
  1.1679 +	int ret;
  1.1680 +
  1.1681 +	if (PFM_IS_FILE(filp) == 0) {
  1.1682 +		printk(KERN_ERR "perfmon: pfm_fasync bad magic [%d]\n", current->pid);
  1.1683 +		return -EBADF;
  1.1684 +	}
  1.1685 +
  1.1686 +	ctx = (pfm_context_t *)filp->private_data;
  1.1687 +	if (ctx == NULL) {
  1.1688 +		printk(KERN_ERR "perfmon: pfm_fasync NULL ctx [%d]\n", current->pid);
  1.1689 +		return -EBADF;
  1.1690 +	}
  1.1691 +	/*
   1.1692 +	 * we cannot mask interrupts during this call because it may
   1.1693 +	 * go to sleep if memory is not readily available.
   1.1694 +	 *
   1.1695 +	 * We are protected from the context disappearing by the get_fd()/put_fd()
   1.1696 +	 * done in the caller. Serialization of this function is ensured by the caller.
  1.1697 +	 */
  1.1698 +	ret = pfm_do_fasync(fd, filp, ctx, on);
  1.1699 +
  1.1700 +
  1.1701 +	DPRINT(("pfm_fasync called on ctx_fd=%d on=%d async_queue=%p ret=%d\n",
  1.1702 +		fd,
  1.1703 +		on,
  1.1704 +		ctx->ctx_async_queue, ret));
  1.1705 +
  1.1706 +	return ret;
  1.1707 +}
  1.1708 +
  1.1709 +#ifdef CONFIG_SMP
  1.1710 +/*
  1.1711 + * this function is exclusively called from pfm_close().
  1.1712 + * The context is not protected at that time, nor are interrupts
  1.1713 + * on the remote CPU. That's necessary to avoid deadlocks.
  1.1714 + */
  1.1715 +static void
  1.1716 +pfm_syswide_force_stop(void *info)
  1.1717 +{
  1.1718 +	pfm_context_t   *ctx = (pfm_context_t *)info;
  1.1719 +	struct pt_regs *regs = task_pt_regs(current);
  1.1720 +	struct task_struct *owner;
  1.1721 +	unsigned long flags;
  1.1722 +	int ret;
  1.1723 +
  1.1724 +	if (ctx->ctx_cpu != smp_processor_id()) {
  1.1725 +		printk(KERN_ERR "perfmon: pfm_syswide_force_stop for CPU%d  but on CPU%d\n",
  1.1726 +			ctx->ctx_cpu,
  1.1727 +			smp_processor_id());
  1.1728 +		return;
  1.1729 +	}
  1.1730 +	owner = GET_PMU_OWNER();
  1.1731 +	if (owner != ctx->ctx_task) {
  1.1732 +		printk(KERN_ERR "perfmon: pfm_syswide_force_stop CPU%d unexpected owner [%d] instead of [%d]\n",
  1.1733 +			smp_processor_id(),
  1.1734 +			owner->pid, ctx->ctx_task->pid);
  1.1735 +		return;
  1.1736 +	}
  1.1737 +	if (GET_PMU_CTX() != ctx) {
  1.1738 +		printk(KERN_ERR "perfmon: pfm_syswide_force_stop CPU%d unexpected ctx %p instead of %p\n",
  1.1739 +			smp_processor_id(),
  1.1740 +			GET_PMU_CTX(), ctx);
  1.1741 +		return;
  1.1742 +	}
  1.1743 +
  1.1744 +	DPRINT(("on CPU%d forcing system wide stop for [%d]\n", smp_processor_id(), ctx->ctx_task->pid));	
  1.1745 +	/*
  1.1746 +	 * the context is already protected in pfm_close(), we simply
  1.1747 +	 * need to mask interrupts to avoid a PMU interrupt race on
  1.1748 +	 * this CPU
  1.1749 +	 */
  1.1750 +	local_irq_save(flags);
  1.1751 +
  1.1752 +	ret = pfm_context_unload(ctx, NULL, 0, regs);
  1.1753 +	if (ret) {
  1.1754 +		DPRINT(("context_unload returned %d\n", ret));
  1.1755 +	}
  1.1756 +
  1.1757 +	/*
  1.1758 +	 * unmask interrupts, PMU interrupts are now spurious here
  1.1759 +	 */
  1.1760 +	local_irq_restore(flags);
  1.1761 +}
  1.1762 +
  1.1763 +static void
  1.1764 +pfm_syswide_cleanup_other_cpu(pfm_context_t *ctx)
  1.1765 +{
  1.1766 +	int ret;
  1.1767 +
  1.1768 +	DPRINT(("calling CPU%d for cleanup\n", ctx->ctx_cpu));
  1.1769 +	ret = smp_call_function_single(ctx->ctx_cpu, pfm_syswide_force_stop, ctx, 0, 1);
  1.1770 +	DPRINT(("called CPU%d for cleanup ret=%d\n", ctx->ctx_cpu, ret));
  1.1771 +}
  1.1772 +#endif /* CONFIG_SMP */
  1.1773 +
  1.1774 +/*
  1.1775 + * called for each close(). Partially free resources.
  1.1776 + * When caller is self-monitoring, the context is unloaded.
  1.1777 + */
  1.1778 +static int
  1.1779 +pfm_flush(struct file *filp)
  1.1780 +{
  1.1781 +	pfm_context_t *ctx;
  1.1782 +	struct task_struct *task;
  1.1783 +	struct pt_regs *regs;
  1.1784 +	unsigned long flags;
  1.1785 +	unsigned long smpl_buf_size = 0UL;
  1.1786 +	void *smpl_buf_vaddr = NULL;
  1.1787 +	int state, is_system;
  1.1788 +
  1.1789 +	if (PFM_IS_FILE(filp) == 0) {
   1.1790 +		DPRINT(("bad magic\n"));
  1.1791 +		return -EBADF;
  1.1792 +	}
  1.1793 +
  1.1794 +	ctx = (pfm_context_t *)filp->private_data;
  1.1795 +	if (ctx == NULL) {
  1.1796 +		printk(KERN_ERR "perfmon: pfm_flush: NULL ctx [%d]\n", current->pid);
  1.1797 +		return -EBADF;
  1.1798 +	}
  1.1799 +
  1.1800 +	/*
  1.1801 +	 * remove our file from the async queue, if we use this mode.
  1.1802 +	 * This can be done without the context being protected. We come
   1.1803 +	 * here when the context has become unreachable by other tasks.
  1.1804 +	 *
  1.1805 +	 * We may still have active monitoring at this point and we may
  1.1806 +	 * end up in pfm_overflow_handler(). However, fasync_helper()
  1.1807 +	 * operates with interrupts disabled and it cleans up the
  1.1808 +	 * queue. If the PMU handler is called prior to entering
  1.1809 +	 * fasync_helper() then it will send a signal. If it is
  1.1810 +	 * invoked after, it will find an empty queue and no
   1.1811 +	 * signal will be sent. In both cases, we are safe
  1.1812 +	 */
  1.1813 +	if (filp->f_flags & FASYNC) {
  1.1814 +		DPRINT(("cleaning up async_queue=%p\n", ctx->ctx_async_queue));
  1.1815 +		pfm_do_fasync (-1, filp, ctx, 0);
  1.1816 +	}
  1.1817 +
  1.1818 +	PROTECT_CTX(ctx, flags);
  1.1819 +
  1.1820 +	state     = ctx->ctx_state;
  1.1821 +	is_system = ctx->ctx_fl_system;
  1.1822 +
  1.1823 +	task = PFM_CTX_TASK(ctx);
  1.1824 +	regs = task_pt_regs(task);
  1.1825 +
  1.1826 +	DPRINT(("ctx_state=%d is_current=%d\n",
  1.1827 +		state,
  1.1828 +		task == current ? 1 : 0));
  1.1829 +
  1.1830 +	/*
  1.1831 +	 * if state == UNLOADED, then task is NULL
  1.1832 +	 */
  1.1833 +
  1.1834 +	/*
  1.1835 +	 * we must stop and unload because we are losing access to the context.
  1.1836 +	 */
  1.1837 +	if (task == current) {
  1.1838 +#ifdef CONFIG_SMP
  1.1839 +		/*
  1.1840 +		 * the task IS the owner but it migrated to another CPU: that's bad
  1.1841 +		 * but we must handle this cleanly. Unfortunately, the kernel does
  1.1842 +		 * not provide a mechanism to block migration (while the context is loaded).
  1.1843 +		 *
  1.1844 +		 * We need to release the resource on the ORIGINAL cpu.
  1.1845 +		 */
  1.1846 +		if (is_system && ctx->ctx_cpu != smp_processor_id()) {
  1.1847 +
  1.1848 +			DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu));
  1.1849 +			/*
  1.1850 +			 * keep context protected but unmask interrupt for IPI
  1.1851 +			 */
  1.1852 +			local_irq_restore(flags);
  1.1853 +
  1.1854 +			pfm_syswide_cleanup_other_cpu(ctx);
  1.1855 +
  1.1856 +			/*
  1.1857 +			 * restore interrupt masking
  1.1858 +			 */
  1.1859 +			local_irq_save(flags);
  1.1860 +
  1.1861 +			/*
  1.1862 +			 * context is unloaded at this point
  1.1863 +			 */
  1.1864 +		} else
  1.1865 +#endif /* CONFIG_SMP */
  1.1866 +		{
  1.1867 +
  1.1868 +			DPRINT(("forcing unload\n"));
  1.1869 +			/*
  1.1870 +		 	* stop and unload, returning with state UNLOADED
  1.1871 +		 	* and session unreserved.
  1.1872 +		 	*/
  1.1873 +			pfm_context_unload(ctx, NULL, 0, regs);
  1.1874 +
  1.1875 +			DPRINT(("ctx_state=%d\n", ctx->ctx_state));
  1.1876 +		}
  1.1877 +	}
  1.1878 +
  1.1879 +	/*
  1.1880 +	 * remove virtual mapping, if any, for the calling task.
   1.1881 +	 * cannot reset the ctx field until the last user calls close().
  1.1882 +	 *
  1.1883 +	 * ctx_smpl_vaddr must never be cleared because it is needed
  1.1884 +	 * by every task with access to the context
  1.1885 +	 *
  1.1886 +	 * When called from do_exit(), the mm context is gone already, therefore
  1.1887 +	 * mm is NULL, i.e., the VMA is already gone  and we do not have to
  1.1888 +	 * do anything here
  1.1889 +	 */
  1.1890 +	if (ctx->ctx_smpl_vaddr && current->mm) {
  1.1891 +		smpl_buf_vaddr = ctx->ctx_smpl_vaddr;
  1.1892 +		smpl_buf_size  = ctx->ctx_smpl_size;
  1.1893 +	}
  1.1894 +
  1.1895 +	UNPROTECT_CTX(ctx, flags);
  1.1896 +
  1.1897 +	/*
  1.1898 +	 * if there was a mapping, then we systematically remove it
  1.1899 +	 * at this point. Cannot be done inside critical section
  1.1900 +	 * because some VM function reenables interrupts.
  1.1901 +	 *
  1.1902 +	 */
  1.1903 +	if (smpl_buf_vaddr) pfm_remove_smpl_mapping(current, smpl_buf_vaddr, smpl_buf_size);
  1.1904 +
  1.1905 +	return 0;
  1.1906 +}
  1.1907 +/*
  1.1908 + * called either on explicit close() or from exit_files(). 
  1.1909 + * Only the LAST user of the file gets to this point, i.e., it is
  1.1910 + * called only ONCE.
  1.1911 + *
  1.1912 + * IMPORTANT: we get called ONLY when the refcnt on the file gets to zero 
   1.1913 + * (fput()), i.e., by the last task to access the file. Nobody else can access the
  1.1914 + * file at this point.
  1.1915 + *
  1.1916 + * When called from exit_files(), the VMA has been freed because exit_mm()
  1.1917 + * is executed before exit_files().
  1.1918 + *
  1.1919 + * When called from exit_files(), the current task is not yet ZOMBIE but we
  1.1920 + * flush the PMU state to the context. 
  1.1921 + */
  1.1922 +static int
  1.1923 +pfm_close(struct inode *inode, struct file *filp)
  1.1924 +{
  1.1925 +	pfm_context_t *ctx;
  1.1926 +	struct task_struct *task;
  1.1927 +	struct pt_regs *regs;
  1.1928 +  	DECLARE_WAITQUEUE(wait, current);
  1.1929 +	unsigned long flags;
  1.1930 +	unsigned long smpl_buf_size = 0UL;
  1.1931 +	void *smpl_buf_addr = NULL;
  1.1932 +	int free_possible = 1;
  1.1933 +	int state, is_system;
  1.1934 +
  1.1935 +	DPRINT(("pfm_close called private=%p\n", filp->private_data));
  1.1936 +
  1.1937 +	if (PFM_IS_FILE(filp) == 0) {
  1.1938 +		DPRINT(("bad magic\n"));
  1.1939 +		return -EBADF;
  1.1940 +	}
  1.1941 +	
  1.1942 +	ctx = (pfm_context_t *)filp->private_data;
  1.1943 +	if (ctx == NULL) {
  1.1944 +		printk(KERN_ERR "perfmon: pfm_close: NULL ctx [%d]\n", current->pid);
  1.1945 +		return -EBADF;
  1.1946 +	}
  1.1947 +
  1.1948 +	PROTECT_CTX(ctx, flags);
  1.1949 +
  1.1950 +	state     = ctx->ctx_state;
  1.1951 +	is_system = ctx->ctx_fl_system;
  1.1952 +
  1.1953 +	task = PFM_CTX_TASK(ctx);
  1.1954 +	regs = task_pt_regs(task);
  1.1955 +
  1.1956 +	DPRINT(("ctx_state=%d is_current=%d\n", 
  1.1957 +		state,
  1.1958 +		task == current ? 1 : 0));
  1.1959 +
  1.1960 +	/*
  1.1961 +	 * if task == current, then pfm_flush() unloaded the context
  1.1962 +	 */
  1.1963 +	if (state == PFM_CTX_UNLOADED) goto doit;
  1.1964 +
  1.1965 +	/*
  1.1966 +	 * context is loaded/masked and task != current, we need to
  1.1967 +	 * either force an unload or go zombie
  1.1968 +	 */
  1.1969 +
  1.1970 +	/*
  1.1971 +	 * The task is currently blocked or will block after an overflow.
   1.1972 +	 * We must force it to wake up to get out of the
  1.1973 +	 * MASKED state and transition to the unloaded state by itself.
  1.1974 +	 *
  1.1975 +	 * This situation is only possible for per-task mode
  1.1976 +	 */
  1.1977 +	if (state == PFM_CTX_MASKED && CTX_OVFL_NOBLOCK(ctx) == 0) {
  1.1978 +
  1.1979 +		/*
  1.1980 +		 * set a "partial" zombie state to be checked
  1.1981 +		 * upon return from down() in pfm_handle_work().
  1.1982 +		 *
  1.1983 +		 * We cannot use the ZOMBIE state, because it is checked
  1.1984 +		 * by pfm_load_regs() which is called upon wakeup from down().
  1.1985 +		 * In such case, it would free the context and then we would
  1.1986 +		 * return to pfm_handle_work() which would access the
  1.1987 +		 * stale context. Instead, we set a flag invisible to pfm_load_regs()
  1.1988 +		 * but visible to pfm_handle_work().
  1.1989 +		 *
  1.1990 +		 * For some window of time, we have a zombie context with
  1.1991 +		 * ctx_state = MASKED  and not ZOMBIE
  1.1992 +		 */
  1.1993 +		ctx->ctx_fl_going_zombie = 1;
  1.1994 +
  1.1995 +		/*
  1.1996 +		 * force task to wake up from MASKED state
  1.1997 +		 */
  1.1998 +		complete(&ctx->ctx_restart_done);
  1.1999 +
  1.2000 +		DPRINT(("waking up ctx_state=%d\n", state));
  1.2001 +
  1.2002 +		/*
   1.2003 +		 * put ourselves to sleep waiting for the other
  1.2004 +		 * task to report completion
  1.2005 +		 *
   1.2006 +		 * the context is protected by a mutex, therefore there
   1.2007 +		 * is no risk of being notified of completion before
   1.2008 +		 * actually being on the waitq.
  1.2009 +		 */
  1.2010 +  		set_current_state(TASK_INTERRUPTIBLE);
  1.2011 +  		add_wait_queue(&ctx->ctx_zombieq, &wait);
  1.2012 +
  1.2013 +		UNPROTECT_CTX(ctx, flags);
  1.2014 +
  1.2015 +		/*
  1.2016 +		 * XXX: check for signals :
  1.2017 +		 * 	- ok for explicit close
  1.2018 +		 * 	- not ok when coming from exit_files()
  1.2019 +		 */
  1.2020 +      		schedule();
  1.2021 +
  1.2022 +
  1.2023 +		PROTECT_CTX(ctx, flags);
  1.2024 +
  1.2025 +
  1.2026 +		remove_wait_queue(&ctx->ctx_zombieq, &wait);
  1.2027 +  		set_current_state(TASK_RUNNING);
  1.2028 +
  1.2029 +		/*
  1.2030 +		 * context is unloaded at this point
  1.2031 +		 */
   1.2032 +		DPRINT(("after zombie wakeup ctx_state=%d\n", state));
  1.2033 +	}
  1.2034 +	else if (task != current) {
  1.2035 +#ifdef CONFIG_SMP
  1.2036 +		/*
  1.2037 +	 	 * switch context to zombie state
  1.2038 +	 	 */
  1.2039 +		ctx->ctx_state = PFM_CTX_ZOMBIE;
  1.2040 +
  1.2041 +		DPRINT(("zombie ctx for [%d]\n", task->pid));
  1.2042 +		/*
   1.2043 +		 * cannot free the context on the spot. Freeing is deferred until
  1.2044 +		 * the task notices the ZOMBIE state
  1.2045 +		 */
  1.2046 +		free_possible = 0;
  1.2047 +#else
  1.2048 +		pfm_context_unload(ctx, NULL, 0, regs);
  1.2049 +#endif
  1.2050 +	}
  1.2051 +
  1.2052 +doit:
   1.2053 +	/* reload state, it may have changed while the critical section was released */
  1.2054 +	state = ctx->ctx_state;
  1.2055 +
  1.2056 +	/*
   1.2057 +	 * the context is still attached to a task (possibly current),
   1.2058 +	 * so we cannot destroy it right now
  1.2059 +	 */
  1.2060 +
  1.2061 +	/*
  1.2062 +	 * we must free the sampling buffer right here because
  1.2063 +	 * we cannot rely on it being cleaned up later by the
  1.2064 +	 * monitored task. It is not possible to free vmalloc'ed
  1.2065 +	 * memory in pfm_load_regs(). Instead, we remove the buffer
   1.2066 +	 * now. Should there be a subsequent PMU overflow originally
   1.2067 +	 * meant for sampling, it will be converted to spurious
   1.2068 +	 * and that's fine because the monitoring tool is gone anyway.
  1.2069 +	 */
  1.2070 +	if (ctx->ctx_smpl_hdr) {
  1.2071 +		smpl_buf_addr = ctx->ctx_smpl_hdr;
  1.2072 +		smpl_buf_size = ctx->ctx_smpl_size;
  1.2073 +		/* no more sampling */
  1.2074 +		ctx->ctx_smpl_hdr = NULL;
  1.2075 +		ctx->ctx_fl_is_sampling = 0;
  1.2076 +	}
  1.2077 +
  1.2078 +	DPRINT(("ctx_state=%d free_possible=%d addr=%p size=%lu\n",
  1.2079 +		state,
  1.2080 +		free_possible,
  1.2081 +		smpl_buf_addr,
  1.2082 +		smpl_buf_size));
  1.2083 +
  1.2084 +	if (smpl_buf_addr) pfm_exit_smpl_buffer(ctx->ctx_buf_fmt);
  1.2085 +
  1.2086 +	/*
   1.2087 +	 * if the state is UNLOADED, the session has already been unreserved.
  1.2088 +	 */
  1.2089 +	if (state == PFM_CTX_ZOMBIE) {
  1.2090 +		pfm_unreserve_session(ctx, ctx->ctx_fl_system , ctx->ctx_cpu);
  1.2091 +	}
  1.2092 +
  1.2093 +	/*
   1.2094 +	 * disconnecting the file descriptor from the context must be done
  1.2095 +	 * before we unlock.
  1.2096 +	 */
  1.2097 +	filp->private_data = NULL;
  1.2098 +
  1.2099 +	/*
   1.2100 +	 * if we free on the spot, the context is now completely unreachable
   1.2101 +	 * from the caller's side. The monitored task side is also cut, so we
   1.2102 +	 * can free it safely.
  1.2103 +	 *
  1.2104 +	 * If we have a deferred free, only the caller side is disconnected.
  1.2105 +	 */
  1.2106 +	UNPROTECT_CTX(ctx, flags);
  1.2107 +
  1.2108 +	/*
  1.2109 +	 * All memory free operations (especially for vmalloc'ed memory)
  1.2110 +	 * MUST be done with interrupts ENABLED.
  1.2111 +	 */
  1.2112 +	if (smpl_buf_addr)  pfm_rvfree(smpl_buf_addr, smpl_buf_size);
  1.2113 +
  1.2114 +	/*
  1.2115 +	 * return the memory used by the context
  1.2116 +	 */
  1.2117 +	if (free_possible) pfm_context_free(ctx);
  1.2118 +
  1.2119 +	return 0;
  1.2120 +}
  1.2121 +
  1.2122 +static int
  1.2123 +pfm_no_open(struct inode *irrelevant, struct file *dontcare)
  1.2124 +{
  1.2125 +	DPRINT(("pfm_no_open called\n"));
  1.2126 +	return -ENXIO;
  1.2127 +}
  1.2128 +
  1.2129 +
  1.2130 +
  1.2131 +static struct file_operations pfm_file_ops = {
  1.2132 +	.llseek   = no_llseek,
  1.2133 +	.read     = pfm_read,
  1.2134 +	.write    = pfm_write,
  1.2135 +	.poll     = pfm_poll,
  1.2136 +	.ioctl    = pfm_ioctl,
  1.2137 +	.open     = pfm_no_open,	/* special open code to disallow open via /proc */
  1.2138 +	.fasync   = pfm_fasync,
  1.2139 +	.release  = pfm_close,
  1.2140 +	.flush	  = pfm_flush
  1.2141 +};
  1.2142 +
  1.2143 +static int
  1.2144 +pfmfs_delete_dentry(struct dentry *dentry)
  1.2145 +{
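          +	/*
          +	 * always prune pfmfs dentries from the dcache: they are created
          +	 * once per context and never looked up again
          +	 */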
  1.2146 +	return 1;
  1.2147 +}
  1.2148 +
  1.2149 +static struct dentry_operations pfmfs_dentry_operations = {
  1.2150 +	.d_delete = pfmfs_delete_dentry,
  1.2151 +};
  1.2152 +
  1.2153 +
  1.2154 +static int
  1.2155 +pfm_alloc_fd(struct file **cfile)
  1.2156 +{
  1.2157 +	int fd, ret = 0;
  1.2158 +	struct file *file = NULL;
  1.2159 +	struct inode * inode;
  1.2160 +	char name[32];
  1.2161 +	struct qstr this;
  1.2162 +
  1.2163 +	fd = get_unused_fd();
  1.2164 +	if (fd < 0) return -ENFILE;
  1.2165 +
  1.2166 +	ret = -ENFILE;
  1.2167 +
  1.2168 +	file = get_empty_filp();
  1.2169 +	if (!file) goto out;
  1.2170 +
  1.2171 +	/*
  1.2172 +	 * allocate a new inode
  1.2173 +	 */
  1.2174 +	inode = new_inode(pfmfs_mnt->mnt_sb);
  1.2175 +	if (!inode) goto out;
  1.2176 +
  1.2177 +	DPRINT(("new inode ino=%ld @%p\n", inode->i_ino, inode));
  1.2178 +
  1.2179 +	inode->i_mode = S_IFCHR|S_IRUGO;
  1.2180 +	inode->i_uid  = current->fsuid;
  1.2181 +	inode->i_gid  = current->fsgid;
  1.2182 +
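          +	/*
          +	 * pfmfs entries are never looked up by name: the dentry name,
          +	 * derived from the inode number, only has to be unique
          +	 */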
  1.2183 +	sprintf(name, "[%lu]", inode->i_ino);
  1.2184 +	this.name = name;
  1.2185 +	this.len  = strlen(name);
  1.2186 +	this.hash = inode->i_ino;
  1.2187 +
  1.2188 +	ret = -ENOMEM;
  1.2189 +
  1.2190 +	/*
  1.2191 +	 * allocate a new dcache entry
  1.2192 +	 */
  1.2193 +	file->f_dentry = d_alloc(pfmfs_mnt->mnt_sb->s_root, &this);
  1.2194 +	if (!file->f_dentry) goto out;
  1.2195 +
  1.2196 +	file->f_dentry->d_op = &pfmfs_dentry_operations;
  1.2197 +
  1.2198 +	d_add(file->f_dentry, inode);
  1.2199 +	file->f_vfsmnt = mntget(pfmfs_mnt);
  1.2200 +	file->f_mapping = inode->i_mapping;
  1.2201 +
  1.2202 +	file->f_op    = &pfm_file_ops;
  1.2203 +	file->f_mode  = FMODE_READ;
  1.2204 +	file->f_flags = O_RDONLY;
  1.2205 +	file->f_pos   = 0;
  1.2206 +
  1.2207 +	/*
  1.2208 +	 * may have to delay until context is attached?
  1.2209 +	 */
  1.2210 +	fd_install(fd, file);
  1.2211 +
  1.2212 +	/*
  1.2213 +	 * the file structure we will use
  1.2214 +	 */
  1.2215 +	*cfile = file;
  1.2216 +
  1.2217 +	return fd;
  1.2218 +out:
  1.2219 +	if (file) put_filp(file);
  1.2220 +	put_unused_fd(fd);
  1.2221 +	return ret;
  1.2222 +}
  1.2223 +
  1.2224 +static void
  1.2225 +pfm_free_fd(int fd, struct file *file)
  1.2226 +{
  1.2227 +	struct files_struct *files = current->files;
  1.2228 +	struct fdtable *fdt;
  1.2229 +
  1.2230 +	/* 
   1.2231 +	 * there is no fd_uninstall(), so we do it here
  1.2232 +	 */
  1.2233 +	spin_lock(&files->file_lock);
  1.2234 +	fdt = files_fdtable(files);
  1.2235 +	rcu_assign_pointer(fdt->fd[fd], NULL);
  1.2236 +	spin_unlock(&files->file_lock);
  1.2237 +
  1.2238 +	if (file)
  1.2239 +		put_filp(file);
  1.2240 +	put_unused_fd(fd);
  1.2241 +}
  1.2242 +
  1.2243 +static int
  1.2244 +pfm_remap_buffer(struct vm_area_struct *vma, unsigned long buf, unsigned long addr, unsigned long size)
  1.2245 +{
  1.2246 +	DPRINT(("CPU%d buf=0x%lx addr=0x%lx size=%ld\n", smp_processor_id(), buf, addr, size));
  1.2247 +
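          +	/*
          +	 * the buffer comes from pfm_rvmalloc() and is therefore not
          +	 * guaranteed to be physically contiguous: translate and remap
          +	 * one page at a time
          +	 */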
  1.2248 +	while (size > 0) {
  1.2249 +		unsigned long pfn = ia64_tpa(buf) >> PAGE_SHIFT;
  1.2250 +
  1.2251 +
  1.2252 +		if (remap_pfn_range(vma, addr, pfn, PAGE_SIZE, PAGE_READONLY))
  1.2253 +			return -ENOMEM;
  1.2254 +
  1.2255 +		addr  += PAGE_SIZE;
  1.2256 +		buf   += PAGE_SIZE;
  1.2257 +		size  -= PAGE_SIZE;
  1.2258 +	}
  1.2259 +	return 0;
  1.2260 +}
  1.2261 +
  1.2262 +/*
   1.2263 + * allocates a sampling buffer and remaps it into the user address space of the task
  1.2264 + */
  1.2265 +static int
  1.2266 +pfm_smpl_buffer_alloc(struct task_struct *task, pfm_context_t *ctx, unsigned long rsize, void **user_vaddr)
  1.2267 +{
  1.2268 +	struct mm_struct *mm = task->mm;
  1.2269 +	struct vm_area_struct *vma = NULL;
  1.2270 +	unsigned long size;
  1.2271 +	void *smpl_buf;
  1.2272 +
  1.2273 +
  1.2274 +	/*
   1.2275 +	 * the fixed header + requested size, aligned to a page boundary
  1.2276 +	 */
  1.2277 +	size = PAGE_ALIGN(rsize);
  1.2278 +
  1.2279 +	DPRINT(("sampling buffer rsize=%lu size=%lu bytes\n", rsize, size));
  1.2280 +
  1.2281 +	/*
  1.2282 +	 * check requested size to avoid Denial-of-service attacks
  1.2283 +	 * XXX: may have to refine this test
  1.2284 +	 * Check against address space limit.
  1.2285 +	 *
  1.2286 +	 * if ((mm->total_vm << PAGE_SHIFT) + len> task->rlim[RLIMIT_AS].rlim_cur)
  1.2287 +	 * 	return -ENOMEM;
  1.2288 +	 */
  1.2289 +	if (size > task->signal->rlim[RLIMIT_MEMLOCK].rlim_cur)
  1.2290 +		return -ENOMEM;
  1.2291 +
  1.2292 +	/*
   1.2293 +	 * We do the easy-to-undo allocations first.
   1.2294 +	 *
   1.2295 +	 * pfm_rvmalloc() clears the buffer, so there is no leak
  1.2296 +	 */
  1.2297 +	smpl_buf = pfm_rvmalloc(size);
  1.2298 +	if (smpl_buf == NULL) {
  1.2299 +		DPRINT(("Can't allocate sampling buffer\n"));
  1.2300 +		return -ENOMEM;
  1.2301 +	}
  1.2302 +
  1.2303 +	DPRINT(("smpl_buf @%p\n", smpl_buf));
  1.2304 +
  1.2305 +	/* allocate vma */
  1.2306 +	vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
  1.2307 +	if (!vma) {
  1.2308 +		DPRINT(("Cannot allocate vma\n"));
  1.2309 +		goto error_kmem;
  1.2310 +	}
  1.2311 +	memset(vma, 0, sizeof(*vma));
  1.2312 +
  1.2313 +	/*
  1.2314 +	 * partially initialize the vma for the sampling buffer
  1.2315 +	 */
  1.2316 +	vma->vm_mm	     = mm;
   1.2317 +	vma->vm_flags	     = VM_READ | VM_MAYREAD | VM_RESERVED;
  1.2318 +	vma->vm_page_prot    = PAGE_READONLY; /* XXX may need to change */
  1.2319 +
  1.2320 +	/*
  1.2321 +	 * Now we have everything we need and we can initialize
  1.2322 +	 * and connect all the data structures
  1.2323 +	 */
  1.2324 +
  1.2325 +	ctx->ctx_smpl_hdr   = smpl_buf;
  1.2326 +	ctx->ctx_smpl_size  = size; /* aligned size */
  1.2327 +
  1.2328 +	/*
  1.2329 +	 * Let's do the difficult operations next.
  1.2330 +	 *
  1.2331 +	 * now we atomically find some area in the address space and
  1.2332 +	 * remap the buffer in it.
  1.2333 +	 */
  1.2334 +	down_write(&task->mm->mmap_sem);
  1.2335 +
  1.2336 +	/* find some free area in address space, must have mmap sem held */
  1.2337 +	vma->vm_start = pfm_get_unmapped_area(NULL, 0, size, 0, MAP_PRIVATE|MAP_ANONYMOUS, 0);
  1.2338 +	if (vma->vm_start == 0UL) {
  1.2339 +		DPRINT(("Cannot find unmapped area for size %ld\n", size));
  1.2340 +		up_write(&task->mm->mmap_sem);
  1.2341 +		goto error;
  1.2342 +	}
  1.2343 +	vma->vm_end = vma->vm_start + size;
  1.2344 +	vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
  1.2345 +
  1.2346 +	DPRINT(("aligned size=%ld, hdr=%p mapped @0x%lx\n", size, ctx->ctx_smpl_hdr, vma->vm_start));
  1.2347 +
  1.2348 +	/* can only be applied to current task, need to have the mm semaphore held when called */
  1.2349 +	if (pfm_remap_buffer(vma, (unsigned long)smpl_buf, vma->vm_start, size)) {
  1.2350 +		DPRINT(("Can't remap buffer\n"));
  1.2351 +		up_write(&task->mm->mmap_sem);
  1.2352 +		goto error;
  1.2353 +	}
  1.2354 +
  1.2355 +	/*
  1.2356 +	 * now insert the vma in the vm list for the process, must be
  1.2357 +	 * done with mmap lock held
  1.2358 +	 */
  1.2359 +	insert_vm_struct(mm, vma);
  1.2360 +
  1.2361 +	mm->total_vm  += size >> PAGE_SHIFT;
  1.2362 +	vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file,
  1.2363 +							vma_pages(vma));
  1.2364 +	up_write(&task->mm->mmap_sem);
  1.2365 +
  1.2366 +	/*
  1.2367 +	 * keep track of user level virtual address
  1.2368 +	 */
  1.2369 +	ctx->ctx_smpl_vaddr = (void *)vma->vm_start;
  1.2370 +	*(unsigned long *)user_vaddr = vma->vm_start;
  1.2371 +
  1.2372 +	return 0;
  1.2373 +
  1.2374 +error:
  1.2375 +	kmem_cache_free(vm_area_cachep, vma);
  1.2376 +error_kmem:
  1.2377 +	pfm_rvfree(smpl_buf, size);
  1.2378 +
  1.2379 +	return -ENOMEM;
  1.2380 +}
  1.2381 +
  1.2382 +/*
  1.2383 + * XXX: do something better here
  1.2384 + */
  1.2385 +static int
  1.2386 +pfm_bad_permissions(struct task_struct *task)
  1.2387 +{
  1.2388 +	/* inspired by ptrace_attach() */
  1.2389 +	DPRINT(("cur: uid=%d gid=%d task: euid=%d suid=%d uid=%d egid=%d sgid=%d\n",
  1.2390 +		current->uid,
  1.2391 +		current->gid,
  1.2392 +		task->euid,
  1.2393 +		task->suid,
  1.2394 +		task->uid,
  1.2395 +		task->egid,
  1.2396 +		task->sgid));
  1.2397 +
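          +	/*
          +	 * deny access unless every uid/gid of the target matches the
          +	 * caller's, or the caller has CAP_SYS_PTRACE
          +	 */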
  1.2398 +	return ((current->uid != task->euid)
  1.2399 +	    || (current->uid != task->suid)
  1.2400 +	    || (current->uid != task->uid)
  1.2401 +	    || (current->gid != task->egid)
  1.2402 +	    || (current->gid != task->sgid)
  1.2403 +	    || (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE);
  1.2404 +}
  1.2405 +
  1.2406 +static int
  1.2407 +pfarg_is_sane(struct task_struct *task, pfarg_context_t *pfx)
  1.2408 +{
  1.2409 +	int ctx_flags;
  1.2410 +
  1.2411 +	/* valid signal */
  1.2412 +
  1.2413 +	ctx_flags = pfx->ctx_flags;
  1.2414 +
  1.2415 +	if (ctx_flags & PFM_FL_SYSTEM_WIDE) {
  1.2416 +
  1.2417 +		/*
  1.2418 +		 * cannot block in this mode
  1.2419 +		 */
  1.2420 +		if (ctx_flags & PFM_FL_NOTIFY_BLOCK) {
  1.2421 +			DPRINT(("cannot use blocking mode when in system wide monitoring\n"));
  1.2422 +			return -EINVAL;
  1.2423 +		}
  1.2424 +	} else {
  1.2425 +	}
  1.2426 +	/* probably more to add here */
  1.2427 +
  1.2428 +	return 0;
  1.2429 +}
  1.2430 +
  1.2431 +static int
  1.2432 +pfm_setup_buffer_fmt(struct task_struct *task, pfm_context_t *ctx, unsigned int ctx_flags,
  1.2433 +		     unsigned int cpu, pfarg_context_t *arg)
  1.2434 +{
  1.2435 +	pfm_buffer_fmt_t *fmt = NULL;
  1.2436 +	unsigned long size = 0UL;
  1.2437 +	void *uaddr = NULL;
  1.2438 +	void *fmt_arg = NULL;
  1.2439 +	int ret = 0;
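          +/*
          + * the buffer format argument, if any, is laid out right after the
          + * pfarg_context_t in the vector copied from user level, hence the
          + * simple pointer arithmetic in PFM_CTXARG_BUF_ARG()
          + */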
  1.2440 +#define PFM_CTXARG_BUF_ARG(a)	(pfm_buffer_fmt_t *)(a+1)
  1.2441 +
  1.2442 +	/* invoke and lock buffer format, if found */
  1.2443 +	fmt = pfm_find_buffer_fmt(arg->ctx_smpl_buf_id);
  1.2444 +	if (fmt == NULL) {
  1.2445 +		DPRINT(("[%d] cannot find buffer format\n", task->pid));
  1.2446 +		return -EINVAL;
  1.2447 +	}
  1.2448 +
  1.2449 +	/*
  1.2450 +	 * buffer argument MUST be contiguous to pfarg_context_t
  1.2451 +	 */
  1.2452 +	if (fmt->fmt_arg_size) fmt_arg = PFM_CTXARG_BUF_ARG(arg);
  1.2453 +
  1.2454 +	ret = pfm_buf_fmt_validate(fmt, task, ctx_flags, cpu, fmt_arg);
  1.2455 +
  1.2456 +	DPRINT(("[%d] after validate(0x%x,%d,%p)=%d\n", task->pid, ctx_flags, cpu, fmt_arg, ret));
  1.2457 +
  1.2458 +	if (ret) goto error;
  1.2459 +
  1.2460 +	/* link buffer format and context */
  1.2461 +	ctx->ctx_buf_fmt = fmt;
  1.2462 +
  1.2463 +	/*
  1.2464 +	 * check if buffer format wants to use perfmon buffer allocation/mapping service
  1.2465 +	 */
  1.2466 +	ret = pfm_buf_fmt_getsize(fmt, task, ctx_flags, cpu, fmt_arg, &size);
  1.2467 +	if (ret) goto error;
  1.2468 +
  1.2469 +	if (size) {
  1.2470 +		/*
  1.2471 +		 * buffer is always remapped into the caller's address space
  1.2472 +		 */
  1.2473 +		ret = pfm_smpl_buffer_alloc(current, ctx, size, &uaddr);
  1.2474 +		if (ret) goto error;
  1.2475 +
  1.2476 +		/* keep track of user address of buffer */
  1.2477 +		arg->ctx_smpl_vaddr = uaddr;
  1.2478 +	}
  1.2479 +	ret = pfm_buf_fmt_init(fmt, task, ctx->ctx_smpl_hdr, ctx_flags, cpu, fmt_arg);
  1.2480 +
  1.2481 +error:
  1.2482 +	return ret;
  1.2483 +}
  1.2484 +
  1.2485 +static void
  1.2486 +pfm_reset_pmu_state(pfm_context_t *ctx)
  1.2487 +{
  1.2488 +	int i;
  1.2489 +
  1.2490 +	/*
  1.2491 +	 * install reset values for PMC.
  1.2492 +	 */
  1.2493 +	for (i=1; PMC_IS_LAST(i) == 0; i++) {
  1.2494 +		if (PMC_IS_IMPL(i) == 0) continue;
  1.2495 +		ctx->ctx_pmcs[i] = PMC_DFL_VAL(i);
  1.2496 +		DPRINT(("pmc[%d]=0x%lx\n", i, ctx->ctx_pmcs[i]));
  1.2497 +	}
  1.2498 +	/*
   1.2499 +	 * PMD registers are set to 0UL when the context is memset()
  1.2500 +	 */
  1.2501 +
  1.2502 +	/*
   1.2503 +	 * On context switch restore, we must restore ALL pmc and ALL pmd even
  1.2504 +	 * when they are not actively used by the task. In UP, the incoming process
  1.2505 +	 * may otherwise pick up left over PMC, PMD state from the previous process.
  1.2506 +	 * As opposed to PMD, stale PMC can cause harm to the incoming
  1.2507 +	 * process because they may change what is being measured.
  1.2508 +	 * Therefore, we must systematically reinstall the entire
  1.2509 +	 * PMC state. In SMP, the same thing is possible on the
   1.2510 +	 * same CPU but also between 2 CPUs.
  1.2511 +	 *
  1.2512 +	 * The problem with PMD is information leaking especially
  1.2513 +	 * to user level when psr.sp=0
  1.2514 +	 *
  1.2515 +	 * There is unfortunately no easy way to avoid this problem
   1.2516 +	 * on either UP or SMP. This definitely slows down the
  1.2517 +	 * pfm_load_regs() function.
  1.2518 +	 */
  1.2519 +
  1.2520 +	 /*
  1.2521 +	  * bitmask of all PMCs accessible to this context
  1.2522 +	  *
  1.2523 +	  * PMC0 is treated differently.
  1.2524 +	  */
  1.2525 +	ctx->ctx_all_pmcs[0] = pmu_conf->impl_pmcs[0] & ~0x1;
  1.2526 +
  1.2527 +	/*
   1.2528 +	 * bitmask of all PMDs that are accessible to this context
  1.2529 +	 */
  1.2530 +	ctx->ctx_all_pmds[0] = pmu_conf->impl_pmds[0];
  1.2531 +
  1.2532 +	DPRINT(("<%d> all_pmcs=0x%lx all_pmds=0x%lx\n", ctx->ctx_fd, ctx->ctx_all_pmcs[0],ctx->ctx_all_pmds[0]));
  1.2533 +
  1.2534 +	/*
  1.2535 +	 * useful in case of re-enable after disable
  1.2536 +	 */
  1.2537 +	ctx->ctx_used_ibrs[0] = 0UL;
  1.2538 +	ctx->ctx_used_dbrs[0] = 0UL;
  1.2539 +}
  1.2540 +
  1.2541 +static int
  1.2542 +pfm_ctx_getsize(void *arg, size_t *sz)
  1.2543 +{
  1.2544 +	pfarg_context_t *req = (pfarg_context_t *)arg;
  1.2545 +	pfm_buffer_fmt_t *fmt;
  1.2546 +
  1.2547 +	*sz = 0;
  1.2548 +
  1.2549 +	if (!pfm_uuid_cmp(req->ctx_smpl_buf_id, pfm_null_uuid)) return 0;
  1.2550 +
  1.2551 +	fmt = pfm_find_buffer_fmt(req->ctx_smpl_buf_id);
  1.2552 +	if (fmt == NULL) {
  1.2553 +		DPRINT(("cannot find buffer format\n"));
  1.2554 +		return -EINVAL;
  1.2555 +	}
  1.2556 +	/* get just enough to copy in user parameters */
  1.2557 +	*sz = fmt->fmt_arg_size;
  1.2558 +	DPRINT(("arg_size=%lu\n", *sz));
  1.2559 +
  1.2560 +	return 0;
  1.2561 +}
  1.2562 +
  1.2563 +
  1.2564 +
  1.2565 +/*
  1.2566 + * cannot attach if :
  1.2567 + * 	- kernel task
  1.2568 + * 	- task not owned by caller
  1.2569 + * 	- task incompatible with context mode
  1.2570 + */
  1.2571 +static int
  1.2572 +pfm_task_incompatible(pfm_context_t *ctx, struct task_struct *task)
  1.2573 +{
  1.2574 +	/*
   1.2575 +	 * no kernel task or task not owned by caller
  1.2576 +	 */
  1.2577 +	if (task->mm == NULL) {
   1.2578 +		DPRINT(("task [%d] has no memory context (kernel thread)\n", task->pid));
  1.2579 +		return -EPERM;
  1.2580 +	}
  1.2581 +	if (pfm_bad_permissions(task)) {
  1.2582 +		DPRINT(("no permission to attach to  [%d]\n", task->pid));
  1.2583 +		return -EPERM;
  1.2584 +	}
  1.2585 +	/*
  1.2586 +	 * cannot block in self-monitoring mode
  1.2587 +	 */
  1.2588 +	if (CTX_OVFL_NOBLOCK(ctx) == 0 && task == current) {
  1.2589 +		DPRINT(("cannot load a blocking context on self for [%d]\n", task->pid));
  1.2590 +		return -EINVAL;
  1.2591 +	}
  1.2592 +
  1.2593 +	if (task->exit_state == EXIT_ZOMBIE) {
  1.2594 +		DPRINT(("cannot attach to  zombie task [%d]\n", task->pid));
  1.2595 +		return -EBUSY;
  1.2596 +	}
  1.2597 +
  1.2598 +	/*
  1.2599 +	 * always ok for self
  1.2600 +	 */
  1.2601 +	if (task == current) return 0;
  1.2602 +
  1.2603 +	if ((task->state != TASK_STOPPED) && (task->state != TASK_TRACED)) {
  1.2604 +		DPRINT(("cannot attach to non-stopped task [%d] state=%ld\n", task->pid, task->state));
  1.2605 +		return -EBUSY;
  1.2606 +	}
  1.2607 +	/*
  1.2608 +	 * make sure the task is off any CPU
  1.2609 +	 */
  1.2610 +	wait_task_inactive(task);
  1.2611 +
  1.2612 +	/* more to come... */
  1.2613 +
  1.2614 +	return 0;
  1.2615 +}
  1.2616 +
  1.2617 +static int
  1.2618 +pfm_get_task(pfm_context_t *ctx, pid_t pid, struct task_struct **task)
  1.2619 +{
  1.2620 +	struct task_struct *p = current;
  1.2621 +	int ret;
  1.2622 +
  1.2623 +	/* XXX: need to add more checks here */
  1.2624 +	if (pid < 2) return -EPERM;
  1.2625 +
  1.2626 +	if (pid != current->pid) {
  1.2627 +
  1.2628 +		read_lock(&tasklist_lock);
  1.2629 +
  1.2630 +		p = find_task_by_pid(pid);
  1.2631 +
  1.2632 +		/* make sure task cannot go away while we operate on it */
  1.2633 +		if (p) get_task_struct(p);
  1.2634 +
  1.2635 +		read_unlock(&tasklist_lock);
  1.2636 +
  1.2637 +		if (p == NULL) return -ESRCH;
  1.2638 +	}
  1.2639 +
  1.2640 +	ret = pfm_task_incompatible(ctx, p);
  1.2641 +	if (ret == 0) {
  1.2642 +		*task = p;
  1.2643 +	} else if (p != current) {
  1.2644 +		pfm_put_task(p);
  1.2645 +	}
  1.2646 +	return ret;
  1.2647 +}
  1.2648 +
  1.2649 +
  1.2650 +
  1.2651 +static int
  1.2652 +pfm_context_create(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
  1.2653 +{
  1.2654 +	pfarg_context_t *req = (pfarg_context_t *)arg;
  1.2655 +	struct file *filp;
  1.2656 +	int ctx_flags;
  1.2657 +	int ret;
  1.2658 +
  1.2659 +	/* let's check the arguments first */
  1.2660 +	ret = pfarg_is_sane(current, req);
  1.2661 +	if (ret < 0) return ret;
  1.2662 +
  1.2663 +	ctx_flags = req->ctx_flags;
  1.2664 +
  1.2665 +	ret = -ENOMEM;
  1.2666 +
  1.2667 +	ctx = pfm_context_alloc();
  1.2668 +	if (!ctx) goto error;
  1.2669 +
  1.2670 +	ret = pfm_alloc_fd(&filp);
  1.2671 +	if (ret < 0) goto error_file;
  1.2672 +
  1.2673 +	req->ctx_fd = ctx->ctx_fd = ret;
  1.2674 +
  1.2675 +	/*
  1.2676 +	 * attach context to file
  1.2677 +	 */
  1.2678 +	filp->private_data = ctx;
  1.2679 +
  1.2680 +	/*
  1.2681 +	 * does the user want to sample?
  1.2682 +	 */
  1.2683 +	if (pfm_uuid_cmp(req->ctx_smpl_buf_id, pfm_null_uuid)) {
  1.2684 +		ret = pfm_setup_buffer_fmt(current, ctx, ctx_flags, 0, req);
  1.2685 +		if (ret) goto buffer_error;
  1.2686 +	}
  1.2687 +
  1.2688 +	/*
  1.2689 +	 * init context protection lock
  1.2690 +	 */
  1.2691 +	spin_lock_init(&ctx->ctx_lock);
  1.2692 +
  1.2693 +	/*
  1.2694 +	 * context is unloaded
  1.2695 +	 */
  1.2696 +	ctx->ctx_state = PFM_CTX_UNLOADED;
  1.2697 +
  1.2698 +	/*
  1.2699 +	 * initialization of context's flags
  1.2700 +	 */
  1.2701 +	ctx->ctx_fl_block       = (ctx_flags & PFM_FL_NOTIFY_BLOCK) ? 1 : 0;
  1.2702 +	ctx->ctx_fl_system      = (ctx_flags & PFM_FL_SYSTEM_WIDE) ? 1: 0;
  1.2703 +	ctx->ctx_fl_is_sampling = ctx->ctx_buf_fmt ? 1 : 0; /* assume record() is defined */
  1.2704 +	ctx->ctx_fl_no_msg      = (ctx_flags & PFM_FL_OVFL_NO_MSG) ? 1: 0;
  1.2705 +	/*
  1.2706 +	 * will move to set properties
  1.2707 +	 * ctx->ctx_fl_excl_idle   = (ctx_flags & PFM_FL_EXCL_IDLE) ? 1: 0;
  1.2708 +	 */
  1.2709 +
  1.2710 +	/*
  1.2711 +	 * init restart semaphore to locked
  1.2712 +	 */
  1.2713 +	init_completion(&ctx->ctx_restart_done);
  1.2714 +
  1.2715 +	/*
  1.2716 +	 * activation is used in SMP only
  1.2717 +	 */
  1.2718 +	ctx->ctx_last_activation = PFM_INVALID_ACTIVATION;
  1.2719 +	SET_LAST_CPU(ctx, -1);
  1.2720 +
  1.2721 +	/*
  1.2722 +	 * initialize notification message queue
  1.2723 +	 */
  1.2724 +	ctx->ctx_msgq_head = ctx->ctx_msgq_tail = 0;
  1.2725 +	init_waitqueue_head(&ctx->ctx_msgq_wait);
  1.2726 +	init_waitqueue_head(&ctx->ctx_zombieq);
  1.2727 +
  1.2728 +	DPRINT(("ctx=%p flags=0x%x system=%d notify_block=%d excl_idle=%d no_msg=%d ctx_fd=%d \n",
  1.2729 +		ctx,
  1.2730 +		ctx_flags,
  1.2731 +		ctx->ctx_fl_system,
  1.2732 +		ctx->ctx_fl_block,
  1.2733 +		ctx->ctx_fl_excl_idle,
  1.2734 +		ctx->ctx_fl_no_msg,
  1.2735 +		ctx->ctx_fd));
  1.2736 +
  1.2737 +	/*
  1.2738 +	 * initialize soft PMU state
  1.2739 +	 */
  1.2740 +	pfm_reset_pmu_state(ctx);
  1.2741 +
  1.2742 +	return 0;
  1.2743 +
  1.2744 +buffer_error:
  1.2745 +	pfm_free_fd(ctx->ctx_fd, filp);
  1.2746 +
  1.2747 +	if (ctx->ctx_buf_fmt) {
  1.2748 +		pfm_buf_fmt_exit(ctx->ctx_buf_fmt, current, NULL, regs);
  1.2749 +	}
  1.2750 +error_file:
  1.2751 +	pfm_context_free(ctx);
  1.2752 +
  1.2753 +error:
  1.2754 +	return ret;
  1.2755 +}
  1.2756 +
  1.2757 +static inline unsigned long
  1.2758 +pfm_new_counter_value (pfm_counter_t *reg, int is_long_reset)
  1.2759 +{
  1.2760 +	unsigned long val = is_long_reset ? reg->long_reset : reg->short_reset;
  1.2761 +	unsigned long new_seed, old_seed = reg->seed, mask = reg->mask;
  1.2762 +	extern unsigned long carta_random32 (unsigned long seed);
  1.2763 +
  1.2764 +	if (reg->flags & PFM_REGFL_RANDOM) {
  1.2765 +		new_seed = carta_random32(old_seed);
  1.2766 +		val -= (old_seed & mask);	/* counter values are negative numbers! */
  1.2767 +		if ((mask >> 32) != 0)
  1.2768 +			/* construct a full 64-bit random value: */
  1.2769 +			new_seed |= carta_random32(old_seed >> 32) << 32;
  1.2770 +		reg->seed = new_seed;
  1.2771 +	}
  1.2772 +	reg->lval = val;
  1.2773 +	return val;
  1.2774 +}
  1.2775 +
  1.2776 +static void
  1.2777 +pfm_reset_regs_masked(pfm_context_t *ctx, unsigned long *ovfl_regs, int is_long_reset)
  1.2778 +{
  1.2779 +	unsigned long mask = ovfl_regs[0];
  1.2780 +	unsigned long reset_others = 0UL;
  1.2781 +	unsigned long val;
  1.2782 +	int i;
  1.2783 +
  1.2784 +	/*
  1.2785 +	 * now restore reset value on sampling overflowed counters
  1.2786 +	 */
  1.2787 +	mask >>= PMU_FIRST_COUNTER;
  1.2788 +	for(i = PMU_FIRST_COUNTER; mask; i++, mask >>= 1) {
  1.2789 +
  1.2790 +		if ((mask & 0x1UL) == 0UL) continue;
  1.2791 +
  1.2792 +		ctx->ctx_pmds[i].val = val = pfm_new_counter_value(ctx->ctx_pmds+ i, is_long_reset);
  1.2793 +		reset_others        |= ctx->ctx_pmds[i].reset_pmds[0];
  1.2794 +
  1.2795 +		DPRINT_ovfl((" %s reset ctx_pmds[%d]=%lx\n", is_long_reset ? "long" : "short", i, val));
  1.2796 +	}
  1.2797 +
  1.2798 +	/*
  1.2799 +	 * Now take care of resetting the other registers
  1.2800 +	 */
  1.2801 +	for(i = 0; reset_others; i++, reset_others >>= 1) {
  1.2802 +
  1.2803 +		if ((reset_others & 0x1) == 0) continue;
  1.2804 +
  1.2805 +		ctx->ctx_pmds[i].val = val = pfm_new_counter_value(ctx->ctx_pmds + i, is_long_reset);
  1.2806 +
  1.2807 +		DPRINT_ovfl(("%s reset_others pmd[%d]=%lx\n",
  1.2808 +			  is_long_reset ? "long" : "short", i, val));
  1.2809 +	}
  1.2810 +}
  1.2811 +
  1.2812 +static void
  1.2813 +pfm_reset_regs(pfm_context_t *ctx, unsigned long *ovfl_regs, int is_long_reset)
  1.2814 +{
  1.2815 +	unsigned long mask = ovfl_regs[0];
  1.2816 +	unsigned long reset_others = 0UL;
  1.2817 +	unsigned long val;
  1.2818 +	int i;
  1.2819 +
  1.2820 +	DPRINT_ovfl(("ovfl_regs=0x%lx is_long_reset=%d\n", ovfl_regs[0], is_long_reset));
  1.2821 +
  1.2822 +	if (ctx->ctx_state == PFM_CTX_MASKED) {
  1.2823 +		pfm_reset_regs_masked(ctx, ovfl_regs, is_long_reset);
  1.2824 +		return;
  1.2825 +	}
  1.2826 +
  1.2827 +	/*
  1.2828 +	 * now restore reset value on sampling overflowed counters
  1.2829 +	 */
  1.2830 +	mask >>= PMU_FIRST_COUNTER;
  1.2831 +	for(i = PMU_FIRST_COUNTER; mask; i++, mask >>= 1) {
  1.2832 +
  1.2833 +		if ((mask & 0x1UL) == 0UL) continue;
  1.2834 +
  1.2835 +		val           = pfm_new_counter_value(ctx->ctx_pmds+ i, is_long_reset);
  1.2836 +		reset_others |= ctx->ctx_pmds[i].reset_pmds[0];
  1.2837 +
  1.2838 +		DPRINT_ovfl((" %s reset ctx_pmds[%d]=%lx\n", is_long_reset ? "long" : "short", i, val));
  1.2839 +
  1.2840 +		pfm_write_soft_counter(ctx, i, val);
  1.2841 +	}
  1.2842 +
  1.2843 +	/*
  1.2844 +	 * Now take care of resetting the other registers
  1.2845 +	 */
  1.2846 +	for(i = 0; reset_others; i++, reset_others >>= 1) {
  1.2847 +
  1.2848 +		if ((reset_others & 0x1) == 0) continue;
  1.2849 +
  1.2850 +		val = pfm_new_counter_value(ctx->ctx_pmds + i, is_long_reset);
  1.2851 +
  1.2852 +		if (PMD_IS_COUNTING(i)) {
  1.2853 +			pfm_write_soft_counter(ctx, i, val);
  1.2854 +		} else {
  1.2855 +			ia64_set_pmd(i, val);
  1.2856 +		}
  1.2857 +		DPRINT_ovfl(("%s reset_others pmd[%d]=%lx\n",
  1.2858 +			  is_long_reset ? "long" : "short", i, val));
  1.2859 +	}
  1.2860 +	ia64_srlz_d();
  1.2861 +}
  1.2862 +
  1.2863 +static int
  1.2864 +pfm_write_pmcs(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
  1.2865 +{
  1.2866 +	struct thread_struct *thread = NULL;
  1.2867 +	struct task_struct *task;
  1.2868 +	pfarg_reg_t *req = (pfarg_reg_t *)arg;
  1.2869 +	unsigned long value, pmc_pm;
  1.2870 +	unsigned long smpl_pmds, reset_pmds, impl_pmds;
  1.2871 +	unsigned int cnum, reg_flags, flags, pmc_type;
  1.2872 +	int i, can_access_pmu = 0, is_loaded, is_system, expert_mode;
  1.2873 +	int is_monitor, is_counting, state;
  1.2874 +	int ret = -EINVAL;
  1.2875 +	pfm_reg_check_t	wr_func;
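          +/*
          + * non-zero when the pm bit of a PMC value does not match the session
          + * type: system-wide sessions use pm=1 (privileged monitor), per-task
          + * sessions use pm=0 (user monitor)
          + */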
  1.2876 +#define PFM_CHECK_PMC_PM(x, y, z) ((x)->ctx_fl_system ^ PMC_PM(y, z))
  1.2877 +
  1.2878 +	state     = ctx->ctx_state;
  1.2879 +	is_loaded = state == PFM_CTX_LOADED ? 1 : 0;
  1.2880 +	is_system = ctx->ctx_fl_system;
  1.2881 +	task      = ctx->ctx_task;
  1.2882 +	impl_pmds = pmu_conf->impl_pmds[0];
  1.2883 +
  1.2884 +	if (state == PFM_CTX_ZOMBIE) return -EINVAL;
  1.2885 +
  1.2886 +	if (is_loaded) {
  1.2887 +		thread = &task->thread;
  1.2888 +		/*
   1.2889 +		 * In system-wide mode, when the context is loaded, access can only happen
  1.2890 +		 * when the caller is running on the CPU being monitored by the session.
  1.2891 +		 * It does not have to be the owner (ctx_task) of the context per se.
  1.2892 +		 */
  1.2893 +		if (is_system && ctx->ctx_cpu != smp_processor_id()) {
  1.2894 +			DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu));
  1.2895 +			return -EBUSY;
  1.2896 +		}
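          +		/*
          +		 * the hardware PMU can be accessed directly when the task owns
          +		 * it, or for a system-wide session running on the monitored CPU
          +		 */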
  1.2897 +		can_access_pmu = GET_PMU_OWNER() == task || is_system ? 1 : 0;
  1.2898 +	}
  1.2899 +	expert_mode = pfm_sysctl.expert_mode; 
  1.2900 +
  1.2901 +	for (i = 0; i < count; i++, req++) {
  1.2902 +
  1.2903 +		cnum       = req->reg_num;
  1.2904 +		reg_flags  = req->reg_flags;
  1.2905 +		value      = req->reg_value;
  1.2906 +		smpl_pmds  = req->reg_smpl_pmds[0];
  1.2907 +		reset_pmds = req->reg_reset_pmds[0];
  1.2908 +		flags      = 0;
  1.2909 +
  1.2910 +
  1.2911 +		if (cnum >= PMU_MAX_PMCS) {
  1.2912 +			DPRINT(("pmc%u is invalid\n", cnum));
  1.2913 +			goto error;
  1.2914 +		}
  1.2915 +
  1.2916 +		pmc_type   = pmu_conf->pmc_desc[cnum].type;
  1.2917 +		pmc_pm     = (value >> pmu_conf->pmc_desc[cnum].pm_pos) & 0x1;
  1.2918 +		is_counting = (pmc_type & PFM_REG_COUNTING) == PFM_REG_COUNTING ? 1 : 0;
  1.2919 +		is_monitor  = (pmc_type & PFM_REG_MONITOR) == PFM_REG_MONITOR ? 1 : 0;
  1.2920 +
  1.2921 +		/*
   1.2922 +		 * we reject all non-implemented PMCs as well
  1.2923 +		 * as attempts to modify PMC[0-3] which are used
  1.2924 +		 * as status registers by the PMU
  1.2925 +		 */
  1.2926 +		if ((pmc_type & PFM_REG_IMPL) == 0 || (pmc_type & PFM_REG_CONTROL) == PFM_REG_CONTROL) {
  1.2927 +			DPRINT(("pmc%u is unimplemented or no-access pmc_type=%x\n", cnum, pmc_type));
  1.2928 +			goto error;
  1.2929 +		}
  1.2930 +		wr_func = pmu_conf->pmc_desc[cnum].write_check;
  1.2931 +		/*
  1.2932 +		 * If the PMC is a monitor, then if the value is not the default:
  1.2933 +		 * 	- system-wide session: PMCx.pm=1 (privileged monitor)
  1.2934 +		 * 	- per-task           : PMCx.pm=0 (user monitor)
  1.2935 +		 */
  1.2936 +		if (is_monitor && value != PMC_DFL_VAL(cnum) && is_system ^ pmc_pm) {
  1.2937 +			DPRINT(("pmc%u pmc_pm=%lu is_system=%d\n",
  1.2938 +				cnum,
  1.2939 +				pmc_pm,
  1.2940 +				is_system));
  1.2941 +			goto error;
  1.2942 +		}
  1.2943 +
  1.2944 +		if (is_counting) {
  1.2945 +			/*
  1.2946 +		 	 * enforce generation of overflow interrupt. Necessary on all
  1.2947 +		 	 * CPUs.
  1.2948 +		 	 */
  1.2949 +			value |= 1 << PMU_PMC_OI;
  1.2950 +
  1.2951 +			if (reg_flags & PFM_REGFL_OVFL_NOTIFY) {
  1.2952 +				flags |= PFM_REGFL_OVFL_NOTIFY;
  1.2953 +			}
  1.2954 +
  1.2955 +			if (reg_flags & PFM_REGFL_RANDOM) flags |= PFM_REGFL_RANDOM;
  1.2956 +
  1.2957 +			/* verify validity of smpl_pmds */
  1.2958 +			if ((smpl_pmds & impl_pmds) != smpl_pmds) {
  1.2959 +				DPRINT(("invalid smpl_pmds 0x%lx for pmc%u\n", smpl_pmds, cnum));
  1.2960 +				goto error;
  1.2961 +			}
  1.2962 +
  1.2963 +			/* verify validity of reset_pmds */
  1.2964 +			if ((reset_pmds & impl_pmds) != reset_pmds) {
  1.2965 +				DPRINT(("invalid reset_pmds 0x%lx for pmc%u\n", reset_pmds, cnum));
  1.2966 +				goto error;
  1.2967 +			}
  1.2968 +		} else {
  1.2969 +			if (reg_flags & (PFM_REGFL_OVFL_NOTIFY|PFM_REGFL_RANDOM)) {
  1.2970 +				DPRINT(("cannot set ovfl_notify or random on pmc%u\n", cnum));
  1.2971 +				goto error;
  1.2972 +			}
   1.2973 +			/* the eventid on non-counting monitors is ignored */
  1.2974 +		}
  1.2975 +
  1.2976 +		/*
  1.2977 +		 * execute write checker, if any
  1.2978 +		 */
  1.2979 +		if (likely(expert_mode == 0 && wr_func)) {
  1.2980 +			ret = (*wr_func)(task, ctx, cnum, &value, regs);
  1.2981 +			if (ret) goto error;
  1.2982 +			ret = -EINVAL;
  1.2983 +		}
  1.2984 +
  1.2985 +		/*
  1.2986 +		 * no error on this register
  1.2987 +		 */
  1.2988 +		PFM_REG_RETFLAG_SET(req->reg_flags, 0);
  1.2989 +
  1.2990 +		/*
  1.2991 +		 * Now we commit the changes to the software state
  1.2992 +		 */
  1.2993 +
  1.2994 +		/*
  1.2995 +		 * update overflow information
  1.2996 +		 */
  1.2997 +		if (is_counting) {
  1.2998 +			/*
  1.2999 +		 	 * full flag update each time a register is programmed
  1.3000 +		 	 */
  1.3001 +			ctx->ctx_pmds[cnum].flags = flags;
  1.3002 +
  1.3003 +			ctx->ctx_pmds[cnum].reset_pmds[0] = reset_pmds;
  1.3004 +			ctx->ctx_pmds[cnum].smpl_pmds[0]  = smpl_pmds;
  1.3005 +			ctx->ctx_pmds[cnum].eventid       = req->reg_smpl_eventid;
  1.3006 +
  1.3007 +			/*
  1.3008 +			 * Mark all PMDS to be accessed as used.
  1.3009 +			 *
  1.3010 +			 * We do not keep track of PMC because we have to
  1.3011 +			 * systematically restore ALL of them.
  1.3012 +			 *
  1.3013 +			 * We do not update the used_monitors mask, because
   1.3014 +			 * if we have not programmed them, they will be in
   1.3015 +			 * a quiescent state, therefore we will not need to
   1.3016 +			 * mask/restore them when the context is MASKED.
  1.3017 +			 */
  1.3018 +			CTX_USED_PMD(ctx, reset_pmds);
  1.3019 +			CTX_USED_PMD(ctx, smpl_pmds);
  1.3020 +			/*
  1.3021 +		 	 * make sure we do not try to reset on
  1.3022 +		 	 * restart because we have established new values
  1.3023 +		 	 */
  1.3024 +			if (state == PFM_CTX_MASKED) ctx->ctx_ovfl_regs[0] &= ~1UL << cnum;
  1.3025 +		}
  1.3026 +		/*
  1.3027 +		 * Needed in case the user does not initialize the equivalent
  1.3028 +		 * PMD. Clearing is done indirectly via pfm_reset_pmu_state() so there is no
  1.3029 +		 * possible leak here.
  1.3030 +		 */
  1.3031 +		CTX_USED_PMD(ctx, pmu_conf->pmc_desc[cnum].dep_pmd[0]);
  1.3032 +
  1.3033 +		/*
  1.3034 +		 * keep track of the monitor PMC that we are using.
  1.3035 +		 * we save the value of the pmc in ctx_pmcs[] and if
  1.3036 +		 * the monitoring is not stopped for the context we also
  1.3037 +		 * place it in the saved state area so that it will be
  1.3038 +		 * picked up later by the context switch code.
  1.3039 +		 *
  1.3040 +		 * The value in ctx_pmcs[] can only be changed in pfm_write_pmcs().
  1.3041 +		 *
  1.3042 +		 * The value in thread->pmcs[] may be modified on overflow, i.e.,  when
  1.3043 +		 * monitoring needs to be stopped.
  1.3044 +		 */
  1.3045 +		if (is_monitor) CTX_USED_MONITOR(ctx, 1UL << cnum);
  1.3046 +
  1.3047 +		/*
  1.3048 +		 * update context state
  1.3049 +		 */
  1.3050 +		ctx->ctx_pmcs[cnum] = value;
  1.3051 +
  1.3052 +		if (is_loaded) {
  1.3053 +			/*
  1.3054 +			 * write thread state
  1.3055 +			 */
  1.3056 +			if (is_system == 0) thread->pmcs[cnum] = value;
  1.3057 +
  1.3058 +			/*
  1.3059 +			 * write hardware register if we can
  1.3060 +			 */
  1.3061 +			if (can_access_pmu) {
  1.3062 +				ia64_set_pmc(cnum, value);
  1.3063 +			}
  1.3064 +#ifdef CONFIG_SMP
  1.3065 +			else {
  1.3066 +				/*
  1.3067 +				 * per-task SMP only here
  1.3068 +				 *
   1.3069 +			 	 * we are guaranteed that the task is not running on another CPU,
   1.3070 +			 	 * so we indicate that this PMC will need to be reloaded if the task
  1.3071 +			 	 * is rescheduled on the CPU it ran last on.
  1.3072 +			 	 */
  1.3073 +				ctx->ctx_reload_pmcs[0] |= 1UL << cnum;
  1.3074 +			}
  1.3075 +#endif
  1.3076 +		}
  1.3077 +
  1.3078 +		DPRINT(("pmc[%u]=0x%lx ld=%d apmu=%d flags=0x%x all_pmcs=0x%lx used_pmds=0x%lx eventid=%ld smpl_pmds=0x%lx reset_pmds=0x%lx reloads_pmcs=0x%lx used_monitors=0x%lx ovfl_regs=0x%lx\n",
  1.3079 +			  cnum,
  1.3080 +			  value,
  1.3081 +			  is_loaded,
  1.3082 +			  can_access_pmu,
  1.3083 +			  flags,
  1.3084 +			  ctx->ctx_all_pmcs[0],
  1.3085 +			  ctx->ctx_used_pmds[0],
  1.3086 +			  ctx->ctx_pmds[cnum].eventid,
  1.3087 +			  smpl_pmds,
  1.3088 +			  reset_pmds,
  1.3089 +			  ctx->ctx_reload_pmcs[0],
  1.3090 +			  ctx->ctx_used_monitors[0],
  1.3091 +			  ctx->ctx_ovfl_regs[0]));
  1.3092 +	}
  1.3093 +
  1.3094 +	/*
  1.3095 +	 * make sure the changes are visible
  1.3096 +	 */
  1.3097 +	if (can_access_pmu) ia64_srlz_d();
  1.3098 +
  1.3099 +	return 0;
  1.3100 +error:
  1.3101 +	PFM_REG_RETFLAG_SET(req->reg_flags, PFM_REG_RETFL_EINVAL);
  1.3102 +	return ret;
  1.3103 +}
  1.3104 +
  1.3105 +static int
  1.3106 +pfm_write_pmds(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
  1.3107 +{
  1.3108 +	struct thread_struct *thread = NULL;
  1.3109 +	struct task_struct *task;
  1.3110 +	pfarg_reg_t *req = (pfarg_reg_t *)arg;
  1.3111 +	unsigned long value, hw_value, ovfl_mask;
  1.3112 +	unsigned int cnum;
  1.3113 +	int i, can_access_pmu = 0, state;
  1.3114 +	int is_counting, is_loaded, is_system, expert_mode;
  1.3115 +	int ret = -EINVAL;
  1.3116 +	pfm_reg_check_t wr_func;
  1.3117 +
  1.3118 +
  1.3119 +	state     = ctx->ctx_state;
  1.3120 +	is_loaded = state == PFM_CTX_LOADED ? 1 : 0;
  1.3121 +	is_system = ctx->ctx_fl_system;
  1.3122 +	ovfl_mask = pmu_conf->ovfl_val;
  1.3123 +	task      = ctx->ctx_task;
  1.3124 +
  1.3125 +	if (unlikely(state == PFM_CTX_ZOMBIE)) return -EINVAL;
  1.3126 +
  1.3127 +	/*
  1.3128 +	 * on both UP and SMP, we can only write to the PMC when the task is
  1.3129 +	 * the owner of the local PMU.
  1.3130 +	 */
  1.3131 +	if (likely(is_loaded)) {
  1.3132 +		thread = &task->thread;
  1.3133 +		/*
   1.3134 +		 * In system-wide mode, when the context is loaded, access can only happen
  1.3135 +		 * when the caller is running on the CPU being monitored by the session.
  1.3136 +		 * It does not have to be the owner (ctx_task) of the context per se.
  1.3137 +		 */
  1.3138 +		if (unlikely(is_system && ctx->ctx_cpu != smp_processor_id())) {
  1.3139 +			DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu));
  1.3140 +			return -EBUSY;
  1.3141 +		}
  1.3142 +		can_access_pmu = GET_PMU_OWNER() == task || is_system ? 1 : 0;
  1.3143 +	}
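	/*
	 * can_access_pmu (set above) means the live PMU registers belong to
	 * this context: either the monitored task currently owns the local
	 * PMU, or this is a system-wide session running on its bound CPU.
	 * Otherwise only the saved software state is updated below.
	 */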
  1.3144 +	expert_mode = pfm_sysctl.expert_mode; 
  1.3145 +
  1.3146 +	for (i = 0; i < count; i++, req++) {
  1.3147 +
  1.3148 +		cnum  = req->reg_num;
  1.3149 +		value = req->reg_value;
  1.3150 +
  1.3151 +		if (!PMD_IS_IMPL(cnum)) {
  1.3152 +			DPRINT(("pmd[%u] is unimplemented or invalid\n", cnum));
  1.3153 +			goto abort_mission;
  1.3154 +		}
  1.3155 +		is_counting = PMD_IS_COUNTING(cnum);
  1.3156 +		wr_func     = pmu_conf->pmd_desc[cnum].write_check;
  1.3157 +
  1.3158 +		/*
  1.3159 +		 * execute write checker, if any
  1.3160 +		 */
  1.3161 +		if (unlikely(expert_mode == 0 && wr_func)) {
  1.3162 +			unsigned long v = value;
  1.3163 +
  1.3164 +			ret = (*wr_func)(task, ctx, cnum, &v, regs);
  1.3165 +			if (ret) goto abort_mission;
  1.3166 +
  1.3167 +			value = v;
  1.3168 +			ret   = -EINVAL;
  1.3169 +		}
  1.3170 +
  1.3171 +		/*
  1.3172 +		 * no error on this register
  1.3173 +		 */
  1.3174 +		PFM_REG_RETFLAG_SET(req->reg_flags, 0);
  1.3175 +
  1.3176 +		/*
  1.3177 +		 * now commit changes to software state
  1.3178 +		 */
  1.3179 +		hw_value = value;
  1.3180 +
  1.3181 +		/*
  1.3182 +		 * update virtualized (64bits) counter
  1.3183 +		 */
  1.3184 +		if (is_counting) {
  1.3185 +			/*
  1.3186 +			 * write context state
  1.3187 +			 */
  1.3188 +			ctx->ctx_pmds[cnum].lval = value;
  1.3189 +
  1.3190 +			/*
  1.3191 +			 * when the context is loaded we use the split value
  1.3192 +			 */
  1.3193 +			if (is_loaded) {
  1.3194 +				hw_value = value &  ovfl_mask;
  1.3195 +				value    = value & ~ovfl_mask;
  1.3196 +			}
  1.3197 +		}
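		/*
		 * illustrative example (the width is hypothetical): with 47
		 * implemented counter bits, ovfl_mask = (1UL << 47) - 1, so
		 * writing value = 0x0001000000001234 while loaded programs the
		 * hardware PMD with hw_value = 0x1234 and leaves the upper part,
		 * 0x0001000000000000, in ctx_pmds[cnum].val, virtualizing the
		 * counter to the full 64 bits. The actual width comes from
		 * pmu_conf->ovfl_val.
		 */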
  1.3198 +		/*
  1.3199 +		 * update reset values (not just for counters)
  1.3200 +		 */
  1.3201 +		ctx->ctx_pmds[cnum].long_reset  = req->reg_long_reset;
  1.3202 +		ctx->ctx_pmds[cnum].short_reset = req->reg_short_reset;
  1.3203 +
  1.3204 +		/*
  1.3205 +		 * update randomization parameters (not just for counters)
  1.3206 +		 */
  1.3207 +		ctx->ctx_pmds[cnum].seed = req->reg_random_seed;
  1.3208 +		ctx->ctx_pmds[cnum].mask = req->reg_random_mask;
  1.3209 +
  1.3210 +		/*
  1.3211 +		 * update context value
  1.3212 +		 */
  1.3213 +		ctx->ctx_pmds[cnum].val  = value;
  1.3214 +
  1.3215 +		/*
  1.3216 +		 * Keep track of what we use
  1.3217 +		 *
  1.3218 +		 * We do not keep track of PMC because we have to
  1.3219 +		 * systematically restore ALL of them.
  1.3220 +		 */
  1.3221 +		CTX_USED_PMD(ctx, PMD_PMD_DEP(cnum));
  1.3222 +
  1.3223 +		/*
  1.3224 +		 * mark this PMD register used as well
  1.3225 +		 */
  1.3226 +		CTX_USED_PMD(ctx, RDEP(cnum));
  1.3227 +
  1.3228 +		/*
  1.3229 +		 * make sure we do not try to reset on
  1.3230 +		 * restart because we have established new values
  1.3231 +		 */
  1.3232 +		if (is_counting && state == PFM_CTX_MASKED) {
  1.3233 +			ctx->ctx_ovfl_regs[0] &= ~(1UL << cnum);
  1.3234 +		}
  1.3235 +
  1.3236 +		if (is_loaded) {
  1.3237 +			/*
  1.3238 +		 	 * write thread state
  1.3239 +		 	 */
  1.3240 +			if (is_system == 0) thread->pmds[cnum] = hw_value;
  1.3241 +
  1.3242 +			/*
  1.3243 +			 * write hardware register if we can
  1.3244 +			 */
  1.3245 +			if (can_access_pmu) {
  1.3246 +				ia64_set_pmd(cnum, hw_value);
  1.3247 +			} else {
  1.3248 +#ifdef CONFIG_SMP
  1.3249 +				/*
  1.3250 +			 	 * we are guaranteed that the task is not running on the other CPU,
  1.3251 +			 	 * we indicate that this PMD will need to be reloaded if the task
  1.3252 +			 	 * is rescheduled on the CPU it ran last on.
  1.3253 +			 	 */
  1.3254 +				ctx->ctx_reload_pmds[0] |= 1UL << cnum;
  1.3255 +#endif
  1.3256 +			}
  1.3257 +		}
  1.3258 +
  1.3259 +		DPRINT(("pmd[%u]=0x%lx ld=%d apmu=%d, hw_value=0x%lx ctx_pmd=0x%lx  short_reset=0x%lx "
  1.3260 +			  "long_reset=0x%lx notify=%c seed=0x%lx mask=0x%lx used_pmds=0x%lx reset_pmds=0x%lx reload_pmds=0x%lx all_pmds=0x%lx ovfl_regs=0x%lx\n",
  1.3261 +			cnum,
  1.3262 +			value,
  1.3263 +			is_loaded,
  1.3264 +			can_access_pmu,
  1.3265 +			hw_value,
  1.3266 +			ctx->ctx_pmds[cnum].val,
  1.3267 +			ctx->ctx_pmds[cnum].short_reset,
  1.3268 +			ctx->ctx_pmds[cnum].long_reset,
  1.3269 +			PMC_OVFL_NOTIFY(ctx, cnum) ? 'Y':'N',
  1.3270 +			ctx->ctx_pmds[cnum].seed,
  1.3271 +			ctx->ctx_pmds[cnum].mask,
  1.3272 +			ctx->ctx_used_pmds[0],
  1.3273 +			ctx->ctx_pmds[cnum].reset_pmds[0],
  1.3274 +			ctx->ctx_reload_pmds[0],
  1.3275 +			ctx->ctx_all_pmds[0],
  1.3276 +			ctx->ctx_ovfl_regs[0]));
  1.3277 +	}
  1.3278 +
  1.3279 +	/*
  1.3280 +	 * make changes visible
  1.3281 +	 */
  1.3282 +	if (can_access_pmu) ia64_srlz_d();
  1.3283 +
  1.3284 +	return 0;
  1.3285 +
  1.3286 +abort_mission:
  1.3287 +	/*
  1.3288 +	 * for now, we have only one possibility for error
  1.3289 +	 */
  1.3290 +	PFM_REG_RETFLAG_SET(req->reg_flags, PFM_REG_RETFL_EINVAL);
  1.3291 +	return ret;
  1.3292 +}
  1.3293 +
  1.3294 +/*
  1.3295 + * By way of PROTECT_CTX(), interrupts are masked while we are in this function.
  1.3296 + * Therefore we know we do not have to worry about the PMU overflow interrupt. If an
  1.3297 + * interrupt is delivered during the call, it will be kept pending until we leave, making
  1.3298 + * it appear as if it had been generated at UNPROTECT_CTX(). At least we are
  1.3299 + * guaranteed to return consistent data to the user, though it may simply be old. It is
  1.3300 + * not trivial to handle the overflow while inside the call because we may end up in
  1.3301 + * some module sampling buffer code and cause deadlocks.
  1.3302 + */
  1.3303 +static int
  1.3304 +pfm_read_pmds(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
  1.3305 +{
  1.3306 +	struct thread_struct *thread = NULL;
  1.3307 +	struct task_struct *task;
  1.3308 +	unsigned long val = 0UL, lval, ovfl_mask, sval;
  1.3309 +	pfarg_reg_t *req = (pfarg_reg_t *)arg;
  1.3310 +	unsigned int cnum, reg_flags = 0;
  1.3311 +	int i, can_access_pmu = 0, state;
  1.3312 +	int is_loaded, is_system, is_counting, expert_mode;
  1.3313 +	int ret = -EINVAL;
  1.3314 +	pfm_reg_check_t rd_func;
  1.3315 +
  1.3316 +	/*
  1.3317 +	 * access is possible when loaded only for
  1.3318 +	 * self-monitoring tasks or in UP mode
  1.3319 +	 */
  1.3320 +
  1.3321 +	state     = ctx->ctx_state;
  1.3322 +	is_loaded = state == PFM_CTX_LOADED ? 1 : 0;
  1.3323 +	is_system = ctx->ctx_fl_system;
  1.3324 +	ovfl_mask = pmu_conf->ovfl_val;
  1.3325 +	task      = ctx->ctx_task;
  1.3326 +
  1.3327 +	if (state == PFM_CTX_ZOMBIE) return -EINVAL;
  1.3328 +
  1.3329 +	if (likely(is_loaded)) {
  1.3330 +		thread = &task->thread;
  1.3331 +		/*
  1.3332 +		 * In system wide and when the context is loaded, access can only happen
  1.3333 +		 * when the caller is running on the CPU being monitored by the session.
  1.3334 +		 * It does not have to be the owner (ctx_task) of the context per se.
  1.3335 +		 */
  1.3336 +		if (unlikely(is_system && ctx->ctx_cpu != smp_processor_id())) {
  1.3337 +			DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu));
  1.3338 +			return -EBUSY;
  1.3339 +		}
  1.3340 +		/*
  1.3341 +		 * this can be true even when not self-monitoring, but only in UP
  1.3342 +		 */
  1.3343 +		can_access_pmu = GET_PMU_OWNER() == task || is_system ? 1 : 0;
  1.3344 +
  1.3345 +		if (can_access_pmu) ia64_srlz_d();
  1.3346 +	}
  1.3347 +	expert_mode = pfm_sysctl.expert_mode; 
  1.3348 +
  1.3349 +	DPRINT(("ld=%d apmu=%d ctx_state=%d\n",
  1.3350 +		is_loaded,
  1.3351 +		can_access_pmu,
  1.3352 +		state));
  1.3353 +
  1.3354 +	/*
  1.3355 +	 * on both UP and SMP, we can only read the PMD from the hardware register when
  1.3356 +	 * the task is the owner of the local PMU.
  1.3357 +	 */
  1.3358 +
  1.3359 +	for (i = 0; i < count; i++, req++) {
  1.3360 +
  1.3361 +		cnum        = req->reg_num;
  1.3362 +		reg_flags   = req->reg_flags;
  1.3363 +
  1.3364 +		if (unlikely(!PMD_IS_IMPL(cnum))) goto error;
  1.3365 +		/*
  1.3366 +		 * we can only read the registers that we use. That includes
  1.3367 +		 * the ones we explicitly initialize AND the ones we want included
  1.3368 +		 * in the sampling buffer (smpl_regs).
  1.3369 +		 *
  1.3370 +		 * Having this restriction allows optimization in the ctxsw routine
  1.3371 +		 * without compromising security (leaks)
  1.3372 +		 */
  1.3373 +		if (unlikely(!CTX_IS_USED_PMD(ctx, cnum))) goto error;
  1.3374 +
  1.3375 +		sval        = ctx->ctx_pmds[cnum].val;
  1.3376 +		lval        = ctx->ctx_pmds[cnum].lval;
  1.3377 +		is_counting = PMD_IS_COUNTING(cnum);
  1.3378 +
  1.3379 +		/*
  1.3380 +		 * If the task is not the current one, then we check if the
  1.3381 +		 * PMU state is still in the local live register due to lazy ctxsw.
  1.3382 +		 * If true, then we read directly from the registers.
  1.3383 +		 */
  1.3384 +		if (can_access_pmu){
  1.3385 +			val = ia64_get_pmd(cnum);
  1.3386 +		} else {
  1.3387 +			/*
  1.3388 +			 * context has been saved
  1.3389 +			 * if context is zombie, then task does not exist anymore.
  1.3390 +			 * In this case, we use the full value saved in the context (pfm_flush_regs()).
  1.3391 +			 */
  1.3392 +			val = is_loaded ? thread->pmds[cnum] : 0UL;
  1.3393 +		}
  1.3394 +		rd_func = pmu_conf->pmd_desc[cnum].read_check;
  1.3395 +
  1.3396 +		if (is_counting) {
  1.3397 +			/*
  1.3398 +			 * XXX: need to check for overflow when loaded
  1.3399 +			 */
  1.3400 +			val &= ovfl_mask;
  1.3401 +			val += sval;
  1.3402 +		}
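		/*
		 * the masking/addition above undoes the split made in
		 * pfm_write_pmds(): the hardware (or saved thread) PMD holds
		 * only the low ovfl_mask bits, while ctx_pmds[cnum].val carries
		 * the upper bits, so their sum is the full 64-bit virtual count.
		 */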
  1.3403 +
  1.3404 +		/*
  1.3405 +		 * execute read checker, if any
  1.3406 +		 */
  1.3407 +		if (unlikely(expert_mode == 0 && rd_func)) {
  1.3408 +			unsigned long v = val;
  1.3409 +			ret = (*rd_func)(ctx->ctx_task, ctx, cnum, &v, regs);
  1.3410 +			if (ret) goto error;
  1.3411 +			val = v;
  1.3412 +			ret = -EINVAL;
  1.3413 +		}
  1.3414 +
  1.3415 +		PFM_REG_RETFLAG_SET(reg_flags, 0);
  1.3416 +
  1.3417 +		DPRINT(("pmd[%u]=0x%lx\n", cnum, val));
  1.3418 +
  1.3419 +		/*
  1.3420 +		 * update register return value, abort all if problem during copy.
  1.3421 +		 * we only modify the reg_flags field. no check mode is fine because
  1.3422 +		 * access has been verified upfront in sys_perfmonctl().
  1.3423 +		 */
  1.3424 +		req->reg_value            = val;
  1.3425 +		req->reg_flags            = reg_flags;
  1.3426 +		req->reg_last_reset_val   = lval;
  1.3427 +	}
  1.3428 +
  1.3429 +	return 0;
  1.3430 +
  1.3431 +error:
  1.3432 +	PFM_REG_RETFLAG_SET(req->reg_flags, PFM_REG_RETFL_EINVAL);
  1.3433 +	return ret;
  1.3434 +}
  1.3435 +
  1.3436 +int
  1.3437 +pfm_mod_write_pmcs(struct task_struct *task, void *req, unsigned int nreq, struct pt_regs *regs)
  1.3438 +{
  1.3439 +	pfm_context_t *ctx;
  1.3440 +
  1.3441 +	if (req == NULL) return -EINVAL;
  1.3442 +
  1.3443 + 	ctx = GET_PMU_CTX();
  1.3444 +
  1.3445 +	if (ctx == NULL) return -EINVAL;
  1.3446 +
  1.3447 +	/*
  1.3448 +	 * for now limit to current task, which is enough when calling
  1.3449 +	 * from overflow handler
  1.3450 +	 */
  1.3451 +	if (task != current && ctx->ctx_fl_system == 0) return -EBUSY;
  1.3452 +
  1.3453 +	return pfm_write_pmcs(ctx, req, nreq, regs);
  1.3454 +}
  1.3455 +EXPORT_SYMBOL(pfm_mod_write_pmcs);
  1.3456 +
  1.3457 +int
  1.3458 +pfm_mod_read_pmds(struct task_struct *task, void *req, unsigned int nreq, struct pt_regs *regs)
  1.3459 +{
  1.3460 +	pfm_context_t *ctx;
  1.3461 +
  1.3462 +	if (req == NULL) return -EINVAL;
  1.3463 +
  1.3464 + 	ctx = GET_PMU_CTX();
  1.3465 +
  1.3466 +	if (ctx == NULL) return -EINVAL;
  1.3467 +
  1.3468 +	/*
  1.3469 +	 * for now limit to current task, which is enough when calling
  1.3470 +	 * from overflow handler
  1.3471 +	 */
  1.3472 +	if (task != current && ctx->ctx_fl_system == 0) return -EBUSY;
  1.3473 +
  1.3474 +	return pfm_read_pmds(ctx, req, nreq, regs);
  1.3475 +}
  1.3476 +EXPORT_SYMBOL(pfm_mod_read_pmds);
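/*
 * Illustrative sketch only (not part of the imported file): a kernel module,
 * e.g. a sampling format or the oprofile glue, running from the PMU overflow
 * handler could read a virtualized counter of the current task with the
 * exported helper above. The pfarg_reg_t fields are the ones used in this
 * file; the PMD number and the function name are made up for the example.
 */
#if 0	/* example only */
static unsigned long example_read_counter(struct pt_regs *regs)
{
	pfarg_reg_t req;

	memset(&req, 0, sizeof(req));
	req.reg_num = 4;	/* hypothetical counting PMD */

	if (pfm_mod_read_pmds(current, &req, 1, regs))
		return 0UL;

	return req.reg_value;	/* full 64-bit virtual count */
}
#endif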
  1.3477 +
  1.3478 +/*
  1.3479 + * Only call this function when a process is trying to
  1.3480 + * write the debug registers (reading is always allowed)
  1.3481 + */
  1.3482 +int
  1.3483 +pfm_use_debug_registers(struct task_struct *task)
  1.3484 +{
  1.3485 +	pfm_context_t *ctx = task->thread.pfm_context;
  1.3486 +	unsigned long flags;
  1.3487 +	int ret = 0;
  1.3488 +
  1.3489 +	if (pmu_conf->use_rr_dbregs == 0) return 0;
  1.3490 +
  1.3491 +	DPRINT(("called for [%d]\n", task->pid));
  1.3492 +
  1.3493 +	/*
  1.3494 +	 * do it only once
  1.3495 +	 */
  1.3496 +	if (task->thread.flags & IA64_THREAD_DBG_VALID) return 0;
  1.3497 +
  1.3498 +	/*
  1.3499 +	 * Even on SMP, we do not need to use an atomic here because
  1.3500 +	 * the only way in is via ptrace() and this is possible only when the
  1.3501 +	 * process is stopped. Even in the case where the ctxsw out is not totally
  1.3502 +	 * completed by the time we come here, there is no way the 'stopped' process
  1.3503 +	 * could be in the middle of fiddling with the pfm_write_ibr_dbr() routine.
  1.3504 +	 * So this is always safe.
  1.3505 +	 */
  1.3506 +	if (ctx && ctx->ctx_fl_using_dbreg == 1) return -1;
  1.3507 +
  1.3508 +	LOCK_PFS(flags);
  1.3509 +
  1.3510 +	/*
  1.3511 +	 * We cannot allow setting breakpoints when system wide monitoring
  1.3512 +	 * sessions are using the debug registers.
  1.3513 +	 */
  1.3514 +	if (pfm_sessions.pfs_sys_use_dbregs > 0)
  1.3515 +		ret = -1;
  1.3516 +	else
  1.3517 +		pfm_sessions.pfs_ptrace_use_dbregs++;
  1.3518 +
  1.3519 +	DPRINT(("ptrace_use_dbregs=%u  sys_use_dbregs=%u by [%d] ret = %d\n",
  1.3520 +		  pfm_sessions.pfs_ptrace_use_dbregs,
  1.3521 +		  pfm_sessions.pfs_sys_use_dbregs,
  1.3522 +		  task->pid, ret));
  1.3523 +
  1.3524 +	UNLOCK_PFS(flags);
  1.3525 +
  1.3526 +	return ret;
  1.3527 +}
  1.3528 +
  1.3529 +/*
  1.3530 + * This function is called for every task that exits with the
  1.3531 + * IA64_THREAD_DBG_VALID set. This indicates a task which was
  1.3532 + * able to use the debug registers for debugging purposes via
  1.3533 + * ptrace(). Therefore we know it was not using them for
  1.3534 + * performance monitoring, so we only decrement the number
  1.3535 + * of "ptraced" debug register users to keep the count up to date.
  1.3536 + */
  1.3537 +int
  1.3538 +pfm_release_debug_registers(struct task_struct *task)
  1.3539 +{
  1.3540 +	unsigned long flags;
  1.3541 +	int ret;
  1.3542 +
  1.3543 +	if (pmu_conf->use_rr_dbregs == 0) return 0;
  1.3544 +
  1.3545 +	LOCK_PFS(flags);
  1.3546 +	if (pfm_sessions.pfs_ptrace_use_dbregs == 0) {
  1.3547 +		printk(KERN_ERR "perfmon: invalid release for [%d] ptrace_use_dbregs=0\n", task->pid);
  1.3548 +		ret = -1;
  1.3549 +	}  else {
  1.3550 +		pfm_sessions.pfs_ptrace_use_dbregs--;
  1.3551 +		ret = 0;
  1.3552 +	}
  1.3553 +	UNLOCK_PFS(flags);
  1.3554 +
  1.3555 +	return ret;
  1.3556 +}
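/*
 * Note on the accounting in the two functions above: pfs_ptrace_use_dbregs
 * and pfs_sys_use_dbregs are meant to be mutually exclusive, since ptrace()
 * debugging and perfmon use of the ibr/dbr registers cannot coexist; every
 * check-and-update of these counters is serialized by LOCK_PFS().
 */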
  1.3557 +
  1.3558 +static int
  1.3559 +pfm_restart(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
  1.3560 +{
  1.3561 +	struct task_struct *task;
  1.3562 +	pfm_buffer_fmt_t *fmt;
  1.3563 +	pfm_ovfl_ctrl_t rst_ctrl;
  1.3564 +	int state, is_system;
  1.3565 +	int ret = 0;
  1.3566 +
  1.3567 +	state     = ctx->ctx_state;
  1.3568 +	fmt       = ctx->ctx_buf_fmt;
  1.3569 +	is_system = ctx->ctx_fl_system;
  1.3570 +	task      = PFM_CTX_TASK(ctx);
  1.3571 +
  1.3572 +	switch(state) {
  1.3573 +		case PFM_CTX_MASKED:
  1.3574 +			break;
  1.3575 +		case PFM_CTX_LOADED: 
  1.3576 +			if (CTX_HAS_SMPL(ctx) && fmt->fmt_restart_active) break;
  1.3577 +			/* fall through */
  1.3578 +		case PFM_CTX_UNLOADED:
  1.3579 +		case PFM_CTX_ZOMBIE:
  1.3580 +			DPRINT(("invalid state=%d\n", state));
  1.3581 +			return -EBUSY;
  1.3582 +		default:
  1.3583 +			DPRINT(("state=%d, cannot operate (no active_restart handler)\n", state));
  1.3584 +			return -EINVAL;
  1.3585 +	}
  1.3586 +
  1.3587 +	/*
  1.3588 + 	 * In system wide and when the context is loaded, access can only happen
  1.3589 + 	 * when the caller is running on the CPU being monitored by the session.
  1.3590 + 	 * It does not have to be the owner (ctx_task) of the context per se.
  1.3591 + 	 */
  1.3592 +	if (is_system && ctx->ctx_cpu != smp_processor_id()) {
  1.3593 +		DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu));
  1.3594 +		return -EBUSY;
  1.3595 +	}
  1.3596 +
  1.3597 +	/* sanity check */
  1.3598 +	if (unlikely(task == NULL)) {
  1.3599 +		printk(KERN_ERR "perfmon: [%d] pfm_restart no task\n", current->pid);
  1.3600 +		return -EINVAL;
  1.3601 +	}
  1.3602 +
  1.3603 +	if (task == current || is_system) {
  1.3604 +
  1.3605 +		fmt = ctx->ctx_buf_fmt;
  1.3606 +
  1.3607 +		DPRINT(("restarting self %d ovfl=0x%lx\n",
  1.3608 +			task->pid,
  1.3609 +			ctx->ctx_ovfl_regs[0]));
  1.3610 +
  1.3611 +		if (CTX_HAS_SMPL(ctx)) {
  1.3612 +
  1.3613 +			prefetch(ctx->ctx_smpl_hdr);
  1.3614 +
  1.3615 +			rst_ctrl.bits.mask_monitoring = 0;
  1.3616 +			rst_ctrl.bits.reset_ovfl_pmds = 0;
  1.3617 +
  1.3618 +			if (state == PFM_CTX_LOADED)
  1.3619 +				ret = pfm_buf_fmt_restart_active(fmt, task, &rst_ctrl, ctx->ctx_smpl_hdr, regs);
  1.3620 +			else
  1.3621 +				ret = pfm_buf_fmt_restart(fmt, task, &rst_ctrl, ctx->ctx_smpl_hdr, regs);
  1.3622 +		} else {
  1.3623 +			rst_ctrl.bits.mask_monitoring = 0;
  1.3624 +			rst_ctrl.bits.reset_ovfl_pmds = 1;
  1.3625 +		}
  1.3626 +
  1.3627 +		if (ret == 0) {
  1.3628 +			if (rst_ctrl.bits.reset_ovfl_pmds)
  1.3629 +				pfm_reset_regs(ctx, ctx->ctx_ovfl_regs, PFM_PMD_LONG_RESET);
  1.3630 +
  1.3631 +			if (rst_ctrl.bits.mask_monitoring == 0) {
  1.3632 +				DPRINT(("resuming monitoring for [%d]\n", task->pid));
  1.3633 +
  1.3634 +				if (state == PFM_CTX_MASKED) pfm_restore_monitoring(task);
  1.3635 +			} else {
  1.3636 +				DPRINT(("keeping monitoring stopped for [%d]\n", task->pid));
  1.3637 +
  1.3638 +				// cannot use pfm_stop_monitoring(task, regs);
  1.3639 +			}
  1.3640 +		}
  1.3641 +		/*
  1.3642 +		 * clear overflowed PMD mask to remove any stale information
  1.3643 +		 */
  1.3644 +		ctx->ctx_ovfl_regs[0] = 0UL;
  1.3645 +
  1.3646 +		/*
  1.3647 +		 * back to LOADED state
  1.3648 +		 */
  1.3649 +		ctx->ctx_state = PFM_CTX_LOADED;
  1.3650 +
  1.3651 +		/*
  1.3652 +		 * XXX: not really useful for self monitoring
  1.3653 +		 */
  1.3654 +		ctx->ctx_fl_can_restart = 0;
  1.3655 +
  1.3656 +		return 0;
  1.3657 +	}
  1.3658 +
  1.3659 +	/* 
  1.3660 +	 * restart another task
  1.3661 +	 */
  1.3662 +
  1.3663 +	/*
  1.3664 +	 * When PFM_CTX_MASKED, we cannot issue a restart before the previous 
  1.3665 +	 * one is seen by the task.
  1.3666 +	 */
  1.3667 +	if (state == PFM_CTX_MASKED) {
  1.3668 +		if (ctx->ctx_fl_can_restart == 0) return -EINVAL;
  1.3669 +		/*
  1.3670 +		 * will prevent subsequent restart before this one is
  1.3671 +		 * seen by other task
  1.3672 +		 */
  1.3673 +		ctx->ctx_fl_can_restart = 0;
  1.3674 +	}
  1.3675 +
  1.3676 +	/*
  1.3677 +	 * if blocking, then signal the restart completion if PFM_CTX_MASKED, i.e.
  1.3678 +	 * the task is blocked or on its way to block. That's the normal
  1.3679 +	 * restart path. If the monitoring is not masked, then the task
  1.3680 +	 * can be actively monitoring and we cannot directly intervene.
  1.3681 +	 * Therefore we use the trap mechanism to catch the task and
  1.3682 +	 * force it to reset the buffer/reset PMDs.
  1.3683 +	 *
  1.3684 +	 * if non-blocking, then we ensure that the task will go into
  1.3685 +	 * pfm_handle_work() before returning to user mode.
  1.3686 +	 *
  1.3687 +	 * We cannot explicitly reset another task; it MUST always
  1.3688 +	 * be done by the task itself. This works for system wide because
  1.3689 +	 * the tool that is controlling the session is logically doing 
  1.3690 +	 * "self-monitoring".
  1.3691 +	 */
  1.3692 +	if (CTX_OVFL_NOBLOCK(ctx) == 0 && state == PFM_CTX_MASKED) {
  1.3693 +		DPRINT(("unblocking [%d] \n", task->pid));
  1.3694 +		complete(&ctx->ctx_restart_done);
  1.3695 +	} else {
  1.3696 +		DPRINT(("[%d] armed exit trap\n", task->pid));
  1.3697 +
  1.3698 +		ctx->ctx_fl_trap_reason = PFM_TRAP_REASON_RESET;
  1.3699 +
  1.3700 +		PFM_SET_WORK_PENDING(task, 1);
  1.3701 +
  1.3702 +		pfm_set_task_notify(task);
  1.3703 +
  1.3704 +		/*
  1.3705 +		 * XXX: send reschedule if task runs on another CPU
  1.3706 +		 */
  1.3707 +	}
  1.3708 +	return 0;
  1.3709 +}
  1.3710 +
  1.3711 +static int
  1.3712 +pfm_debug(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
  1.3713 +{
  1.3714 +	unsigned int m = *(unsigned int *)arg;
  1.3715 +
  1.3716 +	pfm_sysctl.debug = m == 0 ? 0 : 1;
  1.3717 +
  1.3718 +	printk(KERN_INFO "perfmon debugging %s (timing reset)\n", pfm_sysctl.debug ? "on" : "off");
  1.3719 +
  1.3720 +	if (m == 0) {
  1.3721 +		memset(pfm_stats, 0, sizeof(pfm_stats));
  1.3722 +		for(m=0; m < NR_CPUS; m++) pfm_stats[m].pfm_ovfl_intr_cycles_min = ~0UL;
  1.3723 +	}
  1.3724 +	return 0;
  1.3725 +}
  1.3726 +
  1.3727 +/*
  1.3728 + * arg can be NULL and count can be zero for this function
  1.3729 + */
  1.3730 +static int
  1.3731 +pfm_write_ibr_dbr(int mode, pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
  1.3732 +{
  1.3733 +	struct thread_struct *thread = NULL;
  1.3734 +	struct task_struct *task;
  1.3735 +	pfarg_dbreg_t *req = (pfarg_dbreg_t *)arg;
  1.3736 +	unsigned long flags;
  1.3737 +	dbreg_t dbreg;
  1.3738 +	unsigned int rnum;
  1.3739 +	int first_time;
  1.3740 +	int ret = 0, state;
  1.3741 +	int i, can_access_pmu = 0;
  1.3742 +	int is_system, is_loaded;
  1.3743 +
  1.3744 +	if (pmu_conf->use_rr_dbregs == 0) return -EINVAL;
  1.3745 +
  1.3746 +	state     = ctx->ctx_state;
  1.3747 +	is_loaded = state == PFM_CTX_LOADED ? 1 : 0;
  1.3748 +	is_system = ctx->ctx_fl_system;
  1.3749 +	task      = ctx->ctx_task;
  1.3750 +
  1.3751 +	if (state == PFM_CTX_ZOMBIE) return -EINVAL;
  1.3752 +
  1.3753 +	/*
  1.3754 +	 * on both UP and SMP, we can only write to the debug registers when the task is
  1.3755 +	 * the owner of the local PMU.
  1.3756 +	 */
  1.3757 +	if (is_loaded) {
  1.3758 +		thread = &task->thread;
  1.3759 +		/*
  1.3760 +		 * In system wide and when the context is loaded, access can only happen
  1.3761 +		 * when the caller is running on the CPU being monitored by the session.
  1.3762 +		 * It does not have to be the owner (ctx_task) of the context per se.
  1.3763 +		 */
  1.3764 +		if (unlikely(is_system && ctx->ctx_cpu != smp_processor_id())) {
  1.3765 +			DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu));
  1.3766 +			return -EBUSY;
  1.3767 +		}
  1.3768 +		can_access_pmu = GET_PMU_OWNER() == task || is_system ? 1 : 0;
  1.3769 +	}
  1.3770 +
  1.3771 +	/*
  1.3772 +	 * we do not need to check for ipsr.db because we clear ibr.x, dbr.r, and dbr.w,
  1.3773 +	 * ensuring that no real breakpoint can be installed via this call.
  1.3774 +	 *
  1.3775 +	 * IMPORTANT: regs can be NULL in this function
  1.3776 +	 */
  1.3777 +
  1.3778 +	first_time = ctx->ctx_fl_using_dbreg == 0;
  1.3779 +
  1.3780 +	/*
  1.3781 +	 * don't bother if we are loaded and task is being debugged
  1.3782 +	 */
  1.3783 +	if (is_loaded && (thread->flags & IA64_THREAD_DBG_VALID) != 0) {
  1.3784 +		DPRINT(("debug registers already in use for [%d]\n", task->pid));
  1.3785 +		return -EBUSY;
  1.3786 +	}
  1.3787 +
  1.3788 +	/*
  1.3789 +	 * check for debug registers in system wide mode
  1.3790 +	 *
  1.3791 + * Even though a check is done in pfm_context_load(),
  1.3792 + * we must repeat it here, in case the registers are
  1.3793 + * written after the context is loaded.
  1.3794 +	 */
  1.3795 +	if (is_loaded) {
  1.3796 +		LOCK_PFS(flags);
  1.3797 +
  1.3798 +		if (first_time && is_system) {
  1.3799 +			if (pfm_sessions.pfs_ptrace_use_dbregs)
  1.3800 +				ret = -EBUSY;
  1.3801 +			else
  1.3802 +				pfm_sessions.pfs_sys_use_dbregs++;
  1.3803 +		}
  1.3804 +		UNLOCK_PFS(flags);
  1.3805 +	}
  1.3806 +
  1.3807 +	if (ret != 0) return ret;
  1.3808 +
  1.3809 +	/*
  1.3810 +	 * mark ourself as user of the debug registers for
  1.3811 +	 * perfmon purposes.
  1.3812 +	 */
  1.3813 +	ctx->ctx_fl_using_dbreg = 1;
  1.3814 +
  1.3815 +	/*
  1.3816 + 	 * clear hardware registers to make sure we don't
  1.3817 + 	 * pick up stale state.
  1.3818 +	 *
  1.3819 +	 * for a system wide session, we do not use
  1.3820 +	 * thread.dbr, thread.ibr because this process
  1.3821 +	 * never leaves the current CPU and the state
  1.3822 +	 * is shared by all processes running on it
  1.3823 + 	 */
  1.3824 +	if (first_time && can_access_pmu) {
  1.3825 +		DPRINT(("[%d] clearing ibrs, dbrs\n", task->pid));
  1.3826 +		for (i=0; i < pmu_conf->num_ibrs; i++) {
  1.3827 +			ia64_set_ibr(i, 0UL);
  1.3828 +			ia64_dv_serialize_instruction();
  1.3829 +		}
  1.3830 +		ia64_srlz_i();
  1.3831 +		for (i=0; i < pmu_conf->num_dbrs; i++) {
  1.3832 +			ia64_set_dbr(i, 0UL);
  1.3833 +			ia64_dv_serialize_data();
  1.3834 +		}
  1.3835 +		ia64_srlz_d();
  1.3836 +	}
  1.3837 +
  1.3838 +	/*
  1.3839 +	 * Now install the values into the registers
  1.3840 +	 */
  1.3841 +	for (i = 0; i < count; i++, req++) {
  1.3842 +
  1.3843 +		rnum      = req->dbreg_num;
  1.3844 +		dbreg.val = req->dbreg_value;
  1.3845 +
  1.3846 +		ret = -EINVAL;
  1.3847 +
  1.3848 +		if ((mode == PFM_CODE_RR && rnum >= PFM_NUM_IBRS) || ((mode == PFM_DATA_RR) && rnum >= PFM_NUM_DBRS)) {
  1.3849 +			DPRINT(("invalid register %u val=0x%lx mode=%d i=%d count=%d\n",
  1.3850 +				  rnum, dbreg.val, mode, i, count));
  1.3851 +
  1.3852 +			goto abort_mission;
  1.3853 +		}
  1.3854 +
  1.3855 +		/*
  1.3856 +		 * make sure we do not install an enabled breakpoint
  1.3857 +		 */
  1.3858 +		if (rnum & 0x1) {
  1.3859 +			if (mode == PFM_CODE_RR)
  1.3860 +				dbreg.ibr.ibr_x = 0;
  1.3861 +			else
  1.3862 +				dbreg.dbr.dbr_r = dbreg.dbr.dbr_w = 0;
  1.3863 +		}
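		/*
		 * note: the ibr/dbr registers come in address/control pairs;
		 * the odd-numbered register of each pair carries the enable
		 * bits (ibr.x, dbr.r/dbr.w), which is why only odd rnum values
		 * are sanitized above.
		 */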
  1.3864 +
  1.3865 +		PFM_REG_RETFLAG_SET(req->dbreg_flags, 0);
  1.3866 +
  1.3867 +		/*
  1.3868 +		 * Debug registers, just like PMC, can only be modified
  1.3869 +		 * by a kernel call. Moreover, perfmon() accesses to those
  1.3870 +		 * registers are centralized in this routine. The hardware
  1.3871 +		 * does not modify the value of these registers, therefore,
  1.3872 +		 * if we save them as they are written, we can avoid having
  1.3873 +		 * to save them on context switch out. This is made possible
  1.3874 +		 * by the fact that when perfmon uses debug registers, ptrace()
  1.3875 +		 * won't be able to modify them concurrently.
  1.3876 +		 */
  1.3877 +		if (mode == PFM_CODE_RR) {
  1.3878 +			CTX_USED_IBR(ctx, rnum);
  1.3879 +
  1.3880 +			if (can_access_pmu) {
  1.3881 +				ia64_set_ibr(rnum, dbreg.val);
  1.3882 +				ia64_dv_serialize_instruction();
  1.3883 +			}
  1.3884 +
  1.3885 +			ctx->ctx_ibrs[rnum] = dbreg.val;
  1.3886 +
  1.3887 +			DPRINT(("write ibr%u=0x%lx used_ibrs=0x%x ld=%d apmu=%d\n",
  1.3888 +				rnum, dbreg.val, ctx->ctx_used_ibrs[0], is_loaded, can_access_pmu));
  1.3889 +		} else {
  1.3890 +			CTX_USED_DBR(ctx, rnum);
  1.3891 +
  1.3892 +			if (can_access_pmu) {
  1.3893 +				ia64_set_dbr(rnum, dbreg.val);
  1.3894 +				ia64_dv_serialize_data();
  1.3895 +			}
  1.3896 +			ctx->ctx_dbrs[rnum] = dbreg.val;
  1.3897 +
  1.3898 +			DPRINT(("write dbr%u=0x%lx used_dbrs=0x%x ld=%d apmu=%d\n",
  1.3899 +				rnum, dbreg.val, ctx->ctx_used_dbrs[0], is_loaded, can_access_pmu));
  1.3900 +		}
  1.3901 +	}
  1.3902 +
  1.3903 +	return 0;
  1.3904 +
  1.3905 +abort_mission:
  1.3906 +	/*
  1.3907 +	 * in case it was our first attempt, we undo the global modifications
  1.3908 +	 */
  1.3909 +	if (first_time) {
  1.3910 +		LOCK_PFS(flags);
  1.3911 +		if (ctx->ctx_fl_system) {
  1.3912 +			pfm_sessions.pfs_sys_use_dbregs--;
  1.3913 +		}
  1.3914 +		UNLOCK_PFS(flags);
  1.3915 +		ctx->ctx_fl_using_dbreg = 0;
  1.3916 +	}
  1.3917 +	/*
  1.3918 +	 * install error return flag
  1.3919 +	 */
  1.3920 +	PFM_REG_RETFLAG_SET(req->dbreg_flags, PFM_REG_RETFL_EINVAL);
  1.3921 +
  1.3922 +	return ret;
  1.3923 +}
  1.3924 +
  1.3925 +static int
  1.3926 +pfm_write_ibrs(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
  1.3927 +{
  1.3928 +	return pfm_write_ibr_dbr(PFM_CODE_RR, ctx, arg, count, regs);
  1.3929 +}
  1.3930 +
  1.3931 +static int
  1.3932 +pfm_write_dbrs(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
  1.3933 +{
  1.3934 +	return pfm_write_ibr_dbr(PFM_DATA_RR, ctx, arg, count, regs);
  1.3935 +}
  1.3936 +
  1.3937 +int
  1.3938 +pfm_mod_write_ibrs(struct task_struct *task, void *req, unsigned int nreq, struct pt_regs *regs)
  1.3939 +{
  1.3940 +	pfm_context_t *ctx;
  1.3941 +
  1.3942 +	if (req == NULL) return -EINVAL;
  1.3943 +
  1.3944 + 	ctx = GET_PMU_CTX();
  1.3945 +
  1.3946 +	if (ctx == NULL) return -EINVAL;
  1.3947 +
  1.3948 +	/*
  1.3949 +	 * for now limit to current task, which is enough when calling
  1.3950 +	 * from overflow handler
  1.3951 +	 */
  1.3952 +	if (task != current && ctx->ctx_fl_system == 0) return -EBUSY;
  1.3953 +
  1.3954 +	return pfm_write_ibrs(ctx, req, nreq, regs);
  1.3955 +}
  1.3956 +EXPORT_SYMBOL(pfm_mod_write_ibrs);
  1.3957 +
  1.3958 +int
  1.3959 +pfm_mod_write_dbrs(struct task_struct *task, void *req, unsigned int nreq, struct pt_regs *regs)
  1.3960 +{
  1.3961 +	pfm_context_t *ctx;
  1.3962 +
  1.3963 +	if (req == NULL) return -EINVAL;
  1.3964 +
  1.3965 + 	ctx = GET_PMU_CTX();
  1.3966 +
  1.3967 +	if (ctx == NULL) return -EINVAL;
  1.3968 +
  1.3969 +	/*
  1.3970 +	 * for now limit to current task, which is enough when calling
  1.3971 +	 * from overflow handler
  1.3972 +	 */
  1.3973 +	if (task != current && ctx->ctx_fl_system == 0) return -EBUSY;
  1.3974 +
  1.3975 +	return pfm_write_dbrs(ctx, req, nreq, regs);
  1.3976 +}
  1.3977 +EXPORT_SYMBOL(pfm_mod_write_dbrs);
  1.3978 +
  1.3979 +
  1.3980 +static int
  1.3981 +pfm_get_features(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
  1.3982 +{
  1.3983 +	pfarg_features_t *req = (pfarg_features_t *)arg;
  1.3984 +
  1.3985 +	req->ft_version = PFM_VERSION;
  1.3986 +	return 0;
  1.3987 +}
  1.3988 +
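/*
 * Rough summary of the stop/start commands below: system-wide sessions are
 * gated by psr.pp (with dcr.pp supplying the psr.pp value taken on
 * interruption delivery), while per-task sessions are gated by psr.up. Which
 * bit actually controls a given counter depends on how its PMC was
 * programmed.
 */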
  1.3989 +static int
  1.3990 +pfm_stop(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
  1.3991 +{
  1.3992 +	struct pt_regs *tregs;
  1.3993 +	struct task_struct *task = PFM_CTX_TASK(ctx);
  1.3994 +	int state, is_system;
  1.3995 +
  1.3996 +	state     = ctx->ctx_state;
  1.3997 +	is_system = ctx->ctx_fl_system;
  1.3998 +
  1.3999 +	/*
  1.4000 +	 * context must be attached to issue the stop command (includes LOADED,MASKED,ZOMBIE)
  1.4001 +	 */
  1.4002 +	if (state == PFM_CTX_UNLOADED) return -EINVAL;
  1.4003 +
  1.4004 +	/*
  1.4005 + 	 * In system wide and when the context is loaded, access can only happen
  1.4006 + 	 * when the caller is running on the CPU being monitored by the session.
  1.4007 + 	 * It does not have to be the owner (ctx_task) of the context per se.
  1.4008 + 	 */
  1.4009 +	if (is_system && ctx->ctx_cpu != smp_processor_id()) {
  1.4010 +		DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu));
  1.4011 +		return -EBUSY;
  1.4012 +	}
  1.4013 +	DPRINT(("task [%d] ctx_state=%d is_system=%d\n",
  1.4014 +		PFM_CTX_TASK(ctx)->pid,
  1.4015 +		state,
  1.4016 +		is_system));
  1.4017 +	/*
  1.4018 +	 * in system mode, we need to update the PMU directly
  1.4019 +	 * and the user level state of the caller, which may not
  1.4020 +	 * necessarily be the creator of the context.
  1.4021 +	 */
  1.4022 +	if (is_system) {
  1.4023 +		/*
  1.4024 +		 * Update local PMU first
  1.4025 +		 *
  1.4026 +		 * disable dcr pp
  1.4027 +		 */
  1.4028 +		ia64_setreg(_IA64_REG_CR_DCR, ia64_getreg(_IA64_REG_CR_DCR) & ~IA64_DCR_PP);
  1.4029 +		ia64_srlz_i();
  1.4030 +
  1.4031 +		/*
  1.4032 +		 * update local cpuinfo
  1.4033 +		 */
  1.4034 +		PFM_CPUINFO_CLEAR(PFM_CPUINFO_DCR_PP);
  1.4035 +
  1.4036 +		/*
  1.4037 +		 * stop monitoring, does srlz.i
  1.4038 +		 */
  1.4039 +		pfm_clear_psr_pp();
  1.4040 +
  1.4041 +		/*
  1.4042 +		 * stop monitoring in the caller
  1.4043 +		 */
  1.4044 +		ia64_psr(regs)->pp = 0;
  1.4045 +
  1.4046 +		return 0;
  1.4047 +	}
  1.4048 +	/*
  1.4049 +	 * per-task mode
  1.4050 +	 */
  1.4051 +
  1.4052 +	if (task == current) {
  1.4053 +		/* stop monitoring  at kernel level */
  1.4054 +		pfm_clear_psr_up();
  1.4055 +
  1.4056 +		/*
  1.4057 +	 	 * stop monitoring at the user level
  1.4058 +	 	 */
  1.4059 +		ia64_psr(regs)->up = 0;
  1.4060 +	} else {
  1.4061 +		tregs = task_pt_regs(task);
  1.4062 +
  1.4063 +		/*
  1.4064 +	 	 * stop monitoring at the user level
  1.4065 +	 	 */
  1.4066 +		ia64_psr(tregs)->up = 0;
  1.4067 +
  1.4068 +		/*
  1.4069 +		 * monitoring disabled in kernel at next reschedule
  1.4070 +		 */
  1.4071 +		ctx->ctx_saved_psr_up = 0;
  1.4072 +		DPRINT(("task=[%d]\n", task->pid));
  1.4073 +	}
  1.4074 +	return 0;
  1.4075 +}
  1.4076 +
  1.4077 +
  1.4078 +static int
  1.4079 +pfm_start(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
  1.4080 +{
  1.4081 +	struct pt_regs *tregs;
  1.4082 +	int state, is_system;
  1.4083 +
  1.4084 +	state     = ctx->ctx_state;
  1.4085 +	is_system = ctx->ctx_fl_system;
  1.4086 +
  1.4087 +	if (state != PFM_CTX_LOADED) return -EINVAL;
  1.4088 +
  1.4089 +	/*
  1.4090 + 	 * In system wide and when the context is loaded, access can only happen
  1.4091 + 	 * when the caller is running on the CPU being monitored by the session.
  1.4092 + 	 * It does not have to be the owner (ctx_task) of the context per se.
  1.4093 + 	 */
  1.4094 +	if (is_system && ctx->ctx_cpu != smp_processor_id()) {
  1.4095 +		DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu));
  1.4096 +		return -EBUSY;
  1.4097 +	}
  1.4098 +
  1.4099 +	/*
  1.4100 +	 * in system mode, we need to update the PMU directly
  1.4101 +	 * and the user level state of the caller, which may not
  1.4102 +	 * necessarily be the creator of the context.
  1.4103 +	 */
  1.4104 +	if (is_system) {
  1.4105 +
  1.4106 +		/*
  1.4107 +		 * set user level psr.pp for the caller
  1.4108 +		 */
  1.4109 +		ia64_psr(regs)->pp = 1;
  1.4110 +
  1.4111 +		/*
  1.4112 +		 * now update the local PMU and cpuinfo
  1.4113 +		 */
  1.4114 +		PFM_CPUINFO_SET(PFM_CPUINFO_DCR_PP);
  1.4115 +
  1.4116 +		/*
  1.4117 +		 * start monitoring at kernel level
  1.4118 +		 */
  1.4119 +		pfm_set_psr_pp();
  1.4120 +
  1.4121 +		/* enable dcr pp */
  1.4122 +		ia64_setreg(_IA64_REG_CR_DCR, ia64_getreg(_IA64_REG_CR_DCR) | IA64_DCR_PP);
  1.4123 +		ia64_srlz_i();
  1.4124 +
  1.4125 +		return 0;
  1.4126 +	}
  1.4127 +
  1.4128 +	/*
  1.4129 +	 * per-process mode
  1.4130 +	 */
  1.4131 +
  1.4132 +	if (ctx->ctx_task == current) {
  1.4133 +
  1.4134 +		/* start monitoring at kernel level */
  1.4135 +		pfm_set_psr_up();
  1.4136 +
  1.4137 +		/*
  1.4138 +		 * activate monitoring at user level
  1.4139 +		 */
  1.4140 +		ia64_psr(regs)->up = 1;
  1.4141 +
  1.4142 +	} else {
  1.4143 +		tregs = task_pt_regs(ctx->ctx_task);
  1.4144 +
  1.4145 +		/*
  1.4146 +		 * start monitoring at the kernel level the next
  1.4147 +		 * time the task is scheduled
  1.4148 +		 */
  1.4149 +		ctx->ctx_saved_psr_up = IA64_PSR_UP;
  1.4150 +
  1.4151 +		/*
  1.4152 +		 * activate monitoring at user level
  1.4153 +		 */
  1.4154 +		ia64_psr(tregs)->up = 1;
  1.4155 +	}
  1.4156 +	return 0;
  1.4157 +}
  1.4158 +
  1.4159 +static int
  1.4160 +pfm_get_pmc_reset(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
  1.4161 +{
  1.4162 +	pfarg_reg_t *req = (pfarg_reg_t *)arg;
  1.4163 +	unsigned int cnum;
  1.4164 +	int i;
  1.4165 +	int ret = -EINVAL;
  1.4166 +
  1.4167 +	for (i = 0; i < count; i++, req++) {
  1.4168 +
  1.4169 +		cnum = req->reg_num;
  1.4170 +
  1.4171 +		if (!PMC_IS_IMPL(cnum)) goto abort_mission;
  1.4172 +
  1.4173 +		req->reg_value = PMC_DFL_VAL(cnum);
  1.4174 +
  1.4175 +		PFM_REG_RETFLAG_SET(req->reg_flags, 0);
  1.4176 +
  1.4177 +		DPRINT(("pmc_reset_val pmc[%u]=0x%lx\n", cnum, req->reg_value));
  1.4178 +	}
  1.4179 +	return 0;
  1.4180 +
  1.4181 +abort_mission:
  1.4182 +	PFM_REG_RETFLAG_SET(req->reg_flags, PFM_REG_RETFL_EINVAL);
  1.4183 +	return ret;
  1.4184 +}
  1.4185 +
  1.4186 +static int
  1.4187 +pfm_check_task_exist(pfm_context_t *ctx)
  1.4188 +{
  1.4189 +	struct task_struct *g, *t;
  1.4190 +	int ret = -ESRCH;
  1.4191 +
  1.4192 +	read_lock(&tasklist_lock);
  1.4193 +
  1.4194 +	do_each_thread (g, t) {
  1.4195 +		if (t->thread.pfm_context == ctx) {
  1.4196 +			ret = 0;
  1.4197 +			break;
  1.4198 +		}
  1.4199 +	} while_each_thread (g, t);
  1.4200 +
  1.4201 +	read_unlock(&tasklist_lock);
  1.4202 +
  1.4203 +	DPRINT(("pfm_check_task_exist: ret=%d ctx=%p\n", ret, ctx));
  1.4204 +
  1.4205 +	return ret;
  1.4206 +}
  1.4207 +
  1.4208 +static int
  1.4209 +pfm_context_load(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
  1.4210 +{
  1.4211 +	struct task_struct *task;
  1.4212 +	struct thread_struct *thread;
  1.4213 +	struct pfm_context_t *old;
  1.4214 +	unsigned long flags;
  1.4215 +#ifndef CONFIG_SMP
  1.4216 +	struct task_struct *owner_task = NULL;
  1.4217 +#endif
  1.4218 +	pfarg_load_t *req = (pfarg_load_t *)arg;
  1.4219 +	unsigned long *pmcs_source, *pmds_source;
  1.4220 +	int the_cpu;
  1.4221 +	int ret = 0;
  1.4222 +	int state, is_system, set_dbregs = 0;
  1.4223 +
  1.4224 +	state     = ctx->ctx_state;
  1.4225 +	is_system = ctx->ctx_fl_system;
  1.4226 +	/*
  1.4227 +	 * can only load from unloaded or terminated state
  1.4228 +	 */
  1.4229 +	if (state != PFM_CTX_UNLOADED) {
  1.4230 +		DPRINT(("cannot load to [%d], invalid ctx_state=%d\n",
  1.4231 +			req->load_pid,
  1.4232 +			ctx->ctx_state));
  1.4233 +		return -EBUSY;
  1.4234 +	}
  1.4235 +
  1.4236 +	DPRINT(("load_pid [%d] using_dbreg=%d\n", req->load_pid, ctx->ctx_fl_using_dbreg));
  1.4237 +
  1.4238 +	if (CTX_OVFL_NOBLOCK(ctx) == 0 && req->load_pid == current->pid) {
  1.4239 +		DPRINT(("cannot use blocking mode on self\n"));
  1.4240 +		return -EINVAL;
  1.4241 +	}
  1.4242 +
  1.4243 +	ret = pfm_get_task(ctx, req->load_pid, &task);
  1.4244 +	if (ret) {
  1.4245 +		DPRINT(("load_pid [%d] get_task=%d\n", req->load_pid, ret));
  1.4246 +		return ret;
  1.4247 +	}
  1.4248 +
  1.4249 +	ret = -EINVAL;
  1.4250 +
  1.4251 +	/*
  1.4252 +	 * system wide is self monitoring only
  1.4253 +	 */
  1.4254 +	if (is_system && task != current) {
  1.4255 +		DPRINT(("system wide is self monitoring only load_pid=%d\n",
  1.4256 +			req->load_pid));
  1.4257 +		goto error;
  1.4258 +	}
  1.4259 +
  1.4260 +	thread = &task->thread;
  1.4261 +
  1.4262 +	ret = 0;
  1.4263 +	/*
  1.4264 +	 * cannot load a context which is using range restrictions
  1.4265 +	 * into a task that is being debugged.
  1.4266 +	 */
  1.4267 +	if (ctx->ctx_fl_using_dbreg) {
  1.4268 +		if (thread->flags & IA64_THREAD_DBG_VALID) {
  1.4269 +			ret = -EBUSY;
  1.4270 +			DPRINT(("load_pid [%d] task is debugged, cannot load range restrictions\n", req->load_pid));
  1.4271 +			goto error;
  1.4272 +		}
  1.4273 +		LOCK_PFS(flags);
  1.4274 +
  1.4275 +		if (is_system) {
  1.4276 +			if (pfm_sessions.pfs_ptrace_use_dbregs) {
  1.4277 +				DPRINT(("cannot load [%d] dbregs in use\n", task->pid));
  1.4278 +				ret = -EBUSY;
  1.4279 +			} else {
  1.4280 +				pfm_sessions.pfs_sys_use_dbregs++;
  1.4281 +				DPRINT(("load [%d] increased sys_use_dbreg=%u\n", task->pid, pfm_sessions.pfs_sys_use_dbregs));
  1.4282 +				set_dbregs = 1;
  1.4283 +			}
  1.4284 +		}
  1.4285 +
  1.4286 +		UNLOCK_PFS(flags);
  1.4287 +
  1.4288 +		if (ret) goto error;
  1.4289 +	}
  1.4290 +
  1.4291 +	/*
  1.4292 +	 * SMP system-wide monitoring implies self-monitoring.
  1.4293 +	 *
  1.4294 +	 * The programming model expects the task to
  1.4295 +	 * be pinned on a CPU throughout the session.
  1.4296 +	 * Here we take note of the current CPU at the
  1.4297 +	 * time the context is loaded. No call from
  1.4298 +	 * another CPU will be allowed.
  1.4299 +	 *
  1.4300 +	 * The pinning via sched_setaffinity()
  1.4301 +	 * must be done by the calling task prior
  1.4302 +	 * to this call.
  1.4303 +	 *
  1.4304 +	 * systemwide: keep track of CPU this session is supposed to run on
  1.4305 +	 */
  1.4306 +	the_cpu = ctx->ctx_cpu = smp_processor_id();
  1.4307 +
  1.4308 +	ret = -EBUSY;
  1.4309 +	/*
  1.4310 +	 * now reserve the session
  1.4311 +	 */
  1.4312 +	ret = pfm_reserve_session(current, is_system, the_cpu);
  1.4313 +	if (ret) goto error;
  1.4314 +
  1.4315 +	/*
  1.4316 +	 * task is necessarily stopped at this point.
  1.4317 +	 *
  1.4318 +	 * If the previous context was zombie, then it got removed in
  1.4319 +	 * pfm_save_regs(). Therefore we should not see it here.
  1.4320 +	 * If we see a context, then this is an active context
  1.4321 +	 *
  1.4322 +	 * XXX: needs to be atomic
  1.4323 +	 */
  1.4324 +	DPRINT(("before cmpxchg() old_ctx=%p new_ctx=%p\n",
  1.4325 +		thread->pfm_context, ctx));
  1.4326 +
  1.4327 +	ret = -EBUSY;
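	/*
	 * the cmpxchg atomically claims the thread's pfm_context slot: the
	 * attach only succeeds if no other context was installed in the
	 * meantime.
	 */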
  1.4328 +	old = ia64_cmpxchg(acq, &thread->pfm_context, NULL, ctx, sizeof(pfm_context_t *));
  1.4329 +	if (old != NULL) {
  1.4330 +		DPRINT(("load_pid [%d] already has a context\n", req->load_pid));
  1.4331 +		goto error_unres;
  1.4332 +	}
  1.4333 +
  1.4334 +	pfm_reset_msgq(ctx);
  1.4335 +
  1.4336 +	ctx->ctx_state = PFM_CTX_LOADED;
  1.4337 +
  1.4338 +	/*
  1.4339 +	 * link context to task
  1.4340 +	 */
  1.4341 +	ctx->ctx_task = task;
  1.4342 +
  1.4343 +	if (is_system) {
  1.4344 +		/*
  1.4345 +		 * we load as stopped
  1.4346 +		 */
  1.4347 +		PFM_CPUINFO_SET(PFM_CPUINFO_SYST_WIDE);
  1.4348 +		PFM_CPUINFO_CLEAR(PFM_CPUINFO_DCR_PP);
  1.4349 +
  1.4350 +		if (ctx->ctx_fl_excl_idle) PFM_CPUINFO_SET(PFM_CPUINFO_EXCL_IDLE);
  1.4351 +	} else {
  1.4352 +		thread->flags |= IA64_THREAD_PM_VALID;
  1.4353 +	}
  1.4354 +
  1.4355 +	/*
  1.4356 +	 * propagate into thread-state
  1.4357 +	 */
  1.4358 +	pfm_copy_pmds(task, ctx);
  1.4359 +	pfm_copy_pmcs(task, ctx);
  1.4360 +
  1.4361 +	pmcs_source = thread->pmcs;
  1.4362 +	pmds_source = thread->pmds;
  1.4363 +
  1.4364 +	/*
  1.4365 +	 * always the case for system-wide
  1.4366 +	 */
  1.4367 +	if (task == current) {
  1.4368 +
  1.4369 +		if (is_system == 0) {
  1.4370 +
  1.4371 +			/* allow user level control */
  1.4372 +			ia64_psr(regs)->sp = 0;
  1.4373 +			DPRINT(("clearing psr.sp for [%d]\n", task->pid));
  1.4374 +
  1.4375 +			SET_LAST_CPU(ctx, smp_processor_id());
  1.4376 +			INC_ACTIVATION();
  1.4377 +			SET_ACTIVATION(ctx);
  1.4378 +#ifndef CONFIG_SMP
  1.4379 +			/*
  1.4380 +			 * push the other task out, if any
  1.4381 +			 */
  1.4382 +			owner_task = GET_PMU_OWNER();
  1.4383 +			if (owner_task) pfm_lazy_save_regs(owner_task);
  1.4384 +#endif
  1.4385 +		}
  1.4386 +		/*
  1.4387 +		 * load all PMD from ctx to PMU (as opposed to thread state)
  1.4388 +		 * restore all PMC from ctx to PMU
  1.4389 +		 */
  1.4390 +		pfm_restore_pmds(pmds_source, ctx->ctx_all_pmds[0]);
  1.4391 +		pfm_restore_pmcs(pmcs_source, ctx->ctx_all_pmcs[0]);
  1.4392 +
  1.4393 +		ctx->ctx_reload_pmcs[0] = 0UL;
  1.4394 +		ctx->ctx_reload_pmds[0] = 0UL;
  1.4395 +
  1.4396 +		/*
  1.4397 +		 * guaranteed safe by earlier check against DBG_VALID
  1.4398 +		 */
  1.4399 +		if (ctx->ctx_fl_using_dbreg) {
  1.4400 +			pfm_restore_ibrs(ctx->ctx_ibrs, pmu_conf->num_ibrs);
  1.4401 +			pfm_restore_dbrs(ctx->ctx_dbrs, pmu_conf->num_dbrs);
  1.4402 +		}
  1.4403 +		/*
  1.4404 +		 * set new ownership
  1.4405 +		 */
  1.4406 +		SET_PMU_OWNER(task, ctx);
  1.4407 +
  1.4408 +		DPRINT(("context loaded on PMU for [%d]\n", task->pid));
  1.4409 +	} else {
  1.4410 +		/*
  1.4411 +		 * when not current, task MUST be stopped, so this is safe
  1.4412 +		 */
  1.4413 +		regs = task_pt_regs(task);
  1.4414 +
  1.4415 +		/* force a full reload */
  1.4416 +		ctx->ctx_last_activation = PFM_INVALID_ACTIVATION;
  1.4417 +		SET_LAST_CPU(ctx, -1);
  1.4418 +
  1.4419 +		/* initial saved psr (stopped) */
  1.4420 +		ctx->ctx_saved_psr_up = 0UL;
  1.4421 +		ia64_psr(regs)->up = ia64_psr(regs)->pp = 0;
  1.4422 +	}
  1.4423 +
  1.4424 +	ret = 0;
  1.4425 +
  1.4426 +error_unres:
  1.4427 +	if (ret) pfm_unreserve_session(ctx, ctx->ctx_fl_system, the_cpu);
  1.4428 +error:
  1.4429 +	/*
  1.4430 +	 * we must undo the dbregs setting (for system-wide)
  1.4431 +	 */
  1.4432 +	if (ret && set_dbregs) {
  1.4433 +		LOCK_PFS(flags);
  1.4434 +		pfm_sessions.pfs_sys_use_dbregs--;
  1.4435 +		UNLOCK_PFS(flags);
  1.4436 +	}
  1.4437 +	/*
  1.4438 +	 * release task, there is now a link with the context
  1.4439 +	 */
  1.4440 +	if (is_system == 0 && task != current) {
  1.4441 +		pfm_put_task(task);
  1.4442 +
  1.4443 +		if (ret == 0) {
  1.4444 +			ret = pfm_check_task_exist(ctx);
  1.4445 +			if (ret) {
  1.4446 +				ctx->ctx_state = PFM_CTX_UNLOADED;
  1.4447 +				ctx->ctx_task  = NULL;
  1.4448 +			}
  1.4449 +		}
  1.4450 +	}
  1.4451 +	return ret;
  1.4452 +}
  1.4453 +
  1.4454 +/*
  1.4455 + * in this function, we do not need to increase the use count
  1.4456 + * for the task via get_task_struct(), because we hold the
  1.4457 + * context lock. If the task were to disappear while having
  1.4458 + * a context attached, it would go through pfm_exit_thread()
  1.4459 + * which also grabs the context lock  and would therefore be blocked
  1.4460 + * until we are here.
  1.4461 + */
  1.4462 +static void pfm_flush_pmds(struct task_struct *, pfm_context_t *ctx);
  1.4463 +
  1.4464 +static int
  1.4465 +pfm_context_unload(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
  1.4466 +{
  1.4467 +	struct task_struct *task = PFM_CTX_TASK(ctx);
  1.4468 +	struct pt_regs *tregs;
  1.4469 +	int prev_state, is_system;
  1.4470 +	int ret;
  1.4471 +
  1.4472 +	DPRINT(("ctx_state=%d task [%d]\n", ctx->ctx_state, task ? task->pid : -1));
  1.4473 +
  1.4474 +	prev_state = ctx->ctx_state;
  1.4475 +	is_system  = ctx->ctx_fl_system;
  1.4476 +
  1.4477 +	/*
  1.4478 +	 * unload only when necessary
  1.4479 +	 */
  1.4480 +	if (prev_state == PFM_CTX_UNLOADED) {
  1.4481 +		DPRINT(("ctx_state=%d, nothing to do\n", prev_state));
  1.4482 +		return 0;
  1.4483 +	}
  1.4484 +
  1.4485 +	/*
  1.4486 +	 * clear psr and dcr bits
  1.4487 +	 */
  1.4488 +	ret = pfm_stop(ctx, NULL, 0, regs);
  1.4489 +	if (ret) return ret;
  1.4490 +
  1.4491 +	ctx->ctx_state = PFM_CTX_UNLOADED;
  1.4492 +
  1.4493 +	/*
  1.4494 +	 * in system mode, we need to update the PMU directly
  1.4495 +	 * and the user level state of the caller, which may not
  1.4496 +	 * necessarily be the creator of the context.
  1.4497 +	 */
  1.4498 +	if (is_system) {
  1.4499 +
  1.4500 +		/*
  1.4501 +		 * Update cpuinfo
  1.4502 +		 *
  1.4503 +		 * local PMU is taken care of in pfm_stop()
  1.4504 +		 */
  1.4505 +		PFM_CPUINFO_CLEAR(PFM_CPUINFO_SYST_WIDE);
  1.4506 +		PFM_CPUINFO_CLEAR(PFM_CPUINFO_EXCL_IDLE);
  1.4507 +
  1.4508 +		/*
  1.4509 +		 * save PMDs in context
  1.4510 +		 * release ownership
  1.4511 +		 */
  1.4512 +		pfm_flush_pmds(current, ctx);
  1.4513 +
  1.4514 +		/*
  1.4515 +		 * at this point we are done with the PMU
  1.4516 +		 * so we can unreserve the resource.
  1.4517 +		 */
  1.4518 +		if (prev_state != PFM_CTX_ZOMBIE) 
  1.4519 +			pfm_unreserve_session(ctx, 1 , ctx->ctx_cpu);
  1.4520 +
  1.4521 +		/*
  1.4522 +		 * disconnect context from task
  1.4523 +		 */
  1.4524 +		task->thread.pfm_context = NULL;
  1.4525 +		/*
  1.4526 +		 * disconnect task from context
  1.4527 +		 */
  1.4528 +		ctx->ctx_task = NULL;
  1.4529 +
  1.4530 +		/*
  1.4531 +		 * There is nothing more to cleanup here.
  1.4532 +		 */
  1.4533 +		return 0;
  1.4534 +	}
  1.4535 +
  1.4536 +	/*
  1.4537 +	 * per-task mode
  1.4538 +	 */
  1.4539 +	tregs = task == current ? regs : task_pt_regs(task);
  1.4540 +
  1.4541 +	if (task == current) {
  1.4542 +		/*
  1.4543 +		 * cancel user level control
  1.4544 +		 */
  1.4545 +		ia64_psr(regs)->sp = 1;
  1.4546 +
  1.4547 +		DPRINT(("setting psr.sp for [%d]\n", task->pid));
  1.4548 +	}
  1.4549 +	/*
  1.4550 +	 * save PMDs to context
  1.4551 +	 * release ownership
  1.4552 +	 */
  1.4553 +	pfm_flush_pmds(task, ctx);
  1.4554 +
  1.4555 +	/*
  1.4556 +	 * at this point we are done with the PMU
  1.4557 +	 * so we can unreserve the resource.
  1.4558 +	 *
  1.4559 +	 * when state was ZOMBIE, we have already unreserved.
  1.4560 +	 */
  1.4561 +	if (prev_state != PFM_CTX_ZOMBIE) 
  1.4562 +		pfm_unreserve_session(ctx, 0 , ctx->ctx_cpu);
  1.4563 +
  1.4564 +	/*
  1.4565 +	 * reset activation counter and psr
  1.4566 +	 */
  1.4567 +	ctx->ctx_last_activation = PFM_INVALID_ACTIVATION;
  1.4568 +	SET_LAST_CPU(ctx, -1);
  1.4569 +
  1.4570 +	/*
  1.4571 +	 * PMU state will not be restored
  1.4572 +	 */
  1.4573 +	task->thread.flags &= ~IA64_THREAD_PM_VALID;
  1.4574 +
  1.4575 +	/*
  1.4576 +	 * break links between context and task
  1.4577 +	 */
  1.4578 +	task->thread.pfm_context  = NULL;
  1.4579 +	ctx->ctx_task             = NULL;
  1.4580 +
  1.4581 +	PFM_SET_WORK_PENDING(task, 0);
  1.4582 +
  1.4583 +	ctx->ctx_fl_trap_reason  = PFM_TRAP_REASON_NONE;
  1.4584 +	ctx->ctx_fl_can_restart  = 0;
  1.4585 +	ctx->ctx_fl_going_zombie = 0;
  1.4586 +
  1.4587 +	DPRINT(("disconnected [%d] from context\n", task->pid));
  1.4588 +
  1.4589 +	return 0;
  1.4590 +}
  1.4591 +
  1.4592 +
  1.4593 +/*
  1.4594 + * called only from exit_thread(): task == current
  1.4595 + * we come here only if current has a context attached (loaded or masked)
  1.4596 + */
  1.4597 +void
  1.4598 +pfm_exit_thread(struct task_struct *task)
  1.4599 +{
  1.4600 +	pfm_context_t *ctx;
  1.4601 +	unsigned long flags;
  1.4602 +	struct pt_regs *regs = task_pt_regs(task);
  1.4603 +	int ret, state;
  1.4604 +	int free_ok = 0;
  1.4605 +
  1.4606 +	ctx = PFM_GET_CTX(task);
  1.4607 +
  1.4608 +	PROTECT_CTX(ctx, flags);
  1.4609 +
  1.4610 +	DPRINT(("state=%d task [%d]\n", ctx->ctx_state, task->pid));
  1.4611 +
  1.4612 +	state = ctx->ctx_state;
  1.4613 +	switch(state) {
  1.4614 +		case PFM_CTX_UNLOADED:
  1.4615 +			/*
  1.4616 +	 		 * only comes to this function if pfm_context is not NULL, i.e., cannot
  1.4617 +			 * be in unloaded state
  1.4618 +	 		 */
  1.4619 +			printk(KERN_ERR "perfmon: pfm_exit_thread [%d] ctx unloaded\n", task->pid);
  1.4620 +			break;
  1.4621 +		case PFM_CTX_LOADED:
  1.4622 +		case PFM_CTX_MASKED:
  1.4623 +			ret = pfm_context_unload(ctx, NULL, 0, regs);
  1.4624 +			if (ret) {
  1.4625 +				printk(KERN_ERR "perfmon: pfm_exit_thread [%d] state=%d unload failed %d\n", task->pid, state, ret);
  1.4626 +			}
  1.4627 +			DPRINT(("ctx unloaded for current state was %d\n", state));
  1.4628 +
  1.4629 +			pfm_end_notify_user(ctx);
  1.4630 +			break;
  1.4631 +		case PFM_CTX_ZOMBIE:
  1.4632 +			ret = pfm_context_unload(ctx, NULL, 0, regs);
  1.4633 +			if (ret) {
  1.4634 +				printk(KERN_ERR "perfmon: pfm_exit_thread [%d] state=%d unload failed %d\n", task->pid, state, ret);
  1.4635 +			}
  1.4636 +			free_ok = 1;
  1.4637 +			break;
  1.4638 +		default:
  1.4639 +			printk(KERN_ERR "perfmon: pfm_exit_thread [%d] unexpected state=%d\n", task->pid, state);
  1.4640 +			break;
  1.4641 +	}
  1.4642 +	UNPROTECT_CTX(ctx, flags);
  1.4643 +
  1.4644 +	{ u64 psr = pfm_get_psr();
  1.4645 +	  BUG_ON(psr & (IA64_PSR_UP|IA64_PSR_PP));
  1.4646 +	  BUG_ON(GET_PMU_OWNER());
  1.4647 +	  BUG_ON(ia64_psr(regs)->up);
  1.4648 +	  BUG_ON(ia64_psr(regs)->pp);
  1.4649 +	}
  1.4650 +
  1.4651 +	/*
  1.4652 +	 * All memory free operations (especially for vmalloc'ed memory)
  1.4653 +	 * MUST be done with interrupts ENABLED.
  1.4654 +	 */
  1.4655 +	if (free_ok) pfm_context_free(ctx);
  1.4656 +}
  1.4657 +
  1.4658 +/*
  1.4659 + * functions MUST be listed in increasing order of their index (see perfmon.h)
  1.4660 + */
  1.4661 +#define PFM_CMD(name, flags, arg_count, arg_type, getsz) { name, #name, flags, arg_count, sizeof(arg_type), getsz }
  1.4662 +#define PFM_CMD_S(name, flags) { name, #name, flags, 0, 0, NULL }
  1.4663 +#define PFM_CMD_PCLRWS	(PFM_CMD_FD|PFM_CMD_ARG_RW|PFM_CMD_STOP)
  1.4664 +#define PFM_CMD_PCLRW	(PFM_CMD_FD|PFM_CMD_ARG_RW)
  1.4665 +#define PFM_CMD_NONE	{ NULL, "no-cmd", 0, 0, 0, NULL}
  1.4666 +
  1.4667 +static pfm_cmd_desc_t pfm_cmd_tab[]={
  1.4668 +/* 0  */PFM_CMD_NONE,
  1.4669 +/* 1  */PFM_CMD(pfm_write_pmcs, PFM_CMD_PCLRWS, PFM_CMD_ARG_MANY, pfarg_reg_t, NULL),
  1.4670 +/* 2  */PFM_CMD(pfm_write_pmds, PFM_CMD_PCLRWS, PFM_CMD_ARG_MANY, pfarg_reg_t, NULL),
  1.4671 +/* 3  */PFM_CMD(pfm_read_pmds, PFM_CMD_PCLRWS, PFM_CMD_ARG_MANY, pfarg_reg_t, NULL),
  1.4672 +/* 4  */PFM_CMD_S(pfm_stop, PFM_CMD_PCLRWS),
  1.4673 +/* 5  */PFM_CMD_S(pfm_start, PFM_CMD_PCLRWS),
  1.4674 +/* 6  */PFM_CMD_NONE,
  1.4675 +/* 7  */PFM_CMD_NONE,
  1.4676 +/* 8  */PFM_CMD(pfm_context_create, PFM_CMD_ARG_RW, 1, pfarg_context_t, pfm_ctx_getsize),
  1.4677 +/* 9  */PFM_CMD_NONE,
  1.4678 +/* 10 */PFM_CMD_S(pfm_restart, PFM_CMD_PCLRW),
  1.4679 +/* 11 */PFM_CMD_NONE,
  1.4680 +/* 12 */PFM_CMD(pfm_get_features, PFM_CMD_ARG_RW, 1, pfarg_features_t, NULL),
  1.4681 +/* 13 */PFM_CMD(pfm_debug, 0, 1, unsigned int, NULL),
  1.4682 +/* 14 */PFM_CMD_NONE,
  1.4683 +/* 15 */PFM_CMD(pfm_get_pmc_reset, PFM_CMD_ARG_RW, PFM_CMD_ARG_MANY, pfarg_reg_t, NULL),
  1.4684 +/* 16 */PFM_CMD(pfm_context_load, PFM_CMD_PCLRWS, 1, pfarg_load_t, NULL),
  1.4685 +/* 17 */PFM_CMD_S(pfm_context_unload, PFM_CMD_PCLRWS),
  1.4686 +/* 18 */PFM_CMD_NONE,
  1.4687 +/* 19 */PFM_CMD_NONE,
  1.4688 +/* 20 */PFM_CMD_NONE,
  1.4689 +/* 21 */PFM_CMD_NONE,
  1.4690 +/* 22 */PFM_CMD_NONE,
  1.4691 +/* 23 */PFM_CMD_NONE,
  1.4692 +/* 24 */PFM_CMD_NONE,
  1.4693 +/* 25 */PFM_CMD_NONE,
  1.4694 +/* 26 */PFM_CMD_NONE,
  1.4695 +/* 27 */PFM_CMD_NONE,
  1.4696 +/* 28 */PFM_CMD_NONE,
  1.4697 +/* 29 */PFM_CMD_NONE,
  1.4698 +/* 30 */PFM_CMD_NONE,
  1.4699 +/* 31 */PFM_CMD_NONE,
  1.4700 +/* 32 */PFM_CMD(pfm_write_ibrs, PFM_CMD_PCLRWS, PFM_CMD_ARG_MANY, pfarg_dbreg_t, NULL),
  1.4701 +/* 33 */PFM_CMD(pfm_write_dbrs, PFM_CMD_PCLRWS, PFM_CMD_ARG_MANY, pfarg_dbreg_t, NULL)
  1.4702 +};
  1.4703 +#define PFM_CMD_COUNT	(sizeof(pfm_cmd_tab)/sizeof(pfm_cmd_desc_t))
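/*
 * Illustrative user-level call sequence against the table above (the PFM_*
 * command names and pfarg_* layouts come from asm/perfmon.h; arguments are
 * sketched, not spelled out):
 *
 *	perfmonctl(0,  PFM_CREATE_CONTEXT, &ctx_arg, 1);   // fd returned through ctx_arg
 *	perfmonctl(fd, PFM_WRITE_PMCS,     pmcs, npmcs);
 *	perfmonctl(fd, PFM_WRITE_PMDS,     pmds, npmds);
 *	perfmonctl(fd, PFM_LOAD_CONTEXT,   &load_arg, 1);
 *	perfmonctl(fd, PFM_START,          NULL, 0);
 *	...
 *	perfmonctl(fd, PFM_READ_PMDS,      pmds, npmds);
 *	perfmonctl(fd, PFM_STOP,           NULL, 0);
 *	perfmonctl(fd, PFM_UNLOAD_CONTEXT, NULL, 0);
 */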
  1.4704 +
  1.4705 +static int
  1.4706 +pfm_check_task_state(pfm_context_t *ctx, int cmd, unsigned long flags)
  1.4707 +{
  1.4708 +	struct task_struct *task;
  1.4709 +	int state, old_state;
  1.4710 +
  1.4711 +recheck:
  1.4712 +	state = ctx->ctx_state;
  1.4713 +	task  = ctx->ctx_task;
  1.4714 +
  1.4715 +	if (task == NULL) {
  1.4716 +		DPRINT(("context %d no task, state=%d\n", ctx->ctx_fd, state));
  1.4717 +		return 0;
  1.4718 +	}
  1.4719 +
  1.4720 +	DPRINT(("context %d state=%d [%d] task_state=%ld must_stop=%d\n",
  1.4721 +		ctx->ctx_fd,
  1.4722 +		state,
  1.4723 +		task->pid,
  1.4724 +		task->state, PFM_CMD_STOPPED(cmd)));
  1.4725 +
  1.4726 +	/*
  1.4727 +	 * self-monitoring always ok.
  1.4728 +	 *
  1.4729 +	 * for system-wide the caller can either be the creator of the
  1.4730 +	 * context (the one to which the context is attached) OR
  1.4731 +	 * a task running on the same CPU as the session.
  1.4732 +	 */
  1.4733 +	if (task == current || ctx->ctx_fl_system) return 0;
  1.4734 +
  1.4735 +	/*
  1.4736 +	 * we are monitoring another thread
  1.4737 +	 */
  1.4738 +	switch(state) {
  1.4739 +		case PFM_CTX_UNLOADED:
  1.4740 +			/*
  1.4741 +			 * if context is UNLOADED we are safe to go
  1.4742 +			 */
  1.4743 +			return 0;
  1.4744 +		case PFM_CTX_ZOMBIE:
  1.4745 +			/*
  1.4746 +			 * no command can operate on a zombie context
  1.4747 +			 */
  1.4748 +			DPRINT(("cmd %d state zombie cannot operate on context\n", cmd));
  1.4749 +			return -EINVAL;
  1.4750 +		case PFM_CTX_MASKED:
  1.4751 +			/*
  1.4752 +			 * PMU state has been saved to software even though
  1.4753 +			 * the thread may still be running.
  1.4754 +			 */
  1.4755 +			if (cmd != PFM_UNLOAD_CONTEXT) return 0;
  1.4756 +	}
  1.4757 +
  1.4758 +	/*
  1.4759 +	 * context is LOADED or MASKED. Some commands may need to have 
  1.4760 +	 * the task stopped.
  1.4761 +	 *
  1.4762 +	 * We could lift this restriction for UP but it would mean that
  1.4763 +	 * the user has no guarantee the task would not run between
  1.4764 +	 * two successive calls to perfmonctl(). That's probably OK.
  1.4765 +	 * If this user wants to ensure the task does not run, then
  1.4766 +	 * the task must be stopped.
  1.4767 +	 */
  1.4768 +	if (PFM_CMD_STOPPED(cmd)) {
  1.4769 +		if ((task->state != TASK_STOPPED) && (task->state != TASK_TRACED)) {
  1.4770 +			DPRINT(("[%d] task not in stopped state\n", task->pid));
  1.4771 +			return -EBUSY;
  1.4772 +		}
  1.4773 +		/*
  1.4774 +		 * task is now stopped, wait for ctxsw out
  1.4775 +		 *
  1.4776 +		 * This is an interesting point in the code.
  1.4777 +		 * We need to unprotect the context because
  1.4778 +	 * the pfm_save_regs() routine needs to grab
  1.4779 +	 * the same lock. There is danger in doing
  1.4780 +		 * this because it leaves a window open for
  1.4781 +		 * another task to get access to the context
  1.4782 +		 * and possibly change its state. The one thing
  1.4783 +		 * that is not possible is for the context to disappear
  1.4784 +		 * because we are protected by the VFS layer, i.e.,
  1.4785 +		 * get_fd()/put_fd().
  1.4786 +		 */
  1.4787 +		old_state = state;
  1.4788 +
  1.4789 +		UNPROTECT_CTX(ctx, flags);
  1.4790 +
  1.4791 +		wait_task_inactive(task);
  1.4792 +
  1.4793 +		PROTECT_CTX(ctx, flags);
  1.4794 +
  1.4795 +		/*
  1.4796 +		 * we must recheck to verify if state has changed
  1.4797 +		 */
  1.4798 +		if (ctx->ctx_state != old_state) {
  1.4799 +			DPRINT(("old_state=%d new_state=%d\n", old_state, ctx->ctx_state));
  1.4800 +			goto recheck;
  1.4801 +		}
  1.4802 +	}
  1.4803 +	return 0;
  1.4804 +}
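
For commands flagged PFM_CMD_STOPPED, the check above insists that the monitored thread already be in TASK_STOPPED or TASK_TRACED; ptrace(PTRACE_ATTACH) or SIGSTOP both satisfy it. A minimal user-level sketch of how a monitoring tool typically arranges this before issuing such a command (illustrative only; perfmonctl() and the PFM_* / pfarg_* definitions come from the ia64 perfmon user headers, whose path varies between installations):

	#include <stdio.h>
	#include <signal.h>
	#include <sys/types.h>
	#include <sys/wait.h>
	#include <perfmon/perfmon.h>	/* pfarg_reg_t, PFM_WRITE_PMCS; header path is an assumption */

	/* stop the target, program its PMCs through the context fd, resume it.
	 * Assumes pid is a child of the caller (e.g. fork()/exec()'d by the tool),
	 * otherwise waitpid() cannot be used to observe the stop. */
	static int program_stopped_task(int ctx_fd, pid_t pid, pfarg_reg_t *pmcs, int npmcs)
	{
		int status, ret;

		kill(pid, SIGSTOP);
		waitpid(pid, &status, WUNTRACED);	/* target is now TASK_STOPPED */

		ret = perfmonctl(ctx_fd, PFM_WRITE_PMCS, pmcs, npmcs);
		if (ret == -1) perror("PFM_WRITE_PMCS");

		kill(pid, SIGCONT);			/* let the target run again */
		return ret;
	}
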
  1.4805 +
  1.4806 +/*
  1.4807 + * system-call entry point (must return long)
  1.4808 + */
  1.4809 +asmlinkage long
  1.4810 +sys_perfmonctl (int fd, int cmd, void __user *arg, int count)
  1.4811 +{
  1.4812 +	struct file *file = NULL;
  1.4813 +	pfm_context_t *ctx = NULL;
  1.4814 +	unsigned long flags = 0UL;
  1.4815 +	void *args_k = NULL;
  1.4816 +	long ret; /* will expand int return types */
  1.4817 +	size_t base_sz, sz, xtra_sz = 0;
  1.4818 +	int narg, completed_args = 0, call_made = 0, cmd_flags;
  1.4819 +	int (*func)(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs);
  1.4820 +	int (*getsize)(void *arg, size_t *sz);
  1.4821 +#define PFM_MAX_ARGSIZE	4096
  1.4822 +
  1.4823 +	/*
  1.4824 +	 * reject any call if perfmon was disabled at initialization
  1.4825 +	 */
  1.4826 +	if (unlikely(pmu_conf == NULL)) return -ENOSYS;
  1.4827 +
  1.4828 +	if (unlikely(cmd < 0 || cmd >= PFM_CMD_COUNT)) {
  1.4829 +		DPRINT(("invalid cmd=%d\n", cmd));
  1.4830 +		return -EINVAL;
  1.4831 +	}
  1.4832 +
  1.4833 +	func      = pfm_cmd_tab[cmd].cmd_func;
  1.4834 +	narg      = pfm_cmd_tab[cmd].cmd_narg;
  1.4835 +	base_sz   = pfm_cmd_tab[cmd].cmd_argsize;
  1.4836 +	getsize   = pfm_cmd_tab[cmd].cmd_getsize;
  1.4837 +	cmd_flags = pfm_cmd_tab[cmd].cmd_flags;
  1.4838 +
  1.4839 +	if (unlikely(func == NULL)) {
  1.4840 +		DPRINT(("invalid cmd=%d\n", cmd));
  1.4841 +		return -EINVAL;
  1.4842 +	}
  1.4843 +
  1.4844 +	DPRINT(("cmd=%s idx=%d narg=0x%x argsz=%lu count=%d\n",
  1.4845 +		PFM_CMD_NAME(cmd),
  1.4846 +		cmd,
  1.4847 +		narg,
  1.4848 +		base_sz,
  1.4849 +		count));
  1.4850 +
  1.4851 +	/*
  1.4852 +	 * check if number of arguments matches what the command expects
  1.4853 +	 */
  1.4854 +	if (unlikely((narg == PFM_CMD_ARG_MANY && count <= 0) || (narg > 0 && narg != count)))
  1.4855 +		return -EINVAL;
  1.4856 +
  1.4857 +restart_args:
  1.4858 +	sz = xtra_sz + base_sz*count;
  1.4859 +	/*
  1.4860 +	 * limit abuse to min page size
  1.4861 +	 */
  1.4862 +	if (unlikely(sz > PFM_MAX_ARGSIZE)) {
  1.4863 +		printk(KERN_ERR "perfmon: [%d] argument too big %lu\n", current->pid, sz);
  1.4864 +		return -E2BIG;
  1.4865 +	}
  1.4866 +
  1.4867 +	/*
  1.4868 +	 * allocate default-sized argument buffer
  1.4869 +	 */
  1.4870 +	if (likely(count && args_k == NULL)) {
  1.4871 +		args_k = kmalloc(PFM_MAX_ARGSIZE, GFP_KERNEL);
  1.4872 +		if (args_k == NULL) return -ENOMEM;
  1.4873 +	}
  1.4874 +
  1.4875 +	ret = -EFAULT;
  1.4876 +
  1.4877 +	/*
  1.4878 +	 * copy arguments
  1.4879 +	 *
  1.4880 +	 * assume sz = 0 for commands without parameters
  1.4881 +	 */
  1.4882 +	if (sz && copy_from_user(args_k, arg, sz)) {
  1.4883 +		DPRINT(("cannot copy_from_user %lu bytes @%p\n", sz, arg));
  1.4884 +		goto error_args;
  1.4885 +	}
  1.4886 +
  1.4887 +	/*
  1.4888 +	 * check if command supports extra parameters
  1.4889 +	 */
  1.4890 +	if (completed_args == 0 && getsize) {
  1.4891 +		/*
  1.4892 +		 * get extra parameters size (based on main argument)
  1.4893 +		 */
  1.4894 +		ret = (*getsize)(args_k, &xtra_sz);
  1.4895 +		if (ret) goto error_args;
  1.4896 +
  1.4897 +		completed_args = 1;
  1.4898 +
  1.4899 +		DPRINT(("restart_args sz=%lu xtra_sz=%lu\n", sz, xtra_sz));
  1.4900 +
  1.4901 +		/* retry if necessary */
  1.4902 +		if (likely(xtra_sz)) goto restart_args;
  1.4903 +	}
  1.4904 +
  1.4905 +	if (unlikely((cmd_flags & PFM_CMD_FD) == 0)) goto skip_fd;
  1.4906 +
  1.4907 +	ret = -EBADF;
  1.4908 +
  1.4909 +	file = fget(fd);
  1.4910 +	if (unlikely(file == NULL)) {
  1.4911 +		DPRINT(("invalid fd %d\n", fd));
  1.4912 +		goto error_args;
  1.4913 +	}
  1.4914 +	if (unlikely(PFM_IS_FILE(file) == 0)) {
  1.4915 +		DPRINT(("fd %d not related to perfmon\n", fd));
  1.4916 +		goto error_args;
  1.4917 +	}
  1.4918 +
  1.4919 +	ctx = (pfm_context_t *)file->private_data;
  1.4920 +	if (unlikely(ctx == NULL)) {
  1.4921 +		DPRINT(("no context for fd %d\n", fd));
  1.4922 +		goto error_args;
  1.4923 +	}
  1.4924 +	prefetch(&ctx->ctx_state);
  1.4925 +
  1.4926 +	PROTECT_CTX(ctx, flags);
  1.4927 +
  1.4928 +	/*
  1.4929 +	 * check task is stopped
  1.4930 +	 */
  1.4931 +	ret = pfm_check_task_state(ctx, cmd, flags);
  1.4932 +	if (unlikely(ret)) goto abort_locked;
  1.4933 +
  1.4934 +skip_fd:
  1.4935 +	ret = (*func)(ctx, args_k, count, task_pt_regs(current));
  1.4936 +
  1.4937 +	call_made = 1;
  1.4938 +
  1.4939 +abort_locked:
  1.4940 +	if (likely(ctx)) {
  1.4941 +		DPRINT(("context unlocked\n"));
  1.4942 +		UNPROTECT_CTX(ctx, flags);
  1.4943 +	}
  1.4944 +
  1.4945 +	/* copy argument back to user, if needed */
  1.4946 +	if (call_made && PFM_CMD_RW_ARG(cmd) && copy_to_user(arg, args_k, base_sz*count)) ret = -EFAULT;
  1.4947 +
  1.4948 +error_args:
  1.4949 +	if (file)
  1.4950 +		fput(file);
  1.4951 +
  1.4952 +	kfree(args_k);
  1.4953 +
  1.4954 +	DPRINT(("cmd=%s ret=%ld\n", PFM_CMD_NAME(cmd), ret));
  1.4955 +
  1.4956 +	return ret;
  1.4957 +}
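
Everything user level sees goes through this single entry point: perfmonctl(fd, cmd, arg, count). A hedged self-monitoring sketch of the usual command sequence (structure and field names as in the pfarg_* types dispatched above; error checking omitted, and the user-space header path is an assumption):

	#include <string.h>
	#include <unistd.h>
	#include <perfmon/perfmon.h>	/* pfarg_context_t, pfarg_reg_t, pfarg_load_t */

	int count_one_event(void)
	{
		pfarg_context_t ctx;
		pfarg_reg_t pc, pd;
		pfarg_load_t load;
		int fd;

		memset(&ctx, 0, sizeof(ctx));
		perfmonctl(-1, PFM_CREATE_CONTEXT, &ctx, 1);	/* fd argument ignored for this cmd */
		fd = ctx.ctx_fd;				/* kernel returns the context fd here */

		memset(&pc, 0, sizeof(pc));
		memset(&pd, 0, sizeof(pd));
		pc.reg_num   = 4;				/* PMC/PMD pair, PMU specific */
		pc.reg_value = 0;				/* event encoding for this PMU */
		pd.reg_num   = 4;
		perfmonctl(fd, PFM_WRITE_PMCS, &pc, 1);
		perfmonctl(fd, PFM_WRITE_PMDS, &pd, 1);

		memset(&load, 0, sizeof(load));
		load.load_pid = getpid();			/* attach the context to ourself */
		perfmonctl(fd, PFM_LOAD_CONTEXT, &load, 1);

		perfmonctl(fd, PFM_START, NULL, 0);
		/* ... code being measured ... */
		perfmonctl(fd, PFM_STOP, NULL, 0);

		perfmonctl(fd, PFM_READ_PMDS, &pd, 1);		/* pd.reg_value holds the 64-bit count */
		close(fd);					/* last close destroys the context */
		return (int)pd.reg_value;
	}

This maps directly onto the command table above: the register commands take PFM_CMD_ARG_MANY pfarg_reg_t entries, PFM_LOAD_CONTEXT takes exactly one pfarg_load_t, and PFM_START/PFM_STOP take no argument at all.
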
  1.4958 +
  1.4959 +static void
  1.4960 +pfm_resume_after_ovfl(pfm_context_t *ctx, unsigned long ovfl_regs, struct pt_regs *regs)
  1.4961 +{
  1.4962 +	pfm_buffer_fmt_t *fmt = ctx->ctx_buf_fmt;
  1.4963 +	pfm_ovfl_ctrl_t rst_ctrl;
  1.4964 +	int state;
  1.4965 +	int ret = 0;
  1.4966 +
  1.4967 +	state = ctx->ctx_state;
  1.4968 +	/*
  1.4969 +	 * Unlock sampling buffer and reset index atomically
  1.4970 +	 * XXX: not really needed when blocking
  1.4971 +	 */
  1.4972 +	if (CTX_HAS_SMPL(ctx)) {
  1.4973 +
  1.4974 +		rst_ctrl.bits.mask_monitoring = 0;
  1.4975 +		rst_ctrl.bits.reset_ovfl_pmds = 0;
  1.4976 +
  1.4977 +		if (state == PFM_CTX_LOADED)
  1.4978 +			ret = pfm_buf_fmt_restart_active(fmt, current, &rst_ctrl, ctx->ctx_smpl_hdr, regs);
  1.4979 +		else
  1.4980 +			ret = pfm_buf_fmt_restart(fmt, current, &rst_ctrl, ctx->ctx_smpl_hdr, regs);
  1.4981 +	} else {
  1.4982 +		rst_ctrl.bits.mask_monitoring = 0;
  1.4983 +		rst_ctrl.bits.reset_ovfl_pmds = 1;
  1.4984 +	}
  1.4985 +
  1.4986 +	if (ret == 0) {
  1.4987 +		if (rst_ctrl.bits.reset_ovfl_pmds) {
  1.4988 +			pfm_reset_regs(ctx, &ovfl_regs, PFM_PMD_LONG_RESET);
  1.4989 +		}
  1.4990 +		if (rst_ctrl.bits.mask_monitoring == 0) {
  1.4991 +			DPRINT(("resuming monitoring\n"));
  1.4992 +			if (ctx->ctx_state == PFM_CTX_MASKED) pfm_restore_monitoring(current);
  1.4993 +		} else {
  1.4994 +			DPRINT(("stopping monitoring\n"));
  1.4995 +			//pfm_stop_monitoring(current, regs);
  1.4996 +		}
  1.4997 +		ctx->ctx_state = PFM_CTX_LOADED;
  1.4998 +	}
  1.4999 +}
  1.5000 +
  1.5001 +/*
  1.5002 + * context MUST BE LOCKED when calling
  1.5003 + * can only be called for current
  1.5004 + */
  1.5005 +static void
  1.5006 +pfm_context_force_terminate(pfm_context_t *ctx, struct pt_regs *regs)
  1.5007 +{
  1.5008 +	int ret;
  1.5009 +
  1.5010 +	DPRINT(("entering for [%d]\n", current->pid));
  1.5011 +
  1.5012 +	ret = pfm_context_unload(ctx, NULL, 0, regs);
  1.5013 +	if (ret) {
  1.5014 +		printk(KERN_ERR "pfm_context_force_terminate: [%d] unload failed with %d\n", current->pid, ret);
  1.5015 +	}
  1.5016 +
  1.5017 +	/*
  1.5018 +	 * and wakeup controlling task, indicating we are now disconnected
  1.5019 +	 */
  1.5020 +	wake_up_interruptible(&ctx->ctx_zombieq);
  1.5021 +
  1.5022 +	/*
  1.5023 +	 * given that context is still locked, the controlling
  1.5024 +	 * task will only get access when we return from
  1.5025 +	 * pfm_handle_work().
  1.5026 +	 */
  1.5027 +}
  1.5028 +
  1.5029 +static int pfm_ovfl_notify_user(pfm_context_t *ctx, unsigned long ovfl_pmds);
  1.5030 + /*
  1.5031 +  * pfm_handle_work() can be called with interrupts enabled
  1.5032 +  * (TIF_NEED_RESCHED) or disabled. The wait_for_completion_interruptible()
  1.5033 +  * call may sleep, therefore we must re-enable interrupts
  1.5034 +  * to avoid deadlocks. It is safe to do so because this function
  1.5035 +  * is called ONLY when returning to user level (PUStk=1), in which case
  1.5036 +  * there is no risk of kernel stack overflow due to deep
  1.5037 +  * interrupt nesting.
  1.5038 +  */
  1.5039 +void
  1.5040 +pfm_handle_work(void)
  1.5041 +{
  1.5042 +	pfm_context_t *ctx;
  1.5043 +	struct pt_regs *regs;
  1.5044 +	unsigned long flags, dummy_flags;
  1.5045 +	unsigned long ovfl_regs;
  1.5046 +	unsigned int reason;
  1.5047 +	int ret;
  1.5048 +
  1.5049 +	ctx = PFM_GET_CTX(current);
  1.5050 +	if (ctx == NULL) {
  1.5051 +		printk(KERN_ERR "perfmon: [%d] has no PFM context\n", current->pid);
  1.5052 +		return;
  1.5053 +	}
  1.5054 +
  1.5055 +	PROTECT_CTX(ctx, flags);
  1.5056 +
  1.5057 +	PFM_SET_WORK_PENDING(current, 0);
  1.5058 +
  1.5059 +	pfm_clear_task_notify();
  1.5060 +
  1.5061 +	regs = task_pt_regs(current);
  1.5062 +
  1.5063 +	/*
  1.5064 +	 * extract reason for being here and clear
  1.5065 +	 */
  1.5066 +	reason = ctx->ctx_fl_trap_reason;
  1.5067 +	ctx->ctx_fl_trap_reason = PFM_TRAP_REASON_NONE;
  1.5068 +	ovfl_regs = ctx->ctx_ovfl_regs[0];
  1.5069 +
  1.5070 +	DPRINT(("reason=%d state=%d\n", reason, ctx->ctx_state));
  1.5071 +
  1.5072 +	/*
  1.5073 +	 * must be done before we check for simple-reset mode
  1.5074 +	 */
  1.5075 +	if (ctx->ctx_fl_going_zombie || ctx->ctx_state == PFM_CTX_ZOMBIE) goto do_zombie;
  1.5076 +
  1.5077 +
  1.5078 +	//if (CTX_OVFL_NOBLOCK(ctx)) goto skip_blocking;
  1.5079 +	if (reason == PFM_TRAP_REASON_RESET) goto skip_blocking;
  1.5080 +
  1.5081 +	/*
  1.5082 +	 * restore interrupt mask to what it was on entry.
  1.5083 +	 * Could be enabled/disabled.
  1.5084 +	 */
  1.5085 +	UNPROTECT_CTX(ctx, flags);
  1.5086 +
  1.5087 +	/*
  1.5088 +	 * force interrupt enable because of wait_for_completion_interruptible()
  1.5089 +	 */
  1.5090 +	local_irq_enable();
  1.5091 +
  1.5092 +	DPRINT(("before block sleeping\n"));
  1.5093 +
  1.5094 +	/*
  1.5095 +	 * may go through without blocking on SMP systems
  1.5096 +	 * if restart has already been received by the time we block on the completion
  1.5097 +	 */
  1.5098 +	ret = wait_for_completion_interruptible(&ctx->ctx_restart_done);
  1.5099 +
  1.5100 +	DPRINT(("after block sleeping ret=%d\n", ret));
  1.5101 +
  1.5102 +	/*
  1.5103 +	 * lock context and mask interrupts again
  1.5104 +	 * We save flags into a dummy because we may have
  1.5105 +	 * altered the interrupt mask compared to entry in this
  1.5106 +	 * function.
  1.5107 +	 */
  1.5108 +	PROTECT_CTX(ctx, dummy_flags);
  1.5109 +
  1.5110 +	/*
  1.5111 +	 * we need to read the ovfl_regs only after wake-up
  1.5112 +	 * because we may have had pfm_write_pmds() in between
  1.5113 +	 * and that may have changed PMD values, in which case
  1.5114 +	 * ovfl_regs is reset for these new PMD values.
  1.5115 +	 */
  1.5116 +	ovfl_regs = ctx->ctx_ovfl_regs[0];
  1.5117 +
  1.5118 +	if (ctx->ctx_fl_going_zombie) {
  1.5119 +do_zombie:
  1.5120 +		DPRINT(("context is zombie, bailing out\n"));
  1.5121 +		pfm_context_force_terminate(ctx, regs);
  1.5122 +		goto nothing_to_do;
  1.5123 +	}
  1.5124 +	/*
  1.5125 +	 * in case the wait was interrupted we don't restart anything
  1.5126 +	 */
  1.5127 +	if (ret < 0) goto nothing_to_do;
  1.5128 +
  1.5129 +skip_blocking:
  1.5130 +	pfm_resume_after_ovfl(ctx, ovfl_regs, regs);
  1.5131 +	ctx->ctx_ovfl_regs[0] = 0UL;
  1.5132 +
  1.5133 +nothing_to_do:
  1.5134 +	/*
  1.5135 +	 * restore flags as they were upon entry
  1.5136 +	 */
  1.5137 +	UNPROTECT_CTX(ctx, flags);
  1.5138 +}
  1.5139 +
  1.5140 +static int
  1.5141 +pfm_notify_user(pfm_context_t *ctx, pfm_msg_t *msg)
  1.5142 +{
  1.5143 +	if (ctx->ctx_state == PFM_CTX_ZOMBIE) {
  1.5144 +		DPRINT(("ignoring overflow notification, owner is zombie\n"));
  1.5145 +		return 0;
  1.5146 +	}
  1.5147 +
  1.5148 +	DPRINT(("waking up somebody\n"));
  1.5149 +
  1.5150 +	if (msg) wake_up_interruptible(&ctx->ctx_msgq_wait);
  1.5151 +
  1.5152 +	/*
  1.5153 +	 * safe, we are not in intr handler, nor in ctxsw when
  1.5154 +	 * we come here
  1.5155 +	 */
  1.5156 +	kill_fasync (&ctx->ctx_async_queue, SIGIO, POLL_IN);
  1.5157 +
  1.5158 +	return 0;
  1.5159 +}
  1.5160 +
  1.5161 +static int
  1.5162 +pfm_ovfl_notify_user(pfm_context_t *ctx, unsigned long ovfl_pmds)
  1.5163 +{
  1.5164 +	pfm_msg_t *msg = NULL;
  1.5165 +
  1.5166 +	if (ctx->ctx_fl_no_msg == 0) {
  1.5167 +		msg = pfm_get_new_msg(ctx);
  1.5168 +		if (msg == NULL) {
  1.5169 +			printk(KERN_ERR "perfmon: pfm_ovfl_notify_user no more notification msgs\n");
  1.5170 +			return -1;
  1.5171 +		}
  1.5172 +
  1.5173 +		msg->pfm_ovfl_msg.msg_type         = PFM_MSG_OVFL;
  1.5174 +		msg->pfm_ovfl_msg.msg_ctx_fd       = ctx->ctx_fd;
  1.5175 +		msg->pfm_ovfl_msg.msg_active_set   = 0;
  1.5176 +		msg->pfm_ovfl_msg.msg_ovfl_pmds[0] = ovfl_pmds;
  1.5177 +		msg->pfm_ovfl_msg.msg_ovfl_pmds[1] = 0UL;
  1.5178 +		msg->pfm_ovfl_msg.msg_ovfl_pmds[2] = 0UL;
  1.5179 +		msg->pfm_ovfl_msg.msg_ovfl_pmds[3] = 0UL;
  1.5180 +		msg->pfm_ovfl_msg.msg_tstamp       = 0UL;
  1.5181 +	}
  1.5182 +
  1.5183 +	DPRINT(("ovfl msg: msg=%p no_msg=%d fd=%d ovfl_pmds=0x%lx\n",
  1.5184 +		msg,
  1.5185 +		ctx->ctx_fl_no_msg,
  1.5186 +		ctx->ctx_fd,
  1.5187 +		ovfl_pmds));
  1.5188 +
  1.5189 +	return pfm_notify_user(ctx, msg);
  1.5190 +}
  1.5191 +
  1.5192 +static int
  1.5193 +pfm_end_notify_user(pfm_context_t *ctx)
  1.5194 +{
  1.5195 +	pfm_msg_t *msg;
  1.5196 +
  1.5197 +	msg = pfm_get_new_msg(ctx);
  1.5198 +	if (msg == NULL) {
  1.5199 +		printk(KERN_ERR "perfmon: pfm_end_notify_user no more notification msgs\n");
  1.5200 +		return -1;
  1.5201 +	}
  1.5202 +	/* no leak */
  1.5203 +	memset(msg, 0, sizeof(*msg));
  1.5204 +
  1.5205 +	msg->pfm_end_msg.msg_type    = PFM_MSG_END;
  1.5206 +	msg->pfm_end_msg.msg_ctx_fd  = ctx->ctx_fd;
  1.5207 +	msg->pfm_ovfl_msg.msg_tstamp = 0UL;
  1.5208 +
  1.5209 +	DPRINT(("end msg: msg=%p no_msg=%d ctx_fd=%d\n",
  1.5210 +		msg,
  1.5211 +		ctx->ctx_fl_no_msg,
  1.5212 +		ctx->ctx_fd));
  1.5213 +
  1.5214 +	return pfm_notify_user(ctx, msg);
  1.5215 +}
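
On the user side these messages come back through a plain read() on the context file descriptor (which also supports select/poll and SIGIO, as set up by the kill_fasync() call above). A sketch of the consuming loop that pairs with pfm_ovfl_notify_user()/pfm_end_notify_user(); msg_type sits at the same offset in every variant of the pfm_msg_t union, and PFM_RESTART is the command that lets the blocked/masked task resume (it is what pfm_handle_work() above waits for):

	#include <unistd.h>
	#include <perfmon/perfmon.h>	/* pfm_msg_t, PFM_MSG_OVFL, PFM_MSG_END, PFM_RESTART */

	static void notification_loop(int ctx_fd)
	{
		pfm_msg_t msg;

		for (;;) {
			if (read(ctx_fd, &msg, sizeof(msg)) != sizeof(msg))
				break;			/* error or context closed */

			switch (msg.pfm_ovfl_msg.msg_type) {
			case PFM_MSG_OVFL:
				/* overflowed counters are in msg.pfm_ovfl_msg.msg_ovfl_pmds[0];
				 * process the sample, then unblock/unmask the monitored task */
				perfmonctl(ctx_fd, PFM_RESTART, NULL, 0);
				break;
			case PFM_MSG_END:
				/* monitored task has exited */
				return;
			}
		}
	}
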
  1.5216 +
  1.5217 +/*
  1.5218 + * main overflow processing routine.
  1.5219 + * it can be called from the interrupt path or explicitly during the context switch code
  1.5220 + */
  1.5221 +static void
  1.5222 +pfm_overflow_handler(struct task_struct *task, pfm_context_t *ctx, u64 pmc0, struct pt_regs *regs)
  1.5223 +{
  1.5224 +	pfm_ovfl_arg_t *ovfl_arg;
  1.5225 +	unsigned long mask;
  1.5226 +	unsigned long old_val, ovfl_val, new_val;
  1.5227 +	unsigned long ovfl_notify = 0UL, ovfl_pmds = 0UL, smpl_pmds = 0UL, reset_pmds;
  1.5228 +	unsigned long tstamp;
  1.5229 +	pfm_ovfl_ctrl_t	ovfl_ctrl;
  1.5230 +	unsigned int i, has_smpl;
  1.5231 +	int must_notify = 0;
  1.5232 +
  1.5233 +	if (unlikely(ctx->ctx_state == PFM_CTX_ZOMBIE)) goto stop_monitoring;
  1.5234 +
  1.5235 +	/*
  1.5236 +	 * sanity test. Should never happen
  1.5237 +	 */
  1.5238 +	if (unlikely((pmc0 & 0x1) == 0)) goto sanity_check;
  1.5239 +
  1.5240 +	tstamp   = ia64_get_itc();
  1.5241 +	mask     = pmc0 >> PMU_FIRST_COUNTER;
  1.5242 +	ovfl_val = pmu_conf->ovfl_val;
  1.5243 +	has_smpl = CTX_HAS_SMPL(ctx);
  1.5244 +
  1.5245 +	DPRINT_ovfl(("pmc0=0x%lx pid=%d iip=0x%lx, %s "
  1.5246 +		     "used_pmds=0x%lx\n",
  1.5247 +			pmc0,
  1.5248 +			task ? task->pid: -1,
  1.5249 +			(regs ? regs->cr_iip : 0),
  1.5250 +			CTX_OVFL_NOBLOCK(ctx) ? "nonblocking" : "blocking",
  1.5251 +			ctx->ctx_used_pmds[0]));
  1.5252 +
  1.5253 +
  1.5254 +	/*
  1.5255 +	 * first we update the virtual counters
  1.5256 +	 * assume there was a prior ia64_srlz_d() issued
  1.5257 +	 */
  1.5258 +	for (i = PMU_FIRST_COUNTER; mask ; i++, mask >>= 1) {
  1.5259 +
  1.5260 +		/* skip pmd which did not overflow */
  1.5261 +		if ((mask & 0x1) == 0) continue;
  1.5262 +
  1.5263 +		/*
  1.5264 +		 * Note that the pmd is not necessarily 0 at this point as qualified events
  1.5265 +		 * may have happened before the PMU was frozen. The residual count is not
  1.5266 +		 * taken into consideration here but will be with any read of the pmd via
  1.5267 +		 * pfm_read_pmds().
  1.5268 +		 */
  1.5269 +		old_val              = new_val = ctx->ctx_pmds[i].val;
  1.5270 +		new_val             += 1 + ovfl_val;
  1.5271 +		ctx->ctx_pmds[i].val = new_val;
  1.5272 +
  1.5273 +		/*
  1.5274 +		 * check for overflow condition
  1.5275 +		 */
  1.5276 +		if (likely(old_val > new_val)) {
  1.5277 +			ovfl_pmds |= 1UL << i;
  1.5278 +			if (PMC_OVFL_NOTIFY(ctx, i)) ovfl_notify |= 1UL << i;
  1.5279 +		}
  1.5280 +
  1.5281 +		DPRINT_ovfl(("ctx_pmd[%d].val=0x%lx old_val=0x%lx pmd=0x%lx ovfl_pmds=0x%lx ovfl_notify=0x%lx\n",
  1.5282 +			i,
  1.5283 +			new_val,
  1.5284 +			old_val,
  1.5285 +			ia64_get_pmd(i) & ovfl_val,
  1.5286 +			ovfl_pmds,
  1.5287 +			ovfl_notify));
  1.5288 +	}
  1.5289 +
  1.5290 +	/*
  1.5291 +	 * there was no 64-bit overflow, nothing else to do
  1.5292 +	 */
  1.5293 +	if (ovfl_pmds == 0UL) return;
  1.5294 +
  1.5295 +	/* 
  1.5296 +	 * reset all control bits
  1.5297 +	 */
  1.5298 +	ovfl_ctrl.val = 0;
  1.5299 +	reset_pmds    = 0UL;
  1.5300 +
  1.5301 +	/*
  1.5302 +	 * if a sampling format module exists, then we "cache" the overflow by 
  1.5303 +	 * calling the module's handler() routine.
  1.5304 +	 */
  1.5305 +	if (has_smpl) {
  1.5306 +		unsigned long start_cycles, end_cycles;
  1.5307 +		unsigned long pmd_mask;
  1.5308 +		int j, k, ret = 0;
  1.5309 +		int this_cpu = smp_processor_id();
  1.5310 +
  1.5311 +		pmd_mask = ovfl_pmds >> PMU_FIRST_COUNTER;
  1.5312 +		ovfl_arg = &ctx->ctx_ovfl_arg;
  1.5313 +
  1.5314 +		prefetch(ctx->ctx_smpl_hdr);
  1.5315 +
  1.5316 +		for(i=PMU_FIRST_COUNTER; pmd_mask && ret == 0; i++, pmd_mask >>=1) {
  1.5317 +
  1.5318 +			mask = 1UL << i;
  1.5319 +
  1.5320 +			if ((pmd_mask & 0x1) == 0) continue;
  1.5321 +
  1.5322 +			ovfl_arg->ovfl_pmd      = (unsigned char )i;
  1.5323 +			ovfl_arg->ovfl_notify   = ovfl_notify & mask ? 1 : 0;
  1.5324 +			ovfl_arg->active_set    = 0;
  1.5325 +			ovfl_arg->ovfl_ctrl.val = 0; /* module must fill in all fields */
  1.5326 +			ovfl_arg->smpl_pmds[0]  = smpl_pmds = ctx->ctx_pmds[i].smpl_pmds[0];
  1.5327 +
  1.5328 +			ovfl_arg->pmd_value      = ctx->ctx_pmds[i].val;
  1.5329 +			ovfl_arg->pmd_last_reset = ctx->ctx_pmds[i].lval;
  1.5330 +			ovfl_arg->pmd_eventid    = ctx->ctx_pmds[i].eventid;
  1.5331 +
  1.5332 +			/*
  1.5333 +		 	 * copy values of pmds of interest. Sampling format may copy them
  1.5334 +		 	 * into sampling buffer.
  1.5335 +		 	 */
  1.5336 +			if (smpl_pmds) {
  1.5337 +				for(j=0, k=0; smpl_pmds; j++, smpl_pmds >>=1) {
  1.5338 +					if ((smpl_pmds & 0x1) == 0) continue;
  1.5339 +					ovfl_arg->smpl_pmds_values[k++] = PMD_IS_COUNTING(j) ?  pfm_read_soft_counter(ctx, j) : ia64_get_pmd(j);
  1.5340 +					DPRINT_ovfl(("smpl_pmd[%d]=pmd%u=0x%lx\n", k-1, j, ovfl_arg->smpl_pmds_values[k-1]));
  1.5341 +				}
  1.5342 +			}
  1.5343 +
  1.5344 +			pfm_stats[this_cpu].pfm_smpl_handler_calls++;
  1.5345 +
  1.5346 +			start_cycles = ia64_get_itc();
  1.5347 +
  1.5348 +			/*
  1.5349 +		 	 * call custom buffer format record (handler) routine
  1.5350 +		 	 */
  1.5351 +			ret = (*ctx->ctx_buf_fmt->fmt_handler)(task, ctx->ctx_smpl_hdr, ovfl_arg, regs, tstamp);
  1.5352 +
  1.5353 +			end_cycles = ia64_get_itc();
  1.5354 +
  1.5355 +			/*
  1.5356 +			 * For those controls, we take the union because they have
  1.5357 +			 * an all or nothing behavior.
  1.5358 +			 */
  1.5359 +			ovfl_ctrl.bits.notify_user     |= ovfl_arg->ovfl_ctrl.bits.notify_user;
  1.5360 +			ovfl_ctrl.bits.block_task      |= ovfl_arg->ovfl_ctrl.bits.block_task;
  1.5361 +			ovfl_ctrl.bits.mask_monitoring |= ovfl_arg->ovfl_ctrl.bits.mask_monitoring;
  1.5362 +			/*
  1.5363 +			 * build the bitmask of pmds to reset now
  1.5364 +			 */
  1.5365 +			if (ovfl_arg->ovfl_ctrl.bits.reset_ovfl_pmds) reset_pmds |= mask;
  1.5366 +
  1.5367 +			pfm_stats[this_cpu].pfm_smpl_handler_cycles += end_cycles - start_cycles;
  1.5368 +		}
  1.5369 +		/*
  1.5370 +		 * when the module cannot handle the rest of the overflows, we abort right here
  1.5371 +		 */
  1.5372 +		if (ret && pmd_mask) {
  1.5373 +			DPRINT(("handler aborts leftover ovfl_pmds=0x%lx\n",
  1.5374 +				pmd_mask<<PMU_FIRST_COUNTER));
  1.5375 +		}
  1.5376 +		/*
  1.5377 +		 * remove the pmds we reset now from the set of pmds to reset in pfm_restart()
  1.5378 +		 */
  1.5379 +		ovfl_pmds &= ~reset_pmds;
  1.5380 +	} else {
  1.5381 +		/*
  1.5382 +		 * when no sampling module is used, the default
  1.5383 +		 * is to notify on overflow if requested by the user
  1.5384 +		 */
  1.5385 +		ovfl_ctrl.bits.notify_user     = ovfl_notify ? 1 : 0;
  1.5386 +		ovfl_ctrl.bits.block_task      = ovfl_notify ? 1 : 0;
  1.5387 +		ovfl_ctrl.bits.mask_monitoring = ovfl_notify ? 1 : 0; /* XXX: change for saturation */
  1.5388 +		ovfl_ctrl.bits.reset_ovfl_pmds = ovfl_notify ? 0 : 1;
  1.5389 +		/*
  1.5390 +		 * if needed, we reset all overflowed pmds
  1.5391 +		 */
  1.5392 +		if (ovfl_notify == 0) reset_pmds = ovfl_pmds;
  1.5393 +	}
  1.5394 +
  1.5395 +	DPRINT_ovfl(("ovfl_pmds=0x%lx reset_pmds=0x%lx\n", ovfl_pmds, reset_pmds));
  1.5396 +
  1.5397 +	/*
  1.5398 +	 * reset the requested PMD registers using the short reset values
  1.5399 +	 */
  1.5400 +	if (reset_pmds) {
  1.5401 +		unsigned long bm = reset_pmds;
  1.5402 +		pfm_reset_regs(ctx, &bm, PFM_PMD_SHORT_RESET);
  1.5403 +	}
  1.5404 +
  1.5405 +	if (ovfl_notify && ovfl_ctrl.bits.notify_user) {
  1.5406 +		/*
  1.5407 +		 * keep track of what to reset when unblocking
  1.5408 +		 */
  1.5409 +		ctx->ctx_ovfl_regs[0] = ovfl_pmds;
  1.5410 +
  1.5411 +		/*
  1.5412 +		 * check for blocking context 
  1.5413 +		 */
  1.5414 +		if (CTX_OVFL_NOBLOCK(ctx) == 0 && ovfl_ctrl.bits.block_task) {
  1.5415 +
  1.5416 +			ctx->ctx_fl_trap_reason = PFM_TRAP_REASON_BLOCK;
  1.5417 +
  1.5418 +			/*
  1.5419 +			 * set the perfmon-specific pending work flag for the task
  1.5420 +			 */
  1.5421 +			PFM_SET_WORK_PENDING(task, 1);
  1.5422 +
  1.5423 +			/*
  1.5424 +			 * when coming from ctxsw, current still points to the
  1.5425 +			 * previous task, therefore we must work with task and not current.
  1.5426 +			 */
  1.5427 +			pfm_set_task_notify(task);
  1.5428 +		}
  1.5429 +		/*
  1.5430 +		 * defer until state is changed (shorten spin window). the context is locked
  1.5431 +		 * anyway, so the signal receiver would just spin for nothing.
  1.5432 +		 */
  1.5433 +		must_notify = 1;
  1.5434 +	}
  1.5435 +
  1.5436 +	DPRINT_ovfl(("owner [%d] pending=%ld reason=%u ovfl_pmds=0x%lx ovfl_notify=0x%lx masked=%d\n",
  1.5437 +			GET_PMU_OWNER() ? GET_PMU_OWNER()->pid : -1,
  1.5438 +			PFM_GET_WORK_PENDING(task),
  1.5439 +			ctx->ctx_fl_trap_reason,
  1.5440 +			ovfl_pmds,
  1.5441 +			ovfl_notify,
  1.5442 +			ovfl_ctrl.bits.mask_monitoring ? 1 : 0));
  1.5443 +	/*
  1.5444 +	 * in case monitoring must be stopped, we toggle the psr bits
  1.5445 +	 */
  1.5446 +	if (ovfl_ctrl.bits.mask_monitoring) {
  1.5447 +		pfm_mask_monitoring(task);
  1.5448 +		ctx->ctx_state = PFM_CTX_MASKED;
  1.5449 +		ctx->ctx_fl_can_restart = 1;
  1.5450 +	}
  1.5451 +
  1.5452 +	/*
  1.5453 +	 * send notification now
  1.5454 +	 */
  1.5455 +	if (must_notify) pfm_ovfl_notify_user(ctx, ovfl_notify);
  1.5456 +
  1.5457 +	return;
  1.5458 +
  1.5459 +sanity_check:
  1.5460 +	printk(KERN_ERR "perfmon: CPU%d overflow handler [%d] pmc0=0x%lx\n",
  1.5461 +			smp_processor_id(),
  1.5462 +			task ? task->pid : -1,
  1.5463 +			pmc0);
  1.5464 +	return;
  1.5465 +
  1.5466 +stop_monitoring:
  1.5467 +	/*
  1.5468 +	 * in SMP, zombie context is never restored but reclaimed in pfm_load_regs().
  1.5469 +	 * Moreover, zombies are also reclaimed in pfm_save_regs(). Therefore we can
  1.5470 +	 * come here as zombie only if the task is the current task, in which case we
  1.5471 +	 * can access the PMU hardware directly.
  1.5472 +	 *
  1.5473 +	 * Note that zombies do have PM_VALID set. So here we do the minimal.
  1.5474 +	 *
  1.5475 +	 * In case the context was zombified it could not be reclaimed at the time
  1.5476 +	 * the monitoring program exited. At this point, the PMU reservation has been
  1.5477 +	 * returned, the sampling buffer has been freed. We must convert this call
  1.5478 +	 * into a spurious interrupt. However, we must also avoid infinite overflows
  1.5479 +	 * by stopping monitoring for this task. We can only come here for a per-task
  1.5480 +	 * context. All we need to do is to stop monitoring using the psr bits which
  1.5481 +	 * are always task private. By re-enabling secure monitoring, we ensure that
  1.5482 +	 * the monitored task will not be able to re-activate monitoring.
  1.5483 +	 * The task will eventually be context switched out, at which point the context
  1.5484 +	 * will be reclaimed (that includes releasing ownership of the PMU).
  1.5485 +	 *
  1.5486 +	 * So there might be a window of time where the number of per-task sessions is zero
  1.5487 +	 * yet the PMU might have an owner and get at most one overflow interrupt for a zombie
  1.5488 +	 * context. This is safe because if a per-task session comes in, it will push this one
  1.5489 +	 * out and by virtue of pfm_save_regs(), this one will disappear. If a system wide
  1.5490 +	 * session is forced on that CPU, given that we use task pinning, pfm_save_regs() will
  1.5491 +	 * also push our zombie context out.
  1.5492 +	 *
  1.5493 +	 * Overall pretty hairy stuff....
  1.5494 +	 */
  1.5495 +	DPRINT(("ctx is zombie for [%d], converted to spurious\n", task ? task->pid: -1));
  1.5496 +	pfm_clear_psr_up();
  1.5497 +	ia64_psr(regs)->up = 0;
  1.5498 +	ia64_psr(regs)->sp = 1;
  1.5499 +	return;
  1.5500 +}
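
The counter arithmetic used above (and mirrored on the read side in pfm_flush_pmds()) is easier to see with concrete numbers; the 47-bit width below is only an example, the real mask comes from pmu_conf->ovfl_val:

	/*
	 * Worked example of the virtual 64-bit counters, assuming 47 implemented
	 * counter bits, i.e. ovfl_val = (1UL << 47) - 1:
	 *
	 *   full 64-bit count = ctx_pmds[i].val + (ia64_get_pmd(i) & ovfl_val)
	 *
	 * On each hardware overflow the low bits wrap back to zero, so the handler
	 * adds one full counter span to the software part:
	 *
	 *   new_val = old_val + 1 + ovfl_val = old_val + (1UL << 47)
	 *
	 * The "old_val > new_val" test therefore fires only when the 64-bit virtual
	 * counter itself wraps. That is the usual way sampling periods are
	 * expressed: to be notified every N events the PMD is pre-loaded with -N,
	 * so the virtual counter starts at 2^64 - N and wraps (taking the
	 * notification path above) after N qualified events.
	 */
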
  1.5501 +
  1.5502 +static int
  1.5503 +pfm_do_interrupt_handler(int irq, void *arg, struct pt_regs *regs)
  1.5504 +{
  1.5505 +	struct task_struct *task;
  1.5506 +	pfm_context_t *ctx;
  1.5507 +	unsigned long flags;
  1.5508 +	u64 pmc0;
  1.5509 +	int this_cpu = smp_processor_id();
  1.5510 +	int retval = 0;
  1.5511 +
  1.5512 +	pfm_stats[this_cpu].pfm_ovfl_intr_count++;
  1.5513 +
  1.5514 +	/*
  1.5515 +	 * srlz.d done before arriving here
  1.5516 +	 */
  1.5517 +	pmc0 = ia64_get_pmc(0);
  1.5518 +
  1.5519 +	task = GET_PMU_OWNER();
  1.5520 +	ctx  = GET_PMU_CTX();
  1.5521 +
  1.5522 +	/*
  1.5523 +	 * if we have some pending bits set
  1.5524 +	 * this assumes: if any of PMC0 bits [63-1] is set, then PMC0.fr = 1
  1.5525 +	 */
  1.5526 +	if (PMC0_HAS_OVFL(pmc0) && task) {
  1.5527 +		/*
  1.5528 +		 * we assume that pmc0.fr is always set here
  1.5529 +		 */
  1.5530 +
  1.5531 +		/* sanity check */
  1.5532 +		if (!ctx) goto report_spurious1;
  1.5533 +
  1.5534 +		if (ctx->ctx_fl_system == 0 && (task->thread.flags & IA64_THREAD_PM_VALID) == 0) 
  1.5535 +			goto report_spurious2;
  1.5536 +
  1.5537 +		PROTECT_CTX_NOPRINT(ctx, flags);
  1.5538 +
  1.5539 +		pfm_overflow_handler(task, ctx, pmc0, regs);
  1.5540 +
  1.5541 +		UNPROTECT_CTX_NOPRINT(ctx, flags);
  1.5542 +
  1.5543 +	} else {
  1.5544 +		pfm_stats[this_cpu].pfm_spurious_ovfl_intr_count++;
  1.5545 +		retval = -1;
  1.5546 +	}
  1.5547 +	/*
  1.5548 +	 * keep it unfrozen at all times
  1.5549 +	 */
  1.5550 +	pfm_unfreeze_pmu();
  1.5551 +
  1.5552 +	return retval;
  1.5553 +
  1.5554 +report_spurious1:
  1.5555 +	printk(KERN_INFO "perfmon: spurious overflow interrupt on CPU%d: process %d has no PFM context\n",
  1.5556 +		this_cpu, task->pid);
  1.5557 +	pfm_unfreeze_pmu();
  1.5558 +	return -1;
  1.5559 +report_spurious2:
  1.5560 +	printk(KERN_INFO "perfmon: spurious overflow interrupt on CPU%d: process %d, invalid flag\n", 
  1.5561 +		this_cpu, 
  1.5562 +		task->pid);
  1.5563 +	pfm_unfreeze_pmu();
  1.5564 +	return -1;
  1.5565 +}
  1.5566 +
  1.5567 +static irqreturn_t
  1.5568 +pfm_interrupt_handler(int irq, void *arg, struct pt_regs *regs)
  1.5569 +{
  1.5570 +	unsigned long start_cycles, total_cycles;
  1.5571 +	unsigned long min, max;
  1.5572 +	int this_cpu;
  1.5573 +	int ret;
  1.5574 +
  1.5575 +	this_cpu = get_cpu();
  1.5576 +	if (likely(!pfm_alt_intr_handler)) {
  1.5577 +		min = pfm_stats[this_cpu].pfm_ovfl_intr_cycles_min;
  1.5578 +		max = pfm_stats[this_cpu].pfm_ovfl_intr_cycles_max;
  1.5579 +
  1.5580 +		start_cycles = ia64_get_itc();
  1.5581 +
  1.5582 +		ret = pfm_do_interrupt_handler(irq, arg, regs);
  1.5583 +
  1.5584 +		total_cycles = ia64_get_itc();
  1.5585 +
  1.5586 +		/*
  1.5587 +		 * don't measure spurious interrupts
  1.5588 +		 */
  1.5589 +		if (likely(ret == 0)) {
  1.5590 +			total_cycles -= start_cycles;
  1.5591 +
  1.5592 +			if (total_cycles < min) pfm_stats[this_cpu].pfm_ovfl_intr_cycles_min = total_cycles;
  1.5593 +			if (total_cycles > max) pfm_stats[this_cpu].pfm_ovfl_intr_cycles_max = total_cycles;
  1.5594 +
  1.5595 +			pfm_stats[this_cpu].pfm_ovfl_intr_cycles += total_cycles;
  1.5596 +		}
  1.5597 +	}
  1.5598 +	else {
  1.5599 +		(*pfm_alt_intr_handler->handler)(irq, arg, regs);
  1.5600 +	}
  1.5601 +
  1.5602 +	put_cpu_no_resched();
  1.5603 +	return IRQ_HANDLED;
  1.5604 +}
  1.5605 +
  1.5606 +/*
  1.5607 + * /proc/perfmon interface, for debug only
  1.5608 + */
  1.5609 +
  1.5610 +#define PFM_PROC_SHOW_HEADER	((void *)NR_CPUS+1)
  1.5611 +
  1.5612 +static void *
  1.5613 +pfm_proc_start(struct seq_file *m, loff_t *pos)
  1.5614 +{
  1.5615 +	if (*pos == 0) {
  1.5616 +		return PFM_PROC_SHOW_HEADER;
  1.5617 +	}
  1.5618 +
  1.5619 +	while (*pos <= NR_CPUS) {
  1.5620 +		if (cpu_online(*pos - 1)) {
  1.5621 +			return (void *)*pos;
  1.5622 +		}
  1.5623 +		++*pos;
  1.5624 +	}
  1.5625 +	return NULL;
  1.5626 +}
  1.5627 +
  1.5628 +static void *
  1.5629 +pfm_proc_next(struct seq_file *m, void *v, loff_t *pos)
  1.5630 +{
  1.5631 +	++*pos;
  1.5632 +	return pfm_proc_start(m, pos);
  1.5633 +}
  1.5634 +
  1.5635 +static void
  1.5636 +pfm_proc_stop(struct seq_file *m, void *v)
  1.5637 +{
  1.5638 +}
  1.5639 +
  1.5640 +static void
  1.5641 +pfm_proc_show_header(struct seq_file *m)
  1.5642 +{
  1.5643 +	struct list_head * pos;
  1.5644 +	pfm_buffer_fmt_t * entry;
  1.5645 +	unsigned long flags;
  1.5646 +
  1.5647 + 	seq_printf(m,
  1.5648 +		"perfmon version           : %u.%u\n"
  1.5649 +		"model                     : %s\n"
  1.5650 +		"fastctxsw                 : %s\n"
  1.5651 +		"expert mode               : %s\n"
  1.5652 +		"ovfl_mask                 : 0x%lx\n"
  1.5653 +		"PMU flags                 : 0x%x\n",
  1.5654 +		PFM_VERSION_MAJ, PFM_VERSION_MIN,
  1.5655 +		pmu_conf->pmu_name,
  1.5656 +		pfm_sysctl.fastctxsw > 0 ? "Yes": "No",
  1.5657 +		pfm_sysctl.expert_mode > 0 ? "Yes": "No",
  1.5658 +		pmu_conf->ovfl_val,
  1.5659 +		pmu_conf->flags);
  1.5660 +
  1.5661 +  	LOCK_PFS(flags);
  1.5662 +
  1.5663 + 	seq_printf(m,
  1.5664 + 		"proc_sessions             : %u\n"
  1.5665 + 		"sys_sessions              : %u\n"
  1.5666 + 		"sys_use_dbregs            : %u\n"
  1.5667 + 		"ptrace_use_dbregs         : %u\n",
  1.5668 + 		pfm_sessions.pfs_task_sessions,
  1.5669 + 		pfm_sessions.pfs_sys_sessions,
  1.5670 + 		pfm_sessions.pfs_sys_use_dbregs,
  1.5671 + 		pfm_sessions.pfs_ptrace_use_dbregs);
  1.5672 +
  1.5673 +  	UNLOCK_PFS(flags);
  1.5674 +
  1.5675 +	spin_lock(&pfm_buffer_fmt_lock);
  1.5676 +
  1.5677 +	list_for_each(pos, &pfm_buffer_fmt_list) {
  1.5678 +		entry = list_entry(pos, pfm_buffer_fmt_t, fmt_list);
  1.5679 +		seq_printf(m, "format                    : %02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x %s\n",
  1.5680 +			entry->fmt_uuid[0],
  1.5681 +			entry->fmt_uuid[1],
  1.5682 +			entry->fmt_uuid[2],
  1.5683 +			entry->fmt_uuid[3],
  1.5684 +			entry->fmt_uuid[4],
  1.5685 +			entry->fmt_uuid[5],
  1.5686 +			entry->fmt_uuid[6],
  1.5687 +			entry->fmt_uuid[7],
  1.5688 +			entry->fmt_uuid[8],
  1.5689 +			entry->fmt_uuid[9],
  1.5690 +			entry->fmt_uuid[10],
  1.5691 +			entry->fmt_uuid[11],
  1.5692 +			entry->fmt_uuid[12],
  1.5693 +			entry->fmt_uuid[13],
  1.5694 +			entry->fmt_uuid[14],
  1.5695 +			entry->fmt_uuid[15],
  1.5696 +			entry->fmt_name);
  1.5697 +	}
  1.5698 +	spin_unlock(&pfm_buffer_fmt_lock);
  1.5699 +
  1.5700 +}
  1.5701 +
  1.5702 +static int
  1.5703 +pfm_proc_show(struct seq_file *m, void *v)
  1.5704 +{
  1.5705 +	unsigned long psr;
  1.5706 +	unsigned int i;
  1.5707 +	int cpu;
  1.5708 +
  1.5709 +	if (v == PFM_PROC_SHOW_HEADER) {
  1.5710 +		pfm_proc_show_header(m);
  1.5711 +		return 0;
  1.5712 +	}
  1.5713 +
  1.5714 +	/* show info for CPU (v - 1) */
  1.5715 +
  1.5716 +	cpu = (long)v - 1;
  1.5717 +	seq_printf(m,
  1.5718 +		"CPU%-2d overflow intrs      : %lu\n"
  1.5719 +		"CPU%-2d overflow cycles     : %lu\n"
  1.5720 +		"CPU%-2d overflow min        : %lu\n"
  1.5721 +		"CPU%-2d overflow max        : %lu\n"
  1.5722 +		"CPU%-2d smpl handler calls  : %lu\n"
  1.5723 +		"CPU%-2d smpl handler cycles : %lu\n"
  1.5724 +		"CPU%-2d spurious intrs      : %lu\n"
  1.5725 +		"CPU%-2d replay   intrs      : %lu\n"
  1.5726 +		"CPU%-2d syst_wide           : %d\n"
  1.5727 +		"CPU%-2d dcr_pp              : %d\n"
  1.5728 +		"CPU%-2d exclude idle        : %d\n"
  1.5729 +		"CPU%-2d owner               : %d\n"
  1.5730 +		"CPU%-2d context             : %p\n"
  1.5731 +		"CPU%-2d activations         : %lu\n",
  1.5732 +		cpu, pfm_stats[cpu].pfm_ovfl_intr_count,
  1.5733 +		cpu, pfm_stats[cpu].pfm_ovfl_intr_cycles,
  1.5734 +		cpu, pfm_stats[cpu].pfm_ovfl_intr_cycles_min,
  1.5735 +		cpu, pfm_stats[cpu].pfm_ovfl_intr_cycles_max,
  1.5736 +		cpu, pfm_stats[cpu].pfm_smpl_handler_calls,
  1.5737 +		cpu, pfm_stats[cpu].pfm_smpl_handler_cycles,
  1.5738 +		cpu, pfm_stats[cpu].pfm_spurious_ovfl_intr_count,
  1.5739 +		cpu, pfm_stats[cpu].pfm_replay_ovfl_intr_count,
  1.5740 +		cpu, pfm_get_cpu_data(pfm_syst_info, cpu) & PFM_CPUINFO_SYST_WIDE ? 1 : 0,
  1.5741 +		cpu, pfm_get_cpu_data(pfm_syst_info, cpu) & PFM_CPUINFO_DCR_PP ? 1 : 0,
  1.5742 +		cpu, pfm_get_cpu_data(pfm_syst_info, cpu) & PFM_CPUINFO_EXCL_IDLE ? 1 : 0,
  1.5743 +		cpu, pfm_get_cpu_data(pmu_owner, cpu) ? pfm_get_cpu_data(pmu_owner, cpu)->pid: -1,
  1.5744 +		cpu, pfm_get_cpu_data(pmu_ctx, cpu),
  1.5745 +		cpu, pfm_get_cpu_data(pmu_activation_number, cpu));
  1.5746 +
  1.5747 +	if (num_online_cpus() == 1 && pfm_sysctl.debug > 0) {
  1.5748 +
  1.5749 +		psr = pfm_get_psr();
  1.5750 +
  1.5751 +		ia64_srlz_d();
  1.5752 +
  1.5753 +		seq_printf(m, 
  1.5754 +			"CPU%-2d psr                 : 0x%lx\n"
  1.5755 +			"CPU%-2d pmc0                : 0x%lx\n", 
  1.5756 +			cpu, psr,
  1.5757 +			cpu, ia64_get_pmc(0));
  1.5758 +
  1.5759 +		for (i=0; PMC_IS_LAST(i) == 0;  i++) {
  1.5760 +			if (PMC_IS_COUNTING(i) == 0) continue;
  1.5761 +   			seq_printf(m, 
  1.5762 +				"CPU%-2d pmc%u                : 0x%lx\n"
  1.5763 +   				"CPU%-2d pmd%u                : 0x%lx\n", 
  1.5764 +				cpu, i, ia64_get_pmc(i),
  1.5765 +				cpu, i, ia64_get_pmd(i));
  1.5766 +  		}
  1.5767 +	}
  1.5768 +	return 0;
  1.5769 +}
  1.5770 +
  1.5771 +struct seq_operations pfm_seq_ops = {
  1.5772 +	.start =	pfm_proc_start,
  1.5773 + 	.next =		pfm_proc_next,
  1.5774 + 	.stop =		pfm_proc_stop,
  1.5775 + 	.show =		pfm_proc_show
  1.5776 +};
  1.5777 +
  1.5778 +static int
  1.5779 +pfm_proc_open(struct inode *inode, struct file *file)
  1.5780 +{
  1.5781 +	return seq_open(file, &pfm_seq_ops);
  1.5782 +}
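
pfm_proc_open() is the only non-generic piece of the /proc/perfmon plumbing; the read side is standard seq_file. The wiring, as a sketch (the real file_operations definition and the create_proc_entry() registration live outside this hunk):

	static struct file_operations pfm_proc_fops = {
		.open		= pfm_proc_open,
		.read		= seq_read,
		.llseek		= seq_lseek,
		.release	= seq_release,
	};

	/* at init time, roughly:
	 *	struct proc_dir_entry *de = create_proc_entry("perfmon", S_IRUGO, NULL);
	 *	if (de) de->proc_fops = &pfm_proc_fops;
	 */
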
  1.5783 +
  1.5784 +
  1.5785 +/*
  1.5786 + * we come here as soon as local_cpu_data->pfm_syst_wide is set. this happens
  1.5787 + * during pfm_enable() hence before pfm_start(). We cannot assume monitoring
  1.5788 + * is active or inactive based on mode. We must rely on the value in
  1.5789 + * local_cpu_data->pfm_syst_info
  1.5790 + */
  1.5791 +void
  1.5792 +pfm_syst_wide_update_task(struct task_struct *task, unsigned long info, int is_ctxswin)
  1.5793 +{
  1.5794 +	struct pt_regs *regs;
  1.5795 +	unsigned long dcr;
  1.5796 +	unsigned long dcr_pp;
  1.5797 +
  1.5798 +	dcr_pp = info & PFM_CPUINFO_DCR_PP ? 1 : 0;
  1.5799 +
  1.5800 +	/*
  1.5801 +	 * pid 0 is guaranteed to be the idle task. There is one such task with pid 0
  1.5802 +	 * on every CPU, so we can rely on the pid to identify the idle task.
  1.5803 +	 */
  1.5804 +	if ((info & PFM_CPUINFO_EXCL_IDLE) == 0 || task->pid) {
  1.5805 +		regs = task_pt_regs(task);
  1.5806 +		ia64_psr(regs)->pp = is_ctxswin ? dcr_pp : 0;
  1.5807 +		return;
  1.5808 +	}
  1.5809 +	/*
  1.5810 +	 * if monitoring has started
  1.5811 +	 */
  1.5812 +	if (dcr_pp) {
  1.5813 +		dcr = ia64_getreg(_IA64_REG_CR_DCR);
  1.5814 +		/*
  1.5815 +		 * context switching in?
  1.5816 +		 */
  1.5817 +		if (is_ctxswin) {
  1.5818 +			/* mask monitoring for the idle task */
  1.5819 +			ia64_setreg(_IA64_REG_CR_DCR, dcr & ~IA64_DCR_PP);
  1.5820 +			pfm_clear_psr_pp();
  1.5821 +			ia64_srlz_i();
  1.5822 +			return;
  1.5823 +		}
  1.5824 +		/*
  1.5825 +		 * context switching out
  1.5826 +		 * restore monitoring for next task
  1.5827 +		 *
  1.5828 +		 * Due to inlining this odd if-then-else construction generates
  1.5829 +		 * better code.
  1.5830 +		 */
  1.5831 +		ia64_setreg(_IA64_REG_CR_DCR, dcr |IA64_DCR_PP);
  1.5832 +		pfm_set_psr_pp();
  1.5833 +		ia64_srlz_i();
  1.5834 +	}
  1.5835 +}
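
A compact summary of the cases the function above handles for a system-wide session on this CPU:

	/*
	 *  - any task with pid != 0, or the idle task when EXCL_IDLE is not set:
	 *	on switch-in the task's saved psr.pp is set to dcr.pp, on
	 *	switch-out it is cleared.
	 *  - the idle task with EXCL_IDLE set, once monitoring has started
	 *	(dcr.pp set): on switch-in both DCR.pp and the live psr.pp are
	 *	cleared, on switch-out they are restored, so nothing is counted
	 *	while this CPU is idle.
	 */
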
  1.5836 +
  1.5837 +#ifdef CONFIG_SMP
  1.5838 +
  1.5839 +static void
  1.5840 +pfm_force_cleanup(pfm_context_t *ctx, struct pt_regs *regs)
  1.5841 +{
  1.5842 +	struct task_struct *task = ctx->ctx_task;
  1.5843 +
  1.5844 +	ia64_psr(regs)->up = 0;
  1.5845 +	ia64_psr(regs)->sp = 1;
  1.5846 +
  1.5847 +	if (GET_PMU_OWNER() == task) {
  1.5848 +		DPRINT(("cleared ownership for [%d]\n", ctx->ctx_task->pid));
  1.5849 +		SET_PMU_OWNER(NULL, NULL);
  1.5850 +	}
  1.5851 +
  1.5852 +	/*
  1.5853 +	 * disconnect the task from the context and vice-versa
  1.5854 +	 */
  1.5855 +	PFM_SET_WORK_PENDING(task, 0);
  1.5856 +
  1.5857 +	task->thread.pfm_context  = NULL;
  1.5858 +	task->thread.flags       &= ~IA64_THREAD_PM_VALID;
  1.5859 +
  1.5860 +	DPRINT(("force cleanup for [%d]\n",  task->pid));
  1.5861 +}
  1.5862 +
  1.5863 +
  1.5864 +/*
  1.5865 + * in 2.6, interrupts are masked when we come here and the runqueue lock is held
  1.5866 + */
  1.5867 +void
  1.5868 +pfm_save_regs(struct task_struct *task)
  1.5869 +{
  1.5870 +	pfm_context_t *ctx;
  1.5871 +	struct thread_struct *t;
  1.5872 +	unsigned long flags;
  1.5873 +	u64 psr;
  1.5874 +
  1.5875 +
  1.5876 +	ctx = PFM_GET_CTX(task);
  1.5877 +	if (ctx == NULL) return;
  1.5878 +	t = &task->thread;
  1.5879 +
  1.5880 +	/*
  1.5881 + 	 * we always come here with interrupts ALREADY disabled by
  1.5882 + 	 * the scheduler. So we simply need to protect against concurrent
  1.5883 +	 * access, not CPU concurrency.
  1.5884 +	 */
  1.5885 +	flags = pfm_protect_ctx_ctxsw(ctx);
  1.5886 +
  1.5887 +	if (ctx->ctx_state == PFM_CTX_ZOMBIE) {
  1.5888 +		struct pt_regs *regs = task_pt_regs(task);
  1.5889 +
  1.5890 +		pfm_clear_psr_up();
  1.5891 +
  1.5892 +		pfm_force_cleanup(ctx, regs);
  1.5893 +
  1.5894 +		BUG_ON(ctx->ctx_smpl_hdr);
  1.5895 +
  1.5896 +		pfm_unprotect_ctx_ctxsw(ctx, flags);
  1.5897 +
  1.5898 +		pfm_context_free(ctx);
  1.5899 +		return;
  1.5900 +	}
  1.5901 +
  1.5902 +	/*
  1.5903 +	 * save current PSR: needed because we modify it
  1.5904 +	 */
  1.5905 +	ia64_srlz_d();
  1.5906 +	psr = pfm_get_psr();
  1.5907 +
  1.5908 +	BUG_ON(psr & (IA64_PSR_I));
  1.5909 +
  1.5910 +	/*
  1.5911 +	 * stop monitoring:
  1.5912 +	 * This is the last instruction which may generate an overflow
  1.5913 +	 *
  1.5914 +	 * We do not need to set psr.sp because it is irrelevant in the kernel.
  1.5915 +	 * It will be restored from ipsr when going back to user level
  1.5916 +	 */
  1.5917 +	pfm_clear_psr_up();
  1.5918 +
  1.5919 +	/*
  1.5920 +	 * keep a copy of psr.up (for reload)
  1.5921 +	 */
  1.5922 +	ctx->ctx_saved_psr_up = psr & IA64_PSR_UP;
  1.5923 +
  1.5924 +	/*
  1.5925 +	 * release ownership of this PMU.
  1.5926 +	 * PM interrupts are masked, so nothing
  1.5927 +	 * can happen.
  1.5928 +	 */
  1.5929 +	SET_PMU_OWNER(NULL, NULL);
  1.5930 +
  1.5931 +	/*
  1.5932 +	 * we systematically save the PMDs as we have no
  1.5933 +	 * guarantee we will be scheduled on that same
  1.5934 +	 * CPU again.
  1.5935 +	 */
  1.5936 +	pfm_save_pmds(t->pmds, ctx->ctx_used_pmds[0]);
  1.5937 +
  1.5938 +	/*
  1.5939 +	 * save pmc0 ia64_srlz_d() done in pfm_save_pmds()
  1.5940 +	 * we will need it on the restore path to check
  1.5941 +	 * for pending overflow.
  1.5942 +	 */
  1.5943 +	t->pmcs[0] = ia64_get_pmc(0);
  1.5944 +
  1.5945 +	/*
  1.5946 +	 * unfreeze PMU if it had pending overflows
  1.5947 +	 */
  1.5948 +	if (t->pmcs[0] & ~0x1UL) pfm_unfreeze_pmu();
  1.5949 +
  1.5950 +	/*
  1.5951 +	 * finally, allow context access.
  1.5952 +	 * interrupts will still be masked after this call.
  1.5953 +	 */
  1.5954 +	pfm_unprotect_ctx_ctxsw(ctx, flags);
  1.5955 +}
  1.5956 +
  1.5957 +#else /* !CONFIG_SMP */
  1.5958 +void
  1.5959 +pfm_save_regs(struct task_struct *task)
  1.5960 +{
  1.5961 +	pfm_context_t *ctx;
  1.5962 +	u64 psr;
  1.5963 +
  1.5964 +	ctx = PFM_GET_CTX(task);
  1.5965 +	if (ctx == NULL) return;
  1.5966 +
  1.5967 +	/*
  1.5968 +	 * save current PSR: needed because we modify it
  1.5969 +	 */
  1.5970 +	psr = pfm_get_psr();
  1.5971 +
  1.5972 +	BUG_ON(psr & (IA64_PSR_I));
  1.5973 +
  1.5974 +	/*
  1.5975 +	 * stop monitoring:
  1.5976 +	 * This is the last instruction which may generate an overflow
  1.5977 +	 *
  1.5978 +	 * We do not need to set psr.sp because it is irrelevant in the kernel.
  1.5979 +	 * It will be restored from ipsr when going back to user level
  1.5980 +	 */
  1.5981 +	pfm_clear_psr_up();
  1.5982 +
  1.5983 +	/*
  1.5984 +	 * keep a copy of psr.up (for reload)
  1.5985 +	 */
  1.5986 +	ctx->ctx_saved_psr_up = psr & IA64_PSR_UP;
  1.5987 +}
  1.5988 +
  1.5989 +static void
  1.5990 +pfm_lazy_save_regs (struct task_struct *task)
  1.5991 +{
  1.5992 +	pfm_context_t *ctx;
  1.5993 +	struct thread_struct *t;
  1.5994 +	unsigned long flags;
  1.5995 +
  1.5996 +	{ u64 psr  = pfm_get_psr();
  1.5997 +	  BUG_ON(psr & IA64_PSR_UP);
  1.5998 +	}
  1.5999 +
  1.6000 +	ctx = PFM_GET_CTX(task);
  1.6001 +	t   = &task->thread;
  1.6002 +
  1.6003 +	/*
  1.6004 +	 * we need to mask PMU overflow here to
  1.6005 +	 * make sure that we maintain pmc0 until
  1.6006 +	 * we save it. overflow interrupts are
  1.6007 +	 * treated as spurious if there is no
  1.6008 +	 * owner.
  1.6009 +	 *
  1.6010 +	 * XXX: I don't think this is necessary
  1.6011 +	 */
  1.6012 +	PROTECT_CTX(ctx,flags);
  1.6013 +
  1.6014 +	/*
  1.6015 +	 * release ownership of this PMU.
  1.6016 +	 * must be done before we save the registers.
  1.6017 +	 *
  1.6018 +	 * after this call any PMU interrupt is treated
  1.6019 +	 * as spurious.
  1.6020 +	 */
  1.6021 +	SET_PMU_OWNER(NULL, NULL);
  1.6022 +
  1.6023 +	/*
  1.6024 +	 * save all the pmds we use
  1.6025 +	 */
  1.6026 +	pfm_save_pmds(t->pmds, ctx->ctx_used_pmds[0]);
  1.6027 +
  1.6028 +	/*
  1.6029 +	 * save pmc0 ia64_srlz_d() done in pfm_save_pmds()
  1.6030 +	 * it is needed to check for pending overflow
  1.6031 +	 * on the restore path
  1.6032 +	 */
  1.6033 +	t->pmcs[0] = ia64_get_pmc(0);
  1.6034 +
  1.6035 +	/*
  1.6036 +	 * unfreeze PMU if had pending overflows
  1.6037 +	 * unfreeze PMU if it had pending overflows
  1.6038 +	if (t->pmcs[0] & ~0x1UL) pfm_unfreeze_pmu();
  1.6039 +
  1.6040 +	/*
  1.6041 +	 * now we can unmask PMU interrupts, they will
  1.6042 +	 * be treated as purely spurious and we will not
  1.6043 +	 * lose any information
  1.6044 +	 */
  1.6045 +	UNPROTECT_CTX(ctx,flags);
  1.6046 +}
  1.6047 +#endif /* CONFIG_SMP */
  1.6048 +
  1.6049 +#ifdef CONFIG_SMP
  1.6050 +/*
  1.6051 + * in 2.6, interrupts are masked when we come here and the runqueue lock is held
  1.6052 + */
  1.6053 +void
  1.6054 +pfm_load_regs (struct task_struct *task)
  1.6055 +{
  1.6056 +	pfm_context_t *ctx;
  1.6057 +	struct thread_struct *t;
  1.6058 +	unsigned long pmc_mask = 0UL, pmd_mask = 0UL;
  1.6059 +	unsigned long flags;
  1.6060 +	u64 psr, psr_up;
  1.6061 +	int need_irq_resend;
  1.6062 +
  1.6063 +	ctx = PFM_GET_CTX(task);
  1.6064 +	if (unlikely(ctx == NULL)) return;
  1.6065 +
  1.6066 +	BUG_ON(GET_PMU_OWNER());
  1.6067 +
  1.6068 +	t     = &task->thread;
  1.6069 +	/*
  1.6070 +	 * possible on unload
  1.6071 +	 */
  1.6072 +	if (unlikely((t->flags & IA64_THREAD_PM_VALID) == 0)) return;
  1.6073 +
  1.6074 +	/*
  1.6075 + 	 * we always come here with interrupts ALREADY disabled by
  1.6076 + 	 * the scheduler. So we simply need to protect against concurrent
  1.6077 +	 * access, not CPU concurrency.
  1.6078 +	 */
  1.6079 +	flags = pfm_protect_ctx_ctxsw(ctx);
  1.6080 +	psr   = pfm_get_psr();
  1.6081 +
  1.6082 +	need_irq_resend = pmu_conf->flags & PFM_PMU_IRQ_RESEND;
  1.6083 +
  1.6084 +	BUG_ON(psr & (IA64_PSR_UP|IA64_PSR_PP));
  1.6085 +	BUG_ON(psr & IA64_PSR_I);
  1.6086 +
  1.6087 +	if (unlikely(ctx->ctx_state == PFM_CTX_ZOMBIE)) {
  1.6088 +		struct pt_regs *regs = task_pt_regs(task);
  1.6089 +
  1.6090 +		BUG_ON(ctx->ctx_smpl_hdr);
  1.6091 +
  1.6092 +		pfm_force_cleanup(ctx, regs);
  1.6093 +
  1.6094 +		pfm_unprotect_ctx_ctxsw(ctx, flags);
  1.6095 +
  1.6096 +		/*
  1.6097 +		 * this one (kmalloc'ed) is fine with interrupts disabled
  1.6098 +		 */
  1.6099 +		pfm_context_free(ctx);
  1.6100 +
  1.6101 +		return;
  1.6102 +	}
  1.6103 +
  1.6104 +	/*
  1.6105 +	 * we restore ALL the debug registers to avoid picking up
  1.6106 +	 * stale state.
  1.6107 +	 */
  1.6108 +	if (ctx->ctx_fl_using_dbreg) {
  1.6109 +		pfm_restore_ibrs(ctx->ctx_ibrs, pmu_conf->num_ibrs);
  1.6110 +		pfm_restore_dbrs(ctx->ctx_dbrs, pmu_conf->num_dbrs);
  1.6111 +	}
  1.6112 +	/*
  1.6113 +	 * retrieve saved psr.up
  1.6114 +	 */
  1.6115 +	psr_up = ctx->ctx_saved_psr_up;
  1.6116 +
  1.6117 +	/*
  1.6118 +	 * if we were the last user of the PMU on that CPU,
  1.6119 +	 * then nothing to do except restore psr
  1.6120 +	 */
  1.6121 +	if (GET_LAST_CPU(ctx) == smp_processor_id() && ctx->ctx_last_activation == GET_ACTIVATION()) {
  1.6122 +
  1.6123 +		/*
  1.6124 +		 * retrieve partial reload masks (due to user modifications)
  1.6125 +		 */
  1.6126 +		pmc_mask = ctx->ctx_reload_pmcs[0];
  1.6127 +		pmd_mask = ctx->ctx_reload_pmds[0];
  1.6128 +
  1.6129 +	} else {
  1.6130 +		/*
  1.6131 +	 	 * To avoid leaking information to the user level when psr.sp=0,
  1.6132 +	 	 * we must reload ALL implemented pmds (even the ones we don't use).
  1.6133 +	 	 * In the kernel we only allow PFM_READ_PMDS on registers which
  1.6134 +	 	 * we initialized or requested (sampling) so there is no risk there.
  1.6135 +	 	 */
  1.6136 +		pmd_mask = pfm_sysctl.fastctxsw ?  ctx->ctx_used_pmds[0] : ctx->ctx_all_pmds[0];
  1.6137 +
  1.6138 +		/*
  1.6139 +	 	 * ALL accessible PMCs are systematically reloaded, unused registers
  1.6140 +	 	 * get their default (from pfm_reset_pmu_state()) values to avoid picking
  1.6141 +	 	 * up stale configuration.
  1.6142 +	 	 *
  1.6143 +	 	 * PMC0 is never in the mask. It is always restored separately.
  1.6144 +	 	 */
  1.6145 +		pmc_mask = ctx->ctx_all_pmcs[0];
  1.6146 +	}
  1.6147 +	/*
  1.6148 +	 * when context is MASKED, we will restore PMC with plm=0
  1.6149 +	 * and PMD with stale information, but that's ok, nothing
  1.6150 +	 * will be captured.
  1.6151 +	 *
  1.6152 +	 * XXX: optimize here
  1.6153 +	 */
  1.6154 +	if (pmd_mask) pfm_restore_pmds(t->pmds, pmd_mask);
  1.6155 +	if (pmc_mask) pfm_restore_pmcs(t->pmcs, pmc_mask);
  1.6156 +
  1.6157 +	/*
  1.6158 +	 * check for pending overflow at the time the state
  1.6159 +	 * was saved.
  1.6160 +	 */
  1.6161 +	if (unlikely(PMC0_HAS_OVFL(t->pmcs[0]))) {
  1.6162 +		/*
  1.6163 +		 * reload pmc0 with the overflow information
  1.6164 +		 * On McKinley PMU, this will trigger a PMU interrupt
  1.6165 +		 */
  1.6166 +		ia64_set_pmc(0, t->pmcs[0]);
  1.6167 +		ia64_srlz_d();
  1.6168 +		t->pmcs[0] = 0UL;
  1.6169 +
  1.6170 +		/*
  1.6171 +		 * will replay the PMU interrupt
  1.6172 +		 */
  1.6173 +		if (need_irq_resend) hw_resend_irq(NULL, IA64_PERFMON_VECTOR);
  1.6174 +
  1.6175 +		pfm_stats[smp_processor_id()].pfm_replay_ovfl_intr_count++;
  1.6176 +	}
  1.6177 +
  1.6178 +	/*
  1.6179 +	 * we just did a reload, so we reset the partial reload fields
  1.6180 +	 */
  1.6181 +	ctx->ctx_reload_pmcs[0] = 0UL;
  1.6182 +	ctx->ctx_reload_pmds[0] = 0UL;
  1.6183 +
  1.6184 +	SET_LAST_CPU(ctx, smp_processor_id());
  1.6185 +
  1.6186 +	/*
  1.6187 +	 * bump the activation value for this PMU
  1.6188 +	 */
  1.6189 +	INC_ACTIVATION();
  1.6190 +	/*
  1.6191 +	 * record current activation for this context
  1.6192 +	 */
  1.6193 +	SET_ACTIVATION(ctx);
  1.6194 +
  1.6195 +	/*
  1.6196 +	 * establish new ownership. 
  1.6197 +	 */
  1.6198 +	SET_PMU_OWNER(task, ctx);
  1.6199 +
  1.6200 +	/*
  1.6201 +	 * restore the psr.up bit. measurement
  1.6202 +	 * is active again.
  1.6203 +	 * no PMU interrupt can happen at this point
  1.6204 +	 * because we still have interrupts disabled.
  1.6205 +	 */
  1.6206 +	if (likely(psr_up)) pfm_set_psr_up();
  1.6207 +
  1.6208 +	/*
  1.6209 +	 * allow concurrent access to context
  1.6210 +	 */
  1.6211 +	pfm_unprotect_ctx_ctxsw(ctx, flags);
  1.6212 +}
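
The last-CPU/activation test above is the fast path that keeps context-switch cost low. In outline (descriptive only; the authoritative logic is the code above):

	/*
	 * Reload decision in pfm_load_regs():
	 *
	 *	if (context last ran on this CPU && the PMU was not touched since)
	 *		reload only what user commands modified while we were out
	 *		(ctx_reload_pmcs[0] / ctx_reload_pmds[0]);
	 *	else
	 *		reload all accessible PMCs and, unless fastctxsw is set,
	 *		all implemented PMDs, so no stale values can leak to a
	 *		task reading the PMU with psr.sp == 0.
	 *
	 * Each call then bumps the per-CPU activation number and records it in
	 * the context (INC_ACTIVATION()/SET_ACTIVATION()), which invalidates
	 * this fast path for any other context last loaded on this CPU.
	 */
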
  1.6213 +#else /*  !CONFIG_SMP */
  1.6214 +/*
  1.6215 + * reload PMU state for UP kernels
  1.6216 + * in 2.6 we come here with interrupts disabled
  1.6217 + */
  1.6218 +void
  1.6219 +pfm_load_regs (struct task_struct *task)
  1.6220 +{
  1.6221 +	struct thread_struct *t;
  1.6222 +	pfm_context_t *ctx;
  1.6223 +	struct task_struct *owner;
  1.6224 +	unsigned long pmd_mask, pmc_mask;
  1.6225 +	u64 psr, psr_up;
  1.6226 +	int need_irq_resend;
  1.6227 +
  1.6228 +	owner = GET_PMU_OWNER();
  1.6229 +	ctx   = PFM_GET_CTX(task);
  1.6230 +	t     = &task->thread;
  1.6231 +	psr   = pfm_get_psr();
  1.6232 +
  1.6233 +	BUG_ON(psr & (IA64_PSR_UP|IA64_PSR_PP));
  1.6234 +	BUG_ON(psr & IA64_PSR_I);
  1.6235 +
  1.6236 +	/*
  1.6237 +	 * we restore ALL the debug registers to avoid picking up
  1.6238 +	 * stale state.
  1.6239 +	 *
  1.6240 +	 * This must be done even when the task is still the owner
  1.6241 +	 * as the registers may have been modified via ptrace()
  1.6242 +	 * (not perfmon) by the previous task.
  1.6243 +	 */
  1.6244 +	if (ctx->ctx_fl_using_dbreg) {
  1.6245 +		pfm_restore_ibrs(ctx->ctx_ibrs, pmu_conf->num_ibrs);
  1.6246 +		pfm_restore_dbrs(ctx->ctx_dbrs, pmu_conf->num_dbrs);
  1.6247 +	}
  1.6248 +
  1.6249 +	/*
  1.6250 +	 * retrieve saved psr.up
  1.6251 +	 */
  1.6252 +	psr_up = ctx->ctx_saved_psr_up;
  1.6253 +	need_irq_resend = pmu_conf->flags & PFM_PMU_IRQ_RESEND;
  1.6254 +
  1.6255 +	/*
  1.6256 +	 * short path, our state is still there, just
  1.6257 +	 * need to restore psr and we go
  1.6258 +	 *
  1.6259 +	 * we touch neither PMCs nor PMDs. The psr is not touched
  1.6260 +	 * by the overflow_handler. So we are safe w.r.t. interrupt
  1.6261 +	 * concurrency even without interrupt masking.
  1.6262 +	 */
  1.6263 +	if (likely(owner == task)) {
  1.6264 +		if (likely(psr_up)) pfm_set_psr_up();
  1.6265 +		return;
  1.6266 +	}
  1.6267 +
  1.6268 +	/*
  1.6269 +	 * someone else is still using the PMU, first push it out and
  1.6270 +	 * then we'll be able to install our stuff !
  1.6271 +	 *
  1.6272 +	 * Upon return, there will be no owner for the current PMU
  1.6273 +	 */
  1.6274 +	if (owner) pfm_lazy_save_regs(owner);
  1.6275 +
  1.6276 +	/*
  1.6277 +	 * To avoid leaking information to the user level when psr.sp=0,
  1.6278 +	 * we must reload ALL implemented pmds (even the ones we don't use).
  1.6279 +	 * In the kernel we only allow PFM_READ_PMDS on registers which
  1.6280 +	 * we initialized or requested (sampling) so there is no risk there.
  1.6281 +	 */
  1.6282 +	pmd_mask = pfm_sysctl.fastctxsw ?  ctx->ctx_used_pmds[0] : ctx->ctx_all_pmds[0];
  1.6283 +
  1.6284 +	/*
  1.6285 +	 * ALL accessible PMCs are systematically reloaded, unused registers
  1.6286 +	 * get their default (from pfm_reset_pmu_state()) values to avoid picking
  1.6287 +	 * up stale configuration.
  1.6288 +	 *
  1.6289 +	 * PMC0 is never in the mask. It is always restored separately
  1.6290 +	 */
  1.6291 +	pmc_mask = ctx->ctx_all_pmcs[0];
  1.6292 +
  1.6293 +	pfm_restore_pmds(t->pmds, pmd_mask);
  1.6294 +	pfm_restore_pmcs(t->pmcs, pmc_mask);
  1.6295 +
  1.6296 +	/*
  1.6297 +	 * check for pending overflow at the time the state
  1.6298 +	 * was saved.
  1.6299 +	 */
  1.6300 +	if (unlikely(PMC0_HAS_OVFL(t->pmcs[0]))) {
  1.6301 +		/*
  1.6302 +		 * reload pmc0 with the overflow information
  1.6303 +		 * On McKinley PMU, this will trigger a PMU interrupt
  1.6304 +		 */
  1.6305 +		ia64_set_pmc(0, t->pmcs[0]);
  1.6306 +		ia64_srlz_d();
  1.6307 +
  1.6308 +		t->pmcs[0] = 0UL;
  1.6309 +
  1.6310 +		/*
  1.6311 +		 * will replay the PMU interrupt
  1.6312 +		 */
  1.6313 +		if (need_irq_resend) hw_resend_irq(NULL, IA64_PERFMON_VECTOR);
  1.6314 +
  1.6315 +		pfm_stats[smp_processor_id()].pfm_replay_ovfl_intr_count++;
  1.6316 +	}
  1.6317 +
  1.6318 +	/*
  1.6319 +	 * establish new ownership. 
  1.6320 +	 */
  1.6321 +	SET_PMU_OWNER(task, ctx);
  1.6322 +
  1.6323 +	/*
  1.6324 +	 * restore the psr.up bit. measurement
  1.6325 +	 * is active again.
  1.6326 +	 * no PMU interrupt can happen at this point
  1.6327 +	 * because we still have interrupts disabled.
  1.6328 +	 */
  1.6329 +	if (likely(psr_up)) pfm_set_psr_up();
  1.6330 +}
  1.6331 +#endif /* CONFIG_SMP */
  1.6332 +
  1.6333 +/*
  1.6334 + * this function assumes monitoring is stopped
  1.6335 + */
  1.6336 +static void
  1.6337 +pfm_flush_pmds(struct task_struct *task, pfm_context_t *ctx)
  1.6338 +{
  1.6339 +	u64 pmc0;
  1.6340 +	unsigned long mask2, val, pmd_val, ovfl_val;
  1.6341 +	int i, can_access_pmu = 0;
  1.6342 +	int is_self;
  1.6343 +
  1.6344 +	/*
  1.6345 +	 * is the caller the task being monitored (or which initiated the
  1.6346 +	 * session for system wide measurements)
  1.6347 +	 */
  1.6348 +	is_self = ctx->ctx_task == task ? 1 : 0;
  1.6349 +
  1.6350 +	/*
  1.6351 +	 * can access the PMU if the task is the owner of the PMU state on the current CPU
  1.6352 +	 * or if we are running on the CPU bound to the context in system-wide mode
  1.6353 +	 * (that is not necessarily the task the context is attached to in this mode).
  1.6354 +	 * In system-wide we always have can_access_pmu true because a task running on an
  1.6355 +	 * invalid processor is flagged earlier in the call stack (see pfm_stop).
  1.6356 +	 */
  1.6357 +	can_access_pmu = (GET_PMU_OWNER() == task) || (ctx->ctx_fl_system && ctx->ctx_cpu == smp_processor_id());
  1.6358 +	if (can_access_pmu) {
  1.6359 +		/*
  1.6360 +		 * Mark the PMU as not owned
  1.6361 +		 * This will cause the interrupt handler to do nothing in case an overflow
  1.6362 +		 * interrupt was in-flight
  1.6363 +		 * This also guarantees that pmc0 will contain the final state
  1.6364 +		 * It virtually gives us full control on overflow processing from that point
  1.6365 +		 * on.
  1.6366 +		 */
  1.6367 +		SET_PMU_OWNER(NULL, NULL);
  1.6368 +		DPRINT(("releasing ownership\n"));
  1.6369 +
  1.6370 +		/*
  1.6371 +		 * read current overflow status:
  1.6372 +		 *
  1.6373 +		 * we are guaranteed to read the final stable state
  1.6374 +		 */
  1.6375 +		ia64_srlz_d();
  1.6376 +		pmc0 = ia64_get_pmc(0); /* slow */
  1.6377 +
  1.6378 +		/*
  1.6379 +		 * reset freeze bit, overflow status information destroyed
  1.6380 +		 */
  1.6381 +		pfm_unfreeze_pmu();
  1.6382 +	} else {
  1.6383 +		pmc0 = task->thread.pmcs[0];
  1.6384 +		/*
  1.6385 +		 * clear whatever overflow status bits there were
  1.6386 +		 */
  1.6387 +		task->thread.pmcs[0] = 0;
  1.6388 +	}
  1.6389 +	ovfl_val = pmu_conf->ovfl_val;
  1.6390 +	/*
  1.6391 +	 * we save all the used pmds
  1.6392 +	 * we take care of overflows for counting PMDs
  1.6393 +	 *
  1.6394 +	 * XXX: sampling situation is not taken into account here
  1.6395 +	 */
  1.6396 +	mask2 = ctx->ctx_used_pmds[0];
  1.6397 +
  1.6398 +	DPRINT(("is_self=%d ovfl_val=0x%lx mask2=0x%lx\n", is_self, ovfl_val, mask2));
  1.6399 +
  1.6400 +	for (i = 0; mask2; i++, mask2>>=1) {
  1.6401 +
   1.6402 +		/* skip unused pmds */
  1.6403 +		if ((mask2 & 0x1) == 0) continue;
  1.6404 +
  1.6405 +		/*
   1.6406 +		 * can_access_pmu is always true in system-wide mode
  1.6407 +		 */
  1.6408 +		val = pmd_val = can_access_pmu ? ia64_get_pmd(i) : task->thread.pmds[i];
  1.6409 +
  1.6410 +		if (PMD_IS_COUNTING(i)) {
  1.6411 +			DPRINT(("[%d] pmd[%d] ctx_pmd=0x%lx hw_pmd=0x%lx\n",
  1.6412 +				task->pid,
  1.6413 +				i,
  1.6414 +				ctx->ctx_pmds[i].val,
  1.6415 +				val & ovfl_val));
  1.6416 +
  1.6417 +			/*
  1.6418 +			 * we rebuild the full 64 bit value of the counter
  1.6419 +			 */
  1.6420 +			val = ctx->ctx_pmds[i].val + (val & ovfl_val);
  1.6421 +
  1.6422 +			/*
  1.6423 +			 * now everything is in ctx_pmds[] and we need
  1.6424 +			 * to clear the saved context from save_regs() such that
  1.6425 +			 * pfm_read_pmds() gets the correct value
  1.6426 +			 */
  1.6427 +			pmd_val = 0UL;
  1.6428 +
  1.6429 +			/*
  1.6430 +			 * take care of overflow inline
  1.6431 +			 */
  1.6432 +			if (pmc0 & (1UL << i)) {
  1.6433 +				val += 1 + ovfl_val;
  1.6434 +				DPRINT(("[%d] pmd[%d] overflowed\n", task->pid, i));
  1.6435 +			}
  1.6436 +		}
  1.6437 +
  1.6438 +		DPRINT(("[%d] ctx_pmd[%d]=0x%lx  pmd_val=0x%lx\n", task->pid, i, val, pmd_val));
  1.6439 +
  1.6440 +		if (is_self) task->thread.pmds[i] = pmd_val;
  1.6441 +
  1.6442 +		ctx->ctx_pmds[i].val = val;
  1.6443 +	}
  1.6444 +}
  1.6445 +
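The loop above rebuilds each 64-bit counter from the software-maintained high bits plus the low bits read back from the hardware PMD, adding one full wrap when pmc0 flags a pending overflow. A minimal userspace sketch of that widening step (illustrative only; the names below are invented and this is not kernel API):

#include <stdio.h>

/*
 * Hypothetical stand-in for the counter widening in pfm_flush_pmds():
 * the hardware PMD implements only the bits covered by ovfl_mask, the
 * upper bits live in software, and a pending overflow adds 1 + ovfl_mask.
 */
static unsigned long
widen_counter(unsigned long soft_val,   /* software-maintained 64-bit value */
	      unsigned long hw_pmd,     /* raw hardware PMD contents        */
	      unsigned long ovfl_mask,  /* e.g. (1UL << 47) - 1             */
	      int overflowed)           /* corresponding bit set in pmc0    */
{
	unsigned long val = soft_val + (hw_pmd & ovfl_mask);

	if (overflowed)
		val += 1 + ovfl_mask;	/* one undetected wrap of the counter */
	return val;
}

int main(void)
{
	unsigned long mask = (1UL << 47) - 1;

	printf("0x%lx\n", widen_counter(0x800000000000UL, 0x10UL, mask, 1));
	return 0;
}
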
  1.6446 +static struct irqaction perfmon_irqaction = {
  1.6447 +	.handler = pfm_interrupt_handler,
  1.6448 +	.flags   = SA_INTERRUPT,
  1.6449 +	.name    = "perfmon"
  1.6450 +};
  1.6451 +
  1.6452 +static void
  1.6453 +pfm_alt_save_pmu_state(void *data)
  1.6454 +{
  1.6455 +	struct pt_regs *regs;
  1.6456 +
  1.6457 +	regs = task_pt_regs(current);
  1.6458 +
  1.6459 +	DPRINT(("called\n"));
  1.6460 +
  1.6461 +	/*
  1.6462 +	 * should not be necessary but
   1.6463 +	 * should not be necessary, but
   1.6464 +	 * let's not take any risk
  1.6465 +	pfm_clear_psr_up();
  1.6466 +	pfm_clear_psr_pp();
  1.6467 +	ia64_psr(regs)->pp = 0;
  1.6468 +
  1.6469 +	/*
  1.6470 +	 * This call is required
  1.6471 +	 * May cause a spurious interrupt on some processors
  1.6472 +	 */
  1.6473 +	pfm_freeze_pmu();
  1.6474 +
  1.6475 +	ia64_srlz_d();
  1.6476 +}
  1.6477 +
  1.6478 +void
  1.6479 +pfm_alt_restore_pmu_state(void *data)
  1.6480 +{
  1.6481 +	struct pt_regs *regs;
  1.6482 +
  1.6483 +	regs = task_pt_regs(current);
  1.6484 +
  1.6485 +	DPRINT(("called\n"));
  1.6486 +
  1.6487 +	/*
  1.6488 +	 * put PMU back in state expected
  1.6489 +	 * by perfmon
  1.6490 +	 */
  1.6491 +	pfm_clear_psr_up();
  1.6492 +	pfm_clear_psr_pp();
  1.6493 +	ia64_psr(regs)->pp = 0;
  1.6494 +
  1.6495 +	/*
  1.6496 +	 * perfmon runs with PMU unfrozen at all times
  1.6497 +	 */
  1.6498 +	pfm_unfreeze_pmu();
  1.6499 +
  1.6500 +	ia64_srlz_d();
  1.6501 +}
  1.6502 +
  1.6503 +int
  1.6504 +pfm_install_alt_pmu_interrupt(pfm_intr_handler_desc_t *hdl)
  1.6505 +{
  1.6506 +	int ret, i;
  1.6507 +	int reserve_cpu;
  1.6508 +
  1.6509 +	/* some sanity checks */
  1.6510 +	if (hdl == NULL || hdl->handler == NULL) return -EINVAL;
  1.6511 +
  1.6512 +	/* do the easy test first */
  1.6513 +	if (pfm_alt_intr_handler) return -EBUSY;
  1.6514 +
  1.6515 +	/* one at a time in the install or remove, just fail the others */
  1.6516 +	if (!spin_trylock(&pfm_alt_install_check)) {
  1.6517 +		return -EBUSY;
  1.6518 +	}
  1.6519 +
  1.6520 +	/* reserve our session */
  1.6521 +	for_each_online_cpu(reserve_cpu) {
  1.6522 +		ret = pfm_reserve_session(NULL, 1, reserve_cpu);
  1.6523 +		if (ret) goto cleanup_reserve;
  1.6524 +	}
  1.6525 +
  1.6526 +	/* save the current system wide pmu states */
  1.6527 +	ret = on_each_cpu(pfm_alt_save_pmu_state, NULL, 0, 1);
  1.6528 +	if (ret) {
  1.6529 +		DPRINT(("on_each_cpu() failed: %d\n", ret));
  1.6530 +		goto cleanup_reserve;
  1.6531 +	}
  1.6532 +
  1.6533 +	/* officially change to the alternate interrupt handler */
  1.6534 +	pfm_alt_intr_handler = hdl;
  1.6535 +
  1.6536 +	spin_unlock(&pfm_alt_install_check);
  1.6537 +
  1.6538 +	return 0;
  1.6539 +
  1.6540 +cleanup_reserve:
  1.6541 +	for_each_online_cpu(i) {
  1.6542 +		/* don't unreserve more than we reserved */
  1.6543 +		if (i >= reserve_cpu) break;
  1.6544 +
  1.6545 +		pfm_unreserve_session(NULL, 1, i);
  1.6546 +	}
  1.6547 +
  1.6548 +	spin_unlock(&pfm_alt_install_check);
  1.6549 +
  1.6550 +	return ret;
  1.6551 +}
  1.6552 +EXPORT_SYMBOL_GPL(pfm_install_alt_pmu_interrupt);
  1.6553 +
  1.6554 +int
  1.6555 +pfm_remove_alt_pmu_interrupt(pfm_intr_handler_desc_t *hdl)
  1.6556 +{
  1.6557 +	int i;
  1.6558 +	int ret;
  1.6559 +
  1.6560 +	if (hdl == NULL) return -EINVAL;
  1.6561 +
  1.6562 +	/* cannot remove someone else's handler! */
  1.6563 +	if (pfm_alt_intr_handler != hdl) return -EINVAL;
  1.6564 +
  1.6565 +	/* one at a time in the install or remove, just fail the others */
  1.6566 +	if (!spin_trylock(&pfm_alt_install_check)) {
  1.6567 +		return -EBUSY;
  1.6568 +	}
  1.6569 +
  1.6570 +	pfm_alt_intr_handler = NULL;
  1.6571 +
  1.6572 +	ret = on_each_cpu(pfm_alt_restore_pmu_state, NULL, 0, 1);
  1.6573 +	if (ret) {
  1.6574 +		DPRINT(("on_each_cpu() failed: %d\n", ret));
  1.6575 +	}
  1.6576 +
  1.6577 +	for_each_online_cpu(i) {
  1.6578 +		pfm_unreserve_session(NULL, 1, i);
  1.6579 +	}
  1.6580 +
  1.6581 +	spin_unlock(&pfm_alt_install_check);
  1.6582 +
  1.6583 +	return 0;
  1.6584 +}
  1.6585 +EXPORT_SYMBOL_GPL(pfm_remove_alt_pmu_interrupt);
  1.6586 +
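pfm_install_alt_pmu_interrupt() claims a system-wide session on every online CPU and, if any reservation fails, releases only the CPUs it had already claimed before returning the error. A small self-contained sketch of that reserve-then-roll-back pattern (plain C, invented names, not the kernel API):

#include <stdio.h>

#define NCPUS 4

static int reserve(int cpu)    { return cpu == 2 ? -1 : 0; } /* pretend CPU 2 fails */
static void unreserve(int cpu) { printf("released cpu %d\n", cpu); }

static int reserve_all(void)
{
	int cpu, ret = 0;

	for (cpu = 0; cpu < NCPUS; cpu++) {
		ret = reserve(cpu);
		if (ret)
			goto rollback;
	}
	return 0;

rollback:
	/* undo only what was actually reserved: CPUs [0, cpu) */
	while (cpu-- > 0)
		unreserve(cpu);
	return ret;
}

int main(void)
{
	return reserve_all() ? 1 : 0;
}
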
  1.6587 +/*
  1.6588 + * perfmon initialization routine, called from the initcall() table
  1.6589 + */
  1.6590 +static int init_pfm_fs(void);
  1.6591 +
  1.6592 +static int __init
  1.6593 +pfm_probe_pmu(void)
  1.6594 +{
  1.6595 +	pmu_config_t **p;
  1.6596 +	int family;
  1.6597 +
  1.6598 +	family = local_cpu_data->family;
  1.6599 +	p      = pmu_confs;
  1.6600 +
  1.6601 +	while(*p) {
  1.6602 +		if ((*p)->probe) {
  1.6603 +			if ((*p)->probe() == 0) goto found;
  1.6604 +		} else if ((*p)->pmu_family == family || (*p)->pmu_family == 0xff) {
  1.6605 +			goto found;
  1.6606 +		}
  1.6607 +		p++;
  1.6608 +	}
  1.6609 +	return -1;
  1.6610 +found:
  1.6611 +	pmu_conf = *p;
  1.6612 +	return 0;
  1.6613 +}
  1.6614 +
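pfm_probe_pmu() walks a NULL-terminated table of PMU descriptions, preferring an explicit probe() callback and otherwise matching on the CPU family, with 0xff acting as a wildcard. A compact sketch of that table-driven selection (illustrative names only, not the kernel structures):

#include <stdio.h>

struct desc {
	const char   *name;
	int         (*probe)(void);	/* optional, returns 0 on a match */
	unsigned int  family;		/* used when probe is NULL        */
};

static int montecito_probe(void) { return -1; }	/* pretend: not this CPU */

static struct desc montecito = { "montecito", montecito_probe, 0x20 };
static struct desc mckinley  = { "mckinley",  NULL,            0x1f };
static struct desc generic   = { "generic",   NULL,            0xff };

static struct desc *table[] = { &montecito, &mckinley, &generic, NULL };

static struct desc *probe(unsigned int family)
{
	struct desc **p;

	for (p = table; *p; p++) {
		if ((*p)->probe) {
			if ((*p)->probe() == 0)
				return *p;
		} else if ((*p)->family == family || (*p)->family == 0xff) {
			return *p;
		}
	}
	return NULL;
}

int main(void)
{
	struct desc *d = probe(0x1f);

	printf("%s\n", d ? d->name : "none");
	return 0;
}
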
  1.6615 +static struct file_operations pfm_proc_fops = {
  1.6616 +	.open		= pfm_proc_open,
  1.6617 +	.read		= seq_read,
  1.6618 +	.llseek		= seq_lseek,
  1.6619 +	.release	= seq_release,
  1.6620 +};
  1.6621 +
  1.6622 +int __init
  1.6623 +pfm_init(void)
  1.6624 +{
  1.6625 +	unsigned int n, n_counters, i;
  1.6626 +
  1.6627 +	printk("perfmon: version %u.%u IRQ %u\n",
  1.6628 +		PFM_VERSION_MAJ,
  1.6629 +		PFM_VERSION_MIN,
  1.6630 +		IA64_PERFMON_VECTOR);
  1.6631 +
  1.6632 +	if (pfm_probe_pmu()) {
  1.6633 +		printk(KERN_INFO "perfmon: disabled, there is no support for processor family %d\n", 
  1.6634 +				local_cpu_data->family);
  1.6635 +		return -ENODEV;
  1.6636 +	}
  1.6637 +
  1.6638 +	/*
  1.6639 +	 * compute the number of implemented PMD/PMC from the
  1.6640 +	 * description tables
  1.6641 +	 */
  1.6642 +	n = 0;
  1.6643 +	for (i=0; PMC_IS_LAST(i) == 0;  i++) {
  1.6644 +		if (PMC_IS_IMPL(i) == 0) continue;
  1.6645 +		pmu_conf->impl_pmcs[i>>6] |= 1UL << (i&63);
  1.6646 +		n++;
  1.6647 +	}
  1.6648 +	pmu_conf->num_pmcs = n;
  1.6649 +
  1.6650 +	n = 0; n_counters = 0;
  1.6651 +	for (i=0; PMD_IS_LAST(i) == 0;  i++) {
  1.6652 +		if (PMD_IS_IMPL(i) == 0) continue;
  1.6653 +		pmu_conf->impl_pmds[i>>6] |= 1UL << (i&63);
  1.6654 +		n++;
  1.6655 +		if (PMD_IS_COUNTING(i)) n_counters++;
  1.6656 +	}
  1.6657 +	pmu_conf->num_pmds      = n;
  1.6658 +	pmu_conf->num_counters  = n_counters;
  1.6659 +
  1.6660 +	/*
  1.6661 +	 * sanity checks on the number of debug registers
  1.6662 +	 */
  1.6663 +	if (pmu_conf->use_rr_dbregs) {
  1.6664 +		if (pmu_conf->num_ibrs > IA64_NUM_DBG_REGS) {
  1.6665 +			printk(KERN_INFO "perfmon: unsupported number of code debug registers (%u)\n", pmu_conf->num_ibrs);
  1.6666 +			pmu_conf = NULL;
  1.6667 +			return -1;
  1.6668 +		}
  1.6669 +		if (pmu_conf->num_dbrs > IA64_NUM_DBG_REGS) {
   1.6670 +			printk(KERN_INFO "perfmon: unsupported number of data debug registers (%u)\n", pmu_conf->num_dbrs);
  1.6671 +			pmu_conf = NULL;
  1.6672 +			return -1;
  1.6673 +		}
  1.6674 +	}
  1.6675 +
  1.6676 +	printk("perfmon: %s PMU detected, %u PMCs, %u PMDs, %u counters (%lu bits)\n",
  1.6677 +	       pmu_conf->pmu_name,
  1.6678 +	       pmu_conf->num_pmcs,
  1.6679 +	       pmu_conf->num_pmds,
  1.6680 +	       pmu_conf->num_counters,
  1.6681 +	       ffz(pmu_conf->ovfl_val));
  1.6682 +
  1.6683 +	/* sanity check */
  1.6684 +	if (pmu_conf->num_pmds >= IA64_NUM_PMD_REGS || pmu_conf->num_pmcs >= IA64_NUM_PMC_REGS) {
  1.6685 +		printk(KERN_ERR "perfmon: not enough pmc/pmd, perfmon disabled\n");
  1.6686 +		pmu_conf = NULL;
  1.6687 +		return -1;
  1.6688 +	}
  1.6689 +
  1.6690 +	/*
  1.6691 +	 * create /proc/perfmon (mostly for debugging purposes)
  1.6692 +	 */
  1.6693 + 	perfmon_dir = create_proc_entry("perfmon", S_IRUGO, NULL);
  1.6694 +	if (perfmon_dir == NULL) {
  1.6695 +		printk(KERN_ERR "perfmon: cannot create /proc entry, perfmon disabled\n");
  1.6696 +		pmu_conf = NULL;
  1.6697 +		return -1;
  1.6698 +	}
  1.6699 +  	/*
  1.6700 + 	 * install customized file operations for /proc/perfmon entry
  1.6701 + 	 */
  1.6702 + 	perfmon_dir->proc_fops = &pfm_proc_fops;
  1.6703 +
  1.6704 +	/*
  1.6705 +	 * create /proc/sys/kernel/perfmon (for debugging purposes)
  1.6706 +	 */
  1.6707 +	pfm_sysctl_header = register_sysctl_table(pfm_sysctl_root, 0);
  1.6708 +
  1.6709 +	/*
  1.6710 +	 * initialize all our spinlocks
  1.6711 +	 */
  1.6712 +	spin_lock_init(&pfm_sessions.pfs_lock);
  1.6713 +	spin_lock_init(&pfm_buffer_fmt_lock);
  1.6714 +
  1.6715 +	init_pfm_fs();
  1.6716 +
  1.6717 +	for(i=0; i < NR_CPUS; i++) pfm_stats[i].pfm_ovfl_intr_cycles_min = ~0UL;
  1.6718 +
  1.6719 +	return 0;
  1.6720 +}
  1.6721 +
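pfm_init() records each implemented register by setting bit (i % 64) in word (i / 64) of a small array, which is what impl_pmcs[i>>6] |= 1UL << (i&63) does. A standalone sketch of that bitmap bookkeeping (invented names, not kernel code):

#include <stdio.h>

#define MAX_REGS 256

static unsigned long impl[MAX_REGS / 64];

static void mark_implemented(unsigned int i)
{
	impl[i >> 6] |= 1UL << (i & 63);	/* word i/64, bit i%64 */
}

static int is_implemented(unsigned int i)
{
	return (impl[i >> 6] >> (i & 63)) & 1;
}

int main(void)
{
	mark_implemented(4);
	mark_implemented(70);
	printf("%d %d %d\n", is_implemented(4), is_implemented(70), is_implemented(5));
	return 0;
}
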
  1.6722 +__initcall(pfm_init);
  1.6723 +
  1.6724 +/*
  1.6725 + * this function is called before pfm_init()
  1.6726 + */
  1.6727 +void
  1.6728 +pfm_init_percpu (void)
  1.6729 +{
  1.6730 +	/*
  1.6731 +	 * make sure no measurement is active
  1.6732 +	 * (may inherit programmed PMCs from EFI).
  1.6733 +	 */
  1.6734 +	pfm_clear_psr_pp();
  1.6735 +	pfm_clear_psr_up();
  1.6736 +
  1.6737 +	/*
  1.6738 +	 * we run with the PMU not frozen at all times
  1.6739 +	 */
  1.6740 +	pfm_unfreeze_pmu();
  1.6741 +
  1.6742 +	if (smp_processor_id() == 0)
  1.6743 +		register_percpu_irq(IA64_PERFMON_VECTOR, &perfmon_irqaction);
  1.6744 +
  1.6745 +	ia64_setreg(_IA64_REG_CR_PMV, IA64_PERFMON_VECTOR);
  1.6746 +	ia64_srlz_d();
  1.6747 +}
  1.6748 +
  1.6749 +/*
  1.6750 + * used for debug purposes only
  1.6751 + */
  1.6752 +void
  1.6753 +dump_pmu_state(const char *from)
  1.6754 +{
  1.6755 +	struct task_struct *task;
  1.6756 +	struct thread_struct *t;
  1.6757 +	struct pt_regs *regs;
  1.6758 +	pfm_context_t *ctx;
  1.6759 +	unsigned long psr, dcr, info, flags;
  1.6760 +	int i, this_cpu;
  1.6761 +
  1.6762 +	local_irq_save(flags);
  1.6763 +
  1.6764 +	this_cpu = smp_processor_id();
  1.6765 +	regs     = task_pt_regs(current);
  1.6766 +	info     = PFM_CPUINFO_GET();
  1.6767 +	dcr      = ia64_getreg(_IA64_REG_CR_DCR);
  1.6768 +
  1.6769 +	if (info == 0 && ia64_psr(regs)->pp == 0 && (dcr & IA64_DCR_PP) == 0) {
  1.6770 +		local_irq_restore(flags);
  1.6771 +		return;
  1.6772 +	}
  1.6773 +
  1.6774 +	printk("CPU%d from %s() current [%d] iip=0x%lx %s\n", 
  1.6775 +		this_cpu, 
  1.6776 +		from, 
  1.6777 +		current->pid, 
  1.6778 +		regs->cr_iip,
  1.6779 +		current->comm);
  1.6780 +
  1.6781 +	task = GET_PMU_OWNER();
  1.6782 +	ctx  = GET_PMU_CTX();
  1.6783 +
  1.6784 +	printk("->CPU%d owner [%d] ctx=%p\n", this_cpu, task ? task->pid : -1, ctx);
  1.6785 +
  1.6786 +	psr = pfm_get_psr();
  1.6787 +
  1.6788 +	printk("->CPU%d pmc0=0x%lx psr.pp=%d psr.up=%d dcr.pp=%d syst_info=0x%lx user_psr.up=%d user_psr.pp=%d\n", 
  1.6789 +		this_cpu,
  1.6790 +		ia64_get_pmc(0),
  1.6791 +		psr & IA64_PSR_PP ? 1 : 0,
  1.6792 +		psr & IA64_PSR_UP ? 1 : 0,
  1.6793 +		dcr & IA64_DCR_PP ? 1 : 0,
  1.6794 +		info,
  1.6795 +		ia64_psr(regs)->up,
  1.6796 +		ia64_psr(regs)->pp);
  1.6797 +
  1.6798 +	ia64_psr(regs)->up = 0;
  1.6799 +	ia64_psr(regs)->pp = 0;
  1.6800 +
  1.6801 +	t = &current->thread;
  1.6802 +
  1.6803 +	for (i=1; PMC_IS_LAST(i) == 0; i++) {
  1.6804 +		if (PMC_IS_IMPL(i) == 0) continue;
  1.6805 +		printk("->CPU%d pmc[%d]=0x%lx thread_pmc[%d]=0x%lx\n", this_cpu, i, ia64_get_pmc(i), i, t->pmcs[i]);
  1.6806 +	}
  1.6807 +
  1.6808 +	for (i=1; PMD_IS_LAST(i) == 0; i++) {
  1.6809 +		if (PMD_IS_IMPL(i) == 0) continue;
  1.6810 +		printk("->CPU%d pmd[%d]=0x%lx thread_pmd[%d]=0x%lx\n", this_cpu, i, ia64_get_pmd(i), i, t->pmds[i]);
  1.6811 +	}
  1.6812 +
  1.6813 +	if (ctx) {
   1.6814 +		printk("->CPU%d ctx_state=%d vaddr=%p addr=%p msgq_head=%d msgq_tail=%d saved_psr_up=0x%lx\n",
  1.6815 +				this_cpu,
  1.6816 +				ctx->ctx_state,
  1.6817 +				ctx->ctx_smpl_vaddr,
  1.6818 +				ctx->ctx_smpl_hdr,
  1.6819 +				ctx->ctx_msgq_head,
  1.6820 +				ctx->ctx_msgq_tail,
  1.6821 +				ctx->ctx_saved_psr_up);
  1.6822 +	}
  1.6823 +	local_irq_restore(flags);
  1.6824 +}
  1.6825 +
  1.6826 +/*
  1.6827 + * called from process.c:copy_thread(). task is new child.
  1.6828 + */
  1.6829 +void
  1.6830 +pfm_inherit(struct task_struct *task, struct pt_regs *regs)
  1.6831 +{
  1.6832 +	struct thread_struct *thread;
  1.6833 +
  1.6834 +	DPRINT(("perfmon: pfm_inherit clearing state for [%d]\n", task->pid));
  1.6835 +
  1.6836 +	thread = &task->thread;
  1.6837 +
  1.6838 +	/*
  1.6839 +	 * cut links inherited from parent (current)
  1.6840 +	 */
  1.6841 +	thread->pfm_context = NULL;
  1.6842 +
  1.6843 +	PFM_SET_WORK_PENDING(task, 0);
  1.6844 +
  1.6845 +	/*
   1.6846 +	 * the psr bits are already set properly in copy_thread()
  1.6847 +	 */
  1.6848 +}
  1.6849 +#else  /* !CONFIG_PERFMON */
  1.6850 +asmlinkage long
  1.6851 +sys_perfmonctl (int fd, int cmd, void *arg, int count)
  1.6852 +{
  1.6853 +	return -ENOSYS;
  1.6854 +}
  1.6855 +#endif /* CONFIG_PERFMON */
     2.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     2.2 +++ b/linux-2.6-xen-sparse/arch/ia64/oprofile/Makefile	Tue Nov 28 11:19:40 2006 -0700
     2.3 @@ -0,0 +1,10 @@
     2.4 +obj-$(CONFIG_OPROFILE) += oprofile.o
     2.5 +
     2.6 +DRIVER_OBJS := $(addprefix ../../../drivers/oprofile/, \
     2.7 +		oprof.o cpu_buffer.o buffer_sync.o \
     2.8 +		event_buffer.o oprofile_files.o \
     2.9 +		oprofilefs.o oprofile_stats.o \
    2.10 +		timer_int.o )
    2.11 +
    2.12 +oprofile-y := $(DRIVER_OBJS) init.o backtrace.o
    2.13 +oprofile-$(CONFIG_PERFMON) += perfmon.o
     3.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     3.2 +++ b/linux-2.6-xen-sparse/arch/ia64/oprofile/init.c	Tue Nov 28 11:19:40 2006 -0700
     3.3 @@ -0,0 +1,38 @@
     3.4 +/**
     3.5 + * @file init.c
     3.6 + *
     3.7 + * @remark Copyright 2002 OProfile authors
     3.8 + * @remark Read the file COPYING
     3.9 + *
    3.10 + * @author John Levon <levon@movementarian.org>
    3.11 + */
    3.12 +
    3.13 +#include <linux/kernel.h>
    3.14 +#include <linux/oprofile.h>
    3.15 +#include <linux/init.h>
    3.16 +#include <linux/errno.h>
    3.17 + 
    3.18 +extern int perfmon_init(struct oprofile_operations * ops);
    3.19 +extern void perfmon_exit(void);
    3.20 +extern void ia64_backtrace(struct pt_regs * const regs, unsigned int depth);
    3.21 +
    3.22 +int __init oprofile_arch_init(struct oprofile_operations * ops)
    3.23 +{
    3.24 +	int ret = -ENODEV;
    3.25 +
    3.26 +#ifdef CONFIG_PERFMON
    3.27 +	/* perfmon_init() can fail, but we have no way to report it */
    3.28 +	ret = perfmon_init(ops);
    3.29 +#endif
    3.30 +	ops->backtrace = ia64_backtrace;
    3.31 +
    3.32 +	return ret;
    3.33 +}
    3.34 +
    3.35 +
    3.36 +void oprofile_arch_exit(void)
    3.37 +{
    3.38 +#ifdef CONFIG_PERFMON
    3.39 +	perfmon_exit();
    3.40 +#endif
    3.41 +}
     4.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     4.2 +++ b/linux-2.6-xen-sparse/arch/ia64/oprofile/perfmon.c	Tue Nov 28 11:19:40 2006 -0700
     4.3 @@ -0,0 +1,100 @@
     4.4 +/**
     4.5 + * @file perfmon.c
     4.6 + *
     4.7 + * @remark Copyright 2003 OProfile authors
     4.8 + * @remark Read the file COPYING
     4.9 + *
    4.10 + * @author John Levon <levon@movementarian.org>
    4.11 + */
    4.12 +
    4.13 +#include <linux/kernel.h>
    4.14 +#include <linux/config.h>
    4.15 +#include <linux/oprofile.h>
    4.16 +#include <linux/sched.h>
    4.17 +#include <asm/perfmon.h>
    4.18 +#include <asm/ptrace.h>
    4.19 +#include <asm/errno.h>
    4.20 +
    4.21 +static int allow_ints;
    4.22 +
    4.23 +static int
    4.24 +perfmon_handler(struct task_struct *task, void *buf, pfm_ovfl_arg_t *arg,
    4.25 +                struct pt_regs *regs, unsigned long stamp)
    4.26 +{
    4.27 +	int event = arg->pmd_eventid;
    4.28 + 
    4.29 +	arg->ovfl_ctrl.bits.reset_ovfl_pmds = 1;
    4.30 +
    4.31 +	/* the owner of the oprofile event buffer may have exited
    4.32 +	 * without perfmon being shutdown (e.g. SIGSEGV)
    4.33 +	 */
    4.34 +	if (allow_ints)
    4.35 +		oprofile_add_sample(regs, event);
    4.36 +	return 0;
    4.37 +}
    4.38 +
    4.39 +
    4.40 +static int perfmon_start(void)
    4.41 +{
    4.42 +	allow_ints = 1;
    4.43 +	return 0;
    4.44 +}
    4.45 +
    4.46 +
    4.47 +static void perfmon_stop(void)
    4.48 +{
    4.49 +	allow_ints = 0;
    4.50 +}
    4.51 +
    4.52 +
    4.53 +#define OPROFILE_FMT_UUID { \
    4.54 +	0x77, 0x7a, 0x6e, 0x61, 0x20, 0x65, 0x73, 0x69, 0x74, 0x6e, 0x72, 0x20, 0x61, 0x65, 0x0a, 0x6c }
    4.55 +
    4.56 +static pfm_buffer_fmt_t oprofile_fmt = {
    4.57 + 	.fmt_name 	    = "oprofile_format",
    4.58 + 	.fmt_uuid	    = OPROFILE_FMT_UUID,
    4.59 + 	.fmt_handler	    = perfmon_handler,
    4.60 +};
    4.61 +
    4.62 +
    4.63 +static char * get_cpu_type(void)
    4.64 +{
    4.65 +	__u8 family = local_cpu_data->family;
    4.66 +
    4.67 +	switch (family) {
    4.68 +		case 0x07:
    4.69 +			return "ia64/itanium";
    4.70 +		case 0x1f:
    4.71 +			return "ia64/itanium2";
    4.72 +		default:
    4.73 +			return "ia64/ia64";
    4.74 +	}
    4.75 +}
    4.76 +
    4.77 +
    4.78 +/* all the ops are handled via userspace for IA64 perfmon */
    4.79 +
    4.80 +static int using_perfmon;
    4.81 +
    4.82 +int perfmon_init(struct oprofile_operations * ops)
    4.83 +{
    4.84 +	int ret = pfm_register_buffer_fmt(&oprofile_fmt);
    4.85 +	if (ret)
    4.86 +		return -ENODEV;
    4.87 +
    4.88 +	ops->cpu_type = get_cpu_type();
    4.89 +	ops->start = perfmon_start;
    4.90 +	ops->stop = perfmon_stop;
    4.91 +	using_perfmon = 1;
    4.92 +	printk(KERN_INFO "oprofile: using perfmon.\n");
    4.93 +	return 0;
    4.94 +}
    4.95 +
    4.96 +
    4.97 +void perfmon_exit(void)
    4.98 +{
    4.99 +	if (!using_perfmon)
   4.100 +		return;
   4.101 +
   4.102 +	pfm_unregister_buffer_fmt(oprofile_fmt.fmt_uuid);
   4.103 +}
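The oprofile glue above gates sampling with the allow_ints flag: the overflow handler always requests a counter reset, but only forwards a sample while the consumer has been started. A minimal userspace sketch of that start/stop gating (illustrative only, not the oprofile or perfmon API):

#include <stdio.h>

static volatile int allow_samples;	/* plays the role of allow_ints */

static void consumer_start(void) { allow_samples = 1; }
static void consumer_stop(void)  { allow_samples = 0; }

static void on_overflow(int event)
{
	/* the real handler always sets reset_ovfl_pmds here */
	if (allow_samples)
		printf("sample for event %d recorded\n", event);
}

int main(void)
{
	on_overflow(7);		/* dropped: consumer not started yet */
	consumer_start();
	on_overflow(7);		/* recorded */
	consumer_stop();
	return 0;
}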