ia64/xen-unstable

view xen/arch/x86/numa.c @ 19835:edfdeb150f27

Fix buildsystem to detect udev > version 124

udev removed the udevinfo symlink from versions higher than 123 and
xen's build-system could not detect if udev is in place and has the
required version.

Signed-off-by: Marc-A. Dahlhaus <mad@wol.de>
author Keir Fraser <keir.fraser@citrix.com>
date Thu Jun 25 13:02:37 2009 +0100 (2009-06-25)
parents 3ccd0f0dba5b
children
line source
1 /*
2 * Generic VM initialization for x86-64 NUMA setups.
3 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 * Adapted for Xen: Ryan Harper <ryanh@us.ibm.com>
5 */
7 #include <xen/mm.h>
8 #include <xen/string.h>
9 #include <xen/init.h>
10 #include <xen/ctype.h>
11 #include <xen/nodemask.h>
12 #include <xen/numa.h>
13 #include <xen/keyhandler.h>
14 #include <xen/time.h>
15 #include <xen/smp.h>
16 #include <asm/acpi.h>
17 #include <xen/sched.h>
/* Forward declaration: handler for the "numa=" boot parameter (defined below). */
19 static int numa_setup(char *s);
20 custom_param("numa", numa_setup);
/* Debug printk: compiles to nothing unless a definition is supplied elsewhere. */
22 #ifndef Dprintk
23 #define Dprintk(x...)
24 #endif
26 /* from proto.h */
/* Round x up to a multiple of y; assumes y is a power of two. */
27 #define round_up(x,y) ((((x)+(y))-1) & (~((y)-1)))
/* Per-node memory-range bookkeeping, indexed by node id (see NODE_DATA). */
29 struct node_data node_data[MAX_NUMNODES];
/* phys_to_nid() hash: (address >> memnode_shift) indexes memnodemap[]
   to yield the owning node id (0xff = unmapped). */
31 int memnode_shift;
32 u8 memnodemap[NODEMAPSIZE];
/* CPU -> node lookup; every slot starts as NUMA_NO_NODE until assigned. */
34 unsigned char cpu_to_node[NR_CPUS] __read_mostly = {
35 [0 ... NR_CPUS-1] = NUMA_NO_NODE
36 };
/* Local APIC id -> node, filled from firmware tables (init-time data). */
37 unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
38 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
39 };
/* Node -> set of CPUs attached to it. */
40 cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;
/* Node 0 is online from boot. */
42 nodemask_t node_online_map = { { [0] = 1UL } };
44 /* Default NUMA to off for now. acpi=on required to enable it. */
45 int numa_off __initdata = 1;
47 int acpi_numa __initdata;
49 /*
50 * Given a shift value, try to populate memnodemap[]
51 * Returns :
52 * 1 if OK
53 * 0 if memnodmap[] too small (of shift too small)
54 * -1 if node overlap or lost ram (shift too big)
55 */
56 static int __init
57 populate_memnodemap(const struct node *nodes, int numnodes, int shift)
58 {
59 int i;
60 int res = -1;
61 paddr_t addr, end;
63 if (shift >= 64)
64 return -1;
65 memset(memnodemap, 0xff, sizeof(memnodemap));
66 for (i = 0; i < numnodes; i++) {
67 addr = nodes[i].start;
68 end = nodes[i].end;
69 if (addr >= end)
70 continue;
71 if ((end >> shift) >= NODEMAPSIZE)
72 return 0;
73 do {
74 if (memnodemap[addr >> shift] != 0xff)
75 return -1;
76 memnodemap[addr >> shift] = i;
77 addr += (1ULL << shift);
78 } while (addr < end);
79 res = 1;
80 }
81 return res;
82 }
84 int __init compute_hash_shift(struct node *nodes, int numnodes)
85 {
86 int shift = 20;
88 while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0)
89 shift++;
91 printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
92 shift);
94 if (populate_memnodemap(nodes, numnodes, shift) != 1) {
95 printk(KERN_INFO
96 "Your memory is not aligned you need to rebuild your kernel "
97 "with a bigger NODEMAPSIZE shift=%d\n",
98 shift);
99 return -1;
100 }
101 return shift;
102 }
104 /* initialize NODE_DATA given nodeid and start/end */
105 void __init setup_node_bootmem(int nodeid, u64 start, u64 end)
106 {
107 unsigned long start_pfn, end_pfn;
109 start_pfn = start >> PAGE_SHIFT;
110 end_pfn = end >> PAGE_SHIFT;
112 NODE_DATA(nodeid)->node_id = nodeid;
113 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
114 NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
116 node_set_online(nodeid);
117 }
119 void __init numa_init_array(void)
120 {
121 int rr, i;
122 /* There are unfortunately some poorly designed mainboards around
123 that only connect memory to a single CPU. This breaks the 1:1 cpu->node
124 mapping. To avoid this fill in the mapping for all possible
125 CPUs, as the number of CPUs is not known yet.
126 We round robin the existing nodes. */
127 rr = first_node(node_online_map);
128 for (i = 0; i < NR_CPUS; i++) {
129 if (cpu_to_node[i] != NUMA_NO_NODE)
130 continue;
131 numa_set_node(i, rr);
132 rr = next_node(rr, node_online_map);
133 if (rr == MAX_NUMNODES)
134 rr = first_node(node_online_map);
135 }
137 }
#ifdef CONFIG_NUMA_EMU
/* Number of fake nodes requested via "numa=fake=<n>"; 0 disables emulation. */
static int numa_fake __initdata = 0;

/*
 * Numa emulation: split [start_pfn, end_pfn) into numa_fake nodes of
 * (roughly) equal size.  Returns 0 on success, -1 when no hash shift
 * could be found for the layout.
 *
 * Marked __init: it is reached only from __init numa_initmem_init()
 * and itself calls __init functions (compute_hash_shift,
 * setup_node_bootmem, numa_init_array), so it must not outlive the
 * init sections.
 */
static int __init numa_emulation(u64 start_pfn, u64 end_pfn)
{
	int i;
	struct node nodes[MAX_NUMNODES];
	u64 sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;

	/* Kludge needed for the hash function: round the node size down
	   to a power of two so the memnodemap hash can cover it. */
	if (hweight64(sz) > 1) {
		u64 x = 1;
		while ((x << 1) < sz)
			x <<= 1;
		if (x < sz/2)
			printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n");
		sz = x;
	}

	memset(&nodes,0,sizeof(nodes));
	for (i = 0; i < numa_fake; i++) {
		nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
		/* The last node absorbs whatever remainder is left. */
		if (i == numa_fake-1)
			sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
		nodes[i].end = nodes[i].start + sz;
		printk(KERN_INFO "Faking node %d at %"PRIx64"-%"PRIx64" (%"PRIu64"MB)\n",
		       i,
		       nodes[i].start, nodes[i].end,
		       (nodes[i].end - nodes[i].start) >> 20);
		node_set_online(i);
	}
	memnode_shift = compute_hash_shift(nodes, numa_fake);
	if (memnode_shift < 0) {
		memnode_shift = 0;
		printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
		return -1;
	}
	for_each_online_node(i)
		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
	numa_init_array();
	return 0;
}
#endif
184 void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
185 {
186 int i;
188 #ifdef CONFIG_NUMA_EMU
189 if (numa_fake && !numa_emulation(start_pfn, end_pfn))
190 return;
191 #endif
193 #ifdef CONFIG_ACPI_NUMA
194 if (!numa_off && !acpi_scan_nodes((u64)start_pfn << PAGE_SHIFT,
195 (u64)end_pfn << PAGE_SHIFT))
196 return;
197 #endif
199 printk(KERN_INFO "%s\n",
200 numa_off ? "NUMA turned off" : "No NUMA configuration found");
202 printk(KERN_INFO "Faking a node at %016"PRIx64"-%016"PRIx64"\n",
203 (u64)start_pfn << PAGE_SHIFT,
204 (u64)end_pfn << PAGE_SHIFT);
205 /* setup dummy node covering all memory */
206 memnode_shift = 63;
207 memnodemap[0] = 0;
208 nodes_clear(node_online_map);
209 node_set_online(0);
210 for (i = 0; i < NR_CPUS; i++)
211 numa_set_node(i, 0);
212 node_to_cpumask[0] = cpumask_of_cpu(0);
213 setup_node_bootmem(0, (u64)start_pfn << PAGE_SHIFT, (u64)end_pfn << PAGE_SHIFT);
214 }
216 __cpuinit void numa_add_cpu(int cpu)
217 {
218 cpu_set(cpu, node_to_cpumask[cpu_to_node(cpu)]);
219 }
221 void __cpuinit numa_set_node(int cpu, int node)
222 {
223 cpu_to_node[cpu] = node;
224 }
226 /* [numa=off] */
227 static __init int numa_setup(char *opt)
228 {
229 if (!strncmp(opt,"off",3))
230 numa_off = 1;
231 if (!strncmp(opt,"on",2))
232 numa_off = 0;
233 #ifdef CONFIG_NUMA_EMU
234 if(!strncmp(opt, "fake=", 5)) {
235 numa_off = 0;
236 numa_fake = simple_strtoul(opt+5,NULL,0); ;
237 if (numa_fake >= MAX_NUMNODES)
238 numa_fake = MAX_NUMNODES;
239 }
240 #endif
241 #ifdef CONFIG_ACPI_NUMA
242 if (!strncmp(opt,"noacpi",6)) {
243 numa_off = 0;
244 acpi_numa = -1;
245 }
246 #endif
247 return 1;
248 }
250 /*
251 * Setup early cpu_to_node.
252 *
253 * Populate cpu_to_node[] only if x86_cpu_to_apicid[],
254 * and apicid_to_node[] tables have valid entries for a CPU.
255 * This means we skip cpu_to_node[] initialisation for NUMA
256 * emulation and faking node case (when running a kernel compiled
257 * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
258 * is already initialized in a round robin manner at numa_init_array,
259 * prior to this call, and this initialization is good enough
260 * for the fake NUMA cases.
261 */
262 void __init init_cpu_to_node(void)
263 {
264 int i;
265 for (i = 0; i < NR_CPUS; i++) {
266 u32 apicid = x86_cpu_to_apicid[i];
267 if (apicid == BAD_APICID)
268 continue;
269 if (apicid_to_node[apicid] == NUMA_NO_NODE)
270 continue;
271 numa_set_node(i,apicid_to_node[apicid]);
272 }
273 }
/* Symbols exported for use by other parts of the hypervisor. */
275 EXPORT_SYMBOL(cpu_to_node);
276 EXPORT_SYMBOL(node_to_cpumask);
277 EXPORT_SYMBOL(memnode_shift);
278 EXPORT_SYMBOL(memnodemap);
279 EXPORT_SYMBOL(node_data);
/*
 * Debug-key handler ('u'): print per-node ranges, the CPU->node map,
 * and a per-node page count for every domain.
 */
281 static void dump_numa(unsigned char key)
282 {
283 s_time_t now = NOW();
284 int i;
285 struct domain *d;
286 struct page_info *page;
/* Per-node page counters; only the online-node slots are (re)zeroed
   per domain below — NOTE(review): assumes phys_to_nid() only returns
   online node ids, otherwise an uninitialised slot could be bumped. */
287 unsigned int page_num_node[MAX_NUMNODES];
289 printk("'%c' pressed -> dumping numa info (now-0x%X:%08X)\n", key,
290 (u32)(now>>32), (u32)now);
292 for_each_online_node(i) {
/* Sample an address one page into the node so it falls inside the range. */
293 paddr_t pa = (paddr_t)(NODE_DATA(i)->node_start_pfn + 1)<< PAGE_SHIFT;
294 printk("idx%d -> NODE%d start->%lu size->%lu\n",
295 i, NODE_DATA(i)->node_id,
296 NODE_DATA(i)->node_start_pfn,
297 NODE_DATA(i)->node_spanned_pages);
298 /* sanity check phys_to_nid() */
299 printk("phys_to_nid(%"PRIpaddr") -> %d should be %d\n", pa, phys_to_nid(pa),
300 NODE_DATA(i)->node_id);
301 }
302 for_each_online_cpu(i)
303 printk("CPU%d -> NODE%d\n", i, cpu_to_node[i]);
/* Hold the RCU read lock while walking the domain list. */
305 rcu_read_lock(&domlist_read_lock);
307 printk("Memory location of each domain:\n");
308 for_each_domain(d)
309 {
310 printk("Domain %u (total: %u):\n", d->domain_id, d->tot_pages);
/* Reset the counters for this domain. */
312 for_each_online_node(i)
313 page_num_node[i] = 0;
/* Attribute each of the domain's pages to the node owning its frame. */
315 page_list_for_each(page, &d->page_list)
316 {
317 i = phys_to_nid((paddr_t)page_to_mfn(page) << PAGE_SHIFT);
318 page_num_node[i]++;
319 }
321 for_each_online_node(i)
322 printk(" Node %u: %u\n", i, page_num_node[i]);
323 }
325 rcu_read_unlock(&domlist_read_lock);
326 }
328 static __init int register_numa_trigger(void)
329 {
330 register_keyhandler('u', dump_numa, "dump numa info");
331 return 0;
332 }
333 __initcall(register_numa_trigger);