ia64/xen-unstable: xen/arch/x86/numa.c @ 12992:65c3287306db

[XEN] Use cpumask macros to update numa node masks.
Signed-off-by: Keir Fraser <keir@xensource.com>

author    kfraser@localhost.localdomain
date      Wed Dec 13 10:24:20 2006 +0000
parents   0aea81b1e757
children  b3f681d71265

/*
 * Generic VM initialization for x86-64 NUMA setups.
 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Adapted for Xen: Ryan Harper <ryanh@us.ibm.com>
 */

#include <xen/mm.h>
#include <xen/string.h>
#include <xen/init.h>
#include <xen/ctype.h>
#include <xen/nodemask.h>
#include <xen/numa.h>
#include <xen/keyhandler.h>
#include <xen/time.h>
#include <xen/smp.h>
#include <asm/acpi.h>

static int numa_setup(char *s);
custom_param("numa", numa_setup);

#ifndef Dprintk
#define Dprintk(x...)
#endif

/* from proto.h */
#define round_up(x,y) ((((x)+(y))-1) & (~((y)-1)))

struct node_data node_data[MAX_NUMNODES];
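
/*
 * memnodemap[] is the physical-address-to-node hash table: the entry at
 * (paddr >> memnode_shift) holds the owning node id, with 0xff meaning
 * "not assigned".  Both are filled in by compute_hash_shift() /
 * populate_memnodemap() below.
 */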
int memnode_shift;
u8 memnodemap[NODEMAPSIZE];

unsigned char cpu_to_node[NR_CPUS] __read_mostly = {
    [0 ... NR_CPUS-1] = NUMA_NO_NODE
};
unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
    [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};
cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;

nodemask_t node_online_map = { { [0] = 1UL } };

/* Default NUMA to off for now. acpi=on required to enable it. */
int numa_off __initdata = 1;

int acpi_numa __initdata;

/*
 * Given a shift value, try to populate memnodemap[].
 * Returns:
 *  1 if OK
 *  0 if memnodemap[] too small (or shift too small)
 * -1 if node overlap or lost ram (shift too big)
 */
static int __init
populate_memnodemap(const struct node *nodes, int numnodes, int shift)
{
    int i;
    int res = -1;
    unsigned long addr, end;

    if (shift >= 64)
        return -1;
    memset(memnodemap, 0xff, sizeof(memnodemap));
    for (i = 0; i < numnodes; i++) {
        addr = nodes[i].start;
        end = nodes[i].end;
        if (addr >= end)
            continue;
        if ((end >> shift) >= NODEMAPSIZE)
            return 0;
        do {
            if (memnodemap[addr >> shift] != 0xff)
                return -1;
            memnodemap[addr >> shift] = i;
            addr += (1UL << shift);
        } while (addr < end);
        res = 1;
    }
    return res;
}
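
/*
 * Worked example: with shift = 20 each memnodemap[] entry covers 1 MiB,
 * so a node spanning 0-1 GiB occupies entries 0x000-0x3ff.
 * compute_hash_shift() below keeps coarsening the granularity while the
 * next-larger shift would still give every node its own entries, i.e. it
 * settles on the coarsest shift at which no two nodes share an entry.
 */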

int __init compute_hash_shift(struct node *nodes, int numnodes)
{
    int shift = 20;

    while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0)
        shift++;

    printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
           shift);

    if (populate_memnodemap(nodes, numnodes, shift) != 1) {
        printk(KERN_INFO
               "Your memory is not aligned; you need to rebuild your kernel "
               "with a bigger NODEMAPSIZE. shift=%d\n",
               shift);
        return -1;
    }
    return shift;
}

/* initialize NODE_DATA given nodeid and start/end */
void __init setup_node_bootmem(int nodeid, u64 start, u64 end)
{
    unsigned long start_pfn, end_pfn;

    start_pfn = start >> PAGE_SHIFT;
    end_pfn = end >> PAGE_SHIFT;

    NODE_DATA(nodeid)->node_id = nodeid;
    NODE_DATA(nodeid)->node_start_pfn = start_pfn;
    NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;

    node_set_online(nodeid);
}
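
/*
 * numa_init_array() spreads CPUs that still have no node round-robin
 * over the online nodes: with nodes 0 and 1 online, for instance,
 * unassigned CPUs end up on nodes 0, 1, 0, 1, ...
 */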
void __init numa_init_array(void)
{
    int rr, i;

    /*
     * There are unfortunately some poorly designed mainboards around
     * that only connect memory to a single CPU. This breaks the 1:1
     * cpu->node mapping. To avoid this fill in the mapping for all
     * possible CPUs, as the number of CPUs is not known yet.
     * We round robin the existing nodes.
     */
    rr = first_node(node_online_map);
    for (i = 0; i < NR_CPUS; i++) {
        if (cpu_to_node[i] != NUMA_NO_NODE)
            continue;
        numa_set_node(i, rr);
        rr = next_node(rr, node_online_map);
        if (rr == MAX_NUMNODES)
            rr = first_node(node_online_map);
    }
}

#ifdef CONFIG_NUMA_EMU
static int numa_fake __initdata = 0;

/* Numa emulation */
static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
{
    int i;
    struct node nodes[MAX_NUMNODES];
    unsigned long sz = ((end_pfn - start_pfn) << PAGE_SHIFT) / numa_fake;

    /* Kludge needed for the hash function */
    if (hweight64(sz) > 1) {
        unsigned long x = 1;
        while ((x << 1) < sz)
            x <<= 1;
        if (x < sz/2)
            printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n");
        sz = x;
    }

    memset(&nodes, 0, sizeof(nodes));
    for (i = 0; i < numa_fake; i++) {
        nodes[i].start = (start_pfn << PAGE_SHIFT) + i*sz;
        if (i == numa_fake-1)
            sz = (end_pfn << PAGE_SHIFT) - nodes[i].start;
        nodes[i].end = nodes[i].start + sz;
        printk(KERN_INFO "Faking node %d at %"PRIx64"-%"PRIx64" (%"PRIu64"MB)\n",
               i,
               nodes[i].start, nodes[i].end,
               (nodes[i].end - nodes[i].start) >> 20);
        node_set_online(i);
    }
    memnode_shift = compute_hash_shift(nodes, numa_fake);
    if (memnode_shift < 0) {
        memnode_shift = 0;
        printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
        return -1;
    }
    for_each_online_node(i)
        setup_node_bootmem(i, nodes[i].start, nodes[i].end);
    numa_init_array();
    return 0;
}
#endif
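
/*
 * numa_initmem_init() picks a memory layout in order of preference:
 * fake nodes from numa=fake=<n>, then the ACPI SRAT, and finally a
 * single dummy node covering all of memory.
 */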
void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
{
    int i;

#ifdef CONFIG_NUMA_EMU
    if (numa_fake && !numa_emulation(start_pfn, end_pfn))
        return;
#endif

#ifdef CONFIG_ACPI_NUMA
    if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
                                      end_pfn << PAGE_SHIFT))
        return;
#endif

    printk(KERN_INFO "%s\n",
           numa_off ? "NUMA turned off" : "No NUMA configuration found");

    printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
           start_pfn << PAGE_SHIFT,
           end_pfn << PAGE_SHIFT);
    /* setup dummy node covering all memory */
    memnode_shift = 63;
    memnodemap[0] = 0;
    nodes_clear(node_online_map);
    node_set_online(0);
    for (i = 0; i < NR_CPUS; i++)
        numa_set_node(i, 0);
    node_to_cpumask[0] = cpumask_of_cpu(0);
    setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
}
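
/*
 * numa_add_cpu() adds <cpu> to the cpumask of its node (cpu_to_node[]
 * must already hold a valid node for it); numa_set_node() records the
 * cpu -> node mapping itself.
 */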
__cpuinit void numa_add_cpu(int cpu)
{
    cpu_set(cpu, node_to_cpumask[cpu_to_node(cpu)]);
}

void __cpuinit numa_set_node(int cpu, int node)
{
    cpu_to_node[cpu] = node;
}

/* numa=off | on | fake=<n> | noacpi */
static __init int numa_setup(char *opt)
{
    if (!strncmp(opt, "off", 3))
        numa_off = 1;
    if (!strncmp(opt, "on", 2))
        numa_off = 0;
#ifdef CONFIG_NUMA_EMU
    if (!strncmp(opt, "fake=", 5)) {
        numa_off = 0;
        numa_fake = simple_strtoul(opt+5, NULL, 0);
        if (numa_fake >= MAX_NUMNODES)
            numa_fake = MAX_NUMNODES;
    }
#endif
#ifdef CONFIG_ACPI_NUMA
    if (!strncmp(opt, "noacpi", 6)) {
        numa_off = 0;
        acpi_numa = -1;
    }
#endif
    return 1;
}

/*
 * Setup early cpu_to_node.
 *
 * Populate cpu_to_node[] only if x86_cpu_to_apicid[]
 * and apicid_to_node[] have valid entries for a CPU.
 * This means we skip cpu_to_node[] initialisation for the NUMA
 * emulation and fake-node cases (when running a kernel compiled
 * for NUMA on a non-NUMA box), which is OK as cpu_to_node[]
 * is already initialized in a round-robin manner by numa_init_array()
 * prior to this call, and that initialization is good enough
 * for the fake NUMA cases.
 */
void __init init_cpu_to_node(void)
{
    int i;

    for (i = 0; i < NR_CPUS; i++) {
        u8 apicid = x86_cpu_to_apicid[i];
        if (apicid == BAD_APICID)
            continue;
        if (apicid_to_node[apicid] == NUMA_NO_NODE)
            continue;
        numa_set_node(i, apicid_to_node[apicid]);
    }
}

EXPORT_SYMBOL(cpu_to_node);
EXPORT_SYMBOL(node_to_cpumask);
EXPORT_SYMBOL(memnode_shift);
EXPORT_SYMBOL(memnodemap);
EXPORT_SYMBOL(node_data);
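
/*
 * dump_numa() is the debug-key handler (bound to 'u' below): it prints
 * each online node's start/size, a phys_to_nid() sanity check for it,
 * and the CPU -> node assignments.
 */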
static void dump_numa(unsigned char key)
{
    s_time_t now = NOW();
    int i;

    printk("'%c' pressed -> dumping numa info (now-0x%X:%08X)\n", key,
           (u32)(now >> 32), (u32)now);

    for_each_online_node(i) {
        unsigned long pa = (NODE_DATA(i)->node_start_pfn + 1) << PAGE_SHIFT;
        printk("idx%d -> NODE%d start->%lu size->%lu\n",
               i, NODE_DATA(i)->node_id,
               NODE_DATA(i)->node_start_pfn,
               NODE_DATA(i)->node_spanned_pages);
        /* sanity check phys_to_nid() */
        printk("phys_to_nid(%lx) -> %d should be %d\n", pa, phys_to_nid(pa),
               NODE_DATA(i)->node_id);
    }
    for_each_online_cpu(i)
        printk("CPU%d -> NODE%d\n", i, cpu_to_node[i]);
}

static __init int register_numa_trigger(void)
{
    register_keyhandler('u', dump_numa, "dump numa info");
    return 0;
}
__initcall(register_numa_trigger);