Blame - arch/i386/kernel/smp.c - SHIFTPHONES/mainline/linux

blob: 68be7d0c7238c1feaf4eff4856fac04f111ae0dd [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* Intel SMP support routines.
				3	*
				4	* (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
				5	* (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
				6	*
				7	* This code is released under the GNU General Public License version 2 or
				8	* later.
				9	*/
				10
				11	#include <linux/init.h>
				12
				13	#include <linux/mm.h>
				14	#include <linux/irq.h>
				15	#include <linux/delay.h>
				16	#include <linux/spinlock.h>
				17	#include <linux/smp_lock.h>
				18	#include <linux/kernel_stat.h>
				19	#include <linux/mc146818rtc.h>
				20	#include <linux/cache.h>
				21	#include <linux/interrupt.h>
Alexey Dobriyan	129f694	2005-06-23 00:08:33 -0700	[diff] [blame]	22	#include <linux/module.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	23
				24	#include <asm/mtrr.h>
				25	#include <asm/tlbflush.h>
				26	#include <mach_apic.h>
				27
				28	/*
				29	* Some notes on x86 processor bugs affecting SMP operation:
				30	*
				31	* Pentium, Pentium Pro, II, III (and all CPUs) have bugs.
				32	* The Linux implications for SMP are handled as follows:
				33	*
				34	* Pentium III / [Xeon]
				35	* None of the E1AP-E3AP errata are visible to the user.
				36	*
				37	* E1AP. see PII A1AP
				38	* E2AP. see PII A2AP
				39	* E3AP. see PII A3AP
				40	*
				41	* Pentium II / [Xeon]
				42	* None of the A1AP-A3AP errata are visible to the user.
				43	*
				44	* A1AP. see PPro 1AP
				45	* A2AP. see PPro 2AP
				46	* A3AP. see PPro 7AP
				47	*
				48	* Pentium Pro
				49	* None of 1AP-9AP errata are visible to the normal user,
				50	* except occasional delivery of 'spurious interrupt' as trap #15.
				51	* This is very rare and a non-problem.
				52	*
				53	* 1AP. Linux maps APIC as non-cacheable
				54	* 2AP. worked around in hardware
				55	* 3AP. fixed in C0 and above steppings microcode update.
				56	* Linux does not use excessive STARTUP_IPIs.
				57	* 4AP. worked around in hardware
				58	* 5AP. symmetric IO mode (normal Linux operation) not affected.
				59	* 'noapic' mode has vector 0xf filled out properly.
				60	* 6AP. 'noapic' mode might be affected - fixed in later steppings
				61	* 7AP. We do not assume writes to the LVT deasserting IRQs
				62	* 8AP. We do not enable low power mode (deep sleep) during MP bootup
				63	* 9AP. We do not use mixed mode
				64	*
				65	* Pentium
				66	* There is a marginal case where REP MOVS on 100MHz SMP
				67	* machines with B stepping processors can fail. XXX should provide
				68	* an L1cache=Writethrough or L1cache=off option.
				69	*
				70	* B stepping CPUs may hang. There are hardware work arounds
				71	* for this. We warn about it in case your board doesn't have the work
				72	* arounds. Basically that's so I can tell anyone with a B stepping
				73	* CPU and SMP problems "tough".
				74	*
				75	* Specific items [From Pentium Processor Specification Update]
				76	*
				77	* 1AP. Linux doesn't use remote read
				78	* 2AP. Linux doesn't trust APIC errors
				79	* 3AP. We work around this
				80	* 4AP. Linux never generated 3 interrupts of the same priority
				81	* to cause a lost local interrupt.
				82	* 5AP. Remote read is never used
				83	* 6AP. not affected - worked around in hardware
				84	* 7AP. not affected - worked around in hardware
				85	* 8AP. worked around in hardware - we get explicit CS errors if not
				86	* 9AP. only 'noapic' mode affected. Might generate spurious
				87	* interrupts, we log only the first one and count the
				88	* rest silently.
				89	* 10AP. not affected - worked around in hardware
				90	* 11AP. Linux reads the APIC between writes to avoid this, as per
				91	* the documentation. Make sure you preserve this as it affects
				92	* the C stepping chips too.
				93	* 12AP. not affected - worked around in hardware
				94	* 13AP. not affected - worked around in hardware
				95	* 14AP. we always deassert INIT during bootup
				96	* 15AP. not affected - worked around in hardware
				97	* 16AP. not affected - worked around in hardware
				98	* 17AP. not affected - worked around in hardware
				99	* 18AP. not affected - worked around in hardware
				100	* 19AP. not affected - worked around in BIOS
				101	*
				102	* If this sounds worrying believe me these bugs are either ___RARE___,
				103	* or are signal timing bugs worked around in hardware and there's
				104	* about nothing of note with C stepping upwards.
				105	*/
				106
				107	DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_mm, 0, };
				108
				109	/*
				110	* the following functions deal with sending IPIs between CPUs.
				111	*
				112	* We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
				113	*/
				114
				115	static inline int __prepare_ICR (unsigned int shortcut, int vector)
				116	{
				117	return APIC_DM_FIXED \| shortcut \| vector \| APIC_DEST_LOGICAL;
				118	}
				119
				120	static inline int __prepare_ICR2 (unsigned int mask)
				121	{
				122	return SET_APIC_DEST_FIELD(mask);
				123	}
				124
				125	void __send_IPI_shortcut(unsigned int shortcut, int vector)
				126	{
				127	/*
				128	* Subtle. In the case of the 'never do double writes' workaround
				129	* we have to lock out interrupts to be safe. As we don't care
				130	* of the value read we use an atomic rmw access to avoid costly
				131	* cli/sti. Otherwise we use an even cheaper single atomic write
				132	* to the APIC.
				133	*/
				134	unsigned int cfg;
				135
				136	/*
				137	* Wait for idle.
				138	*/
				139	apic_wait_icr_idle();
				140
				141	/*
				142	* No need to touch the target chip field
				143	*/
				144	cfg = __prepare_ICR(shortcut, vector);
				145
				146	/*
				147	* Send the IPI. The write to APIC_ICR fires this off.
				148	*/
				149	apic_write_around(APIC_ICR, cfg);
				150	}
				151
				152	void fastcall send_IPI_self(int vector)
				153	{
				154	__send_IPI_shortcut(APIC_DEST_SELF, vector);
				155	}
				156
				157	/*
				158	* This is only used on smaller machines.
				159	*/
				160	void send_IPI_mask_bitmask(cpumask_t cpumask, int vector)
				161	{
				162	unsigned long mask = cpus_addr(cpumask)[0];
				163	unsigned long cfg;
				164	unsigned long flags;
				165
				166	local_irq_save(flags);
				167
				168	/*
				169	* Wait for idle.
				170	*/
				171	apic_wait_icr_idle();
				172
				173	/*
				174	* prepare target chip field
				175	*/
				176	cfg = __prepare_ICR2(mask);
				177	apic_write_around(APIC_ICR2, cfg);
				178
				179	/*
				180	* program the ICR
				181	*/
				182	cfg = __prepare_ICR(0, vector);
				183
				184	/*
				185	* Send the IPI. The write to APIC_ICR fires this off.
				186	*/
				187	apic_write_around(APIC_ICR, cfg);
				188
				189	local_irq_restore(flags);
				190	}
				191
				192	void send_IPI_mask_sequence(cpumask_t mask, int vector)
				193	{
				194	unsigned long cfg, flags;
				195	unsigned int query_cpu;
				196
				197	/*
				198	* Hack. The clustered APIC addressing mode doesn't allow us to send
				199	* to an arbitrary mask, so I do a unicasts to each CPU instead. This
				200	* should be modified to do 1 message per cluster ID - mbligh
				201	*/
				202
				203	local_irq_save(flags);
				204
				205	for (query_cpu = 0; query_cpu < NR_CPUS; ++query_cpu) {
				206	if (cpu_isset(query_cpu, mask)) {
				207
				208	/*
				209	* Wait for idle.
				210	*/
				211	apic_wait_icr_idle();
				212
				213	/*
				214	* prepare target chip field
				215	*/
				216	cfg = __prepare_ICR2(cpu_to_logical_apicid(query_cpu));
				217	apic_write_around(APIC_ICR2, cfg);
				218
				219	/*
				220	* program the ICR
				221	*/
				222	cfg = __prepare_ICR(0, vector);
				223
				224	/*
				225	* Send the IPI. The write to APIC_ICR fires this off.
				226	*/
				227	apic_write_around(APIC_ICR, cfg);
				228	}
				229	}
				230	local_irq_restore(flags);
				231	}
				232
				233	#include <mach_ipi.h> /* must come after the send_IPI functions above for inlining */
				234
				235	/*
				236	* Smarter SMP flushing macros.
				237	* c/o Linus Torvalds.
				238	*
				239	* These mean you can really definitely utterly forget about
				240	* writing to user space from interrupts. (It's not allowed anyway).
				241	*
				242	* Optimizations Manfred Spraul <manfred@colorfullife.com>
				243	*/
				244
				245	static cpumask_t flush_cpumask;
				246	static struct mm_struct * flush_mm;
				247	static unsigned long flush_va;
				248	static DEFINE_SPINLOCK(tlbstate_lock);
				249	#define FLUSH_ALL 0xffffffff
				250
				251	/*
				252	* We cannot call mmdrop() because we are in interrupt context,
				253	* instead update mm->cpu_vm_mask.
				254	*
				255	* We need to reload %cr3 since the page tables may be going
				256	* away from under us..
				257	*/
				258	static inline void leave_mm (unsigned long cpu)
				259	{
				260	if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
				261	BUG();
				262	cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask);
				263	load_cr3(swapper_pg_dir);
				264	}
				265
				266	/*
				267	*
				268	* The flush IPI assumes that a thread switch happens in this order:
				269	* [cpu0: the cpu that switches]
				270	* 1) switch_mm() either 1a) or 1b)
				271	* 1a) thread switch to a different mm
				272	* 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
				273	* Stop ipi delivery for the old mm. This is not synchronized with
				274	* the other cpus, but smp_invalidate_interrupt ignores flush ipis
				275	* for the wrong mm, and in the worst case we perform a superfluous
				276	* tlb flush.
				277	* 1a2) set cpu_tlbstate to TLBSTATE_OK
				278	* Now the smp_invalidate_interrupt won't call leave_mm if cpu0
				279	* was in lazy tlb mode.
				280	* 1a3) update cpu_tlbstate[].active_mm
				281	* Now cpu0 accepts tlb flushes for the new mm.
				282	* 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
				283	* Now the other cpus will send tlb flush ipis.
				284	* 1a5) change cr3.
				285	* 1b) thread switch without mm change
				286	* cpu_tlbstate[].active_mm is correct, cpu0 already handles
				287	* flush ipis.
				288	* 1b1) set cpu_tlbstate to TLBSTATE_OK
				289	* 1b2) test_and_set the cpu bit in cpu_vm_mask.
				290	* Atomically set the bit [other cpus will start sending flush ipis],
				291	* and test the bit.
				292	* 1b3) if the bit was 0: leave_mm was called, flush the tlb.
				293	* 2) switch %%esp, ie current
				294	*
				295	* The interrupt must handle 2 special cases:
				296	* - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
				297	* - the cpu performs speculative tlb reads, i.e. even if the cpu only
				298	* runs in kernel space, the cpu could load tlb entries for user space
				299	* pages.
				300	*
				301	* The good news is that cpu_tlbstate is local to each cpu, no
				302	* write/read ordering problems.
				303	*/
				304
				305	/*
				306	* TLB flush IPI:
				307	*
				308	* 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
				309	* 2) Leave the mm if we are in the lazy tlb mode.
				310	*/
				311
				312	fastcall void smp_invalidate_interrupt(struct pt_regs *regs)
				313	{
				314	unsigned long cpu;
				315
				316	cpu = get_cpu();
				317
				318	if (!cpu_isset(cpu, flush_cpumask))
				319	goto out;
				320	/*
				321	* This was a BUG() but until someone can quote me the
				322	* line from the intel manual that guarantees an IPI to
				323	* multiple CPUs is retried _only_ on the erroring CPUs
				324	* its staying as a return
				325	*
				326	* BUG();
				327	*/
				328
				329	if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) {
				330	if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) {
				331	if (flush_va == FLUSH_ALL)
				332	local_flush_tlb();
				333	else
				334	__flush_tlb_one(flush_va);
				335	} else
				336	leave_mm(cpu);
				337	}
				338	ack_APIC_irq();
				339	smp_mb__before_clear_bit();
				340	cpu_clear(cpu, flush_cpumask);
				341	smp_mb__after_clear_bit();
				342	out:
				343	put_cpu_no_resched();
				344	}
				345
				346	static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
				347	unsigned long va)
				348	{
				349	cpumask_t tmp;
				350	/*
				351	* A couple of (to be removed) sanity checks:
				352	*
				353	* - we do not send IPIs to not-yet booted CPUs.
				354	* - current CPU must not be in mask
				355	* - mask must exist :)
				356	*/
				357	BUG_ON(cpus_empty(cpumask));
				358
				359	cpus_and(tmp, cpumask, cpu_online_map);
				360	BUG_ON(!cpus_equal(cpumask, tmp));
				361	BUG_ON(cpu_isset(smp_processor_id(), cpumask));
				362	BUG_ON(!mm);
				363
				364	/*
				365	* i'm not happy about this global shared spinlock in the
				366	* MM hot path, but we'll see how contended it is.
				367	* Temporarily this turns IRQs off, so that lockups are
				368	* detected by the NMI watchdog.
				369	*/
				370	spin_lock(&tlbstate_lock);
				371
				372	flush_mm = mm;
				373	flush_va = va;
				374	#if NR_CPUS <= BITS_PER_LONG
				375	atomic_set_mask(cpumask, &flush_cpumask);
				376	#else
				377	{
				378	int k;
				379	unsigned long flush_mask = (unsigned long )&flush_cpumask;
				380	unsigned long cpu_mask = (unsigned long )&cpumask;
				381	for (k = 0; k < BITS_TO_LONGS(NR_CPUS); ++k)
				382	atomic_set_mask(cpu_mask[k], &flush_mask[k]);
				383	}
				384	#endif
				385	/*
				386	* We have to send the IPI only to
				387	* CPUs affected.
				388	*/
				389	send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR);
				390
				391	while (!cpus_empty(flush_cpumask))
				392	/* nothing. lockup detection does not belong here */
				393	mb();
				394
				395	flush_mm = NULL;
				396	flush_va = 0;
				397	spin_unlock(&tlbstate_lock);
				398	}
				399
				400	void flush_tlb_current_task(void)
				401	{
				402	struct mm_struct *mm = current->mm;
				403	cpumask_t cpu_mask;
				404
				405	preempt_disable();
				406	cpu_mask = mm->cpu_vm_mask;
				407	cpu_clear(smp_processor_id(), cpu_mask);
				408
				409	local_flush_tlb();
				410	if (!cpus_empty(cpu_mask))
				411	flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
				412	preempt_enable();
				413	}
				414
				415	void flush_tlb_mm (struct mm_struct * mm)
				416	{
				417	cpumask_t cpu_mask;
				418
				419	preempt_disable();
				420	cpu_mask = mm->cpu_vm_mask;
				421	cpu_clear(smp_processor_id(), cpu_mask);
				422
				423	if (current->active_mm == mm) {
				424	if (current->mm)
				425	local_flush_tlb();
				426	else
				427	leave_mm(smp_processor_id());
				428	}
				429	if (!cpus_empty(cpu_mask))
				430	flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
				431
				432	preempt_enable();
				433	}
				434
				435	void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
				436	{
				437	struct mm_struct *mm = vma->vm_mm;
				438	cpumask_t cpu_mask;
				439
				440	preempt_disable();
				441	cpu_mask = mm->cpu_vm_mask;
				442	cpu_clear(smp_processor_id(), cpu_mask);
				443
				444	if (current->active_mm == mm) {
				445	if(current->mm)
				446	__flush_tlb_one(va);
				447	else
				448	leave_mm(smp_processor_id());
				449	}
				450
				451	if (!cpus_empty(cpu_mask))
				452	flush_tlb_others(cpu_mask, mm, va);
				453
				454	preempt_enable();
				455	}
Alexey Dobriyan	129f694	2005-06-23 00:08:33 -0700	[diff] [blame]	456	EXPORT_SYMBOL(flush_tlb_page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	457
				458	static void do_flush_tlb_all(void* info)
				459	{
				460	unsigned long cpu = smp_processor_id();
				461
				462	__flush_tlb_all();
				463	if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY)
				464	leave_mm(cpu);
				465	}
				466
				467	void flush_tlb_all(void)
				468	{
				469	on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
				470	}
				471
				472	/*
				473	* this function sends a 'reschedule' IPI to another CPU.
				474	* it goes straight through and wastes no time serializing
				475	* anything. Worst case is that we lose a reschedule ...
				476	*/
				477	void smp_send_reschedule(int cpu)
				478	{
				479	send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
				480	}
				481
				482	/*
				483	* Structure and data for smp_call_function(). This is designed to minimise
				484	* static memory requirements. It also looks cleaner.
				485	*/
				486	static DEFINE_SPINLOCK(call_lock);
				487
				488	struct call_data_struct {
				489	void (func) (void info);
				490	void *info;
				491	atomic_t started;
				492	atomic_t finished;
				493	int wait;
				494	};
				495
				496	static struct call_data_struct * call_data;
				497
				498	/*
				499	* this function sends a 'generic call function' IPI to all other CPUs
				500	* in the system.
				501	*/
				502
				503	int smp_call_function (void (func) (void info), void *info, int nonatomic,
				504	int wait)
				505	/*
				506	* [SUMMARY] Run a function on all other CPUs.
				507	* <func> The function to run. This must be fast and non-blocking.
				508	* <info> An arbitrary pointer to pass to the function.
				509	* <nonatomic> currently unused.
				510	* <wait> If true, wait (atomically) until function has completed on other CPUs.
				511	* [RETURNS] 0 on success, else a negative status code. Does not return until
				512	* remote CPUs are nearly ready to execute <<func>> or are or have executed.
				513	*
				514	* You must not call this function with disabled interrupts or from a
				515	* hardware interrupt handler or from a bottom half handler.
				516	*/
				517	{
				518	struct call_data_struct data;
				519	int cpus = num_online_cpus()-1;
				520
				521	if (!cpus)
				522	return 0;
				523
				524	/* Can deadlock when called with interrupts disabled */
				525	WARN_ON(irqs_disabled());
				526
				527	data.func = func;
				528	data.info = info;
				529	atomic_set(&data.started, 0);
				530	data.wait = wait;
				531	if (wait)
				532	atomic_set(&data.finished, 0);
				533
				534	spin_lock(&call_lock);
				535	call_data = &data;
				536	mb();
				537
				538	/* Send a message to all other CPUs and wait for them to respond */
				539	send_IPI_allbutself(CALL_FUNCTION_VECTOR);
				540
				541	/* Wait for response */
				542	while (atomic_read(&data.started) != cpus)
				543	cpu_relax();
				544
				545	if (wait)
				546	while (atomic_read(&data.finished) != cpus)
				547	cpu_relax();
				548	spin_unlock(&call_lock);
				549
				550	return 0;
				551	}
Alexey Dobriyan	129f694	2005-06-23 00:08:33 -0700	[diff] [blame]	552	EXPORT_SYMBOL(smp_call_function);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	553
				554	static void stop_this_cpu (void * dummy)
				555	{
				556	/*
				557	* Remove this CPU:
				558	*/
				559	cpu_clear(smp_processor_id(), cpu_online_map);
				560	local_irq_disable();
				561	disable_local_APIC();
				562	if (cpu_data[smp_processor_id()].hlt_works_ok)
				563	for(;;) __asm__("hlt");
				564	for (;;);
				565	}
				566
				567	/*
				568	* this function calls the 'stop' function on all other CPUs in the system.
				569	*/
				570
				571	void smp_send_stop(void)
				572	{
				573	smp_call_function(stop_this_cpu, NULL, 1, 0);
				574
				575	local_irq_disable();
				576	disable_local_APIC();
				577	local_irq_enable();
				578	}
				579
				580	/*
				581	* Reschedule call back. Nothing to do,
				582	* all the work is done automatically when
				583	* we return from the interrupt.
				584	*/
				585	fastcall void smp_reschedule_interrupt(struct pt_regs *regs)
				586	{
				587	ack_APIC_irq();
				588	}
				589
				590	fastcall void smp_call_function_interrupt(struct pt_regs *regs)
				591	{
				592	void (func) (void info) = call_data->func;
				593	void *info = call_data->info;
				594	int wait = call_data->wait;
				595
				596	ack_APIC_irq();
				597	/*
				598	* Notify initiating CPU that I've grabbed the data and am
				599	* about to execute the function
				600	*/
				601	mb();
				602	atomic_inc(&call_data->started);
				603	/*
				604	* At this point the info structure may be out of scope unless wait==1
				605	*/
				606	irq_enter();
				607	(*func)(info);
				608	irq_exit();
				609
				610	if (wait) {
				611	mb();
				612	atomic_inc(&call_data->finished);
				613	}
				614	}
				615