Index: oldkernel/linux/Documentation/Configure.help
diff -u linux/Documentation/Configure.help:1.3 linux/Documentation/Configure.help:1.4
--- linux/Documentation/Configure.help:1.3	Thu Jun  1 14:57:34 2000
+++ linux/Documentation/Configure.help	Thu Jun  1 15:05:19 2000
@@ -1659,10 +1659,10 @@
   all x86 CPU types (albeit not optimally fast), you can specify
   "386" here.
 
-  If you specify one of "486" or "586" or "Pentium" or "PPro", then
-  the kernel will not necessarily run on earlier architectures (e.g. a
-  Pentium optimized kernel will run on a PPro, but not necessarily on
-  a i486).
+  If you specify one of "486" or "586" or "Pentium" or "PPro" or "PIII",
+  then the kernel will not necessarily run on earlier architectures 
+  (e.g. a Pentium optimized kernel will run on a PPro, but not necessarily
+  on an i486).
 
   Here are the settings recommended for greatest speed:
    - "386" for the AMD/Cyrix/Intel 386DX/DXL/SL/SLC/SX, Cyrix/TI
@@ -1676,8 +1676,30 @@
      K6-3D.
    - "PPro" for the Cyrix/IBM/National Semiconductor 6x86MX, MII and
      Intel Pentium II/Pentium Pro.
+   - "PIII/Xeon/Deschutes" for the PIII (Katmai), Xeon and later PIIs
+     with the Deschutes or Mendocino core. You have to choose this for
+     MMX2 support.
 
   If you don't know what to do, choose "386".
+
+Disable PII/PIII Serial Number at bootup
+CONFIG_X86_PN_OFF
+  This makes the kernel disable the CPUID serial number that is embedded in
+  the new PIII CPUs at bootup.
+
+Enable PII/PIII Extended Fast FPU save and restore support
+CONFIG_X86_FX
+  This enables use of the new PII/PIII FXSAVE/FXRSTOR support.  This item
+  is required to make use of the new PIII 128-bit XMM registers.  It is safe
+  to leave this enabled all the time.
+
+Enable CPU Specific (MMX/MMX2) Optimizations
+CONFIG_X86_CPU_OPTIMIZATIONS
+  This enables use of the MMX registers and 128-bit MMX2 registers on CPUs
+  that can support the new instructions (Pentium/AMD K6 or newer).  In
+  order to support the Pentium III 128-bit XMM registers you must enable
+  both this and PII/PIII Extended Fast FPU save support.  It is safe to
+  leave this enabled all the time.
 
 VGA text console
 CONFIG_VGA_CONSOLE
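
The three options added above all key off CPUID feature bits that the rest of
this patch names X86_FEATURE_PN, X86_FEATURE_FXSR and X86_FEATURE_XMM (bits
18, 24 and 25 of EDX from CPUID level 1).  A minimal userland sketch for
checking those bits on a given CPU; illustration only, not part of the patch,
and it assumes the processor already supports the CPUID instruction:

    #include <stdio.h>

    int main(void)
    {
            unsigned int eax, ebx, ecx, edx;

            /* CPUID level 1: the feature flags come back in EDX */
            __asm__ __volatile__("cpuid"
                                 : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
                                 : "a" (1));
            printf("PN   (serial number):  %s\n", (edx & (1 << 18)) ? "yes" : "no");
            printf("FXSR (fxsave/fxrstor): %s\n", (edx & (1 << 24)) ? "yes" : "no");
            printf("XMM  (KNI/MMX2):       %s\n", (edx & (1 << 25)) ? "yes" : "no");
            return 0;
    }
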
Index: oldkernel/linux/arch/i386/Makefile
diff -u linux/arch/i386/Makefile:1.1.1.1 linux/arch/i386/Makefile:1.2
--- linux/arch/i386/Makefile:1.1.1.1	Wed May 31 12:33:53 2000
+++ linux/arch/i386/Makefile	Thu Jun  1 15:05:19 2000
@@ -43,6 +43,10 @@
 CFLAGS := $(CFLAGS) -m486 -malign-loops=2 -malign-jumps=2 -malign-functions=2 -DCPU=686
 endif
 
+ifdef CONFIG_M686FX
+CFLAGS := $(CFLAGS) -m486 -malign-loops=0 -malign-jumps=0 -malign-functions=0 -DCPU=686
+endif
+
 HEAD := arch/i386/kernel/head.o arch/i386/kernel/init_task.o
 
 SUBDIRS := $(SUBDIRS) arch/i386/kernel arch/i386/mm arch/i386/lib
Index: oldkernel/linux/arch/i386/config.in
diff -u linux/arch/i386/config.in:1.1.1.1 linux/arch/i386/config.in:1.2
--- linux/arch/i386/config.in:1.1.1.1	Wed May 31 12:33:53 2000
+++ linux/arch/i386/config.in	Thu Jun  1 15:05:19 2000
@@ -16,7 +16,8 @@
 	 486/Cx486		CONFIG_M486	\
 	 586/K5/5x86/6x86	CONFIG_M586	\
 	 Pentium/K6/TSC		CONFIG_M586TSC	\
-	 PPro/6x86MX		CONFIG_M686" PPro
+	 PPro/6x86MX/PII	CONFIG_M686 \
+	 PIII/Xeon/Deschutes	CONFIG_M686FX" PIII
 #
 # Define implied options from the CPU selection here
 #
@@ -26,20 +27,24 @@
   define_bool CONFIG_X86_BSWAP y
   define_bool CONFIG_X86_POPAD_OK y
 fi
-if [ "$CONFIG_M686" = "y" -o "$CONFIG_M586TSC" = "y" ]; then
+if [ "$CONFIG_M686FX" = "y" -o "$CONFIG_M686" = "y" \
+				-o "$CONFIG_M586TSC" = "y" ]; then
   define_bool CONFIG_X86_TSC y
 fi
-if [ "$CONFIG_M686" = "y" ]; then
+if [ "$CONFIG_M686FX" = "y" -o "$CONFIG_M686" = "y" ]; then
   define_bool CONFIG_X86_GOOD_APIC y
 fi
+bool 'Disable the PII/PIII Serial Number at bootup' CONFIG_X86_PN_OFF
+bool 'Enable PII/PIII Extended/Fast FPU save and restore support' CONFIG_X86_FX
+bool 'Enable CPU Specific (MMX/MMX2) Optimization Functions' CONFIG_X86_CPU_OPTIMIZATIONS
+bool 'Math emulation' CONFIG_MATH_EMULATION
+bool 'MTRR (Memory Type Range Register) support' CONFIG_MTRR
+bool 'Symmetric multi-processing support' CONFIG_SMP
 
 choice 'Maximum Physical Memory' \
 	"1GB		CONFIG_1GB \
 	 2GB		CONFIG_2GB" 1GB
 
-bool 'Math emulation' CONFIG_MATH_EMULATION
-bool 'MTRR (Memory Type Range Register) support' CONFIG_MTRR
-bool 'Symmetric multi-processing support' CONFIG_SMP
 endmenu
 
 mainmenu_option next_comment
Index: oldkernel/linux/arch/i386/defconfig
diff -u linux/arch/i386/defconfig:1.2 linux/arch/i386/defconfig:1.3
--- linux/arch/i386/defconfig:1.2	Thu Jun  1 14:51:28 2000
+++ linux/arch/i386/defconfig	Thu Jun  1 15:05:19 2000
@@ -21,11 +21,14 @@
 CONFIG_X86_POPAD_OK=y
 CONFIG_X86_TSC=y
 CONFIG_X86_GOOD_APIC=y
-CONFIG_1GB=y
-# CONFIG_2GB is not set
+CONFIG_X86_PN_OFF=y
+CONFIG_X86_FX=y
+CONFIG_X86_CPU_OPTIMIZATIONS=y
 # CONFIG_MATH_EMULATION is not set
 # CONFIG_MTRR is not set
 CONFIG_SMP=y
+CONFIG_1GB=y
+# CONFIG_2GB is not set
 
 #
 # Loadable module support
Index: oldkernel/linux/arch/i386/kernel/head.S
diff -u linux/arch/i386/kernel/head.S:1.1.1.1 linux/arch/i386/kernel/head.S:1.2
--- linux/arch/i386/kernel/head.S:1.1.1.1	Wed May 31 12:33:53 2000
+++ linux/arch/i386/kernel/head.S	Thu Jun  1 15:05:19 2000
@@ -14,7 +14,6 @@
 #include <asm/page.h>
 #include <asm/pgtable.h>
 
-
 #define CL_MAGIC_ADDR	0x90020
 #define CL_MAGIC	0xA33F
 #define CL_BASE_ADDR	0x90000
@@ -32,7 +31,8 @@
 #define X86_HARD_MATH	CPU_PARAMS+6
 #define X86_CPUID	CPU_PARAMS+8
 #define X86_CAPABILITY	CPU_PARAMS+12
-#define X86_VENDOR_ID	CPU_PARAMS+16
+#define X86_MMU_CR4	CPU_PARAMS+16
+#define X86_VENDOR_ID	CPU_PARAMS+20
 
 /*
  * swapper_pg_dir is the main page directory, address 0x00101000
@@ -59,9 +59,8 @@
  *	NOTE! We have to correct for the fact that we're
  *	not yet offset PAGE_OFFSET..
  */
-#define cr4_bits mmu_cr4_features-__PAGE_OFFSET
 	movl %cr4,%eax		# Turn on 4Mb pages
-	orl cr4_bits,%eax
+	orl X86_MMU_CR4-__PAGE_OFFSET,%eax
 	movl %eax,%cr4
 #endif
 /*
Index: oldkernel/linux/arch/i386/kernel/i386_ksyms.c
diff -u linux/arch/i386/kernel/i386_ksyms.c:1.1.1.1 linux/arch/i386/kernel/i386_ksyms.c:1.2
--- linux/arch/i386/kernel/i386_ksyms.c:1.1.1.1	Wed May 31 12:33:53 2000
+++ linux/arch/i386/kernel/i386_ksyms.c	Thu Jun  1 15:05:19 2000
@@ -119,3 +119,13 @@
 #ifdef CONFIG_VT
 EXPORT_SYMBOL(screen_info);
 #endif
+
+#ifdef CONFIG_X86_CPU_OPTIMIZATIONS
+EXPORT_SYMBOL(best_memcpy);
+EXPORT_SYMBOL(best_memset);
+EXPORT_SYMBOL(best_copy_to_user);
+EXPORT_SYMBOL(best_copy_from_user);
+EXPORT_SYMBOL(__best_copy_to_user);
+EXPORT_SYMBOL(__best_copy_from_user);
+#endif
+
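
The exports above are what let modules call the new accelerator entry points.
A minimal 2.2-style module sketch that uses best_memcpy(); the extern
prototype is written out by hand here only because the matching
asm-i386/string.h changes are not part of this excerpt:

    #include <linux/module.h>
    #include <linux/kernel.h>
    #include <linux/types.h>

    /* Prototype assumed from the accompanying string.h changes. */
    extern void *best_memcpy(void *to, const void *from, size_t n);

    static char src[256] = "hello, best_memcpy";
    static char dst[256];

    int init_module(void)
    {
            best_memcpy(dst, src, sizeof(src));
            printk(KERN_INFO "copied: %s\n", dst);
            return 0;
    }

    void cleanup_module(void)
    {
    }
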
Index: oldkernel/linux/arch/i386/kernel/process.c
diff -u linux/arch/i386/kernel/process.c:1.1.1.1 linux/arch/i386/kernel/process.c:1.2
--- linux/arch/i386/kernel/process.c:1.1.1.1	Wed May 31 12:33:53 2000
+++ linux/arch/i386/kernel/process.c	Thu Jun  1 15:05:19 2000
@@ -42,6 +42,7 @@
 #include <asm/ldt.h>
 #include <asm/processor.h>
 #include <asm/desc.h>
+#include <asm/i387.h>
 #ifdef CONFIG_MATH_EMULATION
 #include <asm/math_emu.h>
 #endif
@@ -582,6 +583,106 @@
 }
 
 /*
+ * FPU state handling functions
+ */
+
+int i387_hard_to_user ( struct user_i387_struct * user,
+				union i387_hard_union * hard)
+{
+#ifdef CONFIG_X86_FX
+	int i, err = 0;
+	short *tmp, *tmp2;
+	union i387_hard_union hard2;
+#else
+	int err = 0;
+#endif
+
+	if (!access_ok(VERIFY_WRITE, user, sizeof(*user)))
+		return -EFAULT;
+#ifdef CONFIG_X86_FX
+	if (boot_cpu_data.x86_capability & X86_FEATURE_FXSR) {
+		hard2.fsave.cwd = 0xffff0000 | hard->fxsave.fxcwd;
+		hard2.fsave.swd = 0xffff0000 | hard->fxsave.fxswd;
+		hard2.fsave.twd = fputag_KNI_to_387(hard->fxsave.fxtwd);
+		hard2.fsave.fip = hard->fxsave.fxfip;
+		hard2.fsave.fcs = hard->fxsave.fxfcs;
+		hard2.fsave.foo = hard->fxsave.fxfoo;
+		hard2.fsave.fos = hard->fxsave.fxfos;
+
+		tmp = (short *)&hard2.fsave.st_space[0];
+		tmp2 = (short *)&hard->fxsave.st_space[0];
+
+		/*
+		 * Transform the two layouts:
+		 * (we do not mix 32-bit access with 16-bit access because
+		 * that's suboptimal on PPros)
+		 */
+
+		for (i = 0; i < 8; i++) {
+			*tmp = *tmp2; tmp++; tmp2++;
+			*tmp = *tmp2; tmp++; tmp2++;
+			*tmp = *tmp2; tmp++; tmp2++;
+			*tmp = *tmp2; tmp++; tmp2++;
+			*tmp = *tmp2; tmp++; tmp2 += 4;
+		}
+		err = copy_to_user((void *)(user),(&(hard2)),
+				   sizeof(struct i387_hard_fsave));
+	} else
+#endif
+		err = copy_to_user((void *)(user),
+				   (&(hard->fsave.cwd)),
+				   sizeof(struct i387_hard_fsave));
+	return err;
+}
+
+int i387_user_to_hard (union i387_hard_union * hard,
+			struct user_i387_struct * user)
+{
+#ifdef CONFIG_X86_FX
+	int i, err = 0;
+	short *tmp, *tmp2;
+	union i387_hard_union hard2;
+#else
+	int err = 0;
+#endif
+
+	if (!access_ok(VERIFY_READ, user, sizeof(*user)))
+		return -EFAULT;
+#ifdef CONFIG_X86_FX
+	if (boot_cpu_data.x86_capability & X86_FEATURE_FXSR) {
+		err = copy_from_user((&(hard2)),(void *)(user),
+				     sizeof(struct i387_hard_fsave));
+		hard->fxsave.fxcwd = hard2.fsave.cwd & 0xffff;
+		hard->fxsave.fxswd = hard2.fsave.swd & 0xffff;
+		hard->fxsave.fxtwd = fputag_387_to_KNI(hard2.fsave.twd);
+		hard->fxsave.fxfip = hard2.fsave.fip;
+		hard->fxsave.fxfcs = hard2.fsave.fcs & 0xffff;
+		hard->fxsave.fxfoo = hard2.fsave.foo;
+		hard->fxsave.fxfos = hard2.fsave.fos & 0xffff;
+
+		tmp2 = (short *)&hard->fxsave.st_space[0];
+		tmp = (short *)&hard2.fsave.st_space[0];
+
+		for (i = 0; i < 8; i++) {
+			*tmp2 = *tmp; tmp++; tmp2++;
+			*tmp2 = *tmp; tmp++; tmp2++;
+			*tmp2 = *tmp; tmp++; tmp2++;
+			*tmp2 = *tmp; tmp++; tmp2++;
+			*tmp2 = *tmp; tmp++; tmp2++;
+			*tmp2 = 0; tmp2++;
+			*tmp2 = 0; tmp2++;
+			*tmp2 = 0; tmp2++;
+		}
+	} else
+#endif
+		err = copy_from_user((&(hard->fsave.cwd)),
+				     (void *)(user),
+				     sizeof(struct i387_hard_fsave));
+	return err;
+}
+
+
+/*
  * Save a segment.
  */
 #define savesegment(seg,value) \
@@ -626,13 +727,43 @@
  */
 int dump_fpu (struct pt_regs * regs, struct user_i387_struct* fpu)
 {
+#ifdef CONFIG_X86_FX
+	int fpvalid, i;
+	short *tmp, *tmp2;
+	struct task_struct *tsk = current;
+	union i387_hard_union *hard;
+#else
 	int fpvalid;
 	struct task_struct *tsk = current;
-
+#endif
 	fpvalid = tsk->used_math;
 	if (fpvalid) {
 		unlazy_fpu(tsk);
-		memcpy(fpu,&tsk->tss.i387.hard,sizeof(*fpu));
+#ifdef CONFIG_X86_FX
+		if (boot_cpu_data.x86_capability & X86_FEATURE_FXSR) {
+			hard = &tsk->tss.i387.hard;
+
+			fpu->cwd = 0xffff0000 | hard->fxsave.fxcwd;
+			fpu->swd = 0xffff0000 | hard->fxsave.fxswd;
+			fpu->twd = fputag_KNI_to_387(hard->fxsave.fxtwd);
+			fpu->fip = hard->fxsave.fxfip;
+			fpu->fcs = hard->fxsave.fxfcs;
+			fpu->foo = hard->fxsave.fxfoo;
+			fpu->fos = hard->fxsave.fxfos;
+
+			tmp = (short *)&fpu->st_space[0];
+			tmp2 = (short *)&hard->fxsave.st_space[0];
+
+			for (i = 0; i < 8; i++) {
+				*tmp = *tmp2; tmp++; tmp2++;
+				*tmp = *tmp2; tmp++; tmp2++;
+				*tmp = *tmp2; tmp++; tmp2++;
+				*tmp = *tmp2; tmp++; tmp2++;
+				*tmp = *tmp2; tmp++; tmp2+=4;
+			}
+		} else
+#endif
+			memcpy(fpu,&tsk->tss.i387.hard.fsave,sizeof(*fpu));
 	}
 
 	return fpvalid;
@@ -692,8 +823,8 @@
 /*
  *	switch_to(x,yn) should switch tasks from x to y.
  *
- * We fsave/fwait so that an exception goes off at the right time
- * (as a call from the fsave or fwait in effect) rather than to
+ * We fpu_save so that an exception goes off at the right time
+ * (as a call from the f*save or fwait in effect) rather than to
  * the wrong process. Lazy FP saving no longer makes any sense
  * with modern CPU's, and this simplifies a lot of things (SMP
  * and UP become the same).
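
The conversion loops added above repack the eight FP registers between the
two hardware layouts: fsave stores them back to back as packed 10-byte
values, while fxsave gives each one a 16-byte slot (the kernel code copies
five 16-bit words per register and then skips or zeroes the 6 bytes of
padding).  A standalone sketch of the same repacking, with made-up buffer
names, just to show the arithmetic:

    #include <string.h>

    #define FP_REGS       8
    #define FSAVE_REGSZ  10     /* packed 80-bit registers       */
    #define FXSAVE_REGSZ 16     /* one 16-byte slot per register */

    /* fxsave layout -> fsave layout: drop 6 bytes of padding per register */
    static void st_fxsave_to_fsave(unsigned char *fsave, const unsigned char *fxsave)
    {
            int i;

            for (i = 0; i < FP_REGS; i++)
                    memcpy(fsave + i * FSAVE_REGSZ,
                           fxsave + i * FXSAVE_REGSZ, FSAVE_REGSZ);
    }

    /* fsave layout -> fxsave layout: zero the padding, as the kernel code does */
    static void st_fsave_to_fxsave(unsigned char *fxsave, const unsigned char *fsave)
    {
            int i;

            for (i = 0; i < FP_REGS; i++) {
                    memcpy(fxsave + i * FXSAVE_REGSZ,
                           fsave + i * FSAVE_REGSZ, FSAVE_REGSZ);
                    memset(fxsave + i * FXSAVE_REGSZ + FSAVE_REGSZ, 0,
                           FXSAVE_REGSZ - FSAVE_REGSZ);
            }
    }
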
Index: oldkernel/linux/arch/i386/kernel/ptrace.c
diff -u linux/arch/i386/kernel/ptrace.c:1.1.1.1 linux/arch/i386/kernel/ptrace.c:1.2
--- linux/arch/i386/kernel/ptrace.c:1.1.1.1	Wed May 31 12:33:53 2000
+++ linux/arch/i386/kernel/ptrace.c	Thu Jun  1 15:05:19 2000
@@ -17,6 +17,7 @@
 #include <asm/system.h>
 #include <asm/processor.h>
 #include <asm/debugreg.h>
+#include <asm/i387.h>
 
 /*
  * does not yet catch signals sent when the child dies.
@@ -646,6 +647,9 @@
 		  };
 
 		case PTRACE_GETFPREGS: { /* Get the child FPU state. */
+			/*
+			 * user-space expects an 'old-style' FPU dump.
+			 */
 			if (!access_ok(VERIFY_WRITE, (unsigned *)data,
 				       sizeof(struct user_i387_struct)))
 			  {
@@ -655,15 +659,17 @@
 			ret = 0;
 			if ( !child->used_math ) {
 			  /* Simulate an empty FPU. */
-			  child->tss.i387.hard.cwd = 0xffff037f;
-			  child->tss.i387.hard.swd = 0xffff0000;
-			  child->tss.i387.hard.twd = 0xffffffff;
+			  i387_set_cwd(child->tss.i387.hard, 0x037f);
+			  i387_set_swd(child->tss.i387.hard, 0x0000);
+			  i387_set_twd(child->tss.i387.hard, 0xffff);
 			}
 #ifdef CONFIG_MATH_EMULATION
 			if ( boot_cpu_data.hard_math ) {
 #endif
-				__copy_to_user((void *)data, &child->tss.i387.hard,
-						sizeof(struct user_i387_struct));
+				i387_hard_to_user(
+					(struct user_i387_struct *)data,
+					&child->tss.i387.hard
+				);
 #ifdef CONFIG_MATH_EMULATION
 			} else {
 			  save_i387_soft(&child->tss.i387.soft,
@@ -684,8 +690,10 @@
 #ifdef CONFIG_MATH_EMULATION
 			if ( boot_cpu_data.hard_math ) {
 #endif
-			  __copy_from_user(&child->tss.i387.hard, (void *)data,
-					   sizeof(struct user_i387_struct));
+				i387_user_to_hard(
+					&child->tss.i387.hard,
+					(struct user_i387_struct *)data
+				);
 #ifdef CONFIG_MATH_EMULATION
 			} else {
 			  restore_i387_soft(&child->tss.i387.soft,
Index: oldkernel/linux/arch/i386/kernel/setup.c
diff -u linux/arch/i386/kernel/setup.c:1.1.1.1 linux/arch/i386/kernel/setup.c:1.2
--- linux/arch/i386/kernel/setup.c:1.1.1.1	Wed May 31 12:33:53 2000
+++ linux/arch/i386/kernel/setup.c	Thu Jun  1 15:05:19 2000
@@ -104,6 +104,17 @@
 extern int _etext, _edata, _end;
 extern unsigned long cpu_hz;
 
+#ifdef CONFIG_X86_PN_OFF
+int disable_x86_serial_nr = 1;
+#else
+int disable_x86_serial_nr = 0;
+#endif
+
+/*
+ * For the various FPU using kernel accelerator routines
+ */
+spinlock_t kern_fpu_lock = SPIN_LOCK_UNLOCKED;
+
 /*
  * This is set up by the setup-routine at boot-time
  */
@@ -809,20 +820,6 @@
 
 	if (c->x86_vendor == X86_VENDOR_AMD && amd_model(c))
 		return;
-		
-	if (c->cpuid_level > 0 && c->x86_vendor == X86_VENDOR_INTEL)
-	{
-		if(c->x86_capability&(1<<18))
-		{
-			/* Disable processor serial number on Intel Pentium III 
-			   from code by Phil Karn */
-			unsigned long lo,hi;
-			rdmsr(0x119,lo,hi);
-			lo |= 0x200000;
-			wrmsr(0x119,lo,hi);
-			printk(KERN_INFO "Pentium-III serial number disabled.\n");
-		}
-	}
 
 	if (c->cpuid_level > 1) {
 		/* supports eax=2  call */
@@ -909,7 +906,15 @@
 	}
 	cyrix_model(&boot_cpu_data);
 }
-	
+
+/*
+ * Setup function for serial number stuff
+ */
+
+__initfunc(void x86_serial_nr_setup(char *str, int *ints))
+{
+	disable_x86_serial_nr = !disable_x86_serial_nr;
+}
 	
 
 static char *cpu_vendor_names[] __initdata = {
Index: oldkernel/linux/arch/i386/kernel/signal.c
diff -u linux/arch/i386/kernel/signal.c:1.1.1.1 linux/arch/i386/kernel/signal.c:1.2
--- linux/arch/i386/kernel/signal.c:1.1.1.1	Wed May 31 12:33:53 2000
+++ linux/arch/i386/kernel/signal.c	Thu Jun  1 15:05:19 2000
@@ -21,6 +21,7 @@
 #include <linux/stddef.h>
 #include <asm/ucontext.h>
 #include <asm/uaccess.h>
+#include <asm/i387.h>
 
 #define DEBUG_SIG 0
 
@@ -153,9 +154,14 @@
 
 static inline int restore_i387_hard(struct _fpstate *buf)
 {
+	int err = 0;
 	struct task_struct *tsk = current;
 	clear_fpu(tsk);
-	return __copy_from_user(&tsk->tss.i387.hard, buf, sizeof(*buf));
+
+	err = i387_user_to_hard(&tsk->tss.i387.hard,
+				(struct user_i387_struct *)buf);
+	err |= get_user(tsk->tss.i387.hard.fsave.swd, &buf->status);
+	return err;
 }
 
 static inline int restore_i387(struct _fpstate *buf)
@@ -305,11 +311,14 @@
 
 static inline int save_i387_hard(struct _fpstate * buf)
 {
+	int err = 0;
 	struct task_struct *tsk = current;
 
 	unlazy_fpu(tsk);
-	tsk->tss.i387.hard.status = tsk->tss.i387.hard.swd;
-	if (__copy_to_user(buf, &tsk->tss.i387.hard, sizeof(*buf)))
+	err = i387_hard_to_user((struct user_i387_struct *)buf,
+			&tsk->tss.i387.hard);
+	err |= put_user(tsk->tss.i387.hard.fsave.swd, &buf->status);
+	if (err)
 		return -1;
 	return 1;
 }
Index: oldkernel/linux/arch/i386/kernel/smp.c
diff -u linux/arch/i386/kernel/smp.c:1.1.1.1 linux/arch/i386/kernel/smp.c:1.2
--- linux/arch/i386/kernel/smp.c:1.1.1.1	Wed May 31 12:33:53 2000
+++ linux/arch/i386/kernel/smp.c	Thu Jun  1 15:05:19 2000
@@ -891,6 +891,8 @@
  */
 int __init start_secondary(void *unused)
 {
+	disable_serial_nr();
+	load_default_mxcsr();
 	/*
 	 * Dont put anything before smp_callin(), SMP
 	 * booting is too fragile that we want to limit the
Index: oldkernel/linux/arch/i386/kernel/traps.c
diff -u linux/arch/i386/kernel/traps.c:1.1.1.1 linux/arch/i386/kernel/traps.c:1.2
--- linux/arch/i386/kernel/traps.c:1.1.1.1	Wed May 31 12:33:53 2000
+++ linux/arch/i386/kernel/traps.c	Thu Jun  1 15:05:19 2000
@@ -33,6 +33,7 @@
 #include <asm/atomic.h>
 #include <asm/debugreg.h>
 #include <asm/desc.h>
+#include <asm/i387.h>
 
 #include <asm/smp.h>
 
@@ -421,7 +422,9 @@
 	 * (this will also clear the error)
 	 */
 	task = current;
-	save_fpu(task);
+	i387_save_hard(task->tss.i387);
+	task->flags &= ~PF_USEDFPU;
+	stts();
 	task->tss.trap_no = 16;
 	task->tss.error_code = 0;
 	force_sig(SIGFPE, task);
@@ -452,17 +455,44 @@
 asmlinkage void math_state_restore(struct pt_regs regs)
 {
 	__asm__ __volatile__("clts");		/* Allow maths ops (or we recurse) */
-	if(current->used_math)
-		__asm__("frstor %0": :"m" (current->tss.i387));
-	else
-	{
+	/*
+	 * If we have either of the kernel FPU use states set in the
+	 * fpustate variable, then this will be a kernel math trap.
+	 * Otherwise, this is userspace trying to use the FPU.
+	 */
+	if(current->tss.x86_fpustate & X86_FPUSTATE_KERN_ANY) {
+		load_default_mxcsr(); /* we don't ever mess with this in
+					 kernel space, so just make sure
+					 we have a reasonable one so we
+					 don't start taking unmasked
+					 exceptions by accident */
+		if(current->tss.mmx_reg_space != NULL)
+			__asm__("movq 0x00(%0), %%mm0\n\t"
+				"movq 0x08(%0), %%mm1\n\t"
+				"movq 0x10(%0), %%mm2\n\t"
+				"movq 0x18(%0), %%mm3\n\t"
+				:: "r" (current->tss.mmx_reg_space));
+		if(current->tss.kni_reg_space != NULL)
+			__asm__("movups 0x00(%0), %%xmm0\n\t"
+				"movups 0x10(%0), %%xmm1\n\t"
+				"movups 0x20(%0), %%xmm2\n\t"
+				"movups 0x30(%0), %%xmm3\n\t"
+				:: "r" (current->tss.kni_reg_space));
+	} else if(current->tss.x86_fpustate & X86_FPUSTATE_USER_SAVED) {
+		i387_restore_hard(current->tss.i387);
+		current->tss.x86_fpustate = 0;
+	} else if(current->used_math) {
+		i387_restore_hard(current->tss.i387);
+		current->flags|=PF_USEDFPU;	/* make switch_to() work */
+	} else {
 		/*
 		 *	Our first FPU usage, clean the chip.
 		 */
 		__asm__("fninit");
+		load_default_mxcsr();
 		current->used_math = 1;
+		current->flags|=PF_USEDFPU;	/* make switch_to() work */
 	}
-	current->flags|=PF_USEDFPU;		/* So we fnsave on switch_to() */
 }
 
 #ifndef CONFIG_MATH_EMULATION
Index: oldkernel/linux/arch/i386/lib/Makefile
diff -u linux/arch/i386/lib/Makefile:1.1.1.1 linux/arch/i386/lib/Makefile:1.2
--- linux/arch/i386/lib/Makefile:1.1.1.1	Wed May 31 12:33:53 2000
+++ linux/arch/i386/lib/Makefile	Thu Jun  1 15:05:19 2000
@@ -9,4 +9,8 @@
 L_OBJS  = checksum.o old-checksum.o semaphore.o delay.o \
 	usercopy.o getuser.o putuser.o
 
+ifeq ($(CONFIG_X86_CPU_OPTIMIZATIONS),y)
+	L_OBJS += best_function.o simd.o
+endif
+
 include $(TOPDIR)/Rules.make
Index: oldkernel/linux/arch/i386/lib/best_function.c
diff -u /dev/null linux/arch/i386/lib/best_function.c:1.1
--- /dev/null	Mon Jul 31 21:12:24 2000
+++ linux/arch/i386/lib/best_function.c	Thu Jun  1 15:05:19 2000
@@ -0,0 +1,196 @@
+/*
+ * SIMD functions.  These replace the functions in asm-i386/string.h
+ * whenever it makes sense.  These also un-inline those functions.
+ *
+ * Copyright 1999, Doug Ledford <dledford@redhat.com>
+ *
+ * These functions are simple and trivial, consider them to be
+ * public domain
+ */
+
+#include <linux/string.h>
+#include <linux/sched.h>
+#include <asm/processor.h>
+#include <asm/uaccess.h>
+
+/*
+ * We declare our accelerator functions here since this is the only place
+ * that needs the declarations, which makes a separate header file more
+ * trouble than it is worth
+ */
+extern void * kni_memcpy(void *, const void *, size_t);
+extern void * kni_memset(void *, char, size_t);
+extern unsigned long kni_copy_to_user(void *, const void *, unsigned long);
+extern unsigned long kni_copy_from_user(void *, const void *, unsigned long);
+extern unsigned long __kni_copy_to_user_nocheck(void *, const void *, unsigned long);
+extern unsigned long __kni_copy_from_user_nocheck(void *, const void *, unsigned long);
+
+static void * best_memcpy_final(void *, const void *, size_t);
+static void * best_memset_final(void *, char, size_t);
+static unsigned long best_copy_to_user_final(void *, const void *, unsigned long);
+static unsigned long best_copy_from_user_final(void *, const void *, unsigned long);
+static unsigned long __best_copy_to_user_final(void *, const void *, unsigned long);
+static unsigned long __best_copy_from_user_final(void *, const void *, unsigned long);
+
+void * best_memcpy(void * to, const void * from, size_t n)
+{
+	int BAR = (int)__builtin_return_address(0);
+	int *caller = (int *)BAR - 1;
+	if(boot_cpu_data.enable_fixups) {
+	    if ( (boot_cpu_data.mmu_cr4_features & X86_CR4_OSFXSR) &&
+		 (boot_cpu_data.x86_capability & X86_FEATURE_XMM) ) {
+			*caller = (int)kni_memcpy - BAR;
+			return(kni_memcpy(to, from, n));
+		} else {
+			*caller = (int)best_memcpy_final - BAR;
+			return(__memcpy(to, from, n));
+		}
+	} else {
+		return(__memcpy(to, from, n));
+	}
+}
+
+static void * best_memcpy_final(void * to, const void * from, size_t n)
+{
+	return(__memcpy(to, from, n));
+}
+
+void * best_memset(void * s, char c, size_t count)
+{
+	int BAR = (int)__builtin_return_address(0);
+	int *caller = (int *)BAR - 1;
+	if(boot_cpu_data.enable_fixups) {
+	    if ( (boot_cpu_data.mmu_cr4_features & X86_CR4_OSFXSR) &&
+		 (boot_cpu_data.x86_capability & X86_FEATURE_XMM) ) {
+			*caller = (int)kni_memset - BAR;
+			return(kni_memset(s, c, count));
+		} else {
+			*caller = (int)best_memset_final - BAR;
+			return(__memset_generic(s, c, count));
+		}
+	} else {
+		return(__memset_generic(s, c, count));
+	}
+}
+
+static void * best_memset_final(void * s, char c, size_t count)
+{
+	return(__memset_generic(s, c, count));
+}
+
+unsigned long
+best_copy_to_user(void *to, const void *from, unsigned long n)
+{
+	int BAR = (int)__builtin_return_address(0);
+	int *caller = (int *)BAR - 1;
+	if(boot_cpu_data.enable_fixups) {
+	    if ( (boot_cpu_data.mmu_cr4_features & X86_CR4_OSFXSR) &&
+		 (boot_cpu_data.x86_capability & X86_FEATURE_XMM) ) {
+			*caller = (int)kni_copy_to_user - BAR;
+			return(kni_copy_to_user(to, from, n));
+		} else {
+			*caller = (int)best_copy_to_user_final - BAR;
+			return(best_copy_to_user_final(to, from, n));
+		}
+	} else {
+		if (access_ok(VERIFY_WRITE, to, n)) {
+			__copy_user(to,from,n);
+		}
+		return n;
+	}
+}
+
+static unsigned long
+best_copy_to_user_final(void *to, const void *from, unsigned long n)
+{
+	if (access_ok(VERIFY_WRITE, to, n)) {
+		__copy_user(to,from,n);
+	}
+	return n;
+}
+
+unsigned long
+best_copy_from_user(void *to, const void *from, unsigned long n)
+{
+	int BAR = (int)__builtin_return_address(0);
+	int *caller = (int *)BAR - 1;
+	if(boot_cpu_data.enable_fixups) {
+	    if ( (boot_cpu_data.mmu_cr4_features & X86_CR4_OSFXSR) &&
+		 (boot_cpu_data.x86_capability & X86_FEATURE_XMM) ) {
+			*caller = (int)kni_copy_from_user - BAR;
+			return(kni_copy_from_user(to, from, n));
+		} else {
+			*caller = (int)best_copy_from_user_final - BAR;
+			return(best_copy_from_user_final(to, from, n));
+		}
+	} else {
+		if (access_ok(VERIFY_READ, from, n)) {
+			__copy_user_zeroing(to,from,n);
+		}
+		return n;
+	}
+}
+
+static unsigned long
+best_copy_from_user_final(void *to, const void *from, unsigned long n)
+{
+	if (access_ok(VERIFY_READ, from, n)) {
+		__copy_user_zeroing(to,from,n);
+	}
+	return n;
+}
+
+unsigned long
+__best_copy_to_user(void *to, const void *from, unsigned long n)
+{
+	int BAR = (int)__builtin_return_address(0);
+	int *caller = (int *)BAR - 1;
+	if(boot_cpu_data.enable_fixups) {
+	    if ( (boot_cpu_data.mmu_cr4_features & X86_CR4_OSFXSR) &&
+		 (boot_cpu_data.x86_capability & X86_FEATURE_XMM) ) {
+			*caller = (int)__kni_copy_to_user_nocheck - BAR;
+			return(__kni_copy_to_user_nocheck(to, from, n));
+		} else {
+			*caller = (int)__best_copy_to_user_final - BAR;
+			return(__best_copy_to_user_final(to, from, n));
+		}
+	} else {
+		__copy_user(to,from,n);
+		return n;
+	}
+}
+
+static unsigned long
+__best_copy_to_user_final(void *to, const void *from, unsigned long n)
+{
+	__copy_user(to,from,n);
+	return n;
+}
+
+unsigned long
+__best_copy_from_user(void *to, const void *from, unsigned long n)
+{
+	int BAR = (int)__builtin_return_address(0);
+	int *caller = (int *)BAR - 1;
+	if(boot_cpu_data.enable_fixups) {
+	    if ( (boot_cpu_data.mmu_cr4_features & X86_CR4_OSFXSR) &&
+		 (boot_cpu_data.x86_capability & X86_FEATURE_XMM) ) {
+			*caller = (int)__kni_copy_from_user_nocheck - BAR;
+			return(__kni_copy_from_user_nocheck(to, from, n));
+		} else {
+			*caller = (int)__best_copy_from_user_final - BAR;
+			return(__best_copy_from_user_final(to, from, n));
+		}
+	} else {
+		__copy_user_zeroing(to,from,n);
+		return n;
+	}
+}
+
+static unsigned long
+__best_copy_from_user_final(void *to, const void *from, unsigned long n)
+{
+	__copy_user_zeroing(to,from,n);
+	return n;
+}
+
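
A note on the dispatch trick used throughout this file: on x86 a near call is
encoded as 0xE8 followed by a 32-bit displacement, and that displacement is
relative to the address of the next instruction, which is exactly what
__builtin_return_address(0) returns.  So "(int *)BAR - 1" points at the
displacement field of the call instruction that got us here, and writing
"target - BAR" into it makes every later call from that site go straight to
the selected routine, with no further dispatch cost.  A small sketch of the
arithmetic only, with made-up addresses; nothing is actually patched here:

    #include <stdio.h>

    int main(void)
    {
            unsigned long ret_addr   = 0xc0123456;  /* plays the role of BAR */
            unsigned long new_target = 0xc0200000;  /* e.g. kni_memcpy       */
            int *rel32   = (int *)(ret_addr - 4);   /* == (int *)BAR - 1     */
            int new_disp = (int)(new_target - ret_addr);

            printf("would store %#x at %p\n", (unsigned int)new_disp, (void *)rel32);
            return 0;
    }
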
Index: oldkernel/linux/arch/i386/lib/simd.c
diff -u /dev/null linux/arch/i386/lib/simd.c:1.1
--- /dev/null	Mon Jul 31 21:12:24 2000
+++ linux/arch/i386/lib/simd.c	Thu Jun  1 15:05:19 2000
@@ -0,0 +1,435 @@
+/*
+ * SIMD functions.  These replace the functions in asm-i386/string.h
+ * whenever it makes sense.  These also un-inline those functions.
+ *
+ * Copyright 1999, Doug Ledford <dledford@redhat.com>
+ *
+ * These functions are simple and trivial, consider them to be
+ * public domain
+ */
+
+#include <linux/string.h>
+#include <linux/sched.h>
+#include <asm/processor.h>
+#include <asm/uaccess.h>
+#include <asm/i387.h>
+
+extern void * kni_memcpy(void * to, const void * from, size_t n)
+{
+	unsigned long flags;
+	void *ret=to;
+	size_t size;
+	int recursive = 0;
+	char xmm_space[64];
+
+	/*
+	 * If the transfer is too small, then use the generic routine.
+	 */
+	if (n < 128) {
+		return(__memcpy(to, from, n));
+	}
+	kernel_take_fpu_kni(recursive,&xmm_space[0],NULL,flags);
+
+	/*
+	 * Align the destination on a 16byte boundary.
+	 * The source doesn't have to be aligned.
+	 */
+	if ( (unsigned long)to & 0xf ) {
+		size = 0x10 - ((unsigned long)to & 0xf);
+		__asm__ __volatile__("movups (%0),%%xmm0\n\t"
+				     "movups %%xmm0,(%1)\n\t"
+				     :
+				     : "r" (from),
+				       "r" (to));
+		n -= size;
+		from += size;
+		to += size;
+	}
+	/*
+	 * If the copy would have tailings, take care of them
+	 * now instead of later
+	 */
+	if(n & 0xf) {
+		size = n - 0x10;
+		__asm__ __volatile__("movups (%0),%%xmm0\n\t"
+				     "movups %%xmm0,(%1)\n\t"
+				     :
+				     : "r" (from + size),
+				       "r" (to + size));
+		n &= ~0xf;
+	}
+	/*
+	 * Prefetch the first two cachelines now.
+	 */
+	__asm__ __volatile__("prefetchnta 0x00(%0)\n\t"
+			     "prefetchnta 0x20(%0)\n\t"
+			     :
+			     : "r" (from));
+	/*
+	 * Copy 32 bytes at a time.  The single unroll is good
+	 * for a 30% performance boost in the copy.  Additional
+	 * unrolls are not productive.  We are guaranteed to
+	 * have at least 32 bytes of data to copy since the
+	 * macro in string.h doesn't call into this function
+	 * with less than 64 bytes of copy and we lost < 32
+	 * bytes to alignment earlier.
+	 */
+	while (n >= 0x20) {
+		__asm__ __volatile__(
+				     "movups 0x00(%0),%%xmm0\n\t"
+				     "movups 0x10(%0),%%xmm1\n\t"
+				     "movntps %%xmm0,0x00(%1)\n\t"
+				     "movntps %%xmm1,0x10(%1)\n\t"
+				     : 
+				     : "r" (from), "r" (to)
+				     : "memory");
+		from += 0x20;
+		/*
+		 * Note: Intermixing the prefetch at *exactly* this point
+		 * in time has been shown to be the fastest possible.
+		 * Timing these prefetch instructions is a complete black
+		 * art with nothing but trial and error showing the way.
+		 * To that extent, this optimum version was found by using
+		 * a userland version of this routine that we clocked for
+		 * lots of runs.  We then fiddled with ordering until we
+		 * settled on our highest speed routines.  So, the long
+		 * and short of this is, don't mess with instruction ordering
+		 * here or suffer performance penalties you will.
+		 */
+		__asm__ __volatile__(
+				     "prefetchnta 0x20(%0)\n\t"
+				     : 
+				     : "r" (from));
+		to += 0x20;
+		n -= 0x20;
+	}
+	if (n) {
+		__asm__ __volatile__("movups 0x00(%0),%%xmm0\n\t"
+				     "movntps %%xmm0,0x00(%1)\n\t"
+				     : 
+				     : "r" (from), "r" (to)
+				     : "memory");
+	}
+	SFENCE();
+	kernel_release_fpu_kni(recursive,&xmm_space[0],flags);
+	return(ret);
+}
+
+extern void * kni_memset(void * s, char c, size_t count)
+{
+	unsigned long flags;
+	size_t size;
+	void *ret=s;
+	int recursive = 0;
+	char xmm_space[64];
+
+	/*
+	 * If the transfer is too small, then use the generic routine.
+	 */
+	if (count < 128) {
+		return(__memset_generic(s, c, count));
+	}
+	kernel_take_fpu_kni(recursive,&xmm_space[0],NULL,flags);
+	/*
+	 * Load up our XMM register with the stuff to set mem with
+	 */
+	if(c == '\0') {
+		__asm__ __volatile__("xorps %%xmm0,%%xmm0\n\t"
+				     "movups %%xmm0,(%0)\n\t"
+				     :
+				     : "r" (s));
+	} else {
+		__memset_generic(s, c, 0x10);
+		__asm__ __volatile__("movups (%0),%%xmm0"
+				     :
+				     : "r" (s));
+	}
+	/*
+	 * Align the destination on a 16 byte boundary; we can simply
+	 * do the math to align things since we already populated the
+	 * first 16 bytes.
+	 */
+	size = (0x10 - ((unsigned long)s & 0xf));
+	count -= size;
+	s += size;
+	/*
+	 * On the off chance we have tailings due to alignment issues,
+	 * do them now to make later more efficient
+	 */
+	if(count & 0xf) {
+		__asm__ __volatile__("movups %%xmm0,(%0)"
+				     :
+				     : "r" (s + (count - 0x10))
+				     : "memory");
+		count &= ~0xf;
+	}
+	/*
+	 * Do the copy by plopping out the register to memory.
+	 * Note: Unrolling this was *totally* unproductive.  My benchmark
+	 * showed that one or two plops per iteration produced the same
+	 * speed to within .06 MByte/s of speed.  Considering that the
+	 * routine benchmarked at over 3000 MByte/s, .06 is not statistically
+	 * significant, and doing only one plop per loop simplifies the
+	 * bookkeeping overhead.
+	 */
+	while(count) {
+		__asm__ __volatile__("movntps %%xmm0,0x00(%0)\n\t"
+				     :
+				     : "r" (s));
+		s += 0x10;
+		count -= 0x10;
+	}
+	SFENCE();
+	kernel_release_fpu_kni(recursive,&xmm_space[0],flags);
+	return(ret);
+}
+
+#define __kni_copy_to_user(to,from,size)				\
+do {									\
+	int __d0, __d1, tmp, tmp2;					\
+	__asm__ __volatile__(						\
+		"	movl %1,%4\n"					\
+		"	andl $0xf,%4\n"					\
+		"	movups (%2),%%xmm0\n"				\
+		"1:	movups %%xmm0,(%1)\n"				\
+		"	movl $0x10,%3\n"				\
+		"	subl %4,%3\n"					\
+		"	addl %3,%2\n"					\
+		"	addl %3,%1\n"					\
+		"	subl %3,%0\n"					\
+		"	prefetchnta 0x00(%2)\n"				\
+		"	prefetchnta 0x20(%2)\n"				\
+		"	jmp 200f\n"					\
+		"100:	movups 0x00(%2),%%xmm0\n"			\
+		"	movups 0x10(%2),%%xmm1\n"			\
+		"2:	movntps %%xmm0,0x00(%1)\n"			\
+		"3:	movntps %%xmm1,0x10(%1)\n"			\
+		"	addl $0x20,%2\n"				\
+		"	prefetchnta 0x20(%2)\n"				\
+		"	addl $0x20,%1\n"				\
+		"	subl $0x20,%0\n"				\
+		"200:	cmpl $0x1f,%0\n"				\
+		"	ja 100b\n"					\
+		"	cmpl $0xf,%0\n"					\
+		"	jbe 300f\n"					\
+		"	movups 0x00(%2),%%xmm0\n"			\
+		"4:	movntps %%xmm0,0x00(%1)\n"			\
+		"	addl $0x10,%2\n"				\
+		"	addl $0x10,%1\n"				\
+		"	subl $0x10,%0\n"				\
+		"300:	testl %0,%0\n"					\
+		"	je 400f\n"					\
+		"	movl $0x10,%3\n"				\
+		"	subl %0,%3\n"					\
+		"	subl %3,%1\n"					\
+		"	subl %3,%2\n"					\
+		"	movups 0x00(%2),%%xmm0\n"			\
+		"5:	movups %%xmm0,0x00(%1)\n"			\
+		"	addl $0x10,%2\n"				\
+		"	addl $0x10,%1\n"				\
+		"	xorl %0,%0\n"					\
+		"400:\n"						\
+		".section .fixup,\"ax\"\n"				\
+		"6:	jmp 400b\n"					\
+		"7:	addl $0x10,%1\n"				\
+		"	addl $0x10,%2\n"				\
+		"	subl $0x10,%0\n"				\
+		"	jmp 400b\n"					\
+		"8:	addl %3,%1\n"					\
+		"	addl %3,%2\n"					\
+		"	jmp 400b\n"					\
+		".previous\n"						\
+		".section __ex_table,\"a\"\n"				\
+		"	.align 4\n"					\
+		"	.long 1b,6b\n"					\
+		"	.long 2b,6b\n"					\
+		"	.long 3b,7b\n"					\
+		"	.long 4b,6b\n"					\
+		"	.long 5b,8b\n"					\
+		".previous"						\
+		: "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(tmp),	\
+		  "=r"(tmp2)						\
+		: "0"(size), "1"(to), "2"(from)				\
+		: "memory");						\
+} while (0)
+
+#define __kni_copy_from_user(to,from,size)				\
+do {									\
+	int __d0, __d1, tmp, tmp2;					\
+	__asm__ __volatile__(						\
+		"	movl %1,%4\n"					\
+		"	andl $0xf,%4\n"					\
+		"1:	movups (%2),%%xmm0\n"				\
+		"	movups %%xmm0,(%1)\n"				\
+		"	movl $0x10,%3\n"				\
+		"	subl %4,%3\n"					\
+		"	addl %3,%2\n"					\
+		"	addl %3,%1\n"					\
+		"	subl %3,%0\n"					\
+		"	prefetchnta 0x00(%2)\n"				\
+		"	prefetchnta 0x20(%2)\n"				\
+		"	jmp 100f\n"					\
+		"2:	movups 0x00(%2),%%xmm0\n"			\
+		"3:	movups 0x10(%2),%%xmm1\n"			\
+		"	movntps %%xmm0,0x00(%1)\n"			\
+		"	movntps %%xmm1,0x10(%1)\n"			\
+		"	addl $0x20,%2\n"				\
+		"	prefetchnta 0x20(%2)\n"				\
+		"	addl $0x20,%1\n"				\
+		"	subl $0x20,%0\n"				\
+		"100:	cmpl $0x1f,%0\n"				\
+		"	ja 2b\n"					\
+		"	cmpl $0xf,%0\n"					\
+		"	jbe 200f\n"					\
+		"4:	movups 0x00(%2),%%xmm0\n"			\
+		"	movntps %%xmm0,0x00(%1)\n"			\
+		"	addl $0x10,%2\n"				\
+		"	addl $0x10,%1\n"				\
+		"	subl $0x10,%0\n"				\
+		"200:	testl %0,%0\n"					\
+		"	je 300f\n"					\
+		"	movl $0x10,%3\n"				\
+		"	subl %0,%3\n"					\
+		"	subl %3,%1\n"					\
+		"	subl %3,%2\n"					\
+		"5:	movups 0x00(%2),%%xmm0\n"			\
+		"	movups %%xmm0,0x00(%1)\n"			\
+		"	addl $0x10,%2\n"				\
+		"	addl $0x10,%1\n"				\
+		"	xorl %0,%0\n"					\
+		"300:\n"						\
+		".section .fixup,\"ax\"\n"				\
+		"6:	xorps %%xmm0,%%xmm0\n"				\
+		"	movups %%xmm0,(%1)\n"				\
+		"	movl $0x10,%3\n"				\
+		"	subl %4,%3\n"					\
+		"	addl %3,%1\n"					\
+		"	movl %3,%4\n"					\
+		"	movl %0,%3\n"					\
+		"	subl %4,%3\n"					\
+		"	jmp 600f\n"					\
+		"7:	subl $0x10,%0\n"				\
+		"	addl $0x10,%1\n"				\
+		"400:	movl %0,%3\n"					\
+		"	xorps %%xmm0,%%xmm0\n"				\
+		"	jmp 600f\n"					\
+		"500:	movntps %%xmm0,0x00(%1)\n"			\
+		"	movntps %%xmm0,0x10(%1)\n"			\
+		"	addl $0x20,%1\n"				\
+		"	subl $0x20,%3\n"				\
+		"600:	cmpl $0x1f,%3\n"				\
+		"	ja 500b\n"					\
+		"	cmpl $0xf,%3\n"					\
+		"	jbe 700f\n"					\
+		"	movntps %%xmm0,0x00(%1)\n"			\
+		"	addl $0x10,%1\n"				\
+		"	subl $0x10,%3\n"				\
+		"700:	testl %3,%3\n"					\
+		"	je 300b\n"					\
+		"	xorl %4,%4\n"					\
+		"	movb %4,(%1)\n"					\
+		"	inc %1\n"					\
+		"	dec %3\n"					\
+		"	jmp 700b\n"					\
+		"8:	addl %3,%1\n"					\
+		"	movl %0,%3\n"					\
+		"	jmp 700b\n"					\
+		".previous\n"						\
+		".section __ex_table,\"a\"\n"				\
+		"	.align 4\n"					\
+		"	.long 1b,6b\n"					\
+		"	.long 2b,400b\n"				\
+		"	.long 3b,7b\n"					\
+		"	.long 4b,400b\n"				\
+		"	.long 5b,8b\n"					\
+		".previous"						\
+		: "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(tmp),	\
+		  "=q"(tmp2)						\
+		: "0"(size), "1"(to), "2"(from)				\
+		: "memory");						\
+} while (0)
+
+
+unsigned long
+__kni_copy_to_user_nocheck(void *to, const void *from, unsigned long n)
+{
+	unsigned long flags;
+	int recursive = 0;
+	char xmm_space[64];
+	char xmm_reg_space[64]; /* in case we switch context */
+
+	if (n >= 128) {
+		kernel_take_fpu_kni(recursive,&xmm_space[0],&xmm_reg_space[0],flags);
+		__kni_copy_to_user(to,from,n);
+		SFENCE();
+		kernel_release_fpu_kni(recursive,&xmm_space[0],flags);
+	} else {
+		__copy_user(to,from,n);
+	}
+	return n;
+}
+
+unsigned long
+__kni_copy_from_user_nocheck(void *to, const void *from, unsigned long n)
+{
+	unsigned long flags;
+	int recursive = 0;
+	char xmm_space[64];
+	char xmm_reg_space[64]; /* in case we switch context */
+
+	if (n >= 128) {
+		kernel_take_fpu_kni(recursive,&xmm_space[0],&xmm_reg_space[0],flags);
+		__kni_copy_from_user(to,from,n);
+		SFENCE();
+		kernel_release_fpu_kni(recursive,&xmm_space[0],flags);
+	} else {
+		__copy_user_zeroing(to,from,n);
+	}
+	return n;
+}
+
+
+
+unsigned long
+kni_copy_to_user(void *to, const void *from, unsigned long n)
+{
+	unsigned long flags;
+	int recursive = 0;
+	char xmm_space[64];
+	char xmm_reg_space[64]; /* in case we switch context */
+
+	if (access_ok(VERIFY_WRITE, to, n)) {
+		if (n >= 128) {
+			kernel_take_fpu_kni(recursive,&xmm_space[0],&xmm_reg_space[0],flags);
+			__kni_copy_to_user(to,from,n);
+			SFENCE();
+			kernel_release_fpu_kni(recursive,&xmm_space[0],flags);
+		} else {
+			__copy_user(to,from,n);
+		}
+	}
+	return n;
+}
+
+unsigned long
+kni_copy_from_user(void *to, const void *from, unsigned long n)
+{
+	unsigned long flags;
+	int recursive = 0;
+	char xmm_space[64];
+	char xmm_reg_space[64]; /* in case we switch context */
+
+	if (access_ok(VERIFY_READ, from, n)) {
+		if (n >= 128) {
+			kernel_take_fpu_kni(recursive,&xmm_space[0],&xmm_reg_space[0],flags);
+			__kni_copy_from_user(to,from,n);
+			SFENCE();
+			kernel_release_fpu_kni(recursive,&xmm_space[0],flags);
+		} else {
+			__copy_user_zeroing(to,from,n);
+		}
+	}
+	return n;
+}
+
+
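
The comments in kni_memcpy() above mention a userland copy of the routine
that was timed over many runs to settle the prefetch ordering.  A rough
sketch of what such a harness looks like; the buffer size, run count and the
use of plain memcpy() as the stand-in are all made up for the example:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <time.h>

    #define BUF_SIZE (4 * 1024 * 1024)
    #define RUNS     100

    int main(void)
    {
            char *src = malloc(BUF_SIZE);
            char *dst = malloc(BUF_SIZE);
            struct timespec t0, t1;
            double secs;
            int i;

            if (!src || !dst)
                    return 1;
            memset(src, 0x5a, BUF_SIZE);

            clock_gettime(CLOCK_MONOTONIC, &t0);
            for (i = 0; i < RUNS; i++)
                    memcpy(dst, src, BUF_SIZE);   /* swap in the routine under test */
            clock_gettime(CLOCK_MONOTONIC, &t1);

            secs = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9;
            printf("%.1f MByte/s\n", ((double)BUF_SIZE * RUNS) / (1024 * 1024) / secs);
            free(src);
            free(dst);
            return 0;
    }
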
Index: oldkernel/linux/arch/i386/mm/init.c
diff -u linux/arch/i386/mm/init.c:1.1.1.1 linux/arch/i386/mm/init.c:1.2
--- linux/arch/i386/mm/init.c:1.1.1.1	Wed May 31 12:33:53 2000
+++ linux/arch/i386/mm/init.c	Thu Jun  1 15:05:19 2000
@@ -184,34 +184,6 @@
 extern char _text, _etext, _edata, __bss_start, _end;
 extern char __init_begin, __init_end;
 
-#define X86_CR4_VME		0x0001		/* enable vm86 extensions */
-#define X86_CR4_PVI		0x0002		/* virtual interrupts flag enable */
-#define X86_CR4_TSD		0x0004		/* disable time stamp at ipl 3 */
-#define X86_CR4_DE		0x0008		/* enable debugging extensions */
-#define X86_CR4_PSE		0x0010		/* enable page size extensions */
-#define X86_CR4_PAE		0x0020		/* enable physical address extensions */
-#define X86_CR4_MCE		0x0040		/* Machine check enable */
-#define X86_CR4_PGE		0x0080		/* enable global pages */
-#define X86_CR4_PCE		0x0100		/* enable performance counters at ipl 3 */
-
-/*
- * Save the cr4 feature set we're using (ie
- * Pentium 4MB enable and PPro Global page
- * enable), so that any CPU's that boot up
- * after us can get the correct flags.
- */
-unsigned long mmu_cr4_features __initdata = 0;
-
-static inline void set_in_cr4(unsigned long mask)
-{
-	mmu_cr4_features |= mask;
-	__asm__("movl %%cr4,%%eax\n\t"
-		"orl %0,%%eax\n\t"
-		"movl %%eax,%%cr4\n"
-		: : "irg" (mask)
-		:"ax");
-}
-
 /*
  * allocate page table(s) for compile-time fixed mappings
  */
Index: oldkernel/linux/include/asm-i386/bugs.h
diff -u linux/include/asm-i386/bugs.h:1.1.1.1 linux/include/asm-i386/bugs.h:1.2
--- linux/include/asm-i386/bugs.h:1.1.1.1	Wed May 31 12:33:49 2000
+++ linux/include/asm-i386/bugs.h	Thu Jun  1 15:05:19 2000
@@ -18,6 +18,7 @@
  */
 
 #include <linux/config.h>
+#include <linux/stddef.h>
 #include <asm/processor.h>
 #include <asm/msr.h>
 
@@ -69,6 +70,45 @@
 #endif
 		return;
 	}
+#ifdef CONFIG_X86_FX
+	/*
+	 * If we got this far we can safely turn on FXSAVE/FXRSTOR,
+	 * but make sure we are 16-byte aligned first.
+	 */
+	if (offsetof(struct task_struct, tss.i387.hard.fxsave.fxcwd) & 15) {
+		/*
+		 * This triggers a link-time error if we manage to
+		 * break alignment somehow.
+		 */
+		extern void __buggy_fxsr_alignment(void);
+
+		__buggy_fxsr_alignment();
+	}
+	if (boot_cpu_data.x86_capability & X86_FEATURE_FXSR) {
+		printk("Enabling extended fast FPU save and restore...");
+		set_in_cr4(X86_CR4_OSFXSR);
+		printk("done.\n");
+	}
+	/*
+	 * Note, Katmai instructions are enabled as soon as you start
+	 * using the FXSAVE/RESTORE stuff.  This setting only
+	 * indicates support for the masked/unmasked exceptions on
+	 * the new PIII cpus.  We don't have an exception 19 handler
+	 * for this yet, so the bit is left off below.  Taking an
+	 * unmasked KNI exception would kill us, but since no userland
+	 * apps currently use KNI, it isn't an issue yet.  We should
+	 * have the handler added by then.
+	 */
+	if (boot_cpu_data.x86_capability & X86_FEATURE_XMM) {
+		printk("Not enabling KNI unmasked exception support\n");
+		printk("Exception 19 error handler not integrated yet\n");
+#if 0
+		set_in_cr4(X86_CR4_OSXMMEXCPT);
+		printk("done.\n");
+#endif
+	}
+#endif
+	disable_serial_nr();
 	if (mca_pentium_flag) {
 		/* The IBM Model 95 machines with pentiums lock up on
 		 * fpu test, so we avoid it. All pentiums have inbuilt
@@ -117,23 +157,23 @@
 		return;
 	if (!ignore_irq13) {
 		printk("OK, FPU using old IRQ 13 error reporting\n");
-		return;
+	} else {
+		__asm__("fninit\n\t"
+			"fldl %1\n\t"
+			"fdivl %2\n\t"
+			"fmull %2\n\t"
+			"fldl %1\n\t"
+			"fsubp %%st,%%st(1)\n\t"
+			"fistpl %0\n\t"
+			"fwait\n\t"
+			"fninit"
+			: "=m" (*&boot_cpu_data.fdiv_bug)
+			: "m" (*&x), "m" (*&y));
+		if (!boot_cpu_data.fdiv_bug)
+			printk("OK, FPU using exception 16 error reporting.\n");
+		else
+			printk("Hmm, FPU using exception 16 error reporting with FDIV bug.\n");
 	}
-	__asm__("fninit\n\t"
-		"fldl %1\n\t"
-		"fdivl %2\n\t"
-		"fmull %2\n\t"
-		"fldl %1\n\t"
-		"fsubp %%st,%%st(1)\n\t"
-		"fistpl %0\n\t"
-		"fwait\n\t"
-		"fninit"
-		: "=m" (*&boot_cpu_data.fdiv_bug)
-		: "m" (*&x), "m" (*&y));
-	if (!boot_cpu_data.fdiv_bug)
-		printk("OK, FPU using exception 16 error reporting.\n");
-	else
-		printk("Hmm, FPU using exception 16 error reporting with FDIV bug.\n");
 }
 
 __initfunc(static void check_hlt(void))
@@ -419,5 +459,7 @@
 	check_amd_k6();
 	check_pentium_f00f();
 	check_cyrix_coma();
+	boot_cpu_data.enable_fixups = 1; /* should be safe to use MMX/MMX2 */
+					 /* kernel functions now */
 	system_utsname.machine[1] = '0' + boot_cpu_data.x86;
 }
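
The alignment check added above is a link-time assertion: the reference to
the undefined __buggy_fxsr_alignment() only survives if the offsetof()
expression is non-zero, and since that expression is a compile-time constant,
the optimizer throws the dead call away in the good case, so the build only
fails when the struct layout really is wrong (this relies on the kernel
always being compiled with -O).  The same trick in generic form, with a
made-up name:

    /* Never defined anywhere; referencing it is the "assertion failed" path. */
    extern void __this_should_never_link(void);

    #define BUILD_CHECK_ZERO(cond)                          \
            do {                                            \
                    if (cond)                               \
                            __this_should_never_link();     \
            } while (0)

    struct aligned_blob {
            char data[16];
    };

    static inline void check_blob_layout(void)
    {
            /* Break the link if the structure ever stops being 16 bytes. */
            BUILD_CHECK_ZERO(sizeof(struct aligned_blob) & 15);
    }

As with the check above, the macro only has any effect if the function
containing it is actually called during boot.
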
Index: oldkernel/linux/include/asm-i386/i387.h
diff -u /dev/null linux/include/asm-i386/i387.h:1.1
--- /dev/null	Mon Jul 31 21:12:25 2000
+++ linux/include/asm-i386/i387.h	Thu Jun  1 15:05:19 2000
@@ -0,0 +1,313 @@
+/*
+ * include/asm-i386/i387.h
+ *
+ * Copyright (c) 1999 Doug Ledford <dledford@redhat.com>
+ *
+ * Made from various code bits pulled from other files
+ * in order to put things together in a way that made
+ * sense.
+ *
+ * FX/FPU support:
+ * Copyright (c) 1999 Ingo Molnar <mingo@redhat.com>,
+ *                   Gabriel Paubert <paubert@iram.es>
+ */
+
+#ifndef __ASM_I386_I387_H
+#define __ASM_I386_I387_H
+
+extern int i387_hard_to_user ( struct user_i387_struct * user,
+	union i387_hard_union * hard);
+extern int i387_user_to_hard ( union i387_hard_union * hard,
+	struct user_i387_struct * user);
+
+/*
+ * Fill out the reserved bits, treat it as an fsave struct since the
+ * union makes this work for both fsave and fxsave structs.
+ */
+#ifdef CONFIG_X86_FX
+
+#define i387_save_hard(x) \
+do { \
+	if (boot_cpu_data.x86_capability & X86_FEATURE_FXSR) { \
+		__asm__ __volatile__("fxsave %0" \
+				     : "=m" ((x).hard.fxsave.fxcwd)); \
+	} else { \
+		__asm__ __volatile__("fnsave %0; fwait;" \
+				     : "=m" ((x).hard.fsave.cwd)); \
+	} \
+} while(0)
+
+#define i387_restore_hard(x) \
+do { \
+	if (boot_cpu_data.x86_capability & X86_FEATURE_FXSR) { \
+                __asm__ __volatile__("fxrstor %0" \
+				     : \
+				     : "m" ((x).hard.fxsave.fxcwd)); \
+	} else { \
+		__asm__ __volatile__("frstor %0" \
+				     : \
+				     :"m" ((x).hard.fsave.cwd)); \
+	} \
+} while(0)
+
+#define i387_set_cwd(x,v) \
+do { \
+	if (boot_cpu_data.x86_capability & X86_FEATURE_FXSR) { \
+		(x).fxsave.fxcwd = (short)(v); \
+	} else { \
+		(x).fsave.cwd = ((long)(v) | 0xffff0000); \
+	} \
+} while(0)
+
+#define i387_set_swd(x,v) \
+do { \
+	if (boot_cpu_data.x86_capability & X86_FEATURE_FXSR) { \
+		(x).fxsave.fxswd = (short)(v); \
+	} else { \
+		(x).fsave.swd = ((long)(v) | 0xffff0000); \
+	} \
+} while(0)
+
+#define i387_set_twd(x,v) \
+do { \
+	if (boot_cpu_data.x86_capability & X86_FEATURE_FXSR) { \
+		(x).fxsave.fxtwd = (short)(v); \
+	} else { \
+		(x).fsave.twd = ((long)(v) | 0xffff0000); \
+	} \
+} while(0)
+
+static inline unsigned short fputag_KNI_to_387(unsigned char tb) {
+	unsigned short tw = tb;
+	tw = (tw | (tw << 4)) & 0x0f0f; /* zzzz7654zzzz3210 */
+	tw = (tw | (tw << 2)) & 0x3333; /* zz76zz54zz32zz10 */
+	tw = (tw | (tw << 1)) & 0x5555; /* z7z6z5z4z3z2z1z0 */
+	tw = ~(tw * 3);
+	return tw;
+}
+
+static inline unsigned char fputag_387_to_KNI(unsigned short tw) {
+	tw = ~tw & 0x5555;		/* z7z6z5z4z3z2z1z0 */
+	tw = (tw | (tw >> 1)) & 0x3333;	/* zz76zz54zz32zz10 */
+	tw = (tw | (tw >> 2)) & 0x0f0f;	/* zzzz7654zzzz3210 */
+	tw = (tw | (tw >> 4)) & 0x00ff; /* zzzzzzzz76543210 */
+	return tw;
+}
+
+#else /* CONFIG_X86_FX */
+
+#define i387_save_hard(x) \
+do { \
+	__asm__ __volatile__("fnsave %0; fwait;" \
+			     : "=m" ((x).hard.fsave.cwd)); \
+} while(0)
+
+#define i387_restore_hard(x) \
+do { \
+	__asm__ __volatile__("frstor %0" \
+			     : \
+			     :"m" ((x).hard.fsave.cwd)); \
+} while(0)
+
+#define i387_set_cwd(x,v) \
+do { (x).fsave.cwd = ((long)(v) | 0xffff0000); } while(0)
+
+#define i387_set_swd(x,v) \
+do { (x).fsave.swd = ((long)(v) | 0xffff0000); } while(0)
+
+#define i387_set_twd(x,v) \
+do { (x).fsave.twd = ((long)(v) | 0xffff0000); } while(0)
+
+#endif /* CONFIG_X86_FX */
+
+/*
+ * FPU lazy state save handling..
+ */
+#define save_kern_fpu(tsk) do { \
+	if(tsk->tss.mmx_reg_space != NULL) \
+		__asm__("movq %%mm0, 0x00(%0)\n\t" \
+			"movq %%mm1, 0x08(%0)\n\t" \
+			"movq %%mm2, 0x10(%0)\n\t" \
+			"movq %%mm3, 0x18(%0)\n\t" \
+			:: "r" (tsk->tss.mmx_reg_space):"memory"); \
+	if(tsk->tss.kni_reg_space != NULL) \
+		__asm__("movups %%xmm0, 0x00(%0)\n\t" \
+			"movups %%xmm1, 0x10(%0)\n\t" \
+			"movups %%xmm2, 0x20(%0)\n\t" \
+			"movups %%xmm3, 0x30(%0)\n\t" \
+			:: "r" (tsk->tss.kni_reg_space):"memory"); \
+} while (0)
+
+#define unlazy_fpu(tsk) do { \
+	if (tsk->tss.x86_fpustate & X86_FPUSTATE_KERN_ANY) { \
+		save_kern_fpu(tsk); \
+		if (!(tsk->flags & PF_USEDFPU)) { \
+			stts(); \
+		} \
+	} \
+	if (tsk->flags & PF_USEDFPU) { \
+		if (!(tsk->tss.x86_fpustate & X86_FPUSTATE_USER_SAVED)) { \
+			i387_save_hard(tsk->tss.i387); \
+		} \
+		tsk->flags &= ~PF_USEDFPU; \
+		stts(); \
+	} \
+} while (0)
+
+#define clear_fpu(tsk) do { \
+	if ( (tsk->flags & PF_USEDFPU) || \
+	     (tsk->tss.x86_fpustate) ) { \
+		tsk->flags &= ~PF_USEDFPU; \
+		tsk->tss.x86_fpustate = 0; \
+		stts(); \
+	} \
+} while (0)
+
+/*
+ * For when we want to use the FPU in kernel code
+ * 
+ * These functions allow the use of up to 4 KNI based xmm registers on the
+ * Pentium III processors or up to 4 MMX registers on Pentium MMX and above
+ * or compatible processors.  Pick the routines that you need based on the
+ * regs you are going to use.  Keep in mind that these are intended to be
+ * used only after you've verified that the processor supports these
+ * operations.  Use them before you've done that and watch your machine go
+ * boom.  Take a look in arch/i386/lib/best_function.c for an example of
+ * how to fixup the kernel with kni/mmx using functions once the CPU
+ * capabilities have been determined.
+ *
+ * In all of these functions:
+ *
+ *   recursive - int, used to determine what the state is at restore time
+ *   regs - char * to an array that is 32 bytes for mmx and 64 bytes for kni
+ *          which is then used to save off the contents of the current
+ *          regs to be recursively safe
+ *   task_switch_regs - char * to another array of the same size as the one
+ *          above, but this array is optional.  If your function might get 
+ *          pre-empted by another task then this pointer should be non-NULL
+ *          so that at unlazy_fpu() time in the switch_to() function we
+ *          can save your register state (copy_*_user functions are an example
+ *          of functions that need this, since they can take a page fault and
+ *          while that fault is being serviced the scheduler is free to run
+ *          another task entirely).
+ *   irqflags - unsigned long used to store IRQ state
+ */
+
+#define SAVE_MMX_REGS(regs) \
+	__asm__ __volatile__("movq %%mm0, 0x00(%0)\n\t" \
+			     "movq %%mm1, 0x08(%0)\n\t" \
+			     "movq %%mm2, 0x10(%0)\n\t" \
+			     "movq %%mm3, 0x18(%0)\n\t" \
+			     : : "r" ((regs)) : "memory" );
+
+#define RESTORE_MMX_REGS(regs) \
+	__asm__ __volatile__("movq 0x00(%0), %%mm0\n\t" \
+			     "movq 0x08(%0), %%mm1\n\t" \
+			     "movq 0x10(%0), %%mm2\n\t" \
+			     "movq 0x18(%0), %%mm3\n\t" \
+			     : : "r" ((regs)));
+
+#define SAVE_KNI_REGS(regs) \
+	__asm__ __volatile__("movups %%xmm0, 0x00(%0)\n\t" \
+			     "movups %%xmm1, 0x10(%0)\n\t" \
+			     "movups %%xmm2, 0x20(%0)\n\t" \
+			     "movups %%xmm3, 0x30(%0)\n\t" \
+			     : : "r" ((regs)) : "memory" );
+
+#define RESTORE_KNI_REGS(regs) \
+	__asm__ __volatile__("movups 0x00(%0), %%xmm0\n\t" \
+			     "movups 0x10(%0), %%xmm1\n\t" \
+			     "movups 0x20(%0), %%xmm2\n\t" \
+			     "movups 0x30(%0), %%xmm3\n\t" \
+			     : : "r" ((regs)));
+
+#define SFENCE() \
+	__asm__ __volatile__("sfence":::"memory")
+
+
+extern spinlock_t kern_fpu_lock;
+
+/*
+ * Although it seems wasteful to do a unilateral clts() in the take_fpu
+ * functions, the reason I did it that way is because the alternative is
+ * to test for:
+ *
+ * if ( ( (current->flags & PF_USEDFPU) &&
+ *        (current->tss.x86_fpustate & X86_FPUSTATE_USER_SAVED) ) ||
+ *      ( !(current->flags & PF_USEDFPU) &&
+ *        !(current->tss.x86_fpustate & X86_FPUSTATE_KERN_ANY) ) )
+ * which is more complicated than just doing the clts() unconditionally.
+ */
+
+#define kernel_take_fpu_mmx(recursive, regs, task_switch_regs, irqflags) do { \
+	spin_lock_irqsave(&kern_fpu_lock, (irqflags)); \
+	clts(); \
+	(recursive) = (current->tss.x86_fpustate & X86_FPUSTATE_KERN_ANY); \
+	if ( (current->flags & PF_USEDFPU) && \
+	    !(current->tss.x86_fpustate & X86_FPUSTATE_USER_SAVED) ){ \
+		i387_save_hard(current->tss.i387); \
+		current->tss.x86_fpustate |= X86_FPUSTATE_USER_SAVED; \
+	} \
+	if ((recursive) & X86_FPUSTATE_KERN_MMX) { \
+		SAVE_MMX_REGS((regs)); \
+	} else { \
+		current->tss.mmx_reg_space = (task_switch_regs); \
+		current->tss.x86_fpustate |= X86_FPUSTATE_KERN_MMX; \
+	} \
+	spin_unlock_irqrestore(&kern_fpu_lock, (irqflags)); \
+} while (0)
+
+#define kernel_release_fpu_mmx(recursive, regs, irqflags) do { \
+	spin_lock_irqsave(&kern_fpu_lock, (irqflags)); \
+	if ((recursive) & X86_FPUSTATE_KERN_MMX) { \
+		RESTORE_MMX_REGS((regs)); \
+	} else { \
+		current->tss.x86_fpustate &= ~X86_FPUSTATE_KERN_MMX; \
+		current->tss.mmx_reg_space = NULL; \
+	} \
+	if ((recursive) == 0) { \
+		stts(); \
+	} \
+	spin_unlock_irqrestore(&kern_fpu_lock, (irqflags)); \
+} while (0)
+
+#define kernel_take_fpu_kni(recursive, regs, task_switch_regs, irqflags) do { \
+	spin_lock_irqsave(&kern_fpu_lock, (irqflags)); \
+	clts(); \
+	(recursive) = current->tss.x86_fpustate; \
+	if ( (current->flags & PF_USEDFPU) || \
+	     (current->tss.x86_fpustate & X86_FPUSTATE_KERN_KNI) ) { \
+		SAVE_KNI_REGS((regs)); \
+	} \
+	if (!(current->tss.x86_fpustate & X86_FPUSTATE_KERN_KNI)) { \
+		current->tss.kni_reg_space = (task_switch_regs); \
+		current->tss.x86_fpustate |= X86_FPUSTATE_KERN_KNI; \
+	} \
+	spin_unlock_irqrestore(&kern_fpu_lock, (irqflags)); \
+} while (0)
+		
+	
+#define kernel_release_fpu_kni(recursive, regs, irqflags) do { \
+	spin_lock_irqsave(&kern_fpu_lock, (irqflags)); \
+	if ( (current->tss.x86_fpustate & X86_FPUSTATE_USER_SAVED) && \
+	     !(((recursive) & X86_FPUSTATE_USER_SAVED) && \
+		(current->flags & PF_USEDFPU)) ) { \
+		i387_restore_hard(current->tss.i387); \
+		current->tss.x86_fpustate &= ~X86_FPUSTATE_USER_SAVED; \
+	} \
+	if ( ((recursive) & X86_FPUSTATE_KERN_KNI) || \
+	     (current->flags & PF_USEDFPU) ) { \
+		RESTORE_KNI_REGS((regs)); \
+	} \
+	if (((recursive) & X86_FPUSTATE_KERN_KNI) == 0) { \
+		current->tss.x86_fpustate &= ~X86_FPUSTATE_KERN_KNI; \
+		current->tss.kni_reg_space = NULL; \
+	} \
+	if ( ((recursive) == 0) && ((current->flags & PF_USEDFPU) == 0) ) { \
+		stts(); \
+	} \
+	spin_unlock_irqrestore(&kern_fpu_lock, (irqflags)); \
+} while (0)
+
+
+#endif /* __ASM_I386_I387_H */
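
A minimal sketch of a kernel routine built on the KNI take/release pair
described above, modeled on kni_memset() in arch/i386/lib/simd.c.  The
zero_block() name is made up, the buffer is assumed to be 16-byte aligned
and a multiple of 16 bytes long, and (as the comment says) it must only be
reached after the XMM/OSFXSR capability checks, the way best_function.c
gates its callers:

    #include <linux/sched.h>
    #include <asm/i387.h>

    static void zero_block(void *p, unsigned long bytes)
    {
            char *q = p;
            unsigned long flags;
            int recursive = 0;
            char xmm_space[64];         /* regs saved if we are recursive  */
            char xmm_reg_space[64];     /* regs saved across a task switch */

            kernel_take_fpu_kni(recursive, &xmm_space[0], &xmm_reg_space[0], flags);
            __asm__ __volatile__("xorps %%xmm0,%%xmm0" : : );
            while (bytes) {
                    __asm__ __volatile__("movntps %%xmm0,(%0)"
                                         : : "r" (q) : "memory");
                    q += 16;
                    bytes -= 16;
            }
            SFENCE();
            kernel_release_fpu_kni(recursive, &xmm_space[0], flags);
    }
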
Index: oldkernel/linux/include/asm-i386/io.h
diff -u linux/include/asm-i386/io.h:1.1.1.1 linux/include/asm-i386/io.h:1.2
--- linux/include/asm-i386/io.h:1.1.1.1	Wed May 31 12:33:49 2000
+++ linux/include/asm-i386/io.h	Thu Jun  1 15:05:19 2000
@@ -157,9 +157,9 @@
 #define writew(b,addr) (*(volatile unsigned short *) __io_virt(addr) = (b))
 #define writel(b,addr) (*(volatile unsigned int *) __io_virt(addr) = (b))
 
-#define memset_io(a,b,c)	memset(__io_virt(a),(b),(c))
-#define memcpy_fromio(a,b,c)	memcpy((a),__io_virt(b),(c))
-#define memcpy_toio(a,b,c)	memcpy(__io_virt(a),(b),(c))
+#define memset_io(a,b,c)	__memset_generic(__io_virt(a),(b),(c))
+#define memcpy_fromio(a,b,c)	__memcpy((a),__io_virt(b),(c))
+#define memcpy_toio(a,b,c)	__memcpy(__io_virt(a),(b),(c))
 
 /*
  * Again, i386 does not require mem IO specific function.
Index: oldkernel/linux/include/asm-i386/processor.h
diff -u linux/include/asm-i386/processor.h:1.1.1.1 linux/include/asm-i386/processor.h:1.2
--- linux/include/asm-i386/processor.h:1.1.1.1	Wed May 31 12:33:49 2000
+++ linux/include/asm-i386/processor.h	Thu Jun  1 15:05:19 2000
@@ -7,10 +7,11 @@
 #ifndef __ASM_I386_PROCESSOR_H
 #define __ASM_I386_PROCESSOR_H
 
+#include <linux/config.h>
 #include <asm/vm86.h>
 #include <asm/math_emu.h>
-#include <asm/segment.h>
 #include <asm/page.h>
+#include <asm/user.h>
 
 /*
  *  CPU type and hardware bug flags. Kept separately for each CPU.
@@ -29,6 +30,7 @@
 	char	rfu;
 	int	cpuid_level;	/* Maximum supported CPUID level, -1=no CPUID */
 	__u32	x86_capability;
+	__u32	mmu_cr4_features;
 	char	x86_vendor_id[16];
 	char	x86_model_id[64];
 	int 	x86_cache_size;  /* in KB - valid for CPUS which support this
@@ -36,6 +38,7 @@
 	int	fdiv_bug;
 	int	f00f_bug;
 	int	coma_bug;
+	int	enable_fixups;
 	unsigned long loops_per_sec;
 	unsigned long *pgd_quick;
 	unsigned long *pte_quick;
@@ -70,16 +73,16 @@
 #define X86_FEATURE_PGE		0x00002000	/* Page Global Enable */
 #define X86_FEATURE_MCA		0x00004000	/* Machine Check Architecture */
 #define X86_FEATURE_CMOV	0x00008000	/* CMOV instruction (FCMOVCC and FCOMI too if FPU present) */
-#define X86_FEATURE_PAT	0x00010000	/* Page Attribute Table */
+#define X86_FEATURE_PAT		0x00010000	/* Page Attribute Table */
 #define X86_FEATURE_PSE36	0x00020000	/* 36-bit PSEs */
-#define X86_FEATURE_18		0x00040000
+#define X86_FEATURE_PN		0x00040000      /* 96 bit CPU serial # */
 #define X86_FEATURE_19		0x00080000
 #define X86_FEATURE_20		0x00100000
 #define X86_FEATURE_21		0x00200000
 #define X86_FEATURE_22		0x00400000
 #define X86_FEATURE_MMX		0x00800000	/* multimedia extensions */
 #define X86_FEATURE_FXSR	0x01000000	/* FXSAVE and FXRSTOR instructions (fast save and restore of FPU context), and CR4.OSFXSR (OS uses these instructions) available */
-#define X86_FEATURE_25		0x02000000
+#define X86_FEATURE_XMM		0x02000000	/* Intel MMX2 instruction set */
 #define X86_FEATURE_26		0x04000000
 #define X86_FEATURE_27		0x08000000
 #define X86_FEATURE_28		0x10000000
@@ -89,6 +92,82 @@
 
 extern struct cpuinfo_x86 boot_cpu_data;
 
+#define X86_CR4_VME		0x0001	/* enable vm86 extensions */
+#define X86_CR4_PVI		0x0002	/* virtual interrupts flag enable */
+#define X86_CR4_TSD		0x0004	/* disable time stamp at ipl 3 */
+#define X86_CR4_DE		0x0008	/* enable debugging extensions */
+#define X86_CR4_PSE		0x0010	/* enable page size extensions */
+#define X86_CR4_PAE		0x0020	/* enable physical address extensions */
+#define X86_CR4_MCE		0x0040	/* Machine check enable */
+#define X86_CR4_PGE		0x0080	/* enable global pages */
+#define X86_CR4_PCE		0x0100	/* enable performance counters at ipl 3 */
+#define X86_CR4_OSFXSR		0x0200	/* fast FPU save/restore */
+#define X86_CR4_OSXMMEXCPT	0x0400	/* KNI (MMX2) unmasked exception 16 */
+					/* handler is available */
+
+/*
+ * Some defines for use with the x86_fpustate variable in the new
+ * thread struct.  We use these because the rest of the kernel doesn't
+ * like us messing with current->flags at arbitrary times ;-)
+ */
+#define X86_FPUSTATE_USER_SAVED	0x0001
+#define X86_FPUSTATE_KERN_ANY	0x0006
+#define X86_FPUSTATE_KERN_MMX	0x0002
+#define X86_FPUSTATE_KERN_KNI	0x0004
+
+/*
+ * Save the cr4 feature set we're using (i.e.
+ * Pentium 4MB enable and PPro Global page
+ * enable), so that any CPUs that boot up
+ * after us can get the correct flags.
+ */
+
+static inline void set_in_cr4(unsigned long mask)
+{
+	boot_cpu_data.mmu_cr4_features |= mask;
+	__asm__("movl %%cr4,%%eax\n\t"
+		"orl %0,%%eax\n\t"
+		"movl %%eax,%%cr4\n"
+		: : "irg" (mask)
+		:"ax");
+}
+
+extern int disable_x86_serial_nr;
+
+static inline void disable_serial_nr(void)
+{
+	if ( disable_x86_serial_nr && 
+	    (boot_cpu_data.x86_capability & X86_FEATURE_PN) ) {
+		printk("Disabling CPUID Serial number...");
+		__asm__ __volatile__( "movl $0x119,%%ecx\n\t"
+				"rdmsr\n\t"
+				"orl $0x00200000,%%eax\n\t"
+				"wrmsr":::"ax","dx","cx","memory");
+		/*
+		 * We could re-read the CPUID capability bits here to
+		 * confirm that the PN feature really has been turned
+		 * off now that the serial number is disabled.
+		 *
+		 * In practice that is unnecessary: the disable sticks
+		 * until the machine is powered off, so after a warm
+		 * reboot CPUID already reports the feature as absent
+		 * and this code does nothing further.
+		 */
+		printk("done.\n");
+	}
+}
+
+static inline void load_default_mxcsr(void)
+{
+	long mxcsr = 0x1f80;
+
+	if ( (boot_cpu_data.mmu_cr4_features & X86_CR4_OSFXSR) &&
+	     (boot_cpu_data.x86_capability & X86_FEATURE_XMM) ) {
+		__asm__("ldmxcsr %0": :"m" (mxcsr));
+	}
+}
+
+
 #ifdef __SMP__
 extern struct cpuinfo_x86 cpu_data[];
 #define current_cpu_data cpu_data[smp_processor_id()]
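
The helpers added in the hunk above are intended to be called early in CPU
setup: set_in_cr4() records and enables a CR4 feature, load_default_mxcsr()
initialises MXCSR once CR4.OSFXSR is on, and disable_serial_nr() honours the
serial-number option.  A minimal sketch of that ordering follows; the
function name and the call site are assumptions for illustration only, not
part of this patch.

/*
 * Illustrative only -- not part of the patch.  Shows the intended order
 * of the helpers declared above during boot-time CPU setup.
 */
static void __init sketch_setup_cpu_features(void)
{
	/* Let the CPU use FXSAVE/FXRSTOR if it has them; set_in_cr4() also
	 * records the bit in boot_cpu_data.mmu_cr4_features for later CPUs. */
	if (boot_cpu_data.x86_capability & X86_FEATURE_FXSR)
		set_in_cr4(X86_CR4_OSFXSR);

	/* MXCSR is only touched when both CR4.OSFXSR and the XMM bit are set */
	load_default_mxcsr();

	/* Disable the PIII serial number if the boot option asked for it */
	disable_serial_nr();
}
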
@@ -170,37 +249,62 @@
  * Size of io_bitmap in longwords: 32 is ports 0-0x3ff.
  */
 #define IO_BITMAP_SIZE	32
+
+struct i387_hard_fsave {
+	long     cwd;
+	long     swd;
+	long     twd;
+	long     fip;
+	long     fcs;
+	long     foo;
+	long     fos;
+	long     st_space[20];     /* 8*10 bytes for each FP-reg = 80 bytes */
+};
 
-struct i387_hard_struct {
-	long	cwd;
-	long	swd;
-	long	twd;
-	long	fip;
-	long	fcs;
-	long	foo;
-	long	fos;
-	long	st_space[20];	/* 8*10 bytes for each FP-reg = 80 bytes */
-	long	status;		/* software status information */
+/*
+ * The FXSAVE image has to be 128-bit (16-byte) aligned.
+ */
+struct i387_hard_fxsave {
+	unsigned short fxcwd;
+	unsigned short fxswd;
+	unsigned short fxtwd;
+	unsigned short fxfopcode;
+	long     fxfip;
+	short    fxfcs;
+	short    __reserved_00;
+	long     fxfoo;
+	short    fxfos;
+	short    __reserved_01;
+	long     mxcsr;
+	long     __reserved_02;
+	long     st_space[32];     /* 8*16 bytes for each FP/MMX-reg = 128 bytes */
+	long     xmm_space[32];    /* 8*16 bytes for each XMM-reg = 128 bytes */
+	long     __reserved_03 [14*4]; /* 14 16byte lines for remainder */
+} __attribute__ ((aligned (16)));
+
+union i387_hard_union {
+	struct i387_hard_fxsave    fxsave;
+	struct i387_hard_fsave     fsave;
 };
 
 struct i387_soft_struct {
-	long	cwd;
-	long	swd;
-	long	twd;
-	long	fip;
-	long	fcs;
-	long	foo;
-	long	fos;
-	long	st_space[20];	/* 8*10 bytes for each FP-reg = 80 bytes */
-	unsigned char	ftop, changed, lookahead, no_update, rm, alimit;
-	struct info	*info;
-	unsigned long	entry_eip;
+	long     cwd;
+	long     swd;
+	long     twd;
+	long     fip;
+	long     fcs;
+	long     foo;
+	long     fos;
+	long     st_space[20];     /* 8*10 bytes for each FP-reg = 80 bytes */
+	unsigned char     ftop, changed, lookahead, no_update, rm, alimit;
+	struct info       *info;
+	unsigned long     entry_eip;
 };
 
 union i387_union {
-	struct i387_hard_struct hard;
+	union i387_hard_union hard;
 	struct i387_soft_struct soft;
-};
+} __attribute__ ((aligned(16)));
 
 typedef struct {
 	unsigned long seg;
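
Because i387_union now wraps two different hardware layouts, code that
touches the saved FPU state has to pick the right view at run time.  The
accessor below is a hypothetical illustration (only the field names come
from the structures above; it assumes the usual <linux/sched.h> context):

/*
 * Hypothetical accessor, for illustration only: read the FPU control
 * word from whichever save format this kernel/CPU combination uses.
 */
static inline unsigned short sketch_i387_cwd(struct task_struct *tsk)
{
	if (boot_cpu_data.x86_capability & X86_FEATURE_FXSR)
		return tsk->tss.i387.hard.fxsave.fxcwd;	/* 512-byte FXSAVE image */
	return (unsigned short) tsk->tss.i387.hard.fsave.cwd;	/* legacy FNSAVE image */
}
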
@@ -242,6 +346,10 @@
 	struct vm86_struct * vm86_info;
 	unsigned long screen_bitmap;
 	unsigned long v86flags, v86mask, v86mode, saved_esp0;
+	volatile long x86_fpustate;
+	char *mmx_reg_space;
+	char *kni_reg_space;
+
 };
 
 #define INIT_MMAP \
@@ -263,8 +371,9 @@
 	{~0, }, /* ioperm */					\
 	_TSS(0), 0, 0, 0, (mm_segment_t) { 0 }, /* obsolete */	\
 	{ 0, },							\
-	{ { 0, }, },  /* 387 state */				\
+	{ { { 0, }, }, },  /* 387 state */			\
 	NULL, 0, 0, 0, 0, 0, /* vm86_info */			\
+	0, NULL, NULL /* x86_fpustate, mmx_reg_space, kni_reg_space */	\
 }
 
 #define start_thread(regs, new_eip, new_esp) do {		\
@@ -289,27 +398,6 @@
 extern void copy_segments(int nr, struct task_struct *p, struct mm_struct * mm);
 extern void release_segments(struct mm_struct * mm);
 extern void forget_segments(void);
-
-/*
- * FPU lazy state save handling..
- */
-#define save_fpu(tsk) do { \
-	asm volatile("fnsave %0\n\tfwait":"=m" (tsk->tss.i387)); \
-	tsk->flags &= ~PF_USEDFPU; \
-	stts(); \
-} while (0)
-
-#define unlazy_fpu(tsk) do { \
-	if (tsk->flags & PF_USEDFPU) \
-		save_fpu(tsk); \
-} while (0)
-
-#define clear_fpu(tsk) do { \
-	if (tsk->flags & PF_USEDFPU) { \
-		tsk->flags &= ~PF_USEDFPU; \
-		stts(); \
-	} \
-} while (0)
 
 /*
  * Return saved PC of a blocked thread.
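
The lazy FPU macros removed in the hunk above used FNSAVE unconditionally;
with FXSAVE support the save has to pick the right instruction per CPU, so
that logic presumably moves out of this header (its new location is not
shown in this section).  The following is a sketch of the idea only, under
that assumption:

/*
 * Sketch only -- where this patch really implements the FXSAVE-aware
 * lazy save is not visible here.  Assemblers of this era may also need
 * the fxsave opcode spelled out as .byte directives.
 */
static inline void sketch_save_fpu(struct task_struct *tsk)
{
	if (boot_cpu_data.mmu_cr4_features & X86_CR4_OSFXSR)
		asm volatile("fxsave %0\n\tfwait"
			     : "=m" (tsk->tss.i387.hard.fxsave));
	else
		asm volatile("fnsave %0\n\tfwait"
			     : "=m" (tsk->tss.i387.hard.fsave));
	tsk->flags &= ~PF_USEDFPU;
	stts();	/* fault on the next FPU use so state is reloaded lazily */
}
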
Index: oldkernel/linux/include/asm-i386/string.h
diff -u linux/include/asm-i386/string.h:1.1.1.1 linux/include/asm-i386/string.h:1.2
--- linux/include/asm-i386/string.h:1.1.1.1	Wed May 31 12:33:49 2000
+++ linux/include/asm-i386/string.h	Thu Jun  1 15:05:19 2000
@@ -14,6 +14,10 @@
 #include <asm/string-486.h>
 #else
 
+#ifndef _LINUX_CONFIG_H
+#include <linux/config.h>
+#endif
+
 /*
  * This string-include defines all string functions as inline
  * functions. Use gcc. It also assumes ds=es=data space, this should be
@@ -293,10 +297,21 @@
 }
 
 #define __HAVE_ARCH_MEMCPY
+#ifdef CONFIG_X86_CPU_OPTIMIZATIONS
+extern void * __kni_memcpy(void * to, const void * from, size_t n);
+extern void * best_memcpy(void * to, const void * from, size_t n);
+#define memcpy(t, f, n) \
+(__builtin_constant_p(n) ? \
+ (((n) < 128) ? \
+ __constant_memcpy((t),(f),(n)) : \
+ best_memcpy((t),(f),(n))) : \
+ best_memcpy((t),(f),(n)))
+#else
 #define memcpy(t, f, n) \
 (__builtin_constant_p(n) ? \
  __constant_memcpy((t),(f),(n)) : \
  __memcpy((t),(f),(n)))
+#endif
 
 #define __HAVE_ARCH_MEMMOVE
 extern inline void * memmove(void * dest,const void * src, size_t n)
@@ -449,21 +464,32 @@
 #undef COMMON
 }
 
-#define __constant_c_x_memset(s, c, count) \
-(__builtin_constant_p(count) ? \
- __constant_c_and_count_memset((s),(c),(count)) : \
- __constant_c_memset((s),(c),(count)))
+#define __constant_x_count_memset(s, c, count) \
+(__builtin_constant_p(c) ? \
+ __constant_c_and_count_memset((s),(0x01010101UL*(unsigned char)(c)),(count)) :\
+ __constant_count_memset((s),(c),(count)))
 
 #define __memset(s, c, count) \
-(__builtin_constant_p(count) ? \
- __constant_count_memset((s),(c),(count)) : \
+(__builtin_constant_p(c) ? \
+ __constant_c_memset((s),(0x01010101UL*(unsigned char)(c)),(count)) : \
  __memset_generic((s),(c),(count)))
 
 #define __HAVE_ARCH_MEMSET
+#ifdef CONFIG_X86_CPU_OPTIMIZATIONS
+extern void * __kni_memset(void * s, char c, size_t count);
+extern void * best_memset(void * s, char c, size_t count);
 #define memset(s, c, count) \
-(__builtin_constant_p(c) ? \
- __constant_c_x_memset((s),(0x01010101UL*(unsigned char)(c)),(count)) : \
+(__builtin_constant_p(count) ? \
+ (((count) < 128) ? \
+ __constant_x_count_memset((s),(c),(count)) : \
+ best_memset((s),(c),(count))) : \
+ best_memset((s),(c),(count)))
+#else
+#define memset(s, c, count) \
+(__builtin_constant_p(count) ? \
+ __constant_x_count_memset((s),(c),(count)) : \
  __memset((s),(c),(count)))
+#endif
 
 /*
  * find the first occurrence of byte 'c', or 1 past the area if none
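
The new memcpy()/memset() macros above dispatch on two things: whether the
length is a compile-time constant, and whether a constant length is below
128 bytes.  The stand-alone program below demonstrates the same pattern
with invented names; it is not kernel code and only illustrates how the
macro resolves at each call site.

/*
 * Stand-alone demonstration (not kernel code) of the dispatch pattern
 * used by the memcpy() macro above: constant sizes below 128 bytes take
 * the "inline" path, everything else goes to the run-time selected
 * routine.  All names here are invented for the demo.
 */
#include <stdio.h>
#include <string.h>

static void *demo_inline_copy(void *to, const void *from, size_t n)
{
	puts("inline path (small constant size)");
	return memcpy(to, from, n);
}

static void *demo_best_copy(void *to, const void *from, size_t n)
{
	puts("run-time selected path");
	return memcpy(to, from, n);
}

#define demo_memcpy(t, f, n)				\
	(__builtin_constant_p(n) ?			\
	 (((n) < 128) ?					\
	  demo_inline_copy((t), (f), (n)) :		\
	  demo_best_copy((t), (f), (n))) :		\
	  demo_best_copy((t), (f), (n)))

int main(void)
{
	char src[256] = "hello", dst[256];
	size_t runtime_len = strlen(src) + 1;

	demo_memcpy(dst, src, 16);		/* constant, small: inline path */
	demo_memcpy(dst, src, 200);		/* constant, large: best path   */
	demo_memcpy(dst, src, runtime_len);	/* not a constant: best path    */
	return 0;
}

Built with gcc, the first call reports the inline path and the other two
the run-time path, mirroring how the kernel macro resolves.
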
Index: oldkernel/linux/include/asm-i386/uaccess.h
diff -u linux/include/asm-i386/uaccess.h:1.1.1.1 linux/include/asm-i386/uaccess.h:1.2
--- linux/include/asm-i386/uaccess.h:1.1.1.1	Wed May 31 12:33:49 2000
+++ linux/include/asm-i386/uaccess.h	Thu Jun  1 15:05:19 2000
@@ -571,20 +571,62 @@
 	return n;
 }
 
+#ifdef CONFIG_X86_CPU_OPTIMIZATIONS
+
+/*
+ * Declarations for the XMM-based copy_*_user() routines and for the
+ * best_*_user() wrappers used by the macros below.
+ */
+unsigned long kni_copy_to_user(void *, const void *, unsigned long);
+unsigned long kni_copy_from_user(void *, const void *, unsigned long);
+unsigned long __kni_copy_to_user_nocheck(void *, const void *, unsigned long);
+unsigned long __kni_copy_from_user_nocheck(void *, const void *, unsigned long);
+
+unsigned long best_copy_to_user(void *, const void *, unsigned long);
+unsigned long best_copy_from_user(void *, const void *, unsigned long);
+unsigned long __best_copy_to_user(void *, const void *, unsigned long);
+unsigned long __best_copy_from_user(void *, const void *, unsigned long);
+
 #define copy_to_user(to,from,n)				\
 	(__builtin_constant_p(n) ?			\
+	(((n) < 128) ? 					\
 	 __constant_copy_to_user((to),(from),(n)) :	\
-	 __generic_copy_to_user((to),(from),(n)))
+	 best_copy_to_user((to),(from),(n))) : 		\
+	 best_copy_to_user((to),(from),(n)))
 
 #define copy_from_user(to,from,n)			\
 	(__builtin_constant_p(n) ?			\
+	(((n) < 128) ? 					\
 	 __constant_copy_from_user((to),(from),(n)) :	\
-	 __generic_copy_from_user((to),(from),(n)))
+	 best_copy_from_user((to),(from),(n))) :	\
+	 best_copy_from_user((to),(from),(n)))
 
-#define copy_to_user_ret(to,from,n,retval) ({ if (copy_to_user(to,from,n)) return retval; })
+#define __copy_to_user(to,from,n)			\
+	(__builtin_constant_p(n) ?			\
+	(((n) < 128) ? 					\
+	 __constant_copy_to_user_nocheck((to),(from),(n)) :	\
+	 __best_copy_to_user((to),(from),(n))) :	\
+	 __best_copy_to_user((to),(from),(n)))
 
-#define copy_from_user_ret(to,from,n,retval) ({ if (copy_from_user(to,from,n)) return retval; })
+#define __copy_from_user(to,from,n)			\
+	(__builtin_constant_p(n) ?			\
+	(((n) < 128) ? 					\
+	 __constant_copy_from_user_nocheck((to),(from),(n)) :	\
+	 __best_copy_from_user((to),(from),(n))) :	\
+	 __best_copy_from_user((to),(from),(n)))
+
+#else /* CONFIG_X86_CPU_OPTIMIZATIONS */
 
+#define copy_to_user(to,from,n)				\
+	(__builtin_constant_p(n) ?			\
+	 __constant_copy_to_user((to),(from),(n)) :	\
+	 __generic_copy_to_user((to),(from),(n)))
+
+#define copy_from_user(to,from,n)			\
+	(__builtin_constant_p(n) ?			\
+	 __constant_copy_from_user((to),(from),(n)) :	\
+	 __generic_copy_from_user((to),(from),(n)))
+
 #define __copy_to_user(to,from,n)			\
 	(__builtin_constant_p(n) ?			\
 	 __constant_copy_to_user_nocheck((to),(from),(n)) :	\
@@ -594,6 +636,11 @@
 	(__builtin_constant_p(n) ?			\
 	 __constant_copy_from_user_nocheck((to),(from),(n)) :	\
 	 __generic_copy_from_user_nocheck((to),(from),(n)))
+#endif
+
+#define copy_to_user_ret(to,from,n,retval) ({ if (copy_to_user(to,from,n)) return retval; })
+
+#define copy_from_user_ret(to,from,n,retval) ({ if (copy_from_user(to,from,n)) return retval; })
 
 long strncpy_from_user(char *dst, const char *src, long count);
 long __strncpy_from_user(char *dst, const char *src, long count);
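
The copy_to_user_ret()/copy_from_user_ret() convenience macros kept at the
end of the hunk above return from the calling function on a fault.  A
typical 2.2-style ioctl handler using them looks like the hypothetical
snippet below (the struct and handler are invented for the example):

/*
 * Hypothetical driver snippet, for illustration only.  In a real driver
 * this would sit next to the usual <linux/fs.h>, <linux/errno.h> and
 * <asm/uaccess.h> includes.
 */
struct sketch_args {
	int in;
	int out;
};

static int sketch_ioctl(struct inode *inode, struct file *file,
			unsigned int cmd, unsigned long arg)
{
	struct sketch_args a;

	/* bail out with -EFAULT if the user pointer is bad */
	copy_from_user_ret(&a, (void *) arg, sizeof(a), -EFAULT);
	a.out = a.in * 2;
	copy_to_user_ret((void *) arg, &a, sizeof(a), -EFAULT);
	return 0;
}
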
Index: oldkernel/linux/init/main.c
diff -u linux/init/main.c:1.4 linux/init/main.c:1.5
--- linux/init/main.c:1.4	Thu Jun  1 15:01:35 2000
+++ linux/init/main.c	Thu Jun  1 15:05:19 2000
@@ -103,6 +103,7 @@
 #ifdef __i386__
 extern void ioapic_pirq_setup(char *str, int *ints);
 extern void ioapic_setup(char *str, int *ints);
+extern void x86_serial_nr_setup(char *str, int *ints);
 #endif
 extern void no_scroll(char *str, int *ints);
 extern void kbd_reset_setup(char *str, int *ints);
@@ -644,6 +645,9 @@
 	{ "noapic", ioapic_setup },
 	{ "pirq=", ioapic_pirq_setup },
 #endif
+#endif
+#ifdef __i386__
+	{ "x86_serial_nr", x86_serial_nr_setup },
 #endif
 #ifdef CONFIG_BLK_DEV_RAM
 	{ "ramdisk_start=", ramdisk_start_setup },
