Index: Make.atlas
===================================================================
RCS file: Make.atlas
diff -N Make.atlas
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ Make.atlas	20 Aug 2008 03:57:53 -0000	1.2
@@ -0,0 +1,66 @@
+##################################################################
+# (C) Copyright IBM Corporation 2008
+#
+##################################################################
+
+# Platform
+
+ARCH	:= atlas
+
+# Tools
+
+SHELL	:= /bin/sh
+CD	:= cd
+CP	:= cp
+LN_S	:= ln -s
+MKDIR	:= mkdir
+TOUCH	:= touch
+
+CC	:= mpicc
+LINKER	:= mpicc
+ARCHIVER := /usr/bin/ar
+RANLIB	:= echo
+
+# Directories
+
+INCdir	:= $(TOPdir)/include
+BINdir	:= $(TOPdir)/bin/$(ARCH)
+
+# HPL library
+
+HPLlib	:= $(TOPdir)/lib/$(ARCH)/libhpl.a
+
+# MPI package
+
+MPdir	:= 
+MPinc	:= 
+MPlib	:= 
+
+# Linear Algebra Library package -- Atlas
+
+LAdir	:= /usr/local/atlas
+LAinc	:= -I$(LAdir)/include
+LAlib   := -L$(LAdir)/lib -lf77blas -latlas -lgfortran
+
+# F2C options
+
+F2CDEFS	:= -DAdd__ -DF77_INTEGER=int -DStringSunStyle
+
+# HPL options
+
+HPL_INCS := -I$(INCdir) -I$(INCdir)/$(ARCH) $(LAinc) $(MPinc) $(CSinc)
+HPL_DEFS := $(F2CDEFS) $(HPL_OPTS) $(HPL_INCS)
+HPL_DEFS += -DHPL_USE_HUGE_PAGES=1
+
+ifdef TIMING
+HPL_DEFS += -DHPL_DETAILED_TIMING
+endif
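+
+# For example, detailed timing support can be requested on the make
+# command line (illustrative invocation; any non-empty value of TIMING
+# enables the define):
+#
+#   make arch=atlas TIMING=1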
+
+HPL_LIBS := $(HPLlib) $(LAlib) $(MPlib) $(CSlib)
+
+CCNOOPT := -m64 -Wall $(HPL_DEFS)
+CCFLAGS := $(CCNOOPT) -O3 -fomit-frame-pointer -funroll-loops
+#CCFLAGS := $(CCNOOPT) -O0 -ggdb -g3
+LINKFLAGS := $(CCFLAGS)
+ARFLAGS := -r
+
Index: Make.qs22
===================================================================
RCS file: Make.qs22
diff -N Make.qs22
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ Make.qs22	20 Aug 2008 03:57:53 -0000	1.7
@@ -0,0 +1,74 @@
+##################################################################
+# (C) Copyright IBM Corporation 2008
+#
+##################################################################
+
+# Platform
+
+ARCH	:= qs22
+
+# Tools
+
+SHELL	:= /bin/sh
+CD	:= cd
+CP	:= cp
+LN_S	:= ln -s
+MKDIR	:= mkdir
+TOUCH	:= touch
+
+CC	:= mpicc
+LINKER	:= mpicc
+ARCHIVER := /usr/bin/ar
+RANLIB	:= echo
+
+# Directories
+
+INCdir	:= $(TOPdir)/include
+BINdir	:= $(TOPdir)/bin/$(ARCH)
+
+# HPL library
+
+HPLlib	:= $(TOPdir)/lib/$(ARCH)/libhpl.a
+ACLlib  := $(TOPdir)/accel/lib/libhpl_accel_ppu.a
+
+# MPI package
+
+MPdir	:= 
+MPinc	:= 
+MPlib	:= 
+
+# Linear Algebra Library package -- Atlas
+
+LAdir	:= /usr/local/atlas
+LAinc	:= -I$(LAdir)/include
+LAlib   := -L$(LAdir)/lib -lf77blas -latlas -lgfortran
+
+# Cell SDK
+
+CSdir	:= /opt/cell/sdk/prototype
+CSinc	:= -I$(CSdir)/usr/include
+CSlib	:= -L$(CSdir)/usr/lib64 -lstdc++ -lpthread -lrt -lspe2 -lnuma
+
+# F2C options
+
+F2CDEFS	:= -DAdd__ -DF77_INTEGER=int -DStringSunStyle
+
+# HPL options
+
+HPL_INCS := -I$(INCdir) -I$(INCdir)/$(ARCH) $(LAinc) $(MPinc) $(CSinc)
+HPL_DEFS := $(F2CDEFS) $(HPL_OPTS) $(HPL_INCS)
+HPL_DEFS += -DHPL_USE_HUGE_PAGES=1
+
+ifdef TIMING
+HPL_DEFS += -DHPL_DETAILED_TIMING
+endif
+
+HPL_LIBS := $(HPLlib) $(LAlib) $(MPlib) $(CSlib) $(ACLlib)
+
+CCNOOPT := -m64 -Wall $(HPL_DEFS)
+CCNOOPT += -DHPL_CALL_ACCEL
+CCFLAGS := $(CCNOOPT) -O3 -fomit-frame-pointer -funroll-loops
+#CCFLAGS := $(CCNOOPT) -O0 -ggdb3
+LINKFLAGS := $(CCFLAGS)
+ARFLAGS := -r
+
Index: Make.qs22_sdkblas
===================================================================
RCS file: Make.qs22_sdkblas
diff -N Make.qs22_sdkblas
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ Make.qs22_sdkblas	7 Aug 2008 13:07:08 -0000	1.4
@@ -0,0 +1,78 @@
+##################################################################
+# Licensed Materials - Property of IBM.
+# (C) Copyright IBM Corporation 2007
+# All Rights Reserved.
+#
+# US Government Users Restricted Rights -
+# Use, duplication or disclosure restricted by
+# GSA ADP Schedule Contract with IBM Corporation.
+
+##################################################################
+
+# Platform
+
+ARCH	:= qs22_sdkblas
+
+# Tools
+
+SHELL	:= /bin/sh
+CD	:= cd
+CP	:= cp
+LN_S	:= ln -s
+MKDIR	:= mkdir
+TOUCH	:= touch
+
+CC	:= mpicc
+LINKER	:= mpicc
+ARCHIVER := /usr/bin/ar
+RANLIB	:= echo
+
+# Directories
+
+INCdir	:= $(TOPdir)/include
+BINdir	:= $(TOPdir)/bin/$(ARCH)
+
+# HPL library
+
+HPLlib	:= $(TOPdir)/lib/$(ARCH)/libhpl.a
+
+# MPI package
+
+MPdir	:= 
+MPinc	:= 
+MPlib	:= 
+
+# Linear Algebra Library package
+
+LAdir	:= /usr
+LAinc	:= -I$(LAdir)/include
+LAlib	:= -L$(LAdir)/lib64 -lblas
+
+# Cell SDK
+
+CSdir	:= /opt/cell/sdk/prototype
+CSinc	:= -I$(CSdir)/usr/include
+CSlib	:= -L$(CSdir)/usr/lib64 -lstdc++ -lpthread -lrt -lspe2 -lnuma
+
+# F2C options
+
+F2CDEFS	:= -DAdd__ -DF77_INTEGER=int -DStringSunStyle
+
+# HPL options
+
+HPL_INCS := -I$(INCdir) -I$(INCdir)/$(ARCH) $(LAinc) $(MPinc) $(CSinc)
+HPL_DEFS := $(F2CDEFS) $(HPL_OPTS) $(HPL_INCS)
+HPL_DEFS += -DHPL_USE_HUGE_PAGES=1
+
+ifdef TIMING
+HPL_DEFS += -DHPL_DETAILED_TIMING
+endif
+
+HPL_LIBS := $(HPLlib) $(LAlib) $(MPlib) $(CSlib)
+
+CCNOOPT := -m64 -Wall $(HPL_DEFS)
+CCFLAGS := $(CCNOOPT) -O3 -fomit-frame-pointer -funroll-loops
+#CCFLAGS := $(CCNOOPT) -O0 -ggdb -g3
+LINKFLAGS := $(CCFLAGS)
+ARFLAGS := -r
+
Index: Make.top
===================================================================
RCS file: /cvsroot/hpl_qs22/Make.top,v
retrieving revision 1.1
retrieving revision 1.4
diff -u -r1.1 -r1.4
--- Make.top	10 Feb 2008 21:45:50 -0000	1.1
+++ Make.top	26 Aug 2008 13:24:26 -0000	1.4
@@ -43,6 +43,8 @@
 #  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 #  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 # ######################################################################
+#  Modifications (C) Copyright IBM Corporation 2008
+# ######################################################################
 #  
 arch             = UNKNOWN
 #
@@ -51,6 +53,7 @@
 ## build ###############################################################
 #
 build_src        :
+	( $(CD) src/accel/$(arch);         $(MAKE) )
 	( $(CD) src/auxil/$(arch);         $(MAKE) )
 	( $(CD) src/blas/$(arch);          $(MAKE) )
 	( $(CD) src/comm/$(arch);          $(MAKE) )
@@ -78,6 +81,7 @@
 	- $(MKDIR) bin/$(arch)
 #
 startup_src      :
+	- $(MAKE) -f Make.top leaf le=src/accel       arch=$(arch)
 	- $(MAKE) -f Make.top leaf le=src/auxil       arch=$(arch)
 	- $(MAKE) -f Make.top leaf le=src/blas        arch=$(arch)
 	- $(MAKE) -f Make.top leaf le=src/comm        arch=$(arch)
@@ -98,6 +102,7 @@
 ## refresh #############################################################
 #
 refresh_src      :
+	- $(CP) makes/Make.accel    src/accel/$(arch)/Makefile
 	- $(CP) makes/Make.auxil    src/auxil/$(arch)/Makefile
 	- $(CP) makes/Make.blas     src/blas/$(arch)/Makefile
 	- $(CP) makes/Make.comm     src/comm/$(arch)/Makefile
@@ -118,6 +123,7 @@
 ## clean ###############################################################
 #
 clean_src        :
+	- ( $(CD) src/accel/$(arch);        $(MAKE) clean )
 	- ( $(CD) src/auxil/$(arch);        $(MAKE) clean )
 	- ( $(CD) src/blas/$(arch);         $(MAKE) clean )
 	- ( $(CD) src/comm/$(arch);         $(MAKE) clean )
@@ -138,6 +144,7 @@
 ## clean_arch ##########################################################
 #
 clean_arch_src   :
+	- $(RM) -r src/accel/$(arch)
 	- $(RM) -r src/auxil/$(arch)
 	- $(RM) -r src/blas/$(arch)
 	- $(RM) -r src/comm/$(arch)
@@ -165,6 +172,7 @@
 ## clean_guard #########################################################
 #
 clean_guard_src  :
+	- ( $(CD) src/accel/$(arch);       $(RM) *.grd )
 	- ( $(CD) src/auxil/$(arch);       $(RM) *.grd )
 	- ( $(CD) src/blas/$(arch);        $(RM) *.grd )
 	- ( $(CD) src/comm/$(arch);        $(RM) *.grd )
Index: Makefile
===================================================================
RCS file: /cvsroot/hpl_qs22/Makefile,v
retrieving revision 1.1
retrieving revision 1.4
diff -u -r1.1 -r1.4
--- Makefile	10 Feb 2008 21:45:50 -0000	1.1
+++ Makefile	26 Aug 2008 13:24:26 -0000	1.4
@@ -43,12 +43,16 @@
 #  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 #  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 # ######################################################################
+#  Modifications (C) Copyright IBM Corporation 2008
+# ######################################################################
 #  
 #
 SHELL            = /bin/sh
 #
 arch             = UNKNOWN
 #
+export TOPdir    = $(shell pwd)
+#
 ## Targets #############################################################
 #
 all              : install
@@ -70,10 +74,12 @@
 #
 build            :
 	$(MAKE) -f Make.top build_src       arch=$(arch)
+	$(MAKE) -C accel                    arch=$(arch)
 	$(MAKE) -f Make.top build_tst       arch=$(arch)
 #
 clean            :
 	$(MAKE) -f Make.top clean_src       arch=$(arch)
+	$(MAKE) -C accel clean              arch=$(arch)
 	$(MAKE) -f Make.top clean_tst       arch=$(arch)
 #
 clean_arch       :
Index: accel/Makefile
===================================================================
RCS file: accel/Makefile
diff -N accel/Makefile
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/Makefile	20 Aug 2008 03:57:53 -0000	1.3
@@ -0,0 +1,25 @@
+# --------------------------------------------------------------- 
+# (C) Copyright IBM Corporation 2007,2008
+#                                                                 
+# All Rights Reserved.                                            
+# --------------------------------------------------------------- 
+
+ifeq ($(arch),qs22)
+
+########################################################################
+#			Target
+########################################################################
+
+DIRS			= lib
+
+########################################################################
+#			make.footer
+########################################################################
+
+include $(CELL_TOP)/buildutils/make.footer
+
+else
+
+all clean :
+
+endif
Index: accel/lib/Makefile
===================================================================
RCS file: accel/lib/Makefile
diff -N accel/lib/Makefile
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/Makefile	20 Aug 2008 03:57:53 -0000	1.5
@@ -0,0 +1,39 @@
+# --------------------------------------------------------------- 
+# (C) Copyright IBM Corporation 2007,2008
+#                                                                 
+# --------------------------------------------------------------- 
+
+########################################################################
+#                       Subdirectories
+########################################################################
+
+DIRS             = spu
+
+########################################################################
+#                       Target
+########################################################################
+
+TARGET_PROCESSOR = ppu64
+LIBRARY          = libhpl_accel_ppu.a
+
+#CC_OPT_LEVEL	 = -g
+
+CPPFLAGS	 = -DNDEBUG
+#CPPFLAGS	+= -DACCEL_LITTLE_ENDIAN
+CPPFLAGS        += -DVALIDATE_4GB_CROSSING
+CPPFLAGS        += -DMATRIX_4GB_CROSSING
+#CPPFLAGS        += -DPANEL_4GB_CROSSING 
+
+########################################################################
+#                       Local Defines
+########################################################################
+
+SYS_LIBS        += -lspe2 -lpthread -lm
+
+IMPORTS          = spu/hpl_accel_spu-embed64.o
+
+########################################################################
+#                       make.footer
+########################################################################
+
+include $(CELL_TOP)/buildutils/make.footer
Index: accel/lib/hpl_accel.h
===================================================================
RCS file: accel/lib/hpl_accel.h
diff -N accel/lib/hpl_accel.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/hpl_accel.h	20 Aug 2008 03:57:53 -0000	1.13
@@ -0,0 +1,758 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#ifndef _HPL_ACCEL_H_
+#define _HPL_ACCEL_H_
+
+#define M_SUB     (64)          /* Size of sub-blocks - M_SUB x M_SUB */
+
+/* ---------------------------------------------------------------- */
+/* Inline functions for addressing matrix storage of various formats*/
+/* ---------------------------------------------------------------- */
+
+/* The following inline functions compute an array index for each
+ * of the supported formats - column ordered, row ordered, and blocked
+ * (column ordered blocks, whose blocks are row ordered).
+ * The inputs are the row (row), the column (col), and the leading
+ * dimension (ld).
+ */
+
+/* ld is the number of elements from column n to column n+1 
+ */
+static inline unsigned int INDEX_COL(unsigned int row, unsigned int col, unsigned int ld) {
+  return (col*ld + row);
+}
+
+/* ld is the number of elements from row n to row n+1 
+ */
+static inline unsigned int INDEX_ROW(unsigned int row, unsigned int col, unsigned int ld) {
+  return (row*ld + col);
+}
+
+/* ld is the number of elements from block column n to block column n+1.
+ * This can also be described as the number of elements between column
+ * n and column n+M_SUB 
+ */
+static inline unsigned int INDEX_BLK(unsigned int row, unsigned int col, unsigned int ld) {
+  return ((col / M_SUB)*ld + INDEX_ROW( row, (col % M_SUB), M_SUB ));
+}
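+
+/* For example, with M_SUB = 64 the element at (row 70, col 3) of a block
+ * formatted matrix lies in the second block down the first block column:
+ *
+ *   INDEX_BLK(70, 3, ld) = (3/64)*ld + 70*64 + 3 = 4483
+ *
+ * i.e. one full 64x64 block (4096 doubles) past the start, plus
+ * INDEX_ROW(6, 3, 64) = 387 into that block. (Illustrative arithmetic,
+ * not part of the API contract.)
+ */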
+
+
+/* NOTE 1:
+ *
+ * The following defines can be used to configure the code for handling
+ * 4GB crossings. They include:
+ *
+ * MATRIX_4GB_CROSSING    If defined, then all block ordered matrices can cross a 4GB
+ *                        address boundary. However, the crossing can only occur on a
+ *                        block boundary, never within a matrix block. In addition,
+ *                        the block leading dimension must be no larger than 2^28 - 1.
+ *                        If not defined, then a matrix cannot cross a 4GB address
+ *                        boundary.
+ *
+ * PANEL_4GB_CROSSING     If defined, then all row or column ordered panels (this includes
+ *                        U panels, L panels, and row buffers) may cross at most one 4GB
+ *                        address boundary, but only on a row/column boundary. In addition,
+ *                        the leading dimension must not exceed 2^28 - 1.
+ *                        If not defined, then a panel cannot cross a 4GB address boundary.
+ *
+ * VALIDATE_4GB_CROSSING  If defined, then include code to validate the specified
+ *                        boundary constraints. This define is intended for debug
+ *                        purposes only.
+ */
+
+#ifdef __PPU__
+
+#include <ppu_intrinsics.h>
+
+/* hpl_accel_byte_swap
+ * -------------------
+ * Convert a double from little-endian format to big-endian format. This
+ * function is not optimal; prefer hpl_accel_byte_swap_load and
+ * hpl_accel_byte_swap_store instead.
+ */
+static inline double hpl_accel_byte_swap(double d) {
+#ifdef ACCEL_LITTLE_ENDIAN
+  union {
+    unsigned long long ull;
+    double d;
+  } in, out;
+
+  in.d = d;
+  out.ull = __ldbrx(&in.ull);
+  return (out.d);
+#else
+  return (d);
+#endif
+}
+
+
+/* hpl_accel_byte_swap_load
+ * ------------------------
+ * Load a double-word value stored in little-endian byte order.
+ */
+static inline double hpl_accel_byte_swap_load(unsigned long long *ptr)
+{
+#ifdef ACCEL_LITTLE_ENDIAN
+  union {
+    unsigned long long ull;
+    double d;
+  } x;
+
+  x.ull = __ldbrx(ptr);
+  return (x.d);
+#else
+  return (*((double *)ptr));
+#endif
+}
+
+
+/* hpl_accel_byte_swap_store
+ * -------------------------
+ * Store a double-word value in little-endian byte order.
+ */
+static inline void hpl_accel_byte_swap_store(unsigned long long *ptr, double d)
+{
+#ifdef ACCEL_LITTLE_ENDIAN
+  union {
+    unsigned long long ull;
+    double d;
+  } x;
+
+  x.d = d;
+  __stdbrx(ptr, x.ull);
+#else
+  *((double *)ptr) = d;
+#endif
+}
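+
+/* A minimal usage sketch of the two accessors (assumes a build with
+ * ACCEL_LITTLE_ENDIAN defined; "panel" and "i" are illustrative names):
+ *
+ *   unsigned long long *p = (unsigned long long *)&panel[i];
+ *   double v = hpl_accel_byte_swap_load(p);     // read LE-stored double
+ *   hpl_accel_byte_swap_store(p, v * 2.0);      // write it back LE
+ */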
+
+
+/* hpl_accel_init
+ * --------------
+ * Initialize the HPL accelerator. If the accelerator is successfully 
+ * initialized, then HPL_ACCEL_INIT_SUCCESS is returned, otherwise 
+ * HPL_ACCEL_INIT_FAIL is returned.
+ */
+
+#define HPL_ACCEL_INIT_SUCCESS	0
+#define HPL_ACCEL_INIT_FAIL	-1
+
+extern int hpl_accel_init();
+
+/* hpl_accel_fini
+ * --------------
+ * Finalize the HPL accelerator. If the accelerator is successfully
+ * finalized, then HPL_ACCEL_FINI_SUCCESS is returned, otherwise
+ * HPL_ACCEL_FINI_FAIL is returned.
+ */
+#define HPL_ACCEL_FINI_SUCCESS	0
+#define HPL_ACCEL_FINI_FAIL	-1
+
+extern int hpl_accel_fini();
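+
+/* A minimal lifecycle sketch (illustrative only; error handling
+ * abbreviated):
+ *
+ *   if (hpl_accel_init() != HPL_ACCEL_INIT_SUCCESS)
+ *     return -1;
+ *   ... issue accelerated operations ...
+ *   (void)hpl_accel_fini();
+ */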
+
+
+/* hpl_accel_dgemm_CL_R_B_CL
+ * hpl_accel_dgemm_CL_B_B_CL
+ * -------------------------
+ * Specialized accelerated DGEMM. The DGEMM computes:
+ *
+ *       [c] -= [a]*[b]
+ *
+ * If a panel is specified, then the output is placed in [panel]:
+ *
+ *       [panel] = [c] - [a]*[b];
+ *
+ * m           Number of rows in [a], [c], and [panel]. 
+ * n           Number of cols in [b], [c], and [panel]. 
+ * k           Number of cols in [a] and rows in [b].
+ * a           Column-ordered, little-endian, matrix of m rows and k columns.
+ * lda         Leading dimension of matrix [a].
+ * b           Big endian matrix of k rows and n columns. This is either row ordered,
+ *             in the case of hpl_accel_dgemm_CL_R_B_CL, or block formatted, in the 
+ *             case of hpl_accel_dgemm_CL_B_B_CL.
+ * ldb         Leading dimension of matrix [b]. For a block formatted [b] matrix,
+ *             this is the number of doubles to advance b from block column n to 
+ *             column n+1.
+ * c           Block-formatted, big-endian, matrix of m rows and n columns.
+ *             The block contents are row-ordered, while the blocks
+ *             themselves are column-ordered. Blocks are 64x64.
+ * ldc         Leading block dimension of matrix [c]. The number of doubles
+ *             to advance c from block column n to column n+1.
+ * blk_row     Starting block matrix row offset. This offset is applied only to the 
+ *             [c] matrix.
+ * blk_col     Starting block matrix column offset. This offset is applied to the [c]
+ *             matrix and [b] matrix when it is block formatted (i.e. for 
+ *             hpl_accel_dgemm_CL_B_B_CL).
+ * panel       Column ordered, little endian DGEMM result matrix of m rows and n columns. 
+ *             If NULL, the result is returned in [c].
+ * ldp         Leading dimension of [panel]. If [panel] is NULL, this must be 0.
+ * incomplete  Pointer to a system variable that is first initialized to non-zero
+ *             and asynchronously cleared when the requested operation has completed. 
+ *             If NULL, no completion notification is performed.
+ *
+ * FUNCTIONAL RESTRICTIONS:
+ *   a         Buffer may not straddle 4GB boundary (See Note 1).
+ *   b         Buffer may not straddle 4GB boundary (See Note 1).
+ *   c         Buffer may not straddle 4GB boundary (See Note 1).
+ *   panel     Buffer may not straddle 4GB boundary (See Note 1).
+ *
+ * ADDITIONAL PERFORMANCE RESTRICTIONS:
+ *   k         Must be 128 to be accelerated.
+ *   m         Optimal if a multiple of 64. Integral multiples of 64 may be accelerated.
+ *   n	       Optimal if a multiple of 64. Integral multiples of 64 may be accelerated.
+ *   a         Optimal if cacheline aligned. Accelerated if [a] is quadword aligned.
+ *   b         Optimal if cacheline aligned. Accelerated if [b] is quadword aligned.
+ *   c         Optimal if cacheline aligned. Accelerated if [c] is quadword aligned.
+ *   panel     Optimal if cacheline aligned. Accelerated if [panel] is quadword aligned.
+ *   lda       Optimal if a multiple of 16. Accelerated if lda is even.
+ *   ldb       Optimal if a multiple of 16. Accelerated if ldb is even.
+ *   ldc       Optimal if a multiple of 16. Accelerated if ldc is even.
+ *   ldp       Optimal if a multiple of 16. Accelerated if ldp is even.
+ *   blk_row   Must be a multiple of M_SUB in order to be SPE accelerated.
+ *   blk_col   Must be a multiple of M_SUB in order to be SPE accelerated.
+ */
+
+extern void hpl_accel_dgemm_CL_R_B_CL(int m, int n, int k,
+				      const double *a, int lda,
+				      const double *b, int ldb,
+				      double *c, int ldc,
+				      unsigned int blk_row, unsigned int blk_col,
+				      double *panel, int ldp,
+				      unsigned long long *incomplete);
+
+extern void hpl_accel_dgemm_CL_B_B_CL(int m, int n, int k,
+				      const double *a, int lda,
+				      const double *b, int ldb,
+				      double *c, int ldc,
+				      unsigned int blk_row, unsigned int blk_col,
+				      double *panel, int ldp,
+				      unsigned long long *incomplete);
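+
+/* A sketch of the completion protocol shared by these entry points: the
+ * library sets *incomplete non-zero and the SPEs clear it asynchronously,
+ * so the caller polls before touching the result. Operands below are
+ * illustrative; k is 128 per the acceleration restriction above.
+ *
+ *   volatile unsigned long long done;
+ *   hpl_accel_dgemm_CL_R_B_CL(m, n, 128, a, lda, b, ldb, c, ldc,
+ *                             0, 0, NULL, 0,
+ *                             (unsigned long long *)&done);
+ *   while (done) ;   // spin until the SPEs signal completion
+ */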
+
+
+/* hpl_accel_dgemm_C_C_C
+ * -------------------------
+ * Specialized accelerated DGEMM. The DGEMM computes:
+ *
+ *       [c] -= [a]*[b]
+ *
+ * m           Number of rows in [a] and [c]. 
+ * n           Number of cols in [b] and [c]. 
+ * k           Number of cols in [a] and rows in [b].
+ * a           Column-ordered, big-endian, matrix of m rows and k columns.
+ * lda         Leading dimension of matrix [a].
+ * b           Column-ordered, big endian matrix of k rows and n columns.
+ * ldb         Leading dimension of matrix [b].
+ * c           Column-ordered, big-endian, matrix of m rows and n columns.
+ * ldc         Leading dimension of matrix [c].
+ * incomplete  Pointer to a system variable that is first initialized to non-zero
+ *             and asynchronously cleared when the requested operation has completed. 
+ *             If NULL, no completion notification is performed.
+ *
+ * FUNCTIONAL RESTRICTIONS:
+ *   a         Buffer may not straddle 4GB boundary (See Note 1).
+ *   c         Buffer may not straddle 4GB boundary (See Note 1).
+ *
+ * ADDITIONAL PERFORMANCE RESTRICTIONS:
+ *   k         Must be a multiple of 4 and no larger than 64 to be accelerated.
+ *   m         Optimal if a multiple of 16. Integral multiples of 8 may be accelerated.
+ *   n         Optimal if a multiple of 4.
+ *   a         Optimal if cacheline aligned. Accelerated if [a] is quadword aligned.
+ *   b         Optimal if cacheline aligned. Accelerated if [b] is quadword aligned.
+ *   c         Optimal if cacheline aligned. Accelerated if [c] is quadword aligned.
+ *   lda       Optimal if a multiple of 16. Accelerated if lda is even.
+ *   ldb       Optimal if a multiple of 16. Accelerated if ldb is even.
+ *   ldc       Optimal if a multiple of 16. Accelerated if ldc is even.
+ */
+
+extern void hpl_accel_dgemm_C_C_C(int m, int n, int k,
+				  const double *a, int lda,
+				  const double *b, int ldb,
+				  double *c, int ldc,
+				  unsigned long long *incomplete);
+
+/* hpl_accel_dtrsm_CL_R_B
+ * ----------------------
+ * Specialized accelerated DTRSM. The DTRSM solves for [x] the matrix equation
+ *
+ *       [a]*[x] = [b]
+ *
+ * where [a] is a unit lower triangular matrix. The solution is returned in [b] unless
+ * [c] is non-NULL, in which case the solution is returned in [c].
+ *
+ * m           Number of rows in [b], number of columns in [a].
+ * n           Number of columns in [b].
+ * a           Column-ordered, little-endian, unit lower triangle matrix of 
+ *             dimension lda rows by m columns.
+ * lda         Leading dimension of matrix [a].
+ * b           Row-ordered, big-endian matrix of m rows and n columns. On entry
+ *             contains the right-hand side matrix and is overwritten by the 
+ *             solution matrix [x]. 
+ * ldb         Leading dimension of matrix [b].
+ * c           Block-formatted, big-endian, matrix. The block contents are 
+ *             row-ordered, while the blocks themselves are column-ordered.
+ *             Blocks are 64x64. If non-NULL, the solution is returned in the
+ *             row of blocks in [c] instead of [b]. This must point to the start
+ *             of a matrix block.
+ * ldc         Leading block dimension of matrix [c]. The number of doubles
+ *             to advance c from one block column to the next. If [c]
+ *             is NULL, then ldc should also be 0.
+ * blk_row     Starting [c] block matrix row offset. If [c] is NULL, then blk_row
+ *             must also be 0.
+ * blk_col     Starting [c] block matrix column offset. If [c] is NULL, then blk_col
+ *             must also be 0.
+ * incomplete  Pointer to a system variable that is first initialized to non-zero
+ *             and asynchronously cleared when the requested operation has completed. 
+ *             If NULL, no notification is performed.
+ *
+ * FUNCTIONAL RESTRICTIONS:
+ *   a         Buffer may not straddle 4GB boundary (See Note 1).
+ *   b         Buffer may not straddle 4GB boundary (See Note 1).
+ *   c         Buffer may not straddle 4GB boundary (See Note 1).
+ *
+ * ADDITIONAL PERFORMANCE RESTRICTIONS:
+ *   m         Must be 128 to be accelerated.
+ *   n         Optimal if a multiple of 16. Integral multiples of 16 may be accelerated.
+ *   a         Optimal if cacheline aligned. Accelerated if [a] is quadword aligned.
+ *   b         Optimal if cacheline aligned. Accelerated if [b] is quadword aligned.
+ *   c         Optimal if cacheline aligned. Accelerated if [c] is quadword aligned.
+ *   lda       Optimal if a multiple of 16. Accelerated if lda is even.
+ *   ldb       Optimal if a multiple of 16. Accelerated if ldb is even. Memory throughput
+ *             is maximized if ldb is NOT an integral multiple of 256.
+ *   ldc       Optimal if a multiple of 16. Accelerated if ldc is even.
+ *   blk_col   Must be a multiple of 16 in order to be SPE accelerated. This is a current
+ *             implementation restriction.
+ */
+
+extern void hpl_accel_dtrsm_CL_R_B(int m, int n, 
+				   const double *a, int lda, 
+				   double *b, int ldb,
+				   double *c, int ldc,
+				   unsigned int blk_row, unsigned int blk_col,
+				   unsigned long long *incomplete);
+
+
+/* hpl_accel_dtrsm_CL_B
+ * --------------------
+ * Specialized accelerated DTRSM. The DTRSM solves for [x] the matrix equation
+ *
+ *       [a]*[x] = [b]
+ *
+ * where [a] is a unit lower triangular matrix. The solution is returned in [b].
+ *
+ * m           Number of rows in [b], number of columns in [a].
+ * n           Number of columns in [b].
+ * a           Column-ordered, little-endian, unit lower triangle matrix of 
+ *             dimension lda rows by m columns.
+ * lda         Leading dimension of matrix [a].
+ * b           Block-formatted, big-endian, matrix of m rows and n columns. 
+ *             The block contents are row-ordered, while the blocks themselves
+ *             are column-ordered. Blocks are 64x64. On entry contains
+ *             the right-hand side matrix and is overwritten by the 
+ *             solution matrix [x]. This must point to the start
+ *             of a matrix block.
+ * ldb         Leading dimension of matrix [b]. The number of doubles
+ *             to advance b from one block column to the next.
+ * blk_row     Starting [b] block matrix row offset. 
+ * blk_col     Starting [b] block matrix column offset. 
+ * incomplete  Pointer to a system variable that is first initialized to non-zero
+ *             and asynchronously cleared when the requested operation has completed. 
+ *             If NULL, no notification is performed.
+ *
+ * FUNCTIONAL RESTRICTIONS:
+ *   a         Buffer may not straddle 4GB boundary (See Note 1).
+ *   b         Buffer may not straddle 4GB boundary (See Note 1).
+ *
+ * ADDITIONAL PERFORMANCE RESTRICTIONS:
+ *   m         Must be 128 to be accelerated.
+ *   n         Optimal if a multiple of 16. Integral multiples of 16 may be accelerated.
+ *   a         Optimal if cacheline aligned. Accelerated if [a] is quadword aligned.
+ *   b         Optimal if cacheline aligned. Accelerated if [b] is quadword aligned.
+ *   lda       Optimal if a multiple of 16. Accelerated if lda is even.
+ *   ldb       Optimal if a multiple of 16. Accelerated if ldb is even. 
+ *   blk_col   Must be a multiple of 16 in order to be SPE accelerated. This is a current
+ *             implementation restriction.
+ */
+
+extern void hpl_accel_dtrsm_CL_B(int m, int n, 
+				 const double *a, int lda, 
+				 double *b, int ldb,
+				 unsigned int blk_row, unsigned int blk_col,
+				 unsigned long long *incomplete);
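+
+/* E.g., a sketch of an in-place solve against the blocked matrix
+ * (m = 128 per the acceleration restriction; operands illustrative,
+ * with done declared as in the DGEMM sketch above):
+ *
+ *   hpl_accel_dtrsm_CL_B(128, n, a, lda, b, ldb, 0, 0,
+ *                        (unsigned long long *)&done);
+ *   while (done) ;
+ */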
+
+
+
+/* hpl_accel_reform_panel_CL_to_B
+ * -------------------------------
+ * Copy and reformat the L panel from the panel buffer pointed to by panel into matrix [a].
+ * The input L panel is assumed to be column-ordered, little endian, with a leading dimension of ldp.
+ * The matrix is assumed to be constructed in 64x64 element, row-ordered, big-endian blocks. The blocks
+ * are assumed to be column ordered. 
+ *
+ * m           Number of rows of panel to copy to [a]
+ * n           Number of columns of panel to copy to [a]
+ * a           Block formatted matrix. a points to the location within [a] to receive the
+ *             data being copied and reformatted from panel.
+ * lda         Leading dimension of matrix [a]. This contains the number of doubles to
+ *             advance a from block column n to column n+1.
+ * panel       Pointer to the L panel containing the data to be reformatted and copied to 
+ *             matrix [a]. The [panel] is column-ordered, little endian.
+ * ldp	       Leading dimension of the panel.  This is the number of doubles between
+ *             column n and column n+1.
+ * incomplete  Pointer to a system variable that is first initialized to non-zero
+ *             and asynchronously cleared when the requested operation has completed. 
+ *             If NULL, no completion notification is performed.
+ *
+ * FUNCTIONAL RESTRICTIONS:
+ *
+ * ADDITIONAL PERFORMANCE RESTRICTIONS:
+ *
+ */
+
+void hpl_ref_reform_panel_CL_to_B(int m, int n,
+				  double *a, int lda,
+				  double *panel, int ldp,
+				  unsigned long long *incomplete);
+
+
+/* hpl_accel_reform_matrix_CL_to_B
+ * -------------------------------   
+ * In-place reformat of the matrix [a] from column-ordered, little-endian to blocked, big-endian format. The blocked
+ * format is 64x64, row-ordered blocks with the blocks being column ordered. The padding between the columns of 
+ * blocks is zero filled.
+ *
+ * m           Number of rows in [a]. If m is not a multiple of 64, then the additional rows needed
+ *             to pad [a] to a multiple of 64 rows are zeroed.
+ * n           Number of cols in [a].
+ * a           Column-ordered, little-endian, matrix of m rows and n columns.
+ * lda         Leading dimension of matrix [a].
+ * scratch     Scratch buffer used to assist the reformatting of [a]. The scratch buffer
+ *             must be at least 64*roundup(m,64) elements.
+ * size        The size (number of elements) of the scratch buffer. The scratch buffer
+ *             must be at least approximately 64*m elements. In general, better performance 
+ *             is achieved if the scratch buffer is larger, since more SPEs can be deployed
+ *             on the problem.
+ * incomplete  Pointer to a system variable that is first initialized to non-zero
+ *             and asynchronously cleared when the requested operation has completed. 
+ *             If NULL, no completion notification is performed.
+ *
+ * FUNCTIONAL RESTRICTIONS:
+ *   n	       Must be an integral multiple of 64.
+ *   a         Must be quadword aligned and buffer may not straddle 4GB boundary (See Note 1).
+ *   lda       Must be even and at least roundup(m,64).
+ *   scratch   Must be quadword aligned and must not straddle 4GB boundary.
+ *   size      Must be at least 64*roundup(m,64).
+ *
+ * ADDITIONAL PERFORMANCE RESTRICTIONS:
+ *   a         Must be cacheline aligned.
+ *   lda       Must be a multiple of 16.
+ *   scratch   Must be cacheline aligned.
+ *   size      Must be at least 4*64*m for optimal performance.
+ *
+ * Note: For 4GB crossing support, the matrix a is considered to be a block "matrix".
+ */
+extern void hpl_accel_reform_matrix_CL_to_B(int m, int n, 
+					    double *a, int lda, 
+					    double *scratch, int size,
+					    unsigned long long *incomplete);
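+
+/* A scratch-sizing sketch for the in-place reformat (names illustrative;
+ * memalign from <malloc.h> is one way to get cacheline alignment):
+ *
+ *   int m64  = (m + 63) & ~63;                 // roundup(m, 64)
+ *   int size = 4 * 64 * m64;                   // optimal-performance size
+ *   double *scratch = memalign(128, (size_t)size * sizeof(double));
+ *   hpl_accel_reform_matrix_CL_to_B(m, n, a, lda, scratch, size,
+ *                                   (unsigned long long *)&done);
+ *   while (done) ;
+ */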
+
+
+
+/* hpl_accel_reform_panel_B_to_CL
+ * -------------------------------   
+ * Copy and reformat the L panel from matrix [a] into the panel buffer pointed to by panel.
+ * The matrix is assumed to be constructed in 64x64 element, row-ordered, big-endian blocks. The blocks
+ * are assumed to be column ordered. The output L panel is assumed to be column-ordered, little endian
+ * with a leading dimension of ldp.
+ *
+ * m           Number of rows of [a] to copy to panel
+ * n           Number of columns of [a] to copy to panel
+ * panel       Pointer to the L panel extracted and reformatted from matrix [a]. The 
+ *             [panel] is column-ordered, little-endian.
+ * ldp	       Leading dimension of the panel.
+ * a           Block formatted matrix. a points to the start of the panel to be reformatted
+ *             and copied into [panel].
+ * lda         Leading dimension of matrix [a]. This contains the number of doubles to
+ *             advance a from block column n to column n+1.
+ * incomplete  Pointer to a system variable that is first initialized to non-zero
+ *             and asynchronously cleared when the requested operation has completed. 
+ *             If NULL, no completion notification is performed.
+ *
+ * FUNCTIONAL RESTRICTIONS:
+ *   m         Must be a multiple of 64.
+ *   panel     Must be quadword aligned and the buffer may not straddle 4GB boundary (See Note 1).
+ *   ldp       Must be even and at least m.
+ *   a         Must be quadword aligned and may not straddle a 4GB boundary (See Note 1).
+ *   lda       Must be even and at least m*M_SUB.
+ *
+ * ADDITIONAL PERFORMANCE RESTRICTIONS:
+ *   panel     Must be cacheline aligned.
+ *   ldp       Must be a multiple of 16.
+ *   a         Must be cacheline aligned.
+ *   lda       Must be a multiple of 16.
+ */
+extern void hpl_accel_reform_panel_B_to_CL(int m, int n,
+					   double *panel, int ldp, 
+					   double *a, int lda,
+					   unsigned long long *incomplete);
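+
+/* E.g., a sketch extracting a 128-column L panel back to column order
+ * (m a multiple of 64; ldp = m is therefore even and >= m; a_blk is an
+ * illustrative pointer to the start of the panel within the blocked
+ * matrix):
+ *
+ *   hpl_accel_reform_panel_B_to_CL(m, 128, panel, m, a_blk, lda,
+ *                                  (unsigned long long *)&done);
+ *   while (done) ;
+ */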
+
+
+
+/* hpl_accel_reform_panel_R_to_B
+ * -------------------------------   
+ * Copy and reformat a U panel from a row buffer pointed to by panel into matrix [a].
+ * The input U panel is assumed to be row-ordered, big endian, with a leading dimension of ldp.
+ * The matrix is assumed to be constructed in 64x64 element, row-ordered, big-endian blocks.
+ * The blocks are assumed to be column ordered.
+ *
+ * m           Number of rows of panel to copy to [a]
+ * n           Number of columns of panel to copy to [a]
+ * a           Block formatted matrix. a points to the location within [a] to receive the
+ *             data being copied and reformatted from panel.
+ * lda         Leading dimension of matrix [a]. This contains the number of doubles to
+ *             advance a from block column n to column n+1.
+ * panel       Pointer to the U panel containing the data to be reformatted and copied to 
+ *             matrix [a]. The [panel] is row-ordered, big-endian.
+ * ldp	       Leading dimension of the panel.  This is the number of doubles between
+ *             row n and row n+1.
+ * incomplete  Pointer to a system variable that is first initialized to non-zero
+ *             and asynchronously cleared when the requested operation has completed. 
+ *             If NULL, no completion notification is performed.
+ *
+ * FUNCTIONAL RESTRICTIONS:
+ *   m         None
+ *   panel     Must be quadword aligned and the buffer may not straddle 4GB boundary (See Note 1).
+ *   ldp       Must be even and at least n.
+ *   a         Must be quadword aligned and may not straddle a 4GB boundary (See Note 1).
+ *   lda       Must be even and at least m*M_SUB.
+ *
+ * ADDITIONAL PERFORMANCE RESTRICTIONS:
+ *   panel     Must be cacheline aligned.
+ *   ldp       Must be a multiple of 16.
+ *   a         Must be cacheline aligned.
+ *   lda       Must be a multiple of 16.
+ */
+extern void hpl_accel_reform_panel_R_to_B(int m, int n,
+					  double *a, int lda,
+					  double *panel, int ldp, 
+					  unsigned long long *incomplete);
+
+
+/* hpl_accel_reform_rows_R_to_B
+ * hpl_accel_reform_rows_B_to_R
+ * ----------------------------
+ * Copy and reformat a set of rows between row ordered and block ordered formats. 
+ * hpl_accel_reform_rows_R_to_B reformats rows into blocks and hpl_accel_reform_rows_B_to_R
+ * reformats blocks into rows. These functions are expected to be used to gather/scatter winners 
+ * and losers when pivoting so that rows are coalesced into large DMAs for efficient transfer.
+ * No endian swapping is performed on the data. Block data is assumed to be ordered in 64x64, 
+ * row ordered elements. The blocks themselves are column ordered.
+ *
+ * m           Number of rows to copy. Specifies the number of entries in the blk_rows array. 
+ * n           Number of values (doubles) per row to copy.
+ * rows        Pointer to the data rows to be reformatted and copied to/from matrix [a].
+ * ldr	       Leading dimension of the row buffer. This is the number of doubles between
+ *             rows of the [rows] buffer.
+ * a           Block formatted matrix. 
+ * lda         Leading dimension of matrix [a]. This contains the number of doubles to
+ *             advance a from block column n to column n+1.
+ * blk_rows    Array of row indices. blk_rows specifies starting [a] block matrix row offset
+ *             for each of the m rows.
+ * blk_col     Starting [a] block matrix column offset. 
+ * incomplete  Pointer to a system variable that is first initialized to non-zero
+ *             and asynchronously cleared when the requested operation has completed. 
+ *             If NULL, no completion notification is performed.
+ *
+ * FUNCTIONAL RESTRICTIONS:
+ *   rows      Buffer must not straddle 4GB boundary.
+ *   a         Buffer may not straddle 4GB boundary (See Note 1).
+ *
+ * ADDITIONAL PERFORMANCE RESTRICTIONS:
+ *   rows      Optimal if cacheline aligned. Accelerated if [rows] is quadword aligned.
+ *   ldr       Optimal if a multiple of 16. Accelerated if ldr is even.
+ *   a         Optimal if cacheline aligned. Accelerated if [a] is quadword aligned.
+ *   lda       Optimal if a multiple of 16. Accelerated if lda is even.
+ *   blk_col   Optimal if a multiple of 16. Accelerated if blk_col is even.
+ */
+
+extern void hpl_accel_reform_rows_R_to_B(int m, int n,
+					double *rows, int ldr,
+					double *a, int lda,
+					int *blk_rows, int blk_col,
+					unsigned long long *incomplete);
+
+extern void hpl_accel_reform_rows_B_to_R(int m, int n,
+					double *rows, int ldr,
+					double *a, int lda,
+					int *blk_rows, int blk_col,
+					unsigned long long *incomplete);
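+
+/* E.g., a sketch gathering two pivot rows out of the blocked matrix into
+ * a row buffer (row indices and operands illustrative):
+ *
+ *   int blk_rows[2] = { 5, 193 };
+ *   hpl_accel_reform_rows_B_to_R(2, n, rows, ldr, a, lda,
+ *                                blk_rows, 0,
+ *                                (unsigned long long *)&done);
+ *   while (done) ;
+ */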
+
+/* hpl_accel_swap_rows_B_to_B
+ * ----------------------------
+ * Swap a set of rows in block ordered format. 
+ * hpl_accel_swap_rows_B_to_B swaps a set of rows pairwise in a block-formatted matrix.
+ * No endian swapping is performed on the data. Block data is assumed to be ordered in 64x64, 
+ * row ordered elements. The blocks themselves are column ordered.
+ *
+ * m           Number of rows to swap. Specifies the number of entries in the blk_rows array. 
+ * n           Number of values (doubles) per row to copy.
+ * a           Block formatted matrix. 
+ * lda         Leading dimension of matrix [a]. This contains the number of doubles to
+ *             advance a from block column n to column n+1.
+ * blk_rows    Array of row indices. blk_rows specifies starting [a] block matrix row offset
+ *             for each of the m rows.
+ * blk_col     Starting [a] block matrix column offset. 
+ * incomplete  Pointer to a system variable that is first initialized to non-zero
+ *             and asynchronously cleared when the requested operation has completed. 
+ *             If NULL, no completion notification is performed.
+ *
+ * FUNCTIONAL RESTRICTIONS:
+ *   a         Buffer may not straddle 4GB boundary (See Note 1).
+ *
+ * ADDITIONAL PERFORMANCE RESTRICTIONS:
+ *   a         Optimal if cacheline aligned. Accelerated if [a] is quadword aligned.
+ *   lda       Optimal if a multiple of 16. Accelerated if lda is even.
+ *   blk_col   Optimal if a multiple of 16. Accelerated if blk_col is even.
+ */
+
+extern void hpl_accel_swap_rows_B_to_B(int m, int n,
+					double *a, int lda,
+					int *blk_rows, int blk_col,
+					unsigned long long *incomplete);
+
+/* hpl_accel_copy_rows_R_to_R
+ * ----------------------------
+ * Copy a set of rows in row ordered format. 
+ * hpl_accel_copy_rows_R_to_R copies a set of rows from row-oriented matrix a to 
+ * row-oriented matrix b.
+ * No endian swapping is performed on the data.
+ *
+ * m           Number of rows to copy. Specifies the number of entries in the rows array.
+ * n           Number of values (doubles) per row to copy.
+ * a           Pointer to the source data rows to be copied to row-ordered matrix b.
+ * lda	       Leading dimension of the row-ordered source matrix a.
+ * b           Pointer to the row-ordered destination matrix.
+ * ldb	       Leading dimension of the row-ordered destination matrix b.
+ * rows        Array of row indices. rows specifies the destination row index in row-ordered
+ *             matrix b that receives the corresponding source row from matrix a.
+ * incomplete  Pointer to a system variable that is first initialized to non-zero
+ *             and asynchronously cleared when the requested operation has completed. 
+ *             If NULL, no completion notification is performed.
+ *
+ * FUNCTIONAL RESTRICTIONS:
+ *
+ * ADDITIONAL PERFORMANCE RESTRICTIONS:
+ *   a         Optimal if cacheline aligned. Accelerated if [a] is quadword aligned.
+ *   lda       Optimal if a multiple of 16. Accelerated if lda is even.
+ */
+
+extern void hpl_accel_copy_rows_R_to_R(int m, int n,
+					double *a, int lda,
+					double *b, int ldb,
+					int *rows, 
+					unsigned long long *incomplete);
+
+/* REFERENCE FUNCTIONS. 
+ *
+ * These functions are non-accelerated implementations that run on the PPU.
+ *
+ * They may not impose the same functional and performance restrictions as the
+ * SPU accelerated functions.
+ */
+
+extern int hpl_ref_init();
+
+extern void hpl_ref_dgemm_CL_R_B(int m, int n, int k,
+				 const double *a, int lda,
+				 const double *b, int ldb,
+				 double *c, int ldc,
+				 unsigned long long *incomplete);
+
+extern void hpl_ref_dgemm_CL_B_B(int m, int n, int k,
+				 const double *a, int lda,
+				 const double *b, int ldb,
+				 double *c, int ldc,
+				 unsigned long long *incomplete);
+
+extern void hpl_ref_dgemm_CL_R_B_CL(int m, int n, int k,
+				    const double *a, int lda,
+				    const double *b, int ldb,
+				    double *c, int ldc,
+				    unsigned int blk_row, unsigned int blk_col,
+				    double *panel, int ldp,
+				    unsigned long long *incomplete);
+
+extern void hpl_ref_dgemm_CL_B_B_CL(int m, int n, int k,
+				    const double *a, int lda,
+				    const double *b, int ldb,
+				    double *c, int ldc,
+				    unsigned int blk_row, unsigned int blk_col,
+				    double *panel, int ldp,
+				    unsigned long long *incomplete);
+
+extern void hpl_ref_dgemm_C_C_C(int m, int n, int k,
+				const double *a, int lda,
+				const double *b, int ldb,
+				double *c, int ldc,
+				unsigned long long *incomplete);
+
+extern void hpl_ref_dtrsm_CL_R(int m, int n, 
+			       const double *a, int lda, 
+			       double *b, int ldb,
+			       unsigned long long *incomplete);
+
+extern void hpl_ref_dtrsm_CL_B(int m, int n, 
+			       const double *a, int lda, 
+			       double *b, int ldb,
+			       unsigned int blk_row, unsigned int blk_col,
+			       unsigned long long *incomplete);
+
+extern void hpl_ref_dtrsm_CL_R_B(int m, int n, 
+				 const double *a, int lda, 
+				 double *b, int ldb,
+				 double *c, int ldc,
+				 unsigned int blk_row, unsigned int blk_col,
+				 unsigned long long *incomplete);
+
+extern void hpl_ref_reform_matrix_CL_to_B(int m, int n, 
+					  double *a, int lda, 
+					  double *scratch, int size,
+					  unsigned long long *incomplete);
+
+extern void hpl_ref_reform_panel_B_to_CL(int m, int n,
+					 double *panel, int ldp, 
+					 double *a, int lda,
+					 unsigned long long *incomplete);
+
+extern void hpl_ref_reform_panel_R_to_B(int m, int n,
+					double *a, int lda,
+					double *panel, int ldp, 
+					unsigned long long *incomplete);
+
+extern void hpl_ref_reform_rows_R_to_B(int m, int n,
+				      double *rows, int ldr,
+				      double *a, int lda,
+				      int *blk_rows, int blk_col,
+				      unsigned long long *incomplete);
+
+extern void hpl_ref_reform_rows_B_to_R(int m, int n,
+				      double *rows, int ldr,
+				      double *a, int lda,
+				      int *blk_rows, int blk_col,
+				      unsigned long long *incomplete);
+
+extern void hpl_ref_swap_rows_B_to_B(int m, int n,
+					double *a, int lda,
+					int *blk_rows, int blk_col,
+					unsigned long long *incomplete);
+
+extern void hpl_ref_copy_rows_R_to_R(int m, int n,
+					double *a, int lda,
+					double *b, int ldb,
+					int *rows, 
+					unsigned long long *incomplete);
+
+#endif /* __PPU__ */
+
+#endif /* _HPL_ACCEL_H_ */
Index: accel/lib/hpl_accel_copy.c
===================================================================
RCS file: accel/lib/hpl_accel_copy.c
diff -N accel/lib/hpl_accel_copy.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/hpl_accel_copy.c	20 Aug 2008 03:57:53 -0000	1.4
@@ -0,0 +1,98 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2008                               */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#include <stdint.h>
+#include <assert.h>
+#include "hpl_accel.h"
+#include "hpl_accel_spu.h"
+#include "ppu_intrinsics.h"
+
+/* General purpose, reference, reformatting facilities.
+ */
+
+void hpl_accel_copy_rows_R_to_R(int m, int n,
+                                double *a, int lda,
+                                double *b, int ldb,
+                                int *rows,
+                                unsigned long long *incomplete)
+{
+  unsigned int non_aligned;
+  int n0 = 0;
+
+  non_aligned = (((unsigned int)(lda | ldb) & 1) |
+                 ((unsigned int)((uintptr_t)a | (uintptr_t)b) & (16-1)));
+
+  if ((non_aligned == 0) && (n > 1)) {
+    int m_start, m_left, m_per_cmd;
+    unsigned int idx;
+    volatile hpl_accel_copy_rows_parms_t *parms;
+    int i;
+
+    init_incomplete(incomplete, HPL_ACCEL_SPES);
+
+    n0 = n & ~1;
+
+    idx = hpl_accel_cmd_idx;
+
+    m_start = 0;
+    m_left = m;
+
+    /* Generate multiple command requests if the number of rows 
+     * is greater than what will fit in a single command request.
+     */
+    m_per_cmd = (int)(sizeof(parms->rows) / sizeof(int));
+
+    while (m_left > 0) {
+
+      parms = (volatile hpl_accel_copy_rows_parms_t *)(&hpl_accel_cmd_queue[idx]);
+
+      parms->m = (m_left < m_per_cmd) ? m_left : m_per_cmd;
+      parms->n = n0;
+      parms->lda = lda * sizeof(double);
+      parms->ldb = ldb * sizeof(double);
+
+      parms->a = a + m_start * lda;
+      parms->b = b;
+
+      parms->incomplete = (parms->m < m_left) ? NULL : incomplete;
+
+      for (i=0; i<parms->m; i++) parms->rows[i] = rows[m_start+i];
+
+      /* Perform a sync in order to ensure that the parameters are written 
+       * to memory before writing to the mailbox command queue.
+       */
+      __sync();
+    
+      /* Send the command to each of the SPEs.
+       */
+      send_cmd_to_spes(HPL_ACCEL_CMD_COPY_ROWS_R_TO_R, idx, HPL_ACCEL_SPES);
+      
+      idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES;
+
+      m_start += parms->m;
+      m_left -= parms->m;
+    }
+  } else {
+    if (incomplete) *incomplete = 0;
+  }
+
+  /* Clean up portions of the rows not handled by the SPEs above.
+   */
+  if (n0 < n) {
+    unsigned int y1, y2, x;
+    double *src, *dst;
+
+    /* For each of the rows */
+    for (y1=0; y1<(unsigned int)m; y1++) {
+      y2 = rows[y1];  /* New location for row y1 */
+      src = a + (y1 * lda);
+      dst = b + (y2 * ldb);
+      for (x=n0; x<(unsigned int)n; x++) {
+          dst[x] = src[x];
+      }
+    }
+  }
+}
+
Index: accel/lib/hpl_accel_dgemm.c
===================================================================
RCS file: accel/lib/hpl_accel_dgemm.c
diff -N accel/lib/hpl_accel_dgemm.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/hpl_accel_dgemm.c	20 Aug 2008 03:57:53 -0000	1.12
@@ -0,0 +1,495 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+#include <stdint.h>
+#include <assert.h>
+#include "hpl_accel.h"
+#include "hpl_accel_spu.h"
+#include "hpl_accel_global.h"
+
+#include <ppu_intrinsics.h>
+
+static void _dgemm_CL_R_B_CL(int m, int n, int k,
+			     const double *a, int lda,
+			     const double *b, int ldb,
+			     double *c, int ldc,
+			     unsigned int blk_row, unsigned int blk_col,
+			     double *panel, int ldp)
+{
+  unsigned int i, x, y;
+  unsigned long long *a_ptr;
+  double a_val, b_val, *p;
+
+  if (panel) {
+    /* Write the result into the panel buffer. We first perform the computation,
+     * placing the result into [panel]. Then byte swap panel.
+     */
+    p = panel;
+    for (x=0; x<(unsigned int)n; x++, p += ldp-m) {
+      a_ptr = (unsigned long long *)a;
+      b_val = b[INDEX_ROW(0,x,ldb)];
+
+      for (y=0; y<(unsigned int)m; y++) {
+	a_val = hpl_accel_byte_swap_load(a_ptr++);
+	*p++ = c[INDEX_BLK(y+blk_row,x+blk_col,ldc)] - a_val * b_val;
+      }
+    }
+    
+    a += lda;
+    for (i=1; i<(unsigned int)k; i++, a+=lda) {
+      p = panel;
+      for (x=0; x<(unsigned int)n; x++, p += ldp-m) {
+	a_ptr = (unsigned long long *)a;
+	b_val = b[INDEX_ROW(i,x,ldb)];
+	
+	for (y=0; y<(unsigned int)m; y++) {
+	  a_val = hpl_accel_byte_swap_load(a_ptr++);
+	  *p++ -= a_val * b_val;
+	}
+      }
+    }
+#ifdef ACCEL_LITTLE_ENDIAN
+    /* Byte swap panel buffer
+     */
+    unsigned long long *p_ptr = (unsigned long long *)panel;
+    for (x=0; x<(unsigned int)n; x++, p_ptr += ldp-m) {
+      for (y=0; y<(unsigned int)m; y++, p_ptr++) {
+	__stdbrx(p_ptr, *p_ptr);
+      }
+    }
+#endif
+  } else {
+    /* Write the result into the c matrix.
+     */
+    for (i=0; i<(unsigned int)k; i++, a+=lda) {
+      a_ptr = (unsigned long long *)a;
+      for (y=0; y<(unsigned int)m; y++) {
+	a_val = hpl_accel_byte_swap_load(a_ptr++);
+	for (x=0; x<(unsigned int)n; x++) {
+	  c[INDEX_BLK(y+blk_row,x+blk_col,ldc)] -= a_val * b[INDEX_ROW(i,x,ldb)];
+	}
+      }
+    }
+  }
+}
+
+
+void hpl_accel_dgemm_CL_R_B_CL(int m, int n, int k,
+			       const double *a, int lda,
+			       const double *b, int ldb,
+			       double *c, int ldc,
+			       unsigned int blk_row, unsigned int blk_col,
+			       double *panel, int ldp,
+			       unsigned long long *incomplete)
+{
+  int n0;
+  int m0 = 0;
+  unsigned int cmd;
+  unsigned int idx;
+  unsigned int aligned, bc, br;
+  volatile hpl_accel_dgemm_parms_t *parms;
+
+  /* Do as much of the dgemm as possible using the blocked dgemm SPU specialist.
+   * This specialist assumes:
+   *   m	is at least M_SUB
+   *   n        is at least M_SUB
+   *   k        is equal to M
+   *   a	is quadword aligned. A multiple of 16 for optimal DMA performance
+   *   b	is quadword aligned. A multiple of 16 for optimal DMA performance
+   *   c	is quadword aligned. A multiple of 16 for optimal DMA performance
+   *   panel	is quadword aligned. A multiple of 16 for optimal DMA performance
+   *   lda	is even (qword aligned rows). A multiple of 16 for optimal DMA 
+   *            performance
+   *   ldb	is even (qword aligned rows). A multiple of 16 for optimal DMA 
+   *            performance
+   *   ldc	is even (qword aligned rows). A multiple of 16 for optimal DMA 
+   *            performance
+   *   ldp	is even (qword aligned rows). A multiple of 16 for optimal DMA 
+   *            performance
+   *   blk_col  is a multiple of M_SUB
+   *   blk_row  is a multiple of M_SUB
+   */
+  bc = blk_col/M_SUB;
+  br = blk_row/M_SUB;
+
+  c += (ldc * bc) + br*(M_SUB*M_SUB);
+
+  blk_col %= M_SUB;
+  blk_row %= M_SUB;
+
+  aligned = (blk_row | blk_col | 
+	     ((unsigned int)(lda | ldb | ldc | ldp) & 1) |
+	     ((unsigned int)((uintptr_t)a  | (uintptr_t)b | (uintptr_t)c  | (uintptr_t)panel) & (16-1)));
+
+
+  if ((m >= M_SUB) && (n >= M_SUB) && (k == M) && (aligned == 0)) {
+    /* Either all or a portion of the computation can be done by the SPE accelerators.
+     */
+    m0 = (m/M_SUB) * M_SUB;
+    n0 = (n/M_SUB) * M_SUB;
+    
+    /* Verify 4GB boundary expectation.
+     */
+    VALIDATE_PANEL_4GB_CROSSING(a, k, lda);
+    VALIDATE_PANEL_4GB_CROSSING(b,  k, ldb);
+    VALIDATE_MATRIX_4GB_CROSSING(c, m0, n0, ldc);
+    VALIDATE_PANEL_4GB_CROSSING(panel, n0, ldp);
+
+    idx = hpl_accel_cmd_idx;
+ 
+    parms = (volatile hpl_accel_dgemm_parms_t *)(&hpl_accel_cmd_queue[idx]);
+
+    /* Place the parameters into a command queue buffer
+     */
+    parms->a = a;
+    parms->b = b;
+    parms->c = c;
+    parms->lda = lda * sizeof(double);
+    parms->ldb = ldb * sizeof(double);
+    parms->ldc = ldc * sizeof(double);
+    parms->n = n0 / M_SUB;
+    parms->m = m0 / M_SUB;
+    parms->b_blk = 0;
+    parms->incomplete = incomplete;
+    COMPUTE_PANEL_4GB_CROSSING_COUNT(parms->a_count, a, lda, M);
+    COMPUTE_PANEL_4GB_CROSSING_COUNT(parms->b_count, b, ldb, M);
+    COMPUTE_PANEL_4GB_CROSSING_COUNT(parms->p_count, panel, ldp, n);
+
+    init_incomplete(incomplete, HPL_ACCEL_SPES);
+
+    /* Send the command to each of the SPEs.
+     */
+    hpl_accel_cmd_idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES;
+    
+    if (panel) {
+      parms->p = panel;
+      parms->ldp = ldp * sizeof(double);
+      cmd = HPL_ACCEL_CMD_DGEMM_PANEL;
+    } else {
+      cmd = HPL_ACCEL_CMD_DGEMM;      
+    }
+
+    /* Perform a sync in order to ensure that the parameters are written to 
+     * memory before writing to the mailbox command queue.
+     */
+    __sync();
+
+    send_cmd_to_spes(cmd, idx, HPL_ACCEL_SPES);
+
+    /* Complete any remaining portion on the right side, i.e., when n is not a multiple
+     * of M_SUB.
+     */
+    if (n0 < n) {
+      _dgemm_CL_R_B_CL(m0, n-n0, k, a, lda, b+n0, ldb, c, ldc, blk_row, blk_col+n0, panel + n0*ldp, ldp);
+    }
+  } else {
+    /* In this case the operation is completely synchronous, so clear incomplete.
+     */
+    if (incomplete) *incomplete = 0;
+  }
+
+  /* Clean up any remaining portion of the matrix that was not handled above.
+   */
+  if (m0 < m) {
+    _dgemm_CL_R_B_CL(m-m0, n, k, a+m0, lda, b, ldb, c, ldc, blk_row+m0, blk_col, ((panel) ? panel + m0 : panel), ldp);
+  }
+}
+
+
+
+static void _dgemm_CL_B_B_CL(int m, int n, int k,
+			     const double *a, int lda,
+			     const double *b, int ldb,
+			     double *c, int ldc,
+			     unsigned int blk_row, unsigned int blk_col,
+			     double *panel, int ldp)
+{
+  unsigned int i, x, y;
+  unsigned long long *a_ptr;
+  double a_val, b_val, *p;
+
+  if (panel) {
+    /* Write the result into the panel buffer. We first perform the computation,
+     * placing the result into [panel]. Then byte swap panel.
+     */
+    p = panel;
+    for (x=0; x<(unsigned int)n; x++, p += ldp-m) {
+      a_ptr = (unsigned long long *)a;
+      b_val = b[INDEX_BLK(0,x+blk_col,ldb)];
+
+      for (y=0; y<(unsigned int)m; y++) {
+	a_val = hpl_accel_byte_swap_load(a_ptr++);
+	*p++ = c[INDEX_BLK(y+blk_row,x+blk_col,ldc)] - a_val * b_val;
+      }
+    }
+    
+    a += lda;
+    for (i=1; i<(unsigned int)k; i++, a+=lda) {
+      p = panel;
+      for (x=0; x<(unsigned int)n; x++, p += ldp-m) {
+	a_ptr = (unsigned long long *)a;
+	b_val = b[INDEX_BLK(i,x+blk_col,ldb)];
+	
+	for (y=0; y<(unsigned int)m; y++) {
+	  a_val = hpl_accel_byte_swap_load(a_ptr++);
+	  *p++ -= a_val * b_val;
+	}
+      }
+    }
+#ifdef ACCEL_LITTLE_ENDIAN
+    /* Byte swap panel buffer
+     */
+    unsigned long long *p_ptr = (unsigned long long *)panel;
+    for (x=0; x<(unsigned int)n; x++, p_ptr += ldp-m) {
+      for (y=0; y<(unsigned int)m; y++, p_ptr++) {
+	__stdbrx(p_ptr, *p_ptr);
+      }
+    }
+#endif
+  } else {
+    /* Write the result into the c matrix.
+     */
+    for (i=0; i<(unsigned int)k; i++, a+=lda) {
+      a_ptr = (unsigned long long *)a;
+      for (y=0; y<(unsigned int)m; y++) {
+	a_val = hpl_accel_byte_swap_load(a_ptr++);
+	for (x=0; x<(unsigned int)n; x++) {
+	  c[INDEX_BLK(y+blk_row,x+blk_col,ldc)] -= a_val * b[INDEX_BLK(i,x+blk_col,ldb)];
+	}
+      }
+    }
+  }
+}
+
+
+void hpl_accel_dgemm_CL_B_B_CL(int m, int n, int k,
+			       const double *a, int lda,
+			       const double *b, int ldb,
+			       double *c, int ldc,
+			       unsigned int blk_row, unsigned int blk_col,
+			       double *panel, int ldp,
+			       unsigned long long *incomplete)
+{
+  int n0;
+  int m0 = 0;
+  unsigned int cmd;
+  unsigned int idx;
+  unsigned int aligned, bc, br;
+  volatile hpl_accel_dgemm_parms_t *parms;
+
+  /* Do as much of the dgemm as possible using the blocked dgemm SPU specialist.
+   * This specialist assumes:
+   *   m	is at least M_SUB
+   *   n        is at least M_SUB
+   *   k        is equal to M
+   *   a	is quadword aligned. A multiple of 16 for optimal DMA performance
+   *   b	is quadword aligned. A multiple of 16 for optimal DMA performance
+   *   c	is quadword aligned. A multiple of 16 for optimal DMA performance
+   *   panel	is quadword aligned. A multiple of 16 for optimal DMA performance
+   *   lda	is even (qword aligned rows). A multiple of 16 for optimal DMA 
+   *            performance.
+   *   ldb	is even (qword aligned rows). A multiple of 16 for optimal DMA 
+   *            performance.
+   *   ldc	is even (qword aligned rows). A multiple of 16 for optimal DMA 
+   *            performance.
+   *   ldp	is even (qword aligned rows). A multiple of 16 for optimal DMA 
+   *            performance.
+   *   blk_col  is a multiple of M_SUB
+   *   blk_row  is a multiple of M_SUB
+   */
+
+  bc = blk_col/M_SUB;
+  br = blk_row/M_SUB;
+
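+  /* Step c and b forward to the first whole block touched by
+   * (blk_row, blk_col): in this block-storage layout each M_SUB x M_SUB
+   * block occupies M_SUB*M_SUB contiguous doubles and each column of blocks
+   * occupies ldc (resp. ldb) doubles, so after the adjustment only the
+   * residual intra-block offsets remain in blk_row and blk_col.
+   */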
+  c += (ldc * bc) + br*(M_SUB*M_SUB);
+  b += (ldb * bc);
+
+  blk_col %= M_SUB;
+  blk_row %= M_SUB;
+
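+  /* Fold every alignment requirement into one word: any set bit means a
+   * requirement is violated. The residual block offsets must be zero, the
+   * leading dimensions must be even, and all four pointers must be 16-byte
+   * (quadword) aligned. For example, a hypothetical pointer ending in 0x08
+   * leaves 0x8 in the low nibble and forces the synchronous fallback path.
+   */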
+  aligned = (blk_row | blk_col | 
+	     ((unsigned int)(lda | ldb | ldc | ldp) & 1) |
+	     ((unsigned int)((uintptr_t)a  | (uintptr_t)b | (uintptr_t)c  | (uintptr_t)panel) & (16-1)));
+
+
+  if ((m >= M_SUB) && (n >= M_SUB) && (k == M) && (aligned == 0)) {
+    /* Either all or a portion of the computation can be done by the SPE accelerators.
+     */
+    m0 = (m/M_SUB) * M_SUB;
+    n0 = (n/M_SUB) * M_SUB;
+    
+    /* Verify 4GB boundary expectation.
+     */
+    VALIDATE_PANEL_4GB_CROSSING(a, k, lda);
+    VALIDATE_MATRIX_4GB_CROSSING(b,  k, n0, ldb);
+    VALIDATE_MATRIX_4GB_CROSSING(c, m0, n0, ldc);
+    VALIDATE_PANEL_4GB_CROSSING(panel, n0, ldp);
+
+    idx = hpl_accel_cmd_idx;
+ 
+    parms = (volatile hpl_accel_dgemm_parms_t *)(&hpl_accel_cmd_queue[idx]);
+
+    /* Place the parameters into a command queue buffer
+     */
+    parms->a = a;
+    parms->b = b;
+    parms->c = c;
+    parms->lda = lda * sizeof(double);
+    parms->ldb = ldb * sizeof(double);
+    parms->ldc = ldc * sizeof(double);
+    parms->n = n0 / M_SUB;
+    parms->m = m0 / M_SUB;
+    parms->b_blk = -1;
+    parms->incomplete = incomplete;
+    COMPUTE_PANEL_4GB_CROSSING_COUNT(parms->a_count, a, lda, M);
+    COMPUTE_PANEL_4GB_CROSSING_COUNT(parms->p_count, panel, ldp, n);
+
+    init_incomplete(incomplete, HPL_ACCEL_SPES);
+
+    /* Advance the command queue index for the next command.
+     */
+    hpl_accel_cmd_idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES;
+    
+    if (panel) {
+      parms->p = panel;
+      parms->ldp = ldp * sizeof(double);
+      cmd = HPL_ACCEL_CMD_DGEMM_PANEL;
+    } else {
+      cmd = HPL_ACCEL_CMD_DGEMM;      
+    }
+
+    /* Perform a sync in order to ensure that the parameters are written to 
+     * memory before writing to the mailbox command queue.
+     */
+    __sync();
+
+    send_cmd_to_spes(cmd, idx, HPL_ACCEL_SPES);
+
+    /* Complete any remaining portion on the right side; that is, when n is not
+     * a multiple of M_SUB.
+     */
+    if (n0 < n) {
+      _dgemm_CL_B_B_CL(m0, n-n0, k, a, lda, b, ldb, c, ldc, blk_row, blk_col+n0, panel + n0*ldp, ldp);
+    }
+  } else {
+    /* This path is completely synchronous; therefore, clear incomplete.
+     */
+    if (incomplete) *incomplete = 0;
+  }
+
+  /* Clean up any remaining portion of the matrix that was not handled above.
+   */
+  if (m0 < m) {
+    _dgemm_CL_B_B_CL(m-m0, n, k, a+m0, lda, b, ldb, c, ldc, blk_row+m0, blk_col, ((panel) ? panel + m0 : panel), ldp);
+  }
+}
+
+
+void _dgemm_C_C_C(int m, int n, int k,
+		  const double *a, int lda,
+		  const double *b, int ldb,
+		  double *c, int ldc)
+{
+  unsigned int i;
+  unsigned int x, y;
+  double a_val;
+
+  for (i=0; i<(unsigned int)k; i++) {
+    for (y=0; y<(unsigned int)m; y++) {
+      a_val = a[INDEX_COL(y,i,lda)];
+      for (x=0; x<(unsigned int)n; x++) {
+        c[INDEX_COL(y,x,ldc)] -= a_val * b[INDEX_COL(i,x,ldb)];
+      }
+    }
+  }
+}
+
+void hpl_accel_dgemm_C_C_C(int m, int n, int k,
+			   const double *a, int lda,
+			   const double *b, int ldb,
+			   double *c, int ldc,
+			   unsigned long long *incomplete)
+{
+  int m0 = 0;
+  int spes;
+  unsigned int cmd, idx, aligned;
+  volatile hpl_accel_dgemm_parms_t *parms;
+
+  /* Do as much of the dgemm as possible using the column-ordered dgemm SPU specialist.
+   * This specialist assumes:
+   *   k        is a multiple of 4 and less than or equal to 64
+   *   m	is a multiple of 8
+   *   n        is a multiple of 4
+   *   a	is quadword aligned. A multiple of 16 for optimal DMA performance
+   *   b	is quadword aligned. A multiple of 16 for optimal DMA performance
+   *   c	is quadword aligned. A multiple of 16 for optimal DMA performance
+   *   lda	is even (qword aligned cols). A multiple of 16 for optimal DMA 
+   *            performance.
+   *   ldb	is even (qword aligned cols). A multiple of 16 for optimal DMA 
+   *            performance.
+   *   ldc	is even (qword aligned cols). A multiple of 16 for optimal DMA 
+   *            performance.
+   */
+
+  aligned = (((unsigned int)(lda | ldb | ldc) & 1) |
+	     ((unsigned int)((uintptr_t)a  | (uintptr_t)b | (uintptr_t)c) & (16-1)));
+
+  if ((m >= 8) &&  (k <= 64) && (((k & (4-1)) | (n & (4-1))) == 0) && (aligned == 0)) {
+    /* Either all or a portion of the computation can be done by the SPE accelerators.
+     */
+    m0 = (m/8) * 8;
+
+    /* Verify 4GB boundary expectation.
+     */
+    VALIDATE_PANEL_4GB_CROSSING(a, k, lda);
+    VALIDATE_PANEL_4GB_CROSSING(c, n, ldc);
+
+    idx = hpl_accel_cmd_idx;
+    parms = (volatile hpl_accel_dgemm_parms_t *)(&hpl_accel_cmd_queue[idx]);
+
+    /* Place the parameters into a command queue buffer
+     */
+    parms->a = a;
+    parms->b = b;
+    parms->c = c;
+    parms->lda = lda * sizeof(double);
+    parms->ldb = ldb * sizeof(double);
+    parms->ldc = ldc * sizeof(double);
+    parms->n = n;
+    parms->m = m0;
+    parms->k = k;
+    parms->incomplete = incomplete;
+
+    /* Compute the number of SPEs to deploy. Each SPE will need to compute
+     * at least one M_SUB-high block.
+     */
+    spes = (m + (M_SUB-1)) / M_SUB;
+    if (spes > HPL_ACCEL_SPES) spes = HPL_ACCEL_SPES;
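+    /* For example (hypothetical sizes): m = 1000 with M_SUB = 64 gives
+     * ceil(1000/64) = 16 candidate blocks, clamped to the HPL_ACCEL_SPES (8)
+     * available SPEs.
+     */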
+
+    init_incomplete(incomplete, spes);
+
+    /* Advance the command queue index for the next command.
+     */
+    hpl_accel_cmd_idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES;
+    cmd = HPL_ACCEL_CMD_DGEMM_C_C_C;
+
+    /* Perform a sync in order to ensure that the parameters are written to 
+     * memory before writing to the mailbox command queue.
+     */
+    __sync();
+
+    send_cmd_to_spes(cmd, idx, spes);
+
+  } else {
+    /* This path is completely synchronous; therefore, clear incomplete.
+     */
+    if (incomplete) *incomplete = 0;
+  }
+
+  /* Clean up any remaining portion of the matrix that was not handled above.
+   */
+  if (m0 < m) {
+    _dgemm_C_C_C(m-m0, n, k, a+m0, lda, b, ldb, c+m0, ldc);
+  }
+}
Index: accel/lib/hpl_accel_dtrsm.c
===================================================================
RCS file: accel/lib/hpl_accel_dtrsm.c
diff -N accel/lib/hpl_accel_dtrsm.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/hpl_accel_dtrsm.c	20 Aug 2008 03:57:53 -0000	1.5
@@ -0,0 +1,250 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+#include <stdint.h>
+#include <assert.h>
+#include "hpl_accel.h"
+#include "hpl_accel_spu.h"
+#include "hpl_accel_global.h"
+
+#include <ppu_intrinsics.h>
+
+
+
+void hpl_accel_dtrsm_CL_R_B(int m, int n, 
+			    const double *a, int lda, 
+			    double *b, int ldb,
+			    double *c, int ldc,
+			    unsigned int blk_row, unsigned int blk_col,
+			    unsigned long long *incomplete)
+{
+  int spes;
+  int spans;
+  int n0 = 0;
+  unsigned int idx;
+  unsigned int aligned;
+  unsigned int cmd;
+  volatile hpl_accel_dtrsm_parms_t *parms;
+
+  /* Do as much of the dtrsm as possible using the dtrsm SPU specialist.
+   * This specialist assumes:
+   *   m	is equal to M (128).
+   *   n        is a multiple of 16.
+   *   a	is quadword aligned. A multiple of 16 for optimal DMA performance
+   *   b	is quadword aligned. A multiple of 16 for optimal DMA performance
+   *   c	is quadword aligned. A multiple of 16 for optimal DMA performance
+   *   lda	is even (qword aligned rows). A multiple of 16 for optimal DMA 
+   *            performance.
+   *   ldb	is even (qword aligned rows). A multiple of 16 for optimal DMA 
+   *            performance.
+   *   ldc	is even (qword aligned rows). A multiple of 16 for optimal DMA 
+   *            performance.
+   */
+  c += (blk_row * M_SUB) + ldc*(blk_col / M_SUB);
+  blk_col %= M_SUB;
+
+  aligned = (((unsigned int)(lda | ldb | ldc) & 1) | (blk_col & 15) |
+	     ((unsigned int)((uintptr_t)a  | (uintptr_t)b | (uintptr_t)c) & (16-1)));
+
+
+  if ((m == M) && (n > 15) && (aligned == 0)) {
+    /* Either all or a portion of the computation can be done by the SPE accelerators.
+     */
+    spans = n/16;
+    n0 = spans * 16;
+
+    /* Verify 4GB boundary expectation.
+     */
+    VALIDATE_PANEL_4GB_CROSSING(a, m, lda);
+    VALIDATE_PANEL_4GB_CROSSING(b, m, ldb);
+    VALIDATE_MATRIX_4GB_CROSSING(c, m, n0, ldc);
+
+    idx = hpl_accel_cmd_idx;
+ 
+    parms = (volatile hpl_accel_dtrsm_parms_t *)(&hpl_accel_cmd_queue[idx]);
+
+    /* Place the parameters into a command queue buffer
+     */
+    parms->a = a;
+    parms->b = b;
+    parms->c = c;
+    parms->lda = lda * sizeof(double);
+    parms->ldb = ldb * sizeof(double);
+    parms->ldc = ldc * sizeof(double);
+    parms->n = n0;
+    parms->m = m / M;
+    parms->blk_col = blk_col / 16;
+    parms->incomplete = incomplete;
+    COMPUTE_PANEL_4GB_CROSSING_COUNT(parms->a_count, a, lda, M);
+    COMPUTE_PANEL_4GB_CROSSING_COUNT(parms->b_count, b, ldb, M);
+
+    spes = (spans < HPL_ACCEL_SPES) ? spans : HPL_ACCEL_SPES;
+
+    init_incomplete(incomplete, spes);
+
+    /* Perform a sync in order to ensure that the parameters are written to 
+     * memory before writing to the mailbox command queue.
+     */
+    __sync();
+
+    /* Send the command to each of the SPEs.
+     */
+    hpl_accel_cmd_idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES;
+
+    cmd = (c == NULL) ? HPL_ACCEL_CMD_DTRSM : HPL_ACCEL_CMD_DTRSM_PANEL;
+
+    send_cmd_to_spes(cmd, idx, spes);
+  } else {
+    /* This path is completely synchronous; therefore, clear incomplete.
+     */
+    if (incomplete) *incomplete = 0;
+  }
+
+  /* Clean up any remaining portion of the matrix that was not handled above.
+   */
+  if (n0 < n) {
+    unsigned int i, x, y;
+    unsigned long long *a_ptr;
+    double a_val;
+    double *b_next;
+   
+    a_ptr = (unsigned long long *)a;
+    if (c) {
+      /* Perform DTRSM cleanup into a block format matrix row.
+       */
+      for (x=n0; x<(unsigned int)n; x++) {
+	c[INDEX_BLK(0, x+blk_col, ldc)] = b[INDEX_ROW(0, x, ldb)];
+      }
+      /* y == 1 */
+      a_ptr++;
+      for (i=1; i<(unsigned int)m; i++) {
+	a_val = hpl_accel_byte_swap_load(a_ptr++);
+	for (x=n0; x<(unsigned int)n; x++) {
+	  c[INDEX_BLK(i, x+blk_col, ldc)] = b[INDEX_ROW(i, x, ldb)] - b[INDEX_ROW(0, x, ldb)] * a_val;
+	}
+      }
+      a_ptr += (lda - m);
+
+      /* y > 1 
+       */
+      for (y=2; y<(unsigned int)m; y++) {
+	a_ptr += y;
+	for (i=y; i<(unsigned int)m; i++) {
+	  a_val = hpl_accel_byte_swap_load(a_ptr++);
+	  for (x=n0; x<(unsigned int)n; x++) {
+	    c[INDEX_BLK(i, x+blk_col, ldc)] -= c[INDEX_BLK(y-1, x+blk_col, ldc)] * a_val;
+	  }
+	}
+	a_ptr += (lda - m);
+      }
+    } else {
+      /* Perform DTRSM cleanup into [b]
+       */
+      for (y=1; y<(unsigned int)m; y++, b+=ldb) {
+	a_ptr += y;
+	b_next = b+ldb;
+	for (i=y; i<(unsigned int)m; i++) {
+	  a_val = hpl_accel_byte_swap_load(a_ptr++);
+	  for (x=n0; x<(unsigned int)n; x++) {
+	    b_next[x] -= b[x] * a_val;
+	  }
+	  b_next += ldb;
+	}
+	a_ptr += (lda - m);
+      }
+    }
+  }
+}
+
+
+
+void hpl_accel_dtrsm_CL_B(int m, int n, 
+			  const double *a, int lda, 
+			  double *b, int ldb,
+			  unsigned int blk_row, unsigned int blk_col,
+			  unsigned long long *incomplete)
+{
+  int spes;
+  int spans;
+  int n0 = 0;
+  unsigned int i, x, y;
+  unsigned int idx;
+  unsigned int aligned;
+  volatile hpl_accel_dtrsm_parms_t *parms;
+
+  /* Do as much of the dtrsm as possible using the dtrsm SPU specialist.
+   * This specialist assumes:
+   *   m	is equal to M (128).
+   *   n        is a multiple of 16.
+   *   a	is quadword aligned. A multiple of 16 for optimal DMA performance
+   *   b	is quadword aligned. A multiple of 16 for optimal DMA performance
+   *   lda	is even (qword aligned rows). A multiple of 16 for optimal DMA 
+   *            performance.
+   *   ldb	is even (qword aligned rows). A multiple of 16 for optimal DMA 
+   *            performance.
+   */
+  b += (blk_row * M_SUB) + ldb*(blk_col / M_SUB);
+  blk_col %= M_SUB;
+
+  aligned = (((unsigned int)(lda | ldb) & 1) | (blk_col & 15) |
+	     ((unsigned int)((uintptr_t)a  | (uintptr_t)b) & (16-1)));
+
+
+  if ((m == M) && (n > 15) && (aligned == 0)) {
+    /* Either all or a portion of the computation can be done by the SPE accelerators.
+     */
+    spans = n/16;
+    n0 = spans * 16;
+
+    /* Verify 4GB boundary expectation.
+     */
+    VALIDATE_PANEL_4GB_CROSSING(a, m, lda);
+    VALIDATE_MATRIX_4GB_CROSSING(b, m, n0, ldb);
+
+    idx = hpl_accel_cmd_idx;
+ 
+    parms = (volatile hpl_accel_dtrsm_parms_t *)(&hpl_accel_cmd_queue[idx]);
+
+    /* Place the parameters into a command queue buffer
+     */
+    parms->a = a;
+    parms->b = b;
+    parms->lda = lda * sizeof(double);
+    parms->ldb = ldb * sizeof(double);
+    parms->n = n0;
+    parms->m = m / M;
+    parms->blk_col = blk_col / 16;
+    parms->incomplete = incomplete;
+    COMPUTE_PANEL_4GB_CROSSING_COUNT(parms->a_count, a, lda, M);
+
+    spes = (spans < HPL_ACCEL_SPES) ? spans : HPL_ACCEL_SPES;
+
+    init_incomplete(incomplete, spes);
+
+    /* Perform a sync in order to ensure that the parameters are written to 
+     * memory before writing to the mailbox command queue.
+     */
+    __sync();
+
+    /* Send the command to each of the SPEs.
+     */
+    hpl_accel_cmd_idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES;
+
+    send_cmd_to_spes(HPL_ACCEL_CMD_DTRSM_CL_B, idx, spes);
+  } else {
+    /* This path is completely synchronous; therefore, clear incomplete.
+     */
+    if (incomplete) *incomplete = 0;
+  }
+
+  /* Clean up any remaining portion of the matrix that was not handled above.
+   */
+  for (x=n0; x<(unsigned int)n; x++) {
+    for (y=1; y<(unsigned int)m; y++) {
+      for (i=y; i<(unsigned int)m; i++) {
+	b[INDEX_BLK(i, x+blk_col, ldb)] -= b[INDEX_BLK(y-1, x+blk_col, ldb)] * hpl_accel_byte_swap(a[INDEX_COL(i, y-1, lda)]);
+      }
+    }
+  }
+}
Index: accel/lib/hpl_accel_global.c
===================================================================
RCS file: accel/lib/hpl_accel_global.c
diff -N accel/lib/hpl_accel_global.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/hpl_accel_global.c	20 Aug 2008 03:57:53 -0000	1.3
@@ -0,0 +1,19 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+#include "hpl_accel_spu.h"
+#include "hpl_accel_global.h"
+
+
+/* SPE Thread Info
+ */
+int hpl_accel_initialized = 0;
+hpl_accel_thread_info_t hpl_accel_threads[HPL_ACCEL_SPES];
+
+
+/* SPE Command Queue 
+ */
+unsigned int hpl_accel_cmd_idx = 0;
+hpl_accel_cmd_entry_t hpl_accel_cmd_queue[HPL_ACCEL_CMD_ENTRIES];
+
Index: accel/lib/hpl_accel_global.h
===================================================================
RCS file: accel/lib/hpl_accel_global.h
diff -N accel/lib/hpl_accel_global.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/hpl_accel_global.h	20 Aug 2008 03:57:53 -0000	1.3
@@ -0,0 +1,34 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+#include <libspe2.h>
+#include <pthread.h>
+#include "hpl_accel_spu.h"
+
+#ifndef _HPL_ACCEL_GLOBAL_H_
+#define _HPL_ACCEL_GLOBAL_H_
+
+#define HPL_ACCEL_CMD_ENTRIES	8	/* number of command queue entries */
+
+
+typedef struct hpl_accel_thread_info {
+  spe_context_ptr_t id;
+  pthread_t pthread;
+  spe_spu_control_area_t *ctl_area;	// pointer to control ps area
+  int in_cnt;				// inbound mailbox available element count
+  struct hpl_accel_init_parms *init_parms;
+} hpl_accel_thread_info_t;
+
+
+typedef struct hpl_accel_cmd_entry {
+  unsigned char parms[128] __attribute__ ((aligned (128)));
+} hpl_accel_cmd_entry_t;
+
+
+extern int hpl_accel_initialized;
+extern hpl_accel_thread_info_t hpl_accel_threads[HPL_ACCEL_SPES];
+extern unsigned int hpl_accel_cmd_idx;
+extern hpl_accel_cmd_entry_t hpl_accel_cmd_queue[HPL_ACCEL_CMD_ENTRIES];
+
+#endif /* _HPL_ACCEL_GLOBAL_H_ */
Index: accel/lib/hpl_accel_init.c
===================================================================
RCS file: accel/lib/hpl_accel_init.c
diff -N accel/lib/hpl_accel_init.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/hpl_accel_init.c	20 Aug 2008 03:57:53 -0000	1.3
@@ -0,0 +1,112 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <libspe2.h>
+#include <pthread.h>
+#include "hpl_accel.h"
+#include "hpl_accel_global.h"
+#include "hpl_accel_spu.h"
+
+static hpl_accel_init_parms_t init_parms[HPL_ACCEL_SPES];
+
+static void *ppu_pthread_function(void *arg) {
+  hpl_accel_thread_info_t *info;
+  unsigned int entry = SPE_DEFAULT_ENTRY;
+
+  info = (hpl_accel_thread_info_t *)arg;
+
+  if (spe_context_run(info->id, &entry, 0, (void *)(info->init_parms), NULL, NULL) < 0) {
+    perror("Failed running context");
+    exit (1);
+  }
+  pthread_exit(NULL);
+}
+
+extern spe_program_handle_t hpl_accel_spu;
+
+
+int hpl_accel_init()
+{
+  int i;
+
+  if (!hpl_accel_initialized) {
+
+    /* Create each of the SPU threads
+     */ 
+    for (i=0; i<HPL_ACCEL_SPES; i++) {
+      /* Create context */
+      if ((hpl_accel_threads[i].id = spe_context_create (SPE_MAP_PS, NULL)) == NULL) {
+	fprintf(stderr, "INTERNAL ERROR: failed to create spu context %d. Error = %s\n", (int)i, strerror(errno));
+	return HPL_ACCEL_INIT_FAIL;
+      }
+      /* Load program */
+      if ((spe_program_load (hpl_accel_threads[i].id, &hpl_accel_spu)) != 0) {
+	fprintf(stderr, "INTERNAL ERROR: failed to load program %d. Error = %s\n", (int)i, strerror(errno));
+	return HPL_ACCEL_INIT_FAIL;
+      }
+      /* Get problem state area pointers */
+      if ((hpl_accel_threads[i].ctl_area = (spe_spu_control_area_t *)spe_ps_area_get(hpl_accel_threads[i].id, SPE_CONTROL_AREA)) == NULL) {
+	fprintf(stderr, "INTERNAL ERROR: failed to get control problem state area for thread %d. Error = %s\n", (int)i, strerror(errno));
+	return HPL_ACCEL_INIT_FAIL;
+      }
+      hpl_accel_threads[i].init_parms = &init_parms[i];
+    }
+
+    /* Initialize the SPE parameter structure
+     */
+    for (i=0; i<HPL_ACCEL_SPES; i++) {
+      init_parms[i].id = i;
+      init_parms[i].cmd_base = (void *)hpl_accel_cmd_queue;
+    }
+
+    /* Start the SPE accelerator threads.
+     */
+    for (i=0; i<HPL_ACCEL_SPES; i++) {
+      /* Create thread */
+      if ((pthread_create (&hpl_accel_threads[i].pthread, NULL, &ppu_pthread_function, &hpl_accel_threads[i])) != 0) {
+	fprintf(stderr, "INTERNAL ERROR: failed to create pthread %d. Error = %s\n", (int)i, strerror(errno));
+	return HPL_ACCEL_INIT_FAIL;
+      }
+    }
+    
+    hpl_accel_initialized = 1;
+  }
+
+  return HPL_ACCEL_INIT_SUCCESS;
+}
+
+
+
+int hpl_accel_fini()
+{
+  int i;
+  unsigned int idx;
+
+  /* Do nothing unless it was previously initialized 
+   */
+  if (hpl_accel_initialized) {
+
+    hpl_accel_initialized = 0;
+
+    idx = hpl_accel_cmd_idx;
+ 
+    /* Send the command to each of the SPEs.
+     */
+    hpl_accel_cmd_idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES;
+
+    send_cmd_to_spes(HPL_ACCEL_CMD_FINI, idx, HPL_ACCEL_SPES);
+    
+    for (i=0; i<HPL_ACCEL_SPES; i++) {
+      if ((pthread_join(hpl_accel_threads[i].pthread, NULL) != 0)) {
+	fprintf(stderr, "INTERNAL ERROR: failed to join pthread %d. Error = %s\n", (int)i, strerror(errno));
+	return HPL_ACCEL_INIT_FAIL;
+      }
+    }
+  }
+
+  return HPL_ACCEL_FINI_SUCCESS;
+}
Index: accel/lib/hpl_accel_reform.c
===================================================================
RCS file: accel/lib/hpl_accel_reform.c
diff -N accel/lib/hpl_accel_reform.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/hpl_accel_reform.c	20 Aug 2008 03:57:53 -0000	1.5
@@ -0,0 +1,526 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#include <stdint.h>
+#include <assert.h>
+#include "hpl_accel.h"
+#include "hpl_accel_spu.h"
+#include "ppu_intrinsics.h"
+
+/* General-purpose reference reformatting facilities.
+ */
+void hpl_accel_reform_panel_CL_to_B(int m, int n,
+				    double *a, int lda,
+				    double *panel, int ldp,
+				    unsigned long long *incomplete)
+{
+  hpl_ref_reform_panel_CL_to_B(m, n, a, lda, panel, ldp, incomplete);
+}
+
+
+void hpl_accel_reform_matrix_CL_to_B(int m, int n, 
+				     double *a, int lda, 
+				     double *scratch, int size,
+				     unsigned long long *incomplete)
+
+{
+  unsigned int idx;
+  int spes;
+  int m_padded;
+  volatile hpl_accel_reform_matrix_CL_to_B_parms_t *parms;
+
+  m_padded = ((m + M_SUB-1)/M_SUB)*M_SUB;
+
+  /* Assert that the parameter restrictions are not violated.
+   *   n	 Must be an integral multiple of 64.
+   *   a         Must be quadword aligned.
+   *   lda       Must be even and at least roundup(m,64).
+   *   scratch   Must be quadword aligned and must not straddle 4GB boundary.
+   *   size      Must be at least 64*roundup(m,64).
+   */
+  assert((n % M_SUB) == 0);
+  assert(lda >= m_padded);
+  assert(size >= (m_padded-4)*M_SUB);
+
+ /* Assert that the parameters also conform to the desired performance restrictions:
+  *   a         Must be cacheline aligned.
+  *   lda       Must be a multiple of 16.
+  *   scratch   Must be cacheline aligned.
+  *   size      Must be at least 4*64*m for optimal performance.
+  */
+  assert(((uintptr_t)a & (uintptr_t)127) == (uintptr_t)0);
+  assert(((uintptr_t)scratch & (uintptr_t)127) == (uintptr_t)0);
+  assert((lda & 15) == 0);
+
+
+  /* Verify 4GB boundary expectation.
+   */
+  VALIDATE_PANEL_4GB_CROSSING(scratch, 1, size);
+  VALIDATE_MATRIX_4GB_CROSSING(a, m, n, lda*M_SUB);
+
+  idx = hpl_accel_cmd_idx;
+ 
+  parms = (volatile hpl_accel_reform_matrix_CL_to_B_parms_t *)(&hpl_accel_cmd_queue[idx]);
+
+  /* Compute the number of SPEs to deploy 
+   */
+  spes = size / ((m_padded-4) * M_SUB);
+  if (spes > HPL_ACCEL_SPES) spes = HPL_ACCEL_SPES;
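+  /* Each deployed SPE requires its own (m_padded-4)*M_SUB-double slice of
+   * the scratch buffer, so the scratch size bounds the usable SPE count
+   * before the HPL_ACCEL_SPES clamp is applied.
+   */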
+
+  /* Place the parameters into a command queue buffer
+   */
+  parms->a = a;
+  parms->scratch = scratch;
+  parms->lda = lda * sizeof(double);
+  parms->n = n;
+  parms->m = m;
+  parms->spes = spes;
+  parms->incomplete = incomplete;
+
+  init_incomplete(incomplete, spes);
+
+  /* Perform a sync in order to ensure that the parameters are written 
+   * to memory before writing to the mailbox command queue.
+   */
+  __sync();
+
+  /* Send the command to each of the SPEs.
+   */
+  hpl_accel_cmd_idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES;
+
+  send_cmd_to_spes(HPL_ACCEL_CMD_REFORM_MATRIX_CL_TO_B, idx, spes);
+}
+
+
+
+
+
+void hpl_accel_reform_panel_B_to_CL(int m, int n,
+				    double *panel, int ldp,
+				    double *a, int lda,
+				    unsigned long long *incomplete)
+{
+  unsigned int idx;
+  volatile hpl_accel_reform_panel_parms_t *parms;
+
+  /* Assert that the parameter restrictions are not violated.
+   *   m	 Must be an integral multiple of 64.
+   *   n	 Must be at least 1.
+   *   panel     Must be quadword aligned and buffer may not straddle 4GB boundary.
+   *   ldp       Must be at least m.
+   *   a         Must be quadword aligned and buffer may not straddle 4GB boundary.
+   *   lda       Must be at least m*M_SUB.
+   */
+  assert((m % M_SUB) == 0);
+  assert(n > 0);
+  assert(ldp >= m);
+  assert(lda >= m*M_SUB);
+
+  VALIDATE_PANEL_4GB_CROSSING(panel, n, ldp);
+  VALIDATE_MATRIX_4GB_CROSSING(a, m, n, lda);
+
+ /* Assert that the parameters also conform to the desired performance restrictions:
+  *   a         Must be cacheline aligned.
+  *   lda       Must be a multiple of 16.
+  *   panel	Must be cacheline aligned.
+  *   ldp	Must be a multiple of 16.
+  */
+  assert(((uintptr_t)a & (uintptr_t)127) == (uintptr_t)0);
+  assert(((uintptr_t)panel & (uintptr_t)127) == (uintptr_t)0);
+  assert((lda & 15) == 0);
+  assert((ldp & 15) == 0);
+
+  idx = hpl_accel_cmd_idx;
+ 
+  parms = (volatile hpl_accel_reform_panel_parms_t *)(&hpl_accel_cmd_queue[idx]);
+
+  /* Place the parameters into a command queue buffer
+   */
+  parms->n = n;
+  parms->m = m;
+  parms->a = a;
+  parms->lda = lda * sizeof(double);
+  parms->panel = panel;
+  parms->ldp = ldp * sizeof(double);
+  parms->incomplete = incomplete;
+  COMPUTE_PANEL_4GB_CROSSING_COUNT(parms->p_count, panel, ldp, n);
+
+  init_incomplete(incomplete, HPL_ACCEL_REFORM_SPES);
+
+  /* Perform a sync in order to ensure that the parameters are written 
+   * to memory before writing to the mailbox command queue.
+   */
+  __sync();
+
+  /* Send the command to each of the SPEs.
+   */
+  hpl_accel_cmd_idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES;
+
+  send_cmd_to_spes(HPL_ACCEL_CMD_REFORM_PANEL_B_TO_CL, idx, HPL_ACCEL_REFORM_SPES);
+}
+
+
+void hpl_accel_reform_panel_R_to_B(int m, int n,
+				   double *a, int lda,
+				   double *panel, int ldp, 
+				   unsigned long long *incomplete)
+{
+  unsigned int idx;
+  volatile hpl_accel_reform_panel_parms_t *parms;
+
+  /* Assert that the parameter restrictions are not violated.
+   *   panel     Must be quadword aligned and buffer may not straddle 4GB boundary.
+   *   ldp       Must be even and at least n.
+   *   a         Must be quadword aligned and buffer may not straddle 4GB boundary.
+   *   lda       Must be at least m*M_SUB.
+   */
+  assert((ldp & 1) == 0);
+  assert(ldp >= n);
+  assert(lda >= m*M_SUB);
+
+  VALIDATE_PANEL_4GB_CROSSING(panel, m, ldp);
+  VALIDATE_MATRIX_4GB_CROSSING(a, m, n, lda);
+
+ /* Assert that the parameters also conform to the desired performance restrictions:
+  *   a         Must be cacheline aligned.
+  *   lda       Must be a multiple of 16.
+  *   panel	Must be cacheline aligned.
+  *   ldp	Must be a multiple of 16.
+  */
+  assert(((uintptr_t)a & (uintptr_t)127) == (uintptr_t)0);
+  assert(((uintptr_t)panel & (uintptr_t)127) == (uintptr_t)0);
+  assert((lda & 15) == 0);
+  assert((ldp & 15) == 0);
+
+  idx = hpl_accel_cmd_idx;
+ 
+  parms = (volatile hpl_accel_reform_panel_parms_t *)(&hpl_accel_cmd_queue[idx]);
+
+  /* Place the parameters into a command queue buffer
+   */
+  parms->n = n;
+  parms->m = m;
+  parms->a = a;
+  parms->lda = lda * sizeof(double);
+  parms->panel = panel;
+  parms->ldp = ldp * sizeof(double);
+  parms->incomplete = incomplete;
+  COMPUTE_PANEL_4GB_CROSSING_COUNT(parms->p_count, panel, ldp, n);
+
+  init_incomplete(incomplete, HPL_ACCEL_SPES);
+
+  /* Perform a sync in order to ensure that the parameters are written 
+   * to memory before writing to the mailbox command queue.
+   */
+  __sync();
+
+  /* Send the command to each of the SPEs.
+   */
+  hpl_accel_cmd_idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES;
+
+  send_cmd_to_spes(HPL_ACCEL_CMD_REFORM_PANEL_R_TO_B, idx, HPL_ACCEL_SPES);
+}
+
+
+void hpl_accel_reform_rows_R_to_B(int m, int n,
+				  double *rows, int ldr,
+				  double *a, int lda,
+				  int *blk_rows, int blk_col,
+				  unsigned long long *incomplete)
+{
+  int i;
+  unsigned int non_aligned;
+  int n0 = 0;
+
+
+  a += (blk_col/M_SUB) * lda;
+  blk_col %= M_SUB;
+
+  non_aligned = (((unsigned int)(blk_col | lda | ldr) & 1) | 
+		 (((unsigned int)(uintptr_t)a | (uintptr_t)rows) & (16-1)));
+							   
+  if ((non_aligned == 0) && (n > 1)) {
+    int m_left;
+    int rows_per_block;
+    int *blk_row_ptr;
+    double *rows_ptr;
+    unsigned int idx;
+    volatile hpl_accel_reform_rows_parms_t *parms;
+
+    /* Assert that the rows buffer does not span a 4GB boundary.
+     */
+    assert((((uintptr_t)rows) >> 32) == ((uintptr_t)(rows + m*ldr - 1) >> 32));
+
+    VALIDATE_MATRIX_4GB_CROSSING(a, ((lda/M_SUB) & ~(M_SUB-1)), n, lda);
+
+    n0 = n & ~1;
+
+    idx = hpl_accel_cmd_idx;
+
+    m_left = m;
+    blk_row_ptr = blk_rows;
+    rows_ptr = rows;
+ 
+    /* Generate multiple command requests if the number of rows 
+     * is greater than what will fit in a single command request.
+     */
+    rows_per_block = (int)(sizeof(parms->blk_rows) / sizeof(int));
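+    /* With blk_rows declared as int blk_rows[5*4] in the command parameter
+     * structure, this evaluates to 20 rows per command request.
+     */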
+
+    while (m_left > rows_per_block) {
+      parms = (volatile hpl_accel_reform_rows_parms_t *)(&hpl_accel_cmd_queue[idx]);
+
+      parms->m = rows_per_block;
+      parms->n = n0;
+      parms->rows = rows_ptr;
+      parms->ldr = ldr * sizeof(double);
+      parms->a = a;
+      parms->lda = lda * sizeof(double);
+      parms->blk_col = blk_col;
+      
+      parms->incomplete = NULL;
+      for (i=0; i<rows_per_block; i++) parms->blk_rows[i] = blk_row_ptr[i];
+
+      /* Perform a sync in order to ensure that the parameters are written 
+       * to memory before writing to the mailbox command queue.
+       */
+      __sync();
+    
+      /* Send the command to each of the SPEs.
+       */
+      send_cmd_to_spes(HPL_ACCEL_CMD_REFORM_ROWS_R_TO_B, idx, HPL_ACCEL_SPES);
+      
+      idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES;
+      
+      m_left -= rows_per_block;
+      rows_ptr += rows_per_block * ldr;
+      blk_row_ptr += rows_per_block;
+    }
+    
+    if (m_left > 0) {
+      parms = (volatile hpl_accel_reform_rows_parms_t *)(&hpl_accel_cmd_queue[idx]);
+    
+      parms->m = m_left;
+      parms->n = n0;
+      parms->rows = rows_ptr;
+      parms->ldr = ldr * sizeof(double);
+      parms->a = a;
+      parms->lda = lda * sizeof(double);
+      parms->blk_col = blk_col;
+    
+      parms->incomplete = incomplete;
+      for (i=0; i<m_left; i++) parms->blk_rows[i] = blk_row_ptr[i];
+
+      init_incomplete(incomplete, HPL_ACCEL_SPES);
+
+      /* Perform a sync in order to ensure that the parameters are written 
+       * to memory before writing to the mailbox command queue.
+       */
+      __sync();
+    
+      /* Send the command to each of the SPEs.
+       */
+      send_cmd_to_spes(HPL_ACCEL_CMD_REFORM_ROWS_R_TO_B, idx, HPL_ACCEL_SPES);
+
+      idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES;
+    } else {
+      if (incomplete) *incomplete = 0;
+    }
+    hpl_accel_cmd_idx = idx;
+  } else {
+    if (incomplete) *incomplete = 0;
+  }
+
+  /* Clean up portions of the rows not handled by the SPEs above.
+   */
+  if (n0 < n) {
+    unsigned int x, y, row;
+    int first_span, span, left;
+    double *src, *dst;
+
+    blk_col += n0;
+    rows += n0;
+    n -= n0;
+
+    a += (blk_col/M_SUB) * lda;
+    blk_col %= M_SUB;
+
+    first_span = M_SUB - blk_col;
+    if (first_span > n) first_span = n;
+
+    /* For each of the rows */
+    for (y=0; y<(unsigned int)m; y++) {
+      row = blk_rows[y];
+      dst = a + (row * M_SUB);
+      span = first_span;
+      left = n - first_span;
+
+      /* For each of the destination buffer block spans
+       */
+      src = rows;
+
+      for (x=0; x<(unsigned int)span; x++) dst[x+blk_col] = src[x];
+      while (left) {
+	dst += lda;
+	src += span;
+	span = (left > M_SUB) ? M_SUB : left;
+	for (x=0; x<(unsigned int)span; x++) dst[x] = src[x];
+	left -= span;
+      }
+      rows += ldr;
+    }
+  }
+}
+
+
+void hpl_accel_reform_rows_B_to_R(int m, int n,
+				 double *rows, int ldr,
+				 double *a, int lda,
+				 int *blk_rows, int blk_col,
+				 unsigned long long *incomplete)
+{
+  int i;
+  unsigned int non_aligned;
+  int n0 = 0;
+
+  a += (blk_col/M_SUB) * lda;
+  blk_col %= M_SUB;
+
+  non_aligned = (((unsigned int)(blk_col | lda | ldr) & 1) | 
+		 (((unsigned int)(uintptr_t)a | (uintptr_t)rows) & (16-1)));
+							   
+  if ((non_aligned == 0) && (n > 1)) {
+    int m_left;
+    int rows_per_block;
+    int *blk_row_ptr;
+    double *rows_ptr;
+    unsigned int idx;
+    volatile hpl_accel_reform_rows_parms_t *parms;
+
+    /* Assert that the rows buffer does not span a 4GB boundary.
+     */
+    assert((((uintptr_t)rows) >> 32) == ((uintptr_t)(rows + m*ldr - 1) >> 32));
+
+    VALIDATE_MATRIX_4GB_CROSSING(a, ((lda/M_SUB) & ~(M_SUB-1)), n, lda);
+
+    n0 = n & ~1;
+
+    idx = hpl_accel_cmd_idx;
+
+    m_left = m;
+    blk_row_ptr = blk_rows;
+    rows_ptr = rows;
+ 
+    /* Generate multiple command requests if the number of rows 
+     * is greater than what will fit in a single command request.
+     */
+    rows_per_block = (int)(sizeof(parms->blk_rows) / sizeof(int));
+
+    while (m_left > rows_per_block) {
+      parms = (volatile hpl_accel_reform_rows_parms_t *)(&hpl_accel_cmd_queue[idx]);
+
+      parms->m = rows_per_block;
+      parms->n = n0;
+      parms->rows = rows_ptr;
+      parms->ldr = ldr * sizeof(double);
+      parms->a = a;
+      parms->lda = lda * sizeof(double);
+      parms->blk_col = blk_col;
+      
+      parms->incomplete = NULL;
+      for (i=0; i<rows_per_block; i++) parms->blk_rows[i] = blk_row_ptr[i];
+
+      /* Perform a sync in order to ensure that the parameters are written 
+       * to memory before writing to the mailbox command queue.
+       */
+      __sync();
+    
+      /* Send the command to each of the SPEs.
+       */
+      send_cmd_to_spes(HPL_ACCEL_CMD_REFORM_ROWS_B_TO_R, idx, HPL_ACCEL_SPES);
+      
+      idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES;
+      
+      m_left -= rows_per_block;
+      rows_ptr += rows_per_block * ldr;
+      blk_row_ptr += rows_per_block;
+    }
+    
+    if (m_left > 0) {
+      parms = (volatile hpl_accel_reform_rows_parms_t *)(&hpl_accel_cmd_queue[idx]);
+    
+      parms->m = m_left;
+      parms->n = n0;
+      parms->rows = rows_ptr;
+      parms->ldr = ldr * sizeof(double);
+      parms->a = a;
+      parms->lda = lda * sizeof(double);
+      parms->blk_col = blk_col;
+    
+      parms->incomplete = incomplete;
+      for (i=0; i<m_left; i++) parms->blk_rows[i] = blk_row_ptr[i];
+
+      init_incomplete(incomplete, HPL_ACCEL_SPES);
+
+      /* Perform a sync in order to ensure that the parameters are written 
+       * to memory before writing to the mailbox command queue.
+       */
+      __sync();
+    
+      /* Send the command to each of the SPEs.
+       */
+      send_cmd_to_spes(HPL_ACCEL_CMD_REFORM_ROWS_B_TO_R, idx, HPL_ACCEL_SPES);
+
+      idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES;
+    } else {
+      if (incomplete) *incomplete = 0;
+    }
+    hpl_accel_cmd_idx = idx;
+  } else {
+    if (incomplete) *incomplete = 0;
+  }
+
+  /* Clean up portions of the rows not handled by the SPEs above.
+   */
+  if (n0 < n) {
+    unsigned int x, y, row;
+    int first_span, span, left;
+    double *src, *dst;
+
+    blk_col += n0;
+    rows += n0;
+    n -= n0;
+
+    a += (blk_col/M_SUB) * lda;
+    blk_col %= M_SUB;
+
+    first_span = M_SUB - blk_col;
+    if (first_span > n) first_span = n;
+
+    /* For each of the rows */
+    for (y=0; y<(unsigned int)m; y++) {
+      row = (unsigned int)blk_rows[y];
+      src = a + (row * M_SUB);
+      span = first_span;
+      left = n - first_span;
+
+      /* For each of the destination buffer block spans
+       */
+      dst = rows;
+
+      for (x=0; x<(unsigned int)span; x++) dst[x] = src[x+(unsigned int)blk_col];
+      while (left) {
+	src += lda;
+	dst += span;
+	span = (left > M_SUB) ? M_SUB : left;
+	for (x=0; x<(unsigned int)span; x++) dst[x] = src[x];
+	left -= span;
+      }
+      rows += ldr;
+    }
+  }
+}
+
Index: accel/lib/hpl_accel_spu.h
===================================================================
RCS file: accel/lib/hpl_accel_spu.h
diff -N accel/lib/hpl_accel_spu.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/hpl_accel_spu.h	23 Oct 2008 21:20:24 -0000	1.12
@@ -0,0 +1,417 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+/* This file contains definitions shared between the PPE and SPE 
+ */
+
+#ifndef _HPL_ACCEL_SPU_H_
+#define _HPL_ACCEL_SPU_H_
+
+#include "hpl_accel.h"
+#include <stdlib.h>
+#include <stdio.h>
+
+#define SUB	(2)		/* Number of sub-blocks per block (1 dim)*/
+#define M      	(SUB*M_SUB)	/* Size of the matrix block - M x M */
+#define SUB_SUB (SUB*SUB)	/* The number of sub-blocks per block */
+
+
+/* SPE Commands
+ */
+#define HPL_ACCEL_CMD_DGEMM			0
+#define HPL_ACCEL_CMD_DTRSM			1
+#define HPL_ACCEL_CMD_REFORM_MATRIX_CL_TO_B	2
+#define HPL_ACCEL_CMD_REFORM_PANEL_B_TO_CL	3
+#define HPL_ACCEL_CMD_REFORM_PANEL_R_TO_B	4
+#define HPL_ACCEL_CMD_DGEMM_PANEL		5
+#define HPL_ACCEL_CMD_REFORM_ROWS_R_TO_B	6
+#define HPL_ACCEL_CMD_REFORM_ROWS_B_TO_R	7
+#define HPL_ACCEL_CMD_FINI			8
+#define HPL_ACCEL_CMD_DTRSM_CL_B		9
+#define HPL_ACCEL_CMD_DTRSM_PANEL		10
+#define HPL_ACCEL_CMD_DGEMM_C_C_C		11
+#define HPL_ACCEL_CMD_SWAP_ROWS_B_TO_B		12
+#define HPL_ACCEL_CMD_COPY_ROWS_R_TO_R		13
+
+
+#define HPL_ACCEL_CMD_MASK			0x7F
+
+#define HPL_ACCEL_SPES				8	/* # of SPEs to use per accelerator */
+#define HPL_ACCEL_REFORM_SPES			4	/* # of SPEs to use during some reformat */
+#define HPL_ACCEL_PARM_TAG			31
+
+/* Function parameters */
+
+#ifdef __SPU__
+#include <spu_intrinsics.h>
+
+typedef struct hpl_accel_init_parms {
+  unsigned int id    __attribute__ ((aligned (16)));
+  unsigned long long cmd_base __attribute__ ((aligned (16)));
+} hpl_accel_init_parms_t;
+
+typedef struct hpl_accel_dgemm_parms {
+  unsigned long long a __attribute__ ((aligned (16)));
+  unsigned long long b __attribute__ ((aligned (16)));
+  unsigned long long c __attribute__ ((aligned (16)));
+  unsigned long long p __attribute__ ((aligned (16)));
+  vec_uint4 ld;		/* lda, ldb, ldc, ldp */
+  vec_uint4 dim;	/*   n,   m,   k, pad */
+  vec_uint4 flags;	/* b_blk, a_count, b_count, p_count */
+  unsigned long long incomplete __attribute__ ((aligned (16)));
+} hpl_accel_dgemm_parms_t;
+
+typedef struct hpl_accel_dtrsm_parms {
+  unsigned long long a __attribute__ ((aligned (16)));
+  unsigned long long b __attribute__ ((aligned (16)));
+  unsigned long long c __attribute__ ((aligned (16)));
+  vec_uint4 ld;		/* lda, ldb, ldc, pad */
+  vec_uint4 dim;	/*   n,   m, a_count, b_count */
+  vec_uint4 blk_col;
+  unsigned long long incomplete __attribute__ ((aligned (16)));
+} hpl_accel_dtrsm_parms_t;
+
+typedef struct hpl_accel_reform_matrix_CL_to_B_parms {
+  unsigned long long a __attribute__ ((aligned (16)));
+  unsigned long long scratch __attribute__ ((aligned (16)));
+  int lda __attribute__ ((aligned (16)));
+  int n __attribute__ ((aligned (16)));
+  int m __attribute__ ((aligned (16)));
+  int spes __attribute__ ((aligned (16)));
+  unsigned long long incomplete __attribute__ ((aligned (16)));
+} hpl_accel_reform_matrix_CL_to_B_parms_t;
+
+typedef struct hpl_accel_reform_panel_parms {
+  unsigned long long a __attribute__ ((aligned (16)));
+  unsigned long long panel __attribute__ ((aligned (16)));
+  int lda __attribute__ ((aligned (16)));
+  int ldp __attribute__ ((aligned (16)));
+  int n __attribute__ ((aligned (16)));
+  int m __attribute__ ((aligned (16)));
+  int p_count __attribute__ ((aligned (16)));
+  unsigned long long incomplete __attribute__ ((aligned (16)));
+} hpl_accel_reform_panel_parms_t;
+
+typedef struct hpl_accel_reform_rows_parms {
+  vector signed int m_n_ldr_lda;
+  vector unsigned long long rows_a;
+  vector unsigned long long incomplete_blk_col;
+  int blk_rows[5*4];
+} hpl_accel_reform_rows_parms_t;
+
+typedef struct hpl_accel_swap_rows_parms {
+  vector signed int m_n_lda_blk_col __attribute__ ((aligned (16)));
+  vector unsigned long long a_incomplete __attribute__ ((aligned (16)));
+  int blk_rows[6*4];
+} hpl_accel_swap_rows_parms_t;
+
+typedef struct hpl_accel_copy_rows_parms {
+  vector signed int m_n_lda_ldb __attribute__ ((aligned (16)));
+  vector unsigned long long a_b __attribute__ ((aligned (16)));
+  vector unsigned long long incomplete_pad __attribute__ ((aligned (16)));
+  int rows[4*4];
+} hpl_accel_copy_rows_parms_t;
+
+#else
+
+typedef struct hpl_accel_init_parms {
+  unsigned int id  __attribute__ ((aligned (16)));
+  void *cmd_base  __attribute__ ((aligned (16)));  
+  void *signotify1[HPL_ACCEL_SPES] __attribute__ ((aligned (16)));
+} hpl_accel_init_parms_t;
+
+typedef struct hpl_accel_dgemm_parms {
+  const double *a __attribute__ ((aligned (16)));
+  const double *b __attribute__ ((aligned (16)));
+  double *c __attribute__ ((aligned (16)));
+  double *p __attribute__ ((aligned (16)));
+  int lda __attribute__ ((aligned (16)));
+  int ldb;
+  int ldc; 
+  int ldp;
+  int n __attribute__ ((aligned (16)));
+  int m;
+  int k;
+  int b_blk __attribute__ ((aligned (16)));
+  int a_count;
+  int b_count;
+  int p_count;
+  unsigned long long *incomplete __attribute__ ((aligned (16)));
+} hpl_accel_dgemm_parms_t;
+
+
+typedef struct hpl_accel_dtrsm_parms {
+  const double *a __attribute__ ((aligned (16)));
+  double *b __attribute__ ((aligned (16)));
+  double *c __attribute__ ((aligned (16)));
+  int lda __attribute__ ((aligned (16)));
+  int ldb;
+  int ldc;
+  int n __attribute__ ((aligned (16)));
+  int m;
+  int a_count;
+  int b_count;
+  unsigned int blk_col __attribute__ ((aligned (16)));
+  unsigned long long *incomplete __attribute__ ((aligned (16)));
+} hpl_accel_dtrsm_parms_t;
+
+typedef struct hpl_accel_reform_matrix_CL_to_B_parms {
+  double *a __attribute__ ((aligned (16)));
+  double *scratch __attribute__ ((aligned (16)));
+  int lda __attribute__ ((aligned (16)));
+  int n __attribute__ ((aligned (16)));
+  int m __attribute__ ((aligned (16)));
+  int spes __attribute__ ((aligned (16)));
+  unsigned long long *incomplete __attribute__ ((aligned (16)));
+} hpl_accel_reform_matrix_CL_to_B_parms_t;
+
+typedef struct hpl_accel_reform_panel_parms {
+  double *a __attribute__ ((aligned (16)));
+  double *panel __attribute__ ((aligned (16)));
+  int lda __attribute__ ((aligned (16)));
+  int ldp __attribute__ ((aligned (16)));
+  int n __attribute__ ((aligned (16)));
+  int m __attribute__ ((aligned (16)));
+  int p_count __attribute__ ((aligned (16)));
+  unsigned long long *incomplete __attribute__ ((aligned (16)));
+} hpl_accel_reform_panel_parms_t;
+
+typedef struct hpl_accel_reform_rows_parms {
+  int m, n, ldr, lda;
+  double *rows, *a;
+  unsigned long long *incomplete;
+  int blk_col, pad;
+  int blk_rows[5*4];
+} hpl_accel_reform_rows_parms_t;
+
+typedef struct hpl_accel_swap_rows_parms {
+  int m, n, lda, blk_col;
+  double *a;
+  unsigned long long *incomplete;
+  int blk_rows[6*4];
+} hpl_accel_swap_rows_parms_t;
+
+typedef struct hpl_accel_copy_rows_parms {
+  int m, n, lda, ldb;
+  double *a;
+  double *b;
+  unsigned long long *incomplete;
+  unsigned long long pad;
+  int rows[4*4];
+} hpl_accel_copy_rows_parms_t;
+
+#endif
+
+
+/* Inline support functions.
+ */
+#ifdef __PPU__
+
+#include <libspe2.h>
+#include "hpl_accel_global.h"
+
+
+/* init_incomplete
+ * ---------------
+ * Initialize the asynchronous completion notification variable according
+ * to the specified number of participants. The number of participants can
+ * be between 1 and 8 where each byte in the unsigned long long variable
+ * is a flag for each of the participants. The bytes are assigned as follows:
+ *
+ *       msb                                                     lsb
+ *    +-------+-------+-------+-------+-------+-------+-------+-------+
+ *    | SPE 0 | SPE 1 | SPE 2 | SPE 3 | SPE 4 | SPE 5 | SPE 6 | SPE 7 |
+ *    +-------+-------+-------+-------+-------+-------+-------+-------+
+ *
+ * Parameters:
+ *   incomplete    Pointer to the asynchronous completion variable.
+ *
+ *   participants  Number of SPEs participating in the command that must
+ *                 acknowledge completion status.
+ *     
+ */
+static inline void init_incomplete(unsigned long long *incomplete, int participants)
+{
+  if (incomplete) {
+    *incomplete = 0xFFFFFFFFFFFFFFFFULL << (8*(8-participants));
+  }
+}
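+/* For example, init_incomplete(&flag, 3) sets flag to 0xFFFFFF0000000000ULL:
+ * the bytes for SPEs 0 through 2 are set, and each participant is expected
+ * to clear its own byte so the value reaches zero once all have completed.
+ */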
+
+
+/* send_cmd_to_spes
+ * ----------------
+ * Send the command with the index to the parameter buffer to the specified
+ * number of SPE participants. The command and index are combined into a 
+ * 32-bit message that is placed in the inbound SPE mailbox. The 7 least
+ * significant bits of the message contain the command id. The 25 most
+ * significant bits are an offset from cmd_base to the cacheline containing
+ * the command parameters.
+ *
+ * Parameters:
+ *   cmd           Command id to send (one of HPL_ACCEL_CMD_*).
+ *
+ *   idx           Command buffer index that contains the parameters for
+ *                 this command.
+ *
+ *   participants  Number of SPEs that will participate in the command.
+ */
+
+static inline void send_cmd_to_spes(unsigned int cmd, int idx, int participants)
+{
+  int i;
+  int cnt;
+  volatile spe_spu_control_area_t *ctl;
+
+  /* Construct cmd message to be sent to each of the SPEs via the
+   * inbound mailbox.
+   */
+  cmd |= (unsigned int)(idx * sizeof(struct hpl_accel_cmd_entry));
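+  /* For example, idx = 3 with 128-byte command entries yields an offset of
+   * 0x180, so cmd = HPL_ACCEL_CMD_DGEMM_PANEL (5) becomes the mailbox
+   * message 0x185: the offset in the upper bits and the command id in the
+   * low 7 bits (HPL_ACCEL_CMD_MASK).
+   */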
+
+  for (i=0; i<participants; i++) {
+    ctl = (volatile spe_spu_control_area_t *)hpl_accel_threads[i].ctl_area;
+
+    cnt = hpl_accel_threads[i].in_cnt;
+
+    /* Wait until there is space available in the mailbox queue.
+     */
+    while (cnt == 0) {
+      cnt = (ctl->SPU_Mbox_Stat >> 8) & 0xFF;
+    }
+
+    /* Place the command into the inbound mailbox.
+     */
+    ctl->SPU_In_Mbox = cmd;
+    hpl_accel_threads[i].in_cnt = cnt-1;
+  }
+}
+#endif
+
+
+#ifdef PANEL_4GB_CROSSING
+#define COMPUTE_PANEL_4GB_CROSSING_COUNT(_count, _panel, _ld, _max) {			\
+  int _cnt;										\
+  /* Calculate the number of rows/columns to the 4GB crossing and clamp			\
+   * the result to max.									\
+   */											\
+  _cnt = (0x20000000 - ((unsigned int)(uintptr_t)_panel) / sizeof(double)) / _ld; 	\
+  _count = (_cnt > _max) ? _max : _cnt;							\
+}
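+/* Worked example (hypothetical addresses): a panel whose low 32 address bits
+ * are 0xE0000000 is 0x20000000 - 0x1C000000 = 0x04000000 doubles short of
+ * the next 4GB boundary; with _ld = 0x10000 that is 0x400 rows/columns,
+ * which is then clamped to _max.
+ */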
+#else /* !PANEL_4GB_CROSSING */
+#define COMPUTE_PANEL_4GB_CROSSING_COUNT(_count, _panel, _ld, _max)
+#endif /* PANEL_4GB_CROSSING */
+
+#define COUNT_PANEL_4GB_CROSSINGS(_p, _n, _ld)							\
+  /* return the number of 4GB crossings in panel _p */                  \
+  (((uintptr_t)(_p + _n*_ld - 1) >> 32) - ((uintptr_t)_p >> 32))
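+/* A panel wholly contained in one 4GB window returns 0; if its first and
+ * last bytes differ in address bits 32 and above, it returns the number of
+ * boundaries crossed.
+ */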
+
+#ifdef VALIDATE_4GB_CROSSING
+#ifdef PANEL_4GB_CROSSING
+
+#define VALIDATE_PANEL_4GB_CROSSING(_p, _n, _ld) {							\
+  /* Verify that if the panel crosses a 4GB boundary, it does so only on a row 			\
+   * boundary, and only once.										\
+   */													\
+  if (_p) { 												\
+    unsigned int _crossings;										\
+    unsigned int _bytes_til_crossing;									\
+													\
+    _crossings = COUNT_PANEL_4GB_CROSSINGS(_p, _n, _ld);    \
+    switch (_crossings) {										\
+    case 0:												\
+      break;												\
+    case 1:												\
+      _bytes_til_crossing = ((uintptr_t)_p ^ (-1)) + 1;							\
+      if ((_bytes_til_crossing % (_ld * sizeof(double))) != 0) {					\
+        fprintf(stderr, "%s %d - Panel crosses 4GB boundary within a row/col. Parameters p=%p n=%d ld=%d\n",\
+  	        __PRETTY_FUNCTION__, __LINE__, _p, _n, _ld);						\
+        abort();											\
+      }													\
+      break;												\
+    default:												\
+      fprintf(stderr, "%s %d - Panel crosses %d 4GB boundary. Parameters p=%p n=%d ld=%d\n", 		\
+   	      __PRETTY_FUNCTION__, __LINE__, _crossings, _p, _n, _ld);					\
+      abort();												\
+      break;												\
+    }													\
+    if (_ld > 0x0FFFFFFF) {										\
+      fprintf(stderr, "%s %d - Panel leading dimension too big. Parameters p=%p n=%d ld=%d\n", 		\
+  	      __PRETTY_FUNCTION__, __LINE__, _p, _n, _ld);						\
+      abort();												\
+    }													\
+  }													\
+}
+
+#else /* ! PANEL_4GB_CROSSING */
+
+#define VALIDATE_PANEL_4GB_CROSSING(_p, _n, _ld) {							\
+  /* Verify that the panel does not cross a 4GB boundary */						\
+  if (_p) { 												\
+    if ( COUNT_PANEL_4GB_CROSSINGS(_p, _n, _ld) != 0 ) {             \
+      fprintf(stderr, "%s %d - Panel crosses 4GB boundary unexpectedly. Parameters p=%p n=%d ld=%d\n", 	\
+  	      __PRETTY_FUNCTION__, __LINE__, _p, _n, _ld);						\
+      abort();												\
+    }													\
+    if (_ld > 0x0FFFFFFF) {										\
+      fprintf(stderr, "%s %d - Panel leading dimension too big. Parameters p=%p n=%d ld=%d\n", 		\
+	      __PRETTY_FUNCTION__, __LINE__, _p, _n, _ld);						\
+      abort();												\
+    }													\
+  }													\
+}
+#endif
+
+#ifdef MATRIX_4GB_CROSSING
+
+#define VALIDATE_MATRIX_4GB_CROSSING(_p, _m, _n, _ld) {							\
+  if (_p) {												\
+    int _i;												\
+    double *_start, *_end;										\
+    unsigned int _blks_per_col, _dbls_to_crossing;							\
+													\
+    if (_ld > 0x0FFFFFFF) {										\
+      fprintf(stderr, "%s %d - Matrix leading dimension too big. Parameters p=%p m=%d n=%d ld=%d\n", 	\
+   	      __PRETTY_FUNCTION__, __LINE__, _p, _m, _n, _ld);						\
+      abort();												\
+    }													\
+    /* For each column of blocks */									\
+    _blks_per_col = (_m + (M_SUB-1))/M_SUB;								\
+    for (_i=0, _start=(double *)_p; _i<_n; _i+=M_SUB) {							\
+      _end = _start + _ld;										\
+      if (((uintptr_t)(_end) >> 32) > ((uintptr_t)(_start) >> 32)) {					\
+	/* This column crosses a 4GB boundary. Check to see that it occurs only on a block boundary */ 	\
+	_dbls_to_crossing = 0x20000000 - ((unsigned int)(uintptr_t)_start) / sizeof(double); 		\
+	if (((M_SUB*M_SUB)*_blks_per_col > _dbls_to_crossing) &&					\
+	    ((_dbls_to_crossing % (M_SUB*M_SUB)) != 0)) {						\
+	  fprintf(stderr, "%s %d - Matrix block straddles 4GB boundary. Parameters p=%p m=%d n=%d ld=%d\n",\
+		  __PRETTY_FUNCTION__, __LINE__, _p, _m, _n, _ld);					\
+	  abort();											\
+	}												\
+      }													\
+      _start = _end;											\
+    }													\
+  }													\
+}
+
+#else /* !MATRIX_4GB_CROSSING */
+
+#define VALIDATE_MATRIX_4GB_CROSSING(_p, _m, _n, _ld) {							\
+  if (_p) {												\
+    if ((((uintptr_t)_p) >> 32) != ((uintptr_t)(_p + _ld * (((_n+M_SUB-1)/M_SUB)-1) + ((_m+M_SUB-1)/M_SUB)*M_SUB*M_SUB-1) >> 32)) { \
+      fprintf(stderr, "%s %d - Matrix crosses 4GB boundary unexpectedly. Parameters p=%p m=%d n=%d ld=%d\n", \
+  	      __PRETTY_FUNCTION__, __LINE__, _p, _m, _n, _ld);						\
+      abort();												\
+    }													\
+    if (_ld > 0x0FFFFFFF) {										\
+      fprintf(stderr, "%s %d - Matrix leading dimension too big. Parameters p=%p m=%d n=%d ld=%d\n", 	\
+	      __PRETTY_FUNCTION__, __LINE__, _p, _m, _n, _ld);						\
+      abort();												\
+    }													\
+  }													\
+}
+#endif
+
+#else /* VALIDATE_4GB_CROSSING */
+#define VALIDATE_PANEL_4GB_CROSSING(_p, _n, _ld)
+#define VALIDATE_MATRIX_4GB_CROSSING(_p, _m, _n, _ld)
+#endif /* VALIDATE_4GB_CROSSING */
+
+#endif /* _HPL_ACCEL_SPU_H_ */
Index: accel/lib/hpl_accel_swap.c
===================================================================
RCS file: accel/lib/hpl_accel_swap.c
diff -N accel/lib/hpl_accel_swap.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/hpl_accel_swap.c	20 Aug 2008 03:57:53 -0000	1.4
@@ -0,0 +1,150 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#include <stdint.h>
+#include <assert.h>
+#include "hpl_accel.h"
+#include "hpl_accel_spu.h"
+#include "ppu_intrinsics.h"
+
+/* General-purpose reference reformatting facilities.
+ */
+
+void hpl_accel_swap_rows_B_to_B(int m, int n,
+                                double *a, int lda,
+                                int *blk_rows, int blk_col,
+                                unsigned long long *incomplete)
+{
+  int i;
+  unsigned int non_aligned;
+  int n0 = 0;
+
+  a += (blk_col/M_SUB) * lda;
+  blk_col %= M_SUB;
+
+  non_aligned = (((unsigned int)(blk_col | lda) & 1) | 
+		 (((unsigned int)(uintptr_t)a) & (16-1)));
+							   
+  if ((non_aligned == 0) && (n > 1)) {
+    int m_start, m_left;
+    int rows_per_block;
+    unsigned int idx;
+    volatile hpl_accel_swap_rows_parms_t *parms;
+
+    /* Verify that the matrix does not straddle a 4GB boundary unexpectedly.
+     */
+    VALIDATE_MATRIX_4GB_CROSSING(a, ((lda/M_SUB) & ~(M_SUB-1)), n, lda);
+
+    n0 = n & ~1;
+
+    idx = hpl_accel_cmd_idx;
+
+    m_start = 0;
+    m_left = m;
+ 
+    /* Generate multiple command requests if the number of rows 
+     * is greater than what will fit in a single command request.
+     */
+    rows_per_block = (int)(sizeof(parms->blk_rows) / sizeof(int));
+
+    while (m_left > rows_per_block) {
+      parms = (volatile hpl_accel_swap_rows_parms_t *)(&hpl_accel_cmd_queue[idx]);
+
+      parms->m = rows_per_block;
+      parms->n = n0;
+      parms->lda = lda * sizeof(double);
+      parms->blk_col = blk_col;
+
+      parms->a = a + INDEX_BLK(m_start,0,lda);
+      parms->incomplete = NULL;
+
+      for (i=0; i<rows_per_block; i++) parms->blk_rows[i] = blk_rows[m_start+i]-m_start;
+
+      /* Perform a sync in order to ensure that the parameters are written 
+       * to memory before writing to the mailbox command queue.
+       */
+      __sync();
+    
+      /* Send the command to each of the SPEs.
+       */
+      send_cmd_to_spes(HPL_ACCEL_CMD_SWAP_ROWS_B_TO_B, idx, HPL_ACCEL_SPES);
+      
+      idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES;
+
+      m_start += rows_per_block;
+      m_left -= rows_per_block;
+    }
+    
+    if (m_left > 0) {
+      parms = (volatile hpl_accel_swap_rows_parms_t *)(&hpl_accel_cmd_queue[idx]);
+    
+      parms->m = m_left;
+      parms->n = n0;
+      parms->lda = lda * sizeof(double);
+      parms->blk_col = blk_col;
+
+      parms->a = a + INDEX_BLK(m_start,0,lda);
+      parms->incomplete = incomplete;
+
+      for (i=0; i<m_left; i++) parms->blk_rows[i] = blk_rows[m_start+i]-m_start;
+
+      init_incomplete(incomplete, HPL_ACCEL_SPES);
+
+      /* Perform a sync in order to ensure that the parameters are written 
+       * to memory before writing to the mailbox command queue.
+       */
+      __sync();
+    
+      /* Send the command to each of the SPEs.
+       */
+      send_cmd_to_spes(HPL_ACCEL_CMD_SWAP_ROWS_B_TO_B, idx, HPL_ACCEL_SPES);
+
+      idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES;
+    } else {
+      if (incomplete) *incomplete = 0;
+    }
+    hpl_accel_cmd_idx = idx;
+  } else {
+    if (incomplete) *incomplete = 0;
+  }
+
+  /* Clean up portions of the rows not handled by the SPEs above.
+   */
+  if (n0 < n) {
+    unsigned int y1, y2, x;
+    int first_span, span, left;
+    double tmp, *src, *dst;
+
+    blk_col += n0;
+    n -= n0;
+
+    a += (blk_col/M_SUB) * lda;
+    blk_col %= M_SUB;
+
+    first_span = M_SUB - blk_col;
+    if (first_span > n) first_span = n;
+
+    /* For each of the rows */
+    for (y1=0; y1<(unsigned int)m; y1++) {
+      y2 = blk_rows[y1];  /* New location for row y1 */
+      if (y1 != y2) {
+        dst = a + (y1 * M_SUB);
+        src = a + (y2 * M_SUB);
+        for (x=0; x<(unsigned int)first_span; x++) 
+          {tmp = dst[x+blk_col]; dst[x+blk_col] = src[x+blk_col]; src[x+blk_col] = tmp;}
+        left = n - first_span;
+        while (left) {
+          dst += lda;
+          src += lda;
+          span = (left > M_SUB) ? M_SUB : left;
+          for (x=0; x<(unsigned int)span; x++)
+            {tmp = dst[x]; dst[x] = src[x]; src[x] = tmp;}
+          left -= span;
+        }
+      }
+    }
+  }
+}
+
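+/* Minimal usage sketch -- hypothetical caller, not part of this patch.
+ * It assumes the completion word initialized by init_incomplete() is
+ * driven to zero once all HPL_ACCEL_SPES SPEs have reported completion.
+ */
+#if 0
+static void example_swap_rows(int m, int n, double *a, int lda,
+                              int *blk_rows, int blk_col)
+{
+  volatile unsigned long long done = 1;
+
+  hpl_accel_swap_rows_B_to_B(m, n, a, lda, blk_rows, blk_col,
+                             (unsigned long long *)&done);
+  while (done != 0)
+    ;                         /* spin until the SPEs report completion */
+}
+#endif
+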
Index: accel/lib/hpl_ref.c
===================================================================
RCS file: accel/lib/hpl_ref.c
diff -N accel/lib/hpl_ref.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/hpl_ref.c	20 Aug 2008 03:57:53 -0000	1.11
@@ -0,0 +1,419 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#include <string.h>
+#include "hpl_accel.h"
+#include "hpl_accel_spu.h"
+
+#include <ppu_intrinsics.h>
+
+
+int hpl_ref_init() 
+{
+  return HPL_ACCEL_INIT_SUCCESS;
+}
+
+
+void hpl_ref_dgemm_CL_R_B(int m, int n, int k,
+			  const double *a, int lda,
+			  const double *b, int ldb,
+			  double *c, int ldc,
+			  unsigned long long *incomplete)
+{
+  unsigned int i;
+  unsigned int x, y;
+  double a_val;
+  
+  for (i=0; i<(unsigned int)k; i++) {
+    for (y=0; y<(unsigned int)m; y++) {
+      a_val = hpl_accel_byte_swap(a[INDEX_COL(y,i,lda)]);
+      for (x=0; x<(unsigned int)n; x++) {
+	c[INDEX_BLK(y,x,ldc)] -= a_val * b[INDEX_ROW(i,x,ldb)];
+      }
+    }
+  }
+
+  if (incomplete) *incomplete = 0;
+}
+
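+/* For orientation: the INDEX_* macros used throughout these routines map
+ * a (row y, column x, leading dimension ld) coordinate into the three
+ * storage formats. Sketched here from their usage in this patch; the
+ * authoritative definitions live in hpl_accel.h and may differ in detail:
+ *
+ *   INDEX_COL(y,x,ld)  ->  (y) + (x)*(ld)              column-major
+ *   INDEX_ROW(y,x,ld)  ->  (y)*(ld) + (x)              row-major
+ *   INDEX_BLK(y,x,ld)  ->  ((x)/M_SUB)*(ld)            blocked; ld is the
+ *                          + (y)*M_SUB + (x)%M_SUB     block-column stride
+ */
+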
+
+void hpl_ref_dgemm_CL_R_B_CL(int m, int n, int k,
+			     const double *a, int lda,
+			     const double *b, int ldb,
+			     double *c, int ldc,
+			     unsigned int blk_row, unsigned int blk_col,
+			     double *p, int ldp,
+			     unsigned long long *incomplete)
+{
+  unsigned int i;
+  unsigned int x, y;
+  double a_val;
+  
+  if (p) {
+    /* Copy c into p */
+    for (y=0; y<(unsigned int)m; y++) {
+      for (x=0; x<(unsigned int)n; x++) {
+	p[INDEX_COL(y,x,ldp)] = c[INDEX_BLK(y+blk_row, x+blk_col, ldc)];
+      }
+    }
+    /* Perform DGEMM on p */
+    for (i=0; i<(unsigned int)k; i++) {
+      for (y=0; y<(unsigned int)m; y++) {
+	a_val = hpl_accel_byte_swap(a[INDEX_COL(y,i,lda)]);
+	for (x=0; x<(unsigned int)n; x++) {
+	  p[INDEX_COL(y,x,ldp)] -= a_val * b[INDEX_ROW(i,x,ldb)];
+	}
+      }
+    }
+    /* Byte swap the result */
+    for (y=0; y<(unsigned int)m; y++) {
+      for (x=0; x<(unsigned int)n; x++) {
+	p[INDEX_COL(y,x,ldp)] = hpl_accel_byte_swap(p[INDEX_COL(y,x,ldp)]);
+      }
+    }
+  } else {
+    for (i=0; i<(unsigned int)k; i++) {
+      for (y=0; y<(unsigned int)m; y++) {
+	a_val = hpl_accel_byte_swap(a[INDEX_COL(y,i,lda)]);
+	for (x=0; x<(unsigned int)n; x++) {
+	  c[INDEX_BLK(y+blk_row, x+blk_col, ldc)] -= a_val * b[INDEX_ROW(i,x,ldb)];
+	}
+      }
+    }
+  }
+  if (incomplete) *incomplete = 0;
+}
+
+
+void hpl_ref_dgemm_CL_B_B_CL(int m, int n, int k,
+			     const double *a, int lda,
+			     const double *b, int ldb,
+			     double *c, int ldc,
+			     unsigned int blk_row, unsigned int blk_col,
+			     double *p, int ldp,
+			     unsigned long long *incomplete)
+{
+  unsigned int i;
+  unsigned int x, y;
+  double a_val;
+  
+  if (p) {
+    /* Copy c into p */
+    for (y=0; y<(unsigned int)m; y++) {
+      for (x=0; x<(unsigned int)n; x++) {
+	p[INDEX_COL(y,x,ldp)] = c[INDEX_BLK(y+blk_row, x+blk_col, ldc)];
+      }
+    }
+    /* Perform DGEMM on P */
+    for (i=0; i<(unsigned int)k; i++) {
+      for (y=0; y<(unsigned int)m; y++) {
+	a_val = hpl_accel_byte_swap(a[INDEX_COL(y,i,lda)]);
+	for (x=0; x<(unsigned int)n; x++) {
+	  p[INDEX_COL(y,x,ldp)] -= a_val * b[INDEX_BLK(i,x+blk_col,ldb)];
+	}
+      }
+    }
+    /* Byte swap the result */
+    for (y=0; y<(unsigned int)m; y++) {
+      for (x=0; x<(unsigned int)n; x++) {
+	p[INDEX_COL(y,x,ldp)] = hpl_accel_byte_swap(p[INDEX_COL(y,x,ldp)]);
+      }
+    }
+  } else {
+    for (i=0; i<(unsigned int)k; i++) {
+      for (y=0; y<(unsigned int)m; y++) {
+	a_val = hpl_accel_byte_swap(a[INDEX_COL(y,i,lda)]);
+	for (x=0; x<(unsigned int)n; x++) {
+	  c[INDEX_BLK(y+blk_row, x+blk_col, ldc)] -= a_val * b[INDEX_BLK(i,x+blk_col,ldb)];
+	}
+      }
+    }
+  }
+  if (incomplete) *incomplete = 0;
+}
+
+
+
+void hpl_ref_dgemm_CL_B_B(int m, int n, int k,
+			  const double *a, int lda,
+			  const double *b, int ldb,
+			  double *c, int ldc,
+			  unsigned long long *incomplete)
+{
+  unsigned int i;
+  unsigned int x, y;
+  double a_val;
+  
+  for (i=0; i<(unsigned int)k; i++) {
+    for (y=0; y<(unsigned int)m; y++) {
+      a_val = hpl_accel_byte_swap(a[INDEX_COL(y,i,lda)]);
+      for (x=0; x<(unsigned int)n; x++) {
+	c[INDEX_BLK(y,x,ldc)] -= a_val * b[INDEX_BLK(i,x,ldb)];
+      }
+    }
+  }
+
+  if (incomplete) *incomplete = 0;
+}
+
+
+extern void hpl_ref_dgemm_C_C_C(int m, int n, int k,
+				const double *a, int lda,
+				const double *b, int ldb,
+				double *c, int ldc,
+				unsigned long long *incomplete)
+{
+  unsigned int i;
+  unsigned int x, y;
+  double a_val, c_val;
+  
+  for (i=0; i<(unsigned int)k; i++) {
+    for (y=0; y<(unsigned int)m; y++) {
+      a_val = a[INDEX_COL(y,i,lda)];
+      for (x=0; x<(unsigned int)n; x++) {
+        c_val = c[INDEX_COL(y,x,ldc)];
+        c_val -= a_val * b[INDEX_COL(i,x,ldb)];
+        c[INDEX_COL(y,x,ldc)] = c_val;
+      }
+    }
+  }
+
+  if (incomplete) *incomplete = 0;
+}
+
+
+
+/* Forward substitution with a unit lower triangular panel [a] (column
+ * ordered, byte-swapped on access): for each column of [b], rows below
+ * the diagonal are eliminated using the already-final row above them.
+ */
+void hpl_ref_dtrsm_CL_R(int m, int n, 
+			const double *a, int lda, 
+			double *b, int ldb,
+			unsigned long long *incomplete)
+{
+  unsigned int i, x, y;
+
+  for (x=0; x<(unsigned int)n; x++) {
+    for (y=1; y<(unsigned int)m; y++) {
+      for (i=y; i<(unsigned int)m; i++) {
+	b[INDEX_ROW(i, x, ldb)] -= b[INDEX_ROW(y-1, x, ldb)] * hpl_accel_byte_swap(a[INDEX_COL(i, y-1, lda)]);
+      }
+    }
+  }
+  if (incomplete) *incomplete = 0;
+}
+
+
+void hpl_ref_dtrsm_CL_B(int m, int n, 
+			const double *a, int lda, 
+			double *b, int ldb,
+			unsigned int blk_row, unsigned int blk_col,
+			unsigned long long *incomplete)
+{
+  unsigned int i, x, y;
+
+  for (x=0; x<(unsigned int)n; x++) {
+    for (y=1; y<(unsigned int)m; y++) {
+      for (i=y; i<(unsigned int)m; i++) {
+	b[INDEX_BLK(i+blk_row, x+blk_col, ldb)] -= b[INDEX_BLK(y-1+blk_row, x+blk_col, ldb)] * hpl_accel_byte_swap(a[INDEX_COL(i, y-1, lda)]);
+      }
+    }
+  }
+  if (incomplete) *incomplete = 0;
+}
+
+
+void hpl_ref_dtrsm_CL_R_B(int m, int n, 
+			  const double *a, int lda, 
+			  double *b, int ldb,
+			  double *c, int ldc,
+			  unsigned int blk_row, unsigned int blk_col,
+			  unsigned long long *incomplete)
+{
+  unsigned int i, x, y;
+
+  if (c) {
+    for (x=0; x<(unsigned int)n; x++) {
+      
+      for (i=0; i<(unsigned int)m; i++) c[INDEX_BLK(i+blk_row, x+blk_col, ldc)] = b[INDEX_ROW(i, x, ldb)];      /* Copy the column of b into c */
+      for (y=1; y<(unsigned int)m; y++) {
+	for (i=y; i<(unsigned int)m; i++) {
+	  c[INDEX_BLK(i+blk_row, x+blk_col, ldc)] -= c[INDEX_BLK(y-1+blk_row, x+blk_col, ldc)] * hpl_accel_byte_swap(a[INDEX_COL(i, y-1, lda)]);
+	}
+      }
+    }
+  } else {
+    for (x=0; x<(unsigned int)n; x++) {
+      for (y=1; y<(unsigned int)m; y++) {
+	for (i=y; i<(unsigned int)m; i++) {
+	  b[INDEX_ROW(i, x, ldb)] -= b[INDEX_ROW(y-1, x, ldb)] * hpl_accel_byte_swap(a[INDEX_COL(i, y-1, lda)]);
+	}
+      }
+    }
+  }
+  if (incomplete) *incomplete = 0;
+}
+
+
+
+
+/* General purpose, reference, reformatting facilities.
+ */
+void hpl_ref_reform_panel_CL_to_B(int m, int n,
+				  double *a, int lda,
+				  double *panel, int ldp,
+				  unsigned long long *incomplete)
+{
+  unsigned int x, y;
+
+  for (x=0; x<(unsigned int)n; x++) {
+    for (y=0; y<(unsigned int)m; y++) {
+      a[INDEX_BLK(y,x,lda)] = hpl_accel_byte_swap(panel[INDEX_COL(y,x,ldp)]);
+    }
+  }
+
+  if (incomplete) *incomplete = 0;
+}
+
+
+void hpl_ref_reform_matrix_CL_to_B(int m, int n, 
+				   double *a, int lda, 
+				   double *scratch, 
+				   int size __attribute__ ((unused)) ,
+				   unsigned long long *incomplete)
+
+{
+  unsigned int i;
+  unsigned int x, y;
+  unsigned int col;
+  
+  /* Reformat the matrix [a] from column-order, little-endian to blocked, 
+   * big-endian format. 
+   */
+
+  /* For each column of blocks */
+  for (col=0; col<(unsigned int)n; col+=M_SUB) {
+    /* Reformat the column of blocks into the scratch buffer */
+    for (x=0; x<(unsigned int)M_SUB; x++) {
+      for (y=0; y<(unsigned int)m; y++) {
+	scratch[INDEX_ROW(y,x,M_SUB)] = hpl_accel_byte_swap(a[INDEX_COL(y,x,lda)]);
+      }
+    }
+    /* Copy the reformatted data back into a */
+    memcpy(a, scratch, sizeof(double)*M_SUB*m);
+
+    /* Zero the trailing block column of data */
+    a += M_SUB*m;
+    for (i=0; i<(unsigned int)M_SUB*(lda-m); i++) *a++ = 0.0;
+  }
+  if (incomplete) *incomplete = 0;
+}
+
+
+
+
+void hpl_ref_reform_panel_B_to_CL(int m, int n,
+				  double *panel, int ldp,
+				  double *a, int lda,
+				  unsigned long long *incomplete)
+{
+  unsigned int x, y;
+
+  for (x=0; x<(unsigned int)n; x++) {
+    for (y=0; y<(unsigned int)m; y++) {
+      panel[INDEX_COL(y,x,ldp)] = hpl_accel_byte_swap(a[INDEX_BLK(y,x,lda)]);
+    }
+  }
+  if (incomplete) *incomplete = 0;
+}
+
+
+
+void hpl_ref_reform_panel_R_to_B(int m, int n,
+                                 double *a, int lda,
+                                 double *panel, int ldp, 
+                                 unsigned long long *incomplete)
+{
+  unsigned int x, y;
+
+  for (x=0; x<(unsigned int)n; x++) {
+    for (y=0; y<(unsigned int)m; y++) {
+      a[INDEX_BLK(y,x,lda)] = panel[INDEX_ROW(y,x,ldp)];
+    }
+  }
+  if (incomplete) *incomplete = 0;
+}
+
+
+void hpl_ref_reform_rows_R_to_B(int m, int n,
+			       double *rows, int ldr,
+			       double *a, int lda,
+			       int *blk_rows, int blk_col,
+			       unsigned long long *incomplete)
+{
+  unsigned int x, y;
+
+  for (y=0; y<(unsigned int)m; y++) {
+    for (x=0; x<(unsigned int)n; x++) {
+      a[INDEX_BLK((unsigned int)blk_rows[y], blk_col+x, lda)] = rows[INDEX_ROW(y, x, ldr)];
+    }
+  }
+  if (incomplete) *incomplete = 0;
+}
+
+
+
+void hpl_ref_reform_rows_B_to_R(int m, int n,
+			       double *rows, int ldr,
+			       double *a, int lda,
+			       int *blk_rows, int blk_col,
+			       unsigned long long *incomplete)
+
+{
+  unsigned int x, y;
+
+  for (y=0; y<(unsigned int)m; y++) {
+    for (x=0; x<(unsigned int)n; x++) {
+      rows[INDEX_ROW(y, x, ldr)] = a[INDEX_BLK((unsigned int)blk_rows[y], blk_col+x, lda)];
+    }
+  }
+  if (incomplete) *incomplete = 0;
+}
+
+void hpl_ref_swap_rows_B_to_B(int m, int n,
+                              double *a, int lda,
+                              int *blk_rows, int blk_col,
+                              unsigned long long *incomplete)
+{
+  unsigned int y1, y2, x;
+
+  for (y1=0; y1<(unsigned int)m; y1++) {
+    y2 = blk_rows[y1];  /* New location for row y1 */
+    if (y1 != y2) {
+      /* Swap rows y1 and y2 */
+      for (x=0; x<(unsigned int)n; x++) {
+        double tmp = a[INDEX_BLK(y1, x+blk_col, lda)];
+        a[INDEX_BLK(y1, x+blk_col, lda)] = a[INDEX_BLK(y2, x+blk_col, lda)];
+        a[INDEX_BLK(y2, x+blk_col, lda)] = tmp;
+      }
+    }
+  }
+  if (incomplete) *incomplete = 0;
+}
+
+void hpl_ref_copy_rows_R_to_R(int m, int n,
+			      double *a, int lda,
+			      double *b, int ldb,
+			      int *rows, 
+			      unsigned long long *incomplete)
+{
+  unsigned int y1, y2, x;
+
+  for (y1=0; y1<(unsigned int)m; y1++) {
+    y2 = rows[y1];  /* New location for row y1 */
+    /* Copy row a[y1] to b[y2] */
+    for (x=0; x<(unsigned int)n; x++) {
+      b[INDEX_ROW(y2, x, ldb)] = a[INDEX_ROW(y1, x, lda)];
+    }
+  }
+  if (incomplete) *incomplete = 0;
+}
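+
+/* These reference routines mirror the accelerated entry points and can be
+ * used as a correctness check. A hypothetical (partial) check for the row
+ * swap -- assuming both libraries are linked, the INDEX_BLK sketch above,
+ * and that a_ref starts as a copy of a:
+ */
+#if 0
+static int check_swap_rows(int m, int n, double *a, double *a_ref, int lda,
+                           int *blk_rows, int blk_col)
+{
+  volatile unsigned long long done = 1;
+  int x, y;
+
+  hpl_accel_swap_rows_B_to_B(m, n, a, lda, blk_rows, blk_col,
+                             (unsigned long long *)&done);
+  while (done != 0)
+    ;                                   /* wait for the SPEs to finish */
+
+  hpl_ref_swap_rows_B_to_B(m, n, a_ref, lda, blk_rows, blk_col, NULL);
+
+  for (y = 0; y < m; y++)
+    for (x = 0; x < n; x++)
+      if (a[INDEX_BLK(y, x + blk_col, lda)] !=
+          a_ref[INDEX_BLK(y, x + blk_col, lda)])
+        return 0;
+  return 1;
+}
+#endif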
Index: accel/lib/spu/Makefile
===================================================================
RCS file: accel/lib/spu/Makefile
diff -N accel/lib/spu/Makefile
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/spu/Makefile	20 Aug 2008 03:57:53 -0000	1.9
@@ -0,0 +1,57 @@
+# --------------------------------------------------------------- 
+# (C) Copyright IBM Corporation 2007,2008
+#                                                                 
+# --------------------------------------------------------------- 
+
+########################################################################
+#			Target
+########################################################################
+
+PROGRAM_spu 	:= hpl_accel_spu
+
+LIBRARY_embed64	= libhpl_accel_spu.a
+
+OBJS		= hpl_accel_spu.o		\
+		  accel_dgemm.o			\
+		  accel_dgemm_panel.o		\
+		  accel_dgemm_C.o		\
+		  accel_dtrsm.o			\
+		  accel_dtrsm_panel.o		\
+		  accel_dtrsm_CL_B.o		\
+		  accel_reform_matrix_CL_to_B.o	\
+		  accel_reform_panel_B_to_CL.o	\
+		  accel_reform_panel_R_to_B.o	\
+		  accel_reform_rows_B_to_R.o	\
+		  accel_reform_rows_R_to_B.o	\
+		  accel_swap_rows_B_to_B.o	\
+		  accel_copy_rows_R_to_R.o	\
+		  accel_buffers.o		\
+		  accel_mm_dp_64Cx64.o		\
+		  accel_dtrsm_dp_128Cx16.o	\
+		  accel_mm_dp.o
+
+
+########################################################################
+#			Local Defines
+########################################################################
+
+# CC_OPT_LEVEL	= -g
+
+#CPPFLAGS	= -DACCEL_LITTLE_ENDIAN
+CPPFLAGS       += -DMATRIX_4GB_CROSSING
+
+# THE SPU CODE DOES NOT YET SUPPORT 4GB PANEL CROSSING
+#CPPFLAGS       += -DPANEL_4GB_CROSSING 
+
+CFLAGS_gcc	= -march=celledp -mtune=celledp
+CFLAGS_xlc	= -qarch=edp -qtune=edp
+
+INCLUDE		= -I..
+
+
+
+########################################################################
+#			make.footer
+########################################################################
+
+include $(CELL_TOP)/buildutils/make.footer
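+
+# Typical invocations (a sketch; assumes the SDK's make.footer honors the
+# variables referenced above and that CELL_TOP points at the installed SDK):
+#
+#   CELL_TOP=/opt/cell/sdk make                     # optimized build
+#   CELL_TOP=/opt/cell/sdk make CC_OPT_LEVEL=-g     # debug build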
Index: accel/lib/spu/accel_buffers.S
===================================================================
RCS file: accel/lib/spu/accel_buffers.S
diff -N accel/lib/spu/accel_buffers.S
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/spu/accel_buffers.S	23 Oct 2008 21:20:24 -0000	1.3
@@ -0,0 +1,24 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+	
+	.data
+	.align	7
+	.global	bufA
+bufA:
+	.global	bufA_128x128
+bufA_128x128:	
+	.skip 	2*64*64*8
+	.global	bufB
+bufB:	.skip 	2*64*64*8
+	
+	.global	bufC
+bufC:
+	.global bufB_128x16
+bufB_128x16:
+	.skip	2*128*16*8
+	.global bufB_list
+bufB_list:
+	.skip 	64*64*8
+
Index: accel/lib/spu/accel_buffers.h
===================================================================
RCS file: accel/lib/spu/accel_buffers.h
diff -N accel/lib/spu/accel_buffers.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/spu/accel_buffers.h	20 Aug 2008 03:57:53 -0000	1.2
@@ -0,0 +1,24 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007                               */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#ifndef _ACCEL_BUFFERS_H_
+#define _ACCEL_BUFFERS_H_
+
+#include <spu_intrinsics.h>
+
+/* The local store buffer is carved up uniquely for each acceleration function.
+ */
+
+/* DGEMM buffer set */
+extern vec_double2	bufA[2][64*64/2];
+extern vec_double2	bufB[2][64*64/2];
+extern vec_double2	bufC[2][64*64/2];
+
+/* DTRSM buffer set */
+extern vec_double2	bufA_128x128[128*128/2];
+extern vec_double2	bufB_128x16[2][128*16/2];
+extern vec_uint4	bufB_list[8][128/2];
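+
+/* Note on aliasing (see accel_buffers.S): the two buffer sets overlay the
+ * same local store. bufA_128x128 (128 KiB) occupies the storage of bufA
+ * and bufB (64 KiB each), and the bufC region (64 KiB) holds bufB_128x16
+ * (32 KiB) followed by the space reserved for bufB_list. Presumably only
+ * one acceleration function's buffer set is live at a time.
+ */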
+
+#endif /* _ACCEL_BUFFERS_H_ */
Index: accel/lib/spu/accel_copy_rows_R_to_R.c
===================================================================
RCS file: accel/lib/spu/accel_copy_rows_R_to_R.c
diff -N accel/lib/spu/accel_copy_rows_R_to_R.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/spu/accel_copy_rows_R_to_R.c	20 Aug 2008 03:57:53 -0000	1.4
@@ -0,0 +1,127 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <spu_mfcio.h>
+#include <spu_intrinsics.h>
+#include "hpl_accel_spu.h"
+#include "accel_buffers.h"
+#include "accel_utils.h"
+#include "accel_reform.h"
+
+
+static inline void row_R_to_R(unsigned int src_hi, unsigned int src_lo,
+                              unsigned int dst_hi, unsigned int dst_lo,
+                              unsigned int left, unsigned int *tag)
+{
+  void *buf[2];
+  unsigned int size;
+
+  buf[0] = bufA;
+  buf[1] = bufB;
+
+  size = 16*1024;
+  if (size > left) size = left;
+
+  spu_mfcdma64(buf[*tag], src_hi, src_lo, size, *tag, MFC_GETB_CMD);
+  left -= size;
+
+  while (left) {
+
+    spu_mfcdma64(buf[*tag], dst_hi, dst_lo, size, *tag, MFC_PUTB_CMD);
+
+    *tag ^= 1;
+
+    /* increment src_hi, src_lo, dst_hi, dst_lo */
+    MATRIX_EA_UADD32(src_hi, src_lo, size);
+    MATRIX_EA_UADD32(dst_hi, dst_lo, size);
+
+    size = 16*1024;
+    if (size > left) size = left;
+
+    spu_mfcdma64(buf[*tag], src_hi, src_lo, size, *tag, MFC_GETB_CMD);
+    left -= size;
+  }
+
+  spu_mfcdma64(buf[*tag], dst_hi, dst_lo, size, *tag, MFC_PUTB_CMD);
+}
+
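+/* Note: row_R_to_R double-buffers the copy. While one 16 KiB chunk drains
+ * to memory via a barriered PUT on one tag, the next chunk is fetched via
+ * a barriered GET on the other tag; the barrier forms (MFC_GETB/MFC_PUTB)
+ * order each put behind the get that previously filled the same buffer in
+ * that tag group.
+ */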
+
+void accel_copy_rows_R_to_R(hpl_accel_init_parms_t *parms, 
+			    volatile hpl_accel_copy_rows_parms_t *cmd_parms)
+{
+  int m, n, lda, ldb;
+  unsigned int src, dst;
+  unsigned int id;
+  unsigned int a_hi, a_lo;
+  unsigned int b_hi, b_lo;
+  unsigned int src_hi, src_lo;
+  unsigned int dst_hi, dst_lo;
+  unsigned int row_size;
+  unsigned int tag;
+  unsigned int rows_per_spe, extra_rows, start_row, end_row;
+  vector signed int m_n_lda_ldb;
+  vector unsigned long long a_b, incomplete_pad;
+
+  id = parms->id;
+
+  /* Wait for the transfer of the parameters to complete
+   */
+  DMA_WAIT_RECEIVE();
+  DMA_WAIT_REQUEST(-1);
+
+  /* Fetch the parameters 
+   */
+  m_n_lda_ldb = cmd_parms->m_n_lda_ldb;
+  a_b = cmd_parms->a_b;
+  incomplete_pad = cmd_parms->incomplete_pad;
+
+  m = spu_extract(m_n_lda_ldb, 0);
+  n = spu_extract(m_n_lda_ldb, 1);
+  lda = spu_extract(m_n_lda_ldb, 2);
+  ldb = spu_extract(m_n_lda_ldb, 3);
+
+  a_hi = spu_extract((vector unsigned int)a_b, 0);
+  a_lo = spu_extract((vector unsigned int)a_b, 1);
+
+  b_hi = spu_extract((vector unsigned int)a_b, 2);
+  b_lo = spu_extract((vector unsigned int)a_b, 3);
+
+  /* Process the rows, distributing whole rows across the SPEs (each row
+   * is handled by exactly one SPE).
+   */
+  row_size = n*sizeof(double);
+  rows_per_spe = m / HPL_ACCEL_SPES;
+  extra_rows = m % HPL_ACCEL_SPES;
+
+  /* SPE id handles rows [start_row, end_row); the first extra_rows SPEs
+   * take one extra row each. Note that spu_cmpgt yields -1 when true, so
+   * the subtraction below adds 1 for those SPEs.
+   */
+  start_row = id * rows_per_spe + ((id > extra_rows) ? extra_rows : id);
+  end_row = start_row + rows_per_spe - spu_extract(spu_cmpgt(spu_promote(extra_rows, 0), spu_promote(id, 0)), 0);
+
+  /* Before starting, make sure all previous DMA transfers are completed so
+   * that all the LS buffers are known to be available.
+   */
+  DMA_WAIT_RECEIVE();
+
+  tag = 0;
+  for (src=start_row; src<end_row; src++) {
+    dst = cmd_parms->rows[src];
+
+    src_hi = a_hi; src_lo = a_lo;
+    MATRIX_EA_UADD32(src_hi, src_lo, src*lda);
+
+    dst_hi = b_hi; dst_lo = b_lo;
+    MATRIX_EA_UADD32(dst_hi, dst_lo, dst*ldb);
+
+    row_R_to_R(src_hi, src_lo, dst_hi, dst_lo, row_size, &tag);
+
+    tag ^= 1;
+  }
+
+  DMA_WAIT(1<<tag);
+
+  /* Report completion status if requested. 
+   */
+  report_completion(id, spu_extract(incomplete_pad, 0), tag^1);
+}
Index: accel/lib/spu/accel_dgemm.c
===================================================================
RCS file: accel/lib/spu/accel_dgemm.c
diff -N accel/lib/spu/accel_dgemm.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/spu/accel_dgemm.c	23 Oct 2008 21:20:24 -0000	1.7
@@ -0,0 +1,445 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#include "accel_dgemm.h"
+
+/* Double precision DGEMM matrix-matrix multiply for 128x128 sized
+ * blocks. Computation is performed using 64x64 sized sub-blocks in
+ * a manner that minimizes DMA/memory traffic.
+ */
+
+
+
+void accel_dgemm(hpl_accel_init_parms_t *parms, 
+		 volatile hpl_accel_dgemm_parms_t *cmd_parms)
+{
+  int i;
+  int rotate;
+  unsigned int id;
+  unsigned int idx, a_idx, c_idx;
+  unsigned int i1, phase;
+  unsigned long long a, b, c;		/* ea pointers */
+  unsigned int hi, lo;
+  unsigned int a_hi, a_lo;
+  unsigned int b_hi, b_lo;
+  unsigned int c_hi, c_lo, c_lo_prev;
+  unsigned int sub_blocks, sub_blocks_per_spe;
+  unsigned int start_x, start_sub, end_sub;
+  unsigned int odd, buf;
+  unsigned int x_sub, y_sub;
+  unsigned int w_sub, h_sub;		/* width & height in sub_blocks */
+  unsigned int lda, ldb, stride;
+  unsigned int a_addend, b_addend, c_addend;
+  vec_uint4 vone = (vec_uint4){1, 1, 1, 1};
+  vec_uint4 ld, flags, b_blk;
+  vec_uint4 a_step, b_step, c_stepv, c_steph;
+  vec_uint4 dim;
+  vec_uint4 h_sub_v, h_sub2_v, y_sub2_v;
+  vec_uint4 down, corner;
+  vec_uint4 corner_eq2;
+  vec_uint4 step_sub = spu_splats(M_SUB*sizeof(double));
+  vec_uint4 mask_0101 = (vec_uint4){0,-1,0,-1};
+  vec_uchar16 splat_1  = (vec_uchar16)spu_splats((unsigned int)0x04050607);
+  vec_uchar16 splat_2  = (vec_uchar16)spu_splats((unsigned int)0x08090A0B);
+  vec_uchar16 shuf_0404 = (vec_uchar16){0,1,2,3, 16,17,18,19, 0,1,2,3, 16,17,18,19};
+  vec_uchar16 shuf_0044 = (vec_uchar16){0,1,2,3, 0,1,2,3, 16,17,18,19, 16,17,18,19};
+  vec_double2 *c_ptr;
+#ifdef MATRIX_4GB_CROSSING
+  unsigned int c_hi_prev;
+#endif
+
+  id = parms->id;
+
+  /* Wait for the transfer of the parameters to complete
+   */
+  DMA_WAIT_RECEIVE();
+
+  /* Fetch the command parameters
+   */
+  a = cmd_parms->a;
+  b = cmd_parms->b;
+  c = cmd_parms->c;
+
+  a_hi = mfc_ea2h(a);
+  a_lo = mfc_ea2l(a);
+
+  b_hi = mfc_ea2h(b);
+  b_lo = mfc_ea2l(b);
+
+  c_hi = mfc_ea2h(c);
+  c_lo = mfc_ea2l(c);
+
+  ld = cmd_parms->ld;
+
+  lda = spu_extract(ld, 0);
+  ldb = spu_extract(ld, 1); 
+      
+  dim = cmd_parms->dim;
+
+  flags = cmd_parms->flags;
+
+  b_blk = spu_maskw(spu_extract(flags, 0));
+
+  /* Computation of [C] -= [A][B] is performed in a serpentine pattern
+   * through the various sub-blocks of C. Below is a graphical attempt
+   * to explain the partitioning and order of the computation. For this
+   * example, consider the matrix-matrix multiply of a 5x5 (128x128 block)
+   * result after panel factorization of block 0,0 (bx,by). In this case,
+   * we must compute the 128x128 block multiplies as follows:
+   *
+   *   for (x=1; x<5; x++) {
+   *     for (y=1; y<5; y++) {
+   *        C(x,y) -= A(bx,y)*B(x,by);
+   *     }
+   *   }
+   * 
+   * Assuming this computation is performed by 3 SPEs, the 16 blocks
+   * are subdivided as:
+   *
+   *   SPE 0 : C(1,1), C(1,2), C(1,3), C(1,4), C(2,1), C(2,2)
+   *   SPE 1 : C(2,3), C(2,4), C(3,1), C(3,2), C(3,3), C(3,4)
+   *   SPE 2 : C(4,1), C(4,2), C(4,3), C(4,4)
+   *
+   * Therefore, SPE 1 will compute the resulting sub-blocks of C in the
+   * alphabetic order (a thru z) as marked below.
+   *                
+   *               X
+   *       0   1   2   3   4
+   *     +---B---+---+---+---+
+   *   0 |   |    U row      |  
+   *     |   |               |  
+   *     A---C---+---+---+---+
+   *   1 |   |   |   |i x|   |  
+   *     |   |   |   |j w|   |  
+   *     + L +---+---+---+---+
+   * Y 2 |   |   |   |k v|   |  
+   *     | p |   |   |l u|   |                                      
+   *     + a +---+---+---+---+
+   *   3 | n |   |a h|m t|   |  
+   *     | e |   |b g|n s|   |  
+   *     + l +---+---+---+---+
+   *   4 |   |   |c f|o r|   |  
+   *     |   |   |d e|p q|   |  
+   *     +---+---+---+---+---+
+   *
+   * Using 128x128 block partitioning amongst the SPEs results in
+   * non-optimal load balancing of the SPEs. This is shown by the above
+   * example in which SPEs 0 and 1 compute 24 64x64 multiplies, while SPE 2
+   * only computes 16 64x64 multiplies. In addition, the corner turn
+   * between sub-blocks 'h' and 'i' will incur extra DMAs.
+   *
+   * A more computational and transfer efficient load balance would be 
+   * to allocate computation on the 64 sub-blocks. This would allocate 
+   * 22,22,20 sub-block multiplies to each of the SPEs and the corner 
+   * turn becomes efficient. The sub-block, computation (alphabetically
+   * ordered) for SPE 1 becomes:
+   *
+   *               X
+   *       0   1   2   3   4
+   *     +---B---+---+---+---+
+   *   0 |   |    U row      |  
+   *     |   |               |  
+   *     A---C---+---+---+---+
+   *   1 |   |   |  j|k  |   |  
+   *     |   |   |  i|l  |   |  
+   *     + L +---+---+---+---+
+   * Y 2 |   |   |  h|m  |   |  
+   *     | p |   |  g|n  |   |  
+   *     + a +---+---+---+---+
+   *   3 | n |   |  f|o v|   |
+   *     | e |   |  e|p u|   |  
+   *     + l +---+---+---+---+
+   *   4 |   |   |a d|q t|   |  
+   *     |   |   |b c|r s|   |  
+   *     +---+---+---+---+---+
+   *
+   * This more efficient method is employed in the following code.
+   */
+	
+  w_sub = spu_extract(dim, 0);
+  h_sub_v = spu_shuffle(dim, dim, splat_1);
+  h_sub = spu_extract(h_sub_v, 0);
+
+  h_sub2_v = spu_sl(h_sub_v, 1);
+
+  sub_blocks = w_sub * h_sub;
+  sub_blocks_per_spe = (sub_blocks + HPL_ACCEL_SPES-1) / HPL_ACCEL_SPES;
+  
+  start_sub = ((unsigned short)id) * sub_blocks_per_spe;
+  end_sub = start_sub + sub_blocks_per_spe;
+  if (end_sub > sub_blocks) end_sub = sub_blocks;
+      
+  sub_blocks = end_sub - start_sub;
+
+  if (LIKELY((int)sub_blocks > 0)) {
+    /* This SPE has some work to do
+     */
+    DMA_WAIT_REQUEST(-1);
+
+    /* Compute vectors for stepping the effective address matrix pointers.
+     * The pictograms below show 64x64 blocks within the 128x128 blocks.
+     *
+     *   A (L panel)         B (U panel)                C matrix
+     *   ++===+===++     ++---+---++---+---++      ++===+===++===+===++
+     *   || 1 | 2 ||     || 1 | 4 || 5 |   ||      || 1 |   ||   |   ||
+     *   ++---+---++     ++---+---++---+---++      ++---+---++---+---++
+     *   || 3 |   ||     || 2 | 3 ||   |   ||      || 2 |   ||   |   ||
+     *   ++===+===++     ++---+---++---+---++      ++===+===++===+===++
+     *   ||   |   ||                               ||   |   ||   |   || 
+     *   ++---+---++                               ++---+---++---+---++     
+     *   ||   |   ||                               || 3 | 4 ||   |   || 
+     *   ++===+===++                               ++===+===++===+===++
+     *          
+     * a_step = {1 to 2, 2 to 3, 1 to 2, 2 to 3}
+     * b_step = {1 to 2, 2 to 3, 3 to 4, 4 to 5}
+     * c_stepv= {1 to 2, 1 to 2, 1 to 2, 1 to 2}
+     * c_steph= {3 to 4, 3 to 4, 3 to 4, 3 to 4}
+     */
+
+    a_step = spu_promote(lda * M_SUB, 0);
+    a_step = spu_shuffle(a_step, spu_sub(step_sub, a_step), shuf_0404);
+
+    c_stepv = spu_splats(M_SUB*M_SUB*sizeof(double));
+    c_steph = spu_shuffle(ld, ld, splat_2);
+
+    b_step = spu_sel(spu_promote(ldb * M_SUB, 0), c_stepv, b_blk);
+    b_step = spu_shuffle(b_step, spu_sub(0, b_step), shuf_0044);
+    b_step = spu_sel(b_step, spu_sel(step_sub, spu_shuffle(ld, ld, splat_1), b_blk), mask_0101);
+
+    ldb    = spu_extract(spu_sel(spu_promote(ldb, 0), step_sub, b_blk), 0);
+
+    /* Determine the following:
+     * 1) Starting sub-block - x_sub, y_sub
+     * 2) Number of sub-block multiplies before a corner turn - corner. 
+     */
+    x_sub = start_sub / h_sub;
+    y_sub = start_sub - h_sub * x_sub;
+    
+    start_x = x_sub / SUB;
+    y_sub = start_sub - h_sub*SUB*start_x;
+	
+    /* rotate = 4;
+     * 
+     * if (x_sub & 1) {
+     *   y_sub = h_sub - 1 - y_sub;
+     *   a_step = spu_sub(0, a_step);
+     *   c_stepv = spu_sub(0, c_stepv);
+     *   rotate = -rotate;
+     *   corner = 2*y_sub + 2
+     * } else {
+     *   corner = 2 * (h_sub-y_sub)
+     * }
+     */
+    odd = x_sub & 1;
+    
+    down = spu_cmpeq(spu_splats(odd), 0);
+	
+    y_sub = spu_extract(spu_sel(spu_sub(h_sub2_v, spu_promote(y_sub + 1, 0)),
+				spu_promote(y_sub, 0),				
+				down), 0);
+	
+    y_sub2_v = spu_splats(2*y_sub);
+	
+    corner = spu_sel(spu_add(y_sub2_v, 2), spu_sub(h_sub2_v, y_sub2_v), down);
+
+    /* Compute the initial EA buffer pointers.
+     */
+    a_addend = y_sub * spu_extract(step_sub, 0) + spu_extract(spu_andc(a_step, down), 0);
+    b_addend = spu_extract(spu_andc(b_step, down), 0);
+    c_addend = y_sub * spu_extract(c_stepv, 0);
+
+    a_lo += a_addend;
+
+    MATRIX_EA_UADD32(b_hi, b_lo, b_addend);
+    MATRIX_EA_UMADD32(b_hi, b_lo, x_sub, spu_extract(b_step, 1));
+    MATRIX_EA_UADD32(c_hi, c_lo, c_addend);
+    MATRIX_EA_UMADD32(c_hi, c_lo, x_sub, spu_extract(c_steph, 0));
+	
+    /* Adjust the pointer steps according to the initial direction.
+     */
+    a_step = spu_sel(spu_sub(0, a_step), a_step, down);
+    b_step = spu_rlqwbyte(b_step, 8 & ~spu_extract(down, 0));
+    c_stepv = spu_sel(spu_sub(0, c_stepv), c_stepv, down);
+    rotate = ((-4) ^ spu_extract(down, 0)) - spu_extract(down, 0);
+	
+    /* Before starting, make sure all previous DMA transfers are completed so
+     * that all the LS buffers are known to be available.
+     */
+    DMA_WAIT_RECEIVE();
+
+    /* Download 3 blocks to get the process started. After that, each
+     * 64x64 block multiply requires 2 block transfers.
+     */
+    dma_block_getl(&bufA[0][0], a_hi, a_lo, 0, lda);
+
+    dma_block_getl(&bufB[0][0], b_hi, b_lo, 0, ldb);
+
+    dma_block(&bufC[0][0], c_hi, c_lo, 0, MFC_GET_CMD);
+	
+    DMA_WAIT_REQUEST(1<<0);
+
+    c_lo_prev = c_lo;
+	
+    a_lo += spu_extract(a_step, 0);
+    MATRIX_EA_ADD32(b_hi, b_lo, spu_extract(b_step, 0));
+	
+    dma_block_getl(&bufA[1][0], a_hi, a_lo, 1, lda);
+	
+    dma_block_getl(&bufB[1][0], b_hi, b_lo, 1, ldb);
+	
+    phase = 0;
+	
+    i1 = 0;
+    a_idx = 0;
+	
+    /* For each C block, we perform 2 block computations 
+     */
+    for (i=0; i<(int)sub_blocks-1; i++) {
+      /* First block computation 
+       */
+      DMA_WAIT_RECEIVE();
+      DMA_WAIT_REQUEST((1<<1)|(1<<2));
+
+      mm_dp_64Cx64(&bufC[i1][0], &bufA[a_idx][0], &bufB[0][0]);
+      
+      a_step = spu_rlqwbyte(a_step, rotate);
+      
+      c_idx = i1 ^ 1;
+      
+      corner_eq2 = spu_cmpeq(corner, 2);
+
+      /* if (corner == 2) {
+       *   rotate = -rotate;
+       *   a_step = 0-a_step;
+       * } else {
+       *   a_lo += a_step;
+       * }
+       */
+      rotate = (rotate ^ spu_extract(corner_eq2, 0)) - spu_extract(corner_eq2, 0);
+      a_lo += spu_extract(spu_andc(a_step, corner_eq2), 0);
+      a_step = spu_sel(a_step, spu_sub(0, a_step), corner_eq2);
+      
+      /* if corner != 2 then fetch next A buffer
+       * else "corner turn" fetch next B buffer
+       */
+      b_step = spu_rlqwbyte(b_step, 4 & spu_extract(corner_eq2, 0));
+      MATRIX_EA_ADD32(b_hi, b_lo, spu_extract(spu_and(b_step, corner_eq2), 0));
+      
+      idx = spu_extract(spu_andc(spu_promote(a_idx, 0), corner_eq2), 0);
+      buf = spu_extract(spu_sel(spu_promote((unsigned int)bufA, 0),
+				spu_promote((unsigned int)bufB, 0),
+				corner_eq2), 0);
+      hi = spu_extract(spu_sel(spu_promote(a_hi, 0),
+			       spu_promote(b_hi, 0),
+			       corner_eq2), 0);
+      lo = spu_extract(spu_sel(spu_promote(a_lo, 0),
+			       spu_promote(b_lo, 0),
+			       corner_eq2), 0);
+      stride = spu_extract(spu_sel(spu_promote(lda, 0),
+				   spu_promote(ldb, 0),
+				   corner_eq2), 0);
+     
+      buf += idx * (unsigned int)(sizeof(bufA)/2);
+
+#ifdef __GNUC__
+      /* The following lnop was added to keep gcc from rescheduling the
+       * series of add,stqd instruction pairs used to build the DMA list in
+       * dma_block_getl.
+       */
+      si_lnop();
+#endif
+
+      dma_block_getl((vec_double2 *)buf, hi, lo, 0, stride);
+
+      /* if (corner == 2) {
+       *   c_lo += c_steph;
+       *   c_stepv = -c_stepv;
+       * } else {
+       *   c_lo += c_stepv;
+       * }
+       */
+#ifdef MATRIX_4GB_CROSSING
+      c_hi_prev = c_hi;
+#endif
+      c_lo_prev = c_lo;
+      c_addend = spu_extract(spu_sel(c_stepv, c_steph, corner_eq2), 0);
+      MATRIX_EA_ADD32(c_hi, c_lo, c_addend);
+      c_stepv = spu_sel(c_stepv, spu_sub(0, c_stepv), corner_eq2);
+      
+      /* Before getting another C buffer, we must wait for the previous
+       * one to be stored.
+       */
+      DMA_WAIT_RECEIVE();
+      dma_block(&bufC[c_idx][0], c_hi, c_lo, 0, MFC_GET_CMD);
+
+      DMA_WAIT_REQUEST(1<<0);
+      
+      a_idx = phase^1;
+
+      /* Second block computation 
+       */
+      c_ptr = &bufC[i1][0];
+
+      mm_dp_64Cx64(c_ptr, &bufA[a_idx][0], &bufB[1][0]);
+      
+      a_step = spu_rlqwbyte(a_step, rotate);
+      a_lo += spu_extract(a_step, 0);
+            
+      /* if corner != 2 then fetch next A buffer
+       * else "corner turn" fetch next B buffer
+       */
+      b_step = spu_rlqwbyte(b_step, 4 & spu_extract(corner_eq2, 0));
+      MATRIX_EA_ADD32(b_hi, b_lo, spu_extract(spu_and(b_step, corner_eq2), 0));
+      
+      idx = spu_extract(spu_sel(spu_promote(a_idx, 0), vone, corner_eq2), 0);
+      buf = spu_extract(spu_sel(spu_promote((unsigned int)bufA, 0),
+				spu_promote((unsigned int)bufB, 0),
+				corner_eq2), 0);
+      hi = spu_extract(spu_sel(spu_promote(a_hi, 0),
+			       spu_promote(b_hi, 0),
+			       corner_eq2), 0);
+      lo = spu_extract(spu_sel(spu_promote(a_lo, 0),
+			       spu_promote(b_lo, 0),
+			       corner_eq2), 0);
+      stride = spu_extract(spu_sel(spu_promote(lda, 0),
+				   spu_promote(ldb, 0),
+				   corner_eq2), 0);
+     
+      buf += idx * (unsigned int)(sizeof(bufA)/2);
+      dma_block_getl((vec_double2 *)buf, hi, lo, 1, stride);
+
+#ifdef MATRIX_4GB_CROSSING
+      dma_block(c_ptr, c_hi_prev, c_lo_prev, 2, MFC_PUT_CMD);
+#else
+      dma_block(c_ptr, c_hi, c_lo_prev, 2, MFC_PUT_CMD);
+#endif
+      
+      corner = spu_sel(spu_add(corner, -2), h_sub2_v, corner_eq2);
+      phase ^= spu_extract(corner_eq2, 0) & 1;
+      
+      i1 ^= 1;
+      a_idx = phase;
+    }
+    
+    /* Finish the last sub-block */
+    DMA_WAIT_RECEIVE();
+    DMA_WAIT_REQUEST((1<<1)|(1<<2));
+    
+    mm_dp_64Cx64(&bufC[i1][0], &bufA[a_idx][0], &bufB[0][0]);
+    
+    DMA_WAIT_RECEIVE();
+    
+    mm_dp_64Cx64(&bufC[i1][0], &bufA[a_idx^1][0], &bufB[1][0]);
+    
+    dma_block(&bufC[i1][0], c_hi, c_lo, 1, MFC_PUT_CMD);
+  }
+
+  /* Report completion status if requested. 
+   */
+  report_completion(id, cmd_parms->incomplete, 1);
+}
+
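+/* For reference, a scalar sketch of the serpentine visit order described
+ * in the large comment above (hypothetical helper, not part of the SPU
+ * build):
+ */
+#if 0
+static void serpentine_visit(unsigned int h_sub,
+                             unsigned int start_sub, unsigned int end_sub)
+{
+  unsigned int s, x_sub, y_sub;
+
+  for (s = start_sub; s < end_sub; s++) {
+    x_sub = s / h_sub;
+    y_sub = s % h_sub;
+    if (x_sub & 1)
+      y_sub = h_sub - 1 - y_sub;    /* odd columns walk back upward */
+    /* compute C(x_sub, y_sub) -= A * B for this 64x64 sub-block */
+  }
+}
+#endif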
+
+
Index: accel/lib/spu/accel_dgemm.h
===================================================================
RCS file: accel/lib/spu/accel_dgemm.h
diff -N accel/lib/spu/accel_dgemm.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/spu/accel_dgemm.h	20 Aug 2008 03:57:53 -0000	1.3
@@ -0,0 +1,164 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#ifndef _ACCEL_DGEMM_H_
+#define _ACCEL_DGEMM_H_		1
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <spu_mfcio.h>
+#include <spu_intrinsics.h>
+#include "hpl_accel_spu.h"
+#include "accel_buffers.h"
+#include "accel_utils.h"
+
+extern hpl_accel_init_parms_t parms;
+
+extern void mm_dp_64Cx64(vec_double2 *blkC, vec_double2 *blkA, vec_double2 *blkB);
+
+static inline void dma_block(vec_double2 *ls, unsigned int hi, unsigned int lo, unsigned int tag, unsigned int cmd)
+{
+  spu_mfcdma64(ls, hi, lo, 16384, tag, cmd);
+  spu_mfcdma64(ls+(16384/16), hi, lo+16384, 16384, tag, cmd);
+}
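+
+/* Note: dma_block moves one 64x64 block of doubles (32 KiB) as two DMAs
+ * because a single MFC transfer is limited to 16 KiB.
+ */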
+
+static inline void dma_block_getl(vec_double2 *ls, unsigned int hi, unsigned int lo, unsigned int tag, unsigned int stride)
+{
+  vec_uint4 *list;
+  vec_uint4 e0, e1, e2;
+  vec_uint4 stride2, stride4, stride6;
+
+  /* Place the list at the end of the target LS buffer.
+   */
+  list = (vec_uint4 *)ls + (((M_SUB*M_SUB*sizeof(double)) - (M_SUB*8)) / sizeof(vec_uint4));
+
+  /* Construct e0, e1, e2 and stride6 to contain
+   *
+   * e0 = {row size, lo+0*stride, row size, lo+1*stride}
+   * e1 = {row size, lo+2*stride, row size, lo+3*stride}
+   * e2 = {row size, lo+4*stride, row size, lo+5*stride}
+   *
+   * stride6 = {0, 6*stride, 0, 6*stride}
+   */
+
+  e0 = spu_add(spu_shuffle(spu_splats((unsigned int)(M_SUB*sizeof(double))),
+			   spu_promote(lo, 0), 
+			   ((vec_uchar16){0,1,2,3, 16,17,18,19, 0,1,2,3, 16,17,18,19})),
+	       spu_rlmaskqwbyte(spu_promote(stride, 0), -12));
+
+  
+  stride2 = spu_sl(spu_shuffle(spu_promote(stride, 0), spu_promote(stride, 0),
+			       ((vec_uchar16){128,128,128,128, 0,1,2,3, 128,128,128,128, 0,1,2,3})), 1);
+  stride4 = spu_add(stride2, stride2);
+  stride6 = spu_add(stride2, stride4);
+
+  e1 = spu_add(e0, stride2);
+
+  e2 = spu_add(e0, stride4); list[0]  = e0; 
+  e0 = spu_add(e0, stride6); list[1]  = e1;
+  e1 = spu_add(e1, stride6); list[2]  = e2;
+  e2 = spu_add(e2, stride6); list[3]  = e0;
+  e0 = spu_add(e0, stride6); list[4]  = e1;
+  e1 = spu_add(e1, stride6); list[5]  = e2;
+  e2 = spu_add(e2, stride6); list[6]  = e0;
+  e0 = spu_add(e0, stride6); list[7]  = e1;
+  e1 = spu_add(e1, stride6); list[8]  = e2;
+  e2 = spu_add(e2, stride6); list[9]  = e0;
+  e0 = spu_add(e0, stride6); list[10]  = e1;
+  e1 = spu_add(e1, stride6); list[11]  = e2;
+  e2 = spu_add(e2, stride6); list[12]  = e0;
+  e0 = spu_add(e0, stride6); list[13]  = e1;
+  e1 = spu_add(e1, stride6); list[14]  = e2;
+  e2 = spu_add(e2, stride6); list[15]  = e0;
+  e0 = spu_add(e0, stride6); list[16]  = e1;
+  e1 = spu_add(e1, stride6); list[17]  = e2;
+  e2 = spu_add(e2, stride6); list[18]  = e0;
+  e0 = spu_add(e0, stride6); list[19]  = e1;
+  e1 = spu_add(e1, stride6); list[20]  = e2;
+  e2 = spu_add(e2, stride6); list[21]  = e0;
+  e0 = spu_add(e0, stride6); list[22]  = e1;
+  e1 = spu_add(e1, stride6); list[23]  = e2;
+  e2 = spu_add(e2, stride6); list[24]  = e0;
+  e0 = spu_add(e0, stride6); list[25]  = e1;
+  e1 = spu_add(e1, stride6); list[26]  = e2;
+  e2 = spu_add(e2, stride6); list[27]  = e0;
+  e0 = spu_add(e0, stride6); list[28]  = e1;
+  e1 = spu_add(e1, stride6); list[29]  = e2;
+                             list[30]  = e0;
+                             list[31]  = e1;
+
+  /* Initiate the DMA transfer
+   */
+  spu_mfcdma64(ls, hi, (unsigned int)list, 8*M_SUB, tag, MFC_GETL_CMD);
+}
+
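+/* The unrolled sequences above and in dma_block_putl below build a
+ * 32-quadword DMA list holding 64 {size, ea-low} elements, one per row of
+ * M_SUB doubles. A scalar equivalent, for reference only:
+ *
+ *   unsigned int *e = (unsigned int *)list;
+ *   for (i = 0; i < M_SUB; i++) {
+ *     e[2*i]   = M_SUB * sizeof(double);     size of one row in bytes
+ *     e[2*i+1] = lo + i * stride;            low 32 EA bits of row i
+ *   }
+ *
+ * The three rotating accumulators (e0, e1, e2), each stepped by 6*stride,
+ * presumably keep the dependent add chains short so the adds and stores
+ * can be dual-issued.
+ */
+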
+static inline void dma_block_putl(vec_uint4 *list, vec_double2 *ls, unsigned int hi, unsigned int lo, unsigned int tag, unsigned int stride)
+{
+  vec_uint4 e0, e1, e2;
+  vec_uint4 stride2, stride4, stride6;
+
+  /* Construct e0, e1, e2 and stride6 to contain
+   *
+   * e0 = {row size, lo+0*stride, row size, lo+1*stride}
+   * e1 = {row size, lo+2*stride, row size, lo+3*stride}
+   * e2 = {row size, lo+4*stride, row size, lo+5*stride}
+   *
+   * stride6 = {0, 6*stride, 0, 6*stride}
+   */
+
+  e0 = spu_add(spu_shuffle(spu_splats((unsigned int)(M_SUB*sizeof(double))),
+			   spu_promote(lo, 0), 
+			   ((vec_uchar16){0,1,2,3, 16,17,18,19, 0,1,2,3, 16,17,18,19})),
+	       spu_rlmaskqwbyte(spu_promote(stride, 0), -12));
+
+  
+  stride2 = spu_sl(spu_shuffle(spu_promote(stride, 0), spu_promote(stride, 0),
+			       ((vec_uchar16){128,128,128,128, 0,1,2,3, 128,128,128,128, 0,1,2,3})), 1);
+  stride4 = spu_add(stride2, stride2);
+  stride6 = spu_add(stride2, stride4);
+
+  e1 = spu_add(e0, stride2);
+
+  e2 = spu_add(e0, stride4); list[0]  = e0; 
+  e0 = spu_add(e0, stride6); list[1]  = e1;
+  e1 = spu_add(e1, stride6); list[2]  = e2;
+  e2 = spu_add(e2, stride6); list[3]  = e0;
+  e0 = spu_add(e0, stride6); list[4]  = e1;
+  e1 = spu_add(e1, stride6); list[5]  = e2;
+  e2 = spu_add(e2, stride6); list[6]  = e0;
+  e0 = spu_add(e0, stride6); list[7]  = e1;
+  e1 = spu_add(e1, stride6); list[8]  = e2;
+  e2 = spu_add(e2, stride6); list[9]  = e0;
+  e0 = spu_add(e0, stride6); list[10]  = e1;
+  e1 = spu_add(e1, stride6); list[11]  = e2;
+  e2 = spu_add(e2, stride6); list[12]  = e0;
+  e0 = spu_add(e0, stride6); list[13]  = e1;
+  e1 = spu_add(e1, stride6); list[14]  = e2;
+  e2 = spu_add(e2, stride6); list[15]  = e0;
+  e0 = spu_add(e0, stride6); list[16]  = e1;
+  e1 = spu_add(e1, stride6); list[17]  = e2;
+  e2 = spu_add(e2, stride6); list[18]  = e0;
+  e0 = spu_add(e0, stride6); list[19]  = e1;
+  e1 = spu_add(e1, stride6); list[20]  = e2;
+  e2 = spu_add(e2, stride6); list[21]  = e0;
+  e0 = spu_add(e0, stride6); list[22]  = e1;
+  e1 = spu_add(e1, stride6); list[23]  = e2;
+  e2 = spu_add(e2, stride6); list[24]  = e0;
+  e0 = spu_add(e0, stride6); list[25]  = e1;
+  e1 = spu_add(e1, stride6); list[26]  = e2;
+  e2 = spu_add(e2, stride6); list[27]  = e0;
+  e0 = spu_add(e0, stride6); list[28]  = e1;
+  e1 = spu_add(e1, stride6); list[29]  = e2;
+                             list[30]  = e0;
+                             list[31]  = e1;
+
+  /* Initiate the DMA transfer
+   */
+  spu_mfcdma64(ls, hi, (unsigned int)list, 8*M_SUB, tag, MFC_PUTL_CMD);
+}
+
+
+
+#endif /* _ACCEL_DGEMM_H_ */
Index: accel/lib/spu/accel_dgemm_C.c
===================================================================
RCS file: accel/lib/spu/accel_dgemm_C.c
diff -N accel/lib/spu/accel_dgemm_C.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/spu/accel_dgemm_C.c	20 Aug 2008 03:57:53 -0000	1.3
@@ -0,0 +1,229 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <spu_mfcio.h>
+#include <spu_intrinsics.h>
+#include "hpl_accel_spu.h"
+#include "accel_buffers.h"
+#include "accel_utils.h"
+
+
+extern void mm_dp(int k, int m, int n, vector double *c, vector double *a, vector double *b);
+
+
+/* Construct a DMA list assuming there are 64 columns. If there are fewer, the trailing list entries are simply not used.
+ */
+static inline void construct_list(vec_uint4 *list, unsigned int lo, unsigned int stride, unsigned int elementsize)
+{
+  vec_uint4 e0, e1, e2;
+  vec_uint4 stride2, stride4, stride6;
+
+  /* Construct e0, e1, e2 and stride6 to contain
+   *
+   * e0 = {row size, lo+0*stride, row size, lo+1*stride}
+   * e1 = {row size, lo+2*stride, row size, lo+3*stride}
+   * e2 = {row size, lo+4*stride, row size, lo+5*stride}
+   *
+   * stride6 = {0, 6*stride, 0, 6*stride}
+   */
+
+  e0 = spu_add(spu_shuffle(spu_promote(elementsize, 0), spu_promote(lo, 0), 
+			   ((vec_uchar16){0,1,2,3, 16,17,18,19, 0,1,2,3, 16,17,18,19})),
+	       spu_rlmaskqwbyte(spu_promote(stride, 0), -12));
+
+  
+  stride2 = spu_sl(spu_shuffle(spu_promote(stride, 0), spu_promote(stride, 0),
+			       ((vec_uchar16){128,128,128,128, 0,1,2,3, 128,128,128,128, 0,1,2,3})), 1);
+  stride4 = spu_add(stride2, stride2);
+  stride6 = spu_add(stride2, stride4);
+
+  e1 = spu_add(e0, stride2);
+
+  e2 = spu_add(e0, stride4); list[0]  = e0; 
+  e0 = spu_add(e0, stride6); list[1]  = e1;
+  e1 = spu_add(e1, stride6); list[2]  = e2;
+  e2 = spu_add(e2, stride6); list[3]  = e0;
+  e0 = spu_add(e0, stride6); list[4]  = e1;
+  e1 = spu_add(e1, stride6); list[5]  = e2;
+  e2 = spu_add(e2, stride6); list[6]  = e0;
+  e0 = spu_add(e0, stride6); list[7]  = e1;
+  e1 = spu_add(e1, stride6); list[8]  = e2;
+  e2 = spu_add(e2, stride6); list[9]  = e0;
+  e0 = spu_add(e0, stride6); list[10]  = e1;
+  e1 = spu_add(e1, stride6); list[11]  = e2;
+  e2 = spu_add(e2, stride6); list[12]  = e0;
+  e0 = spu_add(e0, stride6); list[13]  = e1;
+  e1 = spu_add(e1, stride6); list[14]  = e2;
+  e2 = spu_add(e2, stride6); list[15]  = e0;
+  e0 = spu_add(e0, stride6); list[16]  = e1;
+  e1 = spu_add(e1, stride6); list[17]  = e2;
+  e2 = spu_add(e2, stride6); list[18]  = e0;
+  e0 = spu_add(e0, stride6); list[19]  = e1;
+  e1 = spu_add(e1, stride6); list[20]  = e2;
+  e2 = spu_add(e2, stride6); list[21]  = e0;
+  e0 = spu_add(e0, stride6); list[22]  = e1;
+  e1 = spu_add(e1, stride6); list[23]  = e2;
+  e2 = spu_add(e2, stride6); list[24]  = e0;
+  e0 = spu_add(e0, stride6); list[25]  = e1;
+  e1 = spu_add(e1, stride6); list[26]  = e2;
+  e2 = spu_add(e2, stride6); list[27]  = e0;
+  e0 = spu_add(e0, stride6); list[28]  = e1;
+  e1 = spu_add(e1, stride6); list[29]  = e2;
+                             list[30]  = e0;
+                             list[31]  = e1;
+}
+
+
+
+/* Double precision DGEMM matrix-matrix multiply for column-ordered
+ * matrices.
+ */
+void accel_dgemm_C_C_C(hpl_accel_init_parms_t *parms, 
+		       volatile hpl_accel_dgemm_parms_t *cmd_parms)
+{
+  int rows, next_rows;
+  unsigned int id, i, k, m, m_start, m_next, n;
+  unsigned int elementsize, idx, tag;
+  unsigned int blks, blks_per_spe, extra_blks;
+  unsigned long long a, b, c;		/* ea pointers */
+  unsigned int a_hi, a_lo;
+  unsigned int b_hi, b_lo;
+  unsigned int c_hi, c_lo;
+  unsigned int lda, ldb, ldc;
+  vec_uint4 ld, dim, *list, *c_list, *c_list_next;
+  vec_double2 *A, *B, *C;
+  void *ptrB;
+
+  id = parms->id;
+
+  /* Wait for the transfer of the parameters to complete
+   */
+  DMA_WAIT_RECEIVE();
+  DMA_WAIT_REQUEST(-1);
+
+  /* Fetch the command parameters
+   */
+  a = cmd_parms->a;
+  b = cmd_parms->b;
+  c = cmd_parms->c;
+
+  a_hi = mfc_ea2h(a);
+  a_lo = mfc_ea2l(a);
+
+  b_hi = mfc_ea2h(b);
+  b_lo = mfc_ea2l(b);
+
+  c_hi = mfc_ea2h(c);
+  c_lo = mfc_ea2l(c);
+
+  ld = cmd_parms->ld;
+
+  lda = spu_extract(ld, 0);
+  ldb = spu_extract(ld, 1); 
+  ldc = spu_extract(ld, 2); 
+      
+  dim = cmd_parms->dim;
+ 
+  n = spu_extract(dim, 0);
+  m = spu_extract(dim, 1);
+  k = spu_extract(dim, 2);
+
+  /* Get a copy of B 
+   */
+  B = (void *)&bufB[0][0];
+  ptrB = B;
+  DMA_WAIT_RECEIVE();
+  for (i=0; i<n; i++) {
+      spu_mfcdma64(ptrB, b_hi, b_lo, k*sizeof(double), 0, MFC_GET_CMD);
+      MATRIX_EA_UADD32(b_hi, b_lo, ldb);
+      ptrB += k*sizeof(double);
+  }
+
+  /* Determine the amount of work to be done by this SPE. Work is evenly
+   * distributed amongst the SPEs in blocks of M_SUB rows.
+   */
+  blks = (m + (M_SUB-1)) / M_SUB;
+  blks_per_spe = blks / HPL_ACCEL_SPES;
+  extra_blks = blks - HPL_ACCEL_SPES * blks_per_spe;
+  blks_per_spe *= M_SUB;
+
+  rows = blks_per_spe;
+  if (id < extra_blks) rows += M_SUB;
+  m_start = id * blks_per_spe + M_SUB * ((id < extra_blks) ? id : extra_blks);
+  if ((m_start + rows) > m) rows = m - m_start;
+
+  a_lo += m_start * sizeof(double);
+  c_lo += m_start * sizeof(double);
+
+  /* Fetch a block of A and C 
+   */
+  m = (rows > M_SUB) ? M_SUB : rows;
+  
+  elementsize = m * sizeof(double);
+  A = (void *)&bufA[0][0];
+  list = (vec_uint4 *)A + (((M_SUB*M_SUB*sizeof(double)) - (M_SUB*8)) / sizeof(vec_uint4));
+  construct_list(list, a_lo, lda, elementsize);
+  spu_mfcdma64(A, a_hi, (unsigned int)list, 8*k, 0, MFC_GETL_CMD);
+  
+  c_list = (vec_uint4 *)&bufB[1][0];
+  construct_list(c_list, c_lo, ldc, elementsize);
+  spu_mfcdma64((vec_double2 *)&bufC[0][0], c_hi, (unsigned int)c_list, 8*n, 0, MFC_GETL_CMD);
+  
+  DMA_WAIT_REQUEST(1);
+
+  tag = 1;
+  idx = 1;
+  next_rows = rows - M_SUB;
+
+  while (next_rows > 0) {
+    /* Fetch the next block of A and C */
+    a_lo += elementsize;
+    c_lo += elementsize;
+      
+    m_next = (next_rows > M_SUB) ? M_SUB : next_rows;
+    elementsize = m_next * sizeof(double);
+    A = (void *)&bufA[tag][0];
+    list = (vec_uint4 *)A + (((M_SUB*M_SUB*sizeof(double)) - (M_SUB*8)) / sizeof(vec_uint4));
+    construct_list(list, a_lo, lda, elementsize);
+    spu_mfcdma64(A, a_hi, (unsigned int)list, 8*k, tag, MFC_GETLB_CMD);
+  
+    c_list_next = (vec_uint4 *)&bufB[1][idx*M_SUB];
+    construct_list(c_list_next, c_lo, ldc, elementsize);
+    spu_mfcdma64((vec_double2 *)&bufC[tag][0], c_hi, (unsigned int)c_list_next, 8*n, tag, MFC_GETL_CMD);
+    
+    /* Compute a block */
+    DMA_WAIT_RECEIVE();
+    DMA_WAIT_REQUEST(1<<tag);
+    
+    idx = (idx + 1) & 3;		/* quad buffer the C list */
+    tag ^= 1;
+    A = (vec_double2 *)&bufA[tag][0];
+    C = (vec_double2 *)&bufC[tag][0];
+    mm_dp(k, n, m, C, B, A);
+    
+    /* Write the result C block back to memory */
+    spu_mfcdma64(C, c_hi, (unsigned int)c_list, 8*n, tag, MFC_PUTL_CMD);
+
+    m = m_next;
+    c_list = c_list_next;
+    rows = next_rows;
+    next_rows -= M_SUB;
+  }
+  /* Compute the final block and write it back to memory */
+  DMA_WAIT_RECEIVE();
+
+  tag ^= 1;
+  A = (vec_double2 *)&bufA[tag][0];
+  C = (vec_double2 *)&bufC[tag][0];
+  mm_dp(k, n, m, C, B, A);
+
+  spu_mfcdma64((vec_double2 *)&bufC[tag][0], c_hi, (unsigned int)c_list, 8*n, tag, MFC_PUTL_CMD);
+
+  /* Report completion status if requested. 
+   */
+  report_completion(id, cmd_parms->incomplete, tag);
+}
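+
+/* Note on the "quad buffer" above: an MFC list transfer reads its list
+ * elements out of local store while it executes, so each C list must stay
+ * intact until its PUTL has drained. Rotating through four list slots
+ * presumably provides that headroom across the overlapped get/compute/put
+ * stages.
+ */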
Index: accel/lib/spu/accel_dgemm_CL.c
===================================================================
RCS file: accel/lib/spu/accel_dgemm_CL.c
diff -N accel/lib/spu/accel_dgemm_CL.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/spu/accel_dgemm_CL.c	14 May 2008 21:35:00 -0000	1.6
@@ -0,0 +1,231 @@
+/* --------------------------------------------------------------  */
+/* (C)Copyright 2007                                               */
+/* International Business Machines Corporation,                    */
+/*                                                                 */
+/* All Rights Reserved.                                            */
+/* --------------------------------------------------------------  */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <spu_mfcio.h>
+#include <spu_intrinsics.h>
+#include "hpl_accel_spu.h"
+#include "accel_buffers.h"
+#include "accel_utils.h"
+
+
+extern void mm_dp(int k, int m, int n, vector double *c, vector double *a, vector double *b);
+
+
+/* Construct a DMA list assuming there are 64 columns. If there are fewer, the trailing list entries are simply not used.
+ */
+static inline void construct_list(vec_uint4 *list, unsigned int lo, unsigned int stride, unsigned int elementsize)
+{
+  vec_uint4 e0, e1, e2;
+  vec_uint4 stride2, stride4, stride6;
+
+  /* Construct e0, e1, e2 and stride6 to contain
+   *
+   * e0 = {row size, lo+0*stride, row size, lo+1*stride}
+   * e1 = {row size, lo+2*stride, row size, lo+3*stride}
+   * e2 = {row size, lo+4*stride, row size, lo+5*stride}
+   *
+   * stride6 = {0, 6*stride, 0, 6*stride}
+   */
+
+  e0 = spu_add(spu_shuffle(spu_promote(elementsize, 0), spu_promote(lo, 0), 
+			   ((vec_uchar16){0,1,2,3, 16,17,18,19, 0,1,2,3, 16,17,18,19})),
+	       spu_rlmaskqwbyte(spu_promote(stride, 0), -12));
+
+  
+  stride2 = spu_sl(spu_shuffle(spu_promote(stride, 0), spu_promote(stride, 0),
+			       ((vec_uchar16){128,128,128,128, 0,1,2,3, 128,128,128,128, 0,1,2,3})), 1);
+  stride4 = spu_add(stride2, stride2);
+  stride6 = spu_add(stride2, stride4);
+
+  e1 = spu_add(e0, stride2);
+
+  e2 = spu_add(e0, stride4); list[0]  = e0; 
+  e0 = spu_add(e0, stride6); list[1]  = e1;
+  e1 = spu_add(e1, stride6); list[2]  = e2;
+  e2 = spu_add(e2, stride6); list[3]  = e0;
+  e0 = spu_add(e0, stride6); list[4]  = e1;
+  e1 = spu_add(e1, stride6); list[5]  = e2;
+  e2 = spu_add(e2, stride6); list[6]  = e0;
+  e0 = spu_add(e0, stride6); list[7]  = e1;
+  e1 = spu_add(e1, stride6); list[8]  = e2;
+  e2 = spu_add(e2, stride6); list[9]  = e0;
+  e0 = spu_add(e0, stride6); list[10]  = e1;
+  e1 = spu_add(e1, stride6); list[11]  = e2;
+  e2 = spu_add(e2, stride6); list[12]  = e0;
+  e0 = spu_add(e0, stride6); list[13]  = e1;
+  e1 = spu_add(e1, stride6); list[14]  = e2;
+  e2 = spu_add(e2, stride6); list[15]  = e0;
+  e0 = spu_add(e0, stride6); list[16]  = e1;
+  e1 = spu_add(e1, stride6); list[17]  = e2;
+  e2 = spu_add(e2, stride6); list[18]  = e0;
+  e0 = spu_add(e0, stride6); list[19]  = e1;
+  e1 = spu_add(e1, stride6); list[20]  = e2;
+  e2 = spu_add(e2, stride6); list[21]  = e0;
+  e0 = spu_add(e0, stride6); list[22]  = e1;
+  e1 = spu_add(e1, stride6); list[23]  = e2;
+  e2 = spu_add(e2, stride6); list[24]  = e0;
+  e0 = spu_add(e0, stride6); list[25]  = e1;
+  e1 = spu_add(e1, stride6); list[26]  = e2;
+  e2 = spu_add(e2, stride6); list[27]  = e0;
+  e0 = spu_add(e0, stride6); list[28]  = e1;
+  e1 = spu_add(e1, stride6); list[29]  = e2;
+                             list[30]  = e0;
+                             list[31]  = e1;
+}
+
+
+
+/* Double precision DGEMM matrix-matrix multiply for column-ordered
+ * matrices.
+ */
+void accel_dgemm_CL_C_C(hpl_accel_init_parms_t *parms, 
+			volatile hpl_accel_dgemm_parms_t *cmd_parms)
+{
+  int rows, next_rows;
+  unsigned int id, i, k, m, m_start, m_next, n;
+  unsigned int elementsize, idx, tag;
+  unsigned int blks, blks_per_spe, extra_blks;
+  unsigned long long a, b, c;		/* ea pointers */
+  unsigned int a_hi, a_lo;
+  unsigned int b_hi, b_lo;
+  unsigned int c_hi, c_lo;
+  unsigned int lda, ldb, ldc;
+  vec_uint4 ld, dim, *list, *c_list, *c_list_next;
+  vec_double2 *A, *B, *C;
+  void *ptrB;
+
+  id = parms->id;
+
+  /* Wait for the transfer of the parameters to complete
+   */
+  DMA_WAIT_RECEIVE();
+  DMA_WAIT_REQUEST(-1);
+
+  /* Fetch the command parameters
+   */
+  a = cmd_parms->a;
+  b = cmd_parms->b;
+  c = cmd_parms->c;
+
+  a_hi = mfc_ea2h(a);
+  a_lo = mfc_ea2l(a);
+
+  b_hi = mfc_ea2h(b);
+  b_lo = mfc_ea2l(b);
+
+  c_hi = mfc_ea2h(c);
+  c_lo = mfc_ea2l(c);
+
+  ld = cmd_parms->ld;
+
+  lda = spu_extract(ld, 0);
+  ldb = spu_extract(ld, 1); 
+  ldc = spu_extract(ld, 2); 
+      
+  dim = cmd_parms->dim;
+ 
+  n = spu_extract(dim, 0);
+  m = spu_extract(dim, 1);
+  k = spu_extract(dim, 2);
+
+  /* Get a copy of B 
+   */
+  B = (void *)&bufB[0][0];
+  ptrB = B;
+  DMA_WAIT_RECEIVE();
+  for (i=0; i<n; i++) {
+      spu_mfcdma64(ptrB, b_hi, b_lo, k*sizeof(double), 0, MFC_GET_CMD);
+      MATRIX_EA_UADD32(b_hi, b_lo, ldb);
+      ptrB += k*sizeof(double);
+  }
+
+  /* Determine the amount of work to be done by this SPE. Work is evenly
+   * distributed amongst the SPEs in blocks of M_SUB rows.
+   */
+  blks = (m + (M_SUB-1)) / M_SUB;
+  blks_per_spe = blks / HPL_ACCEL_SPES;
+  extra_blks = blks - HPL_ACCEL_SPES * blks_per_spe;
+  blks_per_spe *= M_SUB;
+
+  rows = blks_per_spe;
+  if (id < extra_blks) rows += M_SUB;
+  m_start = id * blks_per_spe + M_SUB * ((id < extra_blks) ? id : extra_blks);
+  if ((m_start + rows) > m) rows = m - m_start;
+
+  a_lo += m_start * sizeof(double);
+  c_lo += m_start * sizeof(double);
+
+  /* Fetch a block of A and C 
+   */
+  m = (rows > M_SUB) ? M_SUB : rows;
+  
+  elementsize = m * sizeof(double);
+  A = (void *)&bufA[0][0];
+  list = (vec_uint4 *)A + (((M_SUB*M_SUB*sizeof(double)) - (M_SUB*8)) / sizeof(vec_uint4));
+  construct_list(list, a_lo, lda, elementsize);
+  spu_mfcdma64(A, a_hi, (unsigned int)list, 8*k, 0, MFC_GETL_CMD);
+  
+  c_list = (vec_uint4 *)&bufB[1][0];
+  construct_list(c_list, c_lo, ldc, elementsize);
+  spu_mfcdma64((vec_double2 *)&bufC[0][0], c_hi, (unsigned int)c_list, 8*n, 0, MFC_GETL_CMD);
+  
+  DMA_WAIT_REQUEST(1);
+
+  tag = 1;
+  idx = 1;
+  next_rows = rows - M_SUB;
+
+  while (next_rows > 0) {
+    /* Fetch the next block of A and C */
+    a_lo += elementsize;
+    c_lo += elementsize;
+      
+    m_next = (next_rows > M_SUB) ? M_SUB : next_rows;
+    elementsize = m_next * sizeof(double);
+    A = (void *)&bufA[tag][0];
+    list = (vec_uint4 *)A + (((M_SUB*M_SUB*sizeof(double)) - (M_SUB*8)) / sizeof(vec_uint4));
+    construct_list(list, a_lo, lda, elementsize);
+    spu_mfcdma64(A, a_hi, (unsigned int)list, 8*k, tag, MFC_GETLB_CMD);
+  
+    c_list_next = (vec_uint4 *)&bufB[1][idx*M_SUB];
+    construct_list(c_list_next, c_lo, ldc, elementsize);
+    spu_mfcdma64((vec_double2 *)&bufC[tag][0], c_hi, (unsigned int)c_list_next, 8*n, tag, MFC_GETL_CMD);
+    
+    /* Compute a block */
+    DMA_WAIT_RECEIVE();
+    DMA_WAIT_REQUEST(1<<tag);
+    
+    idx = (idx + 1) & 3;		/* quad buffer the C list */
+    tag ^= 1;
+    A = (vec_double2 *)&bufA[tag][0];
+    C = (vec_double2 *)&bufC[tag][0];
+    mm_dp(k, n, m, C, B, A);
+    
+    /* Write the result C block back to memory */
+    spu_mfcdma64(C, c_hi, (unsigned int)c_list, 8*n, tag, MFC_PUTL_CMD);
+
+    m = m_next;
+    c_list = c_list_next;
+    rows = next_rows;
+    next_rows -= M_SUB;
+  }
+  /* Compute the final block and write it back to memory */
+  DMA_WAIT_RECEIVE();
+
+  tag ^= 1;
+  A = (vec_double2 *)&bufA[tag][0];
+  C = (vec_double2 *)&bufC[tag][0];
+  mm_dp(k, n, m, C, B, A);
+
+  spu_mfcdma64((vec_double2 *)&bufC[tag][0], c_hi, (unsigned int)c_list, 8*n, tag, MFC_PUTL_CMD);
+
+  /* Report completion status if requested. 
+   */
+  report_completion(id, cmd_parms->incomplete, tag);
+}
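+
+/* Illustrative host-side sketch (not part of the library) of the row
+ * partitioning used above. m rows are split into blocks M_SUB rows high,
+ * the blocks are dealt out contiguously, and the first `extra` SPEs absorb
+ * one extra block each. For example, with m = 1000, M_SUB = 64 and 8 SPEs:
+ * 16 blocks, 2 per SPE, and SPE 7's share is trimmed from 128 rows to
+ * 1000 - 896 = 104 rows.
+ */
+static void spe_row_partition(unsigned int id, unsigned int m,
+                              unsigned int m_sub, unsigned int spes,
+                              unsigned int *m_start, int *rows)
+{
+  unsigned int blks = (m + m_sub - 1) / m_sub;    /* total row blocks */
+  unsigned int per_spe = blks / spes;             /* whole blocks per SPE */
+  unsigned int extra = blks - spes * per_spe;     /* leftover blocks */
+
+  *rows = per_spe * m_sub + ((id < extra) ? m_sub : 0);
+  *m_start = id * per_spe * m_sub
+           + m_sub * ((id < extra) ? id : extra); /* skip earlier extras */
+  if (*m_start + (unsigned int)*rows > m)
+    *rows = (int)(m - *m_start);                  /* trim the final block */
+}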
Index: accel/lib/spu/accel_dgemm_panel.c
===================================================================
RCS file: accel/lib/spu/accel_dgemm_panel.c
diff -N accel/lib/spu/accel_dgemm_panel.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/spu/accel_dgemm_panel.c	23 Oct 2008 21:20:24 -0000	1.5
@@ -0,0 +1,585 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#include "accel_dgemm.h"
+
+/* transpose_and_swap
+ * ------------------
+ * For a 64x64 matrix m, transpose the matrix in place and byte-swap the contents.
+ */
+
+static void transpose_and_swap(vec_double2 m[])
+{
+  int i, j;
+  vec_double2 *row, *col;
+#ifdef ACCEL_LITTLE_ENDIAN
+  vec_uchar16 pat_even = (vec_uchar16){7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
+#else
+  vec_uchar16 pat_even = (vec_uchar16){0,1,2,3,4,5,6,7, 16,17,18,19,20,21,22,23};
+#endif
+  vec_uchar16 pat_odd;
+  vec_double2 r00, r01, r10, r11, r20, r21, r30, r31;
+  vec_double2 c00, c01, c10, c11, c20, c21, c30, c31;
+
+  pat_odd = spu_or(pat_even, 8);
+
+  /* Perform transpose and swap on 4x4 micro blocks 
+   */
+  for (i=0; i<64; i+=4) {
+    /* Transpose and swap the micro block on the diagonal. For example, consider 
+     * the 16x16 matrix consisting of the following 16 micro blocks. The following
+     * code transposes the micro block along the diagonal, as marked by the "X".
+     *
+     *   +---+---+---+---+
+     *   | X |   |   |   |
+     *   +---+---+---+---+
+     *   |   | X |   |   |
+     *   +---+---+---+---+
+     *   |   |   | X |   |
+     *   +---+---+---+---+
+     *   |   |   |   | X |
+     *   +---+---+---+---+
+     */
+    r00 = m[0*32+0];
+    r01 = m[0*32+1];
+    r10 = m[1*32+0];
+    r11 = m[1*32+1];
+    r20 = m[2*32+0];
+    r21 = m[2*32+1];
+    r30 = m[3*32+0];
+    r31 = m[3*32+1];
+
+    m[0*32+0] = spu_shuffle(r00, r10, pat_even);
+    m[0*32+1] = spu_shuffle(r20, r30, pat_even);
+    m[1*32+0] = spu_shuffle(r00, r10, pat_odd);
+    m[1*32+1] = spu_shuffle(r20, r30, pat_odd);
+    m[2*32+0] = spu_shuffle(r01, r11, pat_even);
+    m[2*32+1] = spu_shuffle(r21, r31, pat_even);
+    m[3*32+0] = spu_shuffle(r01, r11, pat_odd);
+    m[3*32+1] = spu_shuffle(r21, r31, pat_odd);
+    
+    row = m + 2;
+    col = m + 4*32;
+
+    for (j=i+4; j<64; j+=4) {
+      /* Transpose and swap the micro blocks across the diagonal. For example, consider
+       * a 16x16 matrix consisting of the following 16 micro blocks. For each row
+       * of micro blocks, the blocks to the right of the diagonal are transposed
+       * and swapped with the column blocks below the diagonal. In our example, in the
+       * first row, block 'A' is transposed and swapped with column block 'a'. Likewise
+       * for 'B' and 'b', and for 'C' and 'c'.
+       *
+       *   +---+---+---+---+
+       *   |   | A | B | C |
+       *   +---+---+---+---+
+       *   | a |   | D | E |
+       *   +---+---+---+---+
+       *   | b | d |   | F |
+       *   +---+---+---+---+
+       *   | c | e | f |   |
+       *   +---+---+---+---+
+       */
+      r00 = row[0*32+0];
+      r01 = row[0*32+1];
+      r10 = row[1*32+0];
+      r11 = row[1*32+1];
+      r20 = row[2*32+0];
+      r21 = row[2*32+1];
+      r30 = row[3*32+0];
+      r31 = row[3*32+1];
+
+      c00 = col[0*32+0];
+      c01 = col[0*32+1];
+      c10 = col[1*32+0];
+      c11 = col[1*32+1];
+      c20 = col[2*32+0];
+      c21 = col[2*32+1];
+      c30 = col[3*32+0];
+      c31 = col[3*32+1];
+
+      row[0*32+0] = spu_shuffle(c00, c10, pat_even);
+      row[0*32+1] = spu_shuffle(c20, c30, pat_even);
+      row[1*32+0] = spu_shuffle(c00, c10, pat_odd);
+      row[1*32+1] = spu_shuffle(c20, c30, pat_odd);
+
+      col[0*32+0] = spu_shuffle(r00, r10, pat_even);
+      col[0*32+1] = spu_shuffle(r20, r30, pat_even);
+      col[1*32+0] = spu_shuffle(r00, r10, pat_odd);
+      col[1*32+1] = spu_shuffle(r20, r30, pat_odd);
+
+      row[2*32+0] = spu_shuffle(c01, c11, pat_even);
+      row[2*32+1] = spu_shuffle(c21, c31, pat_even);
+      row[3*32+0] = spu_shuffle(c01, c11, pat_odd);
+      row[3*32+1] = spu_shuffle(c21, c31, pat_odd);
+
+      col[2*32+0] = spu_shuffle(r01, r11, pat_even);
+      col[2*32+1] = spu_shuffle(r21, r31, pat_even);
+      col[3*32+0] = spu_shuffle(r01, r11, pat_odd);
+      col[3*32+1] = spu_shuffle(r21, r31, pat_odd);
+
+      row += 2;		/* Advance pointer to next row micro block */
+      col += 4*32;	/* Advance pointer to next column micro block */
+    }
+
+    m += 4*32+2;
+  }
+}
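+
+/* Scalar reference sketch (illustration only, not used by the accelerated
+ * path) of the routine above: transpose the 64x64 double matrix in place
+ * and reverse the byte order of every element. The SIMD version performs
+ * the same work on 4x4 micro blocks with shuffle patterns.
+ */
+static double bswap_double_ref(double d)
+{
+  unsigned char *p = (unsigned char *)&d;
+  unsigned char t;
+  int b;
+  for (b = 0; b < 4; b++) {
+    t = p[b];  p[b] = p[7 - b];  p[7 - b] = t;    /* reverse the 8 bytes */
+  }
+  return d;
+}
+
+static void transpose_and_swap_ref(double m[64][64])
+{
+  int i, j;
+  double t;
+  for (i = 0; i < 64; i++) {
+    for (j = i; j < 64; j++) {
+      t       = bswap_double_ref(m[i][j]);
+      m[i][j] = bswap_double_ref(m[j][i]);        /* exchange across diagonal */
+      m[j][i] = t;
+    }
+  }
+}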
+
+void accel_dgemm_panel(hpl_accel_init_parms_t *parms, 
+		       volatile hpl_accel_dgemm_parms_t *cmd_parms)
+{
+  int i;
+  int rotate;
+  unsigned int id;
+  unsigned int idx, a_idx, c_idx;
+  unsigned int i1, phase;
+  unsigned long long a, b, c, p;	/* ea pointers */
+  unsigned int hi, lo;
+  unsigned int a_hi, a_lo;
+  unsigned int b_hi, b_lo;
+  unsigned int c_hi, c_lo;
+  unsigned int p_hi, p_lo;
+  unsigned int sub_blocks, sub_blocks_per_spe;
+  unsigned int start_x, start_sub, end_sub;
+  unsigned int odd, buf;
+  unsigned int x_sub, y_sub;
+  unsigned int w_sub, h_sub;		/* width & height in sub_blocks */
+  unsigned int lda, ldb, ldp, stride;
+  unsigned int a_addend, b_addend, c_addend, p_addend;
+  vec_uint4 vone = (vec_uint4){1, 1, 1, 1};
+  vec_uint4 ld, flags, b_blk;
+  vec_uint4 a_step, b_step, c_stepv, c_steph, p_stepv, p_steph;
+  vec_uint4 dim;
+  vec_uint4 h_sub_v, h_sub2_v, y_sub2_v;
+  vec_uint4 down, corner;
+  vec_uint4 corner_eq2;
+  vec_uint4 step_sub = spu_splats(M_SUB*sizeof(double));
+  vec_uint4 mask_0101 = (vec_uint4){0,-1,0,-1};
+  vec_uint4 list[2][M_SUB/2];
+  vec_uchar16 splat_1  = (vec_uchar16)spu_splats((unsigned int)0x04050607);
+  vec_uchar16 splat_2  = (vec_uchar16)spu_splats((unsigned int)0x08090A0B);
+  vec_uchar16 shuf_0404 = (vec_uchar16){0,1,2,3, 16,17,18,19, 0,1,2,3, 16,17,18,19};
+  vec_uchar16 shuf_0044 = (vec_uchar16){0,1,2,3, 0,1,2,3, 16,17,18,19, 16,17,18,19};
+  vec_double2 *c_ptr;
+
+  id = parms->id;
+
+  /* Wait for the transfer of the parameters to complete
+   */
+  DMA_WAIT_RECEIVE();
+
+  /* Fetch the command parameters
+   */
+  a = cmd_parms->a;
+  b = cmd_parms->b;
+  c = cmd_parms->c;
+  p = cmd_parms->p;
+
+  a_hi = mfc_ea2h(a);
+  a_lo = mfc_ea2l(a);
+
+  b_hi = mfc_ea2h(b);
+  b_lo = mfc_ea2l(b);
+
+  c_hi = mfc_ea2h(c);
+  c_lo = mfc_ea2l(c);
+
+  p_hi = mfc_ea2h(p);
+  p_lo = mfc_ea2l(p);
+
+  ld = cmd_parms->ld;
+
+  lda = spu_extract(ld, 0);
+  ldb = spu_extract(ld, 1); 
+  ldp = spu_extract(ld, 3);
+
+  dim = cmd_parms->dim;
+
+  flags = cmd_parms->flags;
+
+  b_blk = spu_maskw(spu_extract(flags, 0));
+
+  /* Computation of [C] -= [A][B] is performed in a serpentine pattern
+   * through the various sub-blocks of C. Below is a graphical attempt
+   * to explain the partitioning and order of the computation. For this
+   * example, consider the matrix-matrix multiply of a 5x5 (128x128 block)
+   * result after panel factorization of block 0,0 (bx,by). In this case,
+   * we must compute the 128x128 block multiplies as follows:
+   *
+   *   for (x=1; x<5; x++) {
+   *     for (y=1; y<5; y++) {
+   *        C(x,y) -= A(bx,y)*B(x,by);
+   *     }
+   *   }
+   * 
+   * Assuming this computation is performed by 3 SPEs, the 16 blocks
+   * are subdivided as:
+   *
+   *   SPE 0 : C(1,1), C(1,2), C(1,3), C(1,4), C(2,1), C(2,2)
+   *   SPE 1 : C(2,3), C(2,4), C(3,1), C(3,2), C(3,3), C(3,4)
+   *   SPE 2 : C(4,1), C(4,2), C(4,3), C(4,4)
+   *
+   * Therefore, SPE 1 will compute the resulting sub-blocks of C in the
+   * alphabetic order ('a' through 'x') as marked below.
+   *                
+   *               X
+   *       0   1   2   3   4
+   *     +---B---+---+---+---+
+   *   0 |   |    U row      |  
+   *     |   |               |  
+   *     A---C---+---+---+---+
+   *   1 |   |   |   |i x|   |  
+   *     |   |   |   |j w|   |  
+   *     + L +---+---+---+---+
+   * Y 2 |   |   |   |k v|   |  
+   *     | p |   |   |l u|   |                                      
+   *     + a +---+---+---+---+
+   *   3 | n |   |a h|m t|   |  
+   *     | e |   |b g|n s|   |  
+   *     + l +---+---+---+---+
+   *   4 |   |   |c f|o r|   |  
+   *     |   |   |d e|p q|   |  
+   *     +---+---+---+---+---+
+   *
+   * Using 128x128 block partitioning amongst the SPEs results in non-optimal
+   * load balancing of the SPEs. This is shown by the above example in which
+   * SPEs 0 and 1 compute 24 64x64 multiplies, while SPE 2 only computes
+   * 16 64x64 multiplies. In addition, the corner turn between sub-blocks
+   * 'h' and 'i' will incur extra DMAs. 
+   *
+   * A more computation- and transfer-efficient load balance would be
+   * to allocate computation on the 64 sub-blocks. This would allocate
+   * 22, 22, and 20 sub-block multiplies to the three SPEs, and the corner
+   * turn becomes efficient. The sub-block computation order (alphabetically
+   * marked) for SPE 1 becomes:
+   *
+   *               X
+   *       0   1   2   3   4
+   *     +---B---+---+---+---+
+   *   0 |   |    U row      |  
+   *     |   |               |  
+   *     A---C---+---+---+---+
+   *   1 |   |   |  j|k  |   |  
+   *     |   |   |  i|l  |   |  
+   *     + L +---+---+---+---+
+   * Y 2 |   |   |  h|m  |   |  
+   *     | p |   |  g|n  |   |  
+   *     + a +---+---+---+---+
+   *   3 | n |   |  f|o v|   |
+   *     | e |   |  e|p u|   |  
+   *     + l +---+---+---+---+
+   *   4 |   |   |a d|q t|   |  
+   *     |   |   |b c|r s|   |  
+   *     +---+---+---+---+---+
+   *
+   * This more efficient method is employed in the following code.
+   */
+	
+  w_sub = spu_extract(dim, 0);
+  h_sub_v = spu_shuffle(dim, dim, splat_1);
+  h_sub = spu_extract(h_sub_v, 0);
+
+  h_sub2_v = spu_sl(h_sub_v, 1);
+
+  sub_blocks = w_sub * h_sub;
+  sub_blocks_per_spe = (sub_blocks + HPL_ACCEL_SPES-1) / HPL_ACCEL_SPES;
+  
+  start_sub = ((unsigned short)id) * sub_blocks_per_spe;
+  end_sub = start_sub + sub_blocks_per_spe;
+  if (end_sub > sub_blocks) end_sub = sub_blocks;
+      
+  sub_blocks = end_sub - start_sub;
+
+  if (LIKELY((int)sub_blocks > 0)) {
+    /* This SPE has some work to do
+     */
+    DMA_WAIT_REQUEST(-1);
+
+    /* Compute vectors for stepping the effective address matrix pointers.
+     * The pictograms below show 64x64 blocks within the 128x128 blocks.
+     *
+     *   A (L panel)         B (U panel)                C matrix
+     *   ++===+===++     ++---+---++---+---++      ++===+===++===+===++
+     *   || 1 | 2 ||     || 1 | 4 || 5 |   ||      || 1 |   ||   |   ||
+     *   ++---+---++     ++---+---++---+---++      ++---+---++---+---++
+     *   || 3 |   ||     || 2 | 3 ||   |   ||      || 2 |   ||   |   ||
+     *   ++===+===++     ++---+---++---+---++      ++===+===++===+===++
+     *   ||   |   ||                               ||   |   ||   |   || 
+     *   ++---+---++                               ++---+---++---+---++     
+     *   ||   |   ||                               || 3 | 4 ||   |   || 
+     *   ++===+===++                               ++===+===++===+===++
+     *          
+     *
+     *   P (output matrix)
+     *   ++===+===++===+===++
+     *   || 1 |   ||   |   ||
+     *   ++---+---++---+---++
+     *   || 2 |   ||   |   ||
+     *   ++===+===++===+===++
+     *   ||   |   ||   |   ||
+     *   ++---+---++---+---++
+     *   || 3 | 4 ||   |   ||
+     *   ++===+===++===+===++
+     * a_step = {1 to 2, 2 to 3, 1 to 2, 2 to 3}
+     * b_step = {1 to 2, 2 to 3, 3 to 4, 4 to 5}
+     * c_stepv= {1 to 2, 1 to 2, 1 to 2, 1 to 2}
+     * c_steph= {3 to 4, 3 to 4, 3 to 4, 3 to 4}
+     * p_stepv= {1 to 2, 1 to 2, 1 to 2, 1 to 2}
+     * p_steph= {3 to 4, 3 to 4, 3 to 4, 3 to 4}
+     */
+
+    a_step = spu_promote(lda * M_SUB, 0);
+    a_step = spu_shuffle(a_step, spu_sub(step_sub, a_step), shuf_0404);
+
+    c_stepv = spu_splats(M_SUB*M_SUB*sizeof(double));
+    c_steph = spu_shuffle(ld, ld, splat_2);
+
+    p_stepv = step_sub;
+    p_steph = spu_promote(ldp * M_SUB, 0);
+
+    b_step = spu_sel(spu_promote(ldb * M_SUB, 0), c_stepv, b_blk);
+    b_step = spu_shuffle(b_step, spu_sub(0, b_step), shuf_0044);
+    b_step = spu_sel(b_step, spu_sel(step_sub, spu_shuffle(ld, ld, splat_1), b_blk), mask_0101);
+
+    ldb    = spu_extract(spu_sel(spu_promote(ldb, 0), step_sub, b_blk), 0);
+
+    /* Determine the following:
+     * 1) Starting sub-block - x_sub, y_sub
+     * 2) Number of sub-block multiplies before a corner turn - corner. 
+     */
+    x_sub = start_sub / h_sub;
+    y_sub = start_sub - h_sub * x_sub;
+    
+    start_x = x_sub / SUB;
+    y_sub = start_sub - h_sub*SUB*start_x;
+	
+    /* rotate = 4;
+     * 
+     * if (x_sub & 1) {
+     *   y_sub = h_sub - 1 - y_sub;
+     *   a_step = spu_sub(0, a_step);
+     *   c_stepv = spu_sub(0, c_stepv);
+     *   p_stepv = spu_sub(0, p_stepv);
+     *   rotate = -rotate;
+     *   corner = 2*y_sub + 2
+     * } else {
+     *   corner = 2 * (h_sub-y_sub)
+     * }
+     */
+    odd = x_sub & 1;
+    
+    down = spu_cmpeq(spu_splats(odd), 0);
+	
+    y_sub = spu_extract(spu_sel(spu_sub(h_sub2_v, spu_promote(y_sub + 1, 0)),
+				spu_promote(y_sub, 0),				
+				down), 0);
+	
+    y_sub2_v = spu_splats(2*y_sub);
+	
+    corner = spu_sel(spu_add(y_sub2_v, 2), spu_sub(h_sub2_v, y_sub2_v), down);
+
+    /* Compute the initial EA buffer pointers.
+     */
+    a_addend = y_sub * spu_extract(step_sub, 0) + spu_extract(spu_andc(a_step, down), 0);
+    b_addend = spu_extract(spu_andc(b_step, down), 0);
+    c_addend = y_sub * spu_extract(c_stepv, 0);
+    p_addend = y_sub * spu_extract(p_stepv, 0) + x_sub * spu_extract(p_steph, 0);
+
+    a_lo += a_addend;
+    p_lo += p_addend;
+    MATRIX_EA_UADD32(b_hi, b_lo, b_addend);
+    MATRIX_EA_UMADD32(b_hi, b_lo, x_sub, spu_extract(b_step, 1));
+    MATRIX_EA_UADD32(c_hi, c_lo, c_addend);
+    MATRIX_EA_UMADD32(c_hi, c_lo, x_sub, spu_extract(c_steph, 0));
+
+    /* Adjust the pointer steps according to the initial direction.
+     */
+    a_step = spu_sel(spu_sub(0, a_step), a_step, down);
+    b_step = spu_rlqwbyte(b_step, 8 & ~spu_extract(down, 0));
+    c_stepv = spu_sel(spu_sub(0, c_stepv), c_stepv, down);
+    p_stepv = spu_sel(spu_sub(0, p_stepv), p_stepv, down);
+    rotate = ((-4) ^ spu_extract(down, 0)) - spu_extract(down, 0);
+	
+    /* Before starting, make sure all previous DMA transfers are completed so
+     * that all the LS buffers are known to be available.
+     */
+    DMA_WAIT_RECEIVE();
+
+    /* Download 3 blocks to get the process started. After that, each
+     * 64x64 block multiply requires 2 block transfers.
+     */
+    dma_block_getl(&bufA[0][0], a_hi, a_lo, 0, lda);
+
+    dma_block_getl(&bufB[0][0], b_hi, b_lo, 0, ldb);
+	
+    dma_block(&bufC[0][0], c_hi, c_lo, 0, MFC_GET_CMD);
+	
+    DMA_WAIT_REQUEST(1<<0);
+
+    a_lo += spu_extract(a_step, 0);
+    MATRIX_EA_ADD32(b_hi, b_lo, spu_extract(b_step, 0));
+	
+    dma_block_getl(&bufA[1][0], a_hi, a_lo, 1, lda);
+	
+    dma_block_getl(&bufB[1][0], b_hi, b_lo, 1, ldb);
+	
+    phase = 0;
+	
+    i1 = 0;
+    a_idx = 0;
+	
+    for (i=0; i<(int)sub_blocks-1; i++) {
+      /* First block computation 
+       */
+      DMA_WAIT_RECEIVE();
+      DMA_WAIT_REQUEST((1<<1)|(1<<2));
+
+      mm_dp_64Cx64(&bufC[i1][0], &bufA[a_idx][0], &bufB[0][0]);
+      
+      a_step = spu_rlqwbyte(a_step, rotate);
+      
+      c_idx = i1 ^ 1;
+      
+      corner_eq2 = spu_cmpeq(corner, 2);
+      
+      /* if (corner == 2) {
+       *   rotate = -rotate;
+       *   a_step = 0-a_step;
+       * } else {
+       *   a_lo += a_step;
+       * }
+       */
+      rotate = (rotate ^ spu_extract(corner_eq2, 0)) - spu_extract(corner_eq2, 0);
+      a_lo += spu_extract(spu_andc(a_step, corner_eq2), 0);
+      a_step = spu_sel(a_step, spu_sub(0, a_step), corner_eq2);
+      
+      /* if corner != 2 then fetch next A buffer
+       * else "corner turn" fetch next B buffer
+       */
+      b_step = spu_rlqwbyte(b_step, 4 & spu_extract(corner_eq2, 0));
+      MATRIX_EA_ADD32(b_hi, b_lo, spu_extract(spu_and(b_step, corner_eq2), 0));
+      
+      idx = spu_extract(spu_andc(spu_promote(a_idx, 0), corner_eq2), 0);
+      buf = spu_extract(spu_sel(spu_promote((unsigned int)bufA, 0),
+				spu_promote((unsigned int)bufB, 0),
+				corner_eq2), 0);
+      hi = spu_extract(spu_sel(spu_promote(a_hi, 0),
+			       spu_promote(b_hi, 0),
+			       corner_eq2), 0);
+      lo = spu_extract(spu_sel(spu_promote(a_lo, 0),
+			       spu_promote(b_lo, 0),
+			       corner_eq2), 0);
+      stride = spu_extract(spu_sel(spu_promote(lda, 0),
+				   spu_promote(ldb, 0),
+				   corner_eq2), 0);
+      
+      buf += idx * (unsigned int)(sizeof(bufA)/2);
+
+#ifdef __GNUC__
+      /* The following lnop was added to keep gcc from breaking up the
+       * scheduling of the add,stqd instruction pairs used to build the
+       * DMA list in dma_block_getl.
+       */
+      si_lnop();
+#endif
+
+      dma_block_getl((vec_double2 *)buf, hi, lo, 0, stride);
+      
+      /* if (corner == 2) {
+       *   c_lo += c_steph;
+       *   c_stepv = -c_stepv;
+       * } else {
+       *   c_lo += c_stepv;
+       * }
+       */
+      c_addend = spu_extract(spu_sel(c_stepv, c_steph, corner_eq2), 0);
+      MATRIX_EA_ADD32(c_hi, c_lo, c_addend);
+      c_stepv = spu_sel(c_stepv, spu_sub(0, c_stepv), corner_eq2);
+	
+      /* Before getting another C buffer, we must wait for the previous
+       * one to be stored.
+       */
+      DMA_WAIT_RECEIVE();
+      dma_block(&bufC[c_idx][0], c_hi, c_lo, 0, MFC_GET_CMD);
+      
+      DMA_WAIT_REQUEST(1<<0);
+
+      a_idx = phase^1;
+
+      /* Second block computation 
+       */
+      c_ptr = &bufC[i1][0];
+
+      mm_dp_64Cx64(c_ptr, &bufA[a_idx][0], &bufB[1][0]);
+      
+      a_step = spu_rlqwbyte(a_step, rotate);
+      a_lo += spu_extract(a_step, 0);
+      
+      /* if corner != 2 then fetch next A buffer
+       * else "corner turn" fetch next B buffer
+       */
+      b_step = spu_rlqwbyte(b_step, 4 & spu_extract(corner_eq2, 0));
+      MATRIX_EA_ADD32(b_hi, b_lo, spu_extract(spu_and(b_step, corner_eq2), 0));
+      
+      idx = spu_extract(spu_sel(spu_promote(a_idx, 0), vone, corner_eq2), 0);
+      buf = spu_extract(spu_sel(spu_promote((unsigned int)bufA, 0),
+				spu_promote((unsigned int)bufB, 0),
+				corner_eq2), 0);
+      hi = spu_extract(spu_sel(spu_promote(a_hi, 0),
+			       spu_promote(b_hi, 0),
+			       corner_eq2), 0);
+      lo = spu_extract(spu_sel(spu_promote(a_lo, 0),
+			       spu_promote(b_lo, 0),
+			       corner_eq2), 0);
+      stride = spu_extract(spu_sel(spu_promote(lda, 0),
+				   spu_promote(ldb, 0),
+				   corner_eq2), 0);
+      
+      buf += idx * (unsigned int)(sizeof(bufA)/2);
+      dma_block_getl((vec_double2 *)buf, hi, lo, 1, stride);
+      
+      /* Transpose and swap the resulting block 
+       *
+       * if (corner == 2) {
+       *   p_lo += p_steph;
+       *   p_stepv = -p_stepv;
+       * } else {
+       *   p_lo += p_stepv;
+       * }
+       */
+      transpose_and_swap(&bufC[i1][0]);
+      
+      dma_block_putl(&list[i1][0], c_ptr, p_hi, p_lo, 2, ldp);
+
+      p_lo += spu_extract(spu_sel(p_stepv, p_steph, corner_eq2), 0);
+      p_stepv = spu_sel(p_stepv, spu_sub(0, p_stepv), corner_eq2);
+      
+      corner = spu_sel(spu_add(corner, -2), h_sub2_v, corner_eq2);
+      phase ^= spu_extract(corner_eq2, 0) & 1;
+      
+      i1 ^= 1;
+      a_idx = phase;
+    }
+    
+    /* Finish the last sub-block */
+    DMA_WAIT_RECEIVE();
+    DMA_WAIT_REQUEST((1<<1)|(1<<2));
+    
+    mm_dp_64Cx64(&bufC[i1][0], &bufA[a_idx][0], &bufB[0][0]);
+    
+    DMA_WAIT_RECEIVE();
+    
+    mm_dp_64Cx64(&bufC[i1][0], &bufA[a_idx^1][0], &bufB[1][0]);
+    
+    /* Transpose and swap the resulting block 
+     */
+    transpose_and_swap(&bufC[i1][0]);
+    dma_block_putl(&list[i1][0], &bufC[i1][0], p_hi, p_lo, 1, ldp);
+  }
+
+  /* Report completion status if requested. 
+   */
+  report_completion(id, cmd_parms->incomplete, 1);
+}
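+
+/* Simplified sketch (illustration only) of the serpentine enumeration
+ * described in the comment block above: sub-block s maps to column
+ * x = s / h_sub; even columns are walked downward and odd columns upward,
+ * so consecutive sub-blocks always share an edge and the A/C pointers can
+ * be stepped incrementally. The production code additionally handles the
+ * 64x64 halves of each 128x128 block at the corner turns.
+ */
+static void serpentine_xy(unsigned int s, unsigned int h_sub,
+                          unsigned int *x, unsigned int *y)
+{
+  *x = s / h_sub;
+  *y = s - h_sub * (*x);
+  if (*x & 1)                  /* odd column: walk back up */
+    *y = h_sub - 1 - *y;
+}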
Index: accel/lib/spu/accel_dtrsm.c
===================================================================
RCS file: accel/lib/spu/accel_dtrsm.c
diff -N accel/lib/spu/accel_dtrsm.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/spu/accel_dtrsm.c	20 Aug 2008 03:57:53 -0000	1.4
@@ -0,0 +1,154 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <spu_mfcio.h>
+#include <spu_intrinsics.h>
+#include "hpl_accel_spu.h"
+#include "accel_buffers.h"
+#include "accel_utils.h"
+#include "accel_dtrsm.h"
+
+
+void accel_dtrsm(hpl_accel_init_parms_t *parms, 
+		 volatile hpl_accel_dtrsm_parms_t *cmd_parms)
+{
+  int i;
+  unsigned int idx, tag, next_tag;
+  unsigned int size, lda, stride;
+  unsigned int id;
+  unsigned long long a, b;
+  unsigned int a_hi, a_lo;
+  unsigned int b_hi, b_lo;
+  unsigned int list;
+  unsigned int n;
+  vec_uint4 ld;
+  vec_uint4 element, stride2, stride4, stride6;
+  volatile void *lsa;
+  
+  id = parms->id;
+
+  /* Wait for the transfer of the parameters to complete
+   */
+  DMA_WAIT_RECEIVE();
+  DMA_WAIT_REQUEST(-1);
+
+  /* Fetch the parameters 
+   */
+  a = cmd_parms->a;
+  b = cmd_parms->b;
+  ld = cmd_parms->ld;
+
+  lda = spu_extract(ld, 0);
+
+  /* DMA the entire 128x128 unit lower triangle into the LS. To reduce startup
+   * time, we download only the necessary data columns, in groups of 16,
+   * while preserving the cacheline alignment. The download proceeds from the
+   * first (lowest-numbered) column to the last.
+   */
+  a_hi = mfc_ea2h(a);
+  a_lo = mfc_ea2l(a);
+
+  lsa = (volatile void *)(&bufA_128x128[0]);
+  size = 128*sizeof(double);
+
+  /* Before starting, make sure all previous DMA transfers are completed so
+   * that all the LS buffers are known to be available.
+   */
+  DMA_WAIT_RECEIVE();
+
+  for (i=0; i<127; i++) {
+    unsigned int adjust;
+
+    spu_mfcdma64(lsa, a_hi, a_lo, size, 0, MFC_GET_CMD);
+
+    a_lo += lda;
+    lsa  += 128*sizeof(double);
+
+    /* Compute the next DMA parameters 
+     */
+    adjust = spu_extract(spu_and(spu_cmpeq(spu_promote((i & 15), 0), 14), 16*sizeof(double)), 0);
+
+    a_lo += adjust;
+    lsa  += adjust;
+    size -= adjust;
+  }
+
+  n = spu_extract(cmd_parms->dim, 0) / 16;
+  b_hi = mfc_ea2h(b);
+  b_lo = mfc_ea2l(b);
+  
+  b_lo += 16 * sizeof(double) * id;
+
+  /* Download the initial set of 16 B columns
+   */
+  stride = spu_extract(ld, 1);
+  
+  element = spu_add(spu_shuffle(spu_splats((unsigned int)(16*sizeof(double))),
+				spu_promote(b_lo, 0), 
+				((vec_uchar16){0,1,2,3, 16,17,18,19, 0,1,2,3, 16,17,18,19})),
+		    spu_rlmaskqwbyte(spu_promote(stride, 0), -12));
+
+  stride2 = spu_sl(spu_shuffle(ld, ld, ((vec_uchar16){128,128,128,128, 4,5,6,7, 128,128,128,128, 4,5,6,7})), 1);
+  stride4 = spu_add(stride2, stride2);
+  stride6 = spu_add(stride2, stride4);
+
+  fill_dma_list(&bufB_list[0][0], element, stride2, stride4, stride6);
+  spu_mfcdma64(&bufB_128x16[0][0], b_hi, (unsigned int)(&bufB_list[0][0]), 128*8, 0, MFC_GETL_CMD);
+
+
+  idx = 1;
+  next_tag = 0;
+  tag = 1;
+
+  DMA_WAIT_REQUEST(1<<0);
+
+  for (i=id+HPL_ACCEL_SPES; i<(int)n; i+=HPL_ACCEL_SPES) {
+    /* Fetch the next buffer
+     */
+    element = spu_add(element, ((vec_uint4){0, HPL_ACCEL_SPES*16*sizeof(double), 0, HPL_ACCEL_SPES*16*sizeof(double)}));
+
+    fill_dma_list(&bufB_list[idx][0], element, stride2, stride4, stride6);
+    spu_mfcdma64(&bufB_128x16[tag][0], b_hi, (unsigned int)(&bufB_list[idx][0]), 128*8, tag, MFC_GETLB_CMD);
+    tag ^= 1;
+
+    /* Wait for the previous get to complete */
+    DMA_WAIT_RECEIVE();
+
+    /* Perform the dtrsm.
+     */
+    dtrsm_dp_128Cx16(&bufA[0][0], &bufB_128x16[tag][0]);
+
+    idx = (idx + 1) & 3;
+
+    list = (unsigned int)&bufB_list[idx^2][0];
+    spu_mfcdma64(&bufB_128x16[tag][0], b_hi, list, 128*8, tag, MFC_PUTL_CMD);
+
+    next_tag = tag ^ 1;
+
+    DMA_WAIT_REQUEST(1<<next_tag);
+  }
+
+  /* Wait for the previous GET to complete */
+  DMA_WAIT_RECEIVE();
+
+  /* Perform the dtrsm.
+   */
+  dtrsm_dp_128Cx16(&bufA[0][0], &bufB_128x16[next_tag][0]);
+
+  idx = (idx + 1) & 3;
+
+  /* Store the results back to system memory, either into b or c 
+   */
+  list = (unsigned int)&bufB_list[idx^2][0];
+
+  spu_mfcdma64(&bufB_128x16[next_tag][0], b_hi, list, 128*8, tag, MFC_PUTL_CMD);
+
+  /* Report completion status if requested. 
+   */
+  report_completion(id, cmd_parms->incomplete, tag);
+}
+
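+/* Illustrative sketch (hypothetical helper, not called above) of the
+ * trapezoidal packing produced by the triangle download loop: columns
+ * 0-14 are fetched whole, and each later group of 16 columns omits 16
+ * more leading rows. For a unit lower triangle, column j only needs rows
+ * j+1..127, so the omission is always safe, and skipping in multiples of
+ * 16 doubles keeps every transfer 128-byte cacheline aligned.
+ */
+static inline unsigned int triangle_col_skip(unsigned int col)
+{
+  /* Rows skipped at the top of column `col`: 0 for columns 0-14,
+   * 16 for columns 15-30, 32 for columns 31-46, and so on.
+   */
+  return ((col + 1) / 16) * 16;
+}
+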
Index: accel/lib/spu/accel_dtrsm.h
===================================================================
RCS file: accel/lib/spu/accel_dtrsm.h
diff -N accel/lib/spu/accel_dtrsm.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/spu/accel_dtrsm.h	20 Aug 2008 03:57:53 -0000	1.2
@@ -0,0 +1,83 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2008                               */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#ifndef _ACCEL_DTRSM_H_
+#define _ACCEL_DTRSM_H_		1
+
+
+extern  void dtrsm_dp_128Cx16(vec_double2 *bufA, vec_double2 *bufB);
+
+static inline void fill_dma_list(volatile vec_uint4 *list, vec_uint4 e0, vec_uint4 stride2, vec_uint4 stride4, vec_uint4 stride6)
+{
+  vec_uint4 e1, e2;
+
+  e1 = spu_add(e0, stride2);
+  e2 = spu_add(e0, stride4); 	list[0]  = e0;
+  e0 = spu_add(e0, stride6); 	list[1]  = e1;
+  e1 = spu_add(e1, stride6); 	list[2]  = e2;
+  e2 = spu_add(e2, stride6); 	list[3]  = e0;
+  e0 = spu_add(e0, stride6); 	list[4]  = e1;
+  e1 = spu_add(e1, stride6); 	list[5]  = e2;
+  e2 = spu_add(e2, stride6); 	list[6]  = e0;
+  e0 = spu_add(e0, stride6); 	list[7]  = e1;
+  e1 = spu_add(e1, stride6); 	list[8]  = e2;
+  e2 = spu_add(e2, stride6); 	list[9]  = e0;
+  e0 = spu_add(e0, stride6); 	list[10] = e1;
+  e1 = spu_add(e1, stride6); 	list[11] = e2;
+  e2 = spu_add(e2, stride6); 	list[12] = e0;
+  e0 = spu_add(e0, stride6); 	list[13] = e1;
+  e1 = spu_add(e1, stride6); 	list[14] = e2;
+  e2 = spu_add(e2, stride6); 	list[15] = e0;
+  e0 = spu_add(e0, stride6); 	list[16] = e1;
+  e1 = spu_add(e1, stride6); 	list[17] = e2;
+  e2 = spu_add(e2, stride6); 	list[18] = e0;
+  e0 = spu_add(e0, stride6); 	list[19] = e1;
+  e1 = spu_add(e1, stride6); 	list[20] = e2;
+  e2 = spu_add(e2, stride6); 	list[21] = e0;
+  e0 = spu_add(e0, stride6); 	list[22] = e1;
+  e1 = spu_add(e1, stride6); 	list[23] = e2;
+  e2 = spu_add(e2, stride6); 	list[24] = e0;
+  e0 = spu_add(e0, stride6); 	list[25] = e1;
+  e1 = spu_add(e1, stride6); 	list[26] = e2;
+  e2 = spu_add(e2, stride6); 	list[27] = e0;
+  e0 = spu_add(e0, stride6); 	list[28] = e1;
+  e1 = spu_add(e1, stride6); 	list[29] = e2;
+  e2 = spu_add(e2, stride6); 	list[30] = e0;
+  e0 = spu_add(e0, stride6); 	list[31] = e1;
+  e1 = spu_add(e1, stride6); 	list[32] = e2;
+  e2 = spu_add(e2, stride6); 	list[33] = e0;
+  e0 = spu_add(e0, stride6); 	list[34] = e1;
+  e1 = spu_add(e1, stride6); 	list[35] = e2;
+  e2 = spu_add(e2, stride6); 	list[36] = e0;
+  e0 = spu_add(e0, stride6); 	list[37] = e1;
+  e1 = spu_add(e1, stride6); 	list[38] = e2;
+  e2 = spu_add(e2, stride6); 	list[39] = e0;
+  e0 = spu_add(e0, stride6); 	list[40] = e1;
+  e1 = spu_add(e1, stride6); 	list[41] = e2;
+  e2 = spu_add(e2, stride6); 	list[42] = e0;
+  e0 = spu_add(e0, stride6); 	list[43] = e1;
+  e1 = spu_add(e1, stride6); 	list[44] = e2;
+  e2 = spu_add(e2, stride6); 	list[45] = e0;
+  e0 = spu_add(e0, stride6); 	list[46] = e1;
+  e1 = spu_add(e1, stride6); 	list[47] = e2;
+  e2 = spu_add(e2, stride6); 	list[48] = e0;
+  e0 = spu_add(e0, stride6); 	list[49] = e1;
+  e1 = spu_add(e1, stride6); 	list[50] = e2;
+  e2 = spu_add(e2, stride6); 	list[51] = e0;
+  e0 = spu_add(e0, stride6); 	list[52] = e1;
+  e1 = spu_add(e1, stride6); 	list[53] = e2;
+  e2 = spu_add(e2, stride6); 	list[54] = e0;
+  e0 = spu_add(e0, stride6); 	list[55] = e1;
+  e1 = spu_add(e1, stride6); 	list[56] = e2;
+  e2 = spu_add(e2, stride6); 	list[57] = e0;
+  e0 = spu_add(e0, stride6); 	list[58] = e1;
+  e1 = spu_add(e1, stride6); 	list[59] = e2;
+  e2 = spu_add(e2, stride6); 	list[60] = e0;
+  e0 = spu_add(e0, stride6); 	list[61] = e1;
+ 				list[62] = e2;
+  			 	list[63] = e0;
+}
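+
+/* Scalar sketch (illustration only, assumed semantics) of the list the
+ * unrolled routine above constructs: each vec_uint4 packs two 8-byte MFC
+ * list elements, and successive elements advance the effective address by
+ * a fixed stride. In accel_dtrsm each element transfers one 16-double row
+ * of the 128x16 B panel.
+ */
+typedef struct {
+  unsigned int size;   /* transfer size in bytes */
+  unsigned int eal;    /* low 32 bits of the effective address */
+} mfc_list_elem_ref_t;
+
+static inline void fill_dma_list_ref(mfc_list_elem_ref_t list[128],
+                                     unsigned int eal, unsigned int stride)
+{
+  int k;
+  for (k = 0; k < 128; k++) {
+    list[k].size = 16 * sizeof(double);   /* 128 bytes per element */
+    list[k].eal  = eal + k * stride;      /* next row of the B panel */
+  }
+}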
+
+#endif /* _ACCEL_DTRSM_H_ */
Index: accel/lib/spu/accel_dtrsm_CL_B.c
===================================================================
RCS file: accel/lib/spu/accel_dtrsm_CL_B.c
diff -N accel/lib/spu/accel_dtrsm_CL_B.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/spu/accel_dtrsm_CL_B.c	22 Oct 2008 03:28:08 -0000	1.4
@@ -0,0 +1,249 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <spu_mfcio.h>
+#include <spu_intrinsics.h>
+#include "hpl_accel_spu.h"
+#include "accel_buffers.h"
+#include "accel_utils.h"
+#include "accel_dtrsm.h"
+
+
+void accel_dtrsm_CL_B(hpl_accel_init_parms_t *parms, 
+		      volatile hpl_accel_dtrsm_parms_t *cmd_parms)
+{
+  int i;
+  unsigned int idx, tag, next_tag;
+  unsigned int size, lda, stride;
+  unsigned int id;
+  unsigned long long a, b;
+  unsigned int a_hi, a_lo;
+  unsigned int b_hi, b_lo;
+  unsigned int list;
+  unsigned int n;
+  unsigned int span;
+  vec_uint4 ld;
+  vec_uint4 element, stride2, stride4, stride6, next;
+  volatile void *lsa;
+#ifdef MATRIX_4GB_CROSSING
+  unsigned int list_size, hi;
+  vec_uint4 sizes[4];
+  vec_uint4 b_his[4];
+#endif
+#if (HPL_ACCEL_SPES & 3) != 0
+  unsigned int stride0, stride1;
+  vec_uint4 blk_idx, next0, next1;
+#endif
+  
+  id = parms->id;
+
+  stride2 = ((vec_uint4){0, 2*64*sizeof(double), 0, 2*64*sizeof(double)});
+  stride4 = ((vec_uint4){0, 4*64*sizeof(double), 0, 4*64*sizeof(double)});
+  stride6 = ((vec_uint4){0, 6*64*sizeof(double), 0, 6*64*sizeof(double)});
+
+  /* Wait for the transfer of the parameters to complete
+   */
+  DMA_WAIT_RECEIVE();
+  DMA_WAIT_REQUEST(-1);
+
+  /* Fetch the parameters 
+   */
+  a = cmd_parms->a;
+  b = cmd_parms->b;
+  ld = cmd_parms->ld;
+
+  lda = spu_extract(ld, 0);
+
+  /* DMA the entire 128x128 unit lower triangle into the LS. To reduce startup
+   * time, we download only the necessary data columns, in groups of 16,
+   * while preserving the cacheline alignment. The download proceeds from the
+   * first (lowest-numbered) column to the last.
+   */
+  a_hi = mfc_ea2h(a);
+  a_lo = mfc_ea2l(a);
+
+  lsa = (volatile void *)(&bufA_128x128[0]);
+  size = 128*sizeof(double);
+
+  /* Before starting, make sure all previous DMA transfers are completed so
+   * that all the LS buffers are known to be available.
+   */
+  DMA_WAIT_RECEIVE();
+
+  for (i=0; i<127; i++) {
+    unsigned int adjust;
+
+    spu_mfcdma64(lsa, a_hi, a_lo, size, 0, MFC_GET_CMD);
+
+    a_lo += lda;
+    lsa  += 128*sizeof(double);
+
+    /* Compute the next DMA parameters 
+     */
+    adjust = spu_extract(spu_and(spu_cmpeq(spu_promote((i & 15), 0), 14), 16*sizeof(double)), 0);
+
+    a_lo += adjust;
+    lsa  += adjust;
+    size -= adjust;
+  }
+
+  n = spu_extract(cmd_parms->dim, 0) / 16;
+  b_hi = mfc_ea2h(b);
+  b_lo = mfc_ea2l(b);
+  
+  /* Download the initial set of 16 B columns
+   */
+  span = spu_extract(cmd_parms->blk_col, 0) + id;
+  stride = spu_extract(ld, 1);
+  b_lo += (span & 3) * 16 * sizeof(double);
+  MATRIX_EA_UMADD32(b_hi, b_lo, (span/4), stride);
+  element = spu_add(spu_shuffle(spu_splats((unsigned int)(16*sizeof(double))), spu_promote(b_lo, 0),
+				((vec_uchar16){0,1,2,3, 16,17,18,19, 0,1,2,3, 16,17,18,19})),
+		     ((vec_uint4){0, 0, 0, 64*sizeof(double)}));
+
+
+#if (HPL_ACCEL_SPES & 3) != 0
+  blk_idx = spu_splats(span & 3);
+
+  stride0  = stride * (HPL_ACCEL_SPES / 4);
+  stride1  = stride * (1 +  HPL_ACCEL_SPES / 4);
+  stride0 += ( HPL_ACCEL_SPES & 3)*16*(int)sizeof(double);
+  stride1 -= (-HPL_ACCEL_SPES & 3)*16*(int)sizeof(double);
+
+  next0 = spu_shuffle(spu_promote(stride0, 0), spu_promote(stride0, 0),
+		      ((vec_uchar16){128,128,128,128, 0,1,2,3, 128,128,128,128, 0,1,2,3}));
+  next1 = spu_shuffle(spu_promote(stride1, 0), spu_promote(stride1, 0),
+		      ((vec_uchar16){128,128,128,128, 0,1,2,3, 128,128,128,128, 0,1,2,3}));
+#else
+  stride *= HPL_ACCEL_SPES / 4;
+  next = spu_shuffle(spu_promote(stride, 0), 
+		     spu_promote(stride, 0), 
+		     ((vec_uchar16){128,128,128,128, 0,1,2,3, 128,128,128,128, 0,1,2,3}));
+#endif
+
+
+  list = (unsigned int)&bufB_list[0][0];
+  fill_dma_list((volatile vec_uint4 *)list, element, stride2, stride4, stride6);
+      
+#if (HPL_ACCEL_SPES & 3) != 0
+  blk_idx = spu_add(blk_idx, (HPL_ACCEL_SPES & 3));
+  next = spu_sel(next0, next1, spu_cmpgt(blk_idx, 3));
+  blk_idx = spu_and(blk_idx, 3);
+#endif
+
+#ifdef MATRIX_4GB_CROSSING
+  /* The list's 4GB crossing can only occur at a block boundary, i.e., halfway
+   * through the list.
+   */
+  list_size = (spu_extract(element, 1) > (0xFFFFFFFF - M_SUB*M_SUB*8)) ? (M_SUB*8) : (M*8);
+
+  spu_mfcdma64(&bufB_128x16[0][0], b_hi, list, list_size, 0, MFC_GETL_CMD);
+  spu_mfcdma64(&bufB_128x16[0][list_size], b_hi+1, list+(M_SUB*8), M*8-list_size, 0, MFC_GETL_CMD);
+
+  sizes[0] = spu_promote(list_size, 0);
+  b_his[0] = spu_promote(b_hi, 0);
+
+  b_hi += spu_extract(spu_genc(element, next), 1);
+#else
+  spu_mfcdma64(&bufB_128x16[0][0], b_hi, list, 128*8, 0, MFC_GETL_CMD);
+#endif
+  element = spu_add(element, next);
+
+  idx = 1;
+  next_tag = 0;
+  tag = 1;
+
+  DMA_WAIT_REQUEST(1<<0);
+
+  for (i=id+HPL_ACCEL_SPES; i<(int)n; i+=HPL_ACCEL_SPES) {
+    /* Fetch the next buffer
+     */
+    list = (unsigned int)&bufB_list[idx][0];
+    fill_dma_list((volatile vec_uint4 *)list, element, stride2, stride4, stride6);
+      
+#if (HPL_ACCEL_SPES & 3) != 0
+    blk_idx = spu_add(blk_idx, (HPL_ACCEL_SPES & 3));
+    next = spu_sel(next0, next1, spu_cmpgt(blk_idx, 3));
+    blk_idx = spu_and(blk_idx, 3);
+#endif
+
+#ifdef MATRIX_4GB_CROSSING
+    /* The list's 4GB crossing can only occur at a block boundary, i.e., halfway
+     * through the list.
+     */
+    list_size = (spu_extract(element, 1) > (0xFFFFFFFF - M_SUB*M_SUB*8)) ? (M_SUB*8) : (M*8);
+
+    spu_mfcdma64(&bufB_128x16[tag][0], b_hi, list, list_size, tag, MFC_GETLB_CMD);
+    spu_mfcdma64(&bufB_128x16[tag][list_size], b_hi+1, list+(M_SUB*8), M*8-list_size, tag, MFC_GETL_CMD);
+
+    sizes[idx] = spu_promote(list_size, 0);
+    b_his[idx] = spu_promote(b_hi, 0);
+
+    b_hi += spu_extract(spu_genc(element, next), 1);
+#else
+    spu_mfcdma64(&bufB_128x16[tag][0], b_hi, list, 128*8, tag, MFC_GETLB_CMD);
+#endif
+    element = spu_add(element, next);
+
+    tag ^= 1;
+
+    /* Wait for the previous get to complete */
+    DMA_WAIT_RECEIVE();
+
+    /* Perform the dtrsm.
+     */
+    dtrsm_dp_128Cx16(&bufA[0][0], &bufB_128x16[tag][0]);
+
+    idx = (idx + 1) & 3;
+
+    /* Store the update matrix columns back to memory
+     */
+    list = (unsigned int)&bufB_list[idx^2][0];
+#ifdef MATRIX_4GB_CROSSING
+    list_size = spu_extract(sizes[idx^2], 0);
+    
+    hi = spu_extract(b_his[idx^2], 0);
+
+    spu_mfcdma64(&bufB_128x16[tag][0], hi, list, list_size, tag, MFC_PUTL_CMD);
+    spu_mfcdma64(&bufB_128x16[tag][list_size], hi+1, list+(M_SUB*8), M*8-list_size, tag, MFC_PUTL_CMD);
+#else
+    spu_mfcdma64(&bufB_128x16[tag][0], b_hi, list, 128*8, tag, MFC_PUTL_CMD);
+#endif
+
+    next_tag = tag ^ 1;
+
+    DMA_WAIT_REQUEST(1<<next_tag);
+  }
+
+  /* Wait for the previous GET to complete */
+  DMA_WAIT_RECEIVE();
+
+  /* Perform the dtrsm.
+   */
+  dtrsm_dp_128Cx16(&bufA[0][0], &bufB_128x16[next_tag][0]);
+
+  idx = (idx + 1) & 3;
+
+  /* Store the final results back to system memory.
+   */
+  list = (unsigned int)&bufB_list[idx^2][0];
+#ifdef MATRIX_4GB_CROSSING
+  list_size = spu_extract(sizes[idx^2], 0);
+    
+  hi = spu_extract(b_his[idx^2], 0);
+
+  spu_mfcdma64(&bufB_128x16[next_tag][0], hi, list, list_size, tag, MFC_PUTL_CMD);
+  spu_mfcdma64(&bufB_128x16[next_tag][list_size], hi+1, list+(M_SUB*8), M*8-list_size, tag, MFC_PUTL_CMD);
+#else
+  spu_mfcdma64(&bufB_128x16[next_tag][0], b_hi, list, 128*8, tag, MFC_PUTL_CMD);
+#endif
+
+  /* Report completion status if requested. 
+   */
+  report_completion(id, cmd_parms->incomplete, tag);
+}
+
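+#ifdef MATRIX_4GB_CROSSING
+/* Minimal sketch (hypothetical helper, not part of the library) of the
+ * 4GB-crossing decision used above, with M and M_SUB taken from the accel
+ * headers: a list DMA whose effective addresses would carry past 2^32 is
+ * truncated at the 64x64 block boundary (halfway through the M*8-byte
+ * list), and the remainder is reissued with the high EA word incremented.
+ */
+static inline unsigned int list_bytes_below_4gb(unsigned int eal)
+{
+  /* Half the list if the low EA is within one 64x64 block
+   * (M_SUB*M_SUB*8 bytes) of wrapping; otherwise the whole list.
+   */
+  return (eal > (0xFFFFFFFF - M_SUB*M_SUB*8)) ? (M_SUB*8) : (M*8);
+}
+#endif
+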
Index: accel/lib/spu/accel_dtrsm_dp_128Cx16.S
===================================================================
RCS file: accel/lib/spu/accel_dtrsm_dp_128Cx16.S
diff -N accel/lib/spu/accel_dtrsm_dp_128Cx16.S
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/spu/accel_dtrsm_dp_128Cx16.S	23 Oct 2008 21:20:24 -0000	1.3
@@ -0,0 +1,2270 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+/*
+ * SYNOPSIS:
+ *	void dtrsm_dp_128Cx16(vec_double2 *bufA, vec_double2 *bufB)
+ *
+ * DESCRIPTION: 
+ *   This file contains a specialized DTRSM function that solves
+ *   the matrix equation for [x].
+ *
+ *      [a]*[x] = [b]
+ *
+ *   where:
+ *      [a] is a unit lower, column-ordered, double-precision, little-endian
+ *          triangular matrix of 128 rows by 128 columns.
+ *	[b] is a row-ordered, double-precision matrix of 128 rows and 16 columns.
+ *   The solution [x] is returned in [b].
+ *
+ *   This implementation is a highly optimized solution that mimics the following
+ *   scalar design that processes 4 rows of b at a time:
+ *	
+ *    for (i=0; i<128; i+=4) {				# iloop 
+ *	for (x=0; x<i; x++) {				# xloop
+ *	  for (j=0; j<16; j++) {
+ *          b[i+0,j] -=  b[x,j] * a[i+0,x];
+ *          b[i+1,j] -=  b[x,j] * a[i+1,x];
+ *          b[i+2,j] -=  b[x,j] * a[i+2,x];
+ *          b[i+3,j] -=  b[x,j] * a[i+3,x];
+ *        }
+ *      }
+ *      for (j=0; j<16; j++) {
+ *        b[i+1,j] -=  b[i+0,j] * a[i+1,i+0];
+ *        b[i+2,j] -=  b[i+0,j] * a[i+2,i+0];
+ *        b[i+3,j] -=  b[i+0,j] * a[i+3,i+0];
+ *        b[i+2,j] -=  b[i+1,j] * a[i+2,i+1];
+ *        b[i+3,j] -=  b[i+1,j] * a[i+3,i+1];
+ *        b[i+3,j] -=  b[i+2,j] * a[i+3,i+2];
+ *      }
+ *    }
+ *
+ *  Numerous code transformations were applied to this code to reduce dependencies and maximize
+ *  dual issue rates. The transformations include:
+ *	
+ *  1) The j loop has been fully unrolled and removed.
+ *  2) The xloop has been pipelined so that the first 2 and last 2 iterations are pulled
+ *     outside the loop.
+ *  3) The i loop has been unrolled by two into phases 1 and 2.
+ */
+
+
+#if 1
+#define DFNMS(_d, _a, _b)       dfnms   _d, _a, _b
+#else
+#define DFNMS(_d, _a, _b)       fnms   _d, _a, _b, _d
+#endif	
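+/* Both forms compute _d -= _a * _b: dfnms is the double-precision fused
+ * multiply-subtract used in production; the single-precision fnms
+ * alternative is presumably kept for timing experiments.
+ */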
+#define LQD(_d, _a, _idx)       lqd     _d, (_idx)*16(_a)
+#define STQD(_d, _a, _idx)      stqd    _d, (_idx)*16(_a)
+#define SHUFB(_d, _a, _b, _c)   shufb   _d, _a, _b, _c
+#if 1
+#define HBRP			hbrp	
+#else
+#define HBRP			lnop
+#endif	
+
+/* Input parameters
+ */
+#define bufA	$3
+#define bufB	$4
+	
+/* Working variables
+ */
+#define base	$2	
+
+#define splat0	$5
+#define splat1	$6	
+
+#define	B00a	$7
+#define	B01a	$8
+#define	B02a	$9
+#define	B03a	$10
+#define	B04a	$11
+#define	B05a	$12
+#define	B06a	$13
+#define	B07a	$14
+#define	B10a	$15
+#define	B11a	$16
+#define	B12a	$17
+#define	B13a	$18
+#define	B14a	$19
+#define	B15a	$20
+#define	B16a	$21
+#define	B17a	$22
+#define	B20a	$23
+#define	B21a	$24
+#define	B22a	$25
+#define	B23a	$26
+#define	B24a	$27
+#define	B25a	$28
+#define	B26a	$29
+#define	B27a	$30
+#define	B30a	$31
+#define	B31a	$32
+#define	B32a	$33
+#define	B33a	$34
+#define	B34a	$35
+#define	B35a	$36
+#define	B36a	$37
+#define	B37a	$38
+#define Ax	$39	// alias
+#define addend	$39	// alias
+#define merge	$39	// alias
+#define Bx	$40
+#define Bi	$41
+#define B0	$42	
+#define B1	$43
+#define B2	$44	
+#define B3	$45	
+#define B4	$46	
+#define B5	$47	
+#define B6	$48	
+#define B7	$49	
+#define Bx0	$50	
+#define Bx1	$51
+#define Bx2	$52	
+#define Bx3	$53	
+#define Bx4	$54	
+#define Bx5	$55	
+#define Bx6	$56	
+#define Bx7	$57	
+#define A0	$58
+#define A1	$59
+#define bufB_bufA_i_Bi	$60
+#define	A2	$61
+#define A3	$62	
+#define	A2_0	$63
+#define	A3_0	$64
+#define	A2_1	$65
+#define	A3_1	$66
+#define	A3_2	$67
+#define i	$68	// alias
+#define x	$68	// alias
+#define target	$69
+	
+#define	B00b	$70
+#define	B01b	$71
+#define	B02b	$72
+#define	B03b	$73
+#define	B04b	$74
+#define	B05b	$75
+#define	B06b	$76
+#define	B07b	$77
+#define	B10b	$78
+#define	B11b	$79
+#define	B12b	$80
+#define	B13b	$81
+#define	B14b	$82
+#define	B15b	$83
+#define	B16b	$84
+#define	B17b	$85
+#define	B20b	$86
+#define	B21b	$87
+#define	B22b	$88
+#define	B23b	$89
+#define	B24b	$90
+#define	B25b	$91
+#define	B26b	$92
+#define	B27b	$93
+#define	B30b	$94
+#define	B31b	$95
+#define	B32b	$96
+#define	B33b	$97
+#define	B34b	$98
+#define	B35b	$99
+#define	B36b	$100
+#define	B37b	$101
+
+
+        .data
+        .align  4
+splat_dw0:
+#ifdef ACCEL_LITTLE_ENDIAN
+        .word   0x07060504, 0x03020100, 0x07060504, 0x03020100
+#else  /* BIG_ENDIAN */
+        .word   0x00010203, 0x04050607, 0x00010203, 0x04050607
+#endif /* LITTLE_ENDIAN */
+splat_04i0:
+	.word	0x00010203, 0x10111213, 0x000000E0, 0x00010203
+const_256_2K_xloop1_0:
+	.word	256, 2048, xloop1_br_targets, 0
+const_256_2K_xloop2_0:
+	.word	256, 2048, xloop2_br_targets, 0
+const_0_32_m8_1K:	
+	.word	0, 32, -8, 1024
+const_0_32_0_0:
+	.word	0, 32, 0, 0
+const_512_4K_4_0:	
+	.word	512, 4096, 4, 0
+	
+	.align	4
+br_target_base:	
+iloop_br_targets:	
+	.word	iloop_done, iloop, iloop, iloop
+	.word	iloop,      iloop, iloop, iloop
+	.word	iloop,      iloop, iloop, iloop
+	.word	iloop,      iloop, iloop, iloop
+	.word	iloop,      iloop, iloop, iloop
+	.word	iloop,      iloop, iloop, iloop
+	.word	iloop,      iloop, iloop, iloop
+	.word	iloop,      iloop, iloop, iloop
+
+xloop1_br_targets:
+	.word	xloop1,	xloop1,	xloop1,	xloop1
+	.word	xloop1,	xloop1,	xloop1,	xloop1
+	.word	xloop1,	xloop1,	xloop1,	xloop1
+	.word	xloop1,	xloop1,	xloop1,	xloop1
+	.word	xloop1,	xloop1,	xloop1,	xloop1
+	.word	xloop1,	xloop1,	xloop1,	xloop1
+	.word	xloop1,	xloop1,	xloop1,	xloop1
+	.word	xloop1,	xloop1,	xloop1_done
+
+xloop2_br_targets:
+	.word	xloop2,	xloop2,	xloop2,	xloop2
+	.word	xloop2,	xloop2,	xloop2,	xloop2
+	.word	xloop2,	xloop2,	xloop2,	xloop2
+	.word	xloop2,	xloop2,	xloop2,	xloop2
+	.word	xloop2,	xloop2,	xloop2,	xloop2
+	.word	xloop2,	xloop2,	xloop2,	xloop2
+	.word	xloop2,	xloop2,	xloop2,	xloop2
+	.word	xloop2,	xloop2,	xloop2,	xloop2_done
+	.word	xloop2_done
+	
+
+	.text
+	.align	6
+	.global dtrsm_dp_128Cx16
+dtrsm_dp_128Cx16:
+	/* Save the non-volatile registers */
+ib0:	
+	HBRP
+	lqr	splat0, splat_dw0
+	
+	LQD(A1, bufA, 0)
+	LQD(B0, bufB, 0)
+	
+	LQD(B1, bufB, 1)
+	LQD(B2, bufB, 2)
+	
+	LQD(B3, bufB, 3)
+	LQD(B4, bufB, 4)
+	
+	orbi	splat1, splat0, 8
+	LQD(B5, bufB, 5)
+	
+	LQD(B6, bufB, 6)
+	LQD(B7, bufB, 7)
+
+	SHUFB(A1, A1, A1, splat1)
+	LQD(B10a, bufB, 8*1 + 0)
+	
+	LQD(B11a, bufB, 8*1 + 1)
+	LQD(B12a, bufB, 8*1 + 2)
+	
+	LQD(B13a, bufB, 8*1 + 3)
+	LQD(B14a, bufB, 8*1 + 4)
+	
+	LQD(B15a, bufB, 8*1 + 5)
+	LQD(B16a, bufB, 8*1 + 6)
+	
+	LQD(B17a, bufB, 8*1 + 7)
+	LQD(A3_0, bufA, 1)
+	
+	LQD(A3_1, bufA, 64+1)
+	LQD(A3_2, bufA, 128+1)
+	
+        stqd    $80, -16*1($SP)
+        stqd    $81, -16*2($SP)
+	
+        stqd    $82, -16*3($SP)
+        stqd    $83, -16*4($SP)
+	
+	ila	base, br_target_base
+	stqd    $84, -16*5($SP)
+	
+        stqd    $85, -16*6($SP)
+        stqd    $87, -16*8($SP)
+
+ib1:	
+	HBRP
+        stqd    $88, -16*9($SP)
+
+	DFNMS(B10a, B0, A1)
+	SHUFB(A2_0, A3_0, A3_0, splat0)
+	
+	DFNMS(B11a, B1, A1)
+	SHUFB(A3_0, A3_0, A3_0, splat1)
+	
+	DFNMS(B12a, B2, A1)
+	LQD(B20a, bufB, 8*2 + 0)
+	
+	DFNMS(B13a, B3, A1)
+	LQD(B21a, bufB, 8*2 + 1)
+	
+	DFNMS(B14a, B4, A1)
+	LQD(B22a, bufB, 8*2 + 2)
+	
+	DFNMS(B15a, B5, A1)
+	LQD(B23a, bufB, 8*2 + 3)
+	
+	DFNMS(B16a, B6, A1)
+	LQD(B24a, bufB, 8*2 + 4)
+
+	DFNMS(B17a, B7, A1)
+	LQD(B25a, bufB, 8*2 + 5)
+	
+	DFNMS(B20a, B0, A2_0)
+	LQD(B26a, bufB, 8*2 + 6)
+	
+	DFNMS(B21a, B1, A2_0)
+	LQD(B27a, bufB, 8*2 + 7)
+	
+	DFNMS(B22a, B2, A2_0)
+	LQD(B30a, bufB, 8*3 + 0)
+	
+	DFNMS(B23a, B3, A2_0)
+	LQD(B31a, bufB, 8*3 + 1)
+	
+	DFNMS(B24a, B4, A2_0)
+	LQD(B32a, bufB, 8*3 + 2)
+	
+	DFNMS(B25a, B5, A2_0)
+	LQD(B33a, bufB, 8*3 + 3)
+	
+	DFNMS(B26a, B6, A2_0)
+	LQD(B34a, bufB, 8*3 + 4)
+	
+ib2:	
+	DFNMS(B27a, B7, A2_0)
+	HBRP
+	
+	DFNMS(B30a, B0, A3_0)
+	LQD(B35a, bufB, 8*3 + 5)
+	
+	DFNMS(B31a, B1, A3_0)
+	LQD(B36a, bufB, 8*3 + 6)
+	
+	DFNMS(B32a, B2, A3_0)
+	LQD(B37a, bufB, 8*3 + 7)
+	
+        stqd    $86, -16*7($SP)
+        stqd    $89, -16*10($SP)
+	
+	DFNMS(B33a, B3, A3_0)
+	SHUFB(A2_1, A3_1, A3_1, splat0)
+	
+	DFNMS(B34a, B4, A3_0)
+	SHUFB(A3_1, A3_1, A3_1, splat1)
+	
+	DFNMS(B35a, B5, A3_0)
+        stqd    $90, -16*11($SP)
+	
+	DFNMS(B36a, B6, A3_0)
+        stqd    $91, -16*12($SP)
+	
+	DFNMS(B37a, B7, A3_0)
+        stqd    $92, -16*13($SP)
+	
+	DFNMS(B20a, B10a, A2_1)
+        stqd    $93, -16*14($SP)
+	
+	DFNMS(B21a, B11a, A2_1)
+        stqd    $94, -16*15($SP)
+	
+	DFNMS(B22a, B12a, A2_1)
+        stqd    $95, -16*16($SP)
+	
+	DFNMS(B23a, B13a, A2_1)
+	stqd    $96, -16*17($SP)
+	
+	DFNMS(B24a, B14a, A2_1)
+        stqd    $97, -16*18($SP)
+	
+	DFNMS(B25a, B15a, A2_1)	
+        stqd    $98, -16*19($SP)
+
+ib3:	
+	DFNMS(B26a, B16a, A2_1)
+	HBRP
+	
+	DFNMS(B27a, B17a, A2_1)
+        stqd    $99, -16*20($SP)
+	
+	DFNMS(B30a, B10a, A3_1)
+        stqd    $100, -16*21($SP)
+	
+	DFNMS(B31a, B11a, A3_1)
+	SHUFB(A3_2, A3_2, A3_2, splat1)
+	
+	DFNMS(B32a, B12a, A3_1)
+	STQD(B20a, bufB, 8*2+0)
+	
+	DFNMS(B33a, B13a, A3_1)
+	STQD(B21a, bufB, 8*2+1)
+	
+	DFNMS(B34a, B14a, A3_1)
+	STQD(B22a, bufB, 8*2+2)
+	
+	DFNMS(B35a, B15a, A3_1)
+	STQD(B23a, bufB, 8*2+3)
+	
+	DFNMS(B36a, B16a, A3_1)
+	STQD(B24a, bufB, 8*2+4)
+	
+	DFNMS(B37a, B17a, A3_1)
+        stqd    $101, -16*22($SP)
+	
+	DFNMS(B30a, B20a, A3_2)
+	STQD(B25a, bufB, 8*2+5)
+
+	DFNMS(B31a, B21a, A3_2)
+	STQD(B26a, bufB, 8*2+6)
+	
+	DFNMS(B32a, B22a, A3_2)
+	STQD(B27a, bufB, 8*2+7)
+	
+	DFNMS(B33a, B23a, A3_2)
+	LQD(B00b, bufB, 8*4+0)
+	
+	DFNMS(B34a, B24a, A3_2)
+	LQD(B01b, bufB, 8*4+1)
+	
+	DFNMS(B35a, B25a, A3_2)
+	LQD(B02b, bufB, 8*4+2)
+	
+ib4:	
+	DFNMS(B36a, B26a, A3_2)
+	HBRP
+
+	ai	bufA, bufA, 2*16
+	LQD(B03b, bufB, 8*4+3)
+	
+	DFNMS(B37a, B27a, A3_2)
+	LQD(B04b, bufB, 8*4+4)
+	
+	lr	Bi, bufB
+	STQD(B10a, bufB, 8*1+0)
+	
+	STQD(B11a, bufB, 8*1+1)
+	LQD(A1, bufA, 0)
+	
+	lqr	merge, splat_04i0
+	STQD(B12a, bufB, 8*1+2)
+	
+	STQD(B13a, bufB, 8*1+3)
+	STQD(B14a, bufB, 8*1+4)
+	
+	STQD(B15a, bufB, 8*1+5)
+	STQD(B16a, bufB, 8*1+6)
+	
+	STQD(B17a, bufB, 8*1+7)
+	STQD(B30a, bufB, 8*3+0)
+	
+	STQD(B31a, bufB, 8*3+1)
+	STQD(B32a, bufB, 8*3+2)
+	
+	STQD(B33a, bufB, 8*3+3)
+	STQD(B34a, bufB, 8*3+4)
+	
+	STQD(B35a, bufB, 8*3+5)
+	STQD(B36a, bufB, 8*3+6)
+		
+	/* x = 0 */
+	
+	
+	# These 3 shuffles should open enough space for an instruction fetch of ib6
+	STQD(B37a, bufB, 8*3+7)
+	SHUFB(A0, A1, A1, splat0)
+	
+	SHUFB(A1, A1, A1, splat1)
+	SHUFB	bufB_bufA_i_Bi, bufB, bufA, merge
+
+	// LOAD BEGIN
+	LQD(B05b, Bi, 8*4+5)
+	LQD(B06b, Bi, 8*4+6)
+	
+	LQD(B07b, Bi, 8*4+7)
+	LQD(B10b, Bi, 8*5+0)
+	
+ib5:	
+	LQD(B11b, Bi, 8*5+1)
+	LQD(B12b, Bi, 8*5+2)
+	
+	LQD(B13b, Bi, 8*5+3)
+	LQD(B14b, Bi, 8*5+4)
+	
+	DFNMS(B00b, B0, A0)
+	LQD(B15b, Bi, 8*5+5)
+	
+	DFNMS(B10b, B0, A1)
+	LQD(B16b, Bi, 8*5+6)
+	
+	DFNMS(B01b, B1, A0)
+	LQD(B17b, Bi, 8*5+7)
+	
+	// COMPUTE BEGIN
+
+	DFNMS(B11b, B1, A1)
+	LQD(B20b, Bi, 8*6+0)
+	
+	DFNMS(B02b, B2, A0)
+	LQD(B21b, Bi, 8*6+1)
+	
+	DFNMS(B12b, B2, A1)
+	LQD(B22b, Bi, 8*6+2)
+	
+	DFNMS(B03b, B3, A0)
+	LQD(B23b, Bi, 8*6+3)
+	
+	DFNMS(B13b, B3, A1)
+	LQD(A3, bufA, 1)
+	
+	LQD(B24b, Bi, 8*6+4)
+	LQD(B25b, Bi, 8*6+5)
+	
+	DFNMS(B04b, B4, A0)
+	LQD(B26b, Bi, 8*6+6)
+	
+	DFNMS(B14b, B4, A1)
+	LQD(B27b, Bi, 8*6+7)
+	
+	DFNMS(B05b, B5, A0)
+	LQD(B30b, Bi, 8*7+0)
+	
+	DFNMS(B15b, B5, A1)
+	SHUFB(A2, A3, A3, splat0)
+	
+	DFNMS(B06b, B6, A0)
+	SHUFB(A3, A3, A3, splat1)
+	
+ib6:	
+	DFNMS(B16b, B6, A1)
+	LQD(B31b, Bi, 8*7+1)
+	
+	DFNMS(B07b, B7, A0)
+	hbrr	br_fwd_phase2, phase2
+	
+	DFNMS(B17b, B7, A1)
+	LQD(B32b, Bi, 8*7+2)
+	
+	DFNMS(B20b, B0, A2)
+	LQD(B33b, Bi, 8*7+3)
+	
+	DFNMS(B30b, B0, A3)
+	LQD(B34b, Bi, 8*7+4)
+	
+	DFNMS(B21b, B1, A2)
+	LQD(B35b, Bi, 8*7+5)
+	
+	DFNMS(B31b, B1, A3)
+	LQD(A1, bufA, 64)
+	
+	DFNMS(B22b, B2, A2)
+	LQD(B36b, Bi, 8*7+6)
+	
+	DFNMS(B32b, B2, A3)
+	LQD(B37b, Bi, 8*7+7)
+	
+	DFNMS(B23b, B3, A2)
+	LQD(Bx0, bufB, 8+0)
+	
+	DFNMS(B33b, B3, A3)
+	LQD(Bx1, bufB, 8+1)
+	
+	DFNMS(B24b, B4, A2)
+	LQD(Bx2, bufB, 8+2)
+	
+	DFNMS(B34b, B4, A3)
+	lqr	addend, const_256_2K_xloop2_0
+
+	DFNMS(B25b, B5, A2)
+	LQD(Bx3, bufB, 8+3)
+	
+	DFNMS(B35b, B5, A3)
+	SHUFB(A0, A1, A1, splat0)
+	
+	DFNMS(B26b, B6, A2)
+	SHUFB(A1, A1, A1, splat1)
+	
+ib7:	
+	DFNMS(B36b, B6, A3)
+	LQD(Bx4, bufB, 8+4)
+  
+	DFNMS(B27b, B7, A2)
+	LQD(Bx5, bufB, 8+5)
+	
+	DFNMS(B37b, B7, A3)
+	LQD(Bx6, bufB, 8+6)
+	
+	LQD(Bx7, bufB, 8+7)
+	
+	/* Skip to the second half of the loop since the loop has been unrolled
+	 * and the loop count is odd.
+	 */
+br_fwd_phase2:	
+	br	phase2
+	
+	.align	7
+	
+	/* for i=4; i<128; i+=8 */
+iloop:
+	/************************ PHASE 1  **************************/
+	// STORE END
+ib8:
+	DFNMS(B00a, B0, A0)
+	HBRP
+	
+	DFNMS(B10a, B0, A1)
+	STQD(B00b, Bx, 8*2+0)
+	
+	DFNMS(B01a, B1, A0)
+	STQD(B01b, Bx, 8*2+1)
+	
+	DFNMS(B11a, B1, A1)
+	STQD(B02b, Bx, 8*2+2)
+	
+	DFNMS(B02a, B2, A0)
+	STQD(B03b, Bx, 8*2+3)
+	
+	DFNMS(B12a, B2, A1)
+	STQD(B04b, Bx, 8*2+4)
+	
+	DFNMS(B03a, B3, A0)
+	STQD(B05b, Bx, 8*2+5)
+	
+	DFNMS(B13a, B3, A1)
+	STQD(B06b, Bx, 8*2+6)
+	
+	DFNMS(B04a, B4, A0)
+	STQD(B07b, Bx, 8*2+7)
+	
+	DFNMS(B14a, B4, A1)
+	STQD(B10b, Bx, 8*3+0)
+	
+	DFNMS(B05a, B5, A0)
+	STQD(B11b, Bx, 8*3+1)
+	
+	DFNMS(B15a, B5, A1)
+	STQD(B12b, Bx, 8*3+2)
+	
+	DFNMS(B06a, B6, A0)
+	STQD(B13b, Bx, 8*3+3)
+	
+	DFNMS(B16a, B6, A1)
+	STQD(B14b, Bx, 8*3+4)
+	
+	DFNMS(B07a, B7, A0)
+	STQD(B15b, Bx, 8*3+5)
+	
+	DFNMS(B17a, B7, A1)
+	STQD(B16b, Bx, 8*3+6)
+	
+ib9:	
+	DFNMS(B20a, B0, A2)
+	HBRP
+
+	DFNMS(B30a, B0, A3)
+	STQD(B17b, Bx, 8*3+7)
+	
+	DFNMS(B21a, B1, A2)
+	LQD(A1, bufA, 64)
+	
+	DFNMS(B31a, B1, A3)
+	STQD(B20b, Bx, 8*4+0)
+	
+	DFNMS(B22a, B2, A2)
+	STQD(B21b, Bx, 8*4+1)
+	DFNMS(B32a, B2, A3)
+	STQD(B22b, Bx, 8*4+2)
+	DFNMS(B23a, B3, A2)
+	STQD(B23b, Bx, 8*4+3)
+	
+	DFNMS(B33a, B3, A3)
+	STQD(B24b, Bx, 8*4+4)
+	
+
+	// COMPUTE BEGIN
+
+	DFNMS(B24a, B4, A2)
+	SHUFB(A0, A1, A1, splat0)
+	
+	DFNMS(B34a, B4, A3)
+	SHUFB(A1, A1, A1, splat1)
+	
+	DFNMS(B25a, B5, A2)
+	LQD(Bx0, bufB, 8+0)
+	DFNMS(B35a, B5, A3)
+	LQD(Bx1, bufB, 8+1)
+	DFNMS(B26a, B6, A2)
+	LQD(Bx2, bufB, 8+2)
+	DFNMS(B36a, B6, A3)
+	LQD(Bx3, bufB, 8+3)
+	DFNMS(B27a, B7, A2)
+	LQD(Bx4, bufB, 8+4)
+	DFNMS(B37a, B7, A3)
+	LQD(Bx5, bufB, 8+5)
+	
+	
+ib10:		# ifetch expected above
+	DFNMS(B00a, Bx0, A0)
+	LQD(Bx6, bufB, 8+6)
+	DFNMS(B10a, Bx0, A1)
+	LQD(Bx7, bufB, 8+7)
+
+	/* x = 1 */
+
+	STQD(B25b, Bx, 8*4+5)
+	LQD(A3, bufA,64+ 1)
+
+	DFNMS(B01a, Bx1, A0)
+	STQD(B26b, Bx, 8*4+6)
+	DFNMS(B11a, Bx1, A1)
+	STQD(B27b, Bx, 8*4+7)
+	DFNMS(B02a, Bx2, A0)
+	STQD(B30b, Bx, 8*5+0)
+	DFNMS(B12a, Bx2, A1)
+	STQD(B31b, Bx, 8*5+1)
+	
+	DFNMS(B03a, Bx3, A0)
+	lqr	addend, const_256_2K_xloop1_0
+	
+	DFNMS(B13a, Bx3, A1)
+	SHUFB(A2, A3, A3, splat0)
+	
+	DFNMS(B04a, Bx4, A0)
+	SHUFB(A3, A3, A3, splat1)	
+	
+	DFNMS(B14a, Bx4, A1)
+	STQD(B32b, Bx, 8*5+2)
+	DFNMS(B05a, Bx5, A0)
+	STQD(B33b, Bx, 8*5+3)
+	DFNMS(B15a, Bx5, A1)
+	STQD(B34b, Bx, 8*5+4)
+	DFNMS(B06a, Bx6, A0)
+	STQD(B35b, Bx, 8*5+5)
+
+	DFNMS(B16a, Bx6, A1)
+	STQD(B36b, Bx, 8*5+6)
+	
+	DFNMS(B07a, Bx7, A0)
+	STQD(B37b, Bx, 8*5+7)
+	
+ib11:	
+	a	Bx, bufB_bufA_i_Bi, addend
+	DFNMS(B17a, Bx7, A1)
+	
+	DFNMS(B20a, Bx0, A2)
+	rotqbyi	Ax, Bx, 4
+	
+	DFNMS(B30a, Bx0, A3)
+	LQD(A1, bufA, 128)
+	
+	DFNMS(B21a, Bx1, A2)
+	rotqbyi	x, Bx, 8
+	DFNMS(B31a, Bx1, A3)
+	DFNMS(B22a, Bx2, A2)
+	
+	DFNMS(B32a, Bx2, A3)
+	LQD(Bx0, bufB, 16+0)
+	
+	DFNMS(B23a, Bx3, A2)
+	LQD(Bx1, bufB, 16+1)
+
+	DFNMS(B33a, Bx3, A3)
+	lnop			# force xloop1 to start on a line-buffer boundary
+	DFNMS(B24a, Bx4, A2)
+	lnop			# force xloop1 to start on a line-buffer boundary
+	
+	DFNMS(B34a, Bx4, A3)
+	SHUFB(A0, A1, A1, splat0)
+	
+	DFNMS(B25a, Bx5, A2)
+	SHUFB(A1, A1, A1, splat1)
+	DFNMS(B35a, Bx5, A3)
+	LQD(Bx2, bufB, 16+2)
+	DFNMS(B26a, Bx6, A2)
+	LQD(Bx3, bufB, 16+3)
+	DFNMS(B36a, Bx6, A3)
+	LQD(Bx4, bufB, 16+4)
+	
+	DFNMS(B27a, Bx7, A2)
+	LQD(Bx5, bufB, 16+5)
+	
+	DFNMS(B37a, Bx7, A3)
+	LQD(Bx6, bufB, 16+6)
+	
+	/* FOR (x=31-i; x!=0; x--) */
+xloop1:	
+	/* x = 4*(31-i) + 2 */
+ib12:	
+	DFNMS(B00a, Bx0, A0)
+	DFNMS(B10a, Bx0, A1)
+	
+	DFNMS(B01a, Bx1, A0)
+	DFNMS(B11a, Bx1, A1)
+	
+	DFNMS(B02a, Bx2, A0)
+	lqd	target, 0(x)
+	
+	DFNMS(B12a, Bx2, A1)
+	LQD(Bx7, Bx, 7)
+	
+	DFNMS(B03a, Bx3, A0)
+	LQD(A3, Ax, 1)
+	
+	DFNMS(B13a, Bx3, A1)
+	DFNMS(B04a, Bx4, A0)
+	DFNMS(B14a, Bx4, A1)
+	DFNMS(B05a, Bx5, A0)	
+	
+	DFNMS(B15a, Bx5, A1)
+	rotqby	target, target, x
+	
+	DFNMS(B06a, Bx6, A0)
+	SHUFB(A2, A3, A3, splat0)
+	
+	DFNMS(B16a, Bx6, A1)
+	SHUFB(A3, A3, A3, splat1)
+	
+	DFNMS(B07a, Bx7, A0)
+	DFNMS(B17a, Bx7, A1)
+	
+	DFNMS(B20a, Bx0, A2)
+	LQD(A1, Ax, 64)
+
+	DFNMS(B30a, Bx0, A3)
+	DFNMS(B21a, Bx1, A2)
+	
+	DFNMS(B31a, Bx1, A3)
+	LQD(Bx0, Bx, 8+0)
+	
+	DFNMS(B22a, Bx2, A2)
+	LQD(Bx1, Bx, 8+1)
+	
+	DFNMS(B32a, Bx2, A3)
+	LQD(Bx2, Bx, 8+2)
+	
+ib13:	
+	DFNMS(B23a, Bx3, A2)
+	SHUFB(A0, A1, A1, splat0)
+	
+	DFNMS(B33a, Bx3, A3)
+	SHUFB(A1, A1, A1, splat1)
+	
+	DFNMS(B24a, Bx4, A2)
+	LQD(Bx3, Bx, 8+3)
+	
+	DFNMS(B34a, Bx4, A3)
+	DFNMS(B25a, Bx5, A2)
+	
+	DFNMS(B35a, Bx5, A3)
+	LQD(Bx4, Bx, 8+4)
+	
+	DFNMS(B26a, Bx6, A2)
+	LQD(Bx5, Bx, 8+5)
+	
+	DFNMS(B36a, Bx6, A3)
+	DFNMS(B27a, Bx7, A2)
+	
+	DFNMS(B37a, Bx7, A3)
+	LQD(Bx6, Bx, 8+6)
+	
+	/* x = 4*(31-i) + 3 */
+	DFNMS(B00a, Bx0, A0)
+	LQD(Bx7, Bx, 8+7)
+
+	DFNMS(B10a, Bx0, A1)
+	LQD(A3, Ax, 64+1)
+	
+	DFNMS(B01a, Bx1, A0)
+	DFNMS(B11a, Bx1, A1)
+	
+	DFNMS(B02a, Bx2, A0)
+	DFNMS(B12a, Bx2, A1)
+	
+	DFNMS(B03a, Bx3, A0)
+	DFNMS(B13a, Bx3, A1)
+	
+	DFNMS(B04a, Bx4, A0)
+	SHUFB(A2, A3, A3, splat0)
+	
+	DFNMS(B14a, Bx4, A1)
+	SHUFB(A3, A3, A3, splat1)
+	
+	DFNMS(B05a, Bx5, A0)
+	DFNMS(B15a, Bx5, A1)
+	
+ib14:	
+	DFNMS(B06a, Bx6, A0)
+	DFNMS(B16a, Bx6, A1)
+	
+	DFNMS(B07a, Bx7, A0)
+	DFNMS(B17a, Bx7, A1)
+	
+	DFNMS(B20a, Bx0, A2)
+	LQD(A1, Ax, 128)
+	
+	DFNMS(B30a, Bx0, A3)
+	DFNMS(B21a, Bx1, A2)
+	
+	DFNMS(B31a, Bx1, A3)
+	LQD(Bx0, Bx, 16+0)
+	
+	DFNMS(B22a, Bx2, A2)
+	LQD(Bx1, Bx, 16+1)
+	
+	DFNMS(B32a, Bx2, A3)
+	LQD(Bx2, Bx, 16+2)
+	
+	DFNMS(B23a, Bx3, A2)
+	SHUFB(A0, A1, A1, splat0)
+	
+	DFNMS(B33a, Bx3, A3)
+	SHUFB(A1, A1, A1, splat1)
+	
+	DFNMS(B24a, Bx4, A2)
+	LQD(Bx3, Bx, 16+3)
+	
+	DFNMS(B34a, Bx4, A3)
+	DFNMS(B25a, Bx5, A2)
+	
+	DFNMS(B35a, Bx5, A3)
+	LQD(Bx4, Bx, 16+4)
+	DFNMS(B26a, Bx6, A2)
+	LQD(Bx5, Bx, 16+5)
+	
+	DFNMS(B36a, Bx6, A3)
+	DFNMS(B27a, Bx7, A2)
+	
+	DFNMS(B37a, Bx7, A3)
+	LQD(Bx6, Bx, 16+6)
+	
+	/* x = 4*(31-i) + 4 */
+	DFNMS(B00a, Bx0, A0)
+	LQD(Bx7, Bx, 16+7)
+	
+ib15:
+	DFNMS(B10a, Bx0, A1)
+	LQD(A3, Ax, 128+1)
+	
+	DFNMS(B01a, Bx1, A0)
+	DFNMS(B11a, Bx1, A1)
+	
+	DFNMS(B02a, Bx2, A0)
+	DFNMS(B12a, Bx2, A1)
+	
+	DFNMS(B03a, Bx3, A0)
+	DFNMS(B13a, Bx3, A1)
+	
+	DFNMS(B04a, Bx4, A0)
+	SHUFB(A2, A3, A3, splat0)
+	
+	DFNMS(B14a, Bx4, A1)
+	SHUFB(A3, A3, A3, splat1)
+	
+	DFNMS(B05a, Bx5, A0)
+	DFNMS(B15a, Bx5, A1)
+	
+	DFNMS(B06a, Bx6, A0)
+	hbr 	xloop1_branch, target
+	
+	DFNMS(B16a, Bx6, A1)
+	DFNMS(B07a, Bx7, A0)
+	
+	DFNMS(B17a, Bx7, A1)
+	DFNMS(B20a, Bx0, A2)
+	
+	DFNMS(B30a, Bx0, A3)
+	LQD(A1, Ax, 128+64)
+	
+	DFNMS(B21a, Bx1, A2)
+	LQD(Bx0, Bx, 24+0)
+	
+	DFNMS(B31a, Bx1, A3)
+	LQD(Bx1, Bx, 24+1)
+	
+	DFNMS(B22a, Bx2, A2)
+	DFNMS(B32a, Bx2, A3)
+	
+	DFNMS(B23a, Bx3, A2)
+	LQD(Bx2, Bx, 24+2)
+	
+	DFNMS(B33a, Bx3, A3)
+	SHUFB(A0, A1, A1, splat0)
+	
+ib16:
+	DFNMS(B24a, Bx4, A2)
+	SHUFB(A1, A1, A1, splat1)
+	
+	DFNMS(B34a, Bx4, A3)
+	LQD(Bx3, Bx, 24+3)
+	
+	DFNMS(B25a, Bx5, A2)
+	LQD(Bx4, Bx, 24+4)
+	
+	DFNMS(B35a, Bx5, A3)
+	LQD(Bx5, Bx, 24+5)
+	
+	DFNMS(B26a, Bx6, A2)
+	DFNMS(B36a, Bx6, A3)
+	
+	DFNMS(B27a, Bx7, A2)
+	LQD(Bx6, Bx, 24+6)
+	DFNMS(B37a, Bx7, A3)
+	LQD(Bx7, Bx, 24+7)
+
+	/* x = 4*(31-i) + 5 */
+
+	DFNMS(B00a, Bx0, A0)
+	LQD(A3, Ax, 128+64+1)
+
+	DFNMS(B10a, Bx0, A1)
+	lqr	x, const_512_4K_4_0
+	
+	DFNMS(B01a, Bx1, A0)
+	DFNMS(B11a, Bx1, A1)
+	
+	DFNMS(B02a, Bx2, A0)
+	DFNMS(B12a, Bx2, A1)
+	
+	DFNMS(B03a, Bx3, A0)
+	DFNMS(B13a, Bx3, A1)
+	
+	DFNMS(B04a, Bx4, A0)
+	SHUFB(A2, A3, A3, splat0)
+	
+	DFNMS(B14a, Bx4, A1)
+	SHUFB(A3, A3, A3, splat1)
+	
+	DFNMS(B05a, Bx5, A0)
+	DFNMS(B15a, Bx5, A1)
+	
+	DFNMS(B06a, Bx6, A0)
+	a	Bx, Bx, x
+	
+ib17:	
+	DFNMS(B16a, Bx6, A1)
+	DFNMS(B07a, Bx7, A0)
+	
+	DFNMS(B17a, Bx7, A1)
+	LQD(A1, Ax, 256)
+	
+	DFNMS(B20a, Bx0, A2)
+	rotqbyi	x, Bx, 8
+	
+	DFNMS(B30a, Bx0, A3)
+	LQD(Bx0, Bx, 0)
+	
+	DFNMS(B21a, Bx1, A2)
+	rotqbyi	Ax, Bx, 4
+	
+	DFNMS(B31a, Bx1, A3)
+	LQD(Bx1, Bx, 1)
+	
+	DFNMS(B22a, Bx2, A2)
+	DFNMS(B32a, Bx2, A3)
+
+	DFNMS(B23a, Bx3, A2)
+	SHUFB(A0, A1, A1, splat0)
+	
+	DFNMS(B33a, Bx3, A3)
+	SHUFB(A1, A1, A1, splat1)
+	
+	DFNMS(B24a, Bx4, A2)
+	LQD(Bx2, Bx, 2)
+	
+	DFNMS(B34a, Bx4, A3)
+	DFNMS(B25a, Bx5, A2)
+	
+	DFNMS(B35a, Bx5, A3)
+	LQD(Bx3, Bx, 3)
+	
+	DFNMS(B26a, Bx6, A2)
+	LQD(Bx4, Bx, 4)
+	
+	DFNMS(B36a, Bx6, A3)
+	LQD(Bx5, Bx, 5)
+	
+	DFNMS(B27a, Bx7, A2)
+	LQD(Bx6, Bx, 6)
+	
+	DFNMS(B37a, Bx7, A3)
+xloop1_branch:	
+	bi	target
+	
+	.align	6
+xloop1_done:	
+ib18:	
+	/* x = 4*(31-i) + 2 */
+	DFNMS(B00a, Bx0, A0)
+ 	DFNMS(B10a, Bx0, A1)
+	DFNMS(B01a, Bx1, A0)
+	DFNMS(B11a, Bx1, A1)
+	
+	DFNMS(B02a, Bx2, A0)
+	LQD(A3, Ax, 1)
+	DFNMS(B12a, Bx2, A1)
+	LQD(Bx7, Bx, 7)
+	
+	DFNMS(B03a, Bx3, A0)
+	DFNMS(B13a, Bx3, A1)
+
+	DFNMS(B04a, Bx4, A0)
+	DFNMS(B14a, Bx4, A1)
+
+	DFNMS(B05a, Bx5, A0)
+	SHUFB(A2, A3, A3, splat0)
+	DFNMS(B15a, Bx5, A1)
+	SHUFB(A3, A3, A3, splat1)
+	
+	DFNMS(B06a, Bx6, A0)
+	DFNMS(B16a, Bx6, A1)
+	
+	DFNMS(B07a, Bx7, A0)
+	DFNMS(B17a, Bx7, A1)
+	
+	DFNMS(B20a, Bx0, A2)
+	LQD(A1, Ax, 64)
+
+	DFNMS(B30a, Bx0, A3)
+	LQD(Bx0, Bx, 8+0)
+	
+	DFNMS(B21a, Bx1, A2)
+	DFNMS(B31a, Bx1, A3)
+	
+	DFNMS(B22a, Bx2, A2)
+	LQD(Bx1, Bx, 8+1)
+	
+	DFNMS(B32a, Bx2, A3)
+	DFNMS(B23a, Bx3, A2)
+	
+	DFNMS(B33a, Bx3, A3)
+	SHUFB(A0, A1, A1, splat0)
+	
+ib19:	
+	DFNMS(B24a, Bx4, A2)
+	SHUFB(A1, A1, A1, splat1)
+	DFNMS(B34a, Bx4, A3)
+	DFNMS(B25a, Bx5, A2)
+	
+	DFNMS(B35a, Bx5, A3)
+	LQD(Bx2, Bx, 8+2)
+	
+	DFNMS(B26a, Bx6, A2)
+	LQD(Bx3, Bx, 8+3)
+	
+	DFNMS(B36a, Bx6, A3)
+	LQD(Bx4, Bx, 8+4)
+	
+	DFNMS(B27a, Bx7, A2)
+	LQD(Bx5, Bx, 8+5)
+	DFNMS(B37a, Bx7, A3)
+	LQD(Bx6, Bx, 8+6)
+	
+	DFNMS(B00a, Bx0, A0)
+	LQD(A3, Ax, 64+1)
+	
+	DFNMS(B10a, Bx0, A1)
+
+	/* x = 4*(31-i) + 3 */
+	DFNMS(B01a, Bx1, A0)
+	
+	DFNMS(B11a, Bx1, A1)
+	DFNMS(B02a, Bx2, A0)
+	
+	DFNMS(B12a, Bx2, A1)
+	LQD(Bx7, Bx, 8+7)
+	
+	DFNMS(B03a, Bx3, A0)
+	SHUFB(A2, A3, A3, splat0)
+	
+	DFNMS(B13a, Bx3, A1)
+	SHUFB(A3, A3, A3, splat1)
+	
+	DFNMS(B04a, Bx4, A0)
+	DFNMS(B14a, Bx4, A1)
+	
+	DFNMS(B05a, Bx5, A0)
+	LQD(A3_0, Ax, 128+1)
+	DFNMS(B15a, Bx5, A1)
+	LQD(A3_1, Ax, 192+1)
+	
+ib20:	
+	DFNMS(B06a, Bx6, A0)
+	DFNMS(B16a, Bx6, A1)
+	DFNMS(B07a, Bx7, A0)
+	lnop		# pad to align phase2
+	DFNMS(B17a, Bx7, A1)
+	lnop		# pad to align phase2
+
+	DFNMS(B20a, Bx0, A2)
+	LQD(A1, Ax, 128)
+	
+	DFNMS(B30a, Bx0, A3)
+	DFNMS(B21a, Bx1, A2)
+	
+	DFNMS(B31a, Bx1, A3)
+	DFNMS(B22a, Bx2, A2)
+	
+	DFNMS(B32a, Bx2, A3)
+	LQD(A3_2, Ax, 256+1)
+
+	DFNMS(B23a, Bx3, A2)
+	SHUFB(A1, A1, A1, splat1)
+	
+	DFNMS(B33a, Bx3, A3)
+	SHUFB(A2_0, A3_0, A3_0, splat0)
+	
+	DFNMS(B24a, Bx4, A2)
+	SHUFB(A3_0, A3_0, A3_0, splat1)
+	
+	DFNMS(B34a, Bx4, A3)
+	SHUFB(A2_1, A3_1, A3_1, splat0)
+	
+	DFNMS(B25a, Bx5, A2)
+	SHUFB(A3_1, A3_1, A3_1, splat1)
+	
+	DFNMS(B35a, Bx5, A3)
+	SHUFB(A3_2, A3_2, A3_2, splat1)
+	
+	DFNMS(B26a, Bx6, A2)
+	LQD(Bx0, bufB, 8+0)
+	
+	DFNMS(B36a, Bx6, A3)
+	LQD(Bx1, bufB, 8+1)
+	
+	DFNMS(B27a, Bx7, A2)
+	LQD(Bx2, bufB, 8+2)
+	
+ib21:	
+	DFNMS(B37a, Bx7, A3)
+	LQD(Bx3, bufB, 8+3)
+	
+
+	// COMPUTE END
+	DFNMS(B10a, B00a, A1)
+	lqr	addend, const_0_32_0_0
+	
+	DFNMS(B11a, B01a, A1)
+	LQD(Bx4, bufB, 8+4)
+	
+	DFNMS(B12a, B02a, A1)
+	LQD(Bx5, bufB, 8+5)
+	
+	DFNMS(B13a, B03a, A1)
+	LQD(Bx6, bufB, 8+6)
+	
+	DFNMS(B14a, B04a, A1)
+	LQD(Bx7, bufB, 8+7)
+	
+	DFNMS(B15a, B05a, A1)
+	STQD(B00a, Bx, 8*2+0)
+	
+	DFNMS(B16a, B06a, A1)
+	STQD(B01a, Bx, 8*2+1)
+	
+	DFNMS(B17a, B07a, A1)
+	STQD(B02a, Bx, 8*2+2)
+	
+	DFNMS(B20a, B00a, A2_0)
+	STQD(B03a, Bx, 8*2+3)
+	
+	DFNMS(B21a, B01a, A2_0)
+	STQD(B00a, Bx, 8*2+0)
+
+	DFNMS(B22a, B02a, A2_0)
+	STQD(B01a, Bx, 8*2+1)
+	
+	DFNMS(B23a, B03a, A2_0)
+	STQD(B02a, Bx, 8*2+2)
+	
+	a	bufB_bufA_i_Bi, bufB_bufA_i_Bi, addend
+	STQD(B03a, Bx, 8*2+3)
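+	/* Packed update (as the register names suggest): each word of addend
+	 * steps the corresponding field of the {bufB, bufA, i, Bi} vector. */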
+
+	DFNMS(B24a, B04a, A2_0)
+	STQD(B04a, Bx, 8*2+4)
+	
+	DFNMS(B25a, B05a, A2_0)
+	HBRP
+	
+ib22:	
+	DFNMS(B26a, B06a, A2_0)
+	STQD(B05a, Bx, 8*2+5)
+	
+	DFNMS(B27a, B07a, A2_0)
+	lqr	addend, const_256_2K_xloop2_0
+	
+	DFNMS(B30a, B00a, A3_0)
+	rotqbyi	bufA, bufB_bufA_i_Bi, 4
+
+	DFNMS(B31a, B01a, A3_0)
+	LQD(B00b, Bi, 8*4+0)
+	
+	DFNMS(B32a, B02a, A3_0)
+	LQD(B01b, Bi, 8*4+1)
+	
+	DFNMS(B33a, B03a, A3_0)
+	LQD(B02b, Bi, 8*4+2)
+	
+	DFNMS(B34a, B04a, A3_0)
+	LQD(B03b, Bi, 8*4+3)
+
+	DFNMS(B35a, B05a, A3_0)
+	LQD(B04b, Bi, 8*4+4)
+	
+	DFNMS(B36a, B06a, A3_0)
+	LQD(B05b, Bi, 8*4+5)
+	
+	DFNMS(B37a, B07a, A3_0)
+	LQD(B06b, Bi, 8*4+6)
+	
+	LQD(B07b, Bi, 8*4+7)
+	LQD(A1, bufA, 0)
+	
+	DFNMS(B20a, B10a, A2_1)
+	LQD(A3, bufA, 1)
+	
+	DFNMS(B21a, B11a, A2_1)
+	LQD(B10b, Bi, 8*5+0)
+	
+	DFNMS(B22a, B12a, A2_1)
+	LQD(B11b, Bi, 8*5+1)
+	
+	DFNMS(B23a, B13a, A2_1)
+	LQD(B12b, Bi, 8*5+2)
+	
+	DFNMS(B24a, B14a, A2_1)
+	HBRP
+	
+ib23:	
+	DFNMS(B25a, B15a, A2_1)
+	LQD(B13b, Bi, 8*5+3)
+	
+	DFNMS(B26a, B16a, A2_1)
+	LQD(B14b, Bi, 8*5+4)
+	
+	DFNMS(B27a, B17a, A2_1)
+	SHUFB(A0, A1, A1, splat0)
+	
+	DFNMS(B30a, B10a, A3_1)
+	SHUFB(A1, A1, A1, splat1)
+	
+	DFNMS(B31a, B11a, A3_1)
+	LQD(B15b, Bi, 8*5+5)
+	
+	DFNMS(B32a, B12a, A3_1)
+	LQD(B16b, Bi, 8*5+6)
+	
+	DFNMS(B33a, B13a, A3_1)
+	LQD(B17b, Bi, 8*5+7)
+	
+	DFNMS(B34a, B14a, A3_1)
+	LQD(B20b, Bi, 8*6+0)
+	
+	DFNMS(B35a, B15a, A3_1)
+	LQD(B21b, Bi, 8*6+1)
+	
+	DFNMS(B36a, B16a, A3_1)
+	LQD(B22b, Bi, 8*6+2)
+	
+	DFNMS(B37a, B17a, A3_1)
+	LQD(B23b, Bi, 8*6+3)
+
+	DFNMS(B30a, B20a, A3_2)
+	LQD(B24b, Bi, 8*6+4)
+	
+	DFNMS(B31a, B21a, A3_2)
+	LQD(B25b, Bi, 8*6+5)
+	
+	DFNMS(B32a, B22a, A3_2)
+	LQD(B26b, Bi, 8*6+6)
+	
+	DFNMS(B33a, B23a, A3_2)
+	LQD(B27b, Bi, 8*6+7)
+	
+	DFNMS(B34a, B24a, A3_2)
+	HBRP
+	
+ib24:	
+	LQD(B30b, Bi, 8*7+0)
+	SHUFB(A2, A3, A3, splat0)
+	
+	DFNMS(B35a, B25a, A3_2)
+	LQD(B31b, Bi, 8*7+1)
+	
+	DFNMS(B36a, B26a, A3_2)
+	LQD(B32b, Bi, 8*7+2)
+	
+	DFNMS(B37a, B27a, A3_2)
+	LQD(B33b, Bi, 8*7+3)
+
+	/************************ PHASE 2  **************************/
+	// COMPUTE BEGIN
+
+	DFNMS(B00b, B0, A0)
+	LQD(B34b, Bi, 8*7+4)
+	
+	DFNMS(B10b, B0, A1)
+	LQD(B35b, Bi, 8*7+5)
+	
+	DFNMS(B01b, B1, A0)
+	LQD(B36b, Bi, 8*7+6)
+	
+	nop	
+	LQD(B37b, Bi, 8*7+7)
+	
+	nop
+	SHUFB(A3, A3, A3, splat1)
+	
+	DFNMS(B11b, B1, A1)
+	STQD(B06a, Bx, 8*2+6)
+
+	DFNMS(B02b, B2, A0)
+	STQD(B07a, Bx, 8*2+7)
+	
+	DFNMS(B12b, B2, A1)
+	STQD(B10a, Bx, 8*3+0)
+	
+	DFNMS(B03b, B3, A0)
+	STQD(B11a, Bx, 8*3+1)
+	
+	DFNMS(B13b, B3, A1)
+	STQD(B12a, Bx, 8*3+2)
+	
+	DFNMS(B04b, B4, A0)
+	STQD(B13a, Bx, 8*3+3)
+
+	nop
+	STQD(B14a, Bx, 8*3+4)
+	
+ib25:	
+	DFNMS(B14b, B4, A1)
+	HBRP
+	
+	DFNMS(B05b, B5, A0)
+	STQD(B15a, Bx, 8*3+5)
+	
+	DFNMS(B15b, B5, A1)
+	STQD(B16a, Bx, 8*3+6)
+	
+	DFNMS(B06b, B6, A0)
+	STQD(B17a, Bx, 8*3+7)
+	
+	DFNMS(B16b, B6, A1)
+	STQD(B20a, Bx, 8*4+0)
+	
+	DFNMS(B07b, B7, A0)
+	STQD(B21a, Bx, 8*4+1)
+	
+	DFNMS(B17b, B7, A1)
+	LQD(A1, bufA, 64)
+	
+	DFNMS(B20b, B0, A2)
+	STQD(B22a, Bx, 8*4+2)
+	
+	DFNMS(B30b, B0, A3)
+	STQD(B23a, Bx, 8*4+3)
+	
+	DFNMS(B21b, B1, A2)
+	STQD(B24a, Bx, 8*4+4)
+	
+	DFNMS(B31b, B1, A3)
+	STQD(B25a, Bx, 8*4+5)
+	
+	DFNMS(B22b, B2, A2)
+	STQD(B26a, Bx, 8*4+6)
+	
+	DFNMS(B32b, B2, A3)
+	STQD(B27a, Bx, 8*4+7)
+	
+	DFNMS(B23b, B3, A2)
+	SHUFB(A0, A1, A1, splat0)
+	
+	DFNMS(B33b, B3, A3)
+	SHUFB(A1, A1, A1, splat1)
+	
+	DFNMS(B24b, B4, A2)
+	HBRP
+	
+ib26:	
+	nop
+	STQD(B30a, Bx, 8*5+0)
+	
+	DFNMS(B34b, B4, A3)
+	STQD(B31a, Bx, 8*5+1)
+	
+	DFNMS(B25b, B5, A2)
+	STQD(B32a, Bx, 8*5+2)
+
+	DFNMS(B35b, B5, A3)
+	STQD(B33a, Bx, 8*5+3)
+	
+	DFNMS(B26b, B6, A2)
+	STQD(B34a, Bx, 8*5+4)
+	
+	DFNMS(B36b, B6, A3)
+	STQD(B35a, Bx, 8*5+5)
+	
+	DFNMS(B27b, B7, A2)
+	STQD(B36a, Bx, 8*5+6)
+	
+	DFNMS(B37b, B7, A3)
+	STQD(B37a, Bx, 8*5+7)
+
+phase2:
+	/* x = 1 */
+	a	Bx, bufB_bufA_i_Bi, addend
+	LQD(A3, bufA, 64+1)
+	
+	DFNMS(B00b, Bx0, A0)
+	lnop
+	
+	DFNMS(B10b, Bx0, A1)
+	rotqbyi	x, Bx, 8
+	
+	DFNMS(B01b, Bx1, A0)
+	DFNMS(B11b, Bx1, A1)
+	
+	DFNMS(B02b, Bx2, A0)
+	DFNMS(B12b, Bx2, A1)
+	
+	DFNMS(B03b, Bx3, A0)
+	lqd	target, 0(x)
+	
+	DFNMS(B13b, Bx3, A1)
+	SHUFB(A2, A3, A3, splat0)
+	DFNMS(B04b, Bx4, A0)
+	SHUFB(A3, A3, A3, splat1)
+
+ib27:	
+	DFNMS(B14b, Bx4, A1)	
+	rotqbyi	Ax, Bx, 4
+	
+	DFNMS(B05b, Bx5, A0)
+	DFNMS(B15b, Bx5, A1)
+	
+	DFNMS(B06b, Bx6, A0)
+	rotqby	target, target, x
+	
+	DFNMS(B16b, Bx6, A1)
+
+	DFNMS(B07b, Bx7, A0)
+		
+	DFNMS(B17b, Bx7, A1)
+	lnop
+	
+	DFNMS(B20b, Bx0, A2)
+	hbr	xloop2_entry, target
+	
+	DFNMS(B30b, Bx0, A3)
+	LQD(A1, bufA, 128)
+	
+	DFNMS(B21b, Bx1, A2)
+	DFNMS(B31b, Bx1, A3)
+	
+	DFNMS(B22b, Bx2, A2)
+	LQD(Bx0, bufB, 16+0)
+	
+	DFNMS(B32b, Bx2, A3)
+	LQD(Bx1, bufB, 16+1)
+	
+	DFNMS(B23b, Bx3, A2)
+	DFNMS(B33b, Bx3, A3)
+	
+	DFNMS(B24b, Bx4, A2)
+	SHUFB(A0, A1, A1, splat0)
+	
+	DFNMS(B34b, Bx4, A3)
+	SHUFB(A1, A1, A1, splat1)
+	
+	DFNMS(B25b, Bx5, A2)
+	LQD(Bx2, bufB, 16+2)
+	
+	DFNMS(B35b, Bx5, A3)
+	LQD(Bx3, bufB, 16+3)
+	
+	DFNMS(B26b, Bx6, A2)
+	LQD(Bx4, bufB, 16+4)
+	
+ib28:	
+	DFNMS(B36b, Bx6, A3)
+	LQD(Bx5, bufB, 16+5)
+	
+	DFNMS(B27b, Bx7, A2)
+	LQD(Bx6, bufB, 16+6)
+	
+	DFNMS(B37b, Bx7, A3)
+	LQD(Bx7, Bx, 7)
+	
+	DFNMS(B00b, Bx0, A0)
+	LQD(A3, Ax, 1)
+	
+	DFNMS(B10b, Bx0, A1)
+xloop2_entry:	
+	bi	target
+	
+	.align	7
+	
+xloop2:	
+ib29:	
+	/* x = 4*(31-i) + 2 */
+	DFNMS(B01b, Bx1, A0)
+	DFNMS(B11b, Bx1, A1)
+	DFNMS(B02b, Bx2, A0)
+	DFNMS(B12b, Bx2, A1)
+	
+	DFNMS(B03b, Bx3, A0)
+	lqd	target, 0(x)
+
+	DFNMS(B13b, Bx3, A1)
+	SHUFB(A2, A3, A3, splat0)
+	
+	DFNMS(B04b, Bx4, A0)
+	SHUFB(A3, A3, A3, splat1)
+	
+	DFNMS(B14b, Bx4, A1)
+	DFNMS(B05b, Bx5, A0)
+	
+	DFNMS(B15b, Bx5, A1)
+	DFNMS(B06b, Bx6, A0)
+	
+	DFNMS(B16b, Bx6, A1)
+	rotqby	target, target, x
+	
+	DFNMS(B07b, Bx7, A0)
+	DFNMS(B17b, Bx7, A1)
+	DFNMS(B20b, Bx0, A2)
+	DFNMS(B30b, Bx0, A3)
+	
+	DFNMS(B21b, Bx1, A2)
+	lqr	x, const_512_4K_4_0
+	DFNMS(B31b, Bx1, A3)
+	LQD(A1, Ax, 64)
+	
+	DFNMS(B22b, Bx2, A2)
+	LQD(Bx0, Bx, 8+0)
+	
+	DFNMS(B32b, Bx2, A3)
+	LQD(Bx1, Bx, 8+1)
+	
+	DFNMS(B23b, Bx3, A2)
+	LQD(Bx2, Bx, 8+2)
+	
+	DFNMS(B33b, Bx3, A3)
+	DFNMS(B24b, Bx4, A2)
+	
+ib30:	
+	DFNMS(B34b, Bx4, A3)
+	SHUFB(A0, A1, A1, splat0)
+	DFNMS(B25b, Bx5, A2)
+	SHUFB(A1, A1, A1, splat1)
+	
+	DFNMS(B35b, Bx5, A3)
+	LQD(Bx3, Bx, 8+3)
+	DFNMS(B26b, Bx6, A2)
+	LQD(Bx4, Bx, 8+4)
+	
+	DFNMS(B36b, Bx6, A3)
+	LQD(Bx5, Bx, 8+5)
+	
+	DFNMS(B27b, Bx7, A2)
+	LQD(Bx6, Bx, 8+6)
+	
+	DFNMS(B37b, Bx7, A3)
+	LQD(A3, Ax, 64+1)
+	
+	/* x = 4*(31-i) + 3 */
+
+	DFNMS(B00b, Bx0, A0)
+	LQD(Bx7, Bx, 8+7)
+
+	DFNMS(B10b, Bx0, A1)
+	DFNMS(B01b, Bx1, A0)
+	DFNMS(B11b, Bx1, A1)
+	DFNMS(B02b, Bx2, A0)
+	
+	DFNMS(B12b, Bx2, A1)
+	SHUFB(A2, A3, A3, splat0)
+	
+	DFNMS(B03b, Bx3, A0)
+	SHUFB(A3, A3, A3, splat1)
+
+	DFNMS(B13b, Bx3, A1)
+	DFNMS(B04b, Bx4, A0)
+	
+	DFNMS(B14b, Bx4, A1)
+	DFNMS(B05b, Bx5, A0)
+	
+	DFNMS(B15b, Bx5, A1)
+	DFNMS(B06b, Bx6, A0)
+	
+	DFNMS(B16b, Bx6, A1)
+	DFNMS(B07b, Bx7, A0)
+	
+ib31:	
+	DFNMS(B17b, Bx7, A1)
+	DFNMS(B20b, Bx0, A2)
+	DFNMS(B30b, Bx0, A3)
+	DFNMS(B21b, Bx1, A2)
+	
+	DFNMS(B31b, Bx1, A3)
+	LQD(A1, Ax, 128)
+	DFNMS(B22b, Bx2, A2)
+	LQD(Bx0, Bx, 16+0)
+	
+	DFNMS(B32b, Bx2, A3)
+	LQD(Bx1, Bx, 16+1)
+	
+	DFNMS(B23b, Bx3, A2)
+	LQD(Bx2, Bx, 16+2)
+	
+	DFNMS(B33b, Bx3, A3)
+	LQD(Bx3, Bx, 16+3)
+
+	DFNMS(B24b, Bx4, A2)
+	DFNMS(B34b, Bx4, A3)
+	
+	DFNMS(B25b, Bx5, A2)
+	SHUFB(A0, A1, A1, splat0)
+	
+	DFNMS(B35b, Bx5, A3)
+	SHUFB(A1, A1, A1, splat1)
+	
+	DFNMS(B26b, Bx6, A2)
+	LQD(Bx4, Bx, 16+4)
+	
+	DFNMS(B36b, Bx6, A3)
+	LQD(Bx5, Bx, 16+5)
+	
+	DFNMS(B27b, Bx7, A2)
+	LQD(Bx6, Bx, 16+6)
+	
+	DFNMS(B37b, Bx7, A3)
+	DFNMS(B00b, Bx0, A0)
+	
+	/* x = 4*(31-i) + 4 */
+
+	DFNMS(B10b, Bx0, A1)
+	LQD(A3, Ax, 128+1)
+	
+	DFNMS(B01b, Bx1, A0)
+	LQD(Bx7, Bx, 16+7)
+	
+ib32:	
+	DFNMS(B11b, Bx1, A1)
+	DFNMS(B02b, Bx2, A0)
+	
+	DFNMS(B12b, Bx2, A1)
+	DFNMS(B03b, Bx3, A0)
+	
+	DFNMS(B13b, Bx3, A1)
+	SHUFB(A2, A3, A3, splat0)
+	
+	DFNMS(B04b, Bx4, A0)
+	SHUFB(A3, A3, A3, splat1)
+	
+	DFNMS(B14b, Bx4, A1)
+	DFNMS(B05b, Bx5, A0)
+	
+	DFNMS(B15b, Bx5, A1)
+	DFNMS(B06b, Bx6, A0)
+	DFNMS(B16b, Bx6, A1)
+	DFNMS(B07b, Bx7, A0)
+	
+	DFNMS(B17b, Bx7, A1)
+	LQD(A1, Ax, 128+64)
+	
+	DFNMS(B20b, Bx0, A2)
+	DFNMS(B30b, Bx0, A3)
+	
+	DFNMS(B21b, Bx1, A2)
+	DFNMS(B31b, Bx1, A3)
+	
+	DFNMS(B22b, Bx2, A2)
+	LQD(Bx0, Bx, 24+0)
+	
+	DFNMS(B32b, Bx2, A3)
+	LQD(Bx1, Bx, 24+1)
+	
+	DFNMS(B23b, Bx3, A2)
+	LQD(Bx2, Bx, 24+2)
+	
+	DFNMS(B33b, Bx3, A3)
+	SHUFB(A0, A1, A1, splat0)
+	
+	DFNMS(B24b, Bx4, A2)
+	SHUFB(A1, A1, A1, splat1)
+	
+	DFNMS(B34b, Bx4, A3)
+	LQD(Bx3, Bx, 24+3)
+	
+ib33:	
+	DFNMS(B25b, Bx5, A2)
+	LQD(Bx4, Bx, 24+4)
+	
+	DFNMS(B35b, Bx5, A3)
+	DFNMS(B26b, Bx6, A2)
+	
+	DFNMS(B36b, Bx6, A3)
+	DFNMS(B27b, Bx7, A2)
+	
+	DFNMS(B37b, Bx7, A3)
+	LQD(A3, Ax, 128+64+1)
+	
+	/* x = 4*(31-i) + 5 */
+
+	DFNMS(B00b, Bx0, A0)
+	LQD(Bx5, Bx, 24+5)
+	
+	DFNMS(B10b, Bx0, A1)
+	LQD(Bx6, Bx, 24+6)
+	DFNMS(B01b, Bx1, A0)
+	LQD(Bx7, Bx, 24+7)
+	
+	DFNMS(B11b, Bx1, A1)
+	DFNMS(B02b, Bx2, A0)
+
+	DFNMS(B12b, Bx2, A1)
+	SHUFB(A2, A3, A3, splat0)
+	
+	DFNMS(B03b, Bx3, A0)
+	SHUFB(A3, A3, A3, splat1)
+	
+	DFNMS(B13b, Bx3, A1)
+	DFNMS(B04b, Bx4, A0)
+	
+	DFNMS(B14b, Bx4, A1)
+	DFNMS(B05b, Bx5, A0)
+	
+	DFNMS(B15b, Bx5, A1)
+	DFNMS(B06b, Bx6, A0)
+	
+	DFNMS(B16b, Bx6, A1)
+	DFNMS(B07b, Bx7, A0)
+	
+	DFNMS(B17b, Bx7, A1)
+	LQD(A1, Ax, 256)
+	
+	a	Bx, Bx, x
+	DFNMS(B20b, Bx0, A2)
+	
+ib34:
+	DFNMS(B30b, Bx0, A3)
+	hbr	xloop2_branch, target
+	
+	DFNMS(B21b, Bx1, A2)
+	rotqbyi	Ax, Bx, 4
+	
+	DFNMS(B31b, Bx1, A3)
+	rotqbyi	x, Bx, 8
+	
+	DFNMS(B22b, Bx2, A2)
+	DFNMS(B32b, Bx2, A3)
+	
+	DFNMS(B23b, Bx3, A2)
+	SHUFB(A0, A1, A1, splat0)
+	
+	DFNMS(B33b, Bx3, A3)
+	SHUFB(A1, A1, A1, splat1)
+	
+	DFNMS(B24b, Bx4, A2)
+	LQD(Bx0, Bx, 0)
+	
+	DFNMS(B34b, Bx4, A3)
+	LQD(Bx1, Bx, 1)
+	
+	DFNMS(B25b, Bx5, A2)
+	LQD(Bx2, Bx, 2)
+	
+	DFNMS(B35b, Bx5, A3)
+	LQD(Bx3, Bx, 3)
+	
+	DFNMS(B26b, Bx6, A2)
+	LQD(Bx4, Bx, 4)
+	
+	DFNMS(B36b, Bx6, A3)
+	LQD(Bx5, Bx, 5)
+	
+	DFNMS(B27b, Bx7, A2)
+	LQD(Bx6, Bx, 6)
+	
+	DFNMS(B37b, Bx7, A3)
+	LQD(Bx7, Bx, 7)
+	
+	DFNMS(B00b, Bx0, A0)
+	LQD(A3, Ax, 1)
+
+	DFNMS(B10b, Bx0, A1)
+xloop2_branch:	
+	bi	target
+	
+	.align	6
+xloop2_done:	
+ib35:	
+	/* x = 4*(31-i) + 2 */
+	DFNMS(B01b, Bx1, A0)
+	lqr	target, const_0_32_m8_1K
+	
+	DFNMS(B11b, Bx1, A1)
+	DFNMS(B02b, Bx2, A0)
+	
+	DFNMS(B12b, Bx2, A1)
+	DFNMS(B03b, Bx3, A0)
+	
+	DFNMS(B13b, Bx3, A1)
+	SHUFB(A2, A3, A3, splat0)
+	
+	DFNMS(B04b, Bx4, A0)
+	a	bufB_bufA_i_Bi, bufB_bufA_i_Bi, target
+	
+	DFNMS(B14b, Bx4, A1)
+	SHUFB(A3, A3, A3, splat1)
+	
+	DFNMS(B05b, Bx5, A0)
+	rotqbyi	i, bufB_bufA_i_Bi, 8
+	
+	DFNMS(B15b, Bx5, A1)
+	rotqbyi	bufA, bufB_bufA_i_Bi, 4
+	
+	DFNMS(B06b, Bx6, A0)
+	rotqbyi	Bi, bufB_bufA_i_Bi, 12
+	
+	DFNMS(B16b, Bx6, A1)
+	DFNMS(B07b, Bx7, A0)
+	
+	DFNMS(B17b, Bx7, A1)
+	lqx	target, i, base
+
+	DFNMS(B20b, Bx0, A2)
+	LQD(A1, Ax, 64)
+	
+	DFNMS(B30b, Bx0, A3)
+	DFNMS(B21b, Bx1, A2)
+	
+	DFNMS(B31b, Bx1, A3)
+	LQD(Bx0, Bx, 8+0)
+	
+	DFNMS(B22b, Bx2, A2)
+	LQD(Bx1, Bx, 8+1)
+	
+	DFNMS(B32b, Bx2, A3)
+	rotqby	target, target, i
+	
+ib36:
+	DFNMS(B23b, Bx3, A2)
+	SHUFB(A0, A1, A1, splat0)
+	DFNMS(B33b, Bx3, A3)
+	SHUFB(A1, A1, A1, splat1)
+	
+	DFNMS(B24b, Bx4, A2)
+	LQD(Bx2, Bx, 8+2)
+	DFNMS(B34b, Bx4, A3)
+	LQD(Bx3, Bx, 8+3)
+	
+	DFNMS(B25b, Bx5, A2)
+	lnop
+	DFNMS(B35b, Bx5, A3)
+	hbr	iloop_branch, target
+	
+	DFNMS(B26b, Bx6, A2)
+	LQD(Bx4, Bx, 8+4)
+	DFNMS(B36b, Bx6, A3)
+	LQD(Bx5, Bx, 8+5)
+
+	DFNMS(B27b, Bx7, A2)
+	DFNMS(B37b, Bx7, A3)
+
+	/* x = 4*(31-i) + 3 */
+	DFNMS(B00b, Bx0, A0)
+	DFNMS(B10b, Bx0, A1)
+	
+	DFNMS(B01b, Bx1, A0)
+	LQD(A3, Ax, 64+1)
+	
+	DFNMS(B11b, Bx1, A1)
+	LQD(Bx6, Bx, 8+6)
+	DFNMS(B02b, Bx2, A0)
+	LQD(Bx7, Bx, 8+7)
+	
+	DFNMS(B12b, Bx2, A1)
+	DFNMS(B03b, Bx3, A0)
+	
+	DFNMS(B13b, Bx3, A1)
+	DFNMS(B04b, Bx4, A0)
+	
+	DFNMS(B14b, Bx4, A1)
+	SHUFB(A2, A3, A3, splat0)
+	
+ib37:
+	DFNMS(B05b, Bx5, A0)
+	SHUFB(A3, A3, A3, splat1)
+	
+	DFNMS(B15b, Bx5, A1)
+	DFNMS(B06b, Bx6, A0)
+	
+	DFNMS(B16b, Bx6, A1)
+	DFNMS(B07b, Bx7, A0)
+	
+	DFNMS(B17b, Bx7, A1)
+	LQD(A1, Ax, 128)
+	
+	DFNMS(B20b, Bx0, A2)
+	LQD(A3_0, Ax, 128+1)
+	
+	DFNMS(B30b, Bx0, A3)
+	LQD(A3_1, Ax, 192+1)
+	
+	DFNMS(B21b, Bx1, A2)
+	LQD(A3_2, Ax, 256+1)
+	
+	DFNMS(B31b, Bx1, A3)
+	DFNMS(B22b, Bx2, A2)
+	
+	DFNMS(B32b, Bx2, A3)
+	SHUFB(A1, A1, A1, splat1)
+	
+	DFNMS(B23b, Bx3, A2)
+	SHUFB(A2_0, A3_0, A3_0, splat0)
+	
+	DFNMS(B33b, Bx3, A3)
+	SHUFB(A3_0, A3_0, A3_0, splat1)
+	
+	DFNMS(B24b, Bx4, A2)
+	SHUFB(A2_1, A3_1, A3_1, splat0)
+	
+	DFNMS(B34b, Bx4, A3)
+	SHUFB(A3_1, A3_1, A3_1, splat1)
+	
+	DFNMS(B25b, Bx5, A2)
+	SHUFB(A3_2, A3_2, A3_2, splat1)
+	
+	DFNMS(B35b, Bx5, A3)
+	DFNMS(B26b, Bx6, A2)
+	
+	DFNMS(B36b, Bx6, A3)
+	DFNMS(B27b, Bx7, A2)
+	
+ib38:	
+	DFNMS(B37b, Bx7, A3)
+	DFNMS(B10b, B00b, A1)
+	
+	DFNMS(B11b, B01b, A1)
+	DFNMS(B12b, B02b, A1)
+	
+	DFNMS(B13b, B03b, A1)
+	DFNMS(B14b, B04b, A1)
+	
+	DFNMS(B15b, B05b, A1)
+	DFNMS(B16b, B06b, A1)
+	
+	DFNMS(B17b, B07b, A1)
+	LQD(B00a, Bi, 8*0+0)
+	
+	DFNMS(B20b, B00b, A2_0)
+	LQD(B01a, Bi, 8*0+1)
+	DFNMS(B21b, B01b, A2_0)	
+	LQD(B02a, Bi, 8*0+2)
+	DFNMS(B22b, B02b, A2_0)
+	LQD(B03a, Bi, 8*0+3)
+
+	DFNMS(B23b, B03b, A2_0)
+	DFNMS(B24b, B04b, A2_0)
+	
+	DFNMS(B25b, B05b, A2_0)
+	LQD(B04a, Bi, 8*0+4)
+	
+	DFNMS(B26b, B06b, A2_0)
+	LQD(B05a, Bi, 8*0+5)
+	
+	DFNMS(B27b, B07b, A2_0)
+	LQD(A1, bufA, 0)
+	
+	DFNMS(B30b, B00b, A3_0)
+	LQD(B06a, Bi, 8*0+6)
+	
+	DFNMS(B31b, B01b, A3_0)
+	LQD(B07a, Bi, 8*0+7)
+	
+	DFNMS(B32b, B02b, A3_0)
+	LQD(B10a, Bi, 8*1+0)
+	
+	DFNMS(B33b, B03b, A3_0)
+	LQD(B11a, Bi, 8*1+1)
+	
+ib39:
+	DFNMS(B34b, B04b, A3_0)
+	HBRP
+
+	DFNMS(B35b, B05b, A3_0)
+	LQD(B12a, Bi, 8*1+2)
+	
+	DFNMS(B36b, B06b, A3_0)
+	LQD(B13a, Bi, 8*1+3)
+	
+	DFNMS(B37b, B07b, A3_0)
+	LQD(B14a, Bi, 8*1+4)
+	
+	DFNMS(B20b, B10b, A2_1)
+	LQD(B15a, Bi, 8*1+5)
+	
+	DFNMS(B21b, B11b, A2_1)
+	LQD(B16a, Bi, 8*1+6)
+	
+	DFNMS(B22b, B12b, A2_1)
+	LQD(B17a, Bi, 8*1+7)
+	
+	DFNMS(B23b, B13b, A2_1)
+	LQD(B20a, Bi, 8*2+0)
+	
+	DFNMS(B24b, B14b, A2_1)
+	SHUFB(A0, A1, A1, splat0)
+	
+	DFNMS(B25b, B15b, A2_1)
+	SHUFB(A1, A1, A1, splat1)
+	
+	DFNMS(B26b, B16b, A2_1)
+	LQD(A3, bufA, 1)
+	
+	DFNMS(B27b, B17b, A2_1)
+	LQD(B21a, Bi, 8*2+1)
+	
+	DFNMS(B30b, B10b, A3_1)
+	LQD(B22a, Bi, 8*2+2)
+	
+	DFNMS(B31b, B11b, A3_1)
+	LQD(B23a, Bi, 8*2+3)
+	
+	DFNMS(B32b, B12b, A3_1)
+	LQD(B24a, Bi, 8*2+4)
+	
+	DFNMS(B33b, B13b, A3_1)
+	LQD(B25a, Bi, 8*2+5)
+	
+	
+ib40:	
+	DFNMS(B34b, B14b, A3_1)
+	LQD(B26a, Bi, 8*2+6)
+	
+	nop
+	LQD(B27a, Bi, 8*2+7)
+	
+	DFNMS(B35b, B15b, A3_1)
+	SHUFB(A2, A3, A3, splat0)
+
+	DFNMS(B36b, B16b, A3_1)
+	SHUFB(A3, A3, A3, splat1)
+	
+	DFNMS(B37b, B17b, A3_1)
+	LQD(B30a, Bi, 8*3+0)
+	
+	DFNMS(B30b, B20b, A3_2)
+	LQD(B31a, Bi, 8*3+1)
+	DFNMS(B31b, B21b, A3_2)
+	LQD(B32a, Bi, 8*3+2)
+	DFNMS(B32b, B22b, A3_2)
+	LQD(B33a, Bi, 8*3+3)
+	DFNMS(B33b, B23b, A3_2)
+	LQD(B34a, Bi, 8*3+4)
+	DFNMS(B34b, B24b, A3_2)
+	LQD(B35a, Bi, 8*3+5)
+	DFNMS(B35b, B25b, A3_2)
+	LQD(B36a, Bi, 8*3+6)
+	DFNMS(B36b, B26b, A3_2)
+	LQD(B37a, Bi, 8*3+7)
+	
+	DFNMS(B37b, B27b, A3_2)
+iloop_branch:	
+	bi	target
+	
+	.align	6
+iloop_done:	
+	// STORE END
+ib41:
+	HBRP	
+	STQD(B00b, Bx, 8*2+0)
+	
+	STQD(B01b, Bx, 8*2+1)
+	STQD(B02b, Bx, 8*2+2)
+	
+	STQD(B03b, Bx, 8*2+3)
+	STQD(B04b, Bx, 8*2+4)
+	
+	STQD(B05b, Bx, 8*2+5)
+	STQD(B06b, Bx, 8*2+6)
+	
+	STQD(B07b, Bx, 8*2+7)
+	STQD(B10b, Bx, 8*3+0)
+	
+	STQD(B11b, Bx, 8*3+1)
+	STQD(B12b, Bx, 8*3+2)
+	
+	STQD(B13b, Bx, 8*3+3)
+	STQD(B14b, Bx, 8*3+4)
+	
+	STQD(B15b, Bx, 8*3+5)
+	STQD(B16b, Bx, 8*3+6)
+	
+	STQD(B17b, Bx, 8*3+7)
+	STQD(B20b, Bx, 8*4+0)
+	
+	STQD(B21b, Bx, 8*4+1)
+	hbr	return, $0
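+	/* hint the function-return "bi $0" early, while stores drain */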
+	
+	STQD(B22b, Bx, 8*4+2)
+	STQD(B23b, Bx, 8*4+3)
+	
+	STQD(B24b, Bx, 8*4+4)
+	STQD(B25b, Bx, 8*4+5)
+	
+	STQD(B26b, Bx, 8*4+6)
+	STQD(B27b, Bx, 8*4+7)
+	
+	STQD(B30b, Bx, 8*5+0)
+	STQD(B31b, Bx, 8*5+1)
+	
+	STQD(B32b, Bx, 8*5+2)
+	STQD(B33b, Bx, 8*5+3)
+	
+	STQD(B34b, Bx, 8*5+4)
+	STQD(B35b, Bx, 8*5+5)
+	
+ib42:	
+	STQD(B36b, Bx, 8*5+6)
+	STQD(B37b, Bx, 8*5+7)
+	/* Restore the non-volatile registers */
+	
+	lqd	$80, -16*1($SP)
+	lqd	$81, -16*2($SP)
+
+	lqd	$82, -16*3($SP)
+	lqd	$83, -16*4($SP)
+
+	lqd	$84, -16*5($SP)
+	lqd	$85, -16*6($SP)
+
+	lqd	$86, -16*7($SP)
+	lqd	$87, -16*8($SP)
+
+	lqd	$88, -16*9($SP)
+	lqd	$89, -16*10($SP)
+
+	lqd	$90, -16*11($SP)
+	lqd	$91, -16*12($SP)
+
+	lqd	$92, -16*13($SP)
+	lqd	$93, -16*14($SP)
+
+	lqd	$94, -16*15($SP)
+	lqd	$95, -16*16($SP)
+
+	lqd	$96, -16*17($SP)
+	lqd	$97, -16*18($SP)
+
+	lqd	$98, -16*19($SP)
+	lqd	$99, -16*20($SP)
+
+	lqd	$100, -16*21($SP)
+	lqd	$101, -16*22($SP)
+return:	
+	bi	$0
Index: accel/lib/spu/accel_dtrsm_panel.c
===================================================================
RCS file: accel/lib/spu/accel_dtrsm_panel.c
diff -N accel/lib/spu/accel_dtrsm_panel.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/spu/accel_dtrsm_panel.c	22 Oct 2008 03:28:08 -0000	1.3
@@ -0,0 +1,250 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <spu_mfcio.h>
+#include <spu_intrinsics.h>
+#include "hpl_accel_spu.h"
+#include "accel_buffers.h"
+#include "accel_utils.h"
+#include "accel_dtrsm.h"
+
+void accel_dtrsm_panel(hpl_accel_init_parms_t *parms, 
+		       volatile hpl_accel_dtrsm_parms_t *cmd_parms)
+{
+  int i;
+  unsigned int idx, tag, next_tag;
+  unsigned int size, lda, stride;
+  unsigned int id;
+  unsigned long long a, b, c;
+  unsigned int a_hi, a_lo;
+  unsigned int b_hi, b_lo;
+  unsigned int c_hi, c_lo;
+  unsigned int list;
+  unsigned int n;
+  unsigned int span;
+  vec_uint4 ld;
+  vec_uint4 element, stride2, stride4, stride6;
+  vec_uint4 elementc, nextc, stride2c, stride4c, stride6c;
+  volatile void *lsa;
+#ifdef MATRIX_4GB_CROSSING
+  unsigned int list_size;
+#endif
+#if (HPL_ACCEL_SPES & 3) != 0
+  unsigned int stride0c, stride1c;
+  vec_uint4 blk_idx, next0c, next1c;
+#endif
+  
+  id = parms->id;
+
+  stride2c = ((vec_uint4){0, 2*64*sizeof(double), 0, 2*64*sizeof(double)});
+  stride4c = ((vec_uint4){0, 4*64*sizeof(double), 0, 4*64*sizeof(double)});
+  stride6c = ((vec_uint4){0, 6*64*sizeof(double), 0, 6*64*sizeof(double)});
+
+  elementc = (vec_uint4){0};		/* included just to eliminate a warning */
+  nextc    = (vec_uint4){0};		/* included just to eliminate a warning */
+
+  /* Wait for the transfer of the parameters to complete
+   */
+  DMA_WAIT_RECEIVE();
+  DMA_WAIT_REQUEST(-1);
+
+  /* Fetch the parameters 
+   */
+  a = cmd_parms->a;
+  b = cmd_parms->b;
+  c = cmd_parms->c;
+  ld = cmd_parms->ld;
+
+  lda = spu_extract(ld, 0);
+
+  /* DMA the entire 128x128 unit lower triangle into the LS. To reduce startup
+   * time, we will download only the necessary data columns in groups of 16 
+   * while preserving the cacheline alignment. The download proceeds
+   * from the smallest column to the largest.
+   */
+  a_hi = mfc_ea2h(a);
+  a_lo = mfc_ea2l(a);
+
+  lsa = (volatile void *)(&bufA_128x128[0]);
+  size = 128*sizeof(double);
+
+  /* Before starting, make sure all previous DMA transfers are completed so
+   * that all the LS buffers are known to be available.
+   */
+  DMA_WAIT_RECEIVE();
+
+  for (i=0; i<127; i++) {
+    unsigned int adjust;
+
+    spu_mfcdma64(lsa, a_hi, a_lo, size, 0, MFC_GET_CMD);
+
+    a_lo += lda;
+    lsa  += 128*sizeof(double);
+
+    /* Compute the next DMA parameters 
+     */
+    adjust = spu_extract(spu_and(spu_cmpeq(spu_promote((i & 15), 0), 14), 16*sizeof(double)), 0);
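+    /* Branch-free select: spu_cmpeq yields an all-ones word when
+     * (i & 15) == 14, so adjust is 16*sizeof(double) bytes for that
+     * column and 0 otherwise. */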
+
+    a_lo += adjust;
+    lsa  += adjust;
+    size -= adjust;
+  }
+
+  n = spu_extract(cmd_parms->dim, 0) / 16;
+  b_hi = mfc_ea2h(b);
+  b_lo = mfc_ea2l(b);
+  
+  b_lo += 16 * sizeof(double) * id;
+
+  /* Download the initial set of 16 B columns
+   */
+  stride = spu_extract(ld, 1);
+  
+  element = spu_add(spu_shuffle(spu_splats((unsigned int)(16*sizeof(double))),
+				spu_promote(b_lo, 0), 
+				((vec_uchar16){0,1,2,3, 16,17,18,19, 0,1,2,3, 16,17,18,19})),
+		    spu_rlmaskqwbyte(spu_promote(stride, 0), -12));
+
+  stride2 = spu_sl(spu_shuffle(ld, ld, ((vec_uchar16){128,128,128,128, 4,5,6,7, 128,128,128,128, 4,5,6,7})), 1);
+  stride4 = spu_add(stride2, stride2);
+  stride6 = spu_add(stride2, stride4);
+
+  fill_dma_list(&bufB_list[0][0], element, stride2, stride4, stride6);
+  spu_mfcdma64(&bufB_128x16[0][0], b_hi, (unsigned int)(&bufB_list[0][0]), 128*8, 0, MFC_GETL_CMD);
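+  /* The list size argument is 128*8 bytes: 128 list elements of 8 bytes
+   * each, one element per 16-double (128-byte) row of B. */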
+
+
+  c_hi = mfc_ea2h(c);
+  c_lo = mfc_ea2l(c);
+
+  span = spu_extract(cmd_parms->blk_col, 0) + id;
+  stride = spu_extract(ld, 2);
+  c_lo += (span & 3) * 16 * sizeof(double);
+  MATRIX_EA_UMADD32(c_hi, c_lo, (span/4), stride);
+  elementc = spu_add(spu_shuffle(element, spu_promote(c_lo, 0),
+				 ((vec_uchar16){0,1,2,3, 16,17,18,19, 0,1,2,3, 16,17,18,19})),
+		     ((vec_uint4){0, 0, 0, 64*sizeof(double)}));
+
+#if (HPL_ACCEL_SPES & 3) != 0
+  blk_idx = spu_splats(span & 3);
+
+  stride0c  = stride * (HPL_ACCEL_SPES / 4);
+  stride1c  = stride * (1 +  HPL_ACCEL_SPES / 4);
+  stride0c += ( HPL_ACCEL_SPES & 3)*16*(int)sizeof(double);
+  stride1c -= (-HPL_ACCEL_SPES & 3)*16*(int)sizeof(double);
+
+  next0c = spu_shuffle(spu_promote(stride0c, 0), spu_promote(stride0c, 0),
+		      ((vec_uchar16){128,128,128,128, 0,1,2,3, 128,128,128,128, 0,1,2,3}));
+  next1c = spu_shuffle(spu_promote(stride1c, 0), spu_promote(stride1c, 0),
+		      ((vec_uchar16){128,128,128,128, 0,1,2,3, 128,128,128,128, 0,1,2,3}));
+#else
+  stride *= HPL_ACCEL_SPES / 4;
+  nextc = spu_shuffle(spu_promote(stride, 0), 
+		      spu_promote(stride, 0), 
+		      ((vec_uchar16){128,128,128,128, 0,1,2,3, 128,128,128,128, 0,1,2,3}));
+#endif
+
+  idx = 1;
+  next_tag = 0;
+  tag = 1;
+
+  DMA_WAIT_REQUEST(1<<0);
+
+  for (i=id+HPL_ACCEL_SPES; i<(int)n; i+=HPL_ACCEL_SPES) {
+    /* Fetch the next buffer
+     */
+    element = spu_add(element, ((vec_uint4){0, HPL_ACCEL_SPES*16*sizeof(double), 0, HPL_ACCEL_SPES*16*sizeof(double)}));
+
+    fill_dma_list(&bufB_list[idx][0], element, stride2, stride4, stride6);
+    spu_mfcdma64(&bufB_128x16[tag][0], b_hi, (unsigned int)(&bufB_list[idx][0]), 128*8, tag, MFC_GETLB_CMD);
+    tag ^= 1;
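+    /* Double buffering: the GETLB above fills buffer "tag"; after the
+     * flip, "tag" names the buffer fetched last iteration, which is
+     * computed on below once its transfer completes. */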
+
+    /* Wait for the previous get to complete */
+    DMA_WAIT_RECEIVE();
+
+    /* Perform the dtrsm.
+     */
+    dtrsm_dp_128Cx16(&bufA[0][0], &bufB_128x16[tag][0]);
+
+    idx = (idx + 1) & 3;
+
+    /* Store the results back to system memory in c.
+     * Construct the DMA list to store to the blocked-format C matrix.
+     */
+    list = (unsigned int)(&bufB_list[idx+4][0]);
+    fill_dma_list((volatile vec_uint4 *)list, elementc, stride2c, stride4c, stride6c);
+      
+#if (HPL_ACCEL_SPES & 3) != 0
+    blk_idx = spu_add(blk_idx, (HPL_ACCEL_SPES & 3));
+    nextc = spu_sel(next0c, next1c, spu_cmpgt(blk_idx, 3));
+    blk_idx = spu_and(blk_idx, 3);
+#endif
+
+#ifdef MATRIX_4GB_CROSSING
+    /* The list's 4GB crossing can only occur at a block boundary, i.e.
+     * halfway through the list.
+     */
+    list_size = (spu_extract(elementc, 1) > (0xFFFFFFFF - M_SUB*M_SUB*8)) ? (M_SUB*8) : (M*8);
+
+    spu_mfcdma64(&bufB_128x16[tag][0], c_hi, list, list_size, tag, MFC_PUTL_CMD);
+    spu_mfcdma64(&bufB_128x16[tag][list_size], c_hi+1, list+(M_SUB*8), M*8-list_size, tag, MFC_PUTL_CMD);
+
+    c_hi += spu_extract(spu_genc(elementc, nextc), 1);
+#else
+    spu_mfcdma64(&bufB_128x16[tag][0], c_hi, list, 128*8, tag, MFC_PUTL_CMD);
+#endif
+    elementc = spu_add(elementc, nextc);
+
+    next_tag = tag ^ 1;
+
+    DMA_WAIT_REQUEST(1<<next_tag);
+  }
+
+  /* Wait for the previous GET to complete */
+  DMA_WAIT_RECEIVE();
+
+  /* Perform the dtrsm.
+   */
+  dtrsm_dp_128Cx16(&bufA[0][0], &bufB_128x16[next_tag][0]);
+
+  idx = (idx + 1) & 3;
+
+  /* Store the results back to system memory in c.
+   * Construct the DMA list to store to the blocked-format C matrix.
+   */
+  list = (unsigned int)(&bufB_list[idx+4][0]);
+  fill_dma_list((volatile vec_uint4 *)list, elementc, stride2c, stride4c, stride6c);
+
+#if (HPL_ACCEL_SPES & 3) != 0
+  blk_idx = spu_add(blk_idx, (HPL_ACCEL_SPES & 3));
+  nextc = spu_sel(next0c, next1c, spu_cmpgt(blk_idx, 3));
+  blk_idx = spu_and(blk_idx, 3);
+#endif
+
+#ifdef MATRIX_4GB_CROSSING
+  /* The list's 4GB crossing can only occur at a block boundary, i.e.
+   * halfway through the list.
+   */
+  list_size = (spu_extract(elementc, 1) > (0xFFFFFFFF - M_SUB*M_SUB*8)) ? (M_SUB*8) : (M*8);
+
+  spu_mfcdma64(&bufB_128x16[next_tag][0], c_hi, list, list_size, tag, MFC_PUTL_CMD);
+  spu_mfcdma64(&bufB_128x16[next_tag][list_size], c_hi+1, list+(M_SUB*8), M*8-list_size, tag, MFC_PUTL_CMD);
+
+  c_hi += spu_extract(spu_genc(elementc, nextc), 1);
+#else
+  spu_mfcdma64(&bufB_128x16[next_tag][0], c_hi, list, 128*8, tag, MFC_PUTL_CMD);
+#endif
+
+  elementc = spu_add(elementc, nextc);
+
+  /* Report completion status if requested. 
+   */
+  report_completion(id, cmd_parms->incomplete, tag);
+}
+
Index: accel/lib/spu/accel_mm_dp.c
===================================================================
RCS file: accel/lib/spu/accel_mm_dp.c
diff -N accel/lib/spu/accel_mm_dp.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/spu/accel_mm_dp.c	20 Aug 2008 03:57:53 -0000	1.8
@@ -0,0 +1,289 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2008                               */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#include <spu_intrinsics.h>
+
+
+/* Compute generalized matrix multiply of the form
+ * 
+ *   [C] -= [A] * [B]
+ *
+ * where 
+ *   C is a row ordered matrix of dimension n by m (width by height) elements 
+ *     with leading dimension n.
+ *   A is a row ordered matrix of dimension k by m elements with leading
+ *     dimension k.
+ *   B is a row ordered matrix of dimension n by k elements with leading
+ *     dimension n.
+ *
+ *  The computation is performed using sub-blocks of size 8x4 for B and C,
+ *  and 4x4 for A.
+ *
+ *  This blocking mandates that k and m be integral multiples of 4 and
+ *  that n be an integral multiple of 8.
+ *
+ *  NOTE: The leading dimensions are a double stride, not a vector stride.
+ */
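+
+/* For reference, a scalar sketch of what this routine computes; an
+ * illustration only, not part of the build:
+ *
+ *   for (y = 0; y < m; y++)
+ *     for (x = 0; x < n; x++)
+ *       for (i = 0; i < k; i++)
+ *         c[y*n + x] -= a[y*k + i] * b[i*n + x];
+ *
+ * with a, b, c taken as plain double pointers.
+ */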
+
+void mm_dp(int k, int m, int n, vector double *c, vector double *a, vector double *b)
+{
+  int i, x, y;
+  vector unsigned int pA, pB, pC;
+  vector unsigned int pA_start, pA_row, pB_start, pC_start;
+  vector unsigned int n1, n32, k1, k32;
+  vector double *pA0, *pA1, *pA2, *pA3;
+  vector double *pB0, *pB1, *pB2, *pB3;
+  vector double *pC0, *pC1, *pC2, *pC3;
+  vector double A00, A01, A10, A11, A20, A21, A30, A31;
+  vector double A00_0, A10_0, A20_0, A30_0;
+  vector double A00_1, A10_1, A20_1, A30_1;
+  vector double A01_0, A11_0, A21_0, A31_0;
+  vector double A01_1, A11_1, A21_1, A31_1;
+  vector double B00, B01, B02, B03;
+  vector double B10, B11, B12, B13;
+  vector double B20, B21, B22, B23;
+  vector double B30, B31, B32, B33;
+  vector double C00, C01, C02, C03;
+  vector double C10, C11, C12, C13;
+  vector double C20, C21, C22, C23;
+  vector double C30, C31, C32, C33;
+  vector unsigned int v_0123 = (vector unsigned int){0, 8, 16, 24};
+  vector unsigned int n_0123, k_0123;
+  vector unsigned char pat0 = {0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7};
+  vector unsigned char pat1;
+
+  pat1 = spu_or(pat0, 8);
+
+  /* Precompute 4 local store pointers for each of the buffer pointers
+   * 
+   *   pA_start = a+0*k, a+1*k, a+2*k, a+3*k
+   *   pB_start = b+0*n, b+1*n, b+2*n, b+3*n
+   *   pC_start = c+0*n, c+1*n, c+2*n, c+3*n
+   *
+   * where a, b, c are double pointers.
+   */
+  k1 = spu_splats((unsigned int)k);
+  n1 = spu_splats((unsigned int)n);
+  
+  k_0123 = spu_mulo((vector unsigned short)k1, (vector unsigned short)v_0123);
+  n_0123 = spu_mulo((vector unsigned short)n1, (vector unsigned short)v_0123);
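+  /* spu_mulo multiplies the low (odd-indexed) halfwords of each word;
+   * this suffices since k and n fit in 16 bits for any matrix resident
+   * in the 256 KB local store. */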
+  pA_start = spu_add(spu_splats((unsigned int)a), k_0123);
+  pB_start = spu_add(spu_splats((unsigned int)b), n_0123);
+  pC_start = spu_add(spu_splats((unsigned int)c), n_0123);
+
+  n32 = spu_sl(n1, 5);
+  k32 = spu_sl(k1, 5);
+
+  for (x=0; x<n; x+=8) {
+    pA_row = pA_start;
+    pC = pC_start;
+
+    for (y=0; y<m; y+=4) {
+      /* Fetch a 4x8 block of C */
+      pC0 = (vec_double2 *)spu_extract(pC, 0);
+      pC1 = (vec_double2 *)spu_extract(pC, 1);
+      pC2 = (vec_double2 *)spu_extract(pC, 2);
+      pC3 = (vec_double2 *)spu_extract(pC, 3);
+
+      C00 = pC0[0];
+      C01 = pC0[1];
+      C02 = pC0[2];
+      C03 = pC0[3];
+
+      C10 = pC1[0];
+      C11 = pC1[1];
+      C12 = pC1[2];
+      C13 = pC1[3];
+
+      C20 = pC2[0];
+      C21 = pC2[1];
+      C22 = pC2[2];
+      C23 = pC2[3];
+
+      C30 = pC3[0];
+      C31 = pC3[1];
+      C32 = pC3[2];
+      C33 = pC3[3];
+
+      pA = pA_row;
+      pB = pB_start;
+
+      for (i=0; i<k; i+=4) {
+	/* A column 0 */
+	pA0 = (vec_double2 *)spu_extract(pA, 0);
+	pA1 = (vec_double2 *)spu_extract(pA, 1);
+	pA2 = (vec_double2 *)spu_extract(pA, 2);
+	pA3 = (vec_double2 *)spu_extract(pA, 3);
+
+	A00 = pA0[0];
+	A10 = pA1[0];
+	A20 = pA2[0];
+	A30 = pA3[0];
+
+	pB0 = (vec_double2 *)spu_extract(pB, 0);
+	B00 = pB0[0];
+	B01 = pB0[1];
+	B02 = pB0[2];
+	B03 = pB0[3];
+
+	A00_0 = spu_shuffle(A00, A00, pat0);
+	A10_0 = spu_shuffle(A10, A10, pat0);
+	A20_0 = spu_shuffle(A20, A20, pat0);
+	A30_0 = spu_shuffle(A30, A30, pat0);
+
+	C00 = spu_nmsub(A00_0, B00, C00);
+	C01 = spu_nmsub(A00_0, B01, C01);
+	C02 = spu_nmsub(A00_0, B02, C02);
+	C03 = spu_nmsub(A00_0, B03, C03);
+
+	C10 = spu_nmsub(A10_0, B00, C10);
+	C11 = spu_nmsub(A10_0, B01, C11);
+	C12 = spu_nmsub(A10_0, B02, C12);
+	C13 = spu_nmsub(A10_0, B03, C13);
+
+	C20 = spu_nmsub(A20_0, B00, C20);
+	C21 = spu_nmsub(A20_0, B01, C21);
+	C22 = spu_nmsub(A20_0, B02, C22);
+	C23 = spu_nmsub(A20_0, B03, C23);
+
+	C30 = spu_nmsub(A30_0, B00, C30);
+	C31 = spu_nmsub(A30_0, B01, C31);
+	C32 = spu_nmsub(A30_0, B02, C32);
+	C33 = spu_nmsub(A30_0, B03, C33);
+
+	/* A column 1 */
+	pB1 = (vec_double2 *)spu_extract(pB, 1);
+	B10 = pB1[0];
+	B11 = pB1[1];
+	B12 = pB1[2];
+	B13 = pB1[3];
+
+	A00_1 = spu_shuffle(A00, A00, pat1);
+	A10_1 = spu_shuffle(A10, A10, pat1);
+	A20_1 = spu_shuffle(A20, A20, pat1);
+	A30_1 = spu_shuffle(A30, A30, pat1);
+
+	C00 = spu_nmsub(A00_1, B10, C00);
+	C01 = spu_nmsub(A00_1, B11, C01);
+	C02 = spu_nmsub(A00_1, B12, C02);
+	C03 = spu_nmsub(A00_1, B13, C03);
+
+	C10 = spu_nmsub(A10_1, B10, C10);
+	C11 = spu_nmsub(A10_1, B11, C11);
+	C12 = spu_nmsub(A10_1, B12, C12);
+	C13 = spu_nmsub(A10_1, B13, C13);
+
+	C20 = spu_nmsub(A20_1, B10, C20);
+	C21 = spu_nmsub(A20_1, B11, C21);
+	C22 = spu_nmsub(A20_1, B12, C22);
+	C23 = spu_nmsub(A20_1, B13, C23);
+
+	C30 = spu_nmsub(A30_1, B10, C30);
+	C31 = spu_nmsub(A30_1, B11, C31);
+	C32 = spu_nmsub(A30_1, B12, C32);
+	C33 = spu_nmsub(A30_1, B13, C33);
+
+	/* A column 2 */
+	A01 = pA0[1];
+	A11 = pA1[1];
+	A21 = pA2[1];
+	A31 = pA3[1];
+
+	pB2 = (vec_double2 *)spu_extract(pB, 2);
+	B20 = pB2[0];
+	B21 = pB2[1];
+	B22 = pB2[2];
+	B23 = pB2[3];
+
+	A01_0 = spu_shuffle(A01, A01, pat0);
+	A11_0 = spu_shuffle(A11, A11, pat0);
+	A21_0 = spu_shuffle(A21, A21, pat0);
+	A31_0 = spu_shuffle(A31, A31, pat0);
+
+	C00 = spu_nmsub(A01_0, B20, C00);
+	C01 = spu_nmsub(A01_0, B21, C01);
+	C02 = spu_nmsub(A01_0, B22, C02);
+	C03 = spu_nmsub(A01_0, B23, C03);
+
+	C10 = spu_nmsub(A11_0, B20, C10);
+	C11 = spu_nmsub(A11_0, B21, C11);
+	C12 = spu_nmsub(A11_0, B22, C12);
+	C13 = spu_nmsub(A11_0, B23, C13);
+
+	C20 = spu_nmsub(A21_0, B20, C20);
+	C21 = spu_nmsub(A21_0, B21, C21);
+	C22 = spu_nmsub(A21_0, B22, C22);
+	C23 = spu_nmsub(A21_0, B23, C23);
+
+	C30 = spu_nmsub(A31_0, B20, C30);
+	C31 = spu_nmsub(A31_0, B21, C31);
+	C32 = spu_nmsub(A31_0, B22, C32);
+	C33 = spu_nmsub(A31_0, B23, C33);
+
+	/* A column 3 */
+	pB3 = (vec_double2 *)spu_extract(pB, 3);
+	B30 = pB3[0];
+	B31 = pB3[1];
+	B32 = pB3[2];
+	B33 = pB3[3];
+
+	A01_1 = spu_shuffle(A01, A01, pat1);
+	A11_1 = spu_shuffle(A11, A11, pat1);
+	A21_1 = spu_shuffle(A21, A21, pat1);
+	A31_1 = spu_shuffle(A31, A31, pat1);
+
+	C00 = spu_nmsub(A01_1, B30, C00);
+	C01 = spu_nmsub(A01_1, B31, C01);
+	C02 = spu_nmsub(A01_1, B32, C02);
+	C03 = spu_nmsub(A01_1, B33, C03);
+
+	C10 = spu_nmsub(A11_1, B30, C10);
+	C11 = spu_nmsub(A11_1, B31, C11);
+	C12 = spu_nmsub(A11_1, B32, C12);
+	C13 = spu_nmsub(A11_1, B33, C13);
+
+	C20 = spu_nmsub(A21_1, B30, C20);
+	C21 = spu_nmsub(A21_1, B31, C21);
+	C22 = spu_nmsub(A21_1, B32, C22);
+	C23 = spu_nmsub(A21_1, B33, C23);
+
+	C30 = spu_nmsub(A31_1, B30, C30);
+	C31 = spu_nmsub(A31_1, B31, C31);
+	C32 = spu_nmsub(A31_1, B32, C32);
+	C33 = spu_nmsub(A31_1, B33, C33);
+
+	pA = spu_add(pA, 2*16);
+	pB = spu_add(pB, n32);
+      }
+      pA_row = spu_add(pA_row, k32);
+
+      /* Store the updated 4x8 block of C */
+      pC0[0] = C00;
+      pC0[1] = C01;
+      pC0[2] = C02;
+      pC0[3] = C03;
+
+      pC1[0] = C10;
+      pC1[1] = C11;
+      pC1[2] = C12;
+      pC1[3] = C13;
+
+      pC2[0] = C20;
+      pC2[1] = C21;
+      pC2[2] = C22;
+      pC2[3] = C23;
+
+      pC3[0] = C30;
+      pC3[1] = C31;
+      pC3[2] = C32;
+      pC3[3] = C33;
+
+      pC = spu_add(pC, n32);
+    }
+
+    pB_start = spu_add(pB_start, 4*16);
+    pC_start = spu_add(pC_start, 4*16);
+  }
+}
Index: accel/lib/spu/accel_mm_dp_64Cx64.S
===================================================================
RCS file: accel/lib/spu/accel_mm_dp_64Cx64.S
diff -N accel/lib/spu/accel_mm_dp_64Cx64.S
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/spu/accel_mm_dp_64Cx64.S	23 Oct 2008 21:20:24 -0000	1.4
@@ -0,0 +1,2323 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+/*
+ * SYNOPSIS:
+ *	void mm_dp_64Cx64(vec_double2 *blkC, vec_double2 *blkA, vec_double2 *blkB)
+ *
+ * DESCRIPTION: 
+ *      mm_dp_64Cx64 computes a double-precision matrix negative
+ *	multiply-subtract on 64x64 matrix blocks.
+ *
+ *		[C] -= [A] * [B]
+ *
+ *      This function assumes that matrices [B] and [C] are row ordered
+ *      and matrix [A] is column ordered.
+ *
+ *      This function can also be compiled for [A] being little-endian if
+ *	ACCEL_LITTLE_ENDIAN is defined.
+ *    
+ */
+	
+
+#if 1
+#define DFNMS(_d, _a, _b)	dfnms	_d, _a, _b
+#else
+#define DFNMS(_d, _a, _b)	fnms	_d, _a, _b, _d
+#endif	
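+/* Either form computes _d = _d - _a * _b; dfnms is the double-precision
+ * instruction, fnms the single-precision 4-operand variant. */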
+#define ADD(_d, _a, _b)		a	_d, _a, _b
+#define SHUFB(_d, _a, _b, _c)	shufb	_d, _a, _b, _c
+#define LQD(_d, _a, _idx)	lqd	_d, (_idx)*16(_a)
+#define STQD(_d, _a, _idx)	stqd	_d, (_idx)*16(_a)
+#define LR(_d, _a)		rotqbyi	_d, _a, 0
+#define HBRP()			hbrp	
+
+	
+#define blkCnext	$2
+	
+#define blkC		$3	
+#define blkA		$4
+#define blkB		$5	
+	
+/* Working variables
+ */
+#define blkA2		$6	
+#define blkB2		$7
+	
+#define A00		$8
+#define A01		$9
+#define A10		$10
+#define A11		$11
+#define A20		$12
+#define A21		$13
+#define A30		$14
+#define A31		$15
+	
+#define B00		$16
+#define B01		$17
+#define B02		$18
+#define B03		$19
+#define B10		$20
+#define B11		$21
+#define B12		$22
+#define B13		$23
+	
+#define C00		$24
+#define C01		$25
+#define C02		$26
+#define C03		$27
+#define C10		$28
+#define C11		$29
+#define C12		$30
+#define C13		$31
+#define C20		$32
+#define C21		$33
+#define C22		$34
+#define C23		$35
+#define C30		$36
+#define C31		$37
+#define C32		$38
+#define C33		$39
+
+#define C00next		$40
+#define C01next		$41
+#define C02next		$42
+#define C03next		$43
+#define C10next		$44
+#define C11next		$45
+#define C12next		$46
+#define C13next		$47
+#define C20next		$48
+#define C21next		$49
+#define C22next		$50
+#define C23next		$51
+#define C30next		$52
+#define C31next		$53
+#define C32next		$54
+#define C33next		$55
+
+#define pat0		$56
+#define pat1		$57
+#define offset		$58
+#define flowptr		$59	
+#define one		$60	
+#define pointers	$61	
+#define pat_blkA1	$62	
+#define pat_blkA2	$63	
+#define pat_blkB1	$64	
+#define pat_blkB2	$65	
+#define pat_blkC	$66
+#define pat_flowidx	$67
+#define pat_ptraddend	$68
+#define merge		$69
+#define blkA1B1		$70	
+#define blkA2B2		$71
+#define blkCflowidx	$72	
+#define sel_0F00	$73
+#define flowidx		$74	
+#define flowentryptr	$75	
+
+/* Aliases to reduce register usage
+ */	
+#define blkA1		blkA
+#define blkB1		blkB
+#define A02		A00
+#define A03		A01
+#define A12		A10
+#define A13		A11
+#define A22		A20
+#define A23		A21
+#define A32		A30
+#define A33		A31
+#define B20		B00
+#define B21		B01
+#define B22		B02
+#define B23		B03
+#define B30		B10
+#define B31		B11
+#define B32		B12
+#define B33		B13
+	
+#define step		offset
+#define flow		merge
+#define target		sel_0F00
+
+/* Q equals the number of quadwords per row */	
+#define	Q		32	
+
+/* The d-form instructions contain a signed 14-bit byte index. This offset
+ * provides an immediate reach of -16 to +15.998 matrix rows (where a row
+ * is 64 double-words). To extend addressability, we offset the
+ * array address by +16 rows.
+ */
+#define S14_OFF		(16*Q)
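+
+/* A worked check: a signed 14-bit byte index spans -8192..+8191 bytes;
+ * at Q = 32 quadwords (512 bytes) per row that is about -16..+16 rows,
+ * so biasing the base by 16 rows moves the window to rows 0..31. */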
+
+	
+	.data
+	.align	4
+splat_dw0:		
+#ifdef ACCEL_LITTLE_ENDIAN
+	.word	0x07060504, 0x03020100, 0x07060504, 0x03020100
+#else	/* BIG ENDIAN */
+	.word	0x00010203, 0x04050607, 0x00010203, 0x04050607
+#endif	
+shuf_blkA1:	
+	.word	0x80800001, 0x80808080, 0x80808080, 0x80808080
+shuf_ptraddend:
+	.word	0x06070A0B, 0x06070A0B, 0x02031213, 0x80808080
+shuf_pointers:	
+	.word	0x00010203, 0x04050607, 0x12138080, 0x80808080
+	
+/* The flow control array consists of 4 values for each of the 128 loop
+ * iterations. The 4 values are:
+ *   1) blkC pointer step (addend)
+ *   2) blkA pointer step (addend)
+ *   3) blkB pointer step (addend)
+ *   4) branch target
+ * The addends are divided by 16 so that pointer arithmetic can be performed
+ * using half-word arithmetic to eliminate an inner-loop, pipe 0, add.
+ *
+ * All pointers are maintained in a register called "pointers". The pointers
+ * are packed in a halfword vector as follows:
+ *
+ * +-------+-------+-------+-------+------+---------+--------+--------+  	
+ * | blkA1 | blkB1 | blkA2 | blkB2 | blkC | flowidx | unused | unused |
+ * +-------+-------+-------+-------+------+---------+--------+--------+  	
+ *
+ * flowidx is a byte offset for flow_entry. The flow_entry array contains
+ * the offset for the flow_control array. 
+ */
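+/* In effect, each pass through the loop fetches one byte of flow_entry
+ * through flowidx; that byte (0, 16, or 32) is the byte offset of the
+ * flow_control quadword to apply: step within the current row band,
+ * advance to the next band, or terminate the loop. */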
+flow_entry:
+        .rept           15
+          .byte		0, 0, 0, 0, 0, 0, 0, 16
+        .endr
+        .byte		0, 0, 0, 0, 0, 0, 0, 32
+
+flow_control:
+	.word		16*4/16, 0/16, 8*8/16, loop_next
+	.word		(16*4+16*3*Q)/16, 4*8/16, (8*8-16*Q)/16, loop_next
+	.word		0/16, 0/16, 0/16, loop_end
+	
+
+	.text
+	.align	6
+	
+	.global mm_dp_64Cx64
+mm_dp_64Cx64:
+	/* Pack all the pointers into the pointer variable.
+	 * Initialize all the variables needed in the inner loop.
+	 * Load initial block data from A (4x4), initial block of data from
+	 * B (4x8), and the first 12 quadwords of data from C (3x8 of 4x8).
+	 */
+ib0:
+	ila	offset, 16*S14_OFF
+	HBRP
+	
+	ilhu	merge, 0x0203
+	lqr	pat_blkA1, shuf_blkA1
+	
+	a	blkA1, blkA, offset
+	lqr	pat0, splat_dw0
+	
+	a	blkB1, blkB, offset
+	LQD(C00next, blkC, 0)
+	
+	ila	one, 1
+	LQD(C01next, blkC, 1)
+	
+	rotmi	blkA2, blkA1, -4
+	LQD(A00, blkA1, 0-16*Q)
+	
+	hbrr	 fwd_loop_start, loop_start
+	LQD(A20, blkA1, 1-16*Q)
+
+	orbi	pat_blkB1, pat_blkA1, 2
+        LQD(C02next, blkC, 2)	
+
+	orbi	pat1, pat0, 0x8
+        LQD(C03next, blkC, 3)	
+	
+	orbi	pat_blkA2, pat_blkA1, 4
+	lqr	pat_ptraddend, shuf_ptraddend
+
+	orbi	pat_blkB2, pat_blkA1, 6
+	LQD(B00, blkB1, 0-16*Q)
+	
+	orbi	pat_blkC,  pat_blkA1, 8
+	SHUFB(A10, A00, A00, pat1)
+	
+	rotmi	blkB2, blkB1, -4
+	SHUFB(A00, A00, A00, pat0)
+	
+	orbi	pat_flowidx, pat_blkA1, 10
+	LQD(B01, blkB1, 1-16*Q)
+
+	iohl	merge, 0x1213
+	LQD(B02, blkB1, 2-16*Q)
+	
+	ilh	offset, 0x400
+	LQD(B03, blkB1, 3-16*Q)
+	
+ib1:	
+	rotmi	blkCflowidx, blkC, -4
+	shufb	blkA1B1, blkA2, blkB2, merge
+	
+	ila	flowptr, flow_control
+	SHUFB(A30, A20, A20, pat1)
+
+	ori	blkCnext, blkC, 0
+	fsmbi	sel_0F00, 0x0F00
+	
+	ila	flowentryptr, flow_entry
+        LQD(C10next, blkC, 0+Q)
+
+	a	blkA2B2, blkA1B1, offset
+	SHUFB(A20, A20, A20, pat0)
+
+	lqr	merge, shuf_pointers
+        LQD(C11next, blkC, 1+Q)
+	
+        LQD(C12next, blkC, 2+Q)
+        LQD(C13next, blkC, 3+Q)
+	
+	selb	pointers, blkA1B1, blkA2B2, sel_0F00
+        LQD(C20next, blkC, 0+2*Q)
+	
+        LQD(C21next, blkC, 1+2*Q)
+        LQD(C22next, blkC, 2+2*Q)
+	
+	shufb	pointers, pointers, blkCflowidx, merge
+        LQD(C23next, blkC, 3+2*Q)
+	
+fwd_loop_start:	
+	br	loop_start
+
+	.align	6
+
+
+loop_next:
+ib2:	
+	#undef  OFFSET  
+	#define OFFSET	-16*Q
+	DFNMS(C00, A03, B30)
+	HBRP
+	DFNMS(C01, A03, B31)
+	LQD(A00, blkA1, OFFSET+0)
+	DFNMS(C02, A03, B32)
+	LQD(A20, blkA1, OFFSET+1)
+	DFNMS(C03, A03, B33)
+	LQD(B00, blkB1, OFFSET+0)
+	DFNMS(C10, A13, B30)
+	LQD(B01, blkB1, OFFSET+1)
+	DFNMS(C11, A13, B31)
+	LQD(B02, blkB1, OFFSET+2)
+	DFNMS(C12, A13, B32)
+	LQD(B03, blkB1, OFFSET+3)
+	DFNMS(C13, A13, B33)
+	SHUFB(A10, A00, A00, pat1)
+	DFNMS(C20, A23, B30)
+	SHUFB(A00, A00, A00, pat0)
+	DFNMS(C21, A23, B31)
+	SHUFB(A30, A20, A20, pat1)
+	DFNMS(C22, A23, B32)
+	SHUFB(A20, A20, A20, pat0)
+	DFNMS(C23, A23, B33)
+	STQD(C00, blkC, 0)
+	DFNMS(C30, A33, B30)
+	STQD(C01, blkC, 1)
+	DFNMS(C31, A33, B31)
+	STQD(C02, blkC, 2)
+	DFNMS(C32, A33, B32)
+	STQD(C03, blkC, 3)
+	DFNMS(C33, A33, B33)
+	STQD(C10, blkC, 0+Q)
+	
+loop_start:	
+ib3:	
+	/* Computation is performed on mini-block matrix operation of the
+	 * form:
+	 *
+	 *  [c] -= [a]*[b]
+	 *
+	 * where: [a] is a 4x4 sub-matrix of [A]
+         *        [b] is a 4x8 sub-matrix of [B]
+	 *        [c] is a 4x8 sub-matrix of [C]
+	 *
+	 *        A sub-matrix of m x n consists of m rows and n columns. 
+	 * 	
+	 * Therefore, in order to compute a resultant mini-block of [C],
+	 * 16 mini-block multiplies are performed. These are considered
+	 * stages of the matrix multiply.
+	 */
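+	/* (The 16 comes from the 64-element inner dimension: 64 / 4 columns
+	 *  per [a] mini-block = 16 stages.) */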
+	
+	/* STAGE 0 */
+	#undef  OFFSET  
+	#define OFFSET	-15*Q
+	DFNMS(C00next, A00, B00)
+	DFNMS(C01next, A00, B01)
+	DFNMS(C02next, A00, B02)
+	LQD(C30next, blkCnext, 0+3*Q)
+	DFNMS(C03next, A00, B03)
+        LQD(C31next, blkCnext, 1+3*Q)
+	DFNMS(C10next, A10, B00)
+        LQD(C32next, blkCnext, 2+3*Q)
+	DFNMS(C11next, A10, B01)
+	LQD(A01, blkA1, OFFSET+0)
+	DFNMS(C12next, A10, B02)
+        LQD(C33next, blkCnext, 3+3*Q)
+	DFNMS(C13next, A10, B03)
+	LQD(A21, blkA1, OFFSET+1)
+	DFNMS(C20next, A20, B00)
+	LQD(B10, blkB1, OFFSET+0)
+	DFNMS(C21next, A20, B01)
+	LQD(B11, blkB1, OFFSET+1)
+	DFNMS(C22next, A20, B02)
+	LQD(B12, blkB1, OFFSET+2)
+	DFNMS(C23next, A20, B03)
+	SHUFB(A11, A01, A01, pat1)
+	DFNMS(C30next, A30, B00)
+	SHUFB(A01, A01, A01, pat0)
+	DFNMS(C31next, A30, B01)
+	LQD(B13, blkB1, OFFSET+3)
+	DFNMS(C32next, A30, B02)
+	SHUFB(A31, A21, A21, pat1)
+	DFNMS(C33next, A30, B03)
+	SHUFB(A21, A21, A21, pat0)
+
+	#undef  OFFSET  
+	#define OFFSET	-14*Q
+	DFNMS(C00next, A01, B10)
+	LQD(A02, blkA1, OFFSET+0)
+	DFNMS(C01next, A01, B11)
+	LQD(A22, blkA1, OFFSET+1)
+	DFNMS(C02next, A01, B12)
+	LQD(B20, blkB1, OFFSET+0)
+	DFNMS(C03next, A01, B13)
+	LQD(B21, blkB1, OFFSET+1)
+	DFNMS(C10next, A11, B10)
+	LQD(B22, blkB1, OFFSET+2)
+	DFNMS(C11next, A11, B11)
+	LQD(B23, blkB1, OFFSET+3)
+	DFNMS(C12next, A11, B12)
+	SHUFB(A12, A02, A02, pat1)
+	DFNMS(C13next, A11, B13)
+	SHUFB(A02, A02, A02, pat0)
+	DFNMS(C20next, A21, B10)
+	SHUFB(A32, A22, A22, pat1)
+	DFNMS(C21next, A21, B11)
+	SHUFB(A22, A22, A22, pat0)
+	DFNMS(C22next, A21, B12)
+	DFNMS(C23next, A21, B13)
+	DFNMS(C30next, A31, B10)
+	STQD(C11, blkC, 1+Q)
+	DFNMS(C31next, A31, B11)
+	STQD(C12, blkC, 2+Q)
+	DFNMS(C32next, A31, B12)
+	DFNMS(C33next, A31, B13)
+
+	#undef  OFFSET  
+	#define OFFSET	-13*Q
+	DFNMS(C00next, A02, B20)
+	LQD(A03, blkA1, OFFSET+0)
+	DFNMS(C01next, A02, B21)
+	LQD(A23, blkA1, OFFSET+1)
+	DFNMS(C02next, A02, B22)
+	LQD(B30, blkB1, OFFSET+0)
+	DFNMS(C03next, A02, B23)
+	LQD(B31, blkB1, OFFSET+1)
+	DFNMS(C10next, A12, B20)
+	LQD(B32, blkB1, OFFSET+2)
+	DFNMS(C11next, A12, B21)
+	LQD(B33, blkB1, OFFSET+3)
+	DFNMS(C12next, A12, B22)
+	SHUFB(A13, A03, A03, pat1)
+	DFNMS(C13next, A12, B23)
+	SHUFB(A03, A03, A03, pat0)
+	DFNMS(C20next, A22, B20)
+	SHUFB(A33, A23, A23, pat1)
+	DFNMS(C21next, A22, B21)
+	SHUFB(A23, A23, A23, pat0)
+	DFNMS(C22next, A22, B22)
+	DFNMS(C23next, A22, B23)
+	DFNMS(C30next, A32, B20)
+	STQD(C13, blkC, 3+Q)
+	DFNMS(C31next, A32, B21)
+	STQD(C20, blkC, 0+2*Q)
+	DFNMS(C32next, A32, B22)
+	DFNMS(C33next, A32, B23)
+
+	#undef  OFFSET  
+	#define OFFSET	-12*Q
+	DFNMS(C00next, A03, B30)
+	LQD(A00, blkA1, OFFSET+0)
+	DFNMS(C01next, A03, B31)
+	LQD(A20, blkA1, OFFSET+1)
+	DFNMS(C02next, A03, B32)
+	LQD(B00, blkB1, OFFSET+0)
+	DFNMS(C03next, A03, B33)
+	LQD(B01, blkB1, OFFSET+1)
+	DFNMS(C10next, A13, B30)
+	LQD(B02, blkB1, OFFSET+2)
+	DFNMS(C11next, A13, B31)
+	LQD(B03, blkB1, OFFSET+3)
+	DFNMS(C12next, A13, B32)
+	SHUFB(A10, A00, A00, pat1)
+	DFNMS(C13next, A13, B33)
+	SHUFB(A00, A00, A00, pat0)
+	DFNMS(C20next, A23, B30)
+	SHUFB(A30, A20, A20, pat1)
+	DFNMS(C21next, A23, B31)
+	SHUFB(A20, A20, A20, pat0)
+	DFNMS(C22next, A23, B32)
+	DFNMS(C23next, A23, B33)
+	DFNMS(C30next, A33, B30)
+	STQD(C21, blkC, 1+2*Q)
+	DFNMS(C31next, A33, B31)
+	STQD(C22, blkC, 2+2*Q)
+	DFNMS(C32next, A33, B32)
+	DFNMS(C33next, A33, B33)
+	
+	/* STAGE 1 */
+	#undef  OFFSET  
+	#define OFFSET	-11*Q
+	DFNMS(C00next, A00, B00)
+	LQD(A01, blkA1, OFFSET+0)
+	DFNMS(C01next, A00, B01)
+	LQD(A21, blkA1, OFFSET+1)
+	DFNMS(C02next, A00, B02)
+	LQD(B10, blkB1, OFFSET+0)
+	DFNMS(C03next, A00, B03)
+	LQD(B11, blkB1, OFFSET+1)
+	DFNMS(C10next, A10, B00)
+	LQD(B12, blkB1, OFFSET+2)
+	DFNMS(C11next, A10, B01)
+	LQD(B13, blkB1, OFFSET+3)
+	DFNMS(C12next, A10, B02)
+	SHUFB(A11, A01, A01, pat1)
+	DFNMS(C13next, A10, B03)
+	SHUFB(A01, A01, A01, pat0)
+	DFNMS(C20next, A20, B00)
+	SHUFB(A31, A21, A21, pat1)
+	DFNMS(C21next, A20, B01)
+	SHUFB(A21, A21, A21, pat0)
+	DFNMS(C22next, A20, B02)
+	DFNMS(C23next, A20, B03)
+	DFNMS(C30next, A30, B00)
+	STQD(C23, blkC, 3+2*Q)
+	DFNMS(C31next, A30, B01)
+	STQD(C30, blkC, 0+3*Q)
+	DFNMS(C32next, A30, B02)
+	DFNMS(C33next, A30, B03)
+
+	#undef  OFFSET  
+	#define OFFSET	-10*Q
+	DFNMS(C00next, A01, B10)
+	LQD(A02, blkA1, OFFSET+0)
+	DFNMS(C01next, A01, B11)
+	LQD(A22, blkA1, OFFSET+1)
+	DFNMS(C02next, A01, B12)
+	LQD(B20, blkB1, OFFSET+0)
+	DFNMS(C03next, A01, B13)
+	LQD(B21, blkB1, OFFSET+1)
+	DFNMS(C10next, A11, B10)
+	LQD(B22, blkB1, OFFSET+2)
+	DFNMS(C11next, A11, B11)
+	LQD(B23, blkB1, OFFSET+3)
+	DFNMS(C12next, A11, B12)
+	SHUFB(A12, A02, A02, pat1)
+	DFNMS(C13next, A11, B13)
+	SHUFB(A02, A02, A02, pat0)
+	DFNMS(C20next, A21, B10)
+	SHUFB(A32, A22, A22, pat1)
+	DFNMS(C21next, A21, B11)
+	SHUFB(A22, A22, A22, pat0)
+	DFNMS(C22next, A21, B12)
+	DFNMS(C23next, A21, B13)
+	DFNMS(C30next, A31, B10)
+	STQD(C31, blkC, 1+3*Q)
+	DFNMS(C31next, A31, B11)
+	STQD(C32, blkC, 2+3*Q)
+	DFNMS(C32next, A31, B12)
+	DFNMS(C33next, A31, B13)
+	
+	#undef  OFFSET  
+	#define OFFSET	-9*Q
+	DFNMS(C00next, A02, B20)
+	LQD(A03, blkA1, OFFSET+0)
+	DFNMS(C01next, A02, B21)
+	LQD(A23, blkA1, OFFSET+1)
+	DFNMS(C02next, A02, B22)
+	LQD(B30, blkB1, OFFSET+0)
+	DFNMS(C03next, A02, B23)
+	LQD(B31, blkB1, OFFSET+1)
+	DFNMS(C10next, A12, B20)
+	LQD(B32, blkB1, OFFSET+2)
+	DFNMS(C11next, A12, B21)
+	LQD(B33, blkB1, OFFSET+3)
+	DFNMS(C12next, A12, B22)
+	SHUFB(A13, A03, A03, pat1)
+	DFNMS(C13next, A12, B23)
+	SHUFB(A03, A03, A03, pat0)
+	DFNMS(C20next, A22, B20)
+	SHUFB(A33, A23, A23, pat1)
+	DFNMS(C21next, A22, B21)
+	SHUFB(A23, A23, A23, pat0)
+	DFNMS(C22next, A22, B22)
+	LR(C00, C00next)
+	DFNMS(C23next, A22, B23)
+	LR(C01, C01next)
+	DFNMS(C30next, A32, B20)
+	STQD(C33, blkC, 3+3*Q)
+	DFNMS(C31next, A32, B21)
+	LR(C02, C02next)
+	DFNMS(C32next, A32, B22)
+	LR(C03, C03next)
+	DFNMS(C33next, A32, B23)
+	LR(C10, C10next)
+
+	#undef  OFFSET  
+	#define OFFSET	-8*Q
+	DFNMS(C00, A03, B30)
+	LR(C11, C11next)
+	DFNMS(C01, A03, B31)
+	LR(C12, C12next)
+	DFNMS(C02, A03, B32)
+	LQD(A00, blkA1, OFFSET+0)
+	DFNMS(C03, A03, B33)
+	LQD(A20, blkA1, OFFSET+1)
+	DFNMS(C10, A13, B30)
+	LQD(B00, blkB1, OFFSET+0)
+	DFNMS(C11, A13, B31)
+	LQD(B01, blkB1, OFFSET+1)
+	DFNMS(C12, A13, B32)
+	LQD(B02, blkB1, OFFSET+2)
+	DFNMS(C13next, A13, B33)
+	LQD(B03, blkB1, OFFSET+3)
+	DFNMS(C20next, A23, B30)
+	SHUFB(A10, A00, A00, pat1)
+	DFNMS(C21next, A23, B31)
+	SHUFB(A00, A00, A00, pat0)
+	DFNMS(C22next, A23, B32)
+	SHUFB(A30, A20, A20, pat1)
+	DFNMS(C23next, A23, B33)
+	SHUFB(A20, A20, A20, pat0)
+	DFNMS(C30next, A33, B30)
+	DFNMS(C31next, A33, B31)
+	DFNMS(C32next, A33, B32)
+	DFNMS(C33next, A33, B33)
+
+	/* STAGE 2 */
+	#undef  OFFSET  
+	#define OFFSET	-7*Q
+	DFNMS(C00, A00, B00)
+	LR(C13, C13next)
+	DFNMS(C01, A00, B01)
+	LR(C20, C20next)
+	DFNMS(C02, A00, B02)
+	LQD(A01, blkA1, OFFSET+0)
+	DFNMS(C03, A00, B03)
+	LQD(A21, blkA1, OFFSET+1)
+	DFNMS(C10, A10, B00)
+	LQD(B10, blkB1, OFFSET+0)
+	DFNMS(C11, A10, B01)
+	LQD(B11, blkB1, OFFSET+1)
+	DFNMS(C12, A10, B02)
+	LQD(B12, blkB1, OFFSET+2)
+	DFNMS(C13, A10, B03)
+	LQD(B13, blkB1, OFFSET+3)
+	DFNMS(C20, A20, B00)
+	SHUFB(A11, A01, A01, pat1)
+	DFNMS(C21next, A20, B01)
+	SHUFB(A01, A01, A01, pat0)
+	DFNMS(C22next, A20, B02)
+	SHUFB(A31, A21, A21, pat1)
+	DFNMS(C23next, A20, B03)
+	SHUFB(A21, A21, A21, pat0)
+	DFNMS(C30next, A30, B00)
+	DFNMS(C31next, A30, B01)
+	DFNMS(C32next, A30, B02)
+	DFNMS(C33next, A30, B03)
+
+	#undef  OFFSET  
+	#define OFFSET	-6*Q
+	DFNMS(C00, A01, B10)
+	LQD(A02, blkA1, OFFSET+0)
+	DFNMS(C01, A01, B11)
+	LQD(A22, blkA1, OFFSET+1)
+	DFNMS(C02, A01, B12)
+	LR(C21, C21next)
+	DFNMS(C03, A01, B13)
+	LR(C22, C22next)
+	DFNMS(C10, A11, B10)
+	LQD(B20, blkB1, OFFSET+0)
+	DFNMS(C11, A11, B11)
+	LQD(B21, blkB1, OFFSET+1)
+	DFNMS(C12, A11, B12)
+	LQD(B22, blkB1, OFFSET+2)
+	DFNMS(C13, A11, B13)
+	LQD(B23, blkB1, OFFSET+3)
+	DFNMS(C20, A21, B10)
+	SHUFB(A12, A02, A02, pat1)
+	DFNMS(C21, A21, B11)
+	SHUFB(A02, A02, A02, pat0)
+	DFNMS(C22, A21, B12)
+	SHUFB(A32, A22, A22, pat1)
+	DFNMS(C23next, A21, B13)
+	SHUFB(A22, A22, A22, pat0)
+	DFNMS(C30next, A31, B10)
+	DFNMS(C31next, A31, B11)
+	DFNMS(C32next, A31, B12)
+	DFNMS(C33next, A31, B13)
+
+	
+	#undef  OFFSET  
+	#define OFFSET	-5*Q
+	DFNMS(C00, A02, B20)
+	LQD(A03, blkA1, OFFSET+0)
+	DFNMS(C01, A02, B21)
+	LQD(A23, blkA1, OFFSET+1)
+	DFNMS(C02, A02, B22)
+	LQD(B30, blkB1, OFFSET+0)
+	DFNMS(C03, A02, B23)
+	LQD(B31, blkB1, OFFSET+1)
+	DFNMS(C10, A12, B20)
+	LR(C23, C23next)
+	DFNMS(C11, A12, B21)
+	LR(C30, C30next)
+	DFNMS(C12, A12, B22)
+	LQD(B32, blkB1, OFFSET+2)
+	DFNMS(C13, A12, B23)
+	LQD(B33, blkB1, OFFSET+3)
+	DFNMS(C20, A22, B20)
+	SHUFB(A13, A03, A03, pat1)
+	DFNMS(C21, A22, B21)
+	SHUFB(A03, A03, A03, pat0)
+	DFNMS(C22, A22, B22)
+	SHUFB(A33, A23, A23, pat1)
+	DFNMS(C23, A22, B23)
+	SHUFB(A23, A23, A23, pat0)
+	DFNMS(C30, A32, B20)
+	DFNMS(C31next, A32, B21)
+	DFNMS(C32next, A32, B22)
+	DFNMS(C33next, A32, B23)
+
+	#undef  OFFSET  
+	#define OFFSET	-4*Q
+	DFNMS(C00, A03, B30)
+	LQD(A00, blkA1, OFFSET+0)
+	DFNMS(C01, A03, B31)
+	LQD(A20, blkA1, OFFSET+1)
+	DFNMS(C02, A03, B32)
+	LQD(B00, blkB1, OFFSET+0)
+	DFNMS(C03, A03, B33)
+	LQD(B01, blkB1, OFFSET+1)
+	DFNMS(C10, A13, B30)
+	LQD(B02, blkB1, OFFSET+2)
+	DFNMS(C11, A13, B31)
+	LQD(B03, blkB1, OFFSET+3)
+	DFNMS(C12, A13, B32)
+	LR(C31, C31next)
+	DFNMS(C13, A13, B33)
+	LR(C32, C32next)
+	DFNMS(C20, A23, B30)
+	SHUFB(A10, A00, A00, pat1)
+	DFNMS(C21, A23, B31)
+	SHUFB(A00, A00, A00, pat0)
+	DFNMS(C22, A23, B32)
+	SHUFB(A30, A20, A20, pat1)
+	DFNMS(C23, A23, B33)
+	SHUFB(A20, A20, A20, pat0)
+	DFNMS(C30, A33, B30)
+	DFNMS(C31, A33, B31)
+	DFNMS(C32, A33, B32)
+	DFNMS(C33next, A33, B33)
+	
+	/* STAGE 3 */
+	#undef  OFFSET  
+	#define OFFSET	-3*Q
+	DFNMS(C00, A00, B00)
+	LQD(A01, blkA1, OFFSET+0)
+	DFNMS(C01, A00, B01)
+	LQD(A21, blkA1, OFFSET+1)
+	DFNMS(C02, A00, B02)
+	LQD(B10, blkB1, OFFSET+0)
+	DFNMS(C03, A00, B03)
+	LQD(B11, blkB1, OFFSET+1)
+	DFNMS(C10, A10, B00)
+	LQD(B12, blkB1, OFFSET+2)
+	DFNMS(C11, A10, B01)
+	LQD(B13, blkB1, OFFSET+3)
+	DFNMS(C12, A10, B02)
+	SHUFB(A11, A01, A01, pat1)
+	DFNMS(C13, A10, B03)
+	shufb	flowidx, pointers, pointers, pat_flowidx
+	DFNMS(C20, A20, B00)
+	LR(C33, C33next)
+	DFNMS(C21, A20, B01)
+	SHUFB(A01, A01, A01, pat0)
+	DFNMS(C22, A20, B02)
+	SHUFB(A31, A21, A21, pat1)
+	DFNMS(C23, A20, B03)
+	SHUFB(A21, A21, A21, pat0)
+	DFNMS(C30, A30, B00)
+	DFNMS(C31, A30, B01)
+	DFNMS(C32, A30, B02)
+	DFNMS(C33, A30, B03)
+	
+	
+	#undef  OFFSET  
+	#define OFFSET	-2*Q
+	DFNMS(C00, A01, B10)
+	LQD(A02, blkA1, OFFSET+0)
+	DFNMS(C01, A01, B11)
+	LQD(A22, blkA1, OFFSET+1)
+	DFNMS(C02, A01, B12)
+	LQD(B20, blkB1, OFFSET+0)
+	DFNMS(C03, A01, B13)
+	LQD(B21, blkB1, OFFSET+1)
+	DFNMS(C10, A11, B10)
+	lqx	flow, flowentryptr, flowidx	/* fetch flow_entry value */
+	DFNMS(C11, A11, B11)
+	LQD(B22, blkB1, OFFSET+2)
+	DFNMS(C12, A11, B12)
+	LQD(B23, blkB1, OFFSET+3)
+	DFNMS(C13, A11, B13)
+	SHUFB(A12, A02, A02, pat1)
+	DFNMS(C20, A21, B10)
+	SHUFB(A02, A02, A02, pat0)
+	DFNMS(C21, A21, B11)
+	SHUFB(A32, A22, A22, pat1)
+	DFNMS(C22, A21, B12)
+	SHUFB(A22, A22, A22, pat0)
+	DFNMS(C23, A21, B13)
+	rotqby	flowidx, flow, flowidx	/* rotate flow entry offset into byte 0 */
+	DFNMS(C30, A31, B10)
+	DFNMS(C31, A31, B11)
+	DFNMS(C32, A31, B12)
+	DFNMS(C33, A31, B13)
+
+	#undef  OFFSET  
+	#define OFFSET	-1*Q
+	DFNMS(C00, A02, B20)
+	LQD(A03, blkA1, OFFSET+0)
+	DFNMS(C01, A02, B21)
+	LQD(A23, blkA1, OFFSET+1)
+	DFNMS(C02, A02, B22)
+	LQD(B30, blkB1, OFFSET+0)
+	DFNMS(C03, A02, B23)
+	LQD(B31, blkB1, OFFSET+1)
+	DFNMS(C10, A12, B20)
+	LQD(B32, blkB1, OFFSET+2)
+	DFNMS(C11, A12, B21)
+	LQD(B33, blkB1, OFFSET+3)
+	DFNMS(C12, A12, B22)
+	rotqmbyi flowidx, flowidx, -3	/* move flow entry offset into byte 3 */
+	DFNMS(C13, A12, B23)
+	SHUFB(A13, A03, A03, pat1)
+	DFNMS(C20, A22, B20)
+	SHUFB(A03, A03, A03, pat0)
+	DFNMS(C21, A22, B21)
+	SHUFB(A33, A23, A23, pat1)
+	DFNMS(C22, A22, B22)
+	SHUFB(A23, A23, A23, pat0)
+	DFNMS(C23, A22, B23)
+	lqx	flow, flowptr, flowidx	/* fetch flow control quadword */
+	DFNMS(C30, A32, B20)
+	DFNMS(C31, A32, B21)
+	DFNMS(C32, A32, B22)
+	DFNMS(C33, A32, B23)
+
+	#undef  OFFSET  
+	#define OFFSET	0*Q
+	DFNMS(C00, A03, B30)
+	LQD(A00, blkA1, OFFSET+0)
+	DFNMS(C01, A03, B31)
+	LQD(A20, blkA1, OFFSET+1)
+	DFNMS(C02, A03, B32)
+	LQD(B00, blkB1, OFFSET+0)
+	DFNMS(C03, A03, B33)
+	LQD(B01, blkB1, OFFSET+1)
+	DFNMS(C10, A13, B30)
+	LQD(B02, blkB1, OFFSET+2)
+	DFNMS(C11, A13, B31)
+	LQD(B03, blkB1, OFFSET+3)
+	DFNMS(C12, A13, B32)
+	SHUFB(A10, A00, A00, pat1)
+	DFNMS(C13, A13, B33)
+	SHUFB(A00, A00, A00, pat0)
+	DFNMS(C20, A23, B30)
+	SHUFB(A30, A20, A20, pat1)
+	DFNMS(C21, A23, B31)
+	SHUFB(A20, A20, A20, pat0)
+	DFNMS(C22, A23, B32)
+	DFNMS(C23, A23, B33)
+	DFNMS(C30, A33, B30)
+	DFNMS(C31, A33, B31)
+	DFNMS(C32, A33, B32)
+	DFNMS(C33, A33, B33)
+	
+	
+	
+	/* STAGE 4 */
+	#undef  OFFSET  
+	#define OFFSET	1*Q
+	DFNMS(C00, A00, B00)
+	LQD(A01, blkA1, OFFSET+0)
+	DFNMS(C01, A00, B01)
+	LQD(A21, blkA1, OFFSET+1)
+	DFNMS(C02, A00, B02)
+	LQD(B10, blkB1, OFFSET+0)
+	DFNMS(C03, A00, B03)
+	LQD(B11, blkB1, OFFSET+1)
+	DFNMS(C10, A10, B00)
+	LQD(B12, blkB1, OFFSET+2)
+	DFNMS(C11, A10, B01)
+	LQD(B13, blkB1, OFFSET+3)
+	DFNMS(C12, A10, B02)
+	SHUFB(A11, A01, A01, pat1)
+	DFNMS(C13, A10, B03)
+	SHUFB(A01, A01, A01, pat0)
+	DFNMS(C20, A20, B00)
+	SHUFB(A31, A21, A21, pat1)
+	DFNMS(C21, A20, B01)
+	SHUFB(A21, A21, A21, pat0)
+	DFNMS(C22, A20, B02)
+	DFNMS(C23, A20, B03)
+	DFNMS(C30, A30, B00)
+	DFNMS(C31, A30, B01)
+	DFNMS(C32, A30, B02)
+	DFNMS(C33, A30, B03)
+
+	#undef  OFFSET  
+	#define OFFSET	2*Q
+	DFNMS(C00, A01, B10)
+	LQD(A02, blkA1, OFFSET+0)
+	DFNMS(C01, A01, B11)
+	LQD(A22, blkA1, OFFSET+1)
+	DFNMS(C02, A01, B12)
+	LQD(B20, blkB1, OFFSET+0)
+	DFNMS(C03, A01, B13)
+	LQD(B21, blkB1, OFFSET+1)
+	DFNMS(C10, A11, B10)
+	LQD(B22, blkB1, OFFSET+2)
+	DFNMS(C11, A11, B11)
+	LQD(B23, blkB1, OFFSET+3)
+	DFNMS(C12, A11, B12)
+	SHUFB(A12, A02, A02, pat1)
+	DFNMS(C13, A11, B13)
+	SHUFB(A02, A02, A02, pat0)
+	DFNMS(C20, A21, B10)
+	SHUFB(A32, A22, A22, pat1)
+	DFNMS(C21, A21, B11)
+	SHUFB(A22, A22, A22, pat0)
+	DFNMS(C22, A21, B12)
+	DFNMS(C23, A21, B13)
+	DFNMS(C30, A31, B10)
+	DFNMS(C31, A31, B11)
+	DFNMS(C32, A31, B12)
+	DFNMS(C33, A31, B13)
+
+	#undef  OFFSET  
+	#define OFFSET	3*Q
+	DFNMS(C00, A02, B20)
+	LQD(A03, blkA1, OFFSET+0)
+	DFNMS(C01, A02, B21)
+	LQD(A23, blkA1, OFFSET+1)
+	DFNMS(C02, A02, B22)
+	LQD(B30, blkB1, OFFSET+0)
+	DFNMS(C03, A02, B23)
+	LQD(B31, blkB1, OFFSET+1)
+	DFNMS(C10, A12, B20)
+	LQD(B32, blkB1, OFFSET+2)
+	DFNMS(C11, A12, B21)
+	LQD(B33, blkB1, OFFSET+3)
+	DFNMS(C12, A12, B22)
+	SHUFB(A13, A03, A03, pat1)
+	DFNMS(C13, A12, B23)
+	SHUFB(A03, A03, A03, pat0)
+	DFNMS(C20, A22, B20)
+	SHUFB(A33, A23, A23, pat1)
+	DFNMS(C21, A22, B21)
+	SHUFB(A23, A23, A23, pat0)
+	DFNMS(C22, A22, B22)
+	LR(blkC, blkCnext)
+	DFNMS(C23, A22, B23)
+	shufb	step, flow, one, pat_ptraddend
+	DFNMS(C30, A32, B20)
+	DFNMS(C31, A32, B21)
+	DFNMS(C32, A32, B22)
+	DFNMS(C33, A32, B23)
+
+	#undef  OFFSET  
+	#define OFFSET	4*Q
+	DFNMS(C00, A03, B30)
+	LQD(A00, blkA1, OFFSET+0)
+	DFNMS(C01, A03, B31)
+	LQD(A20, blkA1, OFFSET+1)
+	DFNMS(C02, A03, B32)
+	LQD(B00, blkB1, OFFSET+0)
+	DFNMS(C03, A03, B33)
+	LQD(B01, blkB1, OFFSET+1)
+	DFNMS(C10, A13, B30)
+	LQD(B02, blkB1, OFFSET+2)
+	DFNMS(C11, A13, B31)
+	LQD(B03, blkB1, OFFSET+3)
+	DFNMS(C12, A13, B32)
+	SHUFB(A10, A00, A00, pat1)
+	DFNMS(C13, A13, B33)
+	SHUFB(A00, A00, A00, pat0)
+	DFNMS(C20, A23, B30)
+	SHUFB(A30, A20, A20, pat1)
+	DFNMS(C21, A23, B31)
+	SHUFB(A20, A20, A20, pat0)
+	DFNMS(C22, A23, B32)
+	shufb	blkA2, pointers, pointers, pat_blkA2
+	DFNMS(C23, A23, B33)
+	shufb	blkB2, pointers, pointers, pat_blkB2
+	DFNMS(C30, A33, B30)
+	DFNMS(C31, A33, B31)
+	DFNMS(C32, A33, B32)
+	DFNMS(C33, A33, B33)
+	
+	ah	pointers, pointers, step
+	shlqbii	blkA2, blkA2, 4
+	
+
+	/* STAGE 5 */
+	#undef  OFFSET  
+	#define OFFSET	5*Q
+	DFNMS(C00, A00, B00)
+	LQD(A01, blkA1, OFFSET+0)
+	DFNMS(C01, A00, B01)
+	LQD(A21, blkA1, OFFSET+1)
+	DFNMS(C02, A00, B02)
+	LQD(B10, blkB1, OFFSET+0)
+	DFNMS(C03, A00, B03)
+	LQD(B11, blkB1, OFFSET+1)
+	DFNMS(C10, A10, B00)
+	LQD(B12, blkB1, OFFSET+2)
+	DFNMS(C11, A10, B01)
+	LQD(B13, blkB1, OFFSET+3)
+	DFNMS(C12, A10, B02)
+	SHUFB(A11, A01, A01, pat1)
+	DFNMS(C13, A10, B03)
+	SHUFB(A01, A01, A01, pat0)
+	DFNMS(C20, A20, B00)
+	SHUFB(A31, A21, A21, pat1)
+	DFNMS(C21, A20, B01)
+	SHUFB(A21, A21, A21, pat0)
+	DFNMS(C22, A20, B02)
+	shlqbii	blkB2, blkB2, 4
+	DFNMS(C23, A20, B03)
+	shufb	blkCnext, pointers, pointers, pat_blkC
+	DFNMS(C30, A30, B00)
+	DFNMS(C31, A30, B01)
+	DFNMS(C32, A30, B02)
+	DFNMS(C33, A30, B03)
+
+	#undef  OFFSET  
+	#define OFFSET	6*Q
+	DFNMS(C00, A01, B10)
+	LQD(A02, blkA1, OFFSET+0)
+	DFNMS(C01, A01, B11)
+	LQD(A22, blkA1, OFFSET+1)
+	DFNMS(C02, A01, B12)
+	LQD(B20, blkB1, OFFSET+0)
+	DFNMS(C03, A01, B13)
+	LQD(B21, blkB1, OFFSET+1)
+	DFNMS(C10, A11, B10)
+	LQD(B22, blkB1, OFFSET+2)
+	DFNMS(C11, A11, B11)
+	LQD(B23, blkB1, OFFSET+3)
+	DFNMS(C12, A11, B12)
+	SHUFB(A12, A02, A02, pat1)
+	DFNMS(C13, A11, B13)
+	SHUFB(A02, A02, A02, pat0)
+	DFNMS(C20, A21, B10)
+	SHUFB(A32, A22, A22, pat1)
+	DFNMS(C21, A21, B11)
+	SHUFB(A22, A22, A22, pat0)
+	DFNMS(C22, A21, B12)
+	rotqbyi	target, flow, 12
+	DFNMS(C23, A21, B13)
+	shlqbii	blkCnext, blkCnext, 4
+	DFNMS(C30, A31, B10)
+	DFNMS(C31, A31, B11)
+	DFNMS(C32, A31, B12)
+	DFNMS(C33, A31, B13)
+
+	#undef  OFFSET  
+	#define OFFSET	7*Q
+	DFNMS(C00, A02, B20)
+	LQD(A03, blkA1, OFFSET+0)
+	DFNMS(C01, A02, B21)
+	LQD(A23, blkA1, OFFSET+1)
+	DFNMS(C02, A02, B22)
+	LQD(B30, blkB1, OFFSET+0)
+	DFNMS(C03, A02, B23)
+	LQD(B31, blkB1, OFFSET+1)
+	DFNMS(C10, A12, B20)
+	LQD(B32, blkB1, OFFSET+2)
+	DFNMS(C11, A12, B21)
+	LQD(B33, blkB1, OFFSET+3)
+	DFNMS(C12, A12, B22)
+	SHUFB(A13, A03, A03, pat1)
+	DFNMS(C13, A12, B23)
+	SHUFB(A03, A03, A03, pat0)
+	DFNMS(C20, A22, B20)
+	SHUFB(A33, A23, A23, pat1)
+	DFNMS(C21, A22, B21)
+	SHUFB(A23, A23, A23, pat0)
+	DFNMS(C22, A22, B22)
+	LQD(C00next, blkCnext, 0)
+	DFNMS(C23, A22, B23)
+	LQD(C01next, blkCnext, 1)
+	DFNMS(C30, A32, B20)
+	DFNMS(C31, A32, B21)
+	DFNMS(C32, A32, B22)
+	DFNMS(C33, A32, B23)
+	
+
+
+	#undef  OFFSET  
+	#define OFFSET	8*Q
+	DFNMS(C00, A03, B30)
+	LQD(A00, blkA1, OFFSET+0)
+	DFNMS(C01, A03, B31)
+	LQD(A20, blkA1, OFFSET+1)
+	DFNMS(C02, A03, B32)
+	LQD(B00, blkB1, OFFSET+0)
+	DFNMS(C03, A03, B33)
+	LQD(B01, blkB1, OFFSET+1)
+	DFNMS(C10, A13, B30)
+	LQD(B02, blkB1, OFFSET+2)
+	DFNMS(C11, A13, B31)
+	LQD(B03, blkB1, OFFSET+3)
+	DFNMS(C12, A13, B32)
+	SHUFB(A10, A00, A00, pat1)
+	DFNMS(C13, A13, B33)
+	SHUFB(A00, A00, A00, pat0)
+	DFNMS(C20, A23, B30)
+	SHUFB(A30, A20, A20, pat1)
+	DFNMS(C21, A23, B31)
+	SHUFB(A20, A20, A20, pat0)
+	DFNMS(C22, A23, B32)
+	LQD(C02next, blkCnext, 2)
+	DFNMS(C23, A23, B33)
+	LQD(C03next, blkCnext, 3)
+	DFNMS(C30, A33, B30)
+	DFNMS(C31, A33, B31)
+	DFNMS(C32, A33, B32)
+	DFNMS(C33, A33, B33)
+	
+	/* STAGE 6 */
+	#undef  OFFSET  
+	#define OFFSET	9*Q
+	DFNMS(C00, A00, B00)
+	LQD(A01, blkA1, OFFSET+0)
+	DFNMS(C01, A00, B01)
+	LQD(A21, blkA1, OFFSET+1)
+	DFNMS(C02, A00, B02)
+	LQD(B10, blkB1, OFFSET+0)
+	DFNMS(C03, A00, B03)
+	LQD(B11, blkB1, OFFSET+1)
+	DFNMS(C10, A10, B00)
+	LQD(B12, blkB1, OFFSET+2)
+	DFNMS(C11, A10, B01)
+	LQD(B13, blkB1, OFFSET+3)
+	DFNMS(C12, A10, B02)
+	SHUFB(A11, A01, A01, pat1)
+	DFNMS(C13, A10, B03)
+	SHUFB(A01, A01, A01, pat0)
+	DFNMS(C20, A20, B00)
+	SHUFB(A31, A21, A21, pat1)
+	DFNMS(C21, A20, B01)
+	SHUFB(A21, A21, A21, pat0)
+	DFNMS(C22, A20, B02)
+	LQD(C10next, blkCnext, 0+Q)
+	DFNMS(C23, A20, B03)
+	LQD(C11next, blkCnext, 1+Q)
+	DFNMS(C30, A30, B00)
+	DFNMS(C31, A30, B01)
+	DFNMS(C32, A30, B02)
+	DFNMS(C33, A30, B03)
+
+	#undef  OFFSET  
+	#define OFFSET	10*Q
+	DFNMS(C00, A01, B10)
+	LQD(A02, blkA1, OFFSET+0)
+	DFNMS(C01, A01, B11)
+	LQD(A22, blkA1, OFFSET+1)
+	DFNMS(C02, A01, B12)
+	LQD(B20, blkB1, OFFSET+0)
+	DFNMS(C03, A01, B13)
+	LQD(B21, blkB1, OFFSET+1)
+	DFNMS(C10, A11, B10)
+	LQD(B22, blkB1, OFFSET+2)
+	DFNMS(C11, A11, B11)
+	LQD(B23, blkB1, OFFSET+3)
+	DFNMS(C12, A11, B12)
+	SHUFB(A12, A02, A02, pat1)
+	DFNMS(C13, A11, B13)
+	SHUFB(A02, A02, A02, pat0)
+	DFNMS(C20, A21, B10)
+	SHUFB(A32, A22, A22, pat1)
+	DFNMS(C21, A21, B11)
+	SHUFB(A22, A22, A22, pat0)
+	DFNMS(C22, A21, B12)
+	LQD(C12next, blkCnext, 2+Q)
+	DFNMS(C23, A21, B13)
+	LQD(C13next, blkCnext, 3+Q)
+	DFNMS(C30, A31, B10)
+	DFNMS(C31, A31, B11)
+	DFNMS(C32, A31, B12)
+	DFNMS(C33, A31, B13)
+
+	#undef  OFFSET  
+	#define OFFSET	11*Q
+	DFNMS(C00, A02, B20)
+	LQD(A03, blkA1, OFFSET+0)
+	DFNMS(C01, A02, B21)
+	LQD(A23, blkA1, OFFSET+1)
+	DFNMS(C02, A02, B22)
+	LQD(B30, blkB1, OFFSET+0)
+	DFNMS(C03, A02, B23)
+	LQD(B31, blkB1, OFFSET+1)
+	DFNMS(C10, A12, B20)
+	LQD(B32, blkB1, OFFSET+2)
+	DFNMS(C11, A12, B21)
+	LQD(B33, blkB1, OFFSET+3)
+	DFNMS(C12, A12, B22)
+	SHUFB(A13, A03, A03, pat1)
+	DFNMS(C13, A12, B23)
+	SHUFB(A03, A03, A03, pat0)
+	DFNMS(C20, A22, B20)
+	SHUFB(A33, A23, A23, pat1)
+	DFNMS(C21, A22, B21)
+	SHUFB(A23, A23, A23, pat0)
+	DFNMS(C22, A22, B22)
+	LQD(C20next, blkCnext, 0+2*Q)
+	DFNMS(C23, A22, B23)
+	LQD(C21next, blkCnext, 1+2*Q)
+	DFNMS(C30, A32, B20)
+	DFNMS(C31, A32, B21)
+	DFNMS(C32, A32, B22)
+	DFNMS(C33, A32, B23)
+
+	#undef  OFFSET  
+	#define OFFSET	12*Q
+	DFNMS(C00, A03, B30)
+	LQD(A00, blkA1, OFFSET+0)
+	DFNMS(C01, A03, B31)
+	LQD(A20, blkA1, OFFSET+1)
+	DFNMS(C02, A03, B32)
+	LQD(B00, blkB1, OFFSET+0)
+	DFNMS(C03, A03, B33)
+	LQD(B01, blkB1, OFFSET+1)
+	DFNMS(C10, A13, B30)
+	LQD(B02, blkB1, OFFSET+2)
+	DFNMS(C11, A13, B31)
+	LQD(B03, blkB1, OFFSET+3)
+	DFNMS(C12, A13, B32)
+	SHUFB(A10, A00, A00, pat1)
+	DFNMS(C13, A13, B33)
+	SHUFB(A00, A00, A00, pat0)
+	DFNMS(C20, A23, B30)
+	SHUFB(A30, A20, A20, pat1)
+	DFNMS(C21, A23, B31)
+	SHUFB(A20, A20, A20, pat0)
+	DFNMS(C22, A23, B32)
+	LQD(C22next, blkCnext, 2+2*Q)
+	DFNMS(C23, A23, B33)
+	LQD(C23next, blkCnext, 3+2*Q)
+	DFNMS(C30, A33, B30)
+	DFNMS(C31, A33, B31)
+	DFNMS(C32, A33, B32)
+	DFNMS(C33, A33, B33)
+	
+	/* STAGE 7 */
+	#undef  OFFSET  
+	#define OFFSET	13*Q
+	DFNMS(C00, A00, B00)
+	LQD(A01, blkA1, OFFSET+0)
+	DFNMS(C01, A00, B01)
+	LQD(A21, blkA1, OFFSET+1)
+	DFNMS(C02, A00, B02)
+	LQD(B10, blkB1, OFFSET+0)
+	DFNMS(C03, A00, B03)
+	LQD(B11, blkB1, OFFSET+1)
+	DFNMS(C10, A10, B00)
+	LQD(B12, blkB1, OFFSET+2)
+	DFNMS(C11, A10, B01)
+	LQD(B13, blkB1, OFFSET+3)
+	DFNMS(C12, A10, B02)
+	SHUFB(A11, A01, A01, pat1)
+	DFNMS(C13, A10, B03)
+	SHUFB(A01, A01, A01, pat0)
+	DFNMS(C20, A20, B00)
+	SHUFB(A31, A21, A21, pat1)
+	DFNMS(C21, A20, B01)
+	SHUFB(A21, A21, A21, pat0)
+	DFNMS(C22, A20, B02)
+	DFNMS(C23, A20, B03)
+	DFNMS(C30, A30, B00)
+	DFNMS(C31, A30, B01)
+	DFNMS(C32, A30, B02)
+	DFNMS(C33, A30, B03)
+
+	#undef  OFFSET  
+	#define OFFSET	14*Q
+	DFNMS(C00, A01, B10)
+	LQD(A02, blkA1, OFFSET+0)
+	DFNMS(C01, A01, B11)
+	LQD(A22, blkA1, OFFSET+1)
+	DFNMS(C02, A01, B12)
+	LQD(B20, blkB1, OFFSET+0)
+	DFNMS(C03, A01, B13)
+	LQD(B21, blkB1, OFFSET+1)
+	DFNMS(C10, A11, B10)
+	LQD(B22, blkB1, OFFSET+2)
+	DFNMS(C11, A11, B11)
+	LQD(B23, blkB1, OFFSET+3)
+	DFNMS(C12, A11, B12)
+	SHUFB(A12, A02, A02, pat1)
+	DFNMS(C13, A11, B13)
+	SHUFB(A02, A02, A02, pat0)
+	DFNMS(C20, A21, B10)
+	SHUFB(A32, A22, A22, pat1)
+	DFNMS(C21, A21, B11)
+	SHUFB(A22, A22, A22, pat0)
+	DFNMS(C22, A21, B12)
+	DFNMS(C23, A21, B13)
+	DFNMS(C30, A31, B10)
+	DFNMS(C31, A31, B11)
+	DFNMS(C32, A31, B12)
+	DFNMS(C33, A31, B13)
+
+	#undef  OFFSET  
+	#define OFFSET	15*Q
+	DFNMS(C00, A02, B20)
+	LQD(A03, blkA1, OFFSET+0)
+	DFNMS(C01, A02, B21)
+	LQD(A23, blkA1, OFFSET+1)
+	DFNMS(C02, A02, B22)
+	LQD(B30, blkB1, OFFSET+0)
+	DFNMS(C03, A02, B23)
+	LQD(B31, blkB1, OFFSET+1)
+	DFNMS(C10, A12, B20)
+	LQD(B32, blkB1, OFFSET+2)
+	DFNMS(C11, A12, B21)
+	LQD(B33, blkB1, OFFSET+3)
+	DFNMS(C12, A12, B22)
+	SHUFB(A13, A03, A03, pat1)
+	DFNMS(C13, A12, B23)
+	SHUFB(A03, A03, A03, pat0)
+	DFNMS(C20, A22, B20)
+	SHUFB(A33, A23, A23, pat1)
+	DFNMS(C21, A22, B21)
+	SHUFB(A23, A23, A23, pat0)
+	DFNMS(C22, A22, B22)
+	DFNMS(C23, A22, B23)
+	DFNMS(C30, A32, B20)
+	DFNMS(C31, A32, B21)
+	DFNMS(C32, A32, B22)
+	DFNMS(C33, A32, B23)
+
+	#undef  OFFSET  
+	#define OFFSET	-16*Q
+	DFNMS(C00, A03, B30)
+	LQD(A00, blkA2, OFFSET+0)
+	DFNMS(C01, A03, B31)
+	LQD(A20, blkA2, OFFSET+1)
+	DFNMS(C02, A03, B32)
+	LQD(B00, blkB2, OFFSET+0)
+	DFNMS(C03, A03, B33)
+	LQD(B01, blkB2, OFFSET+1)
+	DFNMS(C10, A13, B30)
+	LQD(B02, blkB2, OFFSET+2)
+	DFNMS(C11, A13, B31)
+	LQD(B03, blkB2, OFFSET+3)
+	DFNMS(C12, A13, B32)
+	SHUFB(A10, A00, A00, pat1)
+	DFNMS(C13, A13, B33)
+	SHUFB(A00, A00, A00, pat0)
+	DFNMS(C20, A23, B30)
+	SHUFB(A30, A20, A20, pat1)
+	DFNMS(C21, A23, B31)
+	SHUFB(A20, A20, A20, pat0)
+	DFNMS(C22, A23, B32)
+	DFNMS(C23, A23, B33)
+	DFNMS(C30, A33, B30)
+	DFNMS(C31, A33, B31)
+	DFNMS(C32, A33, B32)
+	DFNMS(C33, A33, B33)
+	
+	/* STAGE 8 */
+	#undef  OFFSET  
+	#define OFFSET	-15*Q
+	DFNMS(C00, A00, B00)
+	LQD(A01, blkA2, OFFSET+0)
+	DFNMS(C01, A00, B01)
+	LQD(A21, blkA2, OFFSET+1)
+	DFNMS(C02, A00, B02)
+	LQD(B10, blkB2, OFFSET+0)
+	DFNMS(C03, A00, B03)
+	LQD(B11, blkB2, OFFSET+1)
+	DFNMS(C10, A10, B00)
+	LQD(B12, blkB2, OFFSET+2)
+	DFNMS(C11, A10, B01)
+	LQD(B13, blkB2, OFFSET+3)
+	DFNMS(C12, A10, B02)
+	SHUFB(A11, A01, A01, pat1)
+	DFNMS(C13, A10, B03)
+	SHUFB(A01, A01, A01, pat0)
+	DFNMS(C20, A20, B00)
+	SHUFB(A31, A21, A21, pat1)
+	DFNMS(C21, A20, B01)
+	SHUFB(A21, A21, A21, pat0)
+	DFNMS(C22, A20, B02)
+	DFNMS(C23, A20, B03)
+	DFNMS(C30, A30, B00)
+	DFNMS(C31, A30, B01)
+	DFNMS(C32, A30, B02)
+	DFNMS(C33, A30, B03)
+
+	#undef  OFFSET  
+	#define OFFSET	-14*Q
+	DFNMS(C00, A01, B10)
+	LQD(A02, blkA2, OFFSET+0)
+	DFNMS(C01, A01, B11)
+	LQD(A22, blkA2, OFFSET+1)
+	DFNMS(C02, A01, B12)
+	LQD(B20, blkB2, OFFSET+0)
+	DFNMS(C03, A01, B13)
+	LQD(B21, blkB2, OFFSET+1)
+	DFNMS(C10, A11, B10)
+	LQD(B22, blkB2, OFFSET+2)
+	DFNMS(C11, A11, B11)
+	LQD(B23, blkB2, OFFSET+3)
+	DFNMS(C12, A11, B12)
+	SHUFB(A12, A02, A02, pat1)
+	DFNMS(C13, A11, B13)
+	SHUFB(A02, A02, A02, pat0)
+	DFNMS(C20, A21, B10)
+	SHUFB(A32, A22, A22, pat1)
+	DFNMS(C21, A21, B11)
+	SHUFB(A22, A22, A22, pat0)
+	DFNMS(C22, A21, B12)
+	DFNMS(C23, A21, B13)
+	DFNMS(C30, A31, B10)
+	DFNMS(C31, A31, B11)
+	DFNMS(C32, A31, B12)
+	DFNMS(C33, A31, B13)
+
+	#undef  OFFSET  
+	#define OFFSET	-13*Q
+	DFNMS(C00, A02, B20)
+	LQD(A03, blkA2, OFFSET+0)
+	DFNMS(C01, A02, B21)
+	LQD(A23, blkA2, OFFSET+1)
+	DFNMS(C02, A02, B22)
+	LQD(B30, blkB2, OFFSET+0)
+	DFNMS(C03, A02, B23)
+	LQD(B31, blkB2, OFFSET+1)
+	DFNMS(C10, A12, B20)
+	LQD(B32, blkB2, OFFSET+2)
+	DFNMS(C11, A12, B21)
+	LQD(B33, blkB2, OFFSET+3)
+	DFNMS(C12, A12, B22)
+	SHUFB(A13, A03, A03, pat1)
+	DFNMS(C13, A12, B23)
+	SHUFB(A03, A03, A03, pat0)
+	DFNMS(C20, A22, B20)
+	SHUFB(A33, A23, A23, pat1)
+	DFNMS(C21, A22, B21)
+	SHUFB(A23, A23, A23, pat0)
+	DFNMS(C22, A22, B22)
+	DFNMS(C23, A22, B23)
+	DFNMS(C30, A32, B20)
+	DFNMS(C31, A32, B21)
+	DFNMS(C32, A32, B22)
+	DFNMS(C33, A32, B23)
+
+	#undef  OFFSET  
+	#define OFFSET	-12*Q
+	DFNMS(C00, A03, B30)
+	LQD(A00, blkA2, OFFSET+0)
+	DFNMS(C01, A03, B31)
+	LQD(A20, blkA2, OFFSET+1)
+	DFNMS(C02, A03, B32)
+	LQD(B00, blkB2, OFFSET+0)
+	DFNMS(C03, A03, B33)
+	LQD(B01, blkB2, OFFSET+1)
+	DFNMS(C10, A13, B30)
+	LQD(B02, blkB2, OFFSET+2)
+	DFNMS(C11, A13, B31)
+	LQD(B03, blkB2, OFFSET+3)
+	DFNMS(C12, A13, B32)
+	SHUFB(A10, A00, A00, pat1)
+	DFNMS(C13, A13, B33)
+	SHUFB(A00, A00, A00, pat0)
+	DFNMS(C20, A23, B30)
+	SHUFB(A30, A20, A20, pat1)
+	DFNMS(C21, A23, B31)
+	SHUFB(A20, A20, A20, pat0)
+	DFNMS(C22, A23, B32)
+	DFNMS(C23, A23, B33)
+	DFNMS(C30, A33, B30)
+	DFNMS(C31, A33, B31)
+	DFNMS(C32, A33, B32)
+	DFNMS(C33, A33, B33)
+	
+	/* STAGE 9 */
+	#undef  OFFSET  
+	#define OFFSET	-11*Q
+	DFNMS(C00, A00, B00)
+	LQD(A01, blkA2, OFFSET+0)
+	DFNMS(C01, A00, B01)
+	LQD(A21, blkA2, OFFSET+1)
+	DFNMS(C02, A00, B02)
+	LQD(B10, blkB2, OFFSET+0)
+	DFNMS(C03, A00, B03)
+	LQD(B11, blkB2, OFFSET+1)
+	DFNMS(C10, A10, B00)
+	LQD(B12, blkB2, OFFSET+2)
+	DFNMS(C11, A10, B01)
+	LQD(B13, blkB2, OFFSET+3)
+	DFNMS(C12, A10, B02)
+	SHUFB(A11, A01, A01, pat1)
+	DFNMS(C13, A10, B03)
+	SHUFB(A01, A01, A01, pat0)
+	DFNMS(C20, A20, B00)
+	SHUFB(A31, A21, A21, pat1)
+	DFNMS(C21, A20, B01)
+	SHUFB(A21, A21, A21, pat0)
+	DFNMS(C22, A20, B02)
+	DFNMS(C23, A20, B03)
+	DFNMS(C30, A30, B00)
+	DFNMS(C31, A30, B01)
+	DFNMS(C32, A30, B02)
+	DFNMS(C33, A30, B03)
+
+	#undef  OFFSET  
+	#define OFFSET	-10*Q
+	DFNMS(C00, A01, B10)
+	LQD(A02, blkA2, OFFSET+0)
+	DFNMS(C01, A01, B11)
+	LQD(A22, blkA2, OFFSET+1)
+	DFNMS(C02, A01, B12)
+	LQD(B20, blkB2, OFFSET+0)
+	DFNMS(C03, A01, B13)
+	LQD(B21, blkB2, OFFSET+1)
+	DFNMS(C10, A11, B10)
+	LQD(B22, blkB2, OFFSET+2)
+	DFNMS(C11, A11, B11)
+	LQD(B23, blkB2, OFFSET+3)
+	DFNMS(C12, A11, B12)
+	SHUFB(A12, A02, A02, pat1)
+	DFNMS(C13, A11, B13)
+	SHUFB(A02, A02, A02, pat0)
+	DFNMS(C20, A21, B10)
+	SHUFB(A32, A22, A22, pat1)
+	DFNMS(C21, A21, B11)
+	SHUFB(A22, A22, A22, pat0)
+	DFNMS(C22, A21, B12)
+	DFNMS(C23, A21, B13)
+	DFNMS(C30, A31, B10)
+	DFNMS(C31, A31, B11)
+	DFNMS(C32, A31, B12)
+	DFNMS(C33, A31, B13)
+
+	#undef  OFFSET  
+	#define OFFSET	-9*Q
+	DFNMS(C00, A02, B20)
+	LQD(A03, blkA2, OFFSET+0)
+	DFNMS(C01, A02, B21)
+	LQD(A23, blkA2, OFFSET+1)
+	DFNMS(C02, A02, B22)
+	LQD(B30, blkB2, OFFSET+0)
+	DFNMS(C03, A02, B23)
+	LQD(B31, blkB2, OFFSET+1)
+	DFNMS(C10, A12, B20)
+	LQD(B32, blkB2, OFFSET+2)
+	DFNMS(C11, A12, B21)
+	LQD(B33, blkB2, OFFSET+3)
+	DFNMS(C12, A12, B22)
+	SHUFB(A13, A03, A03, pat1)
+	DFNMS(C13, A12, B23)
+	SHUFB(A03, A03, A03, pat0)
+	DFNMS(C20, A22, B20)
+	SHUFB(A33, A23, A23, pat1)
+	DFNMS(C21, A22, B21)
+	SHUFB(A23, A23, A23, pat0)
+	DFNMS(C22, A22, B22)
+	DFNMS(C23, A22, B23)
+	DFNMS(C30, A32, B20)
+	DFNMS(C31, A32, B21)
+	DFNMS(C32, A32, B22)
+	DFNMS(C33, A32, B23)
+
+	#undef  OFFSET  
+	#define OFFSET	-8*Q
+	DFNMS(C00, A03, B30)
+	LQD(A00, blkA2, OFFSET+0)
+	DFNMS(C01, A03, B31)
+	LQD(A20, blkA2, OFFSET+1)
+	DFNMS(C02, A03, B32)
+	LQD(B00, blkB2, OFFSET+0)
+	DFNMS(C03, A03, B33)
+	LQD(B01, blkB2, OFFSET+1)
+	DFNMS(C10, A13, B30)
+	LQD(B02, blkB2, OFFSET+2)
+	DFNMS(C11, A13, B31)
+	LQD(B03, blkB2, OFFSET+3)
+	DFNMS(C12, A13, B32)
+	SHUFB(A10, A00, A00, pat1)
+	DFNMS(C13, A13, B33)
+	SHUFB(A00, A00, A00, pat0)
+	DFNMS(C20, A23, B30)
+	SHUFB(A30, A20, A20, pat1)
+	DFNMS(C21, A23, B31)
+	SHUFB(A20, A20, A20, pat0)
+	DFNMS(C22, A23, B32)
+	DFNMS(C23, A23, B33)
+	DFNMS(C30, A33, B30)
+	DFNMS(C31, A33, B31)
+	DFNMS(C32, A33, B32)
+	DFNMS(C33, A33, B33)
+
+	/* STAGE 10 */
+	#undef  OFFSET  
+	#define OFFSET	-7*Q
+	DFNMS(C00, A00, B00)
+	LQD(A01, blkA2, OFFSET+0)
+	DFNMS(C01, A00, B01)
+	LQD(A21, blkA2, OFFSET+1)
+	DFNMS(C02, A00, B02)
+	LQD(B10, blkB2, OFFSET+0)
+	DFNMS(C03, A00, B03)
+	LQD(B11, blkB2, OFFSET+1)
+	DFNMS(C10, A10, B00)
+	LQD(B12, blkB2, OFFSET+2)
+	DFNMS(C11, A10, B01)
+	LQD(B13, blkB2, OFFSET+3)
+	DFNMS(C12, A10, B02)
+	SHUFB(A11, A01, A01, pat1)
+	DFNMS(C13, A10, B03)
+	SHUFB(A01, A01, A01, pat0)
+	DFNMS(C20, A20, B00)
+	SHUFB(A31, A21, A21, pat1)
+	DFNMS(C21, A20, B01)
+	SHUFB(A21, A21, A21, pat0)
+	DFNMS(C22, A20, B02)
+	DFNMS(C23, A20, B03)
+	DFNMS(C30, A30, B00)
+	DFNMS(C31, A30, B01)
+	DFNMS(C32, A30, B02)
+	DFNMS(C33, A30, B03)
+
+	#undef  OFFSET  
+	#define OFFSET	-6*Q
+	DFNMS(C00, A01, B10)
+	LQD(A02, blkA2, OFFSET+0)
+	DFNMS(C01, A01, B11)
+	LQD(A22, blkA2, OFFSET+1)
+	DFNMS(C02, A01, B12)
+	LQD(B20, blkB2, OFFSET+0)
+	DFNMS(C03, A01, B13)
+	LQD(B21, blkB2, OFFSET+1)
+	DFNMS(C10, A11, B10)
+	LQD(B22, blkB2, OFFSET+2)
+	DFNMS(C11, A11, B11)
+	LQD(B23, blkB2, OFFSET+3)
+	DFNMS(C12, A11, B12)
+	SHUFB(A12, A02, A02, pat1)
+	DFNMS(C13, A11, B13)
+	SHUFB(A02, A02, A02, pat0)
+	DFNMS(C20, A21, B10)
+	SHUFB(A32, A22, A22, pat1)
+	DFNMS(C21, A21, B11)
+	SHUFB(A22, A22, A22, pat0)
+	DFNMS(C22, A21, B12)
+	DFNMS(C23, A21, B13)
+	DFNMS(C30, A31, B10)
+	DFNMS(C31, A31, B11)
+	DFNMS(C32, A31, B12)
+	DFNMS(C33, A31, B13)
+
+	#undef  OFFSET  
+	#define OFFSET	-5*Q
+	DFNMS(C00, A02, B20)
+	LQD(A03, blkA2, OFFSET+0)
+	DFNMS(C01, A02, B21)
+	LQD(A23, blkA2, OFFSET+1)
+	DFNMS(C02, A02, B22)
+	LQD(B30, blkB2, OFFSET+0)
+	DFNMS(C03, A02, B23)
+	LQD(B31, blkB2, OFFSET+1)
+	DFNMS(C10, A12, B20)
+	LQD(B32, blkB2, OFFSET+2)
+	DFNMS(C11, A12, B21)
+	LQD(B33, blkB2, OFFSET+3)
+	DFNMS(C12, A12, B22)
+	SHUFB(A13, A03, A03, pat1)
+	DFNMS(C13, A12, B23)
+	SHUFB(A03, A03, A03, pat0)
+	DFNMS(C20, A22, B20)
+	SHUFB(A33, A23, A23, pat1)
+	DFNMS(C21, A22, B21)
+	SHUFB(A23, A23, A23, pat0)
+	DFNMS(C22, A22, B22)
+	DFNMS(C23, A22, B23)
+	DFNMS(C30, A32, B20)
+	DFNMS(C31, A32, B21)
+	DFNMS(C32, A32, B22)
+	DFNMS(C33, A32, B23)
+
+	#undef  OFFSET  
+	#define OFFSET	-4*Q
+	DFNMS(C00, A03, B30)
+	LQD(A00, blkA2, OFFSET+0)
+	DFNMS(C01, A03, B31)
+	LQD(A20, blkA2, OFFSET+1)
+	DFNMS(C02, A03, B32)
+	LQD(B00, blkB2, OFFSET+0)
+	DFNMS(C03, A03, B33)
+	LQD(B01, blkB2, OFFSET+1)
+	DFNMS(C10, A13, B30)
+	LQD(B02, blkB2, OFFSET+2)
+	DFNMS(C11, A13, B31)
+	LQD(B03, blkB2, OFFSET+3)
+	DFNMS(C12, A13, B32)
+	SHUFB(A10, A00, A00, pat1)
+	DFNMS(C13, A13, B33)
+	SHUFB(A00, A00, A00, pat0)
+	DFNMS(C20, A23, B30)
+	SHUFB(A30, A20, A20, pat1)
+	DFNMS(C21, A23, B31)
+	SHUFB(A20, A20, A20, pat0)
+	DFNMS(C22, A23, B32)
+	DFNMS(C23, A23, B33)
+	DFNMS(C30, A33, B30)
+	DFNMS(C31, A33, B31)
+	DFNMS(C32, A33, B32)
+	DFNMS(C33, A33, B33)
+	
+	/* STAGE 11 */
+	#undef  OFFSET  
+	#define OFFSET	-3*Q
+	DFNMS(C00, A00, B00)
+	LQD(A01, blkA2, OFFSET+0)
+	DFNMS(C01, A00, B01)
+	LQD(A21, blkA2, OFFSET+1)
+	DFNMS(C02, A00, B02)
+	LQD(B10, blkB2, OFFSET+0)
+	DFNMS(C03, A00, B03)
+	LQD(B11, blkB2, OFFSET+1)
+	DFNMS(C10, A10, B00)
+	LQD(B12, blkB2, OFFSET+2)
+	DFNMS(C11, A10, B01)
+	LQD(B13, blkB2, OFFSET+3)
+	DFNMS(C12, A10, B02)
+	SHUFB(A11, A01, A01, pat1)
+	DFNMS(C13, A10, B03)
+	SHUFB(A01, A01, A01, pat0)
+	DFNMS(C20, A20, B00)
+	SHUFB(A31, A21, A21, pat1)
+	DFNMS(C21, A20, B01)
+	SHUFB(A21, A21, A21, pat0)
+	DFNMS(C22, A20, B02)
+	DFNMS(C23, A20, B03)
+	DFNMS(C30, A30, B00)
+	DFNMS(C31, A30, B01)
+	DFNMS(C32, A30, B02)
+	DFNMS(C33, A30, B03)
+
+	#undef  OFFSET  
+	#define OFFSET	-2*Q
+	DFNMS(C00, A01, B10)
+	LQD(A02, blkA2, OFFSET+0)
+	DFNMS(C01, A01, B11)
+	LQD(A22, blkA2, OFFSET+1)
+	DFNMS(C02, A01, B12)
+	LQD(B20, blkB2, OFFSET+0)
+	DFNMS(C03, A01, B13)
+	LQD(B21, blkB2, OFFSET+1)
+	DFNMS(C10, A11, B10)
+	LQD(B22, blkB2, OFFSET+2)
+	DFNMS(C11, A11, B11)
+	LQD(B23, blkB2, OFFSET+3)
+	DFNMS(C12, A11, B12)
+	SHUFB(A12, A02, A02, pat1)
+	DFNMS(C13, A11, B13)
+	SHUFB(A02, A02, A02, pat0)
+	DFNMS(C20, A21, B10)
+	SHUFB(A32, A22, A22, pat1)
+	DFNMS(C21, A21, B11)
+	SHUFB(A22, A22, A22, pat0)
+	DFNMS(C22, A21, B12)
+	DFNMS(C23, A21, B13)
+	DFNMS(C30, A31, B10)
+	DFNMS(C31, A31, B11)
+	DFNMS(C32, A31, B12)
+	DFNMS(C33, A31, B13)
+
+	#undef  OFFSET  
+	#define OFFSET	-1*Q
+	DFNMS(C00, A02, B20)
+	LQD(A03, blkA2, OFFSET+0)
+	DFNMS(C01, A02, B21)
+	LQD(A23, blkA2, OFFSET+1)
+	DFNMS(C02, A02, B22)
+	LQD(B30, blkB2, OFFSET+0)
+	DFNMS(C03, A02, B23)
+	LQD(B31, blkB2, OFFSET+1)
+	DFNMS(C10, A12, B20)
+	LQD(B32, blkB2, OFFSET+2)
+	DFNMS(C11, A12, B21)
+	LQD(B33, blkB2, OFFSET+3)
+	DFNMS(C12, A12, B22)
+	SHUFB(A13, A03, A03, pat1)
+	DFNMS(C13, A12, B23)
+	SHUFB(A03, A03, A03, pat0)
+	DFNMS(C20, A22, B20)
+	SHUFB(A33, A23, A23, pat1)
+	DFNMS(C21, A22, B21)
+	SHUFB(A23, A23, A23, pat0)
+	DFNMS(C22, A22, B22)
+	DFNMS(C23, A22, B23)
+	DFNMS(C30, A32, B20)
+	DFNMS(C31, A32, B21)
+	DFNMS(C32, A32, B22)
+	DFNMS(C33, A32, B23)
+
+	#undef  OFFSET  
+	#define OFFSET	0*Q
+	DFNMS(C00, A03, B30)
+	LQD(A00, blkA2, OFFSET+0)
+	DFNMS(C01, A03, B31)
+	LQD(A20, blkA2, OFFSET+1)
+	DFNMS(C02, A03, B32)
+	LQD(B00, blkB2, OFFSET+0)
+	DFNMS(C03, A03, B33)
+	LQD(B01, blkB2, OFFSET+1)
+	DFNMS(C10, A13, B30)
+	LQD(B02, blkB2, OFFSET+2)
+	DFNMS(C11, A13, B31)
+	LQD(B03, blkB2, OFFSET+3)
+	DFNMS(C12, A13, B32)
+	SHUFB(A10, A00, A00, pat1)
+	DFNMS(C13, A13, B33)
+	SHUFB(A00, A00, A00, pat0)
+	DFNMS(C20, A23, B30)
+	SHUFB(A30, A20, A20, pat1)
+	DFNMS(C21, A23, B31)
+	SHUFB(A20, A20, A20, pat0)
+	DFNMS(C22, A23, B32)
+	DFNMS(C23, A23, B33)
+	DFNMS(C30, A33, B30)
+	DFNMS(C31, A33, B31)
+	DFNMS(C32, A33, B32)
+	DFNMS(C33, A33, B33)
+	
+	/* STAGE 12 */
+	#undef  OFFSET  
+	#define OFFSET	1*Q
+	DFNMS(C00, A00, B00)
+	LQD(A01, blkA2, OFFSET+0)
+	DFNMS(C01, A00, B01)
+	LQD(A21, blkA2, OFFSET+1)
+	DFNMS(C02, A00, B02)
+	LQD(B10, blkB2, OFFSET+0)
+	DFNMS(C03, A00, B03)
+	LQD(B11, blkB2, OFFSET+1)
+	DFNMS(C10, A10, B00)
+	LQD(B12, blkB2, OFFSET+2)
+	DFNMS(C11, A10, B01)
+	LQD(B13, blkB2, OFFSET+3)
+	DFNMS(C12, A10, B02)
+	SHUFB(A11, A01, A01, pat1)
+	DFNMS(C13, A10, B03)
+	SHUFB(A01, A01, A01, pat0)
+	DFNMS(C20, A20, B00)
+	SHUFB(A31, A21, A21, pat1)
+	DFNMS(C21, A20, B01)
+	SHUFB(A21, A21, A21, pat0)
+	DFNMS(C22, A20, B02)
+	DFNMS(C23, A20, B03)
+	DFNMS(C30, A30, B00)
+	DFNMS(C31, A30, B01)
+	DFNMS(C32, A30, B02)
+	DFNMS(C33, A30, B03)
+
+	#undef  OFFSET  
+	#define OFFSET	2*Q
+	DFNMS(C00, A01, B10)
+	LQD(A02, blkA2, OFFSET+0)
+	DFNMS(C01, A01, B11)
+	LQD(A22, blkA2, OFFSET+1)
+	DFNMS(C02, A01, B12)
+	LQD(B20, blkB2, OFFSET+0)
+	DFNMS(C03, A01, B13)
+	LQD(B21, blkB2, OFFSET+1)
+	DFNMS(C10, A11, B10)
+	LQD(B22, blkB2, OFFSET+2)
+	DFNMS(C11, A11, B11)
+	LQD(B23, blkB2, OFFSET+3)
+	DFNMS(C12, A11, B12)
+	SHUFB(A12, A02, A02, pat1)
+	DFNMS(C13, A11, B13)
+	SHUFB(A02, A02, A02, pat0)
+	DFNMS(C20, A21, B10)
+	SHUFB(A32, A22, A22, pat1)
+	DFNMS(C21, A21, B11)
+	SHUFB(A22, A22, A22, pat0)
+	DFNMS(C22, A21, B12)
+	DFNMS(C23, A21, B13)
+	DFNMS(C30, A31, B10)
+	DFNMS(C31, A31, B11)
+	DFNMS(C32, A31, B12)
+	DFNMS(C33, A31, B13)
+
+	#undef  OFFSET  
+	#define OFFSET	3*Q
+	DFNMS(C00, A02, B20)
+	LQD(A03, blkA2, OFFSET+0)
+	DFNMS(C01, A02, B21)
+	LQD(A23, blkA2, OFFSET+1)
+	DFNMS(C02, A02, B22)
+	LQD(B30, blkB2, OFFSET+0)
+	DFNMS(C03, A02, B23)
+	LQD(B31, blkB2, OFFSET+1)
+	DFNMS(C10, A12, B20)
+	LQD(B32, blkB2, OFFSET+2)
+	DFNMS(C11, A12, B21)
+	LQD(B33, blkB2, OFFSET+3)
+	DFNMS(C12, A12, B22)
+	SHUFB(A13, A03, A03, pat1)
+	DFNMS(C13, A12, B23)
+	SHUFB(A03, A03, A03, pat0)
+	DFNMS(C20, A22, B20)
+	SHUFB(A33, A23, A23, pat1)
+	DFNMS(C21, A22, B21)
+	SHUFB(A23, A23, A23, pat0)
+	DFNMS(C22, A22, B22)
+	DFNMS(C23, A22, B23)
+	DFNMS(C30, A32, B20)
+	DFNMS(C31, A32, B21)
+	DFNMS(C32, A32, B22)
+	DFNMS(C33, A32, B23)
+
+	#undef  OFFSET  
+	#define OFFSET	4*Q
+	DFNMS(C00, A03, B30)
+	LQD(A00, blkA2, OFFSET+0)
+	DFNMS(C01, A03, B31)
+	LQD(A20, blkA2, OFFSET+1)
+	DFNMS(C02, A03, B32)
+	LQD(B00, blkB2, OFFSET+0)
+	DFNMS(C03, A03, B33)
+	LQD(B01, blkB2, OFFSET+1)
+	DFNMS(C10, A13, B30)
+	LQD(B02, blkB2, OFFSET+2)
+	DFNMS(C11, A13, B31)
+	LQD(B03, blkB2, OFFSET+3)
+	DFNMS(C12, A13, B32)
+	SHUFB(A10, A00, A00, pat1)
+	DFNMS(C13, A13, B33)
+	SHUFB(A00, A00, A00, pat0)
+	DFNMS(C20, A23, B30)
+	SHUFB(A30, A20, A20, pat1)
+	DFNMS(C21, A23, B31)
+	SHUFB(A20, A20, A20, pat0)
+	DFNMS(C22, A23, B32)
+	DFNMS(C23, A23, B33)
+	DFNMS(C30, A33, B30)
+	DFNMS(C31, A33, B31)
+	DFNMS(C32, A33, B32)
+	DFNMS(C33, A33, B33)
+	
+	/* STAGE 13 */
+	#undef  OFFSET  
+	#define OFFSET	5*Q
+	DFNMS(C00, A00, B00)
+	LQD(A01, blkA2, OFFSET+0)
+	DFNMS(C01, A00, B01)
+	LQD(A21, blkA2, OFFSET+1)
+	DFNMS(C02, A00, B02)
+	LQD(B10, blkB2, OFFSET+0)
+	DFNMS(C03, A00, B03)
+	LQD(B11, blkB2, OFFSET+1)
+	DFNMS(C10, A10, B00)
+	LQD(B12, blkB2, OFFSET+2)
+	DFNMS(C11, A10, B01)
+	LQD(B13, blkB2, OFFSET+3)
+	DFNMS(C12, A10, B02)
+	SHUFB(A11, A01, A01, pat1)
+	DFNMS(C13, A10, B03)
+	SHUFB(A01, A01, A01, pat0)
+	DFNMS(C20, A20, B00)
+	SHUFB(A31, A21, A21, pat1)
+	DFNMS(C21, A20, B01)
+	SHUFB(A21, A21, A21, pat0)
+	DFNMS(C22, A20, B02)
+	DFNMS(C23, A20, B03)
+	DFNMS(C30, A30, B00)
+	DFNMS(C31, A30, B01)
+	DFNMS(C32, A30, B02)
+	DFNMS(C33, A30, B03)
+
+	#undef  OFFSET  
+	#define OFFSET	6*Q
+	DFNMS(C00, A01, B10)
+	LQD(A02, blkA2, OFFSET+0)
+	DFNMS(C01, A01, B11)
+	LQD(A22, blkA2, OFFSET+1)
+	DFNMS(C02, A01, B12)
+	LQD(B20, blkB2, OFFSET+0)
+	DFNMS(C03, A01, B13)
+	LQD(B21, blkB2, OFFSET+1)
+	DFNMS(C10, A11, B10)
+	LQD(B22, blkB2, OFFSET+2)
+	DFNMS(C11, A11, B11)
+	LQD(B23, blkB2, OFFSET+3)
+	DFNMS(C12, A11, B12)
+	SHUFB(A12, A02, A02, pat1)
+	DFNMS(C13, A11, B13)
+	SHUFB(A02, A02, A02, pat0)
+	DFNMS(C20, A21, B10)
+	SHUFB(A32, A22, A22, pat1)
+	DFNMS(C21, A21, B11)
+	SHUFB(A22, A22, A22, pat0)
+	DFNMS(C22, A21, B12)
+	DFNMS(C23, A21, B13)
+	DFNMS(C30, A31, B10)
+	DFNMS(C31, A31, B11)
+	DFNMS(C32, A31, B12)
+	DFNMS(C33, A31, B13)
+
+	#undef  OFFSET  
+	#define OFFSET	7*Q
+	DFNMS(C00, A02, B20)
+	LQD(A03, blkA2, OFFSET+0)
+	DFNMS(C01, A02, B21)
+	LQD(A23, blkA2, OFFSET+1)
+	DFNMS(C02, A02, B22)
+	LQD(B30, blkB2, OFFSET+0)
+	DFNMS(C03, A02, B23)
+	LQD(B31, blkB2, OFFSET+1)
+	DFNMS(C10, A12, B20)
+	LQD(B32, blkB2, OFFSET+2)
+	DFNMS(C11, A12, B21)
+	LQD(B33, blkB2, OFFSET+3)
+	DFNMS(C12, A12, B22)
+	SHUFB(A13, A03, A03, pat1)
+	DFNMS(C13, A12, B23)
+	SHUFB(A03, A03, A03, pat0)
+	DFNMS(C20, A22, B20)
+	SHUFB(A33, A23, A23, pat1)
+	DFNMS(C21, A22, B21)
+	SHUFB(A23, A23, A23, pat0)
+	DFNMS(C22, A22, B22)
+	DFNMS(C23, A22, B23)
+	DFNMS(C30, A32, B20)
+	DFNMS(C31, A32, B21)
+	DFNMS(C32, A32, B22)
+	DFNMS(C33, A32, B23)
+
+	#undef  OFFSET  
+	#define OFFSET	8*Q
+	DFNMS(C00, A03, B30)
+	LQD(A00, blkA2, OFFSET+0)
+	DFNMS(C01, A03, B31)
+	LQD(A20, blkA2, OFFSET+1)
+	DFNMS(C02, A03, B32)
+	LQD(B00, blkB2, OFFSET+0)
+	DFNMS(C03, A03, B33)
+	LQD(B01, blkB2, OFFSET+1)
+	DFNMS(C10, A13, B30)
+	LQD(B02, blkB2, OFFSET+2)
+	DFNMS(C11, A13, B31)
+	LQD(B03, blkB2, OFFSET+3)
+	DFNMS(C12, A13, B32)
+	SHUFB(A10, A00, A00, pat1)
+	DFNMS(C13, A13, B33)
+	SHUFB(A00, A00, A00, pat0)
+	DFNMS(C20, A23, B30)
+	SHUFB(A30, A20, A20, pat1)
+	DFNMS(C21, A23, B31)
+	SHUFB(A20, A20, A20, pat0)
+	DFNMS(C22, A23, B32)
+	DFNMS(C23, A23, B33)
+	DFNMS(C30, A33, B30)
+	DFNMS(C31, A33, B31)
+	DFNMS(C32, A33, B32)
+	DFNMS(C33, A33, B33)
+	
+	/* STAGE 14 */
+	#undef  OFFSET  
+	#define OFFSET	9*Q
+	DFNMS(C00, A00, B00)
+	LQD(A01, blkA2, OFFSET+0)
+	DFNMS(C01, A00, B01)
+	LQD(A21, blkA2, OFFSET+1)
+	DFNMS(C02, A00, B02)
+	LQD(B10, blkB2, OFFSET+0)
+	DFNMS(C03, A00, B03)
+	LQD(B11, blkB2, OFFSET+1)
+	DFNMS(C10, A10, B00)
+	LQD(B12, blkB2, OFFSET+2)
+	DFNMS(C11, A10, B01)
+	LQD(B13, blkB2, OFFSET+3)
+	DFNMS(C12, A10, B02)
+	SHUFB(A11, A01, A01, pat1)
+	DFNMS(C13, A10, B03)
+	SHUFB(A01, A01, A01, pat0)
+	DFNMS(C20, A20, B00)
+	SHUFB(A31, A21, A21, pat1)
+	DFNMS(C21, A20, B01)
+	SHUFB(A21, A21, A21, pat0)
+	DFNMS(C22, A20, B02)
+	DFNMS(C23, A20, B03)
+	DFNMS(C30, A30, B00)
+	DFNMS(C31, A30, B01)
+	DFNMS(C32, A30, B02)
+	DFNMS(C33, A30, B03)
+
+	#undef  OFFSET  
+	#define OFFSET	10*Q
+	DFNMS(C00, A01, B10)
+	LQD(A02, blkA2, OFFSET+0)
+	DFNMS(C01, A01, B11)
+	LQD(A22, blkA2, OFFSET+1)
+	DFNMS(C02, A01, B12)
+	LQD(B20, blkB2, OFFSET+0)
+	DFNMS(C03, A01, B13)
+	LQD(B21, blkB2, OFFSET+1)
+	DFNMS(C10, A11, B10)
+	LQD(B22, blkB2, OFFSET+2)
+	DFNMS(C11, A11, B11)
+	LQD(B23, blkB2, OFFSET+3)
+	DFNMS(C12, A11, B12)
+	SHUFB(A12, A02, A02, pat1)
+	DFNMS(C13, A11, B13)
+	SHUFB(A02, A02, A02, pat0)
+	DFNMS(C20, A21, B10)
+	SHUFB(A32, A22, A22, pat1)
+	DFNMS(C21, A21, B11)
+	SHUFB(A22, A22, A22, pat0)
+	DFNMS(C22, A21, B12)
+	DFNMS(C23, A21, B13)
+	DFNMS(C30, A31, B10)
+	DFNMS(C31, A31, B11)
+	DFNMS(C32, A31, B12)
+	DFNMS(C33, A31, B13)
+
+	#undef  OFFSET  
+	#define OFFSET	11*Q
+	DFNMS(C00, A02, B20)
+	LQD(A03, blkA2, OFFSET+0)
+	DFNMS(C01, A02, B21)
+	LQD(A23, blkA2, OFFSET+1)
+	DFNMS(C02, A02, B22)
+	LQD(B30, blkB2, OFFSET+0)
+	DFNMS(C03, A02, B23)
+	LQD(B31, blkB2, OFFSET+1)
+	DFNMS(C10, A12, B20)
+	LQD(B32, blkB2, OFFSET+2)
+	DFNMS(C11, A12, B21)
+	LQD(B33, blkB2, OFFSET+3)
+	DFNMS(C12, A12, B22)
+	SHUFB(A13, A03, A03, pat1)
+	DFNMS(C13, A12, B23)
+	SHUFB(A03, A03, A03, pat0)
+	DFNMS(C20, A22, B20)
+	SHUFB(A33, A23, A23, pat1)
+	DFNMS(C21, A22, B21)
+	SHUFB(A23, A23, A23, pat0)
+	DFNMS(C22, A22, B22)
+	DFNMS(C23, A22, B23)
+	DFNMS(C30, A32, B20)
+	DFNMS(C31, A32, B21)
+	DFNMS(C32, A32, B22)
+	DFNMS(C33, A32, B23)
+
+	#undef  OFFSET  
+	#define OFFSET	12*Q
+	DFNMS(C00, A03, B30)
+	LQD(A00, blkA2, OFFSET+0)
+	DFNMS(C01, A03, B31)
+	LQD(A20, blkA2, OFFSET+1)
+	DFNMS(C02, A03, B32)
+	LQD(B00, blkB2, OFFSET+0)
+	DFNMS(C03, A03, B33)
+	LQD(B01, blkB2, OFFSET+1)
+	DFNMS(C10, A13, B30)
+	LQD(B02, blkB2, OFFSET+2)
+	DFNMS(C11, A13, B31)
+	LQD(B03, blkB2, OFFSET+3)
+	DFNMS(C12, A13, B32)
+	SHUFB(A10, A00, A00, pat1)
+	DFNMS(C13, A13, B33)
+	SHUFB(A00, A00, A00, pat0)
+	DFNMS(C20, A23, B30)
+	SHUFB(A30, A20, A20, pat1)
+	DFNMS(C21, A23, B31)
+	SHUFB(A20, A20, A20, pat0)
+	DFNMS(C22, A23, B32)
+	shufb	blkA1, pointers, pointers, pat_blkA1
+	DFNMS(C23, A23, B33)
+	shufb	blkB1, pointers, pointers, pat_blkB1
+	DFNMS(C30, A33, B30)
+	DFNMS(C31, A33, B31)
+	DFNMS(C32, A33, B32)
+	DFNMS(C33, A33, B33)
+	
+	/* STAGE 15 */
+	#undef  OFFSET  
+	#define OFFSET	13*Q
+	DFNMS(C00, A00, B00)
+	LQD(A01, blkA2, OFFSET+0)
+	DFNMS(C01, A00, B01)
+	LQD(A21, blkA2, OFFSET+1)
+	DFNMS(C02, A00, B02)
+	LQD(B10, blkB2, OFFSET+0)
+	DFNMS(C03, A00, B03)
+	LQD(B11, blkB2, OFFSET+1)
+	DFNMS(C10, A10, B00)
+	LQD(B12, blkB2, OFFSET+2)
+	DFNMS(C11, A10, B01)
+	LQD(B13, blkB2, OFFSET+3)
+	DFNMS(C12, A10, B02)
+	SHUFB(A11, A01, A01, pat1)
+	DFNMS(C13, A10, B03)
+	SHUFB(A01, A01, A01, pat0)
+	DFNMS(C20, A20, B00)
+	SHUFB(A31, A21, A21, pat1)
+	DFNMS(C21, A20, B01)
+	SHUFB(A21, A21, A21, pat0)
+	DFNMS(C22, A20, B02)
+	shlqbii	blkA1, blkA1, 4
+	DFNMS(C23, A20, B03)
+	shlqbii	blkB1, blkB1, 4
+	DFNMS(C30, A30, B00)
+	DFNMS(C31, A30, B01)
+	DFNMS(C32, A30, B02)
+	DFNMS(C33, A30, B03)
+
+	#undef  OFFSET  
+	#define OFFSET	14*Q
+	DFNMS(C00, A01, B10)
+	LQD(A02, blkA2, OFFSET+0)
+	DFNMS(C01, A01, B11)
+	LQD(A22, blkA2, OFFSET+1)
+	DFNMS(C02, A01, B12)
+	LQD(B20, blkB2, OFFSET+0)
+	DFNMS(C03, A01, B13)
+	LQD(B21, blkB2, OFFSET+1)
+	DFNMS(C10, A11, B10)
+	LQD(B22, blkB2, OFFSET+2)
+	DFNMS(C11, A11, B11)
+	LQD(B23, blkB2, OFFSET+3)
+	DFNMS(C12, A11, B12)
+	SHUFB(A12, A02, A02, pat1)
+	DFNMS(C13, A11, B13)
+	SHUFB(A02, A02, A02, pat0)
+	DFNMS(C20, A21, B10)
+	SHUFB(A32, A22, A22, pat1)
+	DFNMS(C21, A21, B11)
+	SHUFB(A22, A22, A22, pat0)
+	DFNMS(C22, A21, B12)
+	hbr	loop_inst, target
+	DFNMS(C23, A21, B13)
+	lnop
+	DFNMS(C30, A31, B10)
+	DFNMS(C31, A31, B11)
+	DFNMS(C32, A31, B12)
+	DFNMS(C33, A31, B13)
+
+	#undef  OFFSET  
+	#define OFFSET	15*Q
+	DFNMS(C00, A02, B20)
+	LQD(A03, blkA2, OFFSET+0)
+	DFNMS(C01, A02, B21)
+	LQD(A23, blkA2, OFFSET+1)
+	DFNMS(C02, A02, B22)
+	LQD(B30, blkB2, OFFSET+0)
+	DFNMS(C03, A02, B23)
+	LQD(B31, blkB2, OFFSET+1)
+	DFNMS(C10, A12, B20)
+	LQD(B32, blkB2, OFFSET+2)
+	DFNMS(C11, A12, B21)
+	LQD(B33, blkB2, OFFSET+3)
+	DFNMS(C12, A12, B22)
+	SHUFB(A13, A03, A03, pat1)
+	DFNMS(C13, A12, B23)
+	SHUFB(A03, A03, A03, pat0)
+	DFNMS(C20, A22, B20)
+	SHUFB(A33, A23, A23, pat1)
+	DFNMS(C21, A22, B21)
+	SHUFB(A23, A23, A23, pat0)
+	DFNMS(C22, A22, B22)
+	DFNMS(C23, A22, B23)
+	DFNMS(C30, A32, B20)
+	DFNMS(C31, A32, B21)
+	DFNMS(C32, A32, B22)
+	lnop
+	DFNMS(C33, A32, B23)
+loop_inst:	
+	bi	target
+	
+	.align	6
+loop_end:	
+
+	#undef  OFFSET  
+	#define OFFSET	-16*Q
+	DFNMS(C00, A03, B30)
+	hbr return, $0	
+	DFNMS(C01, A03, B31)
+	DFNMS(C02, A03, B32)
+	
+	DFNMS(C03, A03, B33)
+	DFNMS(C10, A13, B30)
+	
+	DFNMS(C11, A13, B31)
+	DFNMS(C12, A13, B32)
+	
+	DFNMS(C13, A13, B33)
+	DFNMS(C20, A23, B30)
+	
+	DFNMS(C21, A23, B31)
+	STQD(C00, blkC, 0)
+	
+	DFNMS(C22, A23, B32)
+	STQD(C01, blkC, 1)
+	
+	DFNMS(C23, A23, B33)
+	STQD(C02, blkC, 2)
+	DFNMS(C30, A33, B30)
+	STQD(C03, blkC, 3)
+	DFNMS(C31, A33, B31)
+	STQD(C10, blkC, 0+Q)
+	DFNMS(C32, A33, B32)
+	STQD(C11, blkC, 1+Q)
+	DFNMS(C33, A33, B33)
+	STQD(C12, blkC, 2+Q)
+	
+	STQD(C13, blkC, 3+Q)
+	STQD(C20, blkC, 0+2*Q)
+	
+	STQD(C21, blkC, 1+2*Q)
+	STQD(C22, blkC, 2+2*Q)
+	
+	STQD(C23, blkC, 3+2*Q)
+	STQD(C30, blkC, 0+3*Q)
+	
+	STQD(C31, blkC, 1+3*Q)
+	STQD(C32, blkC, 2+3*Q)
+	
+	STQD(C33, blkC, 3+3*Q)
+return:		
+	bi	$0
Index: accel/lib/spu/accel_mm_dp_64x64.S
===================================================================
RCS file: accel/lib/spu/accel_mm_dp_64x64.S
diff -N accel/lib/spu/accel_mm_dp_64x64.S
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/spu/accel_mm_dp_64x64.S	23 Oct 2008 21:20:24 -0000	1.4
@@ -0,0 +1,2321 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+/*
+ * SYNOPSIS:
+ *	void mm_dp_64x64(vec_double2 *blkC, vec_double2 *blkA, vec_double2 *blkB)
+ *
+ * DESCRIPTION:
+ *      mm_dp_64x64 computes a double-precision matrix negative
+ *	multiply-subtract on 64x64 matrix blocks:
+ *
+ *		[C] -= [A] * [B]
+ *
+ *      This function assumes that matrices [A], [B], and [C] are row ordered.
+ *
+ *      This function can also be compiled for a little-endian [A] if
+ *	ACCEL_LITTLE_ENDIAN is defined.
+ *
+ */
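+/* For reference, an illustrative scalar C equivalent (not part of the
+ * build; the function name and the flattened 64x64 array view are ours):
+ *
+ *	void mm_dp_64x64_ref(double C[64][64], const double A[64][64],
+ *	                     const double B[64][64])
+ *	{
+ *		int i, j, k;
+ *		for (i = 0; i < 64; i++)
+ *			for (j = 0; j < 64; j++)
+ *				for (k = 0; k < 64; k++)
+ *					C[i][j] -= A[i][k] * B[k][j];
+ *	}
+ */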
+	
+
+#if 1
+#define DFNMS(_d, _a, _b)	dfnms	_d, _a, _b
+#else
+#define DFNMS(_d, _a, _b)	fnms	_d, _a, _b, _d
+#endif	
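+/* Note: the #else branch substitutes the single-precision fnms
+ * (computing _d = _d - _a*_b) for the double-precision dfnms;
+ * presumably useful only for pipeline experiments, since it changes
+ * the arithmetic.
+ */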
+#define ADD(_d, _a, _b)		a	_d, _a, _b
+#define SHUFB(_d, _a, _b, _c)	shufb	_d, _a, _b, _c
+#define LQD(_d, _a, _idx)	lqd	_d, (_idx)*16(_a)
+#define STQD(_d, _a, _idx)	stqd	_d, (_idx)*16(_a)
+#define LR(_d, _a)		rotqbyi	_d, _a, 0
+#define HBRP()			hbrp	
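+/* For example, LQD(B00, blkB1, 1-16*Q) expands to
+ *	lqd	B00, (1-16*Q)*16(blkB1)
+ * i.e. the _idx macro argument counts quadwords, not bytes.
+ */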
+
+	
+#define blkCnext	$2
+	
+#define blkC		$3	
+#define blkA		$4
+#define blkB		$5	
+	
+/* Working variables
+ */
+#define blkA2		$6	
+#define blkB2		$7
+	
+#define A00		$8
+#define A01		$9
+#define A10		$10
+#define A11		$11
+#define A20		$12
+#define A21		$13
+#define A30		$14
+#define A31		$15
+	
+#define B00		$16
+#define B01		$17
+#define B02		$18
+#define B03		$19
+#define B10		$20
+#define B11		$21
+#define B12		$22
+#define B13		$23
+	
+#define C00		$24
+#define C01		$25
+#define C02		$26
+#define C03		$27
+#define C10		$28
+#define C11		$29
+#define C12		$30
+#define C13		$31
+#define C20		$32
+#define C21		$33
+#define C22		$34
+#define C23		$35
+#define C30		$36
+#define C31		$37
+#define C32		$38
+#define C33		$39
+
+#define C00next		$40
+#define C01next		$41
+#define C02next		$42
+#define C03next		$43
+#define C10next		$44
+#define C11next		$45
+#define C12next		$46
+#define C13next		$47
+#define C20next		$48
+#define C21next		$49
+#define C22next		$50
+#define C23next		$51
+#define C30next		$52
+#define C31next		$53
+#define C32next		$54
+#define C33next		$55
+
+#define pat0		$56
+#define pat1		$57
+#define offset		$58
+#define flowptr		$59	
+#define one		$60	
+#define pointers	$61	
+#define pat_blkA1	$62	
+#define pat_blkB1	$63	
+#define pat_blkB2	$64	
+#define pat_blkC	$65
+#define pat_flowidx	$66
+#define pat_ptraddend	$67
+#define merge		$68
+#define blkA1B1		$69	
+#define blkA2B2		$70
+#define blkCflowidx	$71	
+#define sel_0F00	$72
+#define flowidx		$73	
+#define flowentryptr	$74	
+#define A0		$75	
+#define A1		$76	
+#define A2		$77
+#define A3		$78
+
+/* Aliases to reduce register usage
+ */	
+#define blkA1		blkA
+#define blkB1		blkB
+#define A02		A00
+#define A03		A01
+#define A12		A10
+#define A13		A11
+#define A22		A20
+#define A23		A21
+#define A32		A30
+#define A33		A31
+#define B20		B00
+#define B21		B01
+#define B22		B02
+#define B23		B03
+#define B30		B10
+#define B31		B11
+#define B32		B12
+#define B33		B13
+	
+#define step		offset
+#define flow		merge
+#define target		sel_0F00
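+/* These aliases are safe because, by the time e.g. A02 is formed, the
+ * value previously held in A00 has already been consumed by an earlier
+ * sub-step of the stage; the same appears to hold for the B row pairs
+ * and for the step/flow/target temporaries.
+ */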
+
+/* Q equals the number of quadwords per row */	
+#define	Q		32	
+
+/* The d-form instructions contain a signed 14-bit byte index. This
+ * provides an immediate offset of -16 to just under +16 matrix rows
+ * (where the row length is 64 double-words). To extend addressability,
+ * we offset the array address by +16 rows.
+ */
+#define S14_OFF		(16*Q)
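+/* Worked out: with Q = 32 quadwords (512 bytes) per row, the 14-bit
+ * byte index spans -8192..+8176 bytes, i.e. rows -16..+15 relative to
+ * the biased pointer. The +16-row bias lets blkA1/blkB1 address rows
+ * 0..31; the second pointers (blkA2/blkB2), which sit 32 rows further
+ * on, appear to cover rows 32..63 of the 64-row block.
+ */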
+
+	
+	.data
+	.align	4
+splat_dw0:		
+#ifdef ACCEL_LITTLE_ENDIAN
+	.word	0x07060504, 0x03020100, 0x07060504, 0x03020100
+#else	/* BIG ENDIAN */
+	.word	0x00010203, 0x04050607, 0x00010203, 0x04050607
+#endif	/* LITTLE ENDIAN */
+shuf_blkA1:	
+	.word	0x80800001, 0x80808080, 0x80808080, 0x80808080
+shuf_ptraddend:
+	.word	0x06070A0B, 0x06070A0B, 0x02031213, 0x80808080
+shuf_pointers:	
+	.word	0x00010203, 0x04050607, 0x12138080, 0x80808080
+	
+/* The flow control array consists of 4 values for each of the 128 loop
+ * iterations. The 4 values are:
+ *   1) blkC pointer step (addend)
+ *   2) blkA pointer step (addend)
+ *   3) blkB pointer step (addend)
+ *   4) branch target
+ * The addends are divided by 16 so that pointer arithmetic can be performed
+ * using half-word arithmetic, eliminating an inner-loop, pipe 0, add.
+ *
+ * All pointers are maintained in a register called "pointers". The pointers
+ * are packed in a halfword vector as follows:
+ *
+ * +-------+-------+-------+-------+------+---------+--------+--------+
+ * | blkA1 | blkB1 | ----- | blkB2 | blkC | flowidx | unused | unused |
+ * +-------+-------+-------+-------+------+---------+--------+--------+
+ *
+ * flowidx is a byte offset into the flow_entry array. The flow_entry array
+ * contains byte offsets into the flow_control array.
+ */
+flow_entry:
+	.rept		15
+	  .byte		0, 0, 0, 0, 0, 0, 0, 16
+	.endr
+	.byte		0, 0, 0, 0, 0, 0, 0, 32
+
+flow_control:
+	.word		16*4/16, 0/16, 8*8/16, loop_next
+	.word		(16*4+16*3*Q)/16, 4*Q, (8*8-16*Q)/16, loop_next
+	.word		0/16, 0/16, 0/16, loop_end
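+/* For illustration only (C-like pseudo-code, not the actual schedule),
+ * the lqx/rotqby/rotqmbyi/lqx sequence in the inner loop amounts to:
+ *
+ *	off  = flow_entry[flowidx];	// 0 most iterations; 16 or 32
+ *	ctrl = flow_control[off / 16];	// {C step, A step, B step, target}
+ *	pointers += ctrl steps (packed halfword add, also stepping flowidx);
+ *	goto ctrl.target;		// loop_next or loop_end
+ */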
+	
+
+	.text
+	.align	6
+	
+	.global mm_dp_64x64
+mm_dp_64x64:
+	/* Pack all the pointers into the "pointers" variable.
+	 * Initialize all the variables needed in the inner loop.
+	 * Load the initial block of data from A (4x4), the initial block of
+	 * data from B (4x8), and the first 12 quadwords of data from C (the
+	 * first 3 rows of the 4x8 mini-block).
+	 */
+ib0:
+	ila	offset, 16*S14_OFF
+	HBRP
+	
+	ilhu	merge, 0x0203
+	lqr	pat_blkA1, shuf_blkA1
+	
+	LQD(A0, blkA, 0*Q+0)
+	lqr	pat0, splat_dw0
+	
+	a	blkB1, blkB, offset
+	LQD(C00next, blkC, 0)
+	
+	ila	one, 1
+	LQD(C01next, blkC, 1)
+	
+	rotmi	blkA2, blkA1, -4
+	LQD(A1, blkA, 1*Q+0)
+	
+	
+	hbrr	 fwd_loop_start, loop_start
+	LQD(A2, blkA, 2*Q+0)
+
+	orbi	pat_blkB1, pat_blkA1, 2
+	LQD(C02next, blkC, 2)
+
+	orbi	pat1, pat0, 0x8
+	LQD(C03next, blkC, 3)
+	
+	LQD(A3, blkA, 3*Q+0)
+	lqr	pat_ptraddend, shuf_ptraddend
+
+	orbi	pat_blkB2, pat_blkA1, 6
+	LQD(B00, blkB1, 0-16*Q)
+	
+	orbi	pat_blkC,  pat_blkA1, 8
+	SHUFB(A00, A0, A0, pat0)
+	
+	rotmi	blkB2, blkB1, -4
+	SHUFB(A10, A1, A1, pat0)
+
+	orbi	pat_flowidx, pat_blkA1, 10
+	LQD(B01, blkB1, 1-16*Q)
+
+	iohl	merge, 0x1213
+	LQD(B02, blkB1, 2-16*Q)
+	
+	ilh	offset, 0x400
+	LQD(B03, blkB1, 3-16*Q)
+	
+ib1:	
+	rotmi	blkCflowidx, blkC, -4
+	shufb	blkA1B1, blkA2, blkB2, merge
+	
+	ila	flowptr, flow_control
+	SHUFB(A20, A2, A2, pat0)
+
+	ori	blkCnext, blkC, 0
+	fsmbi	sel_0F00, 0x0F00
+	
+	ila	flowentryptr, flow_entry
+	LQD(C10next, blkC, 0+Q)
+
+	a	blkA2B2, blkA1B1, offset
+	SHUFB(A30, A3, A3, pat0)
+
+	lqr	merge, shuf_pointers
+	LQD(C11next, blkC, 1+Q)
+
+	LQD(C12next, blkC, 2+Q)
+	LQD(C13next, blkC, 3+Q)
+
+	selb	pointers, blkA1B1, blkA2B2, sel_0F00
+	LQD(C20next, blkC, 0+2*Q)
+
+	LQD(C21next, blkC, 1+2*Q)
+	LQD(C22next, blkC, 2+2*Q)
+
+	shufb	pointers, pointers, blkCflowidx, merge
+	LQD(C23next, blkC, 3+2*Q)
+	
+fwd_loop_start:	
+	br	loop_start
+
+	.align	6
+
+
+loop_next:
+ib2:	
+	#undef  OFFSET  
+	#define OFFSET	-16*Q
+	DFNMS(C00, A03, B30)
+	HBRP
+	DFNMS(C01, A03, B31)
+	LQD(A2, blkA1, 2*Q+0)
+	DFNMS(C02, A03, B32)
+	LQD(A3, blkA1, 3*Q+0)
+	DFNMS(C03, A03, B33)
+	LQD(B00, blkB1, OFFSET+0)
+	DFNMS(C10, A13, B30)
+	LQD(B01, blkB1, OFFSET+1)
+	DFNMS(C11, A13, B31)
+	LQD(B02, blkB1, OFFSET+2)
+	DFNMS(C12, A13, B32)
+	LQD(B03, blkB1, OFFSET+3)
+	DFNMS(C13, A13, B33)
+	SHUFB(A00, A0, A0, pat0)
+	DFNMS(C20, A23, B30)
+	SHUFB(A10, A1, A1, pat0)
+	DFNMS(C21, A23, B31)
+	SHUFB(A20, A2, A2, pat0)
+	DFNMS(C22, A23, B32)
+	SHUFB(A30, A3, A3, pat0)
+	DFNMS(C23, A23, B33)
+	STQD(C00, blkC, 0)
+	DFNMS(C30, A33, B30)
+	STQD(C01, blkC, 1)
+	DFNMS(C31, A33, B31)
+	STQD(C02, blkC, 2)
+	DFNMS(C32, A33, B32)
+	STQD(C03, blkC, 3)
+	DFNMS(C33, A33, B33)
+	STQD(C10, blkC, 0+Q)
+	
+loop_start:	
+ib3:	
+	/* Computation is performed as a sequence of mini-block matrix
+	 * operations of the form:
+	 *
+	 *  [c] -= [a]*[b]
+	 *
+	 * where: [a] is a 4x4 sub-matrix of [A]
+	 *        [b] is a 4x8 sub-matrix of [B]
+	 *        [c] is a 4x8 sub-matrix of [C]
+	 *
+	 *        A sub-matrix of m x n consists of m rows and n columns. 
+	 * 	
+	 * Therefore, in order to compute a resultant mini-block of [C],
+	 * 16 mini-block multiplies are performed. These are considered
+	 * the stages of the matrix multiply.
+	 */
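+	/* As an illustrative C sketch (not the actual SIMD schedule; each
+	 * Cxx register holds 2 adjacent doubles of the 4x8 [c] mini-block),
+	 * one stage performs:
+	 *
+	 *	for (i = 0; i < 4; i++)
+	 *		for (j = 0; j < 8; j++)
+	 *			for (k = 0; k < 4; k++)
+	 *				c[i][j] -= a[i][k] * b[k][j];
+	 */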
+	
+	/* STAGE 0 */
+	#undef  OFFSET  
+	#define OFFSET	-15*Q
+	DFNMS(C00next, A00, B00)
+	DFNMS(C01next, A00, B01)
+	DFNMS(C02next, A00, B02)
+	LQD(C30next, blkCnext, 0+3*Q)
+	DFNMS(C03next, A00, B03)
+	LQD(C31next, blkCnext, 1+3*Q)
+	DFNMS(C10next, A10, B00)
+	LQD(C32next, blkCnext, 2+3*Q)
+	DFNMS(C11next, A10, B01)
+	SHUFB(A01, A0, A0, pat1)
+	DFNMS(C12next, A10, B02)
+	SHUFB(A11, A1, A1, pat1)
+	DFNMS(C13next, A10, B03)
+	LQD(C33next, blkCnext, 3+3*Q)
+	DFNMS(C20next, A20, B00)
+	LQD(B10, blkB1, OFFSET+0)
+	DFNMS(C21next, A20, B01)
+	LQD(B11, blkB1, OFFSET+1)
+	DFNMS(C22next, A20, B02)
+	LQD(B12, blkB1, OFFSET+2)
+	DFNMS(C23next, A20, B03)
+	DFNMS(C30next, A30, B00)
+	DFNMS(C31next, A30, B01)
+	LQD(B13, blkB1, OFFSET+3)
+	DFNMS(C32next, A30, B02)
+	SHUFB(A21, A2, A2, pat1)
+	DFNMS(C33next, A30, B03)
+	SHUFB(A31, A3, A3, pat1)
+
+	#undef  OFFSET  
+	#define OFFSET	-14*Q
+	DFNMS(C00next, A01, B10)
+	LQD(A0, blkA1, 0*Q+1)
+	DFNMS(C01next, A01, B11)
+	LQD(A1, blkA1, 1*Q+1)
+	DFNMS(C02next, A01, B12)
+	LQD(A2, blkA1, 2*Q+1)
+	DFNMS(C03next, A01, B13)
+	LQD(A3, blkA1, 3*Q+1)
+	DFNMS(C10next, A11, B10)
+	DFNMS(C11next, A11, B11)
+	DFNMS(C12next, A11, B12)
+	LQD(B20, blkB1, OFFSET+0)
+	DFNMS(C13next, A11, B13)
+	LQD(B21, blkB1, OFFSET+1)
+	DFNMS(C20next, A21, B10)
+	LQD(B22, blkB1, OFFSET+2)
+	DFNMS(C21next, A21, B11)
+	LQD(B23, blkB1, OFFSET+3)
+	DFNMS(C22next, A21, B12)
+	SHUFB(A02, A0, A0, pat0)
+	DFNMS(C23next, A21, B13)
+	SHUFB(A12, A1, A1, pat0)
+	DFNMS(C30next, A31, B10)
+	STQD(C11, blkC, 1+Q)
+	DFNMS(C31next, A31, B11)
+	STQD(C12, blkC, 2+Q)
+	DFNMS(C32next, A31, B12)
+	SHUFB(A22, A2, A2, pat0)
+	DFNMS(C33next, A31, B13)
+	SHUFB(A32, A3, A3, pat0)
+
+	#undef  OFFSET  
+	#define OFFSET	-13*Q
+	DFNMS(C00next, A02, B20)
+	DFNMS(C01next, A02, B21)
+	DFNMS(C02next, A02, B22)
+	LQD(B30, blkB1, OFFSET+0)
+	DFNMS(C03next, A02, B23)
+	LQD(B31, blkB1, OFFSET+1)
+	DFNMS(C10next, A12, B20)
+	LQD(B32, blkB1, OFFSET+2)
+	DFNMS(C11next, A12, B21)
+	LQD(B33, blkB1, OFFSET+3)
+	DFNMS(C12next, A12, B22)
+	SHUFB(A03, A0, A0, pat1)
+	DFNMS(C13next, A12, B23)
+	SHUFB(A13, A1, A1, pat1)
+	DFNMS(C20next, A22, B20)
+	SHUFB(A23, A2, A2, pat1)
+	DFNMS(C21next, A22, B21)
+	SHUFB(A33, A3, A3, pat1)
+	DFNMS(C22next, A22, B22)
+	DFNMS(C23next, A22, B23)
+	DFNMS(C30next, A32, B20)
+	STQD(C13, blkC, 3+Q)
+	DFNMS(C31next, A32, B21)
+	STQD(C20, blkC, 0+2*Q)
+	DFNMS(C32next, A32, B22)
+	DFNMS(C33next, A32, B23)
+
+	#undef  OFFSET  
+	#define OFFSET	-12*Q
+	DFNMS(C00next, A03, B30)
+	LQD(A0, blkA1, 0*Q+2)
+	DFNMS(C01next, A03, B31)
+	LQD(A1, blkA1, 1*Q+2)
+	DFNMS(C02next, A03, B32)
+	LQD(A2, blkA1, 2*Q+2)
+	DFNMS(C03next, A03, B33)
+	LQD(A3, blkA1, 3*Q+2)
+	DFNMS(C10next, A13, B30)
+	LQD(B00, blkB1, OFFSET+0)
+	DFNMS(C11next, A13, B31)
+	DFNMS(C12next, A13, B32)
+	DFNMS(C13next, A13, B33)
+	LQD(B01, blkB1, OFFSET+1)
+	DFNMS(C20next, A23, B30)
+	LQD(B02, blkB1, OFFSET+2)
+	DFNMS(C21next, A23, B31)
+	LQD(B03, blkB1, OFFSET+3)
+	DFNMS(C22next, A23, B32)
+	SHUFB(A00, A0, A0, pat0)
+	DFNMS(C23next, A23, B33)
+	SHUFB(A10, A1, A1, pat0)
+	DFNMS(C30next, A33, B30)
+	STQD(C21, blkC, 1+2*Q)
+	DFNMS(C31next, A33, B31)
+	STQD(C22, blkC, 2+2*Q)
+	DFNMS(C32next, A33, B32)
+	SHUFB(A20, A2, A2, pat0)
+	DFNMS(C33next, A33, B33)
+	SHUFB(A30, A3, A3, pat0)
+	
+	/* STAGE 1 */
+	#undef  OFFSET  
+	#define OFFSET	-11*Q
+	DFNMS(C00next, A00, B00)
+	DFNMS(C01next, A00, B01)
+	DFNMS(C02next, A00, B02)
+	LQD(B10, blkB1, OFFSET+0)
+	DFNMS(C03next, A00, B03)
+	LQD(B11, blkB1, OFFSET+1)
+	DFNMS(C10next, A10, B00)
+	LQD(B12, blkB1, OFFSET+2)
+	DFNMS(C11next, A10, B01)
+	LQD(B13, blkB1, OFFSET+3)
+	DFNMS(C12next, A10, B02)
+	SHUFB(A01, A0, A0, pat1)	
+	DFNMS(C13next, A10, B03)
+	SHUFB(A11, A1, A1, pat1)	
+	DFNMS(C20next, A20, B00)
+	SHUFB(A21, A2, A2, pat1)	
+	DFNMS(C21next, A20, B01)
+	SHUFB(A31, A3, A3, pat1)	
+	DFNMS(C22next, A20, B02)
+	DFNMS(C23next, A20, B03)
+	DFNMS(C30next, A30, B00)
+	STQD(C23, blkC, 3+2*Q)
+	DFNMS(C31next, A30, B01)
+	STQD(C30, blkC, 0+3*Q)
+	DFNMS(C32next, A30, B02)
+	LQD(A0, blkA1, 0*Q+3)
+	DFNMS(C33next, A30, B03)
+	LQD(A1, blkA1, 1*Q+3)
+
+	#undef  OFFSET  
+	#define OFFSET	-10*Q
+	DFNMS(C00next, A01, B10)
+	LQD(A2, blkA1, 2*Q+3)
+	DFNMS(C01next, A01, B11)
+	LQD(A3, blkA1, 3*Q+3)
+	DFNMS(C02next, A01, B12)
+	LQD(B20, blkB1, OFFSET+0)
+	DFNMS(C03next, A01, B13)
+	LQD(B21, blkB1, OFFSET+1)
+	DFNMS(C10next, A11, B10)
+	LQD(B22, blkB1, OFFSET+2)
+	DFNMS(C11next, A11, B11)
+	LQD(B23, blkB1, OFFSET+3)
+	DFNMS(C12next, A11, B12)
+	SHUFB(A02, A0, A0, pat0)
+	DFNMS(C13next, A11, B13)
+	SHUFB(A12, A1, A1, pat0)
+	DFNMS(C20next, A21, B10)
+	SHUFB(A22, A2, A2, pat0)
+	DFNMS(C21next, A21, B11)
+	SHUFB(A32, A3, A3, pat0)
+	DFNMS(C22next, A21, B12)
+	DFNMS(C23next, A21, B13)
+	DFNMS(C30next, A31, B10)
+	STQD(C31, blkC, 1+3*Q)
+	DFNMS(C31next, A31, B11)
+	STQD(C32, blkC, 2+3*Q)
+	DFNMS(C32next, A31, B12)
+	DFNMS(C33next, A31, B13)
+	
+	#undef  OFFSET  
+	#define OFFSET	-9*Q
+	DFNMS(C00next, A02, B20)
+	DFNMS(C01next, A02, B21)
+	DFNMS(C02next, A02, B22)
+	LQD(B30, blkB1, OFFSET+0)
+	DFNMS(C03next, A02, B23)
+	LQD(B31, blkB1, OFFSET+1)
+	DFNMS(C10next, A12, B20)
+	LQD(B32, blkB1, OFFSET+2)
+	DFNMS(C11next, A12, B21)
+	LQD(B33, blkB1, OFFSET+3)
+	DFNMS(C12next, A12, B22)
+	SHUFB(A03, A0, A0, pat1)	
+	DFNMS(C13next, A12, B23)
+	SHUFB(A13, A1, A1, pat1)	
+	DFNMS(C20next, A22, B20)
+	SHUFB(A23, A2, A2, pat1)	
+	DFNMS(C21next, A22, B21)
+	SHUFB(A33, A3, A3, pat1)	
+	DFNMS(C22next, A22, B22)
+	LR(C00, C00next)
+	DFNMS(C23next, A22, B23)
+	LR(C01, C01next)
+	DFNMS(C30next, A32, B20)
+	STQD(C33, blkC, 3+3*Q)
+	DFNMS(C31next, A32, B21)
+	LR(C02, C02next)
+	DFNMS(C32next, A32, B22)
+	LR(C03, C03next)
+	DFNMS(C33next, A32, B23)
+	LR(C10, C10next)
+
+	#undef  OFFSET  
+	#define OFFSET	-8*Q
+	DFNMS(C00, A03, B30)
+	LR(C11, C11next)
+	DFNMS(C01, A03, B31)
+	LR(C12, C12next)
+	DFNMS(C02, A03, B32)
+	LQD(A0, blkA1, 0*Q+4)
+	DFNMS(C03, A03, B33)
+	LQD(A1, blkA1, 1*Q+4)
+	DFNMS(C10, A13, B30)
+	LQD(A2, blkA1, 2*Q+4)
+	DFNMS(C11, A13, B31)
+	LQD(A3, blkA1, 3*Q+4)
+	DFNMS(C12, A13, B32)
+	LQD(B00, blkB1, OFFSET+0)
+	DFNMS(C13next, A13, B33)
+	LQD(B01, blkB1, OFFSET+1)
+	DFNMS(C20next, A23, B30)
+	LQD(B02, blkB1, OFFSET+2)
+	DFNMS(C21next, A23, B31)
+	LQD(B03, blkB1, OFFSET+3)
+	DFNMS(C22next, A23, B32)
+	SHUFB(A00, A0, A0, pat0)
+	DFNMS(C23next, A23, B33)
+	SHUFB(A10, A1, A1, pat0)
+	DFNMS(C30next, A33, B30)
+	SHUFB(A20, A2, A2, pat0)
+	DFNMS(C31next, A33, B31)
+	SHUFB(A30, A3, A3, pat0)
+	DFNMS(C32next, A33, B32)
+	DFNMS(C33next, A33, B33)
+
+	/* STAGE 2 */
+	#undef  OFFSET  
+	#define OFFSET	-7*Q
+	DFNMS(C00, A00, B00)
+	LR(C13, C13next)
+	DFNMS(C01, A00, B01)
+	LR(C20, C20next)
+	DFNMS(C02, A00, B02)
+	DFNMS(C03, A00, B03)
+	DFNMS(C10, A10, B00)
+	LQD(B10, blkB1, OFFSET+0)
+	DFNMS(C11, A10, B01)
+	LQD(B11, blkB1, OFFSET+1)
+	DFNMS(C12, A10, B02)
+	LQD(B12, blkB1, OFFSET+2)
+	DFNMS(C13, A10, B03)
+	LQD(B13, blkB1, OFFSET+3)
+	DFNMS(C20, A20, B00)
+	SHUFB(A01, A0, A0, pat1)	
+	DFNMS(C21next, A20, B01)
+	SHUFB(A11, A1, A1, pat1)	
+	DFNMS(C22next, A20, B02)
+	SHUFB(A21, A2, A2, pat1)	
+	DFNMS(C23next, A20, B03)
+	SHUFB(A31, A3, A3, pat1)	
+	DFNMS(C30next, A30, B00)
+	LQD(A0, blkA1, 0*Q+5)
+	DFNMS(C31next, A30, B01)
+	LQD(A1, blkA1, 1*Q+5)
+	DFNMS(C32next, A30, B02)
+	DFNMS(C33next, A30, B03)
+
+	#undef  OFFSET  
+	#define OFFSET	-6*Q
+	DFNMS(C00, A01, B10)
+	LQD(A2, blkA1, 2*Q+5)
+	DFNMS(C01, A01, B11)
+	LQD(A3, blkA1, 3*Q+5)
+	DFNMS(C02, A01, B12)
+	LR(C21, C21next)
+	DFNMS(C03, A01, B13)
+	LR(C22, C22next)
+	DFNMS(C10, A11, B10)
+	LQD(B20, blkB1, OFFSET+0)
+	DFNMS(C11, A11, B11)
+	LQD(B21, blkB1, OFFSET+1)
+	DFNMS(C12, A11, B12)
+	LQD(B22, blkB1, OFFSET+2)
+	DFNMS(C13, A11, B13)
+	LQD(B23, blkB1, OFFSET+3)
+	DFNMS(C20, A21, B10)
+	SHUFB(A02, A0, A0, pat0)
+	DFNMS(C21, A21, B11)
+	SHUFB(A12, A1, A1, pat0)
+	DFNMS(C22, A21, B12)
+	SHUFB(A22, A2, A2, pat0)
+	DFNMS(C23next, A21, B13)
+	SHUFB(A32, A3, A3, pat0)
+	DFNMS(C30next, A31, B10)
+	DFNMS(C31next, A31, B11)
+	DFNMS(C32next, A31, B12)
+	DFNMS(C33next, A31, B13)
+
+	
+	#undef  OFFSET  
+	#define OFFSET	-5*Q
+	DFNMS(C00, A02, B20)
+	DFNMS(C01, A02, B21)
+	DFNMS(C02, A02, B22)
+	LQD(B30, blkB1, OFFSET+0)
+	DFNMS(C03, A02, B23)
+	LQD(B31, blkB1, OFFSET+1)
+	DFNMS(C10, A12, B20)
+	LR(C23, C23next)
+	DFNMS(C11, A12, B21)
+	LR(C30, C30next)
+	DFNMS(C12, A12, B22)
+	LQD(B32, blkB1, OFFSET+2)
+	DFNMS(C13, A12, B23)
+	LQD(B33, blkB1, OFFSET+3)
+	DFNMS(C20, A22, B20)
+	SHUFB(A03, A0, A0, pat1)	
+	DFNMS(C21, A22, B21)
+	SHUFB(A13, A1, A1, pat1)	
+	DFNMS(C22, A22, B22)
+	SHUFB(A23, A2, A2, pat1)	
+	DFNMS(C23, A22, B23)
+	SHUFB(A33, A3, A3, pat1)	
+	DFNMS(C30, A32, B20)
+	LQD(A0, blkA1, 0*Q+6)
+	DFNMS(C31next, A32, B21)
+	LQD(A1, blkA1, 1*Q+6)
+	DFNMS(C32next, A32, B22)
+	DFNMS(C33next, A32, B23)
+
+	#undef  OFFSET  
+	#define OFFSET	-4*Q
+	DFNMS(C00, A03, B30)
+	LQD(A2, blkA1, 2*Q+6)
+	DFNMS(C01, A03, B31)
+	LQD(A3, blkA1, 3*Q+6)
+	DFNMS(C02, A03, B32)
+	LQD(B00, blkB1, OFFSET+0)
+	DFNMS(C03, A03, B33)
+	LQD(B01, blkB1, OFFSET+1)
+	DFNMS(C10, A13, B30)
+	LQD(B02, blkB1, OFFSET+2)
+	DFNMS(C11, A13, B31)
+	LQD(B03, blkB1, OFFSET+3)
+	DFNMS(C12, A13, B32)
+	LR(C31, C31next)
+	DFNMS(C13, A13, B33)
+	LR(C32, C32next)
+	DFNMS(C20, A23, B30)
+	SHUFB(A00, A0, A0, pat0)
+	DFNMS(C21, A23, B31)
+	SHUFB(A10, A1, A1, pat0)
+	DFNMS(C22, A23, B32)
+	SHUFB(A20, A2, A2, pat0)
+	DFNMS(C23, A23, B33)
+	SHUFB(A30, A3, A3, pat0)
+	DFNMS(C30, A33, B30)
+	DFNMS(C31, A33, B31)
+	DFNMS(C32, A33, B32)
+	DFNMS(C33next, A33, B33)
+	
+	/* STAGE 3 */
+	#undef  OFFSET  
+	#define OFFSET	-3*Q
+	DFNMS(C00, A00, B00)
+	DFNMS(C01, A00, B01)
+	DFNMS(C02, A00, B02)
+	LQD(B10, blkB1, OFFSET+0)
+	DFNMS(C03, A00, B03)
+	LQD(B11, blkB1, OFFSET+1)
+	DFNMS(C10, A10, B00)
+	LQD(B12, blkB1, OFFSET+2)
+	DFNMS(C11, A10, B01)
+	LQD(B13, blkB1, OFFSET+3)
+	DFNMS(C12, A10, B02)
+	SHUFB(A01, A0, A0, pat1)	
+	DFNMS(C13, A10, B03)
+	shufb	flowidx, pointers, pointers, pat_flowidx
+	DFNMS(C20, A20, B00)
+	LR(C33, C33next)
+	DFNMS(C21, A20, B01)
+	SHUFB(A11, A1, A1, pat1)	
+	DFNMS(C22, A20, B02)
+	SHUFB(A21, A2, A2, pat1)	
+	DFNMS(C23, A20, B03)
+	SHUFB(A31, A3, A3, pat1)	
+	DFNMS(C30, A30, B00)
+	LQD(A0, blkA1, 0*Q+7)
+	DFNMS(C31, A30, B01)
+	LQD(A1, blkA1, 1*Q+7)
+	DFNMS(C32, A30, B02)
+	DFNMS(C33, A30, B03)
+	
+	#undef  OFFSET  
+	#define OFFSET	-2*Q
+	DFNMS(C00, A01, B10)
+	LQD(A2, blkA1, 2*Q+7)
+	DFNMS(C01, A01, B11)
+	LQD(A3, blkA1, 3*Q+7)
+	DFNMS(C02, A01, B12)
+	LQD(B20, blkB1, OFFSET+0)
+	DFNMS(C03, A01, B13)
+	LQD(B21, blkB1, OFFSET+1)
+	DFNMS(C10, A11, B10)
+	lqx	flow, flowentryptr, flowidx	/* fetch flow_entry value */
+	DFNMS(C11, A11, B11)
+	LQD(B22, blkB1, OFFSET+2)
+	DFNMS(C12, A11, B12)
+	LQD(B23, blkB1, OFFSET+3)
+	DFNMS(C13, A11, B13)
+	SHUFB(A02, A0, A0, pat0)
+	DFNMS(C20, A21, B10)
+	SHUFB(A12, A1, A1, pat0)
+	DFNMS(C21, A21, B11)
+	SHUFB(A22, A2, A2, pat0)
+	DFNMS(C22, A21, B12)
+	SHUFB(A32, A3, A3, pat0)
+	DFNMS(C23, A21, B13)
+	rotqby	flowidx, flow, flowidx	/* rotate flow entry offset into byte 0 */
+	DFNMS(C30, A31, B10)
+	DFNMS(C31, A31, B11)
+	DFNMS(C32, A31, B12)
+	DFNMS(C33, A31, B13)
+
+	#undef  OFFSET  
+	#define OFFSET	-1*Q
+	DFNMS(C00, A02, B20)
+	DFNMS(C01, A02, B21)
+	DFNMS(C02, A02, B22)
+	LQD(B30, blkB1, OFFSET+0)
+	DFNMS(C03, A02, B23)
+	LQD(B31, blkB1, OFFSET+1)
+	DFNMS(C10, A12, B20)
+	LQD(B32, blkB1, OFFSET+2)
+	DFNMS(C11, A12, B21)
+	LQD(B33, blkB1, OFFSET+3)
+	DFNMS(C12, A12, B22)
+	rotqmbyi flowidx, flowidx, -3	/* move flow entry offset into byte 3 */
+	DFNMS(C13, A12, B23)
+	SHUFB(A03, A0, A0, pat1)	
+	DFNMS(C20, A22, B20)
+	SHUFB(A13, A1, A1, pat1)	
+	DFNMS(C21, A22, B21)
+	SHUFB(A23, A2, A2, pat1)	
+	DFNMS(C22, A22, B22)
+	SHUFB(A33, A3, A3, pat1)	
+	DFNMS(C23, A22, B23)
+	lqx	flow, flowptr, flowidx	/* fetch flow control quadword */
+	DFNMS(C30, A32, B20)
+	LQD(A0, blkA1, 0*Q+8)
+	DFNMS(C31, A32, B21)
+	LQD(A1, blkA1, 1*Q+8)
+	DFNMS(C32, A32, B22)
+	DFNMS(C33, A32, B23)
+
+	#undef  OFFSET  
+	#define OFFSET	0*Q
+	DFNMS(C00, A03, B30)
+	LQD(A2, blkA1, 2*Q+8)
+	DFNMS(C01, A03, B31)
+	LQD(A3, blkA1, 3*Q+8)
+	DFNMS(C02, A03, B32)
+	LQD(B00, blkB1, OFFSET+0)
+	DFNMS(C03, A03, B33)
+	LQD(B01, blkB1, OFFSET+1)
+	DFNMS(C10, A13, B30)
+	LQD(B02, blkB1, OFFSET+2)
+	DFNMS(C11, A13, B31)
+	LQD(B03, blkB1, OFFSET+3)
+	DFNMS(C12, A13, B32)
+	SHUFB(A00, A0, A0, pat0)
+	DFNMS(C13, A13, B33)
+	SHUFB(A10, A1, A1, pat0)
+	DFNMS(C20, A23, B30)
+	SHUFB(A20, A2, A2, pat0)
+	DFNMS(C21, A23, B31)
+	SHUFB(A30, A3, A3, pat0)
+	DFNMS(C22, A23, B32)
+	DFNMS(C23, A23, B33)
+	DFNMS(C30, A33, B30)
+	DFNMS(C31, A33, B31)
+	DFNMS(C32, A33, B32)
+	DFNMS(C33, A33, B33)
+	
+	/* STAGE 4 */
+	#undef  OFFSET  
+	#define OFFSET	1*Q
+	DFNMS(C00, A00, B00)
+	DFNMS(C01, A00, B01)
+	DFNMS(C02, A00, B02)
+	LQD(B10, blkB1, OFFSET+0)
+	DFNMS(C03, A00, B03)
+	LQD(B11, blkB1, OFFSET+1)
+	DFNMS(C10, A10, B00)
+	LQD(B12, blkB1, OFFSET+2)
+	DFNMS(C11, A10, B01)
+	LQD(B13, blkB1, OFFSET+3)
+	DFNMS(C12, A10, B02)
+	SHUFB(A01, A0, A0, pat1)	
+	DFNMS(C13, A10, B03)
+	SHUFB(A11, A1, A1, pat1)	
+	DFNMS(C20, A20, B00)
+	SHUFB(A21, A2, A2, pat1)	
+	DFNMS(C21, A20, B01)
+	SHUFB(A31, A3, A3, pat1)	
+	DFNMS(C22, A20, B02)
+	DFNMS(C23, A20, B03)
+	DFNMS(C30, A30, B00)
+	LQD(A0, blkA1, 0*Q+9)
+	DFNMS(C31, A30, B01)
+	LQD(A1, blkA1, 1*Q+9)
+	DFNMS(C32, A30, B02)
+	DFNMS(C33, A30, B03)
+
+	#undef  OFFSET  
+	#define OFFSET	2*Q
+	DFNMS(C00, A01, B10)
+	LQD(A2, blkA1, 2*Q+9)
+	DFNMS(C01, A01, B11)
+	LQD(A3, blkA1, 3*Q+9)
+	DFNMS(C02, A01, B12)
+	LQD(B20, blkB1, OFFSET+0)
+	DFNMS(C03, A01, B13)
+	LQD(B21, blkB1, OFFSET+1)
+	DFNMS(C10, A11, B10)
+	LQD(B22, blkB1, OFFSET+2)
+	DFNMS(C11, A11, B11)
+	LQD(B23, blkB1, OFFSET+3)
+	DFNMS(C12, A11, B12)
+	SHUFB(A02, A0, A0, pat0)
+	DFNMS(C13, A11, B13)
+	SHUFB(A12, A1, A1, pat0)
+	DFNMS(C20, A21, B10)
+	SHUFB(A22, A2, A2, pat0)
+	DFNMS(C21, A21, B11)
+	SHUFB(A32, A3, A3, pat0)
+	DFNMS(C22, A21, B12)
+	DFNMS(C23, A21, B13)
+	DFNMS(C30, A31, B10)
+	DFNMS(C31, A31, B11)
+	DFNMS(C32, A31, B12)
+	DFNMS(C33, A31, B13)
+
+	#undef  OFFSET  
+	#define OFFSET	3*Q
+	DFNMS(C00, A02, B20)
+	DFNMS(C01, A02, B21)
+	DFNMS(C02, A02, B22)
+	LQD(B30, blkB1, OFFSET+0)
+	DFNMS(C03, A02, B23)
+	LQD(B31, blkB1, OFFSET+1)
+	DFNMS(C10, A12, B20)
+	LQD(B32, blkB1, OFFSET+2)
+	DFNMS(C11, A12, B21)
+	LQD(B33, blkB1, OFFSET+3)
+	DFNMS(C12, A12, B22)
+	SHUFB(A03, A0, A0, pat1)	
+	DFNMS(C13, A12, B23)
+	SHUFB(A13, A1, A1, pat1)	
+	DFNMS(C20, A22, B20)
+	SHUFB(A23, A2, A2, pat1)	
+	DFNMS(C21, A22, B21)
+	SHUFB(A33, A3, A3, pat1)	
+	DFNMS(C22, A22, B22)
+	LR(blkC, blkCnext)
+	DFNMS(C23, A22, B23)
+	shufb	step, flow, one, pat_ptraddend
+	DFNMS(C30, A32, B20)
+	LQD(A0, blkA1, 0*Q+10)
+	DFNMS(C31, A32, B21)
+	LQD(A1, blkA1, 1*Q+10)
+	DFNMS(C32, A32, B22)
+	DFNMS(C33, A32, B23)
+
+	#undef  OFFSET  
+	#define OFFSET	4*Q
+	DFNMS(C00, A03, B30)
+	LQD(A2, blkA1, 2*Q+10)
+	DFNMS(C01, A03, B31)
+	LQD(A3, blkA1, 3*Q+10)
+	DFNMS(C02, A03, B32)
+	LQD(B00, blkB1, OFFSET+0)
+	DFNMS(C03, A03, B33)
+	LQD(B01, blkB1, OFFSET+1)
+	DFNMS(C10, A13, B30)
+	LQD(B02, blkB1, OFFSET+2)
+	DFNMS(C11, A13, B31)
+	LQD(B03, blkB1, OFFSET+3)
+	DFNMS(C12, A13, B32)
+	SHUFB(A00, A0, A0, pat0)
+	DFNMS(C13, A13, B33)
+	SHUFB(A10, A1, A1, pat0)
+	DFNMS(C20, A23, B30)
+	SHUFB(A20, A2, A2, pat0)
+	DFNMS(C21, A23, B31)
+	SHUFB(A30, A3, A3, pat0)
+	DFNMS(C22, A23, B32)
+	LR(blkA2, blkA1)
+	DFNMS(C23, A23, B33)
+	shufb	blkB2, pointers, pointers, pat_blkB2
+	DFNMS(C30, A33, B30)
+	DFNMS(C31, A33, B31)
+	DFNMS(C32, A33, B32)
+	DFNMS(C33, A33, B33)
+	
+	ah	pointers, pointers, step
+	lnop
+
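+	/* `step`, extracted from the flow entry above, advances every packed
+	 * halfword of the `pointers` register at once to the next block set. */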
+	/* STAGE 5 */
+	#undef  OFFSET  
+	#define OFFSET	5*Q
+	DFNMS(C00, A00, B00)
+	DFNMS(C01, A00, B01)
+	DFNMS(C02, A00, B02)
+	LQD(B10, blkB1, OFFSET+0)
+	DFNMS(C03, A00, B03)
+	LQD(B11, blkB1, OFFSET+1)
+	DFNMS(C10, A10, B00)
+	LQD(B12, blkB1, OFFSET+2)
+	DFNMS(C11, A10, B01)
+	LQD(B13, blkB1, OFFSET+3)
+	DFNMS(C12, A10, B02)
+	SHUFB(A01, A0, A0, pat1)	
+	DFNMS(C13, A10, B03)
+	SHUFB(A11, A1, A1, pat1)	
+	DFNMS(C20, A20, B00)
+	SHUFB(A21, A2, A2, pat1)	
+	DFNMS(C21, A20, B01)
+	SHUFB(A31, A3, A3, pat1)	
+	DFNMS(C22, A20, B02)
+	shlqbii	blkB2, blkB2, 4
+	DFNMS(C23, A20, B03)
+	shufb	blkCnext, pointers, pointers, pat_blkC
+	DFNMS(C30, A30, B00)
+	LQD(A0, blkA1, 0*Q+11)
+	DFNMS(C31, A30, B01)
+	LQD(A1, blkA1, 1*Q+11)
+	DFNMS(C32, A30, B02)
+	DFNMS(C33, A30, B03)
+
+	#undef  OFFSET  
+	#define OFFSET	6*Q
+	DFNMS(C00, A01, B10)
+	LQD(A2, blkA1, 2*Q+11)
+	DFNMS(C01, A01, B11)
+	LQD(A3, blkA1, 3*Q+11)
+	DFNMS(C02, A01, B12)
+	LQD(B20, blkB1, OFFSET+0)
+	DFNMS(C03, A01, B13)
+	LQD(B21, blkB1, OFFSET+1)
+	DFNMS(C10, A11, B10)
+	LQD(B22, blkB1, OFFSET+2)
+	DFNMS(C11, A11, B11)
+	LQD(B23, blkB1, OFFSET+3)
+	DFNMS(C12, A11, B12)
+	SHUFB(A02, A0, A0, pat0)
+	DFNMS(C13, A11, B13)
+	SHUFB(A12, A1, A1, pat0)
+	DFNMS(C20, A21, B10)
+	SHUFB(A22, A2, A2, pat0)
+	DFNMS(C21, A21, B11)
+	SHUFB(A32, A3, A3, pat0)
+	DFNMS(C22, A21, B12)
+	rotqbyi	target, flow, 12
+	DFNMS(C23, A21, B13)
+	shlqbii	blkCnext, blkCnext, 4
+	DFNMS(C30, A31, B10)
+	DFNMS(C31, A31, B11)
+	DFNMS(C32, A31, B12)
+	DFNMS(C33, A31, B13)
+
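+	/* The next C tile starts arriving here: blkCnext was just derived from
+	 * `pointers`, and the C..next registers are filled in spare load slots
+	 * over the next few stages while the current tile is still accumulating. */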
+	#undef  OFFSET  
+	#define OFFSET	7*Q
+	DFNMS(C00, A02, B20)
+	DFNMS(C01, A02, B21)
+	DFNMS(C02, A02, B22)
+	LQD(B30, blkB1, OFFSET+0)
+	DFNMS(C03, A02, B23)
+	LQD(B31, blkB1, OFFSET+1)
+	DFNMS(C10, A12, B20)
+	LQD(B32, blkB1, OFFSET+2)
+	DFNMS(C11, A12, B21)
+	LQD(B33, blkB1, OFFSET+3)
+	DFNMS(C12, A12, B22)
+	SHUFB(A03, A0, A0, pat1)	
+	DFNMS(C13, A12, B23)
+	SHUFB(A13, A1, A1, pat1)	
+	DFNMS(C20, A22, B20)
+	SHUFB(A23, A2, A2, pat1)	
+	DFNMS(C21, A22, B21)
+	SHUFB(A33, A3, A3, pat1)	
+	DFNMS(C22, A22, B22)
+	LQD(C00next, blkCnext, 0)
+	DFNMS(C23, A22, B23)
+	LQD(C01next, blkCnext, 1)
+	DFNMS(C30, A32, B20)
+	LQD(A0, blkA1, 0*Q+12)
+	DFNMS(C31, A32, B21)
+	LQD(A1, blkA1, 1*Q+12)
+	DFNMS(C32, A32, B22)
+	DFNMS(C33, A32, B23)
+	
+	#undef  OFFSET  
+	#define OFFSET	8*Q
+	DFNMS(C00, A03, B30)
+	LQD(A2, blkA1, 2*Q+12)
+	DFNMS(C01, A03, B31)
+	LQD(A3, blkA1, 3*Q+12)
+	DFNMS(C02, A03, B32)
+	LQD(B00, blkB1, OFFSET+0)
+	DFNMS(C03, A03, B33)
+	LQD(B01, blkB1, OFFSET+1)
+	DFNMS(C10, A13, B30)
+	LQD(B02, blkB1, OFFSET+2)
+	DFNMS(C11, A13, B31)
+	LQD(B03, blkB1, OFFSET+3)
+	DFNMS(C12, A13, B32)
+	SHUFB(A00, A0, A0, pat0)
+	DFNMS(C13, A13, B33)
+	SHUFB(A10, A1, A1, pat0)
+	DFNMS(C20, A23, B30)
+	SHUFB(A20, A2, A2, pat0)
+	DFNMS(C21, A23, B31)
+	SHUFB(A30, A3, A3, pat0)
+	DFNMS(C22, A23, B32)
+        LQD(C02next, blkCnext, 2)	
+	DFNMS(C23, A23, B33)
+        LQD(C03next, blkCnext, 3)	
+	DFNMS(C30, A33, B30)
+	DFNMS(C31, A33, B31)
+	DFNMS(C32, A33, B32)
+	DFNMS(C33, A33, B33)
+	
+	/* STAGE 6 */
+	#undef  OFFSET  
+	#define OFFSET	9*Q
+	DFNMS(C00, A00, B00)
+	DFNMS(C01, A00, B01)
+	DFNMS(C02, A00, B02)
+	LQD(B10, blkB1, OFFSET+0)
+	DFNMS(C03, A00, B03)
+	LQD(B11, blkB1, OFFSET+1)
+	DFNMS(C10, A10, B00)
+	LQD(B12, blkB1, OFFSET+2)
+	DFNMS(C11, A10, B01)
+	LQD(B13, blkB1, OFFSET+3)
+	DFNMS(C12, A10, B02)
+	SHUFB(A01, A0, A0, pat1)	
+	DFNMS(C13, A10, B03)
+	SHUFB(A11, A1, A1, pat1)	
+	DFNMS(C20, A20, B00)
+	SHUFB(A21, A2, A2, pat1)	
+	DFNMS(C21, A20, B01)
+	SHUFB(A31, A3, A3, pat1)	
+	DFNMS(C22, A20, B02)
+        LQD(C10next, blkCnext, 0+Q)
+	DFNMS(C23, A20, B03)
+        LQD(C11next, blkCnext, 1+Q)
+	DFNMS(C30, A30, B00)
+	DFNMS(C31, A30, B01)
+	DFNMS(C32, A30, B02)
+	LQD(A0, blkA1, 0*Q+13)
+	DFNMS(C33, A30, B03)
+	LQD(A1, blkA1, 1*Q+13)
+
+	#undef  OFFSET  
+	#define OFFSET	10*Q
+	DFNMS(C00, A01, B10)
+	LQD(A2, blkA1, 2*Q+13)
+	DFNMS(C01, A01, B11)
+	LQD(A3, blkA1, 3*Q+13)
+	DFNMS(C02, A01, B12)
+	LQD(B20, blkB1, OFFSET+0)
+	DFNMS(C03, A01, B13)
+	LQD(B21, blkB1, OFFSET+1)
+	DFNMS(C10, A11, B10)
+	LQD(B22, blkB1, OFFSET+2)
+	DFNMS(C11, A11, B11)
+	LQD(B23, blkB1, OFFSET+3)
+	DFNMS(C12, A11, B12)
+	SHUFB(A02, A0, A0, pat0)
+	DFNMS(C13, A11, B13)
+	SHUFB(A12, A1, A1, pat0)
+	DFNMS(C20, A21, B10)
+	SHUFB(A22, A2, A2, pat0)
+	DFNMS(C21, A21, B11)
+	SHUFB(A32, A3, A3, pat0)
+	DFNMS(C22, A21, B12)
+        LQD(C12next, blkCnext, 2+Q)
+	DFNMS(C23, A21, B13)
+        LQD(C13next, blkCnext, 3+Q)
+	DFNMS(C30, A31, B10)
+	DFNMS(C31, A31, B11)
+	DFNMS(C32, A31, B12)
+	DFNMS(C33, A31, B13)
+
+	#undef  OFFSET  
+	#define OFFSET	11*Q
+	DFNMS(C00, A02, B20)
+	DFNMS(C01, A02, B21)
+	DFNMS(C02, A02, B22)
+	LQD(B30, blkB1, OFFSET+0)
+	DFNMS(C03, A02, B23)
+	LQD(B31, blkB1, OFFSET+1)
+	DFNMS(C10, A12, B20)
+	LQD(B32, blkB1, OFFSET+2)
+	DFNMS(C11, A12, B21)
+	LQD(B33, blkB1, OFFSET+3)
+	DFNMS(C12, A12, B22)
+	SHUFB(A03, A0, A0, pat1)	
+	DFNMS(C13, A12, B23)
+	SHUFB(A13, A1, A1, pat1)	
+	DFNMS(C20, A22, B20)
+	SHUFB(A23, A2, A2, pat1)	
+	DFNMS(C21, A22, B21)
+	SHUFB(A33, A3, A3, pat1)	
+	DFNMS(C22, A22, B22)
+        LQD(C20next, blkCnext, 0+2*Q)
+	DFNMS(C23, A22, B23)
+        LQD(C21next, blkCnext, 1+2*Q)
+	DFNMS(C30, A32, B20)
+	LQD(A0, blkA1, 0*Q+14)
+	DFNMS(C31, A32, B21)
+	LQD(A1, blkA1, 1*Q+14)
+	DFNMS(C32, A32, B22)
+	DFNMS(C33, A32, B23)
+
+	#undef  OFFSET  
+	#define OFFSET	12*Q
+	DFNMS(C00, A03, B30)
+	LQD(A2, blkA1, 2*Q+14)
+	DFNMS(C01, A03, B31)
+	LQD(A3, blkA1, 3*Q+14)
+	DFNMS(C02, A03, B32)
+	LQD(B00, blkB1, OFFSET+0)
+	DFNMS(C03, A03, B33)
+	LQD(B01, blkB1, OFFSET+1)
+	DFNMS(C10, A13, B30)
+	LQD(B02, blkB1, OFFSET+2)
+	DFNMS(C11, A13, B31)
+	LQD(B03, blkB1, OFFSET+3)
+	DFNMS(C12, A13, B32)
+	SHUFB(A00, A0, A0, pat0)
+	DFNMS(C13, A13, B33)
+	SHUFB(A10, A1, A1, pat0)
+	DFNMS(C20, A23, B30)
+	SHUFB(A20, A2, A2, pat0)
+	DFNMS(C21, A23, B31)
+	SHUFB(A30, A3, A3, pat0)
+	DFNMS(C22, A23, B32)
+        LQD(C22next, blkCnext, 2+2*Q)
+	DFNMS(C23, A23, B33)
+        LQD(C23next, blkCnext, 3+2*Q)
+	DFNMS(C30, A33, B30)
+	DFNMS(C31, A33, B31)
+	DFNMS(C32, A33, B32)
+	DFNMS(C33, A33, B33)
+	
+	/* STAGE 7 */
+	#undef  OFFSET  
+	#define OFFSET	13*Q
+	DFNMS(C00, A00, B00)
+	DFNMS(C01, A00, B01)
+	DFNMS(C02, A00, B02)
+	LQD(B10, blkB1, OFFSET+0)
+	DFNMS(C03, A00, B03)
+	LQD(B11, blkB1, OFFSET+1)
+	DFNMS(C10, A10, B00)
+	LQD(B12, blkB1, OFFSET+2)
+	DFNMS(C11, A10, B01)
+	LQD(B13, blkB1, OFFSET+3)
+	DFNMS(C12, A10, B02)
+	SHUFB(A01, A0, A0, pat1)	
+	DFNMS(C13, A10, B03)
+	SHUFB(A11, A1, A1, pat1)	
+	DFNMS(C20, A20, B00)
+	SHUFB(A21, A2, A2, pat1)	
+	DFNMS(C21, A20, B01)
+	SHUFB(A31, A3, A3, pat1)	
+	DFNMS(C22, A20, B02)
+	DFNMS(C23, A20, B03)
+	DFNMS(C30, A30, B00)
+	LQD(A0, blkA1, 0*Q+15)
+	DFNMS(C31, A30, B01)
+	LQD(A1, blkA1, 1*Q+15)
+	DFNMS(C32, A30, B02)
+	DFNMS(C33, A30, B03)
+
+	#undef  OFFSET  
+	#define OFFSET	14*Q
+	DFNMS(C00, A01, B10)
+	LQD(A2, blkA1, 2*Q+15)
+	DFNMS(C01, A01, B11)
+	LQD(A3, blkA1, 3*Q+15)
+	DFNMS(C02, A01, B12)
+	LQD(B20, blkB1, OFFSET+0)
+	DFNMS(C03, A01, B13)
+	LQD(B21, blkB1, OFFSET+1)
+	DFNMS(C10, A11, B10)
+	LQD(B22, blkB1, OFFSET+2)
+	DFNMS(C11, A11, B11)
+	LQD(B23, blkB1, OFFSET+3)
+	DFNMS(C12, A11, B12)
+	SHUFB(A02, A0, A0, pat0)
+	DFNMS(C13, A11, B13)
+	SHUFB(A12, A1, A1, pat0)
+	DFNMS(C20, A21, B10)
+	SHUFB(A22, A2, A2, pat0)
+	DFNMS(C21, A21, B11)
+	SHUFB(A32, A3, A3, pat0)
+	DFNMS(C22, A21, B12)
+	DFNMS(C23, A21, B13)
+	DFNMS(C30, A31, B10)
+	DFNMS(C31, A31, B11)
+	DFNMS(C32, A31, B12)
+	DFNMS(C33, A31, B13)
+
+	#undef  OFFSET  
+	#define OFFSET	15*Q
+	DFNMS(C00, A02, B20)
+	DFNMS(C01, A02, B21)
+	DFNMS(C02, A02, B22)
+	LQD(B30, blkB1, OFFSET+0)
+	DFNMS(C03, A02, B23)
+	LQD(B31, blkB1, OFFSET+1)
+	DFNMS(C10, A12, B20)
+	LQD(B32, blkB1, OFFSET+2)
+	DFNMS(C11, A12, B21)
+	LQD(B33, blkB1, OFFSET+3)
+	DFNMS(C12, A12, B22)
+	SHUFB(A03, A0, A0, pat1)	
+	DFNMS(C13, A12, B23)
+	SHUFB(A13, A1, A1, pat1)	
+	DFNMS(C20, A22, B20)
+	SHUFB(A23, A2, A2, pat1)	
+	DFNMS(C21, A22, B21)
+	SHUFB(A33, A3, A3, pat1)	
+	DFNMS(C22, A22, B22)
+	DFNMS(C23, A22, B23)
+	DFNMS(C30, A32, B20)
+	LQD(A0, blkA2, 0*Q+16)
+	DFNMS(C31, A32, B21)
+	LQD(A1, blkA2, 1*Q+16)
+	DFNMS(C32, A32, B22)
+	DFNMS(C33, A32, B23)
+
+	#undef  OFFSET  
+	#define OFFSET	-16*Q
+	DFNMS(C00, A03, B30)
+	LQD(A2, blkA2, 2*Q+16)
+	DFNMS(C01, A03, B31)
+	LQD(A3, blkA2, 3*Q+16)
+	DFNMS(C02, A03, B32)
+	LQD(B00, blkB2, OFFSET+0)
+	DFNMS(C03, A03, B33)
+	LQD(B01, blkB2, OFFSET+1)
+	DFNMS(C10, A13, B30)
+	LQD(B02, blkB2, OFFSET+2)
+	DFNMS(C11, A13, B31)
+	LQD(B03, blkB2, OFFSET+3)
+	DFNMS(C12, A13, B32)
+	SHUFB(A00, A0, A0, pat0)
+	DFNMS(C13, A13, B33)
+	SHUFB(A10, A1, A1, pat0)
+	DFNMS(C20, A23, B30)
+	SHUFB(A20, A2, A2, pat0)
+	DFNMS(C21, A23, B31)
+	SHUFB(A30, A3, A3, pat0)
+	DFNMS(C22, A23, B32)
+	DFNMS(C23, A23, B33)
+	DFNMS(C30, A33, B30)
+	DFNMS(C31, A33, B31)
+	DFNMS(C32, A33, B32)
+	DFNMS(C33, A33, B33)
+	
+	/* STAGE 8 */
+	#undef  OFFSET  
+	#define OFFSET	-15*Q
+	DFNMS(C00, A00, B00)
+	DFNMS(C01, A00, B01)
+	DFNMS(C02, A00, B02)
+	LQD(B10, blkB2, OFFSET+0)
+	DFNMS(C03, A00, B03)
+	LQD(B11, blkB2, OFFSET+1)
+	DFNMS(C10, A10, B00)
+	LQD(B12, blkB2, OFFSET+2)
+	DFNMS(C11, A10, B01)
+	LQD(B13, blkB2, OFFSET+3)
+	DFNMS(C12, A10, B02)
+	SHUFB(A01, A0, A0, pat1)	
+	DFNMS(C13, A10, B03)
+	SHUFB(A11, A1, A1, pat1)	
+	DFNMS(C20, A20, B00)
+	SHUFB(A21, A2, A2, pat1)	
+	DFNMS(C21, A20, B01)
+	SHUFB(A31, A3, A3, pat1)	
+	DFNMS(C22, A20, B02)
+	LQD(A0, blkA2, 0*Q+17)
+	DFNMS(C23, A20, B03)
+	LQD(A1, blkA2, 1*Q+17)
+	DFNMS(C30, A30, B00)
+	DFNMS(C31, A30, B01)
+	DFNMS(C32, A30, B02)
+	DFNMS(C33, A30, B03)
+
+	#undef  OFFSET  
+	#define OFFSET	-14*Q
+	DFNMS(C00, A01, B10)
+	LQD(A2, blkA2, 2*Q+17)
+	DFNMS(C01, A01, B11)
+	LQD(A3, blkA2, 3*Q+17)
+	DFNMS(C02, A01, B12)
+	LQD(B20, blkB2, OFFSET+0)
+	DFNMS(C03, A01, B13)
+	LQD(B21, blkB2, OFFSET+1)
+	DFNMS(C10, A11, B10)
+	LQD(B22, blkB2, OFFSET+2)
+	DFNMS(C11, A11, B11)
+	LQD(B23, blkB2, OFFSET+3)
+	DFNMS(C12, A11, B12)
+	SHUFB(A02, A0, A0, pat0)
+	DFNMS(C13, A11, B13)
+	SHUFB(A12, A1, A1, pat0)
+	DFNMS(C20, A21, B10)
+	SHUFB(A22, A2, A2, pat0)
+	DFNMS(C21, A21, B11)
+	SHUFB(A32, A3, A3, pat0)
+	DFNMS(C22, A21, B12)
+	DFNMS(C23, A21, B13)
+	DFNMS(C30, A31, B10)
+	DFNMS(C31, A31, B11)
+	DFNMS(C32, A31, B12)
+	DFNMS(C33, A31, B13)
+
+	#undef  OFFSET  
+	#define OFFSET	-13*Q
+	DFNMS(C00, A02, B20)
+	DFNMS(C01, A02, B21)
+	DFNMS(C02, A02, B22)
+	LQD(B30, blkB2, OFFSET+0)
+	DFNMS(C03, A02, B23)
+	LQD(B31, blkB2, OFFSET+1)
+	DFNMS(C10, A12, B20)
+	LQD(B32, blkB2, OFFSET+2)
+	DFNMS(C11, A12, B21)
+	LQD(B33, blkB2, OFFSET+3)
+	DFNMS(C12, A12, B22)
+	SHUFB(A03, A0, A0, pat1)	
+	DFNMS(C13, A12, B23)
+	SHUFB(A13, A1, A1, pat1)	
+	DFNMS(C20, A22, B20)
+	SHUFB(A23, A2, A2, pat1)	
+	DFNMS(C21, A22, B21)
+	SHUFB(A33, A3, A3, pat1)	
+	DFNMS(C22, A22, B22)
+	LQD(A0, blkA2, 0*Q+18)
+	DFNMS(C23, A22, B23)
+	LQD(A1, blkA2, 1*Q+18)
+	DFNMS(C30, A32, B20)
+	DFNMS(C31, A32, B21)
+	DFNMS(C32, A32, B22)
+	DFNMS(C33, A32, B23)
+
+	#undef  OFFSET  
+	#define OFFSET	-12*Q
+	DFNMS(C00, A03, B30)
+	LQD(A2, blkA2, 2*Q+18)
+	DFNMS(C01, A03, B31)
+	LQD(A3, blkA2, 3*Q+18)
+	DFNMS(C02, A03, B32)
+	LQD(B00, blkB2, OFFSET+0)
+	DFNMS(C03, A03, B33)
+	LQD(B01, blkB2, OFFSET+1)
+	DFNMS(C10, A13, B30)
+	LQD(B02, blkB2, OFFSET+2)
+	DFNMS(C11, A13, B31)
+	LQD(B03, blkB2, OFFSET+3)
+	DFNMS(C12, A13, B32)
+	SHUFB(A00, A0, A0, pat0)
+	DFNMS(C13, A13, B33)
+	SHUFB(A10, A1, A1, pat0)
+	DFNMS(C20, A23, B30)
+	SHUFB(A20, A2, A2, pat0)
+	DFNMS(C21, A23, B31)
+	SHUFB(A30, A3, A3, pat0)
+	DFNMS(C22, A23, B32)
+	DFNMS(C23, A23, B33)
+	DFNMS(C30, A33, B30)
+	DFNMS(C31, A33, B31)
+	DFNMS(C32, A33, B32)
+	DFNMS(C33, A33, B33)
+	
+	/* STAGE 9 */
+	#undef  OFFSET  
+	#define OFFSET	-11*Q
+	DFNMS(C00, A00, B00)
+	DFNMS(C01, A00, B01)
+	DFNMS(C02, A00, B02)
+	LQD(B10, blkB2, OFFSET+0)
+	DFNMS(C03, A00, B03)
+	LQD(B11, blkB2, OFFSET+1)
+	DFNMS(C10, A10, B00)
+	LQD(B12, blkB2, OFFSET+2)
+	DFNMS(C11, A10, B01)
+	LQD(B13, blkB2, OFFSET+3)
+	DFNMS(C12, A10, B02)
+	SHUFB(A01, A0, A0, pat1)	
+	DFNMS(C13, A10, B03)
+	SHUFB(A11, A1, A1, pat1)	
+	DFNMS(C20, A20, B00)
+	SHUFB(A21, A2, A2, pat1)	
+	DFNMS(C21, A20, B01)
+	SHUFB(A31, A3, A3, pat1)	
+	DFNMS(C22, A20, B02)
+	LQD(A0, blkA2, 0*Q+19)
+	DFNMS(C23, A20, B03)
+	LQD(A1, blkA2, 1*Q+19)
+	DFNMS(C30, A30, B00)
+	DFNMS(C31, A30, B01)
+	DFNMS(C32, A30, B02)
+	DFNMS(C33, A30, B03)
+
+	#undef  OFFSET  
+	#define OFFSET	-10*Q
+	DFNMS(C00, A01, B10)
+	LQD(A2, blkA2, 2*Q+19)
+	DFNMS(C01, A01, B11)
+	LQD(A3, blkA2, 3*Q+19)
+	DFNMS(C02, A01, B12)
+	LQD(B20, blkB2, OFFSET+0)
+	DFNMS(C03, A01, B13)
+	LQD(B21, blkB2, OFFSET+1)
+	DFNMS(C10, A11, B10)
+	LQD(B22, blkB2, OFFSET+2)
+	DFNMS(C11, A11, B11)
+	LQD(B23, blkB2, OFFSET+3)
+	DFNMS(C12, A11, B12)
+	SHUFB(A02, A0, A0, pat0)
+	DFNMS(C13, A11, B13)
+	SHUFB(A12, A1, A1, pat0)
+	DFNMS(C20, A21, B10)
+	SHUFB(A22, A2, A2, pat0)
+	DFNMS(C21, A21, B11)
+	SHUFB(A32, A3, A3, pat0)
+	DFNMS(C22, A21, B12)
+	DFNMS(C23, A21, B13)
+	DFNMS(C30, A31, B10)
+	DFNMS(C31, A31, B11)
+	DFNMS(C32, A31, B12)
+	DFNMS(C33, A31, B13)
+
+	#undef  OFFSET  
+	#define OFFSET	-9*Q
+	DFNMS(C00, A02, B20)
+	DFNMS(C01, A02, B21)
+	DFNMS(C02, A02, B22)
+	LQD(B30, blkB2, OFFSET+0)
+	DFNMS(C03, A02, B23)
+	LQD(B31, blkB2, OFFSET+1)
+	DFNMS(C10, A12, B20)
+	LQD(B32, blkB2, OFFSET+2)
+	DFNMS(C11, A12, B21)
+	LQD(B33, blkB2, OFFSET+3)
+	DFNMS(C12, A12, B22)
+	SHUFB(A03, A0, A0, pat1)	
+	DFNMS(C13, A12, B23)
+	SHUFB(A13, A1, A1, pat1)	
+	DFNMS(C20, A22, B20)
+	SHUFB(A23, A2, A2, pat1)	
+	DFNMS(C21, A22, B21)
+	SHUFB(A33, A3, A3, pat1)	
+	DFNMS(C22, A22, B22)
+	LQD(A0, blkA2, 0*Q+20)
+	DFNMS(C23, A22, B23)
+	LQD(A1, blkA2, 1*Q+20)
+	DFNMS(C30, A32, B20)
+	DFNMS(C31, A32, B21)
+	DFNMS(C32, A32, B22)
+	DFNMS(C33, A32, B23)
+
+	#undef  OFFSET  
+	#define OFFSET	-8*Q
+	DFNMS(C00, A03, B30)
+	LQD(A2, blkA2, 2*Q+20)
+	DFNMS(C01, A03, B31)
+	LQD(A3, blkA2, 3*Q+20)
+	DFNMS(C02, A03, B32)
+	LQD(B00, blkB2, OFFSET+0)
+	DFNMS(C03, A03, B33)
+	LQD(B01, blkB2, OFFSET+1)
+	DFNMS(C10, A13, B30)
+	LQD(B02, blkB2, OFFSET+2)
+	DFNMS(C11, A13, B31)
+	LQD(B03, blkB2, OFFSET+3)
+	DFNMS(C12, A13, B32)
+	SHUFB(A00, A0, A0, pat0)
+	DFNMS(C13, A13, B33)
+	SHUFB(A10, A1, A1, pat0)
+	DFNMS(C20, A23, B30)
+	SHUFB(A20, A2, A2, pat0)
+	DFNMS(C21, A23, B31)
+	SHUFB(A30, A3, A3, pat0)
+	DFNMS(C22, A23, B32)
+	DFNMS(C23, A23, B33)
+	DFNMS(C30, A33, B30)
+	DFNMS(C31, A33, B31)
+	DFNMS(C32, A33, B32)
+	DFNMS(C33, A33, B33)
+
+	/* STAGE 10 */
+	#undef  OFFSET  
+	#define OFFSET	-7*Q
+	DFNMS(C00, A00, B00)
+	DFNMS(C01, A00, B01)
+	DFNMS(C02, A00, B02)
+	LQD(B10, blkB2, OFFSET+0)
+	DFNMS(C03, A00, B03)
+	LQD(B11, blkB2, OFFSET+1)
+	DFNMS(C10, A10, B00)
+	LQD(B12, blkB2, OFFSET+2)
+	DFNMS(C11, A10, B01)
+	LQD(B13, blkB2, OFFSET+3)
+	DFNMS(C12, A10, B02)
+	SHUFB(A01, A0, A0, pat1)	
+	DFNMS(C13, A10, B03)
+	SHUFB(A11, A1, A1, pat1)	
+	DFNMS(C20, A20, B00)
+	SHUFB(A21, A2, A2, pat1)	
+	DFNMS(C21, A20, B01)
+	SHUFB(A31, A3, A3, pat1)	
+	DFNMS(C22, A20, B02)
+	LQD(A0, blkA2, 0*Q+21)
+	DFNMS(C23, A20, B03)
+	LQD(A1, blkA2, 1*Q+21)
+	DFNMS(C30, A30, B00)
+	DFNMS(C31, A30, B01)
+	DFNMS(C32, A30, B02)
+	DFNMS(C33, A30, B03)
+
+	#undef  OFFSET  
+	#define OFFSET	-6*Q
+	DFNMS(C00, A01, B10)
+	LQD(A2, blkA2, 2*Q+21)
+	DFNMS(C01, A01, B11)
+	LQD(A3, blkA2, 3*Q+21)
+	DFNMS(C02, A01, B12)
+	LQD(B20, blkB2, OFFSET+0)
+	DFNMS(C03, A01, B13)
+	LQD(B21, blkB2, OFFSET+1)
+	DFNMS(C10, A11, B10)
+	LQD(B22, blkB2, OFFSET+2)
+	DFNMS(C11, A11, B11)
+	LQD(B23, blkB2, OFFSET+3)
+	DFNMS(C12, A11, B12)
+	SHUFB(A02, A0, A0, pat0)
+	DFNMS(C13, A11, B13)
+	SHUFB(A12, A1, A1, pat0)
+	DFNMS(C20, A21, B10)
+	SHUFB(A22, A2, A2, pat0)
+	DFNMS(C21, A21, B11)
+	SHUFB(A32, A3, A3, pat0)
+	DFNMS(C22, A21, B12)
+	DFNMS(C23, A21, B13)
+	DFNMS(C30, A31, B10)
+	DFNMS(C31, A31, B11)
+	DFNMS(C32, A31, B12)
+	DFNMS(C33, A31, B13)
+
+	#undef  OFFSET  
+	#define OFFSET	-5*Q
+	DFNMS(C00, A02, B20)
+	DFNMS(C01, A02, B21)
+	DFNMS(C02, A02, B22)
+	LQD(B30, blkB2, OFFSET+0)
+	DFNMS(C03, A02, B23)
+	LQD(B31, blkB2, OFFSET+1)
+	DFNMS(C10, A12, B20)
+	LQD(B32, blkB2, OFFSET+2)
+	DFNMS(C11, A12, B21)
+	LQD(B33, blkB2, OFFSET+3)
+	DFNMS(C12, A12, B22)
+	SHUFB(A03, A0, A0, pat1)	
+	DFNMS(C13, A12, B23)
+	SHUFB(A13, A1, A1, pat1)	
+	DFNMS(C20, A22, B20)
+	SHUFB(A23, A2, A2, pat1)	
+	DFNMS(C21, A22, B21)
+	SHUFB(A33, A3, A3, pat1)	
+	DFNMS(C22, A22, B22)
+	LQD(A0, blkA2, 0*Q+22)
+	DFNMS(C23, A22, B23)
+	LQD(A1, blkA2, 1*Q+22)
+	DFNMS(C30, A32, B20)
+	DFNMS(C31, A32, B21)
+	DFNMS(C32, A32, B22)
+	DFNMS(C33, A32, B23)
+
+	#undef  OFFSET  
+	#define OFFSET	-4*Q
+	DFNMS(C00, A03, B30)
+	LQD(A2, blkA2, 2*Q+22)
+	DFNMS(C01, A03, B31)
+	LQD(A3, blkA2, 3*Q+22)
+	DFNMS(C02, A03, B32)
+	LQD(B00, blkB2, OFFSET+0)
+	DFNMS(C03, A03, B33)
+	LQD(B01, blkB2, OFFSET+1)
+	DFNMS(C10, A13, B30)
+	LQD(B02, blkB2, OFFSET+2)
+	DFNMS(C11, A13, B31)
+	LQD(B03, blkB2, OFFSET+3)
+	DFNMS(C12, A13, B32)
+	SHUFB(A00, A0, A0, pat0)
+	DFNMS(C13, A13, B33)
+	SHUFB(A10, A1, A1, pat0)
+	DFNMS(C20, A23, B30)
+	SHUFB(A20, A2, A2, pat0)
+	DFNMS(C21, A23, B31)
+	SHUFB(A30, A3, A3, pat0)
+	DFNMS(C22, A23, B32)
+	DFNMS(C23, A23, B33)
+	DFNMS(C30, A33, B30)
+	DFNMS(C31, A33, B31)
+	DFNMS(C32, A33, B32)
+	DFNMS(C33, A33, B33)
+	
+	/* STAGE 11 */
+	#undef  OFFSET  
+	#define OFFSET	-3*Q
+	DFNMS(C00, A00, B00)
+	DFNMS(C01, A00, B01)
+	DFNMS(C02, A00, B02)
+	LQD(B10, blkB2, OFFSET+0)
+	DFNMS(C03, A00, B03)
+	LQD(B11, blkB2, OFFSET+1)
+	DFNMS(C10, A10, B00)
+	LQD(B12, blkB2, OFFSET+2)
+	DFNMS(C11, A10, B01)
+	LQD(B13, blkB2, OFFSET+3)
+	DFNMS(C12, A10, B02)
+	SHUFB(A01, A0, A0, pat1)	
+	DFNMS(C13, A10, B03)
+	SHUFB(A11, A1, A1, pat1)	
+	DFNMS(C20, A20, B00)
+	SHUFB(A21, A2, A2, pat1)	
+	DFNMS(C21, A20, B01)
+	SHUFB(A31, A3, A3, pat1)	
+	DFNMS(C22, A20, B02)
+	LQD(A0, blkA2, 0*Q+23)
+	DFNMS(C23, A20, B03)
+	LQD(A1, blkA2, 1*Q+23)
+	DFNMS(C30, A30, B00)
+	DFNMS(C31, A30, B01)
+	DFNMS(C32, A30, B02)
+	DFNMS(C33, A30, B03)
+
+	#undef  OFFSET  
+	#define OFFSET	-2*Q
+	DFNMS(C00, A01, B10)
+	LQD(A2, blkA2, 2*Q+23)
+	DFNMS(C01, A01, B11)
+	LQD(A3, blkA2, 3*Q+23)
+	DFNMS(C02, A01, B12)
+	LQD(B20, blkB2, OFFSET+0)
+	DFNMS(C03, A01, B13)
+	LQD(B21, blkB2, OFFSET+1)
+	DFNMS(C10, A11, B10)
+	LQD(B22, blkB2, OFFSET+2)
+	DFNMS(C11, A11, B11)
+	LQD(B23, blkB2, OFFSET+3)
+	DFNMS(C12, A11, B12)
+	SHUFB(A02, A0, A0, pat0)
+	DFNMS(C13, A11, B13)
+	SHUFB(A12, A1, A1, pat0)
+	DFNMS(C20, A21, B10)
+	SHUFB(A22, A2, A2, pat0)
+	DFNMS(C21, A21, B11)
+	SHUFB(A32, A3, A3, pat0)
+	DFNMS(C22, A21, B12)
+	DFNMS(C23, A21, B13)
+	DFNMS(C30, A31, B10)
+	DFNMS(C31, A31, B11)
+	DFNMS(C32, A31, B12)
+	DFNMS(C33, A31, B13)
+
+	#undef  OFFSET  
+	#define OFFSET	-1*Q
+	DFNMS(C00, A02, B20)
+	DFNMS(C01, A02, B21)
+	DFNMS(C02, A02, B22)
+	LQD(B30, blkB2, OFFSET+0)
+	DFNMS(C03, A02, B23)
+	LQD(B31, blkB2, OFFSET+1)
+	DFNMS(C10, A12, B20)
+	LQD(B32, blkB2, OFFSET+2)
+	DFNMS(C11, A12, B21)
+	LQD(B33, blkB2, OFFSET+3)
+	DFNMS(C12, A12, B22)
+	SHUFB(A03, A0, A0, pat1)	
+	DFNMS(C13, A12, B23)
+	SHUFB(A13, A1, A1, pat1)	
+	DFNMS(C20, A22, B20)
+	SHUFB(A23, A2, A2, pat1)	
+	DFNMS(C21, A22, B21)
+	SHUFB(A33, A3, A3, pat1)	
+	DFNMS(C22, A22, B22)
+	LQD(A0, blkA2, 0*Q+24)
+	DFNMS(C23, A22, B23)
+	LQD(A1, blkA2, 1*Q+24)
+	DFNMS(C30, A32, B20)
+	DFNMS(C31, A32, B21)
+	DFNMS(C32, A32, B22)
+	DFNMS(C33, A32, B23)
+
+	#undef  OFFSET  
+	#define OFFSET	0*Q
+	DFNMS(C00, A03, B30)
+	LQD(A2, blkA2, 2*Q+24)
+	DFNMS(C01, A03, B31)
+	LQD(A3, blkA2, 3*Q+24)
+	DFNMS(C02, A03, B32)
+	LQD(B00, blkB2, OFFSET+0)
+	DFNMS(C03, A03, B33)
+	LQD(B01, blkB2, OFFSET+1)
+	DFNMS(C10, A13, B30)
+	LQD(B02, blkB2, OFFSET+2)
+	DFNMS(C11, A13, B31)
+	LQD(B03, blkB2, OFFSET+3)
+	DFNMS(C12, A13, B32)
+	SHUFB(A00, A0, A0, pat0)
+	DFNMS(C13, A13, B33)
+	SHUFB(A10, A1, A1, pat0)
+	DFNMS(C20, A23, B30)
+	SHUFB(A20, A2, A2, pat0)
+	DFNMS(C21, A23, B31)
+	SHUFB(A30, A3, A3, pat0)
+	DFNMS(C22, A23, B32)
+	DFNMS(C23, A23, B33)
+	DFNMS(C30, A33, B30)
+	DFNMS(C31, A33, B31)
+	DFNMS(C32, A33, B32)
+	DFNMS(C33, A33, B33)
+	
+	/* STAGE 12 */
+	#undef  OFFSET  
+	#define OFFSET	1*Q
+	DFNMS(C00, A00, B00)
+	DFNMS(C01, A00, B01)
+	DFNMS(C02, A00, B02)
+	LQD(B10, blkB2, OFFSET+0)
+	DFNMS(C03, A00, B03)
+	LQD(B11, blkB2, OFFSET+1)
+	DFNMS(C10, A10, B00)
+	LQD(B12, blkB2, OFFSET+2)
+	DFNMS(C11, A10, B01)
+	LQD(B13, blkB2, OFFSET+3)
+	DFNMS(C12, A10, B02)
+	SHUFB(A01, A0, A0, pat1)	
+	DFNMS(C13, A10, B03)
+	SHUFB(A11, A1, A1, pat1)	
+	DFNMS(C20, A20, B00)
+	SHUFB(A21, A2, A2, pat1)	
+	DFNMS(C21, A20, B01)
+	SHUFB(A31, A3, A3, pat1)	
+	DFNMS(C22, A20, B02)
+	LQD(A0, blkA2, 0*Q+25)
+	DFNMS(C23, A20, B03)
+	LQD(A1, blkA2, 1*Q+25)
+	DFNMS(C30, A30, B00)
+	DFNMS(C31, A30, B01)
+	DFNMS(C32, A30, B02)
+	DFNMS(C33, A30, B03)
+
+	#undef  OFFSET  
+	#define OFFSET	2*Q
+	DFNMS(C00, A01, B10)
+	LQD(A2, blkA2, 2*Q+25)
+	DFNMS(C01, A01, B11)
+	LQD(A3, blkA2, 3*Q+25)
+	DFNMS(C02, A01, B12)
+	LQD(B20, blkB2, OFFSET+0)
+	DFNMS(C03, A01, B13)
+	LQD(B21, blkB2, OFFSET+1)
+	DFNMS(C10, A11, B10)
+	LQD(B22, blkB2, OFFSET+2)
+	DFNMS(C11, A11, B11)
+	LQD(B23, blkB2, OFFSET+3)
+	DFNMS(C12, A11, B12)
+	SHUFB(A02, A0, A0, pat0)
+	DFNMS(C13, A11, B13)
+	SHUFB(A12, A1, A1, pat0)
+	DFNMS(C20, A21, B10)
+	SHUFB(A22, A2, A2, pat0)
+	DFNMS(C21, A21, B11)
+	SHUFB(A32, A3, A3, pat0)
+	DFNMS(C22, A21, B12)
+	DFNMS(C23, A21, B13)
+	DFNMS(C30, A31, B10)
+	DFNMS(C31, A31, B11)
+	DFNMS(C32, A31, B12)
+	DFNMS(C33, A31, B13)
+
+	#undef  OFFSET  
+	#define OFFSET	3*Q
+	DFNMS(C00, A02, B20)
+	DFNMS(C01, A02, B21)
+	DFNMS(C02, A02, B22)
+	LQD(B30, blkB2, OFFSET+0)
+	DFNMS(C03, A02, B23)
+	LQD(B31, blkB2, OFFSET+1)
+	DFNMS(C10, A12, B20)
+	LQD(B32, blkB2, OFFSET+2)
+	DFNMS(C11, A12, B21)
+	LQD(B33, blkB2, OFFSET+3)
+	DFNMS(C12, A12, B22)
+	SHUFB(A03, A0, A0, pat1)	
+	DFNMS(C13, A12, B23)
+	SHUFB(A13, A1, A1, pat1)	
+	DFNMS(C20, A22, B20)
+	SHUFB(A23, A2, A2, pat1)	
+	DFNMS(C21, A22, B21)
+	SHUFB(A33, A3, A3, pat1)	
+	DFNMS(C22, A22, B22)
+	LQD(A0, blkA2, 0*Q+26)
+	DFNMS(C23, A22, B23)
+	LQD(A1, blkA2, 1*Q+26)
+	DFNMS(C30, A32, B20)
+	DFNMS(C31, A32, B21)
+	DFNMS(C32, A32, B22)
+	DFNMS(C33, A32, B23)
+
+	#undef  OFFSET  
+	#define OFFSET	4*Q
+	DFNMS(C00, A03, B30)
+	LQD(A2, blkA2, 2*Q+26)
+	DFNMS(C01, A03, B31)
+	LQD(A3, blkA2, 3*Q+26)
+	DFNMS(C02, A03, B32)
+	LQD(B00, blkB2, OFFSET+0)
+	DFNMS(C03, A03, B33)
+	LQD(B01, blkB2, OFFSET+1)
+	DFNMS(C10, A13, B30)
+	LQD(B02, blkB2, OFFSET+2)
+	DFNMS(C11, A13, B31)
+	LQD(B03, blkB2, OFFSET+3)
+	DFNMS(C12, A13, B32)
+	SHUFB(A00, A0, A0, pat0)
+	DFNMS(C13, A13, B33)
+	SHUFB(A10, A1, A1, pat0)
+	DFNMS(C20, A23, B30)
+	SHUFB(A20, A2, A2, pat0)
+	DFNMS(C21, A23, B31)
+	SHUFB(A30, A3, A3, pat0)
+	DFNMS(C22, A23, B32)
+	DFNMS(C23, A23, B33)
+	DFNMS(C30, A33, B30)
+	DFNMS(C31, A33, B31)
+	DFNMS(C32, A33, B32)
+	DFNMS(C33, A33, B33)
+	
+	/* STAGE 13 */
+	#undef  OFFSET  
+	#define OFFSET	5*Q
+	DFNMS(C00, A00, B00)
+	DFNMS(C01, A00, B01)
+	DFNMS(C02, A00, B02)
+	LQD(B10, blkB2, OFFSET+0)
+	DFNMS(C03, A00, B03)
+	LQD(B11, blkB2, OFFSET+1)
+	DFNMS(C10, A10, B00)
+	LQD(B12, blkB2, OFFSET+2)
+	DFNMS(C11, A10, B01)
+	LQD(B13, blkB2, OFFSET+3)
+	DFNMS(C12, A10, B02)
+	SHUFB(A01, A0, A0, pat1)	
+	DFNMS(C13, A10, B03)
+	SHUFB(A11, A1, A1, pat1)	
+	DFNMS(C20, A20, B00)
+	SHUFB(A21, A2, A2, pat1)	
+	DFNMS(C21, A20, B01)
+	SHUFB(A31, A3, A3, pat1)
+	DFNMS(C22, A20, B02)
+	LQD(A0, blkA2, 0*Q+27)
+	DFNMS(C23, A20, B03)
+	LQD(A1, blkA2, 1*Q+27)
+	DFNMS(C30, A30, B00)
+	DFNMS(C31, A30, B01)
+	DFNMS(C32, A30, B02)
+	DFNMS(C33, A30, B03)
+
+	#undef  OFFSET  
+	#define OFFSET	6*Q
+	DFNMS(C00, A01, B10)
+	LQD(A2, blkA2, 2*Q+27)
+	DFNMS(C01, A01, B11)
+	LQD(A3, blkA2, 3*Q+27)
+	DFNMS(C02, A01, B12)
+	LQD(B20, blkB2, OFFSET+0)
+	DFNMS(C03, A01, B13)
+	LQD(B21, blkB2, OFFSET+1)
+	DFNMS(C10, A11, B10)
+	LQD(B22, blkB2, OFFSET+2)
+	DFNMS(C11, A11, B11)
+	LQD(B23, blkB2, OFFSET+3)
+	DFNMS(C12, A11, B12)
+	SHUFB(A02, A0, A0, pat0)
+	DFNMS(C13, A11, B13)
+	SHUFB(A12, A1, A1, pat0)
+	DFNMS(C20, A21, B10)
+	SHUFB(A22, A2, A2, pat0)
+	DFNMS(C21, A21, B11)
+	SHUFB(A32, A3, A3, pat0)
+	DFNMS(C22, A21, B12)
+	DFNMS(C23, A21, B13)
+	DFNMS(C30, A31, B10)
+	DFNMS(C31, A31, B11)
+	DFNMS(C32, A31, B12)
+	DFNMS(C33, A31, B13)
+
+	#undef  OFFSET  
+	#define OFFSET	7*Q
+	DFNMS(C00, A02, B20)
+	DFNMS(C01, A02, B21)
+	DFNMS(C02, A02, B22)
+	LQD(B30, blkB2, OFFSET+0)
+	DFNMS(C03, A02, B23)
+	LQD(B31, blkB2, OFFSET+1)
+	DFNMS(C10, A12, B20)
+	LQD(B32, blkB2, OFFSET+2)
+	DFNMS(C11, A12, B21)
+	LQD(B33, blkB2, OFFSET+3)
+	DFNMS(C12, A12, B22)
+	SHUFB(A03, A0, A0, pat1)	
+	DFNMS(C13, A12, B23)
+	SHUFB(A13, A1, A1, pat1)	
+	DFNMS(C20, A22, B20)
+	SHUFB(A23, A2, A2, pat1)	
+	DFNMS(C21, A22, B21)
+	SHUFB(A33, A3, A3, pat1)	
+	DFNMS(C22, A22, B22)
+	LQD(A0, blkA2, 0*Q+28)
+	DFNMS(C23, A22, B23)
+	LQD(A1, blkA2, 1*Q+28)
+	DFNMS(C30, A32, B20)
+	DFNMS(C31, A32, B21)
+	DFNMS(C32, A32, B22)
+	DFNMS(C33, A32, B23)
+
+	#undef  OFFSET  
+	#define OFFSET	8*Q
+	DFNMS(C00, A03, B30)
+	LQD(A2, blkA2, 2*Q+28)
+	DFNMS(C01, A03, B31)
+	LQD(A3, blkA2, 3*Q+28)
+	DFNMS(C02, A03, B32)
+	LQD(B00, blkB2, OFFSET+0)
+	DFNMS(C03, A03, B33)
+	LQD(B01, blkB2, OFFSET+1)
+	DFNMS(C10, A13, B30)
+	LQD(B02, blkB2, OFFSET+2)
+	DFNMS(C11, A13, B31)
+	LQD(B03, blkB2, OFFSET+3)
+	DFNMS(C12, A13, B32)
+	SHUFB(A00, A0, A0, pat0)
+	DFNMS(C13, A13, B33)
+	SHUFB(A10, A1, A1, pat0)
+	DFNMS(C20, A23, B30)
+	SHUFB(A20, A2, A2, pat0)
+	DFNMS(C21, A23, B31)
+	SHUFB(A30, A3, A3, pat0)
+	DFNMS(C22, A23, B32)
+	DFNMS(C23, A23, B33)
+	DFNMS(C30, A33, B30)
+	DFNMS(C31, A33, B31)
+	DFNMS(C32, A33, B32)
+	DFNMS(C33, A33, B33)
+	
+	/* STAGE 14 */
+	#undef  OFFSET  
+	#define OFFSET	9*Q
+	DFNMS(C00, A00, B00)
+	DFNMS(C01, A00, B01)
+	DFNMS(C02, A00, B02)
+	LQD(B10, blkB2, OFFSET+0)
+	DFNMS(C03, A00, B03)
+	LQD(B11, blkB2, OFFSET+1)
+	DFNMS(C10, A10, B00)
+	LQD(B12, blkB2, OFFSET+2)
+	DFNMS(C11, A10, B01)
+	LQD(B13, blkB2, OFFSET+3)
+	DFNMS(C12, A10, B02)
+	SHUFB(A01, A0, A0, pat1)	
+	DFNMS(C13, A10, B03)
+	SHUFB(A11, A1, A1, pat1)	
+	DFNMS(C20, A20, B00)
+	SHUFB(A21, A2, A2, pat1)	
+	DFNMS(C21, A20, B01)
+	SHUFB(A31, A3, A3, pat1)	
+	DFNMS(C22, A20, B02)
+	LQD(A0, blkA2, 0*Q+29)
+	DFNMS(C23, A20, B03)
+	LQD(A1, blkA2, 1*Q+29)
+	DFNMS(C30, A30, B00)
+	DFNMS(C31, A30, B01)
+	DFNMS(C32, A30, B02)
+	DFNMS(C33, A30, B03)
+
+	#undef  OFFSET  
+	#define OFFSET	10*Q
+	DFNMS(C00, A01, B10)
+	LQD(A2, blkA2, 2*Q+29)
+	DFNMS(C01, A01, B11)
+	LQD(A3, blkA2, 3*Q+29)
+	DFNMS(C02, A01, B12)
+	LQD(B20, blkB2, OFFSET+0)
+	DFNMS(C03, A01, B13)
+	LQD(B21, blkB2, OFFSET+1)
+	DFNMS(C10, A11, B10)
+	LQD(B22, blkB2, OFFSET+2)
+	DFNMS(C11, A11, B11)
+	LQD(B23, blkB2, OFFSET+3)
+	DFNMS(C12, A11, B12)
+	SHUFB(A02, A0, A0, pat0)
+	DFNMS(C13, A11, B13)
+	SHUFB(A12, A1, A1, pat0)
+	DFNMS(C20, A21, B10)
+	SHUFB(A22, A2, A2, pat0)
+	DFNMS(C21, A21, B11)
+	SHUFB(A32, A3, A3, pat0)
+	DFNMS(C22, A21, B12)
+	DFNMS(C23, A21, B13)
+	DFNMS(C30, A31, B10)
+	DFNMS(C31, A31, B11)
+	DFNMS(C32, A31, B12)
+	DFNMS(C33, A31, B13)
+
+	#undef  OFFSET  
+	#define OFFSET	11*Q
+	DFNMS(C00, A02, B20)
+	DFNMS(C01, A02, B21)
+	DFNMS(C02, A02, B22)
+	LQD(B30, blkB2, OFFSET+0)
+	DFNMS(C03, A02, B23)
+	LQD(B31, blkB2, OFFSET+1)
+	DFNMS(C10, A12, B20)
+	LQD(B32, blkB2, OFFSET+2)
+	DFNMS(C11, A12, B21)
+	LQD(B33, blkB2, OFFSET+3)
+	DFNMS(C12, A12, B22)
+	SHUFB(A03, A0, A0, pat1)	
+	DFNMS(C13, A12, B23)
+	SHUFB(A13, A1, A1, pat1)	
+	DFNMS(C20, A22, B20)
+	SHUFB(A23, A2, A2, pat1)	
+	DFNMS(C21, A22, B21)
+	SHUFB(A33, A3, A3, pat1)	
+	DFNMS(C22, A22, B22)
+	LQD(A0, blkA2, 0*Q+30)
+	DFNMS(C23, A22, B23)
+	LQD(A1, blkA2, 1*Q+30)
+	DFNMS(C30, A32, B20)
+	DFNMS(C31, A32, B21)
+	DFNMS(C32, A32, B22)
+	DFNMS(C33, A32, B23)
+
+	#undef  OFFSET  
+	#define OFFSET	12*Q
+	DFNMS(C00, A03, B30)
+	LQD(A2, blkA2, 2*Q+30)
+	DFNMS(C01, A03, B31)
+	LQD(A3, blkA2, 3*Q+30)
+	DFNMS(C02, A03, B32)
+	LQD(B00, blkB2, OFFSET+0)
+	DFNMS(C03, A03, B33)
+	LQD(B01, blkB2, OFFSET+1)
+	DFNMS(C10, A13, B30)
+	LQD(B02, blkB2, OFFSET+2)
+	DFNMS(C11, A13, B31)
+	LQD(B03, blkB2, OFFSET+3)
+	DFNMS(C12, A13, B32)
+	SHUFB(A00, A0, A0, pat0)
+	DFNMS(C13, A13, B33)
+	SHUFB(A10, A1, A1, pat0)
+	DFNMS(C20, A23, B30)
+	SHUFB(A20, A2, A2, pat0)
+	DFNMS(C21, A23, B31)
+	SHUFB(A30, A3, A3, pat0)
+	DFNMS(C22, A23, B32)
+	shufb	blkA1, pointers, pointers, pat_blkA1
+	DFNMS(C23, A23, B33)
+	shufb	blkB1, pointers, pointers, pat_blkB1
+	DFNMS(C30, A33, B30)
+	DFNMS(C31, A33, B31)
+	DFNMS(C32, A33, B32)
+	DFNMS(C33, A33, B33)
+	
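+	/* blkA1/blkB1 for the next pass are re-derived from the advanced
+	 * `pointers` register; the shlqbii-by-4 below scales the packed halfword
+	 * from a 16-byte-unit index to a byte address (an inference from the
+	 * surrounding code). */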
+	/* STAGE 15 */
+	#undef  OFFSET  
+	#define OFFSET	13*Q
+	DFNMS(C00, A00, B00)
+	DFNMS(C01, A00, B01)
+	DFNMS(C02, A00, B02)
+	LQD(B10, blkB2, OFFSET+0)
+	DFNMS(C03, A00, B03)
+	LQD(B11, blkB2, OFFSET+1)
+	DFNMS(C10, A10, B00)
+	LQD(B12, blkB2, OFFSET+2)
+	DFNMS(C11, A10, B01)
+	LQD(B13, blkB2, OFFSET+3)
+	DFNMS(C12, A10, B02)
+	SHUFB(A01, A0, A0, pat1)	
+	DFNMS(C13, A10, B03)
+	SHUFB(A11, A1, A1, pat1)	
+	DFNMS(C20, A20, B00)
+	SHUFB(A21, A2, A2, pat1)	
+	DFNMS(C21, A20, B01)
+	SHUFB(A31, A3, A3, pat1)	
+	DFNMS(C22, A20, B02)
+	shlqbii	blkA1, blkA1, 4
+	DFNMS(C23, A20, B03)
+	shlqbii	blkB1, blkB1, 4
+	DFNMS(C30, A30, B00)
+	LQD(A0, blkA2, 0*Q+31)
+	DFNMS(C31, A30, B01)
+	LQD(A1, blkA2, 1*Q+31)
+	DFNMS(C32, A30, B02)
+	DFNMS(C33, A30, B03)
+
+	#undef  OFFSET  
+	#define OFFSET	14*Q
+	DFNMS(C00, A01, B10)
+	LQD(A2, blkA2, 2*Q+31)
+	DFNMS(C01, A01, B11)
+	LQD(A3, blkA2, 3*Q+31)
+	DFNMS(C02, A01, B12)
+	LQD(B20, blkB2, OFFSET+0)
+	DFNMS(C03, A01, B13)
+	LQD(B21, blkB2, OFFSET+1)
+	DFNMS(C10, A11, B10)
+	LQD(B22, blkB2, OFFSET+2)
+	DFNMS(C11, A11, B11)
+	LQD(B23, blkB2, OFFSET+3)
+	DFNMS(C12, A11, B12)
+	SHUFB(A02, A0, A0, pat0)
+	DFNMS(C13, A11, B13)
+	SHUFB(A12, A1, A1, pat0)
+	DFNMS(C20, A21, B10)
+	SHUFB(A22, A2, A2, pat0)
+	DFNMS(C21, A21, B11)
+	SHUFB(A32, A3, A3, pat0)
+	DFNMS(C22, A21, B12)
+	hbr	loop_inst, target
+	DFNMS(C23, A21, B13)
+	lnop
+	DFNMS(C30, A31, B10)
+	DFNMS(C31, A31, B11)
+	DFNMS(C32, A31, B12)
+	DFNMS(C33, A31, B13)
+
+	#undef  OFFSET  
+	#define OFFSET	15*Q
+	DFNMS(C00, A02, B20)
+	DFNMS(C01, A02, B21)
+	DFNMS(C02, A02, B22)
+	LQD(B30, blkB2, OFFSET+0)
+	DFNMS(C03, A02, B23)
+	LQD(B31, blkB2, OFFSET+1)
+	DFNMS(C10, A12, B20)
+	LQD(B32, blkB2, OFFSET+2)
+	DFNMS(C11, A12, B21)
+	LQD(B33, blkB2, OFFSET+3)
+	DFNMS(C12, A12, B22)
+	SHUFB(A03, A0, A0, pat1)	
+	DFNMS(C13, A12, B23)
+	SHUFB(A13, A1, A1, pat1)	
+	DFNMS(C20, A22, B20)
+	SHUFB(A23, A2, A2, pat1)	
+	DFNMS(C21, A22, B21)
+	SHUFB(A33, A3, A3, pat1)	
+	DFNMS(C22, A22, B22)
+	LQD(A0, blkA1, 0*Q+0)
+	DFNMS(C23, A22, B23)
+	LQD(A1, blkA1, 1*Q+0)
+	DFNMS(C30, A32, B20)
+	DFNMS(C31, A32, B21)
+	DFNMS(C32, A32, B22)
+	lnop
+	DFNMS(C33, A32, B23)
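+	/* `target` was read from the flow table (rotqbyi above) and hinted with
+	 * hbr loop_inst, so the indirect branch below repeats the loop or falls
+	 * through to loop_end without a stall when the hint is correct. */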
+loop_inst:	
+	bi	target
+	
+	.align	6
+loop_end:	
+
+	#undef  OFFSET  
+	#define OFFSET	-16*Q
+	DFNMS(C00, A03, B30)
+	hbr return, $0	
+	DFNMS(C01, A03, B31)
+	DFNMS(C02, A03, B32)
+	
+	DFNMS(C03, A03, B33)
+	DFNMS(C10, A13, B30)
+	
+	DFNMS(C11, A13, B31)
+	DFNMS(C12, A13, B32)
+	
+	DFNMS(C13, A13, B33)
+	DFNMS(C20, A23, B30)
+	
+	DFNMS(C21, A23, B31)
+	STQD(C00, blkC, 0)
+	
+	DFNMS(C22, A23, B32)
+	STQD(C01, blkC, 1)
+	
+	DFNMS(C23, A23, B33)
+	STQD(C02, blkC, 2)
+	DFNMS(C30, A33, B30)
+	STQD(C03, blkC, 3)
+	DFNMS(C31, A33, B31)
+	STQD(C10, blkC, 0+Q)
+	DFNMS(C32, A33, B32)
+	STQD(C11, blkC, 1+Q)
+	DFNMS(C33, A33, B33)
+	STQD(C12, blkC, 2+Q)
+	
+	STQD(C13, blkC, 3+Q)
+	STQD(C20, blkC, 0+2*Q)
+	
+	STQD(C21, blkC, 1+2*Q)
+	STQD(C22, blkC, 2+2*Q)
+	
+	STQD(C23, blkC, 3+2*Q)
+	STQD(C30, blkC, 0+3*Q)
+	
+	STQD(C31, blkC, 1+3*Q)
+	STQD(C32, blkC, 2+3*Q)
+	
+	STQD(C33, blkC, 3+3*Q)
+return:		
+	bi	$0
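+
+/* Illustrative sketch (not assembled): in plain C, one pass of the loop above
+ * computes, per 4x4-quadword C tile,
+ *
+ *     for (k = 0; k < 64; k++)                // 16 stages x 4 OFFSET sub-blocks
+ *         for (i = 0; i < 4; i++)             // C row quadwords C0x..C3x
+ *             for (j = 0; j < 4; j++)         // C column quadwords Cx0..Cx3
+ *                 C[i][j] -= A[i][k] * B[k][j];   // dfnms: c = c - a*b
+ *
+ * where A[i][k] is a scalar splatted across a quadword by SHUFB and B[k][j]
+ * is a quadword row of B.  The loop bounds and the dfnms semantics are
+ * assumptions read off the macro names and the surrounding schedule. */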
Index: accel/lib/spu/accel_reform.c
===================================================================
RCS file: accel/lib/spu/accel_reform.c
diff -N accel/lib/spu/accel_reform.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/spu/accel_reform.c	18 May 2008 21:11:28 -0000	1.4
@@ -0,0 +1,10 @@
+/* --------------------------------------------------------------  */
+/* (C)Copyright 2001,2006,                                         */
+/* International Business Machines Corporation,                    */
+/*                                                                 */
+/* All Rights Reserved.                                            */
+/* --------------------------------------------------------------  */
+
+/* All code previously contained in this file has been moved to
+ * separate source files.
+ */
Index: accel/lib/spu/accel_reform.h
===================================================================
RCS file: accel/lib/spu/accel_reform.h
diff -N accel/lib/spu/accel_reform.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/spu/accel_reform.h	20 Aug 2008 03:57:53 -0000	1.3
@@ -0,0 +1,103 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#ifndef _ACCEL_REFORM_H_
+#define _ACCEL_REFORM_H_	1
+
+
+
+/* Reformat a block, in 8 quadword chunks
+ */
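+/* A note on the shuffle patterns (they are defined in the .c files that
+ * include this header): pat_even gathers double 0 of two adjacent rows and
+ * pat_odd gathers double 1, so each pair of row loads is de-interleaved into
+ * the even/odd-column layout of the block format.
+ */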
+#define REFORM_8(_dst, _src, _row, _col, _even, _odd)		\
+  a0 = _src[(_row+0)*M_SUB/2];					\
+  a1 = _src[(_row+1)*M_SUB/2];					\
+  a2 = _src[(_row+2)*M_SUB/2];					\
+  a3 = _src[(_row+3)*M_SUB/2];					\
+  a4 = _src[(_row+4)*M_SUB/2];					\
+  a5 = _src[(_row+5)*M_SUB/2];					\
+  a6 = _src[(_row+6)*M_SUB/2];					\
+  a7 = _src[(_row+7)*M_SUB/2];					\
+								\
+  _dst[_col+0 + 0*M_SUB/2] = spu_shuffle(a0, a1, _even);	\
+  _dst[_col+0 + 1*M_SUB/2] = spu_shuffle(a0, a1, _odd);		\
+  _dst[_col+1 + 0*M_SUB/2] = spu_shuffle(a2, a3, _even);	\
+  _dst[_col+1 + 1*M_SUB/2] = spu_shuffle(a2, a3, _odd);		\
+  _dst[_col+2 + 0*M_SUB/2] = spu_shuffle(a4, a5, _even);	\
+  _dst[_col+2 + 1*M_SUB/2] = spu_shuffle(a4, a5, _odd);		\
+  _dst[_col+3 + 0*M_SUB/2] = spu_shuffle(a6, a7, _even);	\
+  _dst[_col+3 + 1*M_SUB/2] = spu_shuffle(a6, a7, _odd);
+
+
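+/* Build a DMA list of 32 quadwords; each quadword packs two {size, low-address}
+ * list elements, giving 64 transfers of M_SUB doubles each (one full block
+ * column when M_SUB is 64).  The four element vectors are stored round-robin,
+ * and each is bumped by element_next (8 rows of stride) after use, so
+ * successive list elements fetch consecutive rows.
+ */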
+static inline void fill_dma_list(volatile vec_uint4 *list, vec_uint4 element0, vec_uint4 element1, vec_uint4 element2, vec_uint4 element3, vec_uint4 element_next)
+{
+  list[0]  = element0;
+  list[1]  = element1; element0 = spu_add(element0, element_next);
+  list[2]  = element2; element1 = spu_add(element1, element_next);
+  list[3]  = element3; element2 = spu_add(element2, element_next);
+  list[4]  = element0; element3 = spu_add(element3, element_next);
+  list[5]  = element1; element0 = spu_add(element0, element_next);
+  list[6]  = element2; element1 = spu_add(element1, element_next);
+  list[7]  = element3; element2 = spu_add(element2, element_next);
+  list[8]  = element0; element3 = spu_add(element3, element_next);
+  list[9]  = element1; element0 = spu_add(element0, element_next);
+  list[10] = element2; element1 = spu_add(element1, element_next);
+  list[11] = element3; element2 = spu_add(element2, element_next);
+  list[12] = element0; element3 = spu_add(element3, element_next);
+  list[13] = element1; element0 = spu_add(element0, element_next);
+  list[14] = element2; element1 = spu_add(element1, element_next);
+  list[15] = element3; element2 = spu_add(element2, element_next);
+  list[16] = element0; element3 = spu_add(element3, element_next);
+  list[17] = element1; element0 = spu_add(element0, element_next);
+  list[18] = element2; element1 = spu_add(element1, element_next);
+  list[19] = element3; element2 = spu_add(element2, element_next);
+  list[20] = element0; element3 = spu_add(element3, element_next);
+  list[21] = element1; element0 = spu_add(element0, element_next);
+  list[22] = element2; element1 = spu_add(element1, element_next);
+  list[23] = element3; element2 = spu_add(element2, element_next);
+  list[24] = element0; element3 = spu_add(element3, element_next);
+  list[25] = element1; element0 = spu_add(element0, element_next);
+  list[26] = element2; element1 = spu_add(element1, element_next);
+  list[27] = element3; element2 = spu_add(element2, element_next);
+  list[28] = element0; element3 = spu_add(element3, element_next);
+  list[29] = element1; 
+  list[30] = element2; 
+  list[31] = element3; 
+}
+
+#if 0
+void dump_dma_list(double *ls, unsigned int hi, unsigned int list, unsigned int size, unsigned int tag, 
+		   unsigned int cmd)
+{
+  int i;
+  unsigned int len, lo;
+  unsigned int *plist;
+  unsigned int l;
+
+  plist = (unsigned int *)list;
+
+  printf("CMD=%d size=%d tag=%d hi=0x%x\n", cmd, size, tag, hi);
+  for (i=0; i<(int)size/4; i+=2) {
+    len = plist[i+0];
+    lo  = plist[i+1];
+    printf("0x%08x\t", lo);
+    for (l=0; l<len/8; l++) {
+      printf("%f ", *ls++);
+      if ((l & 7) == 7) printf("\n\t");
+    }
+    printf("\n");
+  }
+}
+#endif
+
+/* Optimized function to clamp an integer value to the range 0 to 32.
+ */
+static inline unsigned int clamp_0_32(int in)
+{
+  vec_int4 in_v = spu_promote(in, 0);
+  
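+  /* Branchless clamp: AND with the (in > 0) mask zeroes negative inputs, and
+   * spu_sel then substitutes 32 wherever (in > 32). */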
+  return (spu_extract(spu_sel(spu_and(in_v, (vec_int4)spu_cmpgt(in_v, 0)), spu_splats(32), spu_cmpgt(in_v, 32)), 0));
+}
+
+#endif	/* _ACCEL_REFORM_H_ */
Index: accel/lib/spu/accel_reform_matrix_CL_to_B.c
===================================================================
RCS file: accel/lib/spu/accel_reform_matrix_CL_to_B.c
diff -N accel/lib/spu/accel_reform_matrix_CL_to_B.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/spu/accel_reform_matrix_CL_to_B.c	20 Aug 2008 03:57:53 -0000	1.3
@@ -0,0 +1,400 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <spu_mfcio.h>
+#include <spu_intrinsics.h>
+#include "hpl_accel_spu.h"
+#include "accel_buffers.h"
+#include "accel_utils.h"
+#include "accel_reform.h"
+
+void accel_reform_matrix_CL_to_B(hpl_accel_init_parms_t *parms, 
+				 volatile hpl_accel_reform_matrix_CL_to_B_parms_t *cmd_parms)
+{
+  int i;
+  unsigned int x, y;
+  unsigned int id;
+  unsigned long long a, scratch;
+  unsigned int a_hi, a_lo, out_hi, out_lo;
+  unsigned int scratch_hi, scratch_lo, lo;
+  unsigned int n, nb, m, mb, m_pad, lda, spes, trailing, left;
+  unsigned int dst_idx;
+  unsigned int tag, next_tag;
+  unsigned int retained;	/* Number of buffers kept in local store instead of the scratch buffer */
+  vec_uint4 next_col_blk, next_row_blk;
+  vec_uint4 element0, element1, element2, element3, element_next;
+  volatile vec_uint4 *list;
+  vec_uint4 mask_0101 = (vec_uint4){0,-1,0,-1};
+#ifdef ACCEL_LITTLE_ENDIAN
+  vec_uchar16 pat_even = (vec_uchar16){7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
+#else
+  vec_uchar16 pat_even = (vec_uchar16){0,1,2,3,4,5,6,7, 16,17,18,19,20,21,22,23};
+#endif
+  vec_uchar16 pat_odd = spu_or(pat_even, 8);
+  vec_uchar16 pat_z0z0 = (vec_uchar16){128,128,128,128, 0,1,2,3, 128,128,128,128, 0,1,2,3};
+  vec_uchar16 pat_zzzz = (vec_uchar16){128,128,128,128, 128,128,128,128, 128,128,128,128, 128,128,128,128};
+  vec_double2 *srcTop, *srcBot, *dst, *buf;
+  vec_double2 a0, a1, a2, a3, a4, a5, a6, a7;
+#ifdef MATRIX_4GB_CROSSING
+  unsigned int in_hi;
+  vec_uint4 carry;
+#endif
+
+
+  id = parms->id;
+
+  /* Wait for the transfer of the parameters to complete
+   */
+  DMA_WAIT_RECEIVE();
+
+  /* Fetch the parameters 
+   */
+  a = cmd_parms->a;
+  scratch = cmd_parms->scratch;
+  lda = cmd_parms->lda;
+  m = cmd_parms->m;
+  n = cmd_parms->n;
+  spes = cmd_parms->spes;
+
+  /* Pad m and n to the nearest block and compute the number of blocks to be
+   * reformatted. Rows are padded with 0.0. Columns are filled in with don't-care
+   * values.
+   */
+  m_pad = (m % M_SUB) - 1;
+  mb = (m + M_SUB-1)/M_SUB;
+
+  nb = (n + M_SUB-1)/M_SUB;
+
+  /* Compute the amount of trailing data to zero after the blocked data.
+   */
+  trailing = (lda - mb*M_SUB*sizeof(double))*M_SUB;
+
+
+  a_hi = mfc_ea2h(a);
+  a_lo = mfc_ea2l(a);
+
+  MATRIX_EA_UMADD32(a_hi, a_lo, lda, id*M_SUB);
+
+#ifdef MATRIX_4GB_CROSSING
+  in_hi = a_hi;
+#endif
+
+  scratch_hi = mfc_ea2h(scratch);
+  scratch_lo = mfc_ea2l(scratch);
+
+  scratch_lo += id*(mb-4)*M_SUB*M_SUB*sizeof(double);
+
+  /* Compute all the working variables needed to generate the DMA lists.
+   *
+   * element0     = {M_SUB*sizeof(double), a_lo + 0*lda, M_SUB*sizeof(double), a_lo + 1*lda}
+   * element1     = {M_SUB*sizeof(double), a_lo + 2*lda, M_SUB*sizeof(double), a_lo + 3*lda}
+   * element2     = {M_SUB*sizeof(double), a_lo + 4*lda, M_SUB*sizeof(double), a_lo + 5*lda}
+   * element3     = {M_SUB*sizeof(double), a_lo + 6*lda, M_SUB*sizeof(double), a_lo + 7*lda}
+   * element_next = {                   0,        8*lda,                    0,        8*lda}
+   */
+  next_col_blk = spu_splats((unsigned int)(M_SUB*sizeof(double)));
+  next_row_blk = spu_and(spu_splats(lda*M_SUB*spes - mb*M_SUB*sizeof(double)), mask_0101);
+
+  element_next = spu_promote(8*lda, 0);
+  element_next = spu_shuffle(element_next, element_next, pat_z0z0);
+
+  element0 = spu_add(spu_rlmaskqwbyte(spu_rlmask(element_next, -3), -8),
+		     spu_sel(spu_splats((unsigned int)(M_SUB*sizeof(double))), spu_splats(a_lo), mask_0101));
+
+  next_col_blk = spu_and(next_col_blk, mask_0101);
+
+  element1  = spu_rlmask(element_next, -2);
+  element2  = spu_rlmask(element_next, -1);
+  element3  = spu_add(spu_add(element1, element2), element0);
+  element1  = spu_add(element1, element0);
+  element2  = spu_add(element2, element0);
+
+  /* Before starting, make sure all previous DMA transfers are completed so
+   * that all the LS buffers are known to be available.
+   */
+  DMA_WAIT(-1);
+
+  /* Reformat the blocks
+   */
+  tag = 0;
+
+  for (x=id; x<nb; x+=spes) {
+    /* Fetch a block's worth of data.
+     */
+    buf = &bufA[tag][0];
+    list = (volatile vec_uint4 *)&bufA[tag][((M_SUB-1)*M_SUB)/2];
+    fill_dma_list(list, element0, element1, element2, element3, element_next);
+
+#ifdef MATRIX_4GB_CROSSING
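+    /* spu_genc returns the carry-outs of a 32-bit add; a carry in the
+     * low-address slot means this block's row addresses wrap a 4GB boundary,
+     * so the list is split around the wrap below: GETLB up to it, one GETB
+     * across it, then GETLB for the remainder with in_hi+1. */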
+    carry = spu_genc(spu_rlqwbyte(element0, 4), spu_promote(M_SUB*lda, 0));
+
+    if (spu_extract(carry, 0)) {
+      /* Possible crossing occurred, determine where the crossing occurred */
+      int count;
+      unsigned int list_lo;
+      vec_uint4 addrs, start, cmp;
+	
+      count = -1;
+      start = spu_shuffle(element0, element0, ((vec_uchar16){4,5,6,7, 4,5,6,7, 4,5,6,7, 4,5,6,7}));
+
+      for (i=0; i<32; i+=2) {
+	addrs = spu_shuffle(list[i], list[i+1], ((vec_uchar16){4,5,6,7, 12,13,14,15, 20,21,22,23, 28,29,30,31}));
+	cmp = spu_gather(spu_cmpgt(start, addrs));
+	count += (int)spu_extract(spu_cntlz(cmp), 0) - 28;
+	if (spu_extract(cmp, 0)) break;
+      }
+
+      /* Issue the list up to the crossing. 
+       */
+      spu_mfcdma64(buf, in_hi, (unsigned int)list, 8*count, tag, MFC_GETLB_CMD);
+      
+      /* Issue the possible crossing list element.
+       */
+      list_lo = *((unsigned int *)list + 2*count + 1);
+      spu_mfcdma64((void *)buf+(count*(M_SUB*sizeof(double))), in_hi, list_lo, M_SUB*sizeof(double), tag, MFC_GETB_CMD);
+
+      /* Issue the remaining lists following the crossing.
+       */
+      count++;
+      if (count < 64) {
+	spu_mfcdma64((void *)buf+(count*(M_SUB*sizeof(double))), in_hi+1, (unsigned int)list+8*count, 8*(64-count), tag, MFC_GETLB_CMD);
+      }
+    } else {
+      /* No crossing occurred in this list, issue the whole list */
+      spu_mfcdma64(buf, in_hi, (unsigned int)list, 8*M_SUB, tag, MFC_GETL_CMD);
+    }
+    in_hi += spu_extract(spu_genc(element0, next_col_blk), 1);
+#else
+    spu_mfcdma64(buf, a_hi, (unsigned int)list, 8*M_SUB, tag, MFC_GETL_CMD);
+#endif
+    element0  = spu_add(element0, next_col_blk);
+    element1  = spu_add(element1, next_col_blk);
+    element2  = spu_add(element2, next_col_blk);
+    element3  = spu_add(element3, next_col_blk);
+
+    lo = scratch_lo;
+    dst_idx = 0;
+    retained = 0;
+
+    for (y=0; y<mb-1; y++) {
+      /* Fetch the next block of input data.
+       */
+      next_tag = tag ^ 1;
+
+      buf = &bufA[next_tag][0];
+      list = (volatile vec_uint4 *)&bufA[next_tag][((M_SUB-1)*M_SUB)/2];
+      fill_dma_list(list, element0, element1, element2, element3, element_next);
+#ifdef MATRIX_4GB_CROSSING
+      carry = spu_genc(spu_rlqwbyte(element0, 4), spu_promote(M_SUB*lda, 0));
+      if (spu_extract(carry, 0)) {
+	/* Possible crossing occurred, determine where the crossing occurred */
+	int count;
+	unsigned int list_lo;
+	vec_uint4 addrs, start, cmp;
+	
+	count = -1;
+	start = spu_shuffle(element0, element0, ((vec_uchar16){4,5,6,7, 4,5,6,7, 4,5,6,7, 4,5,6,7}));
+	for (i=0; i<32; i+=2) {
+	  addrs = spu_shuffle(list[i], list[i+1], ((vec_uchar16){4,5,6,7, 12,13,14,15, 20,21,22,23, 28,29,30,31}));
+	  cmp = spu_gather(spu_cmpgt(start, addrs));
+	  count += (int)spu_extract(spu_cntlz(cmp), 0) - 28;
+	  if (spu_extract(cmp, 0)) break;
+	}
+	/* Issue the list up to the crossing. 
+	 */
+	spu_mfcdma64(buf, in_hi, (unsigned int)list, 8*count, next_tag, MFC_GETLB_CMD);
+
+	/* Issue the possible crossing list element.
+	 */
+	list_lo = *((unsigned int *)list + 2*count + 1);
+	spu_mfcdma64((void *)buf+(count*(M_SUB*sizeof(double))), in_hi, list_lo, M_SUB*sizeof(double), next_tag, MFC_GETB_CMD);
+
+	/* Issue the remaining lists following the crossing.
+	 */
+	count++;
+	if (count < 64) {
+	  spu_mfcdma64((void *)buf+(count*(M_SUB*sizeof(double))), in_hi+1, (unsigned int)list+8*count, 8*(64-count), next_tag, MFC_GETLB_CMD);
+	}
+      } else {
+	/* No crossing occurred in this list, issue the whole list */
+	spu_mfcdma64(buf, in_hi, (unsigned int)list, 8*M_SUB, next_tag, MFC_GETL_CMD);
+      }
+      in_hi += spu_extract(spu_genc(element0, next_col_blk), 1);
+#else
+      spu_mfcdma64(buf, a_hi, (unsigned int)list, 8*M_SUB, next_tag, MFC_GETL_CMD);
+#endif
+      element0  = spu_add(element0, next_col_blk);
+      element1  = spu_add(element1, next_col_blk);
+      element2  = spu_add(element2, next_col_blk);
+      element3  = spu_add(element3, next_col_blk);
+
+      /* Wait for the previous block.
+       */
+      DMA_WAIT(1<<tag);
+      
+      /* Place the reformatted block into the 4 buffers as: 0, 1, 2, 3, 2, 3, ...
+       */
+      srcTop = &bufA[tag][16*M_SUB/2];
+      srcBot = &bufA[tag][48*M_SUB/2];
+      dst    = &bufB[dst_idx][0];
+
+      /* Reformat the block */
+      for (i=0; i<64; i+=2) {
+	REFORM_8(dst, srcTop, -16,  0, pat_even, pat_odd);
+	REFORM_8(dst, srcTop,  -8,  4, pat_even, pat_odd);
+	REFORM_8(dst, srcTop,   0,  8, pat_even, pat_odd);
+	REFORM_8(dst, srcTop,   8, 12, pat_even, pat_odd);
+      
+	REFORM_8(dst, srcBot, -16, 16, pat_even, pat_odd);
+	REFORM_8(dst, srcBot,  -8, 20, pat_even, pat_odd);
+	REFORM_8(dst, srcBot,   0, 24, pat_even, pat_odd);
+	REFORM_8(dst, srcBot,   8, 28, pat_even, pat_odd);
+
+	srcTop += 1;
+	srcBot += 1;
+	dst  += 2*M_SUB/2; 
+      }
+
+      /* Store the resulting block matrix into the scratch buffer. The first two blocks are left in
+       * the local store.
+       */
+      if (dst_idx > 1) {      
+	if (y != mb-2) {
+	  /* If this is the next-to-last block of the column, then do not put it
+	   * into the scratch buffer.
+	   */
+	  dst = &bufB[dst_idx][0];
+	  spu_mfcdma64(dst,        scratch_hi, lo,       16384, tag, MFC_PUT_CMD);
+	  spu_mfcdma64(dst + 1024, scratch_hi, lo+16384, 16384, tag, MFC_PUT_CMD);
+	  lo += M_SUB*M_SUB*sizeof(double);
+	}
+	dst_idx ^= 1;
+      } else {
+	dst_idx++;
+	retained++;
+      }
+
+      tag = next_tag;
+    }
+
+    /* Wait for the final block GET to complete before putting reformatted
+     * blocks back into the matrix.
+     */
+    DMA_WAIT(1<<tag);
+
+    /* Store all the reformatted blocks that have not been pushed out to memory
+     * back into the original matrix.
+     */
+    out_hi = a_hi;
+    out_lo = a_lo;
+    for (i=0; i<(int)retained; i++) {
+      srcTop    = &bufB[i][0];
+      spu_mfcdma64(srcTop,        out_hi, out_lo,       16384, tag^1, MFC_PUT_CMD);
+      spu_mfcdma64(srcTop + 1024, out_hi, out_lo+16384, 16384, tag^1, MFC_PUT_CMD);
+      MATRIX_EA_UADD32(out_hi, out_lo, 32768);
+    }
+
+    /* Move all the buffers that were saved in the scratch buffer back into the matrix.
+     */
+    lo = scratch_lo;
+    buf = &bufB[tag^1][0];
+    for (; i<(int)mb-2; i++) {
+      spu_mfcdma64(buf,        scratch_hi, lo,           16384, tag^1, MFC_GETB_CMD);
+      spu_mfcdma64(buf + 1024, scratch_hi, lo+16384,     16384, tag^1, MFC_GET_CMD);
+
+      spu_mfcdma64(buf,            out_hi, out_lo,       16384, tag^1, MFC_PUTB_CMD);
+      spu_mfcdma64(buf + 1024,     out_hi, out_lo+16384, 16384, tag^1, MFC_PUT_CMD);
+
+      MATRIX_EA_UADD32(out_hi, out_lo, 32768);
+      lo  += 32768;
+    }
+
+    /* Move the next-to-last buffer back into the matrix.
+     */
+    if (mb > 3) {
+      srcTop    = &bufB[dst_idx^1][0];
+      spu_mfcdma64(srcTop,        out_hi, out_lo,       16384, tag^1, MFC_PUT_CMD);
+      spu_mfcdma64(srcTop + 1024, out_hi, out_lo+16384, 16384, tag^1, MFC_PUT_CMD);
+      MATRIX_EA_UADD32(out_hi, out_lo, 32768);
+    }
+
+
+    /* Finish reformatting the last block. The last block contains special
+     * handling code to zero out the pad rows.
+     */
+    srcTop = &bufA[tag][16*M_SUB/2];
+    srcBot = &bufA[tag][48*M_SUB/2]; 
+    dst    = &bufB[dst_idx][0];
+
+    for (i=0; i<64; i+=2) {
+      vec_uchar16 pat_e, pat_o;
+
+      pat_e = spu_sel(pat_even, pat_zzzz, 
+		      spu_maskb(spu_extract(spu_cmpgt(spu_promote((unsigned int)i, 0), spu_promote(m_pad, 0)), 0)));
+      pat_o = spu_sel(pat_odd, pat_zzzz, 
+		      spu_maskb(spu_extract(spu_cmpgt(spu_promote((unsigned int)(i+1), 0), spu_promote(m_pad, 0)), 0)));
+      REFORM_8(dst, srcTop, -16,  0, pat_e, pat_o);
+      REFORM_8(dst, srcTop,  -8,  4, pat_e, pat_o);
+      REFORM_8(dst, srcTop,   0,  8, pat_e, pat_o);
+      REFORM_8(dst, srcTop,   8, 12, pat_e, pat_o);
+      
+      REFORM_8(dst, srcBot, -16, 16, pat_e, pat_o);
+      REFORM_8(dst, srcBot,  -8, 20, pat_e, pat_o);
+      REFORM_8(dst, srcBot,   0, 24, pat_e, pat_o);
+      REFORM_8(dst, srcBot,   8, 28, pat_e, pat_o);
+
+      srcTop += 1;
+      srcBot += 1;
+      dst  += 2*M_SUB/2; 
+    }
+
+    /* Store the final block back into the matrix.
+     */
+    spu_mfcdma64(&bufB[dst_idx][0],    out_hi, out_lo,       16384, tag, MFC_PUT_CMD);
+    spu_mfcdma64(&bufB[dst_idx][1024], out_hi, out_lo+16384, 16384, tag, MFC_PUT_CMD);
+
+    /* Zero out final trailing data resulting from lda striding.
+     */
+    MATRIX_EA_UADD32(out_hi, out_lo, 32768);
+
+    left = trailing;
+    while (left) {
+      unsigned int size;
+
+#ifndef MFC_SDCRZ_CMD
+#define MFC_SDCRZ_CMD      0x0089   /* SPU Only */
+#endif /* MFC_SDCRZ_CMD */
+
+      size = (left > 16384) ? 16384 : left;
+      spu_mfcdma64(0, out_hi, out_lo, size, tag, MFC_SDCRZ_CMD);
+
+      MATRIX_EA_UADD32(out_hi, out_lo, size);
+      left -= size;
+    }
+
+    /* Advance pointers to next column to be processed.
+     */
+#ifdef MATRIX_4GB_CROSSING
+    in_hi += spu_extract(spu_genc(element0, next_row_blk), 1);
+#endif
+    element0 = spu_add(element0, next_row_blk);
+    element1 = spu_add(element1, next_row_blk);
+    element2 = spu_add(element2, next_row_blk);
+    element3 = spu_add(element3, next_row_blk);
+
+    MATRIX_EA_UMADD32(a_hi, a_lo, lda, spes*M_SUB);
+
+    /* Wait for all the transfers except the final block to complete 
+     */
+    DMA_WAIT(1<<(tag^1));
+  }
+
+  /* Report completion status if requested. 
+   */
+  report_completion(id, cmd_parms->incomplete, tag);
+}
+
Index: accel/lib/spu/accel_reform_panel_B_to_CL.c
===================================================================
RCS file: accel/lib/spu/accel_reform_panel_B_to_CL.c
diff -N accel/lib/spu/accel_reform_panel_B_to_CL.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/spu/accel_reform_panel_B_to_CL.c	20 Aug 2008 03:57:53 -0000	1.3
@@ -0,0 +1,247 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <spu_mfcio.h>
+#include <spu_intrinsics.h>
+#include "hpl_accel_spu.h"
+#include "accel_buffers.h"
+#include "accel_utils.h"
+#include "accel_reform.h"
+
+
+void accel_reform_panel_B_to_CL(hpl_accel_init_parms_t *parms, 
+				volatile hpl_accel_reform_panel_parms_t *cmd_parms)
+{
+  int i;
+  unsigned int id;
+  int x, y, columns, rows;
+  int dma_size1, dma_size2;
+  unsigned long long a, panel;
+  unsigned int a_hi, a_lo, hi, lo;
+  unsigned int panel_hi, panel_lo;
+  unsigned int lda, ldp;
+  unsigned int n, m, mb;
+  unsigned int tag, next_tag;
+  unsigned int addend;
+#ifdef ACCEL_LITTLE_ENDIAN
+  vec_uchar16 pat_even = (vec_uchar16){7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
+#else
+  vec_uchar16 pat_even = (vec_uchar16){0,1,2,3,4,5,6,7, 16,17,18,19,20,21,22,23};
+#endif
+  vec_uchar16 pat_odd = spu_or(pat_even, 8);
+  vec_uchar16 pat_z0z0 = (vec_uchar16){128,128,128,128, 0,1,2,3, 128,128,128,128, 0,1,2,3};
+  vec_uint4 e0, e1, e2, e3, esize;
+  vec_uint4 element0, element1, element2, element3, element_next;
+  vec_uint4 next_col_blk, next_row_blk;
+  vec_uint4 mask_0101 = (vec_uint4){0,-1,0,-1};
+  vec_double2 a0, a1, a2, a3, a4, a5, a6, a7;
+  vec_double2 *srcTop, *srcBot, *dst;
+  volatile vec_uint4 *list;
+  
+  id = parms->id;
+
+  /* Wait for the transfer of the parameters to complete
+   */
+  DMA_WAIT_RECEIVE();
+  DMA_WAIT_REQUEST(-1);
+
+  /* Fetch the parameters 
+   */
+  a = cmd_parms->a;
+  panel = cmd_parms->panel;
+  lda = cmd_parms->lda;
+  ldp = cmd_parms->ldp;
+  m = cmd_parms->m;
+  n = cmd_parms->n;
+
+  mb = (m + (M_SUB-1)) / M_SUB;
+
+  a_hi = mfc_ea2h(a);
+  a_lo = mfc_ea2l(a);
+
+  addend = id * (M_SUB * M_SUB * sizeof(double));
+  MATRIX_EA_UADD32(a_hi, a_lo, addend);
+
+  panel_hi = mfc_ea2h(panel);
+  panel_lo = mfc_ea2l(panel);
+
+  panel_lo += id * M_SUB * sizeof(double);
+
+  /* Compute all the working variables needed to generate the DMA lists.
+   *
+   * element0     = {M_SUB*sizeof(double), panel_lo + 0*ldp, M_SUB*sizeof(double), panel_lo + 1*ldp}
+   * element1     = {M_SUB*sizeof(double), panel_lo + 2*ldp, M_SUB*sizeof(double), panel_lo + 3*ldp}
+   * element2     = {M_SUB*sizeof(double), panel_lo + 4*ldp, M_SUB*sizeof(double), panel_lo + 5*ldp}
+   * element3     = {M_SUB*sizeof(double), panel_lo + 6*ldp, M_SUB*sizeof(double), panel_lo + 7*ldp}
+   * element_next = {                   0,            8*ldp,                    0,            8*ldp}
+   */
+  next_col_blk = spu_and(spu_splats(ldp*M_SUB), mask_0101);
+  next_row_blk = spu_and(spu_splats(HPL_ACCEL_REFORM_SPES*M_SUB*sizeof(double)), mask_0101);
+
+  element_next = spu_promote(8*ldp, 0);
+  element_next = spu_shuffle(element_next, element_next, pat_z0z0);
+
+  element0 = spu_add(spu_rlmaskqwbyte(spu_rlmask(element_next, -3), -8),
+		     spu_sel(spu_splats((unsigned int)(M_SUB*sizeof(double))), spu_splats(panel_lo), mask_0101));
+
+  element1  = spu_rlmask(element_next, -2);
+  element2  = spu_rlmask(element_next, -1);
+  element3  = spu_add(spu_add(element1, element2), element0);
+  element1  = spu_add(element1, element0);
+  element2  = spu_add(element2, element0);
+
+  /* Before starting, make sure all previous DMA transfers are completed so
+   * that all the LS buffers are known to be available.
+   */
+  DMA_WAIT_RECEIVE();
+
+  /* Reformat the blocks
+   */
+  tag = 0;
+
+  /* Fetch the first block 
+   */
+  if (id < mb) {
+    dma_size1 = (int)m - id*M_SUB;
+    dma_size2 = dma_size1-32;
+    dma_size1 = clamp_0_32(dma_size1);
+    dma_size2 = clamp_0_32(dma_size2);
+    rows = (dma_size1 + dma_size2) / 2;
+    esize = spu_promote(rows * sizeof(vec_double2), 0);
+
+    spu_mfcdma64(&bufA[0][0],    a_hi, a_lo,       (unsigned int)dma_size1*M_SUB*sizeof(double), 0, MFC_GET_CMD);
+    spu_mfcdma64(&bufA[0][1024], a_hi, a_lo+16384, (unsigned int)dma_size2*M_SUB*sizeof(double), 0, MFC_GET_CMD);
+  }
+
+  /* For each row of blocks.
+   */
+  for (y=id; y<(int)mb; ) {
+    hi = a_hi;
+    lo = a_lo;
+    MATRIX_EA_UADD32(hi, lo, lda);
+
+    e0 = element0;
+    e1 = element1;
+    e2 = element2;
+    e3 = element3;
+
+    for (x=0; x<(int)n-M_SUB; x+=M_SUB) {
+      next_tag = tag ^ 1;
+
+      /* Fetch the next block.
+       */
+      spu_mfcdma64(&bufA[next_tag][0],    hi, lo,       16384, next_tag, MFC_GET_CMD);
+      spu_mfcdma64(&bufA[next_tag][1024], hi, lo+16384, 16384, next_tag, MFC_GET_CMD);
+      MATRIX_EA_UADD32(hi, lo, lda);
+
+      DMA_WAIT(1<<tag);
+
+      /* Reformat the block from row-ordered, big endian, to column-ordered, little endian.
+       */
+      srcTop = &bufA[tag][16*M_SUB/2];
+      srcBot = &bufA[tag][48*M_SUB/2];
+      dst = &bufB[tag][0];
+
+      for (i=0; i<64; i+=2) {
+	REFORM_8(dst, srcTop, -16,  0, pat_even, pat_odd);
+	REFORM_8(dst, srcTop,  -8,  4, pat_even, pat_odd);
+	REFORM_8(dst, srcTop,   0,  8, pat_even, pat_odd);
+	REFORM_8(dst, srcTop,   8, 12, pat_even, pat_odd);
+      
+	REFORM_8(dst, srcBot, -16, 16, pat_even, pat_odd);
+	REFORM_8(dst, srcBot,  -8, 20, pat_even, pat_odd);
+	REFORM_8(dst, srcBot,   0, 24, pat_even, pat_odd);
+	REFORM_8(dst, srcBot,   8, 28, pat_even, pat_odd);
+	
+	srcTop += 1;
+	srcBot += 1;
+	dst += 2*M_SUB/2;
+      }
+
+      /* Put the reformatted block into the panel buffer
+       */
+      list = (volatile vec_uint4 *)&bufC[tag][0];
+      fill_dma_list(list, e0, e1, e2, e3, element_next);
+      e0  = spu_add(e0, next_col_blk);
+      e1  = spu_add(e1, next_col_blk);
+      e2  = spu_add(e2, next_col_blk);
+      e3  = spu_add(e3, next_col_blk);
+
+      spu_mfcdma64(&bufB[tag][0], panel_hi, (unsigned int)list, 8*M_SUB, tag, MFC_PUTL_CMD);
+
+      tag = next_tag;	
+    }
+    
+    /* Advance the block pointer to the next row of blocks to be processed by this SPE. 
+     * Kick off a DMA for the next row, if there is a next row.
+     */
+    addend = HPL_ACCEL_REFORM_SPES*M_SUB*M_SUB*sizeof(double);
+    MATRIX_EA_UADD32(a_hi, a_lo, addend);
+    y += HPL_ACCEL_REFORM_SPES;
+    next_tag = tag ^ 1;
+
+    if (y < (int)mb) {
+      dma_size1 = (int)m - y*M_SUB;
+      dma_size2 = dma_size1-32;
+      dma_size1 = clamp_0_32(dma_size1);
+      dma_size2 = clamp_0_32(dma_size2);
+      rows = (dma_size1 + dma_size2) / 2;
+      esize = spu_promote(rows * sizeof(vec_double2), 0);
+
+      spu_mfcdma64(&bufA[next_tag][0],    a_hi, a_lo,       (unsigned int)dma_size1*M_SUB*sizeof(double), next_tag, MFC_GET_CMD);
+      spu_mfcdma64(&bufA[next_tag][1024], a_hi, a_lo+16384, (unsigned int)dma_size2*M_SUB*sizeof(double), next_tag, MFC_GET_CMD);
+    }
+
+    /* Wait for the final block in the row to arrive.
+     */
+    DMA_WAIT(1<<tag);
+
+    /* Reformat the block from row-ordered, big endian, to column-ordered, little endian.
+     */
+    srcTop = &bufA[tag][16*M_SUB/2];
+    srcBot = &bufA[tag][48*M_SUB/2];
+    dst = &bufB[tag][0];
+
+    columns = n-x;
+
+    for (i=0; i<(int)columns; i+=2) {
+      REFORM_8(dst, srcTop, -16,  0, pat_even, pat_odd);
+      REFORM_8(dst, srcTop,  -8,  4, pat_even, pat_odd);
+      REFORM_8(dst, srcTop,   0,  8, pat_even, pat_odd);
+      REFORM_8(dst, srcTop,   8, 12, pat_even, pat_odd);
+      
+      REFORM_8(dst, srcBot, -16, 16, pat_even, pat_odd);
+      REFORM_8(dst, srcBot,  -8, 20, pat_even, pat_odd);
+      REFORM_8(dst, srcBot,   0, 24, pat_even, pat_odd);
+      REFORM_8(dst, srcBot,   8, 28, pat_even, pat_odd);
+
+      srcTop += 1;
+      srcBot += 1;
+      dst += 2*M_SUB/2;
+    }
+    
+    /* Put the reformatted block into the panel buffer
+     */
+    list = (volatile vec_uint4 *)&bufC[tag][0];
+    fill_dma_list(list, e0, e1, e2, e3, element_next);
+    spu_mfcdma64(&bufB[tag][0], panel_hi, (unsigned int)list, 8*columns, tag, MFC_PUTL_CMD);
+    element0  = spu_add(element0, next_row_blk);
+    element1  = spu_add(element1, next_row_blk);
+    element2  = spu_add(element2, next_row_blk);
+    element3  = spu_add(element3, next_row_blk);
+
+    tag = next_tag;
+  }
+
+  /* Wait for the next-to-last block to complete before notifying completion.
+   */
+  DMA_WAIT(1<<tag);
+
+  /* Report completion status if requested. 
+   */
+  report_completion(id, cmd_parms->incomplete, tag^1);
+}
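fill_dma_list() packs the element0..element3 vectors into the list consumed by MFC_PUTL_CMD. Each MFC list element is an 8-byte {size, EA-low} pair, and every element in one list shares the single EA-high word passed to spu_mfcdma64(); that is why the 4GB-crossing paths must break lists at carry points. A scalar sketch of the layout being built here, assuming the stall-and-notify bit stays clear:

#include <stdint.h>

/* Bit-compatible with mfc_list_element_t from spu_mfcio.h for transfer
 * sizes up to 16KB (the size occupies the low bits of the first word;
 * the stall-and-notify bit is the top bit and remains zero here).
 */
typedef struct {
  uint32_t size;     /* transfer size in bytes                   */
  uint32_t ea_low;   /* low 32 bits of the 64-bit effective addr */
} list_elem_t;

static void build_column_list(list_elem_t *list, uint32_t ea_low,
                              uint32_t ldp, uint32_t columns,
                              uint32_t col_bytes)
{
  uint32_t i;

  for (i = 0; i < columns; i++) {
    list[i].size   = col_bytes;   /* one M_SUB-wide column strip  */
    list[i].ea_low = ea_low;
    ea_low += ldp;                /* columns sit ldp bytes apart  */
  }
}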
Index: accel/lib/spu/accel_reform_panel_R_to_B.c
===================================================================
RCS file: accel/lib/spu/accel_reform_panel_R_to_B.c
diff -N accel/lib/spu/accel_reform_panel_R_to_B.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/spu/accel_reform_panel_R_to_B.c	20 Aug 2008 03:57:53 -0000	1.3
@@ -0,0 +1,153 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <spu_mfcio.h>
+#include <spu_intrinsics.h>
+#include "hpl_accel_spu.h"
+#include "accel_buffers.h"
+#include "accel_utils.h"
+#include "accel_reform.h"
+
+
+void accel_reform_panel_R_to_B(hpl_accel_init_parms_t *parms, 
+			       volatile hpl_accel_reform_panel_parms_t *cmd_parms)
+{
+  int i, x, y;
+  unsigned int id;
+  unsigned int idx;
+  unsigned int a_hi, a_lo, hi, lo;
+  unsigned long long a, panel;
+  unsigned int panel_hi, panel_lo, p_lo;
+  unsigned int tag;
+  unsigned int lda, ldp;
+  unsigned int n, m, row_len, size, left, esize, extra;
+  unsigned int *list, list_offset;
+  unsigned int addend;
+  vec_double2 *buf;
+#ifdef MATRIX_4GB_CROSSING
+  unsigned int carry;
+#endif
+
+  id = parms->id;
+
+  /* Wait for the transfer of the parameters to complete
+   */
+  DMA_WAIT_RECEIVE();
+
+  /* Fetch the parameters 
+   */
+  a = cmd_parms->a;
+  panel = cmd_parms->panel;
+  lda = cmd_parms->lda;
+  ldp = cmd_parms->ldp;
+  m = cmd_parms->m;
+  n = cmd_parms->n;
+  
+  a_hi = mfc_ea2h(a);
+  a_lo = mfc_ea2l(a);
+
+  panel_hi = mfc_ea2h(panel);
+  panel_lo = mfc_ea2l(panel);
+
+  addend = id * (M_SUB * sizeof(double));
+
+  MATRIX_EA_UADD32(a_hi, a_lo, addend);
+  panel_lo += id * ldp;
+
+  row_len = (n&~1)*sizeof(double);
+  extra = (n&1)*sizeof(double);
+
+  tag = 0;
+  list_offset = 0;
+  size = 0;
+
+  /* Before starting, make sure all previous DMA transfers are completed so
+   * that all the LS buffers are known to be available.
+   */
+  DMA_WAIT(-1);
+
+  /* For each row
+   */
+  for (y=(int)id; y<(int)m; y+=HPL_ACCEL_SPES) {
+    /* For each portion of the row in 16K chunks
+     */
+    hi = a_hi;
+    lo = a_lo;
+    esize = lda;
+
+    p_lo = panel_lo;
+
+    for (x=0; x<(int)row_len; x+=(int)size) {
+
+      left = row_len - (unsigned int)x;
+      size = (left < 16384) ? left : 16384;
+
+      buf = &bufA[tag][0];
+      spu_mfcdma64(buf, panel_hi, p_lo, size, tag, MFC_GET_CMD);
+
+      p_lo += size;
+      
+      /* Construct a list for the placement into blocked format.
+       */
+      list = (unsigned int *)(&bufB[0][0] + list_offset);
+      for (i=0, idx=0; i<(int)size; i+=(int)M_SUB*sizeof(double)) {
+	esize = size - i;
+	if (esize > M_SUB*sizeof(double)) esize = M_SUB*sizeof(double);
+	list[idx+0] = esize;
+	list[idx+1] = lo;
+	idx += 2;
+#ifdef MATRIX_4GB_CROSSING	
+	carry = spu_extract(spu_genc(spu_promote(lo, 0), spu_promote(lda, 0)), 0);
+	/* If we cross a 4GB boundary, flush the list and start a new one.
+	 */
+	if (carry) {
+	  spu_mfcdma64(buf, hi, (unsigned int)list, 4*idx, tag, MFC_PUTLB_CMD);
+	  buf += (M_SUB/4)*idx;
+	  list += idx;
+	  idx = 0;
+	  hi += carry;
+	}
+#endif
+	lo += lda;
+      }
+      spu_mfcdma64(buf, hi, (unsigned int)list, 4*idx, tag, MFC_PUTLB_CMD);
+      spu_mfcdma32(0, 0, 0, tag, MFC_BARRIER_CMD);
+
+      /* Advance pointers to next row or buffer
+       */
+      list_offset = (list_offset + 16) % (128*16);	/* accommodate up to 128 enqueued DMAs */
+      tag ^= 1;
+    }
+
+    /* Handle the final odd column values 
+     */
+    if (extra) {
+      buf = &bufA[tag][0];
+      if (size & (M_SUB*sizeof(double)-1)) {
+	addend = esize - lda;
+	MATRIX_EA_ADD32(hi, lo, addend);
+      }
+      spu_mfcdma64(buf, panel_hi, p_lo, extra, tag, MFC_GET_CMD);
+      spu_mfcdma64(buf, hi, lo, extra, tag, MFC_PUTB_CMD);
+      tag ^= 1;
+    }
+
+    /* Advance pointers to the next row */
+    addend = M_SUB*sizeof(double)*HPL_ACCEL_SPES;
+    MATRIX_EA_UADD32(a_hi, a_lo, addend);
+    panel_lo += ldp * HPL_ACCEL_SPES;
+
+  }
+  /* Wait for the next-to-last DMA to complete before posting completion.
+   */
+  DMA_WAIT(1<<tag);
+
+  /* Report completion status if requested. 
+   */
+  report_completion(id, cmd_parms->incomplete, tag^1);
+}
+
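The MATRIX_4GB_CROSSING path above flushes the list being built whenever lo + lda carries out of 32 bits, since a single DMA list cannot span the 4GB boundary implied by its shared EA-high word. The spu_genc() call is just a carry detector; a portable equivalent:

#include <stdint.h>

/* Returns 1 when lo + addend wraps past a 4GB boundary, i.e. when the
 * EA-high word must be bumped and a fresh DMA list started.
 */
static inline uint32_t crosses_4gb(uint32_t lo, uint32_t addend)
{
  return (uint32_t)(lo + addend) < lo;
}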
Index: accel/lib/spu/accel_reform_rows_B_to_R.c
===================================================================
RCS file: accel/lib/spu/accel_reform_rows_B_to_R.c
diff -N accel/lib/spu/accel_reform_rows_B_to_R.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/spu/accel_reform_rows_B_to_R.c	22 Oct 2008 03:28:08 -0000	1.3
@@ -0,0 +1,166 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <spu_mfcio.h>
+#include <spu_intrinsics.h>
+#include "hpl_accel_spu.h"
+#include "accel_buffers.h"
+#include "accel_utils.h"
+#include "accel_reform.h"
+
+
+static inline void row_B_to_R(unsigned int src_hi, unsigned int src_lo, int ld_src,
+			      unsigned int dst_hi, unsigned int dst_lo,
+			      unsigned int skip,   unsigned int left, void *buf)
+{
+  unsigned int src_size, dst_size, size;
+  void *ptr;
+
+  dst_size = 16*1024 - skip;
+  src_size = (M_SUB*sizeof(double)) - skip;
+  if (dst_size > left) dst_size = left;
+  if (src_size > left) src_size = left;
+
+  while (left) {
+    /* Fetch (up to) a 16KB buffer of M_SUB-element spans */
+    spu_mfcdma64(buf, src_hi, src_lo+skip, src_size, 0, MFC_GETB_CMD);
+    ptr = buf;
+
+    skip = 0;
+    size = dst_size;
+    left -= dst_size;
+
+    while ((size -= src_size)) {
+      ptr += src_size; 
+      MATRIX_EA_UADD32(src_hi, src_lo, ld_src);
+      src_size = (size > (M_SUB*sizeof(double))) ? M_SUB*sizeof(double) : size;
+
+      spu_mfcdma64(ptr, src_hi, src_lo, src_size, 0, MFC_GET_CMD);
+    }
+
+    /* Store the 16KB span into the row buffer */
+    spu_mfcdma64(buf, dst_hi, dst_lo, dst_size, 0, MFC_PUTB_CMD);
+
+    MATRIX_EA_UADD32(src_hi, src_lo, ld_src);
+    dst_lo += dst_size;
+    dst_size = (left > 16*1024) ? 16*1024 : left;
+    src_size = (dst_size > (M_SUB*sizeof(double))) ? M_SUB*sizeof(double) : dst_size;
+  }
+}
+
+
+
+void accel_reform_rows_B_to_R(hpl_accel_init_parms_t *parms, 
+			      volatile hpl_accel_reform_rows_parms_t *cmd_parms)
+{
+  int i;
+  int m, n, ldr, lda;
+  int row;
+  unsigned int id;
+  unsigned int a_hi, a_lo, r_hi, r_lo;
+  unsigned int blk_col, skip, mask;
+  unsigned int spans, spans_per_spe, extra_spans, start_span, end_span;
+  unsigned int start_col, end_col, max_end_col;
+  unsigned int row_size;
+  vector signed int m_n_ldr_lda;
+  vector unsigned long long rows_a, incomplete_blk_col;
+  void *buf;
+  
+  id = parms->id;
+
+  /* Wait for the transfer of the parameters to complete
+   */
+  DMA_WAIT_RECEIVE();
+  DMA_WAIT_REQUEST(-1);
+
+  /* Fetch the parameters 
+   */
+  m_n_ldr_lda = cmd_parms->m_n_ldr_lda;
+  rows_a = cmd_parms->rows_a;
+  incomplete_blk_col = cmd_parms->incomplete_blk_col;
+
+  m = spu_extract(m_n_ldr_lda, 0);
+  n = spu_extract(m_n_ldr_lda, 1);
+  ldr = spu_extract(m_n_ldr_lda, 2);
+  lda = spu_extract(m_n_ldr_lda, 3);
+
+  blk_col = spu_extract((vector unsigned int)incomplete_blk_col, 2);
+
+  r_hi = spu_extract((vector unsigned int)rows_a, 0);
+  r_lo = spu_extract((vector unsigned int)rows_a, 1);
+
+  a_hi = spu_extract((vector unsigned int)rows_a, 2);
+  a_lo = spu_extract((vector unsigned int)rows_a, 3);
+
+  buf = bufA;
+
+  skip = (blk_col % M_SUB) * sizeof(double);
+
+  blk_col /= M_SUB;
+  MATRIX_EA_UMADD32(a_hi, a_lo, blk_col, lda);
+
+  /* Equally assign complete rows to each of the SPEs.
+   */
+  row_size = n*sizeof(double);
+
+  /* Process remaining rows by assigning each row to groups of HPL_ACCEL_SPES SPEs.
+   * Compute the spanning parameters assigned to this SPE.
+   */
+  spans = (row_size + skip + (M_SUB-1)*sizeof(double)) / (M_SUB * sizeof(double));
+  spans_per_spe = spans / HPL_ACCEL_SPES;
+  extra_spans = spans % HPL_ACCEL_SPES;
+	   
+  start_span = id * spans_per_spe + ((id > extra_spans) ? extra_spans : id);
+  end_span = start_span + spans_per_spe - spu_extract(spu_cmpgt(spu_promote(extra_spans, 0), spu_promote(id, 0)), 0); 
+
+  if (end_span > start_span) {
+    start_col = start_span * (M_SUB * sizeof(double));
+    end_col = end_span * (M_SUB * sizeof(double));    
+
+    max_end_col = skip + row_size;
+
+    mask = spu_extract(spu_cmpeq(spu_promote(id, 0), 0), 0);
+    r_lo += start_col - (skip & ~mask);
+    MATRIX_EA_UMADD32(a_hi, a_lo, start_span, lda);
+
+    skip &= mask;
+
+    start_col += skip;
+    end_col = (end_col > max_end_col) ? max_end_col : end_col;
+
+    row_size = end_col - start_col;
+
+    /* Before starting, make sure all previous DMA transfers are completed so
+     * that all the LS buffers are known to be available.
+     */
+    DMA_WAIT_RECEIVE();
+    for (i=0; i<m; i++) {
+#ifdef MATRIX_4GB_CROSSING
+      unsigned int hi, lo;
+
+      row = cmd_parms->blk_rows[i];
+
+      hi = a_hi;
+      lo = a_lo;
+      EA_UADD64(hi, lo, (unsigned int)row >> (32-9), (unsigned int)row << 9);
+      row_B_to_R(hi, lo, lda, r_hi, r_lo + (i*ldr), skip, row_size, buf);
+#else
+      row = cmd_parms->blk_rows[i];
+      row_B_to_R(a_hi, a_lo + (row * (M_SUB * sizeof(double))), lda, r_hi, r_lo + (i*ldr), skip, row_size, buf);
+#endif
+    }
+  } else {
+    DMA_WAIT_RECEIVE();
+  }
+
+  /* Report completion status if requested. 
+   */
+  report_completion(id, spu_extract(incomplete_blk_col, 0),  0);
+}
+
+
+
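The start_span/end_span computation above deals the spans of a row out as evenly as possible: each SPE gets spans/HPL_ACCEL_SPES of them, and the first spans%HPL_ACCEL_SPES SPEs take one extra (spu_cmpgt() yields 0 or -1, which is why its result is subtracted). The same computation in scalar form:

/* Scalar form of the span partitioning used by the row routines. */
static void spe_span_range(unsigned int id, unsigned int nspes,
                           unsigned int spans,
                           unsigned int *start, unsigned int *end)
{
  unsigned int per   = spans / nspes;
  unsigned int extra = spans % nspes;

  *start = id * per + ((id > extra) ? extra : id);
  *end   = *start + per + ((id < extra) ? 1 : 0);
}

For example, with spans=10 and nspes=8, SPEs 0 and 1 each cover two spans and the remaining six SPEs cover one apiece.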
Index: accel/lib/spu/accel_reform_rows_R_to_B.c
===================================================================
RCS file: accel/lib/spu/accel_reform_rows_R_to_B.c
diff -N accel/lib/spu/accel_reform_rows_R_to_B.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/spu/accel_reform_rows_R_to_B.c	20 Aug 2008 03:57:53 -0000	1.3
@@ -0,0 +1,164 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <spu_mfcio.h>
+#include <spu_intrinsics.h>
+#include "hpl_accel_spu.h"
+#include "accel_buffers.h"
+#include "accel_utils.h"
+#include "accel_reform.h"
+
+
+static inline void row_R_to_B(unsigned int src_hi, unsigned int src_lo,
+			      unsigned int dst_hi, unsigned int dst_lo, int ld_dst,
+			      unsigned int skip,   unsigned int left, void *buf)
+{
+  unsigned int src_size, dst_size;
+  void *ptr;
+
+
+  src_size = 16*1024 - skip;
+  dst_size = (M_SUB*sizeof(double)) - skip;
+  if (src_size > left) src_size = left;
+  if (dst_size > left) dst_size = left;
+
+  while (left) {
+    /* Fetch a big (16KB) span from the row buffer */
+
+    spu_mfcdma64(buf, src_hi, src_lo, src_size, 0, MFC_GETB_CMD);
+
+    left -= src_size;
+    src_lo += src_size;
+
+    /* Store the big span into the matrix in M_SUB-element spans */
+    spu_mfcdma64(buf, dst_hi, dst_lo+skip, dst_size, 0, MFC_PUTB_CMD);
+    ptr = buf;
+    skip = 0;
+
+    while ((src_size -= dst_size)) {
+      ptr += dst_size; 
+      MATRIX_EA_UADD32(dst_hi, dst_lo, ld_dst);
+      dst_size = (src_size > (M_SUB*sizeof(double))) ? M_SUB*sizeof(double) : src_size;
+
+      spu_mfcdma64(ptr, dst_hi, dst_lo, dst_size, 0, MFC_PUT_CMD);
+    }
+    MATRIX_EA_UADD32(dst_hi, dst_lo, ld_dst);
+    src_size = (left > 16*1024) ? 16*1024 : left;
+    dst_size = (src_size > (M_SUB*sizeof(double))) ? M_SUB*sizeof(double) : src_size;
+  }
+}
+
+
+
+void accel_reform_rows_R_to_B(hpl_accel_init_parms_t *parms, 
+			      volatile hpl_accel_reform_rows_parms_t *cmd_parms)
+{
+  int i;
+  int m, n, ldr, lda;
+  int row;
+  unsigned int id;
+  unsigned int a_hi, a_lo, r_hi, r_lo;
+  unsigned int blk_col, skip, mask;
+  unsigned int spans, spans_per_spe, extra_spans, start_span, end_span;
+  unsigned int start_col, end_col, max_end_col;
+  unsigned int row_size;
+  vector signed int m_n_ldr_lda;
+  vector unsigned long long rows_a, incomplete_blk_col;
+  void *buf;
+  
+  id = parms->id;
+
+  /* Wait for the transfer of the parameters to complete
+   */
+  DMA_WAIT_RECEIVE();
+  DMA_WAIT_REQUEST(-1);
+
+  /* Fetch the parameters 
+   */
+  m_n_ldr_lda = cmd_parms->m_n_ldr_lda;
+  rows_a = cmd_parms->rows_a;
+  incomplete_blk_col = cmd_parms->incomplete_blk_col;
+
+  m = spu_extract(m_n_ldr_lda, 0);
+  n = spu_extract(m_n_ldr_lda, 1);
+  ldr = spu_extract(m_n_ldr_lda, 2);
+  lda = spu_extract(m_n_ldr_lda, 3);
+
+  blk_col = spu_extract((vector unsigned int)incomplete_blk_col, 2);
+
+  r_hi = spu_extract((vector unsigned int)rows_a, 0);
+  r_lo = spu_extract((vector unsigned int)rows_a, 1);
+
+  a_hi = spu_extract((vector unsigned int)rows_a, 2);
+  a_lo = spu_extract((vector unsigned int)rows_a, 3);
+
+  buf = bufA;
+
+  skip = (blk_col % M_SUB) * sizeof(double);
+
+  blk_col /= M_SUB;
+  MATRIX_EA_UMADD32(a_hi, a_lo, blk_col, lda);
+
+  /* Equally assign complete rows to each of the SPEs.
+   */
+  row_size = n*sizeof(double);
+
+  /* Process remaining rows by assigning each row to groups of 8 SPEs.
+   * Compute the spanning parameters assigned to this SPE.
+   */
+  spans = (row_size + skip + (M_SUB-1)*sizeof(double)) / (M_SUB * sizeof(double));
+  spans_per_spe = spans / 8;
+  extra_spans = spans % 8;
+	   
+  start_span = id * spans_per_spe + ((id > extra_spans) ? extra_spans : id);
+  end_span = start_span + spans_per_spe - spu_extract(spu_cmpgt(spu_promote(extra_spans, 0), spu_promote(id, 0)), 0); 
+
+  if (end_span > start_span) {
+    start_col = start_span * (M_SUB * sizeof(double));
+    end_col = end_span * (M_SUB * sizeof(double));    
+
+    max_end_col = skip + row_size;
+
+    mask = spu_extract(spu_cmpeq(spu_promote(id, 0), 0), 0);
+    r_lo += start_col - (skip & ~mask);
+    MATRIX_EA_UMADD32(a_hi, a_lo, start_span, lda);
+
+    skip &= mask;
+
+    start_col += skip;
+    end_col = (end_col > max_end_col) ? max_end_col : end_col;
+
+    row_size = end_col - start_col;
+
+    /* Before starting, make sure all previous DMA transfers are completed so
+     * that all the LS buffers are known to be available.
+     */
+    DMA_WAIT_RECEIVE();
+
+    for (i=0; i<m; i++) {
+#ifdef MATRIX_4GB_CROSSING
+      unsigned int hi, lo;
+
+      row = cmd_parms->blk_rows[i];
+
+      hi = a_hi;
+      lo = a_lo;
+      EA_UADD64(hi, lo, (unsigned int)row >> (32-9), (unsigned int)row << 9);
+      row_R_to_B(r_hi, r_lo + (i*ldr), hi, lo, lda, skip, row_size, buf);
+#else
+      row = cmd_parms->blk_rows[i];
+      row_R_to_B(r_hi, r_lo + (i*ldr), a_hi, a_lo + (row * (M_SUB * sizeof(double))), lda, skip, row_size, buf);
+#endif
+    }
+  } else {
+    DMA_WAIT_RECEIVE();
+  }
+
+  /* Report completion status if requested. 
+   */
+  report_completion(id, spu_extract(incomplete_blk_col, 0),  0);
+}
Index: accel/lib/spu/accel_spu.h
===================================================================
RCS file: accel/lib/spu/accel_spu.h
diff -N accel/lib/spu/accel_spu.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/spu/accel_spu.h	20 Aug 2008 03:57:53 -0000	1.7
@@ -0,0 +1,49 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#ifndef _ACCEL_SPU_H_
+#define _ACCEL_SPU_H_	1
+
+typedef void (*accel_specialist_t)(hpl_accel_init_parms_t *, volatile void *);
+
+/* Accelerator specialists and dispatch table
+ */
+extern void accel_dgemm(hpl_accel_init_parms_t *, volatile void *);
+extern void accel_dgemm_C_C_C(hpl_accel_init_parms_t *, volatile void *);
+extern void accel_dgemm_panel(hpl_accel_init_parms_t *, volatile void *);
+extern void accel_dtrsm(hpl_accel_init_parms_t *, volatile void *);
+extern void accel_dtrsm_panel(hpl_accel_init_parms_t *, volatile void *);
+extern void accel_reform_matrix_CL_to_B(hpl_accel_init_parms_t *, volatile void *);
+extern void accel_reform_panel_B_to_CL(hpl_accel_init_parms_t *, volatile void *);
+extern void accel_reform_panel_R_to_B(hpl_accel_init_parms_t *, volatile void *);
+extern void accel_reform_rows_R_to_B(hpl_accel_init_parms_t *, volatile void *);
+extern void accel_reform_rows_B_to_R(hpl_accel_init_parms_t *, volatile void *);
+extern void accel_fini(hpl_accel_init_parms_t *, volatile void *);
+extern void accel_dtrsm_CL_B(hpl_accel_init_parms_t *, volatile void *);
+extern void accel_swap_rows_B_to_B(hpl_accel_init_parms_t *, volatile void *);
+extern void accel_copy_rows_R_to_R(hpl_accel_init_parms_t *, volatile void *);
+
+
+accel_specialist_t dispatch[] = {
+  &accel_dgemm,
+  &accel_dtrsm,
+  &accel_reform_matrix_CL_to_B,
+  &accel_reform_panel_B_to_CL,
+  &accel_reform_panel_R_to_B,
+  &accel_dgemm_panel,
+  &accel_reform_rows_R_to_B,
+  &accel_reform_rows_B_to_R,
+  &accel_fini,
+  &accel_dtrsm_CL_B,
+  &accel_dtrsm_panel,
+  &accel_dgemm_C_C_C,
+  &accel_swap_rows_B_to_B,
+  &accel_copy_rows_R_to_R
+};
+
+#endif /* _ACCEL_SPU_H_ */
+
+
+
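Each mailbox command word doubles as a dispatch index and a parameter-block locator: the main loop in hpl_accel_spu.c (later in this patch) masks with HPL_ACCEL_CMD_MASK to select the dispatch[] entry and treats the remaining bits as the offset of the 128-byte parameter block within the command queue. HPL_ACCEL_CMD_MASK is defined in hpl_accel_spu.h, which is outside this diff, so the packing helper below is only a sketch of the implied layout:

/* Sketch only: assumes HPL_ACCEL_CMD_MASK covers the low bits of the
 * command word, as the decode in hpl_accel_spu.c implies.
 */
static inline unsigned int pack_cmd(unsigned int index,
                                    unsigned int queue_offset)
{
  return (queue_offset & ~HPL_ACCEL_CMD_MASK) | (index & HPL_ACCEL_CMD_MASK);
}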
Index: accel/lib/spu/accel_swap_rows_B_to_B.c
===================================================================
RCS file: accel/lib/spu/accel_swap_rows_B_to_B.c
diff -N accel/lib/spu/accel_swap_rows_B_to_B.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/spu/accel_swap_rows_B_to_B.c	20 Aug 2008 03:57:53 -0000	1.5
@@ -0,0 +1,186 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <spu_mfcio.h>
+#include <spu_intrinsics.h>
+#include "hpl_accel_spu.h"
+#include "accel_buffers.h"
+#include "accel_utils.h"
+#include "accel_reform.h"
+
+
+static inline void row_B_to_B(unsigned int src_hi, unsigned int src_lo,
+                              unsigned int dst_hi, unsigned int dst_lo,
+                              int ld, unsigned int skip, unsigned int left)
+{
+  unsigned int size, blk_size;
+
+  if (skip>0) {
+    size = (M_SUB*sizeof(double)) - skip;
+    if (size > left) size = left;
+
+    spu_mfcdma64(bufA+skip, src_hi, src_lo+skip, size, 0, MFC_GET_CMD);
+    spu_mfcdma64(bufB+skip, dst_hi, dst_lo+skip, size, 0, MFC_GET_CMD);
+    spu_mfcdma64(bufA+skip, dst_hi, dst_lo+skip, size, 0, MFC_PUTB_CMD);
+    spu_mfcdma64(bufB+skip, src_hi, src_lo+skip, size, 0, MFC_PUT_CMD);
+
+    MATRIX_EA_UADD32(src_hi, src_lo, ld);
+    MATRIX_EA_UADD32(dst_hi, dst_lo, ld);
+    left -= size;
+  }
+
+  while (left) {
+    void *ptrA, *ptrB;
+    unsigned int get_size, put_size;
+    unsigned int save_src_hi = src_hi, save_src_lo = src_lo;
+    unsigned int save_dst_hi = dst_hi, save_dst_lo = dst_lo;
+
+    /* size is the number of bytes swapped in this iteration of the loop */
+    size = 16*1024;
+    if (size > left) size = left;
+
+    /* Barrier to ensure all prior transfers are complete */
+    spu_mfcdma64(0, 0, 0, 0, 0, MFC_BARRIER_CMD);
+
+    /* Fetch (up to) 16KB of src and dst rows into separate buffers */
+    ptrA = bufA;
+    ptrB = bufB;
+    get_size = size;
+    while (get_size) {
+      blk_size = (M_SUB*sizeof(double));
+      if (blk_size > get_size) blk_size = get_size;
+      spu_mfcdma64(ptrA, src_hi, src_lo, blk_size, 0, MFC_GET_CMD);
+      spu_mfcdma64(ptrB, dst_hi, dst_lo, blk_size, 0, MFC_GET_CMD);
+      ptrA += blk_size;
+      ptrB += blk_size;
+      MATRIX_EA_UADD32(src_hi, src_lo, ld);
+      MATRIX_EA_UADD32(dst_hi, dst_lo, ld);
+      get_size -= blk_size;
+    }
+
+    /* Barrier to ensure all gets are complete */
+    spu_mfcdma64(0, 0, 0, 0, 0, MFC_BARRIER_CMD);
+
+    src_hi = save_src_hi; src_lo = save_src_lo;
+    dst_hi = save_dst_hi; dst_lo = save_dst_lo;
+
+    /* Store the fetched bytes back into the src and dst rows */
+    ptrA = bufA;
+    ptrB = bufB;
+    put_size = size;
+    while (put_size) {
+      blk_size = (M_SUB*sizeof(double));
+      if (blk_size > put_size) blk_size = put_size;
+      spu_mfcdma64(ptrB, src_hi, src_lo, blk_size, 0, MFC_PUT_CMD);
+      spu_mfcdma64(ptrA, dst_hi, dst_lo, blk_size, 0, MFC_PUT_CMD);
+      ptrA += blk_size;
+      ptrB += blk_size;
+      MATRIX_EA_UADD32(src_hi, src_lo, ld);
+      MATRIX_EA_UADD32(dst_hi, dst_lo, ld);
+      put_size -= blk_size;
+    }
+
+    left -= size;
+  }
+
+  DMA_WAIT(-1);
+}
+
+
+void accel_swap_rows_B_to_B(hpl_accel_init_parms_t *parms, 
+			    volatile hpl_accel_swap_rows_parms_t *cmd_parms)
+{
+  int m, n, lda;
+  int src, dst;
+  unsigned int id;
+  unsigned int a_hi, a_lo;
+  unsigned int blk_col, row_size, skip, mask;
+  unsigned int spans, spans_per_spe, extra_spans, start_span, end_span;
+  unsigned int start_col, end_col, max_end_col;
+  vector signed int m_n_lda_blk_col;
+  vector unsigned long long a_incomplete;
+
+  id = parms->id;
+
+  /* Wait for the transfer of the parameters to complete
+   */
+  DMA_WAIT_RECEIVE();
+  DMA_WAIT_REQUEST(-1);
+
+  /* Fetch the parameters 
+   */
+  m_n_lda_blk_col = cmd_parms->m_n_lda_blk_col;
+  a_incomplete = cmd_parms->a_incomplete;
+
+  m = spu_extract(m_n_lda_blk_col, 0);
+  n = spu_extract(m_n_lda_blk_col, 1);
+  lda = spu_extract(m_n_lda_blk_col, 2);
+  blk_col = spu_extract(m_n_lda_blk_col, 3);
+
+  a_hi = spu_extract((vector unsigned int)a_incomplete, 0);
+  a_lo = spu_extract((vector unsigned int)a_incomplete, 1);
+
+  skip = (blk_col % M_SUB) * sizeof(double);
+  blk_col /= M_SUB;
+
+  MATRIX_EA_UMADD32(a_hi, a_lo, blk_col, lda);
+
+  /* Process rows by assigning each row to a group of 8 SPEs.
+   * Compute the spanning parameters assigned to this SPE.
+   */
+  row_size = n*sizeof(double);
+  spans = (row_size + skip + (M_SUB-1)*sizeof(double)) / (M_SUB * sizeof(double));
+  spans_per_spe = spans / 8;
+  extra_spans = spans % 8;
+	   
+  start_span = id * spans_per_spe + ((id > extra_spans) ? extra_spans : id);
+  end_span = start_span + spans_per_spe - spu_extract(spu_cmpgt(spu_promote(extra_spans, 0), spu_promote(id, 0)), 0); 
+
+  if (end_span > start_span) {
+    start_col = start_span * (M_SUB * sizeof(double));
+    end_col = end_span * (M_SUB * sizeof(double));
+
+    max_end_col = skip + row_size;
+
+    mask = spu_extract(spu_cmpeq(spu_promote(id, 0), 0), 0);
+    MATRIX_EA_UMADD32(a_hi, a_lo, start_span, lda);
+
+    skip &= mask;
+
+    start_col += skip;
+    end_col = (end_col > max_end_col) ? max_end_col : end_col;
+
+    row_size = end_col - start_col;
+
+    /* Before starting, make sure all previous DMA transfers are completed so
+     * that all the LS buffers are known to be available.
+     */
+    DMA_WAIT_RECEIVE();
+
+    for (src=0; src<m; src++) {
+      dst = cmd_parms->blk_rows[src];
+      if ( src != dst ) {
+#ifdef MATRIX_4GB_CROSSING
+        unsigned int src_hi = a_hi, src_lo = a_lo, dst_hi = a_hi, dst_lo = a_lo;
+        EA_UADD64(src_hi, src_lo, (unsigned int)src >> (32-9), (unsigned int)src << 9);
+        EA_UADD64(dst_hi, dst_lo, (unsigned int)dst >> (32-9), (unsigned int)dst << 9);
+        row_B_to_B(src_hi, src_lo, dst_hi, dst_lo, lda, skip, row_size);
+#else
+        row_B_to_B(a_hi, a_lo + (src * (M_SUB * sizeof(double))), 
+                   a_hi, a_lo + (dst * (M_SUB * sizeof(double))), 
+                   lda, skip, row_size);
+#endif
+      }
+    }
+  } else {
+    DMA_WAIT_RECEIVE();
+  }
+
+  /* Report completion status if requested. 
+   */
+  report_completion(id, spu_extract(a_incomplete, 1),  0);
+}
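Stripped of the DMA machinery and barriers, row_B_to_B() is a strided swap: two rows stored as M_SUB-element strips, ld bytes apart, exchanged strip by strip. A plain-C reference for the intended semantics, assuming M_SUB is 64 so a strip is 512 bytes:

#include <string.h>

static void swap_strided_rows(char *src, char *dst, long ld,
                              size_t strip, size_t total)
{
  char tmp[512];                        /* assumes strip <= 512 bytes */

  while (total) {
    size_t len = (total < strip) ? total : strip;

    memcpy(tmp, src, len);
    memcpy(src, dst, len);
    memcpy(dst, tmp, len);
    src   += ld;
    dst   += ld;
    total -= len;
  }
}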
Index: accel/lib/spu/accel_utils.h
===================================================================
RCS file: accel/lib/spu/accel_utils.h
diff -N accel/lib/spu/accel_utils.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/spu/accel_utils.h	20 Aug 2008 03:57:53 -0000	1.3
@@ -0,0 +1,173 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#ifndef _ACCEL_UTILS_H_
+#define _ACCEL_UTILS_H_
+
+#include <spu_intrinsics.h>
+#include <spu_mfcio.h>
+
+#define LIKELY(COND)   __builtin_expect(COND, 1)
+#define UNLIKELY(COND) __builtin_expect(COND, 0)
+
+/* Waiting for DMA completion has been broken into two parts:
+ * 1) DMA_WAIT_REQUEST - Make a channel request for the wait by
+ *                       setting the tag mask and writing to the
+ *                       tag update channel.
+ * 2) DMA_WAIT_RECEIVE - Read the tag status.
+ *
+ * The two parts need to be separated by 36 cycles to avoid
+ * stalling even when no DMAs are still in flight.
+ */
+#define DMA_WAIT_REQUEST(_mask)	spu_writech(MFC_WrTagMask, _mask);			\
+				spu_writech(MFC_WrTagUpdate, MFC_TAG_UPDATE_ALL);
+#define DMA_WAIT_RECEIVE()	(void)spu_readch(MFC_RdTagStat);
+
+
+/* The simplified DMA_WAIT is used by the reformatting routines since
+ * they are not computation bound and do not benefit from splitting
+ * the wait into two parts.
+ */
+#define DMA_WAIT(_mask) 	DMA_WAIT_REQUEST(_mask);	\
+				DMA_WAIT_RECEIVE();
+
+
+
+/* Add the unsigned 32-bit _addend to the 64 bit effective address _eah,_eal.
+ */
+#define EA_UADD32(_eah, _eal, _addend) {		\
+  unsigned int _a;					\
+							\
+  _a = _addend;						\
+  _eah += spu_extract(spu_genc(spu_promote(_eal, 0),	\
+			       spu_promote(_a, 0)), 0);	\
+  _eal += _a;						\
+}
+
+/* Add the signed 32-bit _addend to the 64 bit effective address _eah,_eal.
+ */
+#define EA_ADD32(_eah, _eal, _addend) {					\
+  vec_uint4 _va;							\
+									\
+  _va = spu_promote((unsigned int)_addend, 0);				\
+  _eah = spu_extract(spu_addx(spu_promote(_eah, 0),			\
+			      spu_rlmaska(_va, -31),			\
+			      spu_genc(spu_promote(_eal, 0), _va)), 0);	\
+  _eal += spu_extract(_va, 0);						\
+}
+
+
+
+/* Add the unsigned 64-bit addend specified by _ah,_al to the 64 bit effective 
+ * address _eah,_eal.
+ */
+#define EA_UADD64(_eah, _eal, _ah, _al)	{				\
+  vec_uint4 _vah, _val;							\
+									\
+  _vah = spu_promote((unsigned int)_ah, 0);				\
+  _val = spu_promote((unsigned int)_al, 0);				\
+  _eah = spu_extract(spu_addx(spu_promote(_eah, 0),			\
+			      _vah,					\
+			      spu_genc(spu_promote(_eal, 0), _val)), 0);\
+  _eal += spu_extract(_val, 0);						\
+}
+
+
+
+/* Multiply two unsigned 32-bit values, _m1 and _m2, and return the 64-bit product
+ * in _ph,_pl.
+ */
+#define EA_UMUL32(_ph, _pl, _m1, _m2)					\
+{									\
+  vec_uint4 _vll, _vlh, _vhl, _vh, _vl, _v0, _v1;			\
+  vec_ushort8 _va, _vb, _vb2;						\
+									\
+  _va  = (vec_ushort8)spu_promote(_m1, 0);				\
+  _vb  = (vec_ushort8)spu_promote(_m2, 0);				\
+  _vb2 = (vec_ushort8)spu_rl((vec_uint4)_vb, 16);			\
+									\
+  _vll = spu_mulo(_va, _vb);						\
+  _vlh = spu_mulo(_va, _vb2);						\
+  _vhl = spu_mule(_va, _vb2);						\
+									\
+  _vh = spu_mhhadd(_va, _vb, spu_add(spu_rlmask(_vhl, -16), spu_rlmask(_vlh, -16))); \
+									\
+  _v0 = spu_sl(_vhl, 16);						\
+  _v1 = spu_sl(_vlh, 16);						\
+									\
+  _vh = spu_add(_vh, spu_genc(_v1, _v0));				\
+  _vl = spu_add(_v1, _v0);						\
+  _vh = spu_add(_vh, spu_genc(_vl, _vll));				\
+  _vl = spu_add(_vl, _vll);						\
+									\
+  _ph = spu_extract(_vh, 0);						\
+  _pl = spu_extract(_vl, 0);						\
+}
+
+/* Multiply two unsigned 32-bit values, _m1 and _m2, and add the 64-bit product to
+ * the 64-bit effective address. 
+ */
+#define EA_UMADD32(_eah, _eal, _m1, _m2) {	\
+  unsigned int _ph, _pl;			\
+  EA_UMUL32(_ph, _pl, _m1, _m2);		\
+  EA_UADD64(_eah, _eal, _ph, _pl);		\
+}
+
+
+#ifdef PANEL_4GB_CROSSING
+#define PANEL_EA_ADD32(_eah, _eal, _addend)	EA_ADD32(_eah, _eal, _addend)
+#define PANEL_EA_UADD32(_eah, _eal, _addend)	EA_UADD32(_eah, _eal, _addend)
+#else
+#define PANEL_EA_ADD32(_eah, _eal, _addend)	_eal += _addend;
+#define PANEL_EA_UADD32(_eah, _eal, _addend)	_eal += _addend;
+#endif
+
+#ifdef MATRIX_4GB_CROSSING
+#define MATRIX_EA_ADD32(_eah, _eal, _addend)	EA_ADD32(_eah, _eal, _addend)
+#define MATRIX_EA_UADD32(_eah, _eal, _addend)	EA_UADD32(_eah, _eal, _addend)
+#define MATRIX_EA_UMADD32(_eah, _eal, _m1, _m2)	EA_UMADD32(_eah, _eal, _m1, _m2)
+#else
+#define MATRIX_EA_ADD32(_eah, _eal, _addend)	_eal += _addend;
+#define MATRIX_EA_UADD32(_eah, _eal, _addend)	_eal += _addend;
+#define MATRIX_EA_UMADD32(_eah, _eal, _m1, _m2)	_eal += _m1 * _m2;
+#endif
+
+
+/* report_completion
+ * -----------------
+ * Write a byte to system memory to report that the requested operation
+ * has been completed by the specified SPE. The DMA put is fenced using
+ * the specified tag ID so that the writeback is ordered with respect
+ * to the results posted to system memory. Callers MUST ensure that the
+ * tag ID is the same as the one used for the results DMA.
+ */
+static vec_uchar16 completion_writeback = (vec_uchar16){0};
+
+static inline void report_completion(int id, 
+				     unsigned long long incomplete_ea,
+				     unsigned int tag)
+{
+  unsigned int incomplete_hi, incomplete_lo;
+  unsigned int size;
+  void *lsa;
+
+  incomplete_lo = mfc_ea2l(incomplete_ea);
+  incomplete_hi = mfc_ea2h(incomplete_ea);
+
+  size = 1 & ~(spu_extract(spu_cmpeq(spu_or(spu_promote(incomplete_hi, 0),
+					    spu_promote(incomplete_lo, 0)), 0), 0));
+
+  incomplete_lo += id;
+
+  lsa = ((void *)&completion_writeback) + (incomplete_lo & 0xF);
+  
+  spu_mfcdma64(lsa, incomplete_hi, incomplete_lo, size, tag, MFC_PUTF_CMD);
+}
+
+
+#endif /* _ACCEL_UTILS_H_ */
+
+
+
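SPU registers hold 32-bit word slots, so effective addresses travel through these routines as {hi,lo} pairs, with spu_genc()/spu_addx() supplying the carry that 64-bit hardware would provide for free. The intended semantics of EA_UADD32, checked against native 64-bit arithmetic on a host:

#include <assert.h>
#include <stdint.h>

/* Host-side reference for EA_UADD32: widen, add, split. */
static void ea_uadd32_ref(uint32_t *eah, uint32_t *eal, uint32_t addend)
{
  uint64_t ea = ((uint64_t)*eah << 32) | *eal;

  ea += addend;
  *eah = (uint32_t)(ea >> 32);
  *eal = (uint32_t)ea;
}

int main(void)
{
  uint32_t hi = 3, lo = 0xFFFFFE00u;

  ea_uadd32_ref(&hi, &lo, 0x400);       /* crosses a 4GB boundary */
  assert(hi == 4 && lo == 0x200);
  return 0;
}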
Index: accel/lib/spu/hpl_accel_spu.c
===================================================================
RCS file: accel/lib/spu/hpl_accel_spu.c
diff -N accel/lib/spu/hpl_accel_spu.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/spu/hpl_accel_spu.c	20 Aug 2008 03:57:53 -0000	1.3
@@ -0,0 +1,60 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <spu_mfcio.h>
+#include <spu_intrinsics.h>
+#include "hpl_accel_spu.h"
+#include "accel_utils.h"
+#include "accel_spu.h"
+
+volatile hpl_accel_init_parms_t parms;
+
+volatile unsigned char cmd_parms[128] __attribute__ ((aligned (128)));
+
+
+int main(unsigned long long speid __attribute__ ((unused)), 
+	 unsigned long long parms_ea)
+{
+  unsigned int cmd;
+  uint64_t cmd_queue;
+
+  /* Fetch the global parameters
+   */
+
+  mfc_get(&parms, parms_ea, sizeof(parms), HPL_ACCEL_PARM_TAG, 0, 0);
+  DMA_WAIT(1 << HPL_ACCEL_PARM_TAG);
+
+  while (1) {
+    cmd = spu_readch(SPU_RdInMbox);
+
+    /* Fetch the command parameters 
+     */
+    cmd_queue = parms.cmd_base + (cmd & ~HPL_ACCEL_CMD_MASK);
+    
+    mfc_get((volatile void *)cmd_parms, cmd_queue, 128, HPL_ACCEL_PARM_TAG, 0, 0);
+
+    DMA_WAIT_REQUEST(1<<HPL_ACCEL_PARM_TAG);
+
+    /* Dispatch the command request
+     */
+    dispatch[cmd & HPL_ACCEL_CMD_MASK]((hpl_accel_init_parms_t *)&parms, (void *)cmd_parms);
+  }
+
+  return 0;
+}
+    
+
+
+void accel_fini(hpl_accel_init_parms_t *parms __attribute__ ((unused)), 
+		volatile void *cmd_parms __attribute__ ((unused)))
+{
+  /* Wait for all the outstanding DMAs to complete, before exiting.
+   */
+  DMA_WAIT(-1);
+
+  exit(0);
+}
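The PPU side of this mailbox protocol is not part of this file; assuming the standard libspe2 interface, submitting a command word to the loop above amounts to a single inbound-mailbox write. The command packing and queue setup live in hpl_accel.h and the hpl_accel_ppu library, so cmd here is illustrative:

#include <libspe2.h>

static int send_cmd(spe_context_ptr_t spe, unsigned int cmd)
{
  /* Blocks until the SPE's inbound mailbox has room for the entry. */
  return spe_in_mbox_write(spe, &cmd, 1, SPE_MBOX_ALL_BLOCKING);
}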
Index: accel/lib/tests/Makefile
===================================================================
RCS file: accel/lib/tests/Makefile
diff -N accel/lib/tests/Makefile
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/tests/Makefile	20 Aug 2008 03:57:53 -0000	1.6
@@ -0,0 +1,58 @@
+# --------------------------------------------------------------- 
+# (C) Copyright IBM Corporation 2007,2008
+#                                                                 
+# --------------------------------------------------------------- 
+
+########################################################################
+#                       Subdirectories
+########################################################################
+
+DIRS            = 
+
+########################################################################
+#                       Target
+########################################################################
+
+PROGRAMS_ppu64 = copy_rows      \
+		 dgemm		\
+		 dgemm_CL_B_B	\
+		 dgemm_CL_R_B_CL\
+		 dgemm_CL_B_B_CL\
+		 dgemm_C_C_C	\
+		 dtrsm		\
+		 dtrsm_CL_B	\
+		 reform		\
+		 reform_matrix	\
+		 reform_lpanel	\
+		 reform_upanel	\
+		 reform_rows	\
+		 swap_rows	\
+	         perf_dgemm	\
+	         perf_dgemm_C	\
+	         perf_dtrsm	\
+		 perf_reform_lpanel	\
+		 perf_reform_upanel	\
+		 perf_reform_matrix	\
+		 perf_reform_rows
+
+########################################################################
+#                       Local Defines
+########################################################################
+
+INCLUDE		= -I..
+
+#CC_OPT_LEVEL	= -ggdb3
+
+#CPPFLAGS	= -DACCEL_LITTLE_ENDIAN
+#CPPFLAGS       += -DMATRIX_4GB_CROSSING
+#CPPFLAGS       += -DPANEL_4GB_CROSSING 
+
+LDFLAGS		= -L.. -L../spu
+IMPORTS         = -lhpl_accel_ppu -lspe2 -lhpl_accel_spu -lm
+
+
+########################################################################
+#                       make.footer
+########################################################################
+
+include $(CELL_TOP)/buildutils/make.footer
Index: accel/lib/tests/copy_rows.c
===================================================================
RCS file: accel/lib/tests/copy_rows.c
diff -N accel/lib/tests/copy_rows.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/tests/copy_rows.c	20 Aug 2008 03:57:53 -0000	1.3
@@ -0,0 +1,118 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <ppu_intrinsics.h>
+#include "hpl_accel.h"
+#include "test_utils.h"
+
+int norepeat_rand_row(int max, int *rows, int cnt)
+{
+  int i;
+  int new_row;
+  int unique;
+
+  do {
+    new_row = (max * (rand() & 0xFFFF)) >> 16;
+    unique = 1;
+
+    for (i=0; i<cnt; i++) {
+      if (new_row == rows[i]) unique = 0;
+    }
+  } while (!unique);
+  return (new_row);
+}
+
+/* copy row
+ */
+int main(int argc, char *argv[])
+{
+  int i;
+  int errors;
+  int n=64;
+  int m=1;
+  int lda=0;
+  int ldb=0;
+  int *rows;
+  volatile unsigned long long incomplete;
+  double *A, *B1, *B2;
+
+  switch (argc) {
+  case 5:
+    ldb = atoi(argv[4]);
+  case 4:
+    lda = atoi(argv[3]);
+  case 3:
+    n = atoi(argv[2]);
+  case 2:
+    m = atoi(argv[1]);
+    break;
+  case 1:
+    /* No parameters, randomly select a parameter set.
+     */
+    srand((unsigned int)__mftb());
+    m = rand() % 128;
+    n = rand() % 4096;
+    lda = (rand() & 1) ? n : ((n+M_SUB-1)/M_SUB)*M_SUB;
+    ldb = (rand() & 1) ? n : ((n+M_SUB-1)/M_SUB)*M_SUB;
+    break;
+  default:
+    printf("Usage: %s [m [n [lda [ldb]]]]\n", argv[0]);
+    return 1;
+    break;
+  }
+
+  /* Apply functional constraints to the parameter set.
+   */
+  if (m < 1) m = 1;
+  if (lda < n) lda = n;
+  if (ldb < n) ldb = n;
+
+  printf("Performing copy_rows test with m=%d n=%d lda=%d ldb=%d\n", m, n, lda, ldb);
+
+  /* Allocate and initialize the arrays
+   */
+
+  A  = (double *)allocate_panel(1, lda*m*sizeof(double), 128);	/* Never cross a 4GB boundary */
+  B1 = (double *)allocate_panel(1, ldb*m*sizeof(double), 128);	/* Never cross a 4GB boundary */
+  B2 = (double *)allocate_panel(1, ldb*m*sizeof(double), 128);	/* Never cross a 4GB boundary */
+
+  rows = (int *)allocate_panel(1, m * sizeof(int), 4);		/* Never cross a 4GB boundary */
+
+  if ((A == NULL) || (B1 == NULL) || (B2 == NULL) || (rows == NULL)) {
+      printf("Failed to allocate buffers. Total allocation is %f MB.\n", ((double)(lda+2*ldb)*m*sizeof(double))/(1024*1024) );
+    return 0;
+  }
+
+  /* Test Row to Row copy */
+  for (i=0; i<m*lda; i++) A[i] = drand48();
+  for (i=0; i<m*ldb; i++) B1[i] = B2[i] = 0;
+  for (i=0; i<m; i++) rows[i] = norepeat_rand_row(m,rows,i);
+
+  hpl_ref_init();
+  hpl_ref_copy_rows_R_to_R(m, n, A, lda, B1, ldb, rows, (unsigned long long *)&incomplete);
+  while (incomplete);
+
+  hpl_accel_init();
+  hpl_accel_copy_rows_R_to_R(m, n, A, lda, B2, ldb, rows, (unsigned long long *)&incomplete);
+  while (incomplete);
+
+  /* Compare the results 
+   */
+  for (i=m*ldb-1, errors=0; i>=0; i--) {
+    if (B1[i] != B2[i]) {
+      errors++;
+      if (errors < 20) printf("B1<->B2 %d expected=%f got=%f\n", i, B1[i], B2[i]);
+    }
+  }
+
+  printf("Errors = %d\n", errors);
+
+  hpl_accel_fini();
+
+  return ((errors) ? 1 : 0);
+}
Index: accel/lib/tests/dgemm.c
===================================================================
RCS file: accel/lib/tests/dgemm.c
diff -N accel/lib/tests/dgemm.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/tests/dgemm.c	20 Aug 2008 03:57:53 -0000	1.3
@@ -0,0 +1,113 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <ppu_intrinsics.h>
+#include "hpl_accel.h"
+#include "test_utils.h"
+
+
+#define EPSILON		0.0000001
+
+
+/* dgemm
+ */
+int main(int argc, char *argv[])
+{
+  int i;
+  int errors;
+  int lda = 0;
+  int ldb = 0;
+  int ldc = 0;
+  int k=128;
+  int n=128;
+  int m=128;
+  volatile unsigned long long incomplete;
+  double *A, *B, *C1, *C2;
+
+  switch (argc) {
+  case 6:
+    ldc = atoi(argv[5]);
+  case 5:
+    ldb = atoi(argv[4]);
+  case 4:
+    lda = atoi(argv[3]);
+  case 3:
+    n = atoi(argv[2]);
+  case 2:
+    m = atoi(argv[1]);
+    break;
+  case 1:
+    /* No parameters, randomly select a parameter set.
+     */
+    srand((unsigned int)__mftb());
+    m = rand() % 1280;
+    n = rand() % 1280;
+    lda = rand() % 2048;
+    ldb = rand() % 2048;
+    ldc = rand() % (64*2048);
+    break;
+  default:
+    printf("Usage: %s [m [n [lda [ldb [ldc]]]]]\n", argv[0]);
+    return 1;
+    break;
+  }
+
+  /* Apply functional constraints to the parameter set.
+   */
+  m = m & ~(64-1);		
+  if (m < 64) m = 64;
+
+  n = n & ~(64-1);
+  if (n < 64) n = 64;
+
+  lda = lda & ~(15);
+  if (lda < m) lda = m;
+
+  ldb = ldb & ~(15);
+  if (ldb < n) ldb = n;
+
+  ldc = ldc & ~(15);
+  if (ldc < 64*m) ldc = 64*m;
+ 
+  printf("Performing dgemm test with m=%d n=%d k=%d  lda=%d ldb=%d ldc=%d\n", m, n, k, lda, ldb, ldc);
+
+  /* Allocate and initialize the arrays
+   */
+  A = (double *)allocate_panel(k, lda, 128);
+  B = (double *)allocate_panel(k, ldb, 128);
+  C1 = (double *)allocate_matrix(n/64, ldc, 128);
+  C2 = (double *)allocate_matrix(n/64, ldc, 128);
+
+  for (i=0; i<lda*k; i++) A[i] = byte_swap(drand48());
+  for (i=0; i<k*ldb; i++) B[i] = drand48();
+  for (i=0; i<m*n; i++) C1[i] = C2[i] = drand48();
+
+  hpl_ref_init();
+  hpl_ref_dgemm_CL_R_B_CL(m, n, k, A, lda, B, ldb, C1, ldc, 0, 0, NULL, 0, (unsigned long long *)&incomplete);
+		       
+  while(incomplete);
+
+  hpl_accel_init();
+  hpl_accel_dgemm_CL_R_B_CL(m, n, k, A, lda, B, ldb, C2, ldc, 0, 0, NULL, 0, (unsigned long long *)&incomplete);
+		       
+  while(incomplete);
+
+  hpl_accel_fini();
+
+  /* Compare the results 
+   */
+  for (i=n*m-1, errors=0; i>=0; i--) {
+    if (fabs(C1[i] - C2[i]) > EPSILON) { 
+      errors++;
+      printf(" %d expected=%f got=%f\n", i, C1[i], C2[i]);
+    }
+  }
+  printf("Errors = %d\n", errors);
+
+  return ((errors) ? 1 : 0);
+}
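byte_swap() comes from test_utils.h, which is not part of this diff; the tests apply it because the "CL" panels are stored little-endian while the PPU generating the data is big-endian. A minimal sketch of what it presumably does, reversing the eight bytes of a double:

#include <stdint.h>

static double byte_swap_sketch(double d)   /* hypothetical equivalent */
{
  union { double d; uint64_t u; } v;
  uint64_t x;

  v.d = d;
  x = v.u;
  x = (x << 32) | (x >> 32);
  x = ((x & 0x0000FFFF0000FFFFull) << 16) | ((x >> 16) & 0x0000FFFF0000FFFFull);
  x = ((x & 0x00FF00FF00FF00FFull) <<  8) | ((x >>  8) & 0x00FF00FF00FF00FFull);
  v.u = x;
  return v.d;
}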
Index: accel/lib/tests/dgemm_CL_B_B.c
===================================================================
RCS file: accel/lib/tests/dgemm_CL_B_B.c
diff -N accel/lib/tests/dgemm_CL_B_B.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/tests/dgemm_CL_B_B.c	20 Aug 2008 03:57:53 -0000	1.3
@@ -0,0 +1,113 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <ppu_intrinsics.h>
+#include "hpl_accel.h"
+#include "test_utils.h"
+
+
+#define EPSILON		0.0000001
+
+
+/* dgemm
+ */
+int main(int argc, char *argv[])
+{
+  int i;
+  int errors;
+  int lda = 0;
+  int ldb = 0;
+  int ldc = 0;
+  int k=128;
+  int n=128;
+  int m=128;
+  volatile unsigned long long incomplete;
+  double *A, *B, *C1, *C2;
+
+  switch (argc) {
+  case 6:
+    ldc = atoi(argv[5]);
+  case 5:
+    ldb = atoi(argv[4]);
+  case 4:
+    lda = atoi(argv[3]);
+  case 3:
+    n = atoi(argv[2]);
+  case 2:
+    m = atoi(argv[1]);
+    break;
+  case 1:
+    /* No parameters, randomly select a parameter set.
+     */
+    srand((unsigned int)__mftb());
+    m = rand() % 1280;
+    n = rand() % 1280;
+    lda = rand() % 2048;
+    ldb = rand() % 16384;
+    ldc = rand() % (64*2048);
+    break;
+  default:
+    printf("Usage: %s [m [n [lda [ldb [ldc]]]]]\n", argv[0]);
+    return 1;
+    break;
+  }
+
+  /* Apply functional constraints to the parameter set.
+   */
+  m = m & ~(64-1);		
+  if (m < 64) m = 64;
+
+  n = n & ~(64-1);
+  if (n < 64) n = 64;
+
+  lda = lda & ~(15);
+  if (lda < m) lda = m;
+
+  ldb = ldb & ~(15);
+  if (ldb < 64*k) ldb = 64*k;
+
+  ldc = ldc & ~(15);
+  if (ldc < 64*m) ldc = 64*m;
+ 
+  printf("Performing dgemm test with m=%d n=%d k=%d  lda=%d ldb=%d ldc=%d\n", m, n, k, lda, ldb, ldc);
+
+  /* Allocate and initialize the arrays
+   */
+  A = (double *)allocate_panel(k, lda, 128);
+  B = (double *)allocate_matrix(n/64, ldb, 128);
+  C1 = (double *)allocate_matrix(n/64, ldc, 128);
+  C2 = (double *)allocate_matrix(n/64, ldc, 128);
+
+  for (i=0; i<lda*k; i++) A[i] = byte_swap(drand48());
+  for (i=0; i<ldb*(n/64); i++) B[i] = drand48();
+  for (i=0; i<m*n; i++) C1[i] = C2[i] = drand48();
+
+  hpl_ref_init();
+  hpl_ref_dgemm_CL_B_B_CL(m, n, k, A, lda, B, ldb, C1, ldc, 0, 0, NULL, 0, (unsigned long long *)&incomplete);
+		       
+  while(incomplete);
+
+  hpl_accel_init();
+  hpl_accel_dgemm_CL_B_B_CL(m, n, k, A, lda, B, ldb, C2, ldc, 0, 0, NULL, 0, (unsigned long long *)&incomplete);
+		       
+  while(incomplete);
+
+  /* Compare the results 
+   */
+  for (i=n*m-1, errors=0; i>=0; i--) {
+    if (fabs(C1[i] - C2[i]) > EPSILON) { 
+      errors++;
+      printf(" %d expected=%f got=%f\n", i, C1[i], C2[i]);
+    }
+  }
+  printf("Errors = %d\n", errors);
+
+  hpl_accel_fini();
+
+  return ((errors) ? 1 : 0);
+}
Index: accel/lib/tests/dgemm_CL_B_B_CL.c
===================================================================
RCS file: accel/lib/tests/dgemm_CL_B_B_CL.c
diff -N accel/lib/tests/dgemm_CL_B_B_CL.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/tests/dgemm_CL_B_B_CL.c	20 Aug 2008 03:57:53 -0000	1.3
@@ -0,0 +1,179 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <ppu_intrinsics.h>
+#include "hpl_accel.h"
+#include "test_utils.h"
+
+
+#define EPSILON		0.0000001
+
+unsigned long long dab(double d)
+{
+  union {
+    unsigned long long ull;
+    double d;
+  } x;
+  x.d = d;
+  return (x.ull);
+}
+
+
+/* dgemm 
+ */
+int main(int argc, char *argv[])
+{
+  int i;
+  int errors;
+  int lda = 0;
+  int ldb = 0;
+  int ldc = 0;
+  int ldp = 0;
+  int c_col = 0;
+  int c_row = 0;
+  int c_cols, c_rows;
+  int k=128;
+  int n=128;
+  int m=128;
+  int csize, psize, bsize;
+  volatile unsigned long long incomplete;
+  double *A, *B, *C1, *C2;
+  double *P1 = NULL;
+  double *P2 = NULL;
+
+  switch (argc) {
+  case 9:
+    c_col = atoi(argv[8]);
+  case 8:
+    c_row = atoi(argv[7]);
+  case 7:
+    ldp = atoi(argv[6]);
+  case 6:
+    ldc = atoi(argv[5]);
+  case 5:
+    ldb = atoi(argv[4]);
+  case 4:
+    lda = atoi(argv[3]);
+  case 3:
+    n = atoi(argv[2]);
+  case 2:
+    m = atoi(argv[1]);
+    break;
+  case 1:
+    /* No parameters, randomly select a parameter set.
+     */
+    srand((unsigned int)__mftb());
+    m = rand() % 1024;
+    n = rand() % 1024;
+    lda = rand() % 1200;
+    ldb = rand() % 1200;
+    ldc = rand() % 1200;
+    if (rand() & 1) ldp = rand() % 1200;
+    c_row = rand() % 150;
+    c_col = rand() % 150;
+
+    if (rand() & 1) {
+      lda &= ~1;
+      ldb &= ~1;
+      ldc &= ~1;
+      ldp &= ~1;
+      c_row = 0;
+      c_col = 0;
+    }
+
+    break;
+  default:
+    printf("Usage: %s [m [n [lda [ldb [ldc [ldp [c_col [c_row]]]]]]]]\n", argv[0]);
+    return 1;
+    break;
+  }
+
+  /* Apply functional constraints to the parameter set.
+   */
+  if (lda < m) lda = m;
+  lda = (lda + 15) & ~(15);
+
+  if (ldb < M_SUB*k) ldb = M_SUB*k;
+  ldb = (ldb + 15) & ~(15);
+
+  if (ldc < m) ldc = m;
+  ldc = (ldc + 15) & ~(15);
+ 
+  if (ldp) {
+    if (ldp < m) ldp = m;
+    ldp = (ldp + 15) & ~(15);
+  }
+
+  c_cols = c_col + n;
+  c_rows = c_row + m;
+  
+  if (ldc < c_rows*64) ldc = c_rows*64;
+  ldc = ((ldc + 63) & ~63);
+  c_cols = (c_cols + 63) & ~63;
+
+  csize = ldc*c_cols/64;
+  psize = ldp*n;
+
+  bsize = ldb*(n+M_SUB-1)/M_SUB;
+
+  printf("Performing dgemm test with m=%d n=%d lda=%d ldb=%d ldc=%d ldp=%d c_row=%d c_col=%d\n", m, n, lda, ldb, ldc, ldp, c_row, c_col);
+
+  /* Allocate and initialize the arrays
+   */
+  A = (double *)allocate_panel(k, lda, 128);
+  B = (double *)allocate_matrix(n+63, ldb, 128);
+  if (ldp) {
+    C1 = C2 = (double *)allocate_matrix(c_cols, ldc, 128);
+    P1 = (double *)allocate_panel(n, ldp, 128);
+    P2 = (double *)allocate_panel(n, ldp, 128);
+    for (i=0; i<ldp*n; i++) P1[i] = P2[i] = drand48();
+  } else {
+    C1 = (double *)allocate_matrix(c_cols, ldc, 128);
+    C2 = (double *)allocate_matrix(c_cols, ldc, 128);
+  }
+
+  for (i=0; i<lda*k; i++) A[i] = byte_swap(drand48());
+  for (i=0; i<bsize; i++) B[i] = drand48();
+  for (i=0; i<(ldc/64)*n; i++) C1[i] = C2[i] = drand48();
+
+  hpl_ref_init();
+  hpl_ref_dgemm_CL_B_B_CL(m, n, k, A, lda, B, ldb, C1, ldc, c_row, c_col, P1, ldp, (unsigned long long *)&incomplete);
+		       
+  while(incomplete);
+
+  hpl_accel_init();
+  hpl_accel_dgemm_CL_B_B_CL(m, n, k, A, lda, B, ldb, C2, ldc, c_row, c_col, P2, ldp, (unsigned long long *)&incomplete);
+		       
+  while(incomplete);
+
+  /* Compare the results 
+   */
+  if (P1) {
+    for (i=psize-1, errors=0; i>=0; i--) {
+      double p1, p2;
+
+      p1 = byte_swap(P1[i]);
+      p2 = byte_swap(P2[i]);
+
+      if (fabs(p1 - p2) > EPSILON) { 
+	if (errors++ < 20) printf(" %d expected=%f got=%f\n", i, p1, p2);
+      }
+    }
+  } else {
+    for (i=csize-1, errors=0; i>=0; i--) {
+      if (fabs(C1[i] - C2[i]) > EPSILON) { 
+	if (errors++ < 20) printf(" %d expected=%f got=%f\n", i, C1[i], C2[i]);
+      }
+    }
+  }
+  printf("Errors = %d\n", errors);
+
+  hpl_accel_fini();
+
+  return ((errors) ? 1 : 0);
+}
Index: accel/lib/tests/dgemm_CL_C_C.c
===================================================================
RCS file: accel/lib/tests/dgemm_CL_C_C.c
diff -N accel/lib/tests/dgemm_CL_C_C.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/tests/dgemm_CL_C_C.c	14 May 2008 21:35:01 -0000	1.3
@@ -0,0 +1,126 @@
+/* ------------------------------------------------------------------ */
+/* (C) Copyright 2007                                                 */
+/* International Business Machines Corporation,                       */
+/*                                                                    */
+/* All Rights Reserved.                                               */
+/* ------------------------------------------------------------------ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <ppu_intrinsics.h>
+#include "hpl_accel.h"
+#include "test_utils.h"
+
+
+#define EPSILON		0.0000001
+
+
+/* dgemm_CL_C_C
+ */
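+/* The name suffixes appear to encode each operand's storage format: CL
+ * for column-major little-endian, C for column-major big-endian, B for
+ * blocked big-endian and R for row-ordered (compare the reform_*_CL_to_B
+ * helpers elsewhere in this patch).  Hence the CL operand A is filled
+ * through byte_swap() below while B and C are not.
+ */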
+int main(int argc, char *argv[])
+{
+  int i;
+  int errors;
+  int lda = 0;
+  int ldb = 0;
+  int ldc = 0;
+  int k=64;
+  int n=64;
+  int m=128;
+  volatile unsigned long long incomplete;
+  double *A, *B, *C1, *C2;
+
+  switch (argc) {
+  case 7:
+    ldc = atoi(argv[6]);
+  case 6:
+    ldb = atoi(argv[5]);
+  case 5:
+    lda = atoi(argv[4]);
+  case 4:
+    k = atoi(argv[3]);
+  case 3:
+    n = atoi(argv[2]);
+  case 2:
+    m = atoi(argv[1]);
+    break;
+  case 1:
+    /* No parameters, randomly select a parameter set.
+     */
+    srand((unsigned int)__mftb());
+    m = rand() % 2000;
+    n = rand() % 70;
+    k = rand() % 70;
+    lda = rand() % 2000;
+    ldb = rand() % 200;
+    ldc = rand() % 2000;
+
+    /* Force all parameters to within the constraints */
+    if ((rand() & 1) == 0) {
+      k &= ~(4-1);
+      m &= ~(8-1);
+      n &= ~(4-1);
+
+      if (k < 4) k = 4;
+      if (k > 64) k = 64;
+      if (m < 8) m = 8;
+      if (n < 4) n = 4;
+      if (n > 64) n = 64;
+
+      lda &= ~1;
+      ldb &= ~1;
+      ldc &= ~1;
+    }
+    break;
+  default:
+    printf("Usage: %s [m [n [k [lda [ldb [ldc]]]]]]\n", argv[0]);
+    return 1;
+    break;
+  }
+
+  /* Apply functional constraints to the parameter set.
+   */
+  if (m == 0) m = 1;
+  if (n == 0) n = 1;
+  if (k == 0) k = 1;
+
+  if (lda < m) lda = m;
+  if (ldb < k) ldb = k;
+  if (ldc < m) ldc = m;
+ 
+  printf("Performing dgemm_CL_C_C test with m=%d n=%d k=%d  lda=%d ldb=%d ldc=%d\n", m, n, k, lda, ldb, ldc);
+
+  /* Allocate and initialize the arrays
+   */
+  A = (double *)allocate_panel(k, lda, 128);
+  B = (double *)allocate_panel(n, ldb, 128);
+  C1 = (double *)allocate_panel(n, ldc, 128);
+  C2 = (double *)allocate_panel(n, ldc, 128);
+
+  for (i=0; i<k*lda; i++) A[i] = byte_swap(drand48());
+  for (i=0; i<n*ldb; i++) B[i] = drand48();
+  for (i=0; i<n*ldc; i++) C1[i] = C2[i] = drand48();
+
+  hpl_ref_init();
+  hpl_ref_dgemm_CL_C_C(m, n, k, A, lda, B, ldb, C1, ldc, (unsigned long long *)&incomplete);
+		       
+  while(incomplete);
+
+  hpl_accel_init();
+  hpl_accel_dgemm_CL_C_C(m, n, k, A, lda, B, ldb, C2, ldc, (unsigned long long *)&incomplete);
+		       
+  while(incomplete);
+
+  /* Compare the results 
+   */
+  for (i=n*ldc-1, errors=0; i>=0; i--) {
+    if (fabs(C1[i] - C2[i]) > EPSILON) { 
+      errors++;
+      printf(" %d expected=%f got=%f\n", i, C1[i], C2[i]);
+    }
+  }
+  printf("Errors = %d\n", errors);
+
+  hpl_accel_fini();
+
+  return ((errors) ? 1 : 0);
+}
Index: accel/lib/tests/dgemm_CL_R_B_CL.c
===================================================================
RCS file: accel/lib/tests/dgemm_CL_R_B_CL.c
diff -N accel/lib/tests/dgemm_CL_R_B_CL.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/tests/dgemm_CL_R_B_CL.c	20 Aug 2008 03:57:53 -0000	1.3
@@ -0,0 +1,177 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <ppu_intrinsics.h>
+#include "hpl_accel.h"
+#include "test_utils.h"
+
+
+#define EPSILON		0.0000001
+
+unsigned long long dab(double d)
+{
+  union {
+    unsigned long long ull;
+    double d;
+  } x;
+  x.d = d;
+  return (x.ull);
+}
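+/* dab() type-puns a double to its raw bit pattern through a union; it is
+ * not referenced in this test and was presumably left in for debugging.
+ */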
+
+
+/* dgemm 
+ */
+int main(int argc, char *argv[])
+{
+  int i;
+  int errors;
+  int lda = 0;
+  int ldb = 0;
+  int ldc = 0;
+  int ldp = 0;
+  int c_col = 0;
+  int c_row = 0;
+  int c_cols, c_rows;
+  int k=128;
+  int n=128;
+  int m=128;
+  int csize, psize;
+  volatile unsigned long long incomplete;
+  double *A, *B, *C1, *C2;
+  double *P1 = NULL;
+  double *P2 = NULL;
+
+  switch (argc) {
+  case 9:
+    c_col = atoi(argv[8]);
+  case 8:
+    c_row = atoi(argv[7]);
+  case 7:
+    ldp = atoi(argv[6]);
+  case 6:
+    ldc = atoi(argv[5]);
+  case 5:
+    ldb = atoi(argv[4]);
+  case 4:
+    lda = atoi(argv[3]);
+  case 3:
+    n = atoi(argv[2]);
+  case 2:
+    m = atoi(argv[1]);
+    break;
+  case 1:
+    /* No parameters, randomly select a parameter set.
+     */
+    srand((unsigned int)__mftb());
+    m = rand() % 1024;
+    n = rand() % 1024;
+    lda = rand() % 1536;
+    ldb = rand() % 1536;
+    ldc = rand() % 1536;
+    if (rand() & 1) ldp = rand() % 1536;
+    c_row = rand() % 256;
+    c_col = rand() % 256;
+
+    if (rand() & 1) {
+      lda &= ~1;
+      ldb &= ~1;
+      ldc &= ~1;
+      ldp &= ~1;
+      c_row = 0;
+      c_col = 0;
+    }
+
+    break;
+  default:
+    printf("Usage: %s [m [n [lda [ldb [ldc [ldp [c_col [c_row]]]]]]]]\n", argv[0]);
+    return 1;
+    break;
+  }
+
+  /* Apply functional constraints to the parameter set.
+   */
+  if (lda < m) lda = m;
+  lda = (lda + 15) & ~(15);
+
+  if (ldb < n) ldb = n;
+  ldb = (ldb + 15) & ~(15);
+
+  if (ldc < m) ldc = m;
+  ldc = (ldc + 15) & ~(15);
+ 
+  if (ldp) {
+    if (ldp < m) ldp = m;
+    ldp = (ldp + 15) & ~(15);
+  }
+
+  c_cols = c_col + n;
+  c_rows = c_row + m;
+  
+  if (ldc < c_rows*64) ldc = c_rows*64;
+  ldc = ((ldc + 63) & ~63);
+  c_cols = (c_cols + 63) & ~63;
+
+  csize = ldc*c_cols/64;
+  psize = ldp*n;
+
+  printf("Performing dgemm test with m=%d n=%d lda=%d ldb=%d ldc=%d ldp=%d c_row=%d c_col=%d\n", m, n, lda, ldb, ldc, ldp, c_row, c_col);
+
+  /* Allocate and initialize the arrays
+   */
+  A = (double *)allocate_panel(k, lda, 128);
+  B = (double *)allocate_panel(k, ldb, 128);
+  if (ldp) {
+    C1 = C2 = (double *)allocate_matrix(c_cols, ldc, 128);
+    P1 = (double *)allocate_panel(n, ldp, 128);
+    P2 = (double *)allocate_panel(n, ldp, 128);
+    for (i=0; i<ldp*n; i++) P1[i] = P2[i] = drand48();
+  } else {
+    C1 = (double *)allocate_matrix(c_cols, ldc, 128);
+    C2 = (double *)allocate_matrix(c_cols, ldc, 128);
+  }
+
+  for (i=0; i<lda*k; i++) A[i] = byte_swap(drand48());
+  for (i=0; i<k*ldb; i++) B[i] = drand48();
+  for (i=0; i<(ldc/64)*n; i++) C1[i] = C2[i] = drand48();
+
+  hpl_ref_init();
+  hpl_ref_dgemm_CL_R_B_CL(m, n, k, A, lda, B, ldb, C1, ldc, c_row, c_col, P1, ldp, (unsigned long long *)&incomplete);
+		       
+  while(incomplete);
+
+  hpl_accel_init();
+  hpl_accel_dgemm_CL_R_B_CL(m, n, k, A, lda, B, ldb, C2, ldc, c_row, c_col, P2, ldp, (unsigned long long *)&incomplete);
+		       
+  while(incomplete);
+
+  /* Compare the results 
+   */
+  if (P1) {
+    for (i=psize-1, errors=0; i>=0; i--) {
+      double p1, p2;
+
+      p1 = byte_swap(P1[i]);
+      p2 = byte_swap(P2[i]);
+
+      if (fabs(p1 - p2) > EPSILON) { 
+	if (errors++ < 20) printf(" %d expected=%f got=%f\n", i, p1, p2);
+      }
+    }
+  } else {
+    for (i=csize-1, errors=0; i>=0; i--) {
+      if (fabs(C1[i] - C2[i]) > EPSILON) { 
+	if (errors++ < 20) printf(" %d expected=%f got=%f\n", i, C1[i], C2[i]);
+      }
+    }
+  }
+  printf("Errors = %d\n", errors);
+
+  hpl_accel_fini();
+
+  return ((errors) ? 1 : 0);
+}
Index: accel/lib/tests/dgemm_C_C_C.c
===================================================================
RCS file: accel/lib/tests/dgemm_C_C_C.c
diff -N accel/lib/tests/dgemm_C_C_C.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/tests/dgemm_C_C_C.c	20 Aug 2008 03:57:53 -0000	1.4
@@ -0,0 +1,126 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <ppu_intrinsics.h>
+#include "hpl_accel.h"
+#include "test_utils.h"
+
+
+#define EPSILON		0.0000001
+
+
+/* dgemm_C_C_C
+ */
+int main(int argc, char *argv[])
+{
+  int i;
+  int errors;
+  int lda = 0;
+  int ldb = 0;
+  int ldc = 0;
+  int k=64;
+  int n=64;
+  int m=128;
+  volatile unsigned long long incomplete;
+  double *A, *B, *C1, *C2;
+
+  switch (argc) {
+  case 7:
+    ldc = atoi(argv[6]);
+  case 6:
+    ldb = atoi(argv[5]);
+  case 5:
+    lda = atoi(argv[4]);
+  case 4:
+    k = atoi(argv[3]);
+  case 3:
+    n = atoi(argv[2]);
+  case 2:
+    m = atoi(argv[1]);
+    break;
+  case 1:
+    /* No parameters, randomly select a parameter set.
+     */
+    srand((unsigned int)__mftb());
+    m = rand() % 2000;
+    n = rand() % 70;
+    k = rand() % 70;
+    lda = rand() % 2000;
+    ldb = rand() % 200;
+    ldc = rand() % 2000;
+
+    /* Force all parameters to within the constraints */
+    if ((rand() & 1) == 0) {
+      k &= ~(4-1);
+      m &= ~(8-1);
+      n &= ~(4-1);
+
+      if (k < 4) k = 4;
+      if (k > 64) k = 64;
+      if (m < 8) m = 8;
+      if (n < 4) n = 4;
+      if (n > 64) n = 64;
+
+      lda &= ~1;
+      ldb &= ~1;
+      ldc &= ~1;
+    }
+    break;
+  default:
+    printf("Usage: %s [m [n [k [lda [ldb [ldc]]]]]]\n", argv[0]);
+    return 1;
+    break;
+  }
+
+  /* Apply functional constraints to the parameter set.
+   */
+  if (m == 0) m = 1;
+  if (n == 0) n = 1;
+  if (k == 0) k = 1;
+
+  if (lda < m) lda = m;
+  if (ldb < k) ldb = k;
+  if (ldc < m) ldc = m;
+ 
+  printf("Performing dgemm_C_C_C test with m=%d n=%d k=%d  lda=%d ldb=%d ldc=%d\n", m, n, k, lda, ldb, ldc);
+
+  /* Allocate and initialize the arrays
+   */
+  A = (double *)allocate_panel(k, lda, 128);
+  B = (double *)allocate_panel(n, ldb, 128);
+  C1 = (double *)allocate_panel(n, ldc, 128);
+  C2 = (double *)allocate_panel(n, ldc, 128);
+
+  for (i=0; i<k*lda; i++) A[i] = drand48();
+  for (i=0; i<n*ldb; i++) B[i] = drand48();
+  for (i=0; i<n*ldc; i++) C1[i] = C2[i] = drand48();
+
+  hpl_ref_init();
+  hpl_ref_dgemm_C_C_C(m, n, k, A, lda, B, ldb, C1, ldc, (unsigned long long *)&incomplete);
+		       
+  while(incomplete);
+
+  hpl_accel_init();
+  hpl_accel_dgemm_C_C_C(m, n, k, A, lda, B, ldb, C2, ldc, (unsigned long long *)&incomplete);
+		       
+  while(incomplete);
+
+  /* Compare the results 
+   */
+  for (i=n*ldc-1, errors=0; i>=0; i--) {
+    if (fabs(C1[i] - C2[i]) > EPSILON) { 
+      errors++;
+      printf(" %d expected=%f got=%f\n", i, C1[i], C2[i]);
+    }
+  }
+  printf("Errors = %d\n", errors);
+
+  hpl_accel_fini();
+
+  return ((errors) ? 1 : 0);
+}
Index: accel/lib/tests/dtrsm.c
===================================================================
RCS file: accel/lib/tests/dtrsm.c
diff -N accel/lib/tests/dtrsm.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/tests/dtrsm.c	20 Aug 2008 03:57:53 -0000	1.3
@@ -0,0 +1,147 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <ppu_intrinsics.h>
+#include "hpl_accel.h"
+#include "test_utils.h"
+
+#define EPSILON		0.0000001
+
+
+/* dtrsm
+ */
+int main(int argc, char *argv[])
+{
+  int i;
+  int errors;
+  int lda = 0;
+  int ldb = 0;
+  int ldc = 0;
+  int n=128;
+  int n_padded, nb, m_padded;
+  int m=128;
+  unsigned int blk_col = 0;
+  unsigned int blk_row = 0;
+  volatile unsigned long long incomplete;
+  double *A, *B1, *B2, *C1, *C2;
+
+  switch (argc) {
+  case 8:
+    blk_col = atoi(argv[7]);
+  case 7:
+    blk_row = atoi(argv[6]);
+  case 6:
+    ldc = atoi(argv[5]);
+  case 5:
+    ldb = atoi(argv[4]);
+  case 4:
+    lda = atoi(argv[3]);
+  case 3:
+    n = atoi(argv[2]);
+  case 2:
+    m = atoi(argv[1]);
+    break;
+  case 1:
+    /* No parameters, randomly select a parameter set.
+     */
+    srand((unsigned int)__mftb());
+    m = ((rand() & 3) == 0) ? (rand() % 1024) : 128;
+    n = rand() % 1024;
+    lda = rand() % 1536;
+    ldb = rand() % 1536;
+    ldc = ((rand() & 3) == 0) ? (rand() % 1536) : 0;
+    if ((rand() & 7) == 0) {
+      blk_row = rand() & 127;
+      blk_col = rand() & 127;
+    }
+    break;
+  default:
+    printf("Usage: %s [m [n [lda [ldb [ldc [blk_row [blk_col]]]]]]]\n", argv[0]);
+    return 1;
+    break;
+  }
+
+  /* Apply functional constraints to the parameter set.
+   */
+  if ((rand() & 7) != 0) lda &= ~1;
+  if (lda < m) lda = m;
+
+  if ((rand() & 7) != 0) ldb &= ~1;
+  if (ldb < n) ldb = n;
+
+
+  if (ldc) {
+    ldc = (ldc + 1) & ~(1);
+    m_padded = blk_row + m;
+    if (ldc < 64*m_padded) ldc = 64*m_padded;
+  } else {
+    blk_row = blk_col = 0;
+  }
+
+  n_padded = (n + blk_col + 63) & ~63;
+
+  printf("Performing dtrsm test with m=%d n=%d   lda=%d ldb=%d ldc=%d blk_row=%d blk_col=%d\n", m, n, lda, ldb, ldc, blk_row, blk_col);
+
+  /* Allocate and initialize the arrays
+   */
+
+  hpl_ref_init();
+  hpl_accel_init();
+
+  /* Test the DTRSM; when ldc is non-zero the result is also copied into the C matrix.
+   */
+  A  = (double *)allocate_panel(m, lda, 128);
+  B1 = (double *)allocate_panel(m, ldb, 128);
+
+  for (i=0; i<lda*m; i++) A[i] = byte_swap(drand48());
+  for (i=0; i<ldb*m; i++) B1[i] = drand48();
+
+  nb = n_padded / 64;
+
+  if (ldc) {
+    C1 = (double *)allocate_matrix(nb, ldc, 128);
+    C2 = (double *)allocate_matrix(nb, ldc, 128);
+    for (i=0; i<ldc*nb; i++) C1[i] = C2[i] = drand48();
+    B2 = B1;
+  } else {
+    B2 = (double *)allocate_panel(m, ldb, 128);
+    for (i=0; i<ldb*m; i++) B2[i] = B1[i];
+    C1 = C2 = NULL;
+  }
+
+  hpl_ref_dtrsm_CL_R_B(m, n, A, lda, B1, ldb, C1, ldc, blk_row, blk_col, (unsigned long long *)&incomplete);
+  while (incomplete);
+
+  hpl_accel_dtrsm_CL_R_B(m, n, A, lda, B2, ldb, C2, ldc, blk_row, blk_col, (unsigned long long *)&incomplete);
+  while (incomplete);
+
+  /* Compare the results 
+   */
+  if (ldc) {
+    for (i=ldc*nb-1, errors=0; i>=0; i--) {
+      if (fabs(C1[i] - C2[i]) > EPSILON) { 
+	errors++;
+	if (errors < 20) printf(" %d expected=%f got=%f\n", i, C1[i], C2[i]);
+      }
+    }
+    printf("Errors (with copy) = %d\n", errors);
+  } else {
+    for (i=ldb*m-1, errors=0; i>=0; i--) {
+      if (fabs(B1[i] - B2[i]) > EPSILON) { 
+	errors++;
+	if (errors < 20) printf(" %d expected=%f got=%f\n", i, B1[i], B2[i]);
+      }
+    }
+    printf("Errors (without copy) = %d\n", errors);
+  }
+
+  hpl_accel_fini();
+
+  return ((errors) ? 1 : 0);
+}
Index: accel/lib/tests/dtrsm_CL_B.c
===================================================================
RCS file: accel/lib/tests/dtrsm_CL_B.c
diff -N accel/lib/tests/dtrsm_CL_B.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/tests/dtrsm_CL_B.c	20 Aug 2008 03:57:53 -0000	1.3
@@ -0,0 +1,121 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <ppu_intrinsics.h>
+#include "hpl_accel.h"
+#include "test_utils.h"
+
+#define EPSILON		0.0000001
+
+
+/* dtrsm
+ */
+int main(int argc, char *argv[])
+{
+  int i;
+  int errors;
+  int lda = 0;
+  int ldb = 0;
+  int n=128;
+  int n_padded, nb, m_padded;
+  int m=128;
+  unsigned int blk_col = 0;
+  unsigned int blk_row = 0;
+  volatile unsigned long long incomplete;
+  double *A, *B1, *B2;
+
+  switch (argc) {
+  case 7:
+    blk_col = atoi(argv[6]);
+  case 6:
+    blk_row = atoi(argv[5]);
+  case 5:
+    ldb = atoi(argv[4]);
+  case 4:
+    lda = atoi(argv[3]);
+  case 3:
+    n = atoi(argv[2]);
+  case 2:
+    m = atoi(argv[1]);
+    break;
+  case 1:
+    /* No parameters, randomly select a parameter set.
+     */
+    srand((unsigned int)__mftb());
+    m = ((rand() & 3) == 0) ? (rand() % 1024) : 128;
+    n = rand() % 1024;
+    lda = rand() % 1536;
+    ldb = rand() % 1536;
+    if ((rand() & 7) == 0) {
+      blk_row = rand() & 127;
+      blk_col = rand() & 127;
+    }
+    break;
+  default:
+    printf("Usage: %s [m [n [lda [ldb [blk_row [blk_col]]]]]]]\n", argv[0]);
+    return 1;
+    break;
+  }
+
+  /* Apply functional constraints to the parameter set.
+   */
+  if ((rand() & 7) != 0) lda &= ~1;
+  if (lda < m) lda = m;
+
+  if ((rand() & 7) != 0) ldb &= ~1;
+  if (ldb < n) ldb = n;
+
+
+  ldb = (ldb + 1) & ~(1);
+  m_padded = (m + blk_row + 63) & ~63;
+
+  if (ldb < 64*m_padded) ldb = 64*m_padded;
+
+  n_padded = (n + blk_col + 63) & ~63;
+
+  printf("Performing dtrsm_CL_B test with m=%d n=%d   lda=%d ldb=%d blk_row=%d blk_col=%d\n", m, n, lda, ldb, blk_row, blk_col);
+
+  /* Allocate and initialize the arrays
+   */
+
+  hpl_ref_init();
+  hpl_accel_init();
+
+  /* Test the DTRSM variant that solves in place in the blocked B matrix.
+   */
+  A  = (double *)allocate_panel(m, lda, 128);
+
+  for (i=0; i<lda*m; i++) A[i] = byte_swap(drand48());
+
+  nb = n_padded / 64;
+
+  B1 = (double *)allocate_matrix(nb, ldb, 128);
+  B2 = (double *)allocate_matrix(nb, ldb, 128);
+
+  for (i=0; i<ldb*nb; i++) B1[i] = B2[i] = drand48();
+
+  hpl_ref_dtrsm_CL_B(m, n, A, lda, B1, ldb, blk_row, blk_col, (unsigned long long *)&incomplete);
+  while (incomplete);
+
+  hpl_accel_dtrsm_CL_B(m, n, A, lda, B2, ldb, blk_row, blk_col, (unsigned long long *)&incomplete);
+  while (incomplete);
+
+  /* Compare the results 
+   */
+  for (i=ldb*nb-1, errors=0; i>=0; i--) {
+    if (fabs(B1[i] - B2[i]) > EPSILON) { 
+      errors++;
+      if (errors < 20) printf(" %d expected=%f got=%f\n", i, B1[i], B2[i]);
+    }
+  }
+  printf("Errors = %d\n", errors);
+
+  hpl_accel_fini();
+
+  return ((errors) ? 1 : 0);
+}
Index: accel/lib/tests/perf_dgemm.c
===================================================================
RCS file: accel/lib/tests/perf_dgemm.c
diff -N accel/lib/tests/perf_dgemm.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/tests/perf_dgemm.c	20 Aug 2008 03:57:53 -0000	1.2
@@ -0,0 +1,182 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007                               */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <ppu_intrinsics.h>
+#include "hpl_accel.h"
+#include "test_utils.h"
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <stdint.h>
+#include <unistd.h>
+
+/* dgemm performance test
+ */
+int main(int argc, char *argv[])
+{
+  int i;
+  int m=64;
+  int n=64;
+  int k=128;
+  int m_padded, n_padded;
+  int iterations = 1;
+  int ldp = 0;
+  volatile unsigned long long incomplete;
+  void *ptr;
+  char *env;
+  double *A, *B, *C, *P = NULL;
+  double tbfreq, gflops;
+  unsigned long long *ticks;
+  unsigned long long total;
+  double min, max, mean, std, delta;
+
+  switch (argc) {
+  case 3:
+    n = atoi(argv[2]);
+  case 2:
+    m = atoi(argv[1]);
+  case 1:
+    break;
+  default:
+    printf("Usage: %s [n]\n", argv[0]);
+    return 1;
+    break;
+  }
+
+  m_padded = (m + 63) & ~63;
+  n_padded = (n + 63) & ~63;
+
+  if ((env = getenv("ITERATIONS"))) 
+    iterations = atoi(env);
+  ticks = (unsigned long long *)malloc(iterations * sizeof(unsigned long long));
+
+  /* Allocate and initialize the arrays
+   */
+  if (getenv("HUGE_TLBFS")) {
+    size_t memsize = 4*128 + (128*(m_padded+n_padded) + ((size_t)m_padded*(size_t)n_padded)) * sizeof(double);
+    size_t hugepagesize = 16*1024*1024;
+    int fd;
+    void *mem = NULL;
+    char filename[100];
+    
+    if (getenv("PANEL")) {
+      memsize += 128 + ((size_t)m_padded * (size_t)n_padded)*sizeof(double);
+    }
+
+    sprintf(filename, "/huge/perf_dgemm_%d.dat", getpid());
+
+    if ((fd = open (filename, O_CREAT | O_RDWR, 0755)) == -1) {
+      printf("open for huge page file %s failed (errno=%d): %s\n", filename, errno, strerror(errno));
+      exit(1);
+    } else {
+      /* Delete file so that huge pages will get freed on program termination. */
+      remove(filename);
+
+      memsize = ( memsize + hugepagesize-1) & ~(hugepagesize-1);
+
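+      /* The fixed hint address places the mapping at the 4 GB boundary;
+       * the accelerator DMA apparently prefers buffers that do not cross
+       * a 4 GB line (compare the "no 4GB crossings" note in reform.c).
+       */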
+      mem = mmap((void *)(0x100000000ULL), memsize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+      if (mem == MAP_FAILED) {
+	printf("mmap for %lld bytes in huge page file %s failed (errno=%d): %s\n",
+	       (unsigned long long)memsize, filename, errno, strerror(errno));
+	exit(1);
+      }
+
+      A = (double *)ALIGN128(mem);
+      B = (double *)ALIGN128(A + 128*m_padded);
+      C = (double *)ALIGN128(B + 128*n_padded);
+      if (getenv("PANEL")) {
+	ldp = m_padded;
+	P = (double *)ALIGN128(C + m_padded*n_padded);
+      }
+
+      /* Closing the file descriptor does not unmap the region, so let's just take care of that right away */
+      close (fd);
+    }
+  } else {
+    if (posix_memalign(&ptr, 128, 128*m_padded*sizeof(double))) {
+      perror("posix_memalign failed");
+      exit(1);
+    } else {
+      A = (double *)ptr;
+    }
+    if (posix_memalign(&ptr, 128, 128*n_padded*sizeof(double))) {
+      perror("posix_memalign failed");
+      exit(1);
+    } else {
+      B = (double *)ptr;
+    }
+    if (posix_memalign(&ptr, 128, m_padded*n_padded*sizeof(double))) {
+      perror("posix_memalign failed");
+      exit(1);
+    } else {
+      C = (double *)ptr;
+    }
+    if (getenv("PANEL")) {
+      ldp = m_padded;
+      if (posix_memalign(&ptr, 128, m_padded*n_padded*sizeof(double))) {
+	perror("posix_memalign failed");
+	exit(1);
+      } else {
+	P = (double *)ptr;
+      }
+    }
+  }
+
+  for (i=0; i<128*m_padded; i++) {
+    A[i] = 0.0f;
+    __dcbf(&A[i]);
+  }
+  for (i=0; i<128*n_padded; i++) {
+    B[i] = 0.0f;
+    __dcbf(&B[i]);
+  }
+  for (i=0; i<m_padded*n_padded; i++) {
+    C[i] = 0.0f;
+  }
+  if (P) {
+    for (i=0; i<m_padded*n_padded; i++) {
+      P[i] = 0.0f;
+    }
+  }
+
+  tbfreq = get_timebase_frequency();
+  hpl_accel_init();
+
+  /* Perform 1 iteration first to pre-charge the PTEs
+   */
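+  /* The untimed call faults all buffers into the page tables, so the
+   * timed loop below measures steady-state throughput rather than
+   * translation misses.
+   */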
+  hpl_accel_dgemm_CL_R_B_CL(m, n, k, A, m_padded, B, n_padded, C, 64*m, 0, 0, P, ldp, (unsigned long long *)&incomplete);
+  while (incomplete);
+
+  total = 0;
+  for (i=0; i<iterations; i++) {
+    ticks[i] = __mftb();
+    hpl_accel_dgemm_CL_R_B_CL(m, n, k, A, m_padded, B, n_padded, C, 64*m, 0, 0, P, ldp,(unsigned long long *)&incomplete);
+    while (incomplete);
+    ticks[i] = __mftb() - ticks[i];
+    total += ticks[i];
+  }
+
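+  /* Each call performs a rank-128 update (k = 128), i.e. 2*128*m*n flops;
+   * total/tbfreq converts timebase ticks to seconds, giving
+   * gflops = iterations*flops / (seconds * 1.0e9).
+   */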
+  gflops = ((double)iterations * (2.0*128.0*(double)m*(double)n * (double)tbfreq)) / ((double)total * 1.0e9);
+  mean = (double)total / (double)iterations;
+
+  min = max = ticks[0];
+  for (i=0, std=0; i<iterations; i++) {
+    delta = (double)ticks[i] -  mean;
+    std += delta * delta;
+    if (ticks[i] < min) min = ticks[i];
+    if (ticks[i] > max) max = ticks[i];
+  }
+  std = sqrt(std/(double)(iterations));
+
+
+  printf("DGEMM m=%d n=%d MIN=%f MAX=%f MEAN=%f ticks Std Dev=%f Variance=%f%%   %f Gflops/sec\n", m, n, 
+	 min, max, mean, std, 100.0*((double)(max-min))/((double)mean), gflops);
+
+
+  return 0;
+}
Index: accel/lib/tests/perf_dgemm_C.c
===================================================================
RCS file: accel/lib/tests/perf_dgemm_C.c
diff -N accel/lib/tests/perf_dgemm_C.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/tests/perf_dgemm_C.c	20 Aug 2008 03:57:53 -0000	1.2
@@ -0,0 +1,161 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2008                               */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <ppu_intrinsics.h>
+#include "hpl_accel.h"
+#include "test_utils.h"
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <stdint.h>
+#include <unistd.h>
+
+/* dgemm_C_C_C performance test
+ */
+int main(int argc, char *argv[])
+{
+  int i;
+  int m=64;
+  int n=64;
+  int k=64;
+  int m_padded, k_padded;
+  int iterations = 1;
+  volatile unsigned long long incomplete;
+  void *ptr;
+  char *env;
+  double *A, *B, *C;
+  double tbfreq, gflops;
+  unsigned long long *ticks;
+  unsigned long long total;
+  double min, max, mean, std, delta;
+
+  switch (argc) {
+  case 4:
+    k = atoi(argv[3]);
+  case 3:
+    n = atoi(argv[2]);
+  case 2:
+    m = atoi(argv[1]);
+  case 1:
+    break;
+  default:
+    printf("Usage: %s [m [n [k]]]\n", argv[0]);
+    return 1;
+    break;
+  }
+
+  k_padded = (k + 15) & ~15;
+  m_padded = (m + 15) & ~15;
+
+  if ((env = getenv("ITERATIONS"))) 
+    iterations = atoi(env);
+  ticks = (unsigned long long *)malloc(iterations * sizeof(unsigned long long));
+
+  /* Allocate and initialize the arrays
+   */
+  if (getenv("HUGE_TLBFS")) {
+    size_t memsize = (m_padded*(k+n) + k_padded*n) * sizeof(double);
+    size_t hugepagesize = 16*1024*1024;
+    int fd;
+    void *mem = NULL;
+    char filename[100];
+    
+    sprintf(filename, "/huge/perf_dgemm_C_%d.dat", getpid());
+
+    if ((fd = open (filename, O_CREAT | O_RDWR, 0755)) == -1) {
+      printf("open for huge page file %s failed (errno=%d): %s\n", filename, errno, strerror(errno));
+      exit(1);
+    } else {
+      /* Delete file so that huge pages will get freed on program termination. */
+      remove(filename);
+
+      memsize = ( memsize + hugepagesize-1) & ~(hugepagesize-1);
+
+      mem = mmap((void *)(0x100000000ULL), memsize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+      if (mem == MAP_FAILED) {
+	printf("mmap for %lld bytes in huge page file %s failed (errno=%d): %s\n",
+	       (unsigned long long)memsize, filename, errno, strerror(errno));
+	exit(1);
+      }
+
+      A = (double *)ALIGN128(mem);
+      B = (double *)ALIGN128(A + m_padded*k);
+      C = (double *)ALIGN128(B + k_padded*n);
+
+      /* Closing the file descriptor does not unmap the region, so let's just take care of that right away */
+      close (fd);
+    }
+  } else {
+    if (posix_memalign(&ptr, 128, m_padded*k*sizeof(double))) {
+      perror("posix_memalign failed");
+      exit(1);
+    } else {
+      A = (double *)ptr;
+    }
+    if (posix_memalign(&ptr, 128, k_padded*n*sizeof(double))) {
+      perror("posix_memalign failed");
+      exit(1);
+    } else {
+      B = (double *)ptr;
+    }
+    if (posix_memalign(&ptr, 128, m_padded*n*sizeof(double))) {
+      perror("posix_memalign failed");
+      exit(1);
+    } else {
+      C = (double *)ptr;
+    }
+  }
+
+  for (i=0; i<k*m_padded; i++) {
+    A[i] = 0.0f;
+    __dcbf(&A[i]);
+  }
+  for (i=0; i<n*k_padded; i++) {
+    B[i] = 0.0f;
+    __dcbf(&B[i]);
+  }
+  for (i=0; i<m_padded*n; i++) {
+    C[i] = 0.0f;
+  }
+
+  tbfreq = get_timebase_frequency();
+  hpl_accel_init();
+
+  /* Perform 1 iteration first to pre-charge the PTEs
+   */
+  hpl_accel_dgemm_C_C_C(m, n, k, A, m_padded, B, k_padded, C, m_padded, (unsigned long long *)&incomplete);
+  while (incomplete);
+
+  total = 0;
+  for (i=0; i<iterations; i++) {
+    ticks[i] = __mftb();
+    hpl_accel_dgemm_C_C_C(m, n, k, A, m_padded, B, k_padded, C, m_padded, (unsigned long long *)&incomplete);
+    while (incomplete);
+    ticks[i] = __mftb() - ticks[i];
+    total += ticks[i];
+  }
+
+  gflops = ((double)iterations * (2.0*(double)k*(double)m*(double)n * (double)tbfreq)) / ((double)total * 1.0e9);
+  mean = (double)total / (double)iterations;
+
+  min = max = ticks[0];
+  for (i=0, std=0; i<iterations; i++) {
+    delta = (double)ticks[i] -  mean;
+    std += delta * delta;
+    if (ticks[i] < min) min = ticks[i];
+    if (ticks[i] > max) max = ticks[i];
+  }
+  std = sqrt(std/(double)(iterations));
+
+
+  printf("DGEMM_CL m=%d n=%d k=%d MIN=%f MAX=%f MEAN=%f ticks Std Dev=%f Variance=%f%%   %f Gflops/sec\n", m, n, k,
+	 min, max, mean, std, 100.0*((double)(max-min))/((double)mean), gflops);
+
+
+  return 0;
+}
Index: accel/lib/tests/perf_dgemm_CL.c
===================================================================
RCS file: accel/lib/tests/perf_dgemm_CL.c
diff -N accel/lib/tests/perf_dgemm_CL.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/tests/perf_dgemm_CL.c	14 May 2008 21:35:01 -0000	1.2
@@ -0,0 +1,163 @@
+/* ------------------------------------------------------------------ */
+/* (C) Copyright 2007                                                 */
+/* International Business Machines Corporation,                       */
+/*                                                                    */
+/* All Rights Reserved.                                               */
+/* ------------------------------------------------------------------ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <ppu_intrinsics.h>
+#include "hpl_accel.h"
+#include "test_utils.h"
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <stdint.h>
+#include <unistd.h>
+
+/* dgemm performance test
+ */
+int main(int argc, char *argv[])
+{
+  int i;
+  int m=64;
+  int n=64;
+  int k=64;
+  int m_padded, k_padded;
+  int iterations = 1;
+  volatile unsigned long long incomplete;
+  void *ptr;
+  char *env;
+  double *A, *B, *C;
+  double tbfreq, gflops;
+  unsigned long long *ticks;
+  unsigned long long total;
+  double min, max, mean, std, delta;
+
+  switch (argc) {
+  case 4:
+    k = atoi(argv[3]);
+  case 3:
+    n = atoi(argv[2]);
+  case 2:
+    m = atoi(argv[1]);
+  case 1:
+    break;
+  default:
+    printf("Usage: %s [m [n [k]]]\n", argv[0]);
+    return 1;
+    break;
+  }
+
+  k_padded = (k + 15) & ~15;
+  m_padded = (m + 15) & ~15;
+
+  if ((env = getenv("ITERATIONS"))) 
+    iterations = atoi(env);
+  ticks = (unsigned long long *)malloc(iterations * sizeof(unsigned long long));
+
+  /* Allocate and initialize the arrays
+   */
+  if (getenv("HUGE_TLBFS")) {
+    size_t memsize = (m_padded*(k+n) + k_padded*n) * sizeof(double);
+    size_t hugepagesize = 16*1024*1024;
+    int fd;
+    void *mem = NULL;
+    char filename[100];
+    
+    sprintf(filename, "/huge/perf_dgemm_CL_%d.dat", getpid());
+
+    if ((fd = open (filename, O_CREAT | O_RDWR, 0755)) == -1) {
+      printf("open for huge page file %s failed (errno=%d): %s\n", filename, errno, strerror(errno));
+      exit(1);
+    } else {
+      /* Delete file so that huge pages will get freed on program termination. */
+      remove(filename);
+
+      memsize = ( memsize + hugepagesize-1) & ~(hugepagesize-1);
+
+      mem = mmap((void *)(0x100000000ULL), memsize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+      if (mem == MAP_FAILED) {
+	printf("mmap for %lld bytes in huge page file %s failed (errno=%d): %s\n",
+	       (unsigned long long)memsize, filename, errno, strerror(errno));
+	exit(1);
+      }
+
+      A = (double *)ALIGN128(mem);
+      B = (double *)ALIGN128(A + m_padded*k);
+      C = (double *)ALIGN128(B + k_padded*n);
+
+      /* Closing the file descriptor does not unmap the region, so let's just take care of that right away */
+      close (fd);
+    }
+  } else {
+    if (posix_memalign(&ptr, 128, m_padded*k*sizeof(double))) {
+      perror("posix_memalign failed");
+      exit(1);
+    } else {
+      A = (double *)ptr;
+    }
+    if (posix_memalign(&ptr, 128, k_padded*n*sizeof(double))) {
+      perror("posix_memalign failed");
+      exit(1);
+    } else {
+      B = (double *)ptr;
+    }
+    if (posix_memalign(&ptr, 128, m_padded*n*sizeof(double))) {
+      perror("posix_memalign failed");
+      exit(1);
+    } else {
+      C = (double *)ptr;
+    }
+  }
+
+  for (i=0; i<k*m_padded; i++) {
+    A[i] = 0.0f;
+    __dcbf(&A[i]);
+  }
+  for (i=0; i<n*k_padded; i++) {
+    B[i] = 0.0f;
+    __dcbf(&B[i]);
+  }
+  for (i=0; i<m_padded*n; i++) {
+    C[i] = 0.0f;
+  }
+
+  tbfreq = get_timebase_frequency();
+  hpl_accel_init();
+
+  /* Perform 1 iteration first to pre-charge the PTEs
+   */
+  hpl_accel_dgemm_CL_C_C(m, n, k, A, m_padded, B, k_padded, C, m_padded, (unsigned long long *)&incomplete);
+  while (incomplete);
+
+  total = 0;
+  for (i=0; i<iterations; i++) {
+    ticks[i] = __mftb();
+    hpl_accel_dgemm_CL_C_C(m, n, k, A, m_padded, B, k_padded, C, m_padded, (unsigned long long *)&incomplete);
+    while (incomplete);
+    ticks[i] = __mftb() - ticks[i];
+    total += ticks[i];
+  }
+
+  gflops = ((double)iterations * (2.0*(double)k*(double)m*(double)n * (double)tbfreq)) / ((double)total * 1.0e9);
+  mean = (double)total / (double)iterations;
+
+  min = max = ticks[0];
+  for (i=0, std=0; i<iterations; i++) {
+    delta = (double)ticks[i] -  mean;
+    std += delta * delta;
+    if (ticks[i] < min) min = ticks[i];
+    if (ticks[i] > max) max = ticks[i];
+  }
+  std = sqrt(std/(double)(iterations));
+
+
+  printf("DGEMM_CL m=%d n=%d k=%d MIN=%f MAX=%f MEAN=%f ticks Std Dev=%f Variance=%f%%   %f Gflops/sec\n", m, n, k,
+	 min, max, mean, std, 100.0*((double)(max-min))/((double)mean), gflops);
+
+
+  return 0;
+}
Index: accel/lib/tests/perf_dtrsm.c
===================================================================
RCS file: accel/lib/tests/perf_dtrsm.c
diff -N accel/lib/tests/perf_dtrsm.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/tests/perf_dtrsm.c	20 Aug 2008 03:57:53 -0000	1.2
@@ -0,0 +1,159 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007                               */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <ppu_intrinsics.h>
+#include "hpl_accel.h"
+#include "test_utils.h"
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <stdint.h>
+#include <unistd.h>
+
+
+/* dtrsm performance test
+ */
+int main(int argc, char *argv[])
+{
+  int i;
+  int m=128;
+  int n=128;
+  int n_padded;
+  volatile unsigned long long incomplete;
+  void *ptr;
+  double *A, *B, *C;
+  double ops, bytes, tbfreq, gflops, grate;
+  int iterations = 1;
+  char *env;
+  unsigned long long *ticks;
+  unsigned long long total;
+  double min, max, mean, std, delta;
+
+  switch (argc) {
+  case 3:
+    n = atoi(argv[2]);
+  case 2:
+    m = atoi(argv[1]);
+  case 1:
+    break;
+  default:
+    printf("Usage: %s [m [n]]\n", argv[0]);
+    return 1;
+    break;
+  }
+
+  if ((env = getenv("ITERATIONS"))) 
+    iterations = atoi(env);
+  ticks = (unsigned long long *)malloc(iterations * sizeof(unsigned long long));
+
+  /* Allocate and initialize the arrays
+   */
+  n_padded = (n | 128) & ~(128-1);
+  if (getenv("HUGE_TLBFS")) {
+    size_t memsize = (m*m + m*n_padded + m*n)*sizeof(double) + 3*128;
+    size_t hugepagesize = 16*1024*1024;
+    int fd;
+    void *mem = NULL;
+    char filename[100];
+
+    sprintf(filename, "/huge/perf_dtrsm_%d.dat", getpid());
+    
+    if ((fd = open (filename, O_CREAT | O_RDWR, 0755)) == -1) {
+      printf("open for huge page file %s failed (errno=%d): %s\n", filename, errno, strerror(errno));
+      exit(1);
+    } else {
+      /* Delete file so that huge pages will get freed on program termination. */
+      remove(filename);
+
+      memsize = ( memsize + hugepagesize-1) & ~(hugepagesize-1);
+
+      mem = mmap((void *)(0x100000000ULL), memsize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+      if (mem == MAP_FAILED) {
+	printf("mmap for %lld bytes in huge page file %s failed (errno=%d): %s\n",
+	       (unsigned long long)memsize, filename, errno, strerror(errno));
+	exit(1);
+      }
+
+      A = (double *)ALIGN128(mem);
+      B = (double *)ALIGN128(A + m*m);
+      C = (double *)ALIGN128(B + m*n_padded);
+
+      /* Closing the file descriptor does not unmap the region, so let's just take care of that right away */
+      close (fd);
+    }
+  } else {
+    if (posix_memalign(&ptr, 128, m*m*sizeof(double))) {
+      perror("posix_memalign failed");
+      exit(1);
+    } else {
+      A = (double *)ptr;
+    }
+
+    /* Pad n to an odd multiple of 128 for bank utilization performance reasons.
+     */
+    if (posix_memalign(&ptr, 128, m*n_padded*sizeof(double))) {
+      perror("posix_memalign failed");
+      exit(1);
+    } else {
+      B = (double *)ptr;
+    }
+    if (posix_memalign(&ptr, 128, m*n*sizeof(double))) {
+      perror("posix_memalign failed");
+      exit(1);
+    } else {
+      C = (double *)ptr;
+    }
+  }
+
+  for (i=0; i<m*m; i++) A[i] = 0.0f;
+  for (i=0; i<m*n_padded; i++) B[i] = 0.0f;
+  for (i=0; i<m*n; i++) C[i] = 0.0f;
+
+  tbfreq = get_timebase_frequency();
+  ops = (double)n * (double)(2 * ((m/2-1) * m + m/2));
+  bytes = n * (double)(2*m*sizeof(double));				/* B matrix load and store */
+  for (i=1; i<m; i++) {
+    bytes += ((i + 15) & ~15) * sizeof(double) * 8.0;
+  }
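+  /* ops expands to n*m*(m-1): a multiply and a subtract for each strictly
+   * triangular element of A per right-hand side, consistent with a
+   * unit-diagonal solve.  bytes adds, per row of A, its 16-double-padded
+   * length weighted by a factor of 8, which presumably models the
+   * implementation's re-fetch pattern.
+   */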
+
+  hpl_accel_init();
+
+  /* Perform 1 iteration first to pre-charge the PTEs
+   */
+  hpl_accel_dtrsm_CL_R_B(m, n, A, m, B, n_padded, NULL, 0, 0, 0, (unsigned long long *)&incomplete);
+  while (incomplete);
+
+  total = 0;
+  for (i=0; i<iterations; i++) {
+    ticks[i] = __mftb();
+    hpl_accel_dtrsm_CL_R_B(m, n, A, m, B, n_padded, NULL, 0, 0, 0, (unsigned long long *)&incomplete);
+    while (incomplete);
+    ticks[i] = __mftb() - ticks[i];
+    total += ticks[i];
+  }
+
+  gflops = (iterations * ops * tbfreq) / ((double)total * 1.0e9);
+  grate  = (iterations * bytes * tbfreq) / ((double)total * 1.0e9);
+  mean = (double)total / (double)iterations;
+  
+  min = max = ticks[0];
+  for (i=0, std=0; i<iterations; i++) {
+    delta = (double)ticks[i] -  mean;
+    std += delta * delta;
+    if (ticks[i] < min) min = ticks[i];
+    if (ticks[i] > max) max = ticks[i];
+  }
+  std = sqrt(std/(double)(iterations));
+
+  printf("m=%d n=%d MIN=%f MAX=%f MEAN=%f ticks Std Dev=%f Variance=%f%%   %f Gflops/sec   %f Gbytes/sec\n", 
+	 m, n, min, max, mean, std, 
+	 100.0*((double)(max-min))/((double)mean),
+	 gflops, grate);
+
+  return 0;
+}
Index: accel/lib/tests/perf_reform_lpanel.c
===================================================================
RCS file: accel/lib/tests/perf_reform_lpanel.c
diff -N accel/lib/tests/perf_reform_lpanel.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/tests/perf_reform_lpanel.c	20 Aug 2008 03:57:53 -0000	1.2
@@ -0,0 +1,86 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007                               */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <ppu_intrinsics.h>
+#include "hpl_accel.h"
+#include "test_utils.h"
+
+/* reform l panel performance test
+ */
+int main(int argc, char *argv[])
+{
+  int i;
+  int m=64;
+  int n=128;
+  int n_padded;
+  volatile unsigned long long incomplete;
+  void *ptr;
+  double *A, *panel;
+  double tbfreq;
+  unsigned long long ticks;
+
+
+  switch (argc) {
+  case 3:
+    n = atoi(argv[2]);
+  case 2:
+    m = atoi(argv[1]);
+    break;
+  case 1:
+    break;
+  default:
+    printf("Usage: %s [m [n]]\n", argv[0]);
+    return 1;
+    break;
+  }
+
+  m = m & ~(64-1);
+  if (m < 64) m = 64;
+
+  if (n < 1) n = 1;
+  n_padded = (n + 63) & ~(63);
+
+  /* Allocate and initialize the arrays
+   */
+  if (posix_memalign(&ptr, 128, m*n_padded*sizeof(double))) {
+    perror("posix_memalign failed");
+    exit(1);
+  } else {
+    A = (double *)ptr;
+  }
+
+  if (posix_memalign(&ptr, 128, m*n*sizeof(double))) {
+    perror("posix_memalign failed");
+    exit(1);
+  } else {
+    panel = (double *)ptr;
+  }
+
+  for (i=0; i<m*n_padded; i++) A[i] = 0.0f;
+  for (i=0; i<m*n; i++) panel[i] = 0.0f;
+
+  tbfreq = get_timebase_frequency();
+
+  hpl_accel_init();
+
+  /* Perform 1 iteration first to pre-charge the PTEs
+   */
+  hpl_accel_reform_panel_B_to_CL(m, n, panel, m, A, 64*m, (unsigned long long *)&incomplete);
+  while (incomplete);
+
+  ticks = __mftb();
+  hpl_accel_reform_panel_B_to_CL(m, n, panel, m, A, 64*m, (unsigned long long *)&incomplete);
+  while (incomplete);
+  ticks = __mftb() - ticks;
+
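+  /* rate counts only the m*n panel payload; xfer adds the padded blocked
+   * source, approximating the actual traffic (read m*n_padded, write m*n
+   * doubles).
+   */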
+  printf("m=%d n=%d   rate=%f Gbytes/sec  xfer=%f Gbytes/sec\n", m, n, 
+	 (double)m * (double)n * tbfreq * (double)sizeof(double) / ((double)ticks * 1.0e9),
+	 (double)m * (double)(n+n_padded) * tbfreq * (double)sizeof(double) / ((double)ticks * 1.0e9));
+
+  return 0;
+}
Index: accel/lib/tests/perf_reform_matrix.c
===================================================================
RCS file: accel/lib/tests/perf_reform_matrix.c
diff -N accel/lib/tests/perf_reform_matrix.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/tests/perf_reform_matrix.c	20 Aug 2008 03:57:53 -0000	1.2
@@ -0,0 +1,103 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007                               */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <ppu_intrinsics.h>
+#include "hpl_accel.h"
+#include "test_utils.h"
+
+/* reform matrix performance test
+ */
+int main(int argc, char *argv[])
+{
+  int i;
+  int m=64;
+  int n=128;
+  int mb, nb;
+  int spes=1;
+  int size;
+  volatile unsigned long long incomplete;
+  void *ptr;
+  double *A, *scratch;
+  double tbfreq, blocks_xfer;
+  unsigned long long ticks;
+
+
+  switch (argc) {
+  case 4:
+    spes = atoi(argv[3]);
+  case 3:
+    n = atoi(argv[2]);
+  case 2:
+    m = atoi(argv[1]);
+    break;
+  case 1:
+    break;
+  default:
+    printf("Usage: %s [m [n [spes]]]\n", argv[0]);
+    return 1;
+    break;
+  }
+
+  m = m & ~(64-1);
+  if (m < 64) m = 64;
+
+  n = n & ~(64-1);
+  if (n < 64) n = 64;
+
+  size = spes*64*(m-4);
+
+  /* Allocate and initialize the arrays
+   */
+  if (posix_memalign(&ptr, 128, m*n*sizeof(double))) {
+    perror("posix_memalign failed");
+    exit(1);
+  } else {
+    A = (double *)ptr;
+  }
+
+  if (posix_memalign(&ptr, 128, size*sizeof(double))) {
+    perror("posix_memalign failed");
+    exit(1);
+  } else {
+    scratch = (double *)ptr;
+  }
+
+  for (i=0; i<m*n; i++) A[i] = 0.0f;
+  for (i=0; i<size; i++) scratch[i] = 0.0f;
+
+  tbfreq = get_timebase_frequency();
+
+  /* Compute the number of blocks transferred */
+
+  mb = m / 64;
+  nb = n / 64;
+  
+  if (mb < 4) {
+    blocks_xfer = (double)(2*mb*nb);
+  } else {
+    blocks_xfer = (double)(2*4*nb + 4*(mb-4)*nb);
+  }
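+  /* Each 64x64 block costs a read and a write (2 transfers) while the
+   * matrix is at most 4 block-rows tall; beyond that the remaining
+   * block-rows are apparently staged through the scratch buffer, doubling
+   * their traffic to 4 transfers per block.
+   */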
+    
+  hpl_accel_init();
+
+  /* Perform 1 iteration first to pre-charge the PTEs
+   */
+  hpl_accel_reform_matrix_CL_to_B(m, n, A, m, scratch, size, (unsigned long long *)&incomplete);
+  while (incomplete);
+
+  ticks = __mftb();
+  hpl_accel_reform_matrix_CL_to_B(m, n, A, m, scratch, size, (unsigned long long *)&incomplete);
+  while (incomplete);
+  ticks = __mftb() - ticks;
+
+  printf("m=%d n=%d spes=%d  rate=%f Gbytes/sec xfer=%f Gbytes/sec\n", m, n, spes,
+	 (double)m * (double)n * tbfreq * (double)sizeof(double) / ((double)ticks * 1.0e9),
+	 blocks_xfer * tbfreq * (double)(64*64*sizeof(double)) / ((double)ticks * 1.0e9));
+
+  return 0;
+}
Index: accel/lib/tests/perf_reform_rows.c
===================================================================
RCS file: accel/lib/tests/perf_reform_rows.c
diff -N accel/lib/tests/perf_reform_rows.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/tests/perf_reform_rows.c	20 Aug 2008 03:57:53 -0000	1.3
@@ -0,0 +1,195 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <ppu_intrinsics.h>
+#include "hpl_accel.h"
+#include "test_utils.h"
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <stdint.h>
+#include <unistd.h>
+
+
+/* reform rows performance test
+ */
+int main(int argc, char *argv[])
+{
+  int i;
+  int m=1;
+  int n=128;
+  int height=256;
+  int blk_col=0;
+  int n_padded;
+  int iterations=1;
+  int *rows;
+  volatile unsigned long long incomplete;
+  void *ptr;
+  double *A, *R;
+  double tbfreq;
+  char *env;
+  unsigned long long *ticks;
+  unsigned long long total;
+  double min, max, mean, std, delta;
+
+  switch (argc) {
+  case 5:
+    blk_col = atoi(argv[4]);
+  case 4:
+    height = atoi(argv[3]);
+  case 3:
+    n = atoi(argv[2]);
+  case 2:
+    m = atoi(argv[1]);
+    break;
+  case 1:
+    break;
+  default:
+    printf("Usage: %s [m [n [lda [blk_col]]]\n", argv[0]);
+    return 1;
+    break;
+  }
+
+  blk_col &= (64-1);
+
+  if (n < 1) n = 1;
+  n_padded = (n + blk_col + 63) & ~(63);
+
+  if (height < m) height = m;
+  height = (height + 63) & ~63;
+
+  if ((env = getenv("ITERATIONS"))) 
+    iterations = atoi(env);
+  ticks = (unsigned long long *)malloc(iterations * sizeof(unsigned long long));
+
+  if (getenv("HUGE_TLBFS")) {
+    size_t memsize = 2*128 + m*sizeof(int) + (size_t)(height+m)*n_padded*sizeof(double);
+    size_t hugepagesize = 16*1024*1024;
+    int fd;
+    void *mem = NULL;
+    char filename[100];
+    
+    sprintf(filename, "/huge/perf_reform_rows_%d.dat", getpid());
+
+    if ((fd = open (filename, O_CREAT | O_RDWR, 0755)) == -1) {
+      printf("open for huge page file %s failed (errno=%d): %s\n", filename, errno, strerror(errno));
+      exit(1);
+    } else {
+      /* Delete file so that huge pages will get freed on program termination. */
+      remove(filename);
+
+      memsize = ( memsize + hugepagesize-1) & ~(hugepagesize-1);
+
+      mem = mmap((void *)(0x100000000ULL), memsize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+      if (mem == MAP_FAILED) {
+	printf("mmap for %lld bytes in huge page file %s failed (errno=%d): %s\n",
+	       (unsigned long long)memsize, filename, errno, strerror(errno));
+	exit(1);
+      }
+
+      A = (double *)ALIGN128(mem);
+      R = (double *)ALIGN128(A + height*n_padded);
+      rows = (int *)(R + m*n_padded);
+
+      /* Closing the file descriptor does not unmap the region, so let's just take care of that right away */
+      close (fd);
+    }
+  } else {
+    /* Allocate and initialize the arrays
+     */
+    if (posix_memalign(&ptr, 128, height*n_padded*sizeof(double))) {
+      perror("posix_memalign failed");
+      exit(1);
+    } else {
+      A = (double *)ptr;
+    }
+
+    if (posix_memalign(&ptr, 128, m*n_padded*sizeof(double))) {
+      perror("posix_memalign failed");
+      exit(1);
+    } else {
+      R = (double *)ptr;
+    }
+    rows = (int *)malloc(m*sizeof(int));
+  }
+
+  for (i=0; i<height*n_padded; i++) A[i] = 0.0f;
+  for (i=0; i<m*n_padded; i++) R[i] = 0.0f;
+  for (i=0; i<m; i++) rows[i] = rand() & (height-1);
+
+  tbfreq = get_timebase_frequency();
+
+  hpl_accel_init();
+
+  /* Test ROW to BLOCK copy */
+
+  /* Perform 1 iteration first to pre-charge the PTEs
+   */
+  hpl_accel_reform_rows_R_to_B(m, n, R, n_padded, A, M_SUB*height, rows, blk_col, (unsigned long long *)&incomplete);
+  while (incomplete);
+
+  total = 0;
+  for (i=0; i<iterations; i++) {
+    ticks[i] = __mftb();
+    hpl_accel_reform_rows_R_to_B(m, n, R, n_padded, A, M_SUB*height, rows, blk_col, (unsigned long long *)&incomplete);
+    while (incomplete);
+    ticks[i] = __mftb() - ticks[i];
+    total += ticks[i];
+  }
+
+  mean = (double)total / (double)iterations;
+
+  min = max = ticks[0];
+  for (i=0, std=0; i<iterations; i++) {
+    delta = (double)ticks[i] -  mean;
+    std += delta * delta;
+    if (ticks[i] < min) min = ticks[i];
+    if (ticks[i] > max) max = ticks[i];
+  }
+  std = sqrt(std/(double)(iterations));
+
+  printf("REFORM ROW (R_to_B) m=%d n=%d height=%d blk_col=%d  MIN=%f MAX=%f MEAN=%f ticks Std Dev=%f Variance=%f%%  rate=%f Gbytes/sec  xfer=%f Gbytes/sec\n", m, n, height, blk_col,
+	 min, max, mean, std, 100.0*((double)(max-min))/((double)mean),
+	 (double)iterations * (double)m * (double)n * tbfreq * (double)sizeof(double) / ((double)total * 1.0e9),
+	 (double)iterations * (double)m * (double)(2*n) * tbfreq * (double)sizeof(double) / ((double)total * 1.0e9));
+
+  /* Test BLOCK to ROW copy */
+
+  /* Perform 1 iteration first to pre-charge the PTEs
+   */
+  hpl_accel_reform_rows_B_to_R(m, n, R, n_padded, A, M_SUB*height, rows, blk_col, (unsigned long long *)&incomplete);
+  while (incomplete);
+
+  total = 0;
+  for (i=0; i<iterations; i++) {
+    ticks[i] = __mftb();
+    hpl_accel_reform_rows_B_to_R(m, n, R, n_padded, A, M_SUB*height, rows, blk_col, (unsigned long long *)&incomplete);
+    while (incomplete);
+    ticks[i] = __mftb() - ticks[i];
+    total += ticks[i];
+  }
+
+  mean = (double)total / (double)iterations;
+
+  min = max = ticks[0];
+  for (i=0, std=0; i<iterations; i++) {
+    delta = (double)ticks[i] -  mean;
+    std += delta * delta;
+    if (ticks[i] < min) min = ticks[i];
+    if (ticks[i] > max) max = ticks[i];
+  }
+  std = sqrt(std/(double)(iterations));
+
+  printf("REFORM ROW (B_to_R) m=%d n=%d height=%d blk_col=%d  MIN=%f MAX=%f MEAN=%f ticks Std Dev=%f Variance=%f%%  rate=%f Gbytes/sec  xfer=%f Gbytes/sec\n", m, n, height, blk_col,
+	 min, max, mean, std, 100.0*((double)(max-min))/((double)mean),
+	 (double)iterations * (double)m * (double)n * tbfreq * (double)sizeof(double) / ((double)total * 1.0e9),
+	 (double)iterations * (double)m * (double)(2*n) * tbfreq * (double)sizeof(double) / ((double)total * 1.0e9));
+
+
+  return 0;
+}
Index: accel/lib/tests/perf_reform_upanel.c
===================================================================
RCS file: accel/lib/tests/perf_reform_upanel.c
diff -N accel/lib/tests/perf_reform_upanel.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/tests/perf_reform_upanel.c	20 Aug 2008 03:57:53 -0000	1.2
@@ -0,0 +1,80 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007                               */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <ppu_intrinsics.h>
+#include "hpl_accel.h"
+#include "test_utils.h"
+
+/* reform u panel performance test
+ */
+int main(int argc, char *argv[])
+{
+  int i;
+  int m=64;
+  int n=128;
+  int n_padded;
+  volatile unsigned long long incomplete;
+  void *ptr;
+  double *A, *panel;
+  double tbfreq;
+  unsigned long long ticks;
+
+
+  switch (argc) {
+  case 3:
+    n = atoi(argv[2]);
+  case 2:
+    m = atoi(argv[1]);
+    break;
+  case 1:
+    break;
+  default:
+    printf("Usage: %s [m [n]]\n", argv[0]);
+    return 1;
+    break;
+  }
+
+  n_padded = (n + 15) & ~(15);
+
+  /* Allocate and initialize the arrays
+   */
+  if (posix_memalign(&ptr, 128, m*n_padded*sizeof(double))) {
+    perror("posix_memalign failed");
+    exit(1);
+  } else {
+    A = (double *)ptr;
+  }
+
+  if (posix_memalign(&ptr, 128, m*n*sizeof(double))) {
+    perror("posix_memalign failed");
+    exit(1);
+  } else {
+    panel = (double *)ptr;
+  }
+
+  for (i=0; i<m*n_padded; i++) A[i] = 0.0f;
+  for (i=0; i<m*n; i++) panel[i] = 0.0f;
+
+  tbfreq = get_timebase_frequency();
+
+  hpl_accel_init();
+
+  /* Perform 1 iteration first to pre-charge the PTEs
+   */
+  hpl_accel_reform_panel_R_to_B(m, n, A, 64*m, panel, n_padded, (unsigned long long *)&incomplete);
+  while (incomplete);
+
+  ticks = __mftb();
+  hpl_accel_reform_panel_R_to_B(m, n, A, 64*m, panel, n_padded, (unsigned long long *)&incomplete);
+  while (incomplete);
+  ticks = __mftb() - ticks;
+
+  printf("m=%d n=%d   rate=%f Gbytes/sec\n", m, n, (double)m * (double)(n) * tbfreq * (double)sizeof(double) / ((double)ticks * 1.0e9));
+
+  return 0;
+}
Index: accel/lib/tests/reform.c
===================================================================
RCS file: accel/lib/tests/reform.c
diff -N accel/lib/tests/reform.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/tests/reform.c	20 Aug 2008 03:57:53 -0000	1.3
@@ -0,0 +1,117 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <ppu_intrinsics.h>
+#include "hpl_accel.h"
+#include "test_utils.h"
+
+#define EPSILON		0.0000001
+
+
+/* reform
+ */
+int main(int argc, char *argv[])
+{
+  int i, j;
+  int col;
+  int errors;
+  int lda = 0;
+  int ldp = 0;
+  int n=128;
+  int m=128;
+  volatile unsigned long long incomplete;
+  double *A1, *A2, *panel, *scratch;
+
+  switch (argc) {
+  case 5:
+    ldp = atoi(argv[4]);
+  case 4:
+    lda = atoi(argv[3]);
+  case 3:
+    n = atoi(argv[2]);
+  case 2:
+    m = atoi(argv[1]);
+    break;
+  case 1:
+    /* No parameters, randomly select a parameter set.
+     */
+    srand((unsigned int)__mftb());
+    m = rand() % 1280;
+    n = rand() % 1280;
+    lda = rand() % 2048;
+    ldp = rand() % 2048;
+    break;
+  default:
+    printf("Usage: %s [m [n [lda [ldp]]]]\n", argv[0]);
+    return 1;
+    break;
+  }
+
+  /* Apply functional constraints to the parameter set.
+   */
+  m = m & ~(128-1);		
+  if (m < 128) m = 128;
+
+  n = n & ~(128-1);
+  if (n < 128) n = 128;
+
+  lda = lda & ~(15);
+  if (lda < m) lda = m;
+
+  ldp = ldp & ~(15);
+  if (ldp < m) ldp = m;
+ 
+  printf("Performing reform test with m=%d n=%d  lda=%d  ldp=%d\n", m, n, lda, ldp);
+
+  /* Allocate and initialize the arrays
+   */
+  A1 = (double *)allocate_matrix(n/64, lda*M_SUB, 128);
+  A2 = (double *)allocate_matrix(n/64, lda*M_SUB, 128);
+  scratch = (double *)allocate_panel(1, 128*ldp, 128);	/* allocate 1 row so that no 4GB crossings occur */
+  panel = (double *)allocate_panel(128, ldp, 128);
+
+  for (i=0; i<n*lda; i++) A1[i] = A2[i] = drand48();
+
+  hpl_ref_init();
+
+  /* Reformat the matrix from column ordered little endian to blocked big endian.
+   */
+  hpl_ref_reform_matrix_CL_to_B(m, n, A1, lda, scratch, 128*ldp, (unsigned long long *)&incomplete);
+
+  while(incomplete);
+
+  /* Verify that the little endian translation occurred.
+   */
+  if (byte_swap(A1[0]) != A2[0]) {
+    printf("ERROR - First element not correctly byte swapped\n");
+    return 1;
+  }
+
+  /* For each of the panels. Translate them back and verify that the translation is accurate.
+   */
+  errors = 0;
+  for (col=0; col<n; col+=128) {
+    hpl_ref_reform_panel_B_to_CL(m, 128, panel, ldp, A1+col*lda, 64*lda, (unsigned long long *)&incomplete);
+
+    while(incomplete);
+
+    for (j=0; j<128; j++) {
+      for (i=0; i<m; i++) {
+	if (panel[j*ldp+i] != A2[(col+j)*lda+i]) {
+	  errors++;
+	  printf("  %d panel=%f  %f A2=%f\n", i, panel[j*ldp+i], byte_swap(panel[j*ldp+i]), A2[(col+j)*lda+i]);
+	}
+      }
+    }
+  }
+  printf("Errors = %d\n", errors);
+
+  hpl_accel_fini();
+
+  return ((errors) ? 1 : 0);
+}
Index: accel/lib/tests/reform_lpanel.c
===================================================================
RCS file: accel/lib/tests/reform_lpanel.c
diff -N accel/lib/tests/reform_lpanel.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/tests/reform_lpanel.c	20 Aug 2008 03:57:53 -0000	1.3
@@ -0,0 +1,102 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <ppu_intrinsics.h>
+#include "hpl_accel.h"
+#include "test_utils.h"
+
+/* reformat L panel
+ */
+int main(int argc, char *argv[])
+{
+  int i;
+  int errors;
+  int ldp = 0;
+  int lda = 0;
+  int n=64;
+  int m=64;
+  int n_padded;
+  volatile unsigned long long incomplete;
+  double *A, *P1, *P2;
+
+  switch (argc) {
+  case 5:
+    ldp = atoi(argv[4]);
+  case 4:
+    lda = atoi(argv[3]);
+  case 3:
+    n = atoi(argv[2]);
+  case 2:
+    m = atoi(argv[1]);
+    break;
+  case 1:
+    /* No parameters, randomly select a parameter set.
+     */
+    srand((unsigned int)__mftb());
+    m = rand() % 768;
+    n = rand() % 768;
+    lda = M_SUB * (rand() % 1536);
+    ldp = rand() % 1536;
+    break;
+  default:
+    printf("Usage: %s [m [n [lda [ldp]]]]\n", argv[0]);
+    return 1;
+    break;
+  }
+
+  /* Apply functional constraints to the parameter set.
+   */
+  m = m & ~(64-1);
+
+  ldp = (ldp + 15) & ~(15);
+  if (ldp < m) ldp = m;
+  if (ldp < 16) ldp = 16;
+  if (lda < m*M_SUB) lda = m*M_SUB;
+  
+  printf("Performing reform_lpanel test with m=%d n=%d   lda=%d ldp=%d\n", m, n, lda, ldp);
+
+  /* Allocate and initialize the arrays
+   */
+  n_padded = (n + M_SUB-1) & ~(M_SUB-1);
+
+  A = (double *)allocate_matrix(n_padded/M_SUB, lda, 128);
+  P1 = (double *)allocate_panel(n, ldp, 128);
+  P2 = (double *)allocate_panel(n, ldp, 128);
+
+  if ((A == NULL) || (P1 == NULL) || (P2 == NULL)) {
+    printf("Failed to allocate buffers. Total allocation is %f MB. %p %p %p\n", (2.0*ldp*n + (double)lda*n_padded)*sizeof(double)/(1024.0*1024.0), A, P1, P2);
+    return 0;
+  }
+
+  for (i=0; i<lda*n_padded/M_SUB; i++) A[i] = byte_swap(drand48());
+
+  memset(P1, 0x55, ldp*n*sizeof(double));
+  memset(P2, 0x55, ldp*n*sizeof(double));
+
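+  /* Run the reference (PPU) and accelerated implementations on identical
+     inputs; the results are required to match bit for bit. */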
+  hpl_ref_init();
+  hpl_ref_reform_panel_B_to_CL(m, n, P1, ldp, A, lda, (unsigned long long *)&incomplete);
+  while (incomplete);
+
+  hpl_accel_init();
+  hpl_accel_reform_panel_B_to_CL(m, n, P2, ldp, A, lda, (unsigned long long *)&incomplete);
+  while (incomplete);
+
+  /* Compare the results 
+   */
+  for (i=ldp*n-1, errors=0; i>=0; i--) {
+    if (P1[i] != P2[i]) {
+      errors++;
+      if (errors < 20) printf(" %d expected=%f got=%f\n", i, P1[i], P2[i]);
+    }
+  }
+  printf("Errors = %d\n", errors);
+
+  hpl_accel_fini();
+
+  return ((errors) ? 1 : 0);
+}
Index: accel/lib/tests/reform_matrix.c
===================================================================
RCS file: accel/lib/tests/reform_matrix.c
diff -N accel/lib/tests/reform_matrix.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/tests/reform_matrix.c	20 Aug 2008 03:57:53 -0000	1.3
@@ -0,0 +1,97 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <ppu_intrinsics.h>
+#include "hpl_accel.h"
+#include "test_utils.h"
+
+/* reformat matrix
+ */
+int main(int argc, char *argv[])
+{
+  int i;
+  int errors;
+  int lda = 0;
+  int n=64;
+  int m=64;
+  int m_pad;
+  int size = 0;		/* default; clamped to a valid minimum below */
+  volatile unsigned long long incomplete;
+  double *A1, *A2, *scratch;
+
+  switch (argc) {
+  case 5:
+    size = atoi(argv[4]);
+  case 4:
+    lda = atoi(argv[3]);
+  case 3:
+    n = atoi(argv[2]);
+  case 2:
+    m = atoi(argv[1]);
+    break;
+  case 1:
+    /* No parameters, randomly select a parameter set.
+     */
+    srand((unsigned int)__mftb());
+    m = rand() % 1280;
+    n = rand() % 1280;
+    lda = rand() % 2048;
+    size = (m > 0) ? rand() % (8*m*64) : 0;	/* avoid modulo by zero when m == 0 */
+    break;
+  default:
+    printf("Usage: %s [m [n [lda [size]]]]\n", argv[0]);
+    return 1;
+    break;
+  }
+
+  /* Apply functional constraints to the parameter set.
+   */
+  n = n & ~(64-1);
+  if (n < 64) n = 64;
+
+  if (m < 1) m = 1;
+  m_pad = (m + 63) & ~63;
+
+  lda = lda & ~(64-1);
+  if (lda < m_pad) lda = m_pad;
+
+  if (size < m_pad*64) size = m_pad*64;
+  size = (size + (128-1)) & ~(128-1);	/* Pad the scratch buffer to a cacheline */
+
+  printf("Performing reform_matrix test with m=%d n=%d   lda=%d size=%d\n", m, n, lda, size);
+
+  /* Allocate and initialize the arrays
+   */
+  A1 = (double *)allocate_matrix(n/M_SUB, lda*M_SUB, 128);
+  A2 = (double *)allocate_matrix(n/M_SUB, lda*M_SUB, 128);
+  scratch = (double *)allocate_panel(1, size, 128);
+
+  for (i=0; i<lda*n; i++) A1[i] = A2[i] = byte_swap(drand48());
+
+  hpl_ref_init();
+  hpl_ref_reform_matrix_CL_to_B(m, n, A1, lda, scratch, size, (unsigned long long *)&incomplete);
+  while (incomplete);
+
+  hpl_accel_init();
+  hpl_accel_reform_matrix_CL_to_B(m, n, A2, lda, scratch, size, (unsigned long long *)&incomplete);
+  while (incomplete);
+
+  /* Compare the results 
+   */
+  for (i=lda*n-1, errors=0; i>=0; i--) {
+    if (A1[i] != A2[i]) {
+      errors++;
+      if (errors < 20) printf(" %d expected=%f got=%f\n", i, A1[i], A2[i]);
+    }
+  }
+  printf("Errors = %d\n", errors);
+
+  hpl_accel_fini();
+
+  return ((errors) ? 1 : 0);
+}
Index: accel/lib/tests/reform_rows.c
===================================================================
RCS file: accel/lib/tests/reform_rows.c
diff -N accel/lib/tests/reform_rows.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/tests/reform_rows.c	20 Aug 2008 03:57:53 -0000	1.3
@@ -0,0 +1,158 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <ppu_intrinsics.h>
+#include "hpl_accel.h"
+#include "test_utils.h"
+
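+/* Select a row index in [0, max) that does not already appear in
+ * rows[0..cnt).  Simple rejection sampling: redraw until unique.
+ */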
+int rand_row(int max, int *rows, int cnt)
+{
+  int i;
+  int new_row;
+  int unique;
+
+  do {
+    new_row = (max * (rand() & 0xFFFF)) >> 16;
+    unique = 1;
+
+    for (i=0; i<cnt; i++) {
+      if (new_row == rows[i]) unique = 0;
+    }
+  } while (!unique);
+  return (new_row);
+}
+
+
+/* reformat rows
+ */
+int main(int argc, char *argv[])
+{
+  int i;
+  int errors;
+  int ldr = 0;
+  int lda = 0;
+  int n=64;
+  int m=1;
+  int blk_col=0;
+  int n_padded, m_padded;
+  int *rows;
+  volatile unsigned long long incomplete;
+  double *A1, *A2, *R1, *R2;
+
+  switch (argc) {
+  case 6:
+    lda = atoi(argv[5]);
+  case 5:
+    ldr = atoi(argv[4]);
+  case 4:
+    blk_col = atoi(argv[3]);
+  case 3:
+    n = atoi(argv[2]);
+  case 2:
+    m = atoi(argv[1]);
+    break;
+  case 1:
+    /* No parameters, randomly select a parameter set.
+     */
+    srand((unsigned int)__mftb());
+    m = rand() % 64;
+    n = rand() % 700;
+    lda = M_SUB * (rand() % 800);
+    ldr = rand() % 1280;
+    blk_col = rand() % 200;
+    break;
+  default:
+    printf("Usage: %s [m [n [blk_col [ldr [lda]]]]]\n", argv[0]);
+    return 1;
+    break;
+  }
+
+  /* Apply functional constraints to the parameter set.
+   */
+  if (m < 1) m = 1;
+  if (blk_col < 0) blk_col = 0;
+  if (ldr < n) ldr = n;
+
+  /* Force most random invocations to use accelerated code path
+   */
+  if ((argc==1) && (rand() & 7)) {
+    lda = (lda + 15) & ~(15);
+    ldr = (ldr + 1) & ~1;
+    blk_col = blk_col & ~1;
+  }
+
+  printf("Performing reform_row test with m=%d n=%d  blk_col=%d ldr=%d lda=%d\n", m, n, blk_col, ldr, lda);
+
+  /* Allocate and initialize the arrays
+   */
+  m_padded = (m + M_SUB-1) & (~(M_SUB-1));
+  n_padded = (n+blk_col + M_SUB-1) & ~(M_SUB-1);
+  if (lda < m_padded*M_SUB) lda = m_padded * M_SUB;
+
+  R1 = (double *)allocate_panel(1, ldr*m*sizeof(double), 128);	/* Never cross a 4GB boundary */
+  R2 = (double *)allocate_panel(1, ldr*m*sizeof(double), 128);	/* Never cross a 4GB boundary */
+  A1 = (double *)allocate_matrix(m_padded, lda, 128);
+  A2 = (double *)allocate_matrix(m_padded, lda, 128);
+
+  rows = (int *)allocate_panel(1, m * sizeof(int), 4);		/* Never cross a 4GB boundary */
+
+  if ((A1 == NULL) || (A2 == NULL) || (R1 == NULL) || (R2 == NULL) || (rows == NULL)) {
+    printf("Failed to allocate buffers. Total allocation is %f MB. %p %p %p %p\n", (2.0*m*n_padded + (double)ldr*m)*sizeof(double)/(1024.0*1024.0), A1, A2, R1, R2);
+    return 0;
+  }
+
+  /* Test ROW to BLOCK copy */
+  for (i=0; i<m*n_padded; i++) A1[i] = A2[i] = drand48();
+  for (i=0; i<ldr*m; i++) R1[i] = drand48();
+  for (i=0; i<m; i++) rows[i] = rand_row(m, rows, i);
+
+  hpl_ref_init();
+  hpl_ref_reform_rows_R_to_B(m, n, R1, ldr, A1, lda, rows, blk_col, (unsigned long long *)&incomplete);
+  while (incomplete);
+
+  hpl_accel_init();
+  hpl_accel_reform_rows_R_to_B(m, n, R1, ldr, A2, lda, rows, blk_col, (unsigned long long *)&incomplete);
+  while (incomplete);
+
+  /* Compare the results 
+   */
+  for (i=m*n_padded-1, errors=0; i>=0; i--) {
+    if (A1[i] != A2[i]) {
+      errors++;
+      if (errors < 20) printf("R->B %d expected=%f got=%f\n", i, A1[i], A2[i]);
+    }
+  }
+
+  /* Test BLOCK to ROW copy */
+  for (i=0; i<m*n_padded; i++) A1[i] = drand48();
+  for (i=0; i<ldr*m; i++) R1[i] = R2[i] = drand48();
+  for (i=0; i<m; i++) rows[i] = rand_row(m, rows, i);
+
+  hpl_ref_init();
+  hpl_ref_reform_rows_B_to_R(m, n, R1, ldr, A1, lda, rows, blk_col, (unsigned long long *)&incomplete);
+  while (incomplete);
+
+  hpl_accel_init();
+  hpl_accel_reform_rows_B_to_R(m, n, R2, ldr, A1, lda, rows, blk_col, (unsigned long long *)&incomplete);
+  while (incomplete);
+
+  /* Compare the results 
+   */
+  for (i=ldr*m-1; i>=0; i--) {
+    if (R1[i] != R2[i]) {
+      errors++;
+      if (errors < 20) printf("B->R %d expected=%f got=%f\n", i, R1[i], R2[i]);
+    }
+  }
+
+  printf("Errors = %d\n", errors);
+
+  hpl_accel_fini();
+
+  return ((errors) ? 1 : 0);
+}
Index: accel/lib/tests/reform_upanel.c
===================================================================
RCS file: accel/lib/tests/reform_upanel.c
diff -N accel/lib/tests/reform_upanel.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/tests/reform_upanel.c	20 Aug 2008 03:57:53 -0000	1.3
@@ -0,0 +1,101 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <ppu_intrinsics.h>
+#include "hpl_accel.h"
+#include "test_utils.h"
+
+
+
+/* reformat U panel
+ */
+int main(int argc, char *argv[])
+{
+  int i;
+  int errors;
+  int ldp = 0;
+  int lda = 0;
+  int n=64;
+  int m=64;
+  int m_padded, n_padded;
+  volatile unsigned long long incomplete;
+  double *A1, *A2, *P;
+
+  switch (argc) {
+  case 5:
+    ldp = atoi(argv[4]);
+  case 4:
+    lda = atoi(argv[3]);
+  case 3:
+    n = atoi(argv[2]);
+  case 2:
+    m = atoi(argv[1]);
+    break;
+  case 1:
+    /* No parameters, randomly select a parameter set.
+     */
+    srand((unsigned int)__mftb());
+    m = rand() % 700;
+    n = rand() % 700;
+    lda = M_SUB * (rand() % 1280);
+    ldp = rand() % 1280;
+    break;
+  default:
+    printf("Usage: %s [m [n [lda [ldp]]]]\n", argv[0]);
+    return 1;
+    break;
+  }
+
+  /* Apply functional constraints to the parameter set.
+   */
+  m_padded = (m + M_SUB-1)  & ~(M_SUB-1);
+  if (ldp < n) ldp = n;
+  ldp = (ldp + 15) & ~(15);
+  if (lda < m_padded*M_SUB) lda = m_padded*M_SUB;
+  lda = (lda + 15) & ~(15);
+  
+  printf("Performing reform_upanel test with m=%d n=%d   lda=%d ldp=%d\n", m, n, lda, ldp);
+
+  /* Allocate and initialize the arrays
+   */
+  n_padded = (n + M_SUB-1) & ~(M_SUB-1);
+
+  A1 = (double *)allocate_matrix(n_padded/M_SUB, lda, 128);
+  A2 = (double *)allocate_matrix(n_padded/M_SUB, lda, 128);
+  P  = (double *)allocate_panel(m, ldp, 128);
+
+  if ((A1 == NULL) || (A2 == NULL) || (P == NULL)) {
+    printf("Failed to allocate buffers. Total allocation is %f MB. %p %p %p\n", (2.0*lda*n_padded + (double)ldp*m)*sizeof(double)/(1024.0*1024.0), A1, A2, P);
+    return 0;
+  }
+
+  for (i=0; i<lda*n_padded/M_SUB; i++) A1[i] = A2[i] = drand48();
+  for (i=0; i<ldp*m; i++) P[i] = drand48();
+
+  hpl_ref_init();
+  hpl_ref_reform_panel_R_to_B(m, n, A1, lda, P, ldp, (unsigned long long *)&incomplete);
+  while (incomplete);
+
+  hpl_accel_init();
+  hpl_accel_reform_panel_R_to_B(m, n, A2, lda, P, ldp, (unsigned long long *)&incomplete);
+  while (incomplete);
+
+  /* Compare the results 
+   */
+  for (i=lda*n_padded/M_SUB-1, errors=0; i>=0; i--) {
+    if (A1[i] != A2[i]) {
+      errors++;
+      if (errors < 20) printf(" %d expected=%f got=%f\n", i, A1[i], A2[i]);
+    }
+  }
+  printf("Errors = %d\n", errors);
+
+  hpl_accel_fini();
+
+  return ((errors) ? 1 : 0);
+}
Index: accel/lib/tests/regression
===================================================================
RCS file: accel/lib/tests/regression
diff -N accel/lib/tests/regression
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/tests/regression	15 May 2008 01:36:22 -0000	1.4
@@ -0,0 +1,38 @@
+#!/bin/bash
+# hpl_accel regression test suite
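+#
+# Each test binary, when invoked with no arguments, chooses a random
+# parameter set, so repeated iterations cover a spread of sizes and
+# alignments.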
+
+let ITERATIONS=20
+
+regress() {
+  let i=0
+
+  echo ">>> Regression testing" $1 "for" $ITERATIONS "iterations <<<"
+
+  while ((i<ITERATIONS))
+  do
+    ./$1
+    if (($? != 0))
+    then
+      echo "REGRESSION TEST FAILED"      
+      exit
+    fi
+    let i=i+1
+  done
+}
+
+
+regress dgemm
+regress dgemm_CL_B_B
+regress dgemm_CL_R_B_CL
+regress dgemm_CL_B_B_CL
+regress dgemm_C_C_C
+regress dtrsm
+regress dtrsm_CL_B
+regress reform
+regress reform_matrix
+regress reform_lpanel
+regress reform_upanel
+regress reform_rows
+regress swap_rows
+regress copy_rows
+echo "REGRESSION TEST PASSED"
Index: accel/lib/tests/swap_rows.c
===================================================================
RCS file: accel/lib/tests/swap_rows.c
diff -N accel/lib/tests/swap_rows.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/tests/swap_rows.c	20 Aug 2008 03:57:53 -0000	1.3
@@ -0,0 +1,111 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <ppu_intrinsics.h>
+#include "hpl_accel.h"
+#include "test_utils.h"
+
+int rand_row(int max)
+{
+  int new_row;
+  new_row = (max * (rand() & 0xFFFF)) >> 16;
+  return (new_row);
+}
+
+
+/* swap rows
+ */
+int main(int argc, char *argv[])
+{
+  int i;
+  int errors;
+  int lda = 0;
+  int n=64;
+  int m=1;
+  int blk_col=0;
+  int n_padded, m_padded;
+  int *rows;
+  volatile unsigned long long incomplete;
+  double *A1, *A2;
+
+  switch (argc) {
+  case 5:
+    blk_col = atoi(argv[4]);
+  case 4:
+    lda = atoi(argv[3]);
+  case 3:
+    n = atoi(argv[2]);
+  case 2:
+    m = atoi(argv[1]);
+    break;
+  case 1:
+    /* No parameters, randomly select a parameter set.
+     */
+    srand((unsigned int)__mftb());
+    m = rand() % 64;
+    n = rand() % 700;
+    lda = M_SUB * (rand() % 800);
+    blk_col = rand() % 200;
+    break;
+  default:
+    printf("Usage: %s [m [n [lda [blk_col]]]]\n", argv[0]);
+    return 1;
+    break;
+  }
+
+  /* Apply functional constraints to the parameter set.
+   */
+  if (m < 1) m = 1;
+  if (blk_col < 0) blk_col = 0;
+  lda = (lda + 15) & ~(15);
+
+  printf("Performing swap_rows test with m=%d n=%d lda=%d blk_col=%d\n", m, n, lda, blk_col);
+
+  /* Allocate and initialize the arrays
+   */
+  m_padded = (m + M_SUB-1) & (~(M_SUB-1));
+  n_padded = (n+blk_col + M_SUB-1) & ~(M_SUB-1);
+  if (lda < m_padded*M_SUB) lda = m_padded * M_SUB;
+
+  A1 = (double *)allocate_matrix(m_padded, lda, 128);
+  A2 = (double *)allocate_matrix(m_padded, lda, 128);
+
+  rows = (int *)allocate_panel(1, m * sizeof(int), 4);		/* Never cross a 4GB boundary */
+
+  if ((A1 == NULL) || (A2 == NULL) || (rows == NULL)) {
+    printf("Failed to allocate buffers. Total allocation is %f MB. %p %p\n", (2.0*m*n_padded)/(1024.0*1024.0), A1, A2);
+    return 0;
+  }
+
+  /* Test BLOCK to BLOCK copy */
+  for (i=0; i<m*n_padded; i++) A1[i] = A2[i] = drand48();
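+  /* Build a LAPACK-style pivot vector: entry i exchanges row i with a
+     uniformly chosen row in [i, m). */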
+  for (i=0; i<m; i++) rows[i] = i+rand_row(m-i);
+
+  hpl_ref_init();
+  hpl_ref_swap_rows_B_to_B(m, n, A1, lda, rows, blk_col, (unsigned long long *)&incomplete);
+  while (incomplete);
+
+  hpl_accel_init();
+  hpl_accel_swap_rows_B_to_B(m, n, A2, lda, rows, blk_col, (unsigned long long *)&incomplete);
+  while (incomplete);
+
+  /* Compare the results 
+   */
+  for (i=m*n_padded-1, errors=0; i>=0; i--) {
+    if (A1[i] != A2[i]) {
+      errors++;
+      if (errors < 20) printf("B<->B %d expected=%f got=%f\n", i, A1[i], A2[i]);
+    }
+  }
+
+  printf("Errors = %d\n", errors);
+
+  hpl_accel_fini();
+
+  return ((errors) ? 1 : 0);
+}
Index: accel/lib/tests/test_utils.h
===================================================================
RCS file: accel/lib/tests/test_utils.h
diff -N accel/lib/tests/test_utils.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/tests/test_utils.h	20 Aug 2008 03:57:53 -0000	1.3
@@ -0,0 +1,147 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007                               */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#ifndef _TEST_UTILS_H_
+#define _TEST_UTILS_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>	/* uintptr_t, used by ALIGN128 */
+#include <unistd.h>
+#include <sys/mman.h>
+#include "hpl_accel.h"
+
+extern ssize_t getline(char **lineptr, size_t *n, FILE *stream);
+
+#define ALIGN128(_x)		(((uintptr_t)(_x) + 127) & ~(127))
+
+/* get_timebase_frequency
+ * ----------------------
+ * Parse /proc/cpuinfo for the timebase frequency. 
+ * This information is returned in Hz. If the data 
+ * can not be obtained, then 0.0 is returned.
+ */
+#define CPU_INFO "/proc/cpuinfo"
+
+double get_timebase_frequency()
+{
+  FILE *fp;
+  double freq = 0.0;
+  char *line = NULL;
+  size_t len = 0;
+  ssize_t chrs_read;
+
+  if ((fp = fopen(CPU_INFO, "r"))) {
+    while ((chrs_read = getline(&line, &len, fp)) != -1) {
+      if (sscanf(line, "timebase : %lf", &freq) == 1) {
+        if (strstr(line, "KHz")) {
+          freq *= 1.0e3;
+        } else if (strstr(line, "MHz")) {
+          freq *= 1.0e6;
+        } else if (strstr(line, "GHz")) {
+          freq *= 1.0e9;
+        }
+        break;
+      }
+    }
+    if (line) free(line);
+    fclose(fp);
+  }
+  return (freq);
+}
+
+double byte_swap(double x)
+{
+#ifdef ACCEL_LITTLE_ENDIAN
+  int i;
+  union {
+    double d;
+    unsigned char c[8];
+  } in, out;
+
+  in.d = x;
+  for (i=0; i<8; i++) out.c[i] = in.c[7-i];
+
+  return (out.d);
+#else
+  return (x);
+#endif
+}
+
+
+unsigned long long segment = 0x100000000ULL;
+
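+/* allocate_panel maps each buffer into a fresh 4GB segment at a hinted
+ * address.  With PANEL_4GB_CROSSING defined, the start is backed off by a
+ * random number of rows so that (when the kernel honors the hint) the
+ * buffer straddles the 4GB line, exercising the library's boundary
+ * handling; otherwise the payload ends at the segment boundary and does
+ * not cross it.
+ */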
+void *allocate_panel(int rows, 			/* # of rows (row ordered) or columns (column ordered) */
+		     size_t row_size,		/* # of doubles per row (row ordered) or column (column ordered) */
+		     int alignment)		/* alignment of allocation */
+{
+  void *ptr;
+  unsigned long long start, skip;
+  unsigned int row_crossing;
+  size_t size, padded_size;
+  int page_size;
+
+  row_size *= sizeof(double);
+  size = rows * row_size;
+  page_size = getpagesize();
+  padded_size = (size + (alignment-1) + (row_size - 1) + (page_size - 1)) & ~(page_size -1);
+
+#ifdef PANEL_4GB_CROSSING
+  row_crossing = ((rand() & 0xFFFF) * rows) >> 16;
+#else
+  row_crossing = rows;
+#endif
+
+  do {
+    segment += 0x100000000ULL;
+    start = segment - ((unsigned long long)row_crossing * (unsigned long long)row_size);
+    skip = start & ((unsigned long long)page_size - 1);
+    ptr = mmap((void *)(start-skip), padded_size+skip, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+  } while (ptr == MAP_FAILED);
+
+  ptr += skip;
+
+  return (ptr);
+}
+
+
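+/* allocate_matrix is the blocked-format analogue of allocate_panel: with
+ * MATRIX_4GB_CROSSING defined, a randomly chosen block row and block
+ * column land on the 4GB line; otherwise the matrix is placed entirely
+ * below the segment boundary.
+ */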
+void *allocate_matrix(int blk_columns, 		/* number of block columns */
+		      size_t blk_column_size,	/* # of doubles to stride a column of blocks */
+		      int alignment)		/* alignment of allocation */
+{
+  void *ptr;
+  unsigned long long start, skip;
+  unsigned int row_crossing, col_crossing, blks_per_col;
+  size_t size, padded_size;
+  int page_size;
+
+  blk_column_size *= sizeof(double);
+  size = (size_t)blk_columns * blk_column_size;
+  page_size = getpagesize();
+  padded_size = (size + (alignment-1) + (M_SUB * M_SUB * sizeof(double) - 1) + (page_size - 1)) & ~(page_size -1);
+
+  blks_per_col = blk_column_size / (M_SUB * M_SUB * sizeof(double));
+
+#ifdef MATRIX_4GB_CROSSING
+  row_crossing = ((rand() & 0xFFFF) * blks_per_col) >> 16;
+  col_crossing = ((rand() & 0xFFFF) * blk_columns) >> 16;
+#else
+  row_crossing = blks_per_col;
+  col_crossing = blk_columns;
+#endif
+
+  do {
+    segment += 0x100000000ULL;
+
+    start = segment - ((unsigned long long)(col_crossing) * (unsigned long long)(blk_column_size) + (unsigned long long)(row_crossing * M_SUB * M_SUB * sizeof(double)));
+    skip = start & ((unsigned long long)page_size - 1);
+    ptr = mmap((void *)(start-skip), padded_size+skip, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+  } while (ptr == MAP_FAILED);
+
+  ptr += skip;
+  return (ptr);
+}
+
+#endif /* _TEST_UTILS_H_ */
Index: include/hpl.h
===================================================================
RCS file: /cvsroot/hpl_qs22/include/hpl.h,v
retrieving revision 1.1
retrieving revision 1.3
diff -u -r1.1 -r1.3
--- include/hpl.h	10 Feb 2008 21:45:50 -0000	1.1
+++ include/hpl.h	26 Aug 2008 13:24:26 -0000	1.3
@@ -43,6 +43,9 @@
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
  */ 
+/* ------------------------------------------------------------------ */
+/* Modifications (C) Copyright IBM Corporation 2008                   */
+/* ------------------------------------------------------------------ */
 #ifndef HPL_H
 #define HPL_H
 /*
@@ -82,6 +85,8 @@
 #include "hpl_panel.h"
 #include "hpl_pfact.h"
 #include "hpl_pgesv.h"
+
+#include "hpl_accel.h"
 
 #include "hpl_timer.h"
 #include "hpl_matgen.h"
Index: include/hpl_accel.h
===================================================================
RCS file: include/hpl_accel.h
diff -N include/hpl_accel.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ include/hpl_accel.h	20 Aug 2008 03:57:53 -0000	1.13
@@ -0,0 +1,61 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#ifndef HPL_ACCEL_H
+#define HPL_ACCEL_H
+
+#ifdef HPL_CALL_ACCEL
+
+/* ---------------------------------------------------------------- */
+/* Include files                                                    */
+/* ---------------------------------------------------------------- */
+
+#include "hpl_grid.h"        /* HPL_T_grid */
+#include "hpl_pgesv.h"       /* HPL_T_pmat */
+#include "hpl_panel.h"       /* HPL_T_panel */
+
+#include "../accel/lib/hpl_accel.h"
+
+/* ---------------------------------------------------------------- */
+/* Macros                                                           */
+/* ---------------------------------------------------------------- */
+
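+/* CROSSES_4GB_BOUNDARY is true when a buffer of len bytes starting at p
+   spans a 4GB address boundary.  FIX_4GB_BOUNDARY_CROSSING returns the
+   boundary address inside such a span (the pointer rounded up to the
+   next 4GB multiple), so the region above it lies within one segment. */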
+#define CROSSES_4GB_BOUNDARY(p,len) \
+    ( (((size_t)(p))>>32) != (((size_t)(p)+(len)-1)>>32) )
+
+#define FIX_4GB_BOUNDARY_CROSSING(p,len) \
+    ( CROSSES_4GB_BOUNDARY(p,len) ? ((((size_t)(p)+(len)-1)>>32)<<32) : (size_t)(p) )
+
+/* ---------------------------------------------------------------- */
+/*  Function prototypes                                             */
+/* ---------------------------------------------------------------- */
+
+extern int HPL_accel_init(int my_rank);
+extern int HPL_accel_exit(int my_rank);
+extern int HPL_accel_pgesv_init( HPL_T_grid *GRID, HPL_T_palg *ALGO, HPL_T_pmat *A);
+extern int HPL_accel_pgesv_fini( HPL_T_grid *GRID, HPL_T_palg *ALGO, HPL_T_pmat *A);
+extern int HPL_accel_pangetL( HPL_T_panel *PANEL);
+extern int HPL_accel_panputU( HPL_T_panel *PANEL, double *data, int ld, int *rows, int nn);
+extern int HPL_accel_rowget( HPL_T_panel *PANEL, double *data, int ld, int numrows, int *rows, int jj, int nn);
+extern int HPL_accel_rowput( HPL_T_panel *PANEL, double *data, int ld, int numrows, int *rows, int jj, int nn);
+extern int HPL_accel_dtrsm( HPL_T_panel *PANEL, int j1, int nn);
+extern int HPL_accel_dgemm( HPL_T_panel *PANEL, int j1, int nn);
+extern int HPL_accel_dgemm_async( HPL_T_panel *PANEL, int j1, int nn);
+extern int HPL_accel_dgemm_wait( HPL_T_panel *PANEL);
+extern void HPL_accel_dgemmCL(int m, int n, int k, const double *a, int lda, const double *b, int ldb, double *c, int ldc);
+extern int HPL_accel_swap00N( HPL_T_panel *PANEL, const int *IPIV, int j1, int nn);
+extern int HPL_accel_swap01T( HPL_T_panel *PANEL, const int *LINDXA, const int *LINDXAU, const int numrows, const int nn);
+extern int HPL_accel_swap02N( HPL_T_panel *PANEL, const int *LINDXA, const int *LINDXAU, const int numrows, double *W0, double *W, const int ldw, const int nn);
+extern int HPL_accel_swap04T( HPL_T_panel *PANEL, const int *LINDXA, const int *LINDXAU, const int numrows, const int numrows2, double *W0, double *W, const int ldw, const int nn);
+extern int HPL_accel_swap05T( HPL_T_panel *PANEL, const int *LINDXA, const int *LINDXAU, const int numrows, const int nn);
+extern int HPL_accel_swap06T( HPL_T_panel *PANEL, const int *LINDXA, const int numrows, const int i0, const int nn);
+
+#else
+
+#define FIX_4GB_BOUNDARY_CROSSING(p,len) ((size_t)(p))
+
+#endif /* HPL_CALL_ACCEL */
+
+#endif /* HPL_ACCEL_H */
Index: include/hpl_auxil.h
===================================================================
RCS file: /cvsroot/hpl_qs22/include/hpl_auxil.h,v
retrieving revision 1.1
retrieving revision 1.3
diff -u -r1.1 -r1.3
--- include/hpl_auxil.h	10 Feb 2008 21:45:50 -0000	1.1
+++ include/hpl_auxil.h	26 Aug 2008 13:24:26 -0000	1.3
@@ -43,6 +43,9 @@
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
  */ 
+/* ------------------------------------------------------------------ */
+/* Modifications (C) Copyright IBM Corporation 2008                   */
+/* ------------------------------------------------------------------ */
 #ifndef HPL_AUXIL_H
 #define HPL_AUXIL_H
 /*
@@ -139,6 +142,15 @@
 double                           HPL_dlamch
 STDC_ARGS( (
    const HPL_T_MACH
+) );
+
+void*                            HPL_hpalloc
+STDC_ARGS( (
+   size_t
+) );
+void                             HPL_hpfree
+STDC_ARGS( (
+   void *
 ) );
 
 #endif
Index: include/hpl_panel.h
===================================================================
RCS file: /cvsroot/hpl_qs22/include/hpl_panel.h,v
retrieving revision 1.1
retrieving revision 1.3
diff -u -r1.1 -r1.3
--- include/hpl_panel.h	10 Feb 2008 21:45:50 -0000	1.1
+++ include/hpl_panel.h	26 Aug 2008 13:24:26 -0000	1.3
@@ -43,6 +43,9 @@
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
  */ 
+/* ------------------------------------------------------------------ */
+/* Modifications (C) Copyright IBM Corporation 2008                   */
+/* ------------------------------------------------------------------ */
 #ifndef HPL_PANEL_H
 #define HPL_PANEL_H
 /*
@@ -90,6 +93,7 @@
    int                 pcol;  /* proc. col owning 1st col of trail. A */
    int                 msgid;           /* message id for panel bcast */
    int                 ldl2;         /* local leading dim of array L2 */
+   int                 ldu;           /* local leading dim of array U */
    int                 len;      /* length of the buffer to broadcast */
 #ifdef HPL_CALL_VSIPL
    vsip_block_d        * Ablock;                           /* A block */
Index: include/hpl_pgesv.h
===================================================================
RCS file: /cvsroot/hpl_qs22/include/hpl_pgesv.h,v
retrieving revision 1.1
retrieving revision 1.3
diff -u -r1.1 -r1.3
--- include/hpl_pgesv.h	10 Feb 2008 21:45:50 -0000	1.1
+++ include/hpl_pgesv.h	26 Aug 2008 13:24:26 -0000	1.3
@@ -43,6 +43,9 @@
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
  */ 
+/* ------------------------------------------------------------------ */
+/* Modifications (C) Copyright IBM Corporation 2008                   */
+/* ------------------------------------------------------------------ */
 #ifndef HPL_PGESV_H
 #define HPL_PGESV_H
 /*
@@ -71,6 +74,7 @@
    HPL_SWAP01        = 452,                      /* Use HPL_pdlaswp01 */
    HPL_SW_MIX        = 453, /* Use HPL_pdlaswp00_ for small number of */
                             /* columns, and HPL_pdlaswp01_ otherwise. */
+   HPL_SWAP03        = 454,                      /* Use HPL_pdlaswp03 */
    HPL_NO_SWP        = 499
 } HPL_T_SWAP;
 
@@ -338,6 +342,14 @@
 STDC_ARGS( (
    HPL_T_grid *,
    HPL_T_pmat *
+) );
+
+void                             HPL_pdlaswp03T
+STDC_ARGS( (
+   HPL_T_panel *PBCST,
+   int *IFLAG,
+   HPL_T_panel *PANEL,
+   const int NN
 ) );
 
 #endif
Index: include/hpl_ptest.h
===================================================================
RCS file: /cvsroot/hpl_qs22/include/hpl_ptest.h,v
retrieving revision 1.1
retrieving revision 1.4
diff -u -r1.1 -r1.4
--- include/hpl_ptest.h	10 Feb 2008 21:45:50 -0000	1.1
+++ include/hpl_ptest.h	26 Aug 2008 13:24:26 -0000	1.4
@@ -43,6 +43,9 @@
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
  */ 
+/* ------------------------------------------------------------------ */
+/* Modifications (C) Copyright IBM Corporation 2008                   */
+/* ------------------------------------------------------------------ */
 #ifndef HPL_PTEST_H
 #define HPL_PTEST_H
 /*
@@ -93,13 +96,17 @@
  */
 #ifdef HPL_DETAILED_TIMING
 #define    HPL_TIMING_BEG        11 /* timer 0 reserved, used by main */
-#define    HPL_TIMING_N           6 /* number of timers defined below */
+#define    HPL_TIMING_N          (HPL_TIMING_END-HPL_TIMING_BEG) /* number of timers defined below */
 #define    HPL_TIMING_RPFACT     11 /* starting from here, contiguous */
 #define    HPL_TIMING_PFACT      12
 #define    HPL_TIMING_MXSWP      13
 #define    HPL_TIMING_UPDATE     14
 #define    HPL_TIMING_LASWP      15
 #define    HPL_TIMING_PTRSV      16
+#define    HPL_TIMING_ACCEL_OVERHEAD 17
+#define    HPL_TIMING_ALLGATHER      18
+#define    HPL_TIMING_SCATTER        19
+#define    HPL_TIMING_END            20
 #endif
 /*
  * ---------------------------------------------------------------------
Index: makes/Make.accel
===================================================================
RCS file: makes/Make.accel
diff -N makes/Make.accel
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ makes/Make.accel	20 Aug 2008 03:57:53 -0000	1.6
@@ -0,0 +1,74 @@
+# --------------------------------------------------------------- 
+# (C) Copyright IBM Corporation 2007,2008                                       
+#                                                                 
+# --------------------------------------------------------------- 
+
+include Make.inc
+
+########################################################################
+#			Target
+########################################################################
+
+ifeq ($(arch),qs22)
+
+ACCEL_OBJS	= HPL_accel_init.o HPL_accel_exit.o \
+		  HPL_accel_pgesv.o HPL_accel_swap.o \
+                  HPL_accel_rowget.o HPL_accel_rowput.o \
+                  HPL_accel_panget.o HPL_accel_panput.o \
+                  HPL_accel_dgemm.o HPL_accel_dtrsm.o 
+
+all     : lib 
+
+lib	: lib.grd
+
+lib.grd : $(ACCEL_OBJS)
+	$(ARCHIVER) $(ARFLAGS) $(HPLlib) $(ACCEL_OBJS)
+	$(RANLIB) $(HPLlib)
+	$(TOUCH) lib.grd
+
+else
+
+all     :
+
+endif
+
+########################################################################
+#			Local Defines
+########################################################################
+
+CCFLAGS		+= -I$(TOPdir)/accel
+
+INCdep		= $(INCdir)/hpl_accel.h ../HPL_accel_private.h \
+                $(TOPdir)/accel/lib/hpl_accel.h 
+
+########################################################################
+#                       Build Rules
+########################################################################
+
+HPL_accel_init.o       : ../HPL_accel_init.c       $(INCdep)
+	$(CC) -o $@ -c $(CCFLAGS)  $<
+HPL_accel_exit.o       : ../HPL_accel_exit.c       $(INCdep)
+	$(CC) -o $@ -c $(CCFLAGS)  $<
+HPL_accel_pgesv.o      : ../HPL_accel_pgesv.c      $(INCdep)
+	$(CC) -o $@ -c $(CCFLAGS)  $<
+HPL_accel_swap.o       : ../HPL_accel_swap.c       $(INCdep)
+	$(CC) -o $@ -c $(CCFLAGS)  $<
+HPL_accel_rowget.o     : ../HPL_accel_rowget.c     $(INCdep)
+	$(CC) -o $@ -c $(CCFLAGS)  $<
+HPL_accel_rowput.o     : ../HPL_accel_rowput.c     $(INCdep)
+	$(CC) -o $@ -c $(CCFLAGS)  $<
+HPL_accel_panget.o     : ../HPL_accel_panget.c     $(INCdep)
+	$(CC) -o $@ -c $(CCFLAGS)  $<
+HPL_accel_panput.o     : ../HPL_accel_panput.c     $(INCdep)
+	$(CC) -o $@ -c $(CCFLAGS)  $<
+HPL_accel_dtrsm.o      : ../HPL_accel_dtrsm.c      $(INCdep)
+	$(CC) -o $@ -c $(CCFLAGS)  $<
+HPL_accel_dgemm.o      : ../HPL_accel_dgemm.c      $(INCdep)
+	$(CC) -o $@ -c $(CCFLAGS)  $<
+
+########################################################################
+#                       Clean Rules
+########################################################################
+
+clean            :
+	$(RM) *.o 
Index: makes/Make.auxil
===================================================================
RCS file: /cvsroot/hpl_qs22/makes/Make.auxil,v
retrieving revision 1.1
retrieving revision 1.4
diff -u -r1.1 -r1.4
--- makes/Make.auxil	10 Feb 2008 21:45:50 -0000	1.1
+++ makes/Make.auxil	26 Aug 2008 13:24:26 -0000	1.4
@@ -43,6 +43,8 @@
 #  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 #  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 # ######################################################################
+#  Modifications (C) Copyright IBM Corporation 2008
+# ######################################################################
 #  
 include Make.inc
 #
@@ -58,7 +60,7 @@
    HPL_warn.o             HPL_abort.o            HPL_dlaprnt.o          \
    HPL_dlange.o
 HPL_au1obj       = \
-   HPL_dlamch.o
+   HPL_dlamch.o           HPL_hpalloc.o
 HPL_auxobj       = \
    $(HPL_au0obj) $(HPL_au1obj)
 #
@@ -91,6 +93,8 @@
 	$(CC) -o $@ -c $(CCFLAGS)  ../HPL_dlange.c
 HPL_dlamch.o           : ../HPL_dlamch.c           $(INCdep)
 	$(CC) -o $@ -c $(CCNOOPT)  ../HPL_dlamch.c
+HPL_hpalloc.o          : ../HPL_hpalloc.c          $(INCdep)
+	$(CC) -o $@ -c $(CCFLAGS)  $<
 #
 # ######################################################################
 #
Index: makes/Make.comm
===================================================================
RCS file: /cvsroot/hpl_qs22/makes/Make.comm,v
retrieving revision 1.1
retrieving revision 1.3
diff -u -r1.1 -r1.3
Index: makes/Make.panel
===================================================================
RCS file: /cvsroot/hpl_qs22/makes/Make.panel,v
retrieving revision 1.1
retrieving revision 1.3
diff -u -r1.1 -r1.3
--- makes/Make.panel	10 Feb 2008 21:45:50 -0000	1.1
+++ makes/Make.panel	26 Aug 2008 13:24:26 -0000	1.3
@@ -43,6 +43,8 @@
 #  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 #  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 # ######################################################################
+#  Modifications (C) Copyright IBM Corporation 2008
+# ######################################################################
 #  
 include Make.inc
 #
@@ -52,7 +54,7 @@
    $(INCdir)/hpl_misc.h   $(INCdir)/hpl_blas.h  $(INCdir)/hpl_auxil.h \
    $(INCdir)/hpl_pmisc.h  $(INCdir)/hpl_grid.h  $(INCdir)/hpl_comm.h  \
    $(INCdir)/hpl_pauxil.h $(INCdir)/hpl_panel.h $(INCdir)/hpl_pfact.h \
-   $(INCdir)/hpl_pgesv.h
+   $(INCdir)/hpl_pgesv.h  $(INCdir)/hpl_accel.h
 #
 ## Object files ########################################################
 #
Index: makes/Make.pgesv
===================================================================
RCS file: /cvsroot/hpl_qs22/makes/Make.pgesv,v
retrieving revision 1.1
retrieving revision 1.3
diff -u -r1.1 -r1.3
--- makes/Make.pgesv	10 Feb 2008 21:45:50 -0000	1.1
+++ makes/Make.pgesv	26 Aug 2008 13:24:26 -0000	1.3
@@ -43,6 +43,8 @@
 #  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 #  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 # ######################################################################
+#  Modifications (C) Copyright IBM Corporation 2008
+# ######################################################################
 #  
 include Make.inc
 #
@@ -64,7 +66,8 @@
    HPL_equil.o            HPL_pdlaswp01N.o       HPL_pdlaswp01T.o       \
    HPL_pdupdateNN.o       HPL_pdupdateNT.o       HPL_pdupdateTN.o       \
    HPL_pdupdateTT.o       HPL_pdtrsv.o           HPL_pdgesv0.o          \
-   HPL_pdgesvK1.o         HPL_pdgesvK2.o         HPL_pdgesv.o
+   HPL_pdgesvK1.o         HPL_pdgesvK2.o         HPL_pdgesv.o           \
+   HPL_pdlaswp03T.o
 #
 ## Targets #############################################################
 #
@@ -127,6 +130,8 @@
 	$(CC) -o $@ -c $(CCFLAGS)  ../HPL_pdgesvK2.c
 HPL_pdgesv.o           : ../HPL_pdgesv.c           $(INCdep)
 	$(CC) -o $@ -c $(CCFLAGS)  ../HPL_pdgesv.c
+HPL_pdlaswp03T.o       : ../HPL_pdlaswp03T.c       $(INCdep)
+	$(CC) -o $@ -c $(CCFLAGS)  $<
 #
 # ######################################################################
 #
Index: makes/Make.ptest
===================================================================
RCS file: /cvsroot/hpl_qs22/makes/Make.ptest,v
retrieving revision 1.1
retrieving revision 1.3
diff -u -r1.1 -r1.3
--- makes/Make.ptest	10 Feb 2008 21:45:50 -0000	1.1
+++ makes/Make.ptest	26 Aug 2008 13:24:26 -0000	1.3
@@ -43,6 +43,8 @@
 #  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 #  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 # ######################################################################
+#  Modifications (C) Copyright IBM Corporation 2008
+# ######################################################################
 #  
 include Make.inc
 #
@@ -72,7 +74,7 @@
 $(BINdir)/HPL.dat : ../HPL.dat
 	( $(CP) ../HPL.dat $(BINdir) )
 #
-dexe.grd: $(HPL_pteobj) $(HPLlib)
+dexe.grd: $(HPL_pteobj) $(HPLlib) $(ACLlib)
 	$(LINKER) $(LINKFLAGS) -o $(xhpl) $(HPL_pteobj) $(HPL_LIBS)
 	$(MAKE) $(BINdir)/HPL.dat
 	$(TOUCH) dexe.grd
Index: src/accel/HPL_accel_dgemm.c
===================================================================
RCS file: src/accel/HPL_accel_dgemm.c
diff -N src/accel/HPL_accel_dgemm.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/accel/HPL_accel_dgemm.c	20 Aug 2008 03:57:54 -0000	1.15
@@ -0,0 +1,241 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2008                               */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#include "hpl.h"
+
+#include "HPL_accel_private.h"
+
+static unsigned long long completion_flags = 0;
+
+/* ---------------------------------------------------------------- */
+int HPL_accel_dgemm_async
+(
+    HPL_T_panel    *PANEL,    /* Panel structure for inputs to dgemm */
+    int            j1,        /* Relative index of first column of B and C inputs */
+    int            nn         /* number of columns of rhs input in the row_buf */
+)
+/* 
+ * Purpose
+ * =======
+ *
+ * HPL_accel_dgemm_async initiates a dgemm on the accelerator.  The lower triangular
+ * matrix input to dgemm is in the blk_buf, and the right hand sides are in
+ * the row_buf.
+ *
+ * -----------------------------------------------------------------
+ */ 
+{
+    int ii = PANEL->ii;      /* local row index (zero based) of first row of C matrix */
+    int jj = PANEL->jj;      /* local column index (zero based) of first column of C matrix */
+    int jb = PANEL->jb;      /* number of panel columns / row buffer rows */
+    int mp = PANEL->mp;      /* number of local rows of panel */
+    int lda = PANEL->lda;    /* local leading dimension of matrix */
+    int ldl2 = PANEL->ldl2;  /* local leading dimension of L2 panel */
+
+    if ( panel_prep >= (PANEL->jj+j1) ) {
+        if ( PANEL->grid->myrow == PANEL->prow ) {
+
+            HPL_dgemm(HplColumnMajor, HplNoTrans, HplNoTrans, mp-jb, nn, jb, -HPL_rone,
+                      PANEL->L2, ldl2, Mptr(PANEL->A, 0, j1, lda), lda, HPL_rone,
+                      Mptr(PANEL->A, jb, j1, lda), lda);
+
+        } else {
+            int ldu = PANEL->ldu;
+
+            HPL_dgemm(HplColumnMajor, HplNoTrans, HplTrans, mp, nn, jb, -HPL_rone,
+                      PANEL->L2, ldl2, PANEL->U+j1, ldu, HPL_rone,
+                      Mptr(PANEL->A, 0, j1, lda), lda);
+
+        }
+        return 0;
+    }
+
+    TRACE_PRINT("%s [%d] entry ii=%d jj=%d jb=%d j1=%d nn=%d\n", 
+                __FUNCTION__, PANEL->grid->iam, ii, jj, jb, j1, nn);
+
+    /* If this row of processors holds the U panel, then the number of rows to
+       update is JB rows less than the number of rows in the trailing submatrix. */
+
+    if ( PANEL->grid->myrow == PANEL->prow ) {
+        ii += jb;
+        mp -= jb;
+    }
+
+    if (mp <= 0 || nn <= 0) { return 0; }
+
+    int mp_pad = ((mp+M_SUB-1)/M_SUB)*M_SUB;
+
+    /* Pad out rows to get acceleration when appropriate */
+    if ( (mp>=M_SUB) && (nn>=M_SUB) )
+        mp = (mp_pad > ldl2) ? mp : mp_pad;
+
+    /* The L2 panel may reside in the matrix, and thus could cross a 4GB boundary
+       for large problem sizes.  When this happens, we just copy the L2 panel over
+       to one of our extra panel buffers. */
+
+    double *L2 = PANEL->L2;  /* L2 panel */
+    if ( CROSSES_4GB_BOUNDARY(L2, ldl2*jb*sizeof(double)) ) {
+        memcpy(pan_buf[1], L2, ldl2*jb*sizeof(double));
+        L2 = pan_buf[1];
+    }
+
+    if ( PANEL->grid->myrow == PANEL->prow ) {
+        int ldu = lda*M_SUB;
+        double *U = &PANEL->pmat->A[INDEX_BLK(PANEL->ii, 0, ldu)];
+
+        hpl_accel_dgemm_CL_B_B_CL(
+            /* IN  (int) Number of rows in a, c, and panel */    mp,
+            /* IN  (int) Number of cols in b, c, and panel */    nn,
+            /* IN  (int) Number of cols in a and rows in b */    jb,
+            /* IN  (double*) matrix a is the L2 panel */         L2,
+            /* IN  (int) Leading dimension of L2 */              ldl2,
+            /* IN  (double*) matrix b (U), block row format */   U,
+            /* IN  (int) Leading dimension of b (U) */           ldu,
+            /* INOUT (double*) c matrix is trailing matrix */    PANEL->pmat->A,
+            /* IN  (int) Leading dimension of trailing matrix 
+               (number of doubles to advance from block column 
+               n to block column n+1. (i.e., from column n to
+               column n+M_SUB) */                                lda*M_SUB,
+            /* IN  (uint) Starting block matrix row offset */    ii,
+            /* IN  (uint) Starting block matrix column offset */ jj+j1,
+            /* INOUT (double *) panel to hold result or NULL  */ NULL,
+            /* IN  (int) leading dimension of panel */           0,
+            /* IN  (unsigned long long *) Completion variable */ &completion_flags);
+
+    } else {
+
+        hpl_accel_dgemm_CL_R_B_CL(
+            /* IN  (int) Number of rows in a, c, and panel */    mp,
+            /* IN  (int) Number of cols in b, c, and panel */    nn,
+            /* IN  (int) Number of cols in a and rows in b */    jb,
+            /* IN  (double*) matrix a is the L2 panel */         L2,
+            /* IN  (int) Leading dimension of L2 */              ldl2,
+            /* IN  (double*) b matrix is the U panel */          PANEL->U,
+            /* IN  (int) Leading dimension U panel */            PANEL->ldu,
+            /* INOUT (double*) c matrix is trailing matrix */    PANEL->pmat->A,
+            /* IN  (int) Leading dimension of trailing matrix 
+               (number of doubles to advance from block column 
+               n to block column n+1. (i.e., from column n to
+               column n+M_SUB) */                                lda*M_SUB,
+            /* IN  (uint) Starting block matrix row offset */    ii,
+            /* IN  (uint) Starting block matrix column offset */ jj+j1,
+            /* INOUT (double *) panel to hold result or NULL  */ NULL,
+            /* IN  (int) leading dimension of panel */           0,
+            /* IN  (unsigned long long *) Completion variable */ &completion_flags);
+    }
+
+    return 0;
+}
+
+/* ---------------------------------------------------------------- */
+int HPL_accel_dgemm_wait
+(
+    HPL_T_panel    *PANEL     /* Panel structure for inputs to dgemm */
+)
+/* 
+ * Purpose
+ * =======
+ *
+ * HPL_accel_dgemm_wait blocks until the dgemm previously initiated on the
+ * accelerator by HPL_accel_dgemm_async has completed.
+ *
+ * -----------------------------------------------------------------
+ */ 
+{
+    wait_for(&completion_flags);
+
+    return 0;
+}
+
+/* ---------------------------------------------------------------- */
+int HPL_accel_dgemm
+(
+    HPL_T_panel    *PANEL,    /* Panel structure for inputs to dgemm */
+    int            j1,        /* Relative index of first column of B and C inputs */
+    int            nn         /* number of columns of rhs input in the row_buf */
+)
+/* 
+ * Purpose
+ * =======
+ *
+ * HPL_accel_dgemm performs a dgemm on the accelerator and waits for it to
+ * complete.  The lower triangular matrix input to dgemm is in the blk_buf,
+ * and the right hand sides are in the row_buf.
+ *
+ * -----------------------------------------------------------------
+ */ 
+{
+    HPL_accel_dgemm_async(PANEL, j1, nn);
+
+    HPL_accel_dgemm_wait(PANEL);
+
+    return 0;
+}
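+
+/* A typical call pattern (a sketch, not code from this patch): start the
+   trailing-matrix update asynchronously, overlap host-side work such as
+   communication, then block on completion:
+
+       HPL_accel_dgemm_async(PANEL, j1, nn);
+       ...  host-side work  ...
+       HPL_accel_dgemm_wait(PANEL);
+*/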
+
+/* ---------------------------------------------------------------- */
+void HPL_accel_dgemmCL
+(
+    int            m,         /* Number of rows in a and c */
+    int            n,         /* Number of cols in b and c */
+    int            k,         /* Number of cols in a and rows in b */
+    const double  *a,         /* Column ordered a matrix */
+    int            lda,       /* Leading dimension of a matrix */
+    const double  *b,         /* Column ordered b matrix */
+    int            ldb,       /* Leading dimension of b matrix */
+    double        *c,         /* Column ordered c matrix */
+    int            ldc        /* Leading dimension of c matrix */
+)
+/* 
+ * Purpose
+ * =======
+ *
+ * HPL_accel_dgemmCL performs a dgemm on the accelerator.
+ *
+ * -----------------------------------------------------------------
+ */ 
+{
+    if ( k<=0 || n<=0 ) { return; }
+
+    /* The input panels may reside in the matrix, and thus could cross a 4GB 
+       boundary for large problem sizes.  When this happens, we copy the
+       a panels over to one of our extra panel buffers. */
+
+    int i;
+    double *a_panel = (double *)a;
+    double *c_panel = (double *)c;
+
+    if ( CROSSES_4GB_BOUNDARY(a, (k*lda-1)*sizeof(double)) ) {
+        a_panel = pan_buf[0];
+        for (i=0; i<k; i++) {
+            memcpy(a_panel+i*lda, a+i*lda, m*sizeof(double));
+        }
+    }
+
+    if ( CROSSES_4GB_BOUNDARY(c, (n*ldc-1)*sizeof(double)) ) {
+
+        /* Try splitting C so that one half or the other can avoid the copy */
+        if (n >= 8) {
+            int n1 = n / 2, n2 = n - n1;
+            HPL_accel_dgemmCL(m, n1, k, a, lda, b, ldb, c, ldc);
+            HPL_accel_dgemmCL(m, n2, k, a, lda, b+n1*ldb, ldb, c+n1*ldc, ldc);
+            return;
+        }
+
+        c_panel = pan_buf[1];
+        for (i=0; i<n; i++) {
+            memcpy(c_panel+i*ldc, c+i*ldc, m*sizeof(double));
+        }
+    }
+
+    hpl_accel_dgemm_C_C_C(m, n, k, a_panel, lda, b, ldb, c_panel, ldc, &completion_flags);
+    wait_for(&completion_flags);
+
+    if ( c_panel != c ) {
+        for (i=0; i<n; i++) {
+            memcpy(c+i*ldc, c_panel+i*ldc, m*sizeof(double));
+        }
+    }
+}
Index: src/accel/HPL_accel_dtrsm.c
===================================================================
RCS file: src/accel/HPL_accel_dtrsm.c
diff -N src/accel/HPL_accel_dtrsm.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/accel/HPL_accel_dtrsm.c	20 Aug 2008 03:57:54 -0000	1.6
@@ -0,0 +1,107 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#include "hpl.h"
+
+#include "HPL_accel_private.h"
+
+/* ---------------------------------------------------------------- */
+int HPL_accel_dtrsm
+(
+    HPL_T_panel    *PANEL,    /* Panel structure for inputs to dtrsm */
+    int            j1,        /* Relative index of first column of the panel */
+    int            nn         /* number of columns of rhs input in the row_buf */
+)
+/* 
+ * Purpose
+ * =======
+ *
+ * HPL_accel_dtrsm initiates a dtrsm on the accelerator.  The lower triangular
+ * matrix input to dtrsm is in the blk_buf, and the right hand sides are in
+ * the row_buf.
+ *
+ * -----------------------------------------------------------------
+ */ 
+{
+    int jb = PANEL->jb;
+    int lda = PANEL->lda;
+    int ldl1 = PANEL->jb;
+    int ldu = PANEL->ldu;
+
+    if ( panel_prep >= PANEL->jj ) {
+        if ( PANEL->U == NULL ) {
+            HPL_dtrsm( HplColumnMajor, HplLeft, HplLower, HplNoTrans,
+                       HplUnit, jb, nn, HPL_rone, PANEL->L1, jb, Mptr(PANEL->A, 0, j1, PANEL->lda), lda );
+        } else {
+            HPL_dtrsm( HplColumnMajor, HplRight, HplLower, HplTrans,
+                       HplUnit, nn, jb, HPL_rone, PANEL->L1, jb, PANEL->U+j1, ldu );
+
+            if ( PANEL->grid->myrow == PANEL->prow ) {
+                HPL_dlatcpy( jb, nn, PANEL->U+j1, ldu, Mptr(PANEL->A, 0, j1, PANEL->lda), lda );
+            }
+        }
+        return 0;
+    }
+
+    TRACE_PRINT("%s [%d] entry ii=%d jj=%d jb=%d nn=%d\n", 
+                __FUNCTION__, PANEL->grid->iam, PANEL->ii, PANEL->jj, PANEL->jb, nn);
+
+    unsigned long long completion_flags = 0;
+
+    /* If this processor is in the row of processors that holds the
+       top row of the trailing matrix, set the C matrix pointer and ldc
+       so that the dtrsm result goes straight into the matrix.  Note that
+       DGEMM must be sure to use the data in the matrix rather than out of
+       the U buffer in this case. */
+
+    if ( PANEL->U == NULL ) {
+
+        /* When there is no U buffer, that means that the rows are stored in the matrix. */
+
+        hpl_accel_dtrsm_CL_B(
+            /* IN  (int) Number of rows in L1 matrix and rhs matrix b */ jb,
+            /* IN  (int) Number of columns in rhs matrix b */    nn,
+            /* IN  (double*) L1 matrix */                        PANEL->L1,
+            /* IN  (int) Leading dimension of L1 */              ldl1,
+            /* INOUT (double*) c matrix - alternate result area */ PANEL->pmat->A,
+            /* IN  (int) Leading dimension of c matrix */        lda*M_SUB,
+            /* IN  (int) Block row */                            PANEL->ii,
+            /* IN  (int) Block col */                            PANEL->jj+j1,
+            /* IN  (unsigned long long *) Completion variable */ &completion_flags);
+
+    } else if ( PANEL->grid->myrow == PANEL->prow ) {
+
+        hpl_accel_dtrsm_CL_R_B(
+            /* IN  (int) Number of rows in L1 matrix and rhs matrix b */ jb,
+            /* IN  (int) Number of columns in rhs matrix b */    nn,
+            /* IN  (double*) L1 matrix */                        PANEL->L1,
+            /* IN  (int) Leading dimension of L1 */              ldl1,
+            /* INOUT (double*) rhs matrix */                     PANEL->U+j1,
+            /* IN  (int) Leading dimension of rhs matrix */      ldu,
+            /* INOUT (double*) c matrix - alternate result area */ PANEL->pmat->A,
+            /* IN  (int) Leading dimension of c matrix */        lda*M_SUB,
+            /* IN  (int) Block row */                            PANEL->ii,
+            /* IN  (int) Block col */                            PANEL->jj+j1,
+            /* IN  (unsigned long long *) Completion variable */ &completion_flags);
+
+    } else {
+        hpl_accel_dtrsm_CL_R_B(
+            /* IN  (int) Number of rows in L1 matrix and rhs matrix b */ jb,
+            /* IN  (int) Number of columns in rhs matrix b */    nn,
+            /* IN  (double*) L1 matrix */                        PANEL->L1,
+            /* IN  (int) Leading dimension of L1 */              ldl1,
+            /* INOUT (double*) rhs matrix */                     PANEL->U+j1,
+            /* IN  (int) Leading dimension of rhs matrix */      ldu,
+            /* INOUT (double*) c matrix - alternate result area */ NULL,
+            /* IN  (int) Leading dimension of c matrix */        0,
+            /* IN  (int) Block row */                            0,
+            /* IN  (int) Block col */                            0,
+            /* IN  (unsigned long long *) Completion variable */ &completion_flags);
+    }
+
+    wait_for(&completion_flags);
+
+    return 0;
+}
Index: src/accel/HPL_accel_exit.c
===================================================================
RCS file: src/accel/HPL_accel_exit.c
diff -N src/accel/HPL_accel_exit.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/accel/HPL_accel_exit.c	20 Aug 2008 03:57:54 -0000	1.2
@@ -0,0 +1,17 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007                               */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#include "hpl.h"
+#include "HPL_accel_private.h"
+
+/* ---------------------------------------------------------------- */
+/* HPL_accel_exit                                                   */
+/* ---------------------------------------------------------------- */
+int HPL_accel_exit(int my_rank)
+{
+    TRACE_PRINT("\nHPL_accel_exit: Done!\n");
+
+    return(0);
+}
Index: src/accel/HPL_accel_init.c
===================================================================
RCS file: src/accel/HPL_accel_init.c
diff -N src/accel/HPL_accel_init.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/accel/HPL_accel_init.c	20 Aug 2008 03:57:54 -0000	1.4
@@ -0,0 +1,31 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#include "hpl.h"
+
+#include "HPL_accel_private.h"
+
+/* ---------------------------------------------------------------- */
+/* HPL_accel_init                                                   */
+/* ---------------------------------------------------------------- */
+int HPL_accel_init(int my_rank)
+{
+    int lib_rc;
+
+    /* Start the accelerator process */
+    TRACE_PRINT("\nHPL_accel_init[%d]: Starting accelerator process.\n", my_rank);
+
+    /* Initialize the accelerator library */
+    lib_rc = hpl_accel_init();
+    if (lib_rc != HPL_ACCEL_INIT_SUCCESS) {
+        fprintf(stdout,"\nHPL_accel_init[%d]: hpl_accel_init failed.\n", my_rank);
+        fflush(stdout);
+        return -1;
+    }
+
+    TRACE_PRINT("\nHPL_accel_init[%d]: The accelerator process started.\n", my_rank);
+
+    return 0;
+}
Index: src/accel/HPL_accel_panget.c
===================================================================
RCS file: src/accel/HPL_accel_panget.c
diff -N src/accel/HPL_accel_panget.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/accel/HPL_accel_panget.c	20 Aug 2008 03:57:54 -0000	1.4
@@ -0,0 +1,76 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#include "hpl.h"
+
+#include "HPL_accel_private.h"
+
+/* ---------------------------------------------------------------- */
+int HPL_accel_pangetL
+(
+    HPL_T_panel    *PANEL  /* Panel structure specifying the panel to reformat */
+)
+/* 
+ * Purpose
+ * =======
+ *
+ * HPL_accel_pangetL reformats the specified panel from blocked-row
+ * format back to column-major format.  This operation is performed
+ * prior to panel factorization on the host.
+ *
+ * ----------------------------------------------------------------
+ */ 
+{
+    int jj = PANEL->jj;      /* local column index (zero based) of first column of panel */
+    int lda = PANEL->lda;    /* local leading dimension of matrix */
+
+    /* If this node does not contain any rows of the matrix, then just return */
+    if (PANEL->pmat->mp <= 0) { return 0; }
+
+    /* If this panel has already been prepped, then just return */
+    if ( panel_prep >= jj ) { return 0; }
+
+#ifdef HPL_DETAILED_TIMING
+    HPL_ptimer( HPL_TIMING_ACCEL_OVERHEAD );
+#endif
+
+    TRACE_PRINT("%s [%d] entry ii=%d jj=%d jb=%d\n", 
+                __FUNCTION__, my_rank, PANEL->ii, PANEL->jj, PANEL->jb);
+
+    /* Prepare the next block column to be factored, which means convert it
+       to column major. We do this using our special extra panel buffer. */
+
+    int reform_cols = Mmin(PANEL->nq, PANEL->nb);
+
+    /* Block column in matrix to receive prepped panel */
+    double *panel = PANEL->pmat->A+INDEX_BLK(0, jj, lda*M_SUB); 
+    double *pbuf = pan_buf[0];
+
+    unsigned long long completion_flags = 0;
+
+    hpl_accel_reform_panel_B_to_CL(
+        /* IN  (int) Number of rows of matrix a to copy to panel */ PANEL->pmat->mp,
+        /* IN  (int) Number of columns of matrix a to copy to panel */ reform_cols,
+        /* OUT (double*) Panel buffer to receive the reformatted panel */ pbuf,
+        /* IN  (int) Leading dimension of panel */ lda,
+        /* IN  (double*) Block formatted matrix */ panel,
+        /* IN  (int) Leading dimension of matrix a. The number of doubles to
+           advance from block column n to block column n+1. (i.e., from 
+           column n to column n+M_SUB) */          M_SUB*lda,
+        /* IN  (unsigned long long *) Completion variable */ &completion_flags);
+
+    wait_for(&completion_flags);
+
+    memcpy(panel, pbuf, reform_cols*lda*sizeof(double));
+
+    /* Columns up to panel_prep have been converted to column major */
+    panel_prep += reform_cols;
+
+#ifdef HPL_DETAILED_TIMING
+    HPL_ptimer( HPL_TIMING_ACCEL_OVERHEAD );
+#endif
+
+    return 0;
+}
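
For reference, the sketch below shows the kind of layout conversion hpl_accel_reform_panel_B_to_CL performs, on the host and in plain C. The idx_blk formula is an assumption inferred from the "column n to column n+M_SUB" comment above (M_SUB-wide block columns, with the M_SUB values of each row stored contiguously inside a block column); the authoritative definition is INDEX_BLK in lib/hpl_accel.h.

#include <assert.h>

#define M_SUB 4

/* Assumed blocked layout: block column j/M_SUB starts at (j/M_SUB)*M_SUB*ld
   doubles, and row i occupies M_SUB consecutive doubles within it. */
static int idx_blk(int i, int j, int ld)
{
    return (j / M_SUB) * (M_SUB * ld) + i * M_SUB + (j % M_SUB);
}

/* Plain column-major indexing with leading dimension ld. */
static int idx_col(int i, int j, int ld)
{
    return j * ld + i;
}

/* Host-side equivalent of reformatting one nb-wide panel from the blocked
   format back to column-major, as HPL_accel_pangetL requests. */
static void reform_panel_b_to_c(int mp, int nb, int ld,
                                const double *blk, double *col)
{
    int i, j;
    for (j = 0; j < nb; j++)
        for (i = 0; i < mp; i++)
            col[idx_col(i, j, ld)] = blk[idx_blk(i, j, ld)];
}

int main(void)
{
    enum { MP = 8, NB = 4 };
    double blk[MP * NB], col[MP * NB];
    int i, j;
    for (j = 0; j < NB; j++)
        for (i = 0; i < MP; i++)
            blk[idx_blk(i, j, MP)] = 100.0 * i + j;
    reform_panel_b_to_c(MP, NB, MP, blk, col);
    for (j = 0; j < NB; j++)
        for (i = 0; i < MP; i++)
            assert(col[idx_col(i, j, MP)] == 100.0 * i + j);
    return 0;
}
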
Index: src/accel/HPL_accel_panput.c
===================================================================
RCS file: src/accel/HPL_accel_panput.c
diff -N src/accel/HPL_accel_panput.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/accel/HPL_accel_panput.c	20 Aug 2008 03:57:54 -0000	1.4
@@ -0,0 +1,53 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#include "hpl.h"
+
+#include "HPL_accel_private.h"
+
+/* ---------------------------------------------------------------- */
+int HPL_accel_panputU
+(
+    HPL_T_panel    *PANEL,    /* Panel structure containing data to be
+                                 sent to the accelerator */
+    double         *data,     /* area containing the data (in row-major format)
+                                 to be copied to the row buffer */
+    int            ld,        /* leading dimension of data (amount to advance 
+                                 from row i to row i+1) */
+    int            *rows,     /* array of local row indices to be copied */
+    int            nn         /* number of columns of each row to be copied to accel */
+)
+/* 
+ * Purpose
+ * =======
+ *
+ * HPL_accel_panputU copies data from host storage in row-major format to 
+ * the row buffer on the accelerator, also in row-major format.  No
+ * endianness conversions are done (the data is already big-endian).
+ *
+ * -----------------------------------------------------------------
+ */ 
+{
+    if (PANEL->mp<=0) { return 0; }
+
+    TRACE_PRINT("%s [%d] entry ii=%d jj=%d jb=%d nn=%d\n", 
+                __FUNCTION__, PANEL->grid->iam, PANEL->ii, PANEL->jj, PANEL->jb, nn);
+
+    unsigned long long completion_flags = 0;
+
+    hpl_accel_copy_rows_R_to_R(
+        /* (int) number of rows to copy */                PANEL->jb,
+        /* (int) number of columns to copy in each row */ nn,
+        /* (double *) row-ordered source matrix */        data,
+        /* (int) leading dimension of source matrix */    ld,
+        /* (double *) row-ordered destination matrix */   PANEL->U,
+        /* (int) leading dimension of target matrix */    PANEL->ldu,
+        /* (int*) array of destination row indices */     rows,
+        /* (unsigned long long *) completion variable */  &completion_flags);
+
+    wait_for(&completion_flags);
+
+    return 0;
+}
Index: src/accel/HPL_accel_pgesv.c
===================================================================
RCS file: src/accel/HPL_accel_pgesv.c
diff -N src/accel/HPL_accel_pgesv.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/accel/HPL_accel_pgesv.c	20 Aug 2008 03:57:54 -0000	1.10
@@ -0,0 +1,196 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#include "hpl.h"
+
+#include "HPL_accel_private.h"
+
+/* ---------------------------------------------------------------- */
+/* Global variables                                                 */
+/* ---------------------------------------------------------------- */
+
+void *pgesv_mem;
+void *pan_buf[2];
+int panel_prep;
+
+/* ---------------------------------------------------------------- */
+int HPL_accel_pgesv_init
+(
+    HPL_T_grid *GRID,
+    HPL_T_palg *ALGO,
+    HPL_T_pmat *A
+)
+/* 
+ * Purpose
+ * =======
+ *
+ * HPL_accel_pgesv_init prepares the accelerator to participate in a pgesv
+ * computation.
+ *
+ * Arguments
+ * =========
+ *
+ * GRID    (local input)                 HPL_T_grid *
+ *         On entry,  GRID  points  to the data structure containing the
+ *         process grid information.
+ *
+ * ALGO    (local input)                 HPL_T_palg *
+ *         On entry, ALGO points to a data structure containing the 
+ *         algorithm parameters.
+ *
+ * A       (local input/output)          HPL_T_pmat *
+ *         On entry, A points to the data structure containing the local
+ *         array information.
+ *
+ */
+{
+    /* Initialize panel_prep to indicate that no panels have been prepped */
+    panel_prep = -1;
+
+    /* If this node does not contain any rows of the matrix, then just return */
+    if (A->mp <= 0) { return 0; }
+
+    TRACE_PRINT("%s [%d] entry myprow=%d mypcol=%d mp=%d nq=%d\n", 
+                __FUNCTION__, GRID->iam, GRID->myrow, GRID->mycol, A->mp, A->nq);
+
+#ifdef HPL_DETAILED_TIMING
+    HPL_ptimer( HPL_TIMING_ACCEL_OVERHEAD );
+#endif
+
+    /* Create a scratch area for matrix reformatting */
+
+    /* use size_t: ld * nb * 8 can exceed INT_MAX for large problems */
+    size_t pan_size = (size_t)A->ld * A->nb * sizeof(double);
+    size_t alloc_size = 3 * (pan_size + PAGESIZE);
+
+#ifdef HPL_USE_HUGE_PAGES
+    pgesv_mem = HPL_hpalloc(alloc_size);
+#else
+    pgesv_mem = malloc(alloc_size);
+#endif
+    assert ( pgesv_mem != NULL );
+
+    /* Reformat the matrix [a] from column-order to blocked format. */
+
+    int nq = A->nq;
+    double *mat_data = A->A;
+
+    /* Pad out the number of columns to be a multiple of M_SUB */
+    nq = ((nq+M_SUB-1)/M_SUB)*M_SUB;
+
+    /* If this processor is in the first column of processors, don't reformat
+       the first block column since it needs to stay in column-major order for
+       panel factorization. */
+
+    if (GRID->mycol == 0) {
+        nq -= A->nb;
+        mat_data = &mat_data[INDEX_BLK(0,A->nb,M_SUB*A->ld)];
+
+        /* Columns up to panel_prep have been converted to column major */
+        panel_prep += A->nb;
+    }
+
+    if (nq > 0) {
+        double *scratch_buf = (double *) FIX_4GB_BOUNDARY_CROSSING(
+                                             ALIGN_PTR(pgesv_mem, PAGESIZE),
+                                             pan_size);
+        unsigned long long completion_flags;
+
+        hpl_accel_reform_matrix_CL_to_B(
+            /* IN  (int) Number of rows in matrix a */ A->mp,
+            /* IN  (int) Number of cols in matrix a */ nq,
+            /* IN  (double*) Matrix data in column-ordered, big-endian format
+               OUT           Matrix data in blocked, big-endian format */ mat_data,
+            /* IN  (int) Leading dimension of matrix a.  The number of doubles to
+               advance from column n to column n+1. */    A->ld,
+            /* IN  (double*) Scratch area of at least 64*mp elems */ scratch_buf,
+            /* IN  (int) size of scratch_buf in doubles */ pan_size/sizeof(double),
+            /* IN  (unsigned long long *) Completion variable */ &completion_flags);
+
+        wait_for(&completion_flags);
+    }
+
+    /* Carve up allocated storage into buffers for pgesv computation */
+
+    void *free_area = pgesv_mem;
+
+    int i;
+    for (i = 0; i<2; i++) {
+        pan_buf[i] = (double *) FIX_4GB_BOUNDARY_CROSSING(
+                                    ALIGN_PTR(free_area, PAGESIZE),
+                                    pan_size);
+        free_area = (void*)pan_buf[i] + pan_size;
+    }
+
+#ifdef HPL_DETAILED_TIMING
+    HPL_ptimer( HPL_TIMING_ACCEL_OVERHEAD );
+#endif
+
+    return 0;
+}
+
+
+/* ---------------------------------------------------------------- */
+int HPL_accel_pgesv_fini
+(
+    HPL_T_grid *GRID,
+    HPL_T_palg *ALGO,
+    HPL_T_pmat *A
+)
+/* 
+ * Purpose
+ * =======
+ *
+ * HPL_accel_pgesv_fini cleans up the accelerator at the completion of a
+ * pgesv computation.
+ *
+ * Arguments
+ * =========
+ *
+ * GRID    (local input)                 HPL_T_grid *
+ *         On entry,  GRID  points  to the data structure containing the
+ *         process grid information.
+ *
+ * ALGO    (local input)                 HPL_T_palg *
+ *         On entry, ALGO points to a data structure containing the 
+ *         algorithm parameters.
+ *
+ * A       (local input/output)          HPL_T_pmat *
+ *         On entry, A points to the data structure containing the local
+ *         array information.
+ */
+{
+    /* If this node does not contain any rows of the matrix, then just return */
+    if (A->mp <= 0) { return 0; }
+
+    TRACE_PRINT("%s [%d] entry\n", __FUNCTION__, GRID->iam);
+
+#ifdef HPL_DETAILED_TIMING
+    HPL_ptimer( HPL_TIMING_ACCEL_OVERHEAD );
+#endif
+
+    /* A special case: when the last block column contains only the 
+       right-hand-side vector, we must convert this from row-blocked 
+       format to column major here. */
+
+    if ( (A->nq % A->nb) == 1) {
+        int x;
+        for (x=0; x<A->mp; x++) {
+            A->A[INDEX_COL(x, A->nq-1, A->ld)] = A->A[INDEX_BLK(x, A->nq-1, M_SUB*A->ld)];
+        }
+    }
+
+#ifdef HPL_USE_HUGE_PAGES
+    HPL_hpfree(pgesv_mem);
+#else
+    free(pgesv_mem);
+#endif
+    pgesv_mem = NULL;
+
+#ifdef HPL_DETAILED_TIMING
+    HPL_ptimer( HPL_TIMING_ACCEL_OVERHEAD );
+#endif
+
+    return 0;
+}
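
The round-up padding used above for nq (and later for the ldu and uwork computations in the host patches) is a recurring idiom in these changes; a minimal standalone check, independent of the HPL sources:

#include <assert.h>

/* Round n up to the next multiple of b (b > 0); the same idiom as
   ((nq+M_SUB-1)/M_SUB)*M_SUB above and the ALIGN_PTR macro in
   HPL_accel_private.h. */
static int round_up(int n, int b)
{
    return ((n + b - 1) / b) * b;
}

int main(void)
{
    assert(round_up(0, 4) == 0);
    assert(round_up(1, 4) == 4);
    assert(round_up(4, 4) == 4);        /* already aligned: unchanged */
    assert(round_up(1021, 64) == 1024);
    return 0;
}
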
Index: src/accel/HPL_accel_private.h
===================================================================
RCS file: src/accel/HPL_accel_private.h
diff -N src/accel/HPL_accel_private.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/accel/HPL_accel_private.h	20 Aug 2008 03:57:54 -0000	1.6
@@ -0,0 +1,49 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#ifndef HPL_ACCEL_PRIVATE_H
+#define HPL_ACCEL_PRIVATE_H
+
+/* ---------------------------------------------------------------- */
+/* Include files                                                    */
+/* ---------------------------------------------------------------- */
+
+#include <assert.h>
+#include "lib/hpl_accel.h"    /* M_SUB */
+#include <ppu_intrinsics.h>   /* __mftb */
+
+/* ---------------------------------------------------------------- */
+/* Global variables                                                 */
+/* ---------------------------------------------------------------- */
+
+extern void *pan_buf[2];
+extern int panel_prep;
+
+/* ---------------------------------------------------------------- */
+/* Macros                                                           */
+/* ---------------------------------------------------------------- */
+
+//#include <stdio.h>            /* printf */
+
+#define TRACE_PRINT(s, ...)
+//#define TRACE_PRINT  printf
+
+#define PAGESIZE (4096)
+
+#define ALIGN_PTR(p,b)  ( ( ((size_t)(p)+(b)-1) / (b) ) * (b) )
+
+/* ---------------------------------------------------------------- */
+/* Inline functions                                                 */
+/* ---------------------------------------------------------------- */
+
+/* Spin until the accelerator library clears all bits of the completion
+   variable; the volatile access forces a fresh read on every pass. */
+static inline void wait_for(unsigned long long *completion_flags)
+{
+    volatile unsigned long long *flagptr = completion_flags;
+
+    while ( *flagptr != 0 ) {}
+}
+
+
+#endif /* HPL_ACCEL_PRIVATE_H */
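
The completion-variable handshake that wait_for implements can be exercised on the host alone. In the sketch below a pthread stands in for the accelerator (an assumption for illustration: in this code the producer is the hpl_accel library, which presumably marks the flag busy when a request is queued and clears it when the SPE work finishes). Build with -pthread.

#include <pthread.h>
#include <unistd.h>

static void wait_for_flags(unsigned long long *completion_flags)
{
    /* same spin loop as wait_for above */
    volatile unsigned long long *flagptr = completion_flags;
    while (*flagptr != 0) {}
}

static void *fake_accel(void *arg)
{
    unsigned long long *flag = arg;
    usleep(1000);             /* pretend to do DMA/compute work */
    __sync_synchronize();     /* make the work visible before signalling */
    *flag = 0;                /* signal completion */
    return NULL;
}

int main(void)
{
    unsigned long long completion_flags = 1;  /* mark operation in flight */
    pthread_t tid;
    pthread_create(&tid, NULL, fake_accel, &completion_flags);
    wait_for_flags(&completion_flags);        /* spin until cleared */
    pthread_join(tid, NULL);
    return 0;
}
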
Index: src/accel/HPL_accel_rowget.c
===================================================================
RCS file: src/accel/HPL_accel_rowget.c
diff -N src/accel/HPL_accel_rowget.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/accel/HPL_accel_rowget.c	20 Aug 2008 03:57:54 -0000	1.3
@@ -0,0 +1,85 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#include "hpl.h"
+#include "HPL_accel_private.h"
+
+/* ---------------------------------------------------------------- */
+int HPL_accel_rowget
+(
+    HPL_T_panel    *PANEL,    /* Panel structure for current panel */
+    double         *data,     /* area to receive the data from the accelerator matrix (in row-major format) */
+    int            ld,        /* leading dimension of data (amount to advance from row i to row i+1) */
+    int            numrows,   /* number of rows to be copied from the accelerator matrix */
+    int            *rows,     /* array of local row indices to be copied */
+    int            jj,        /* local column index of first column to be copied into data */
+    int            nn         /* number of columns of each row to be copied into data */
+)
+/* 
+ * Purpose
+ * =======
+ *
+ * HPL_accel_rowget copies data from a block-formatted matrix into a buffer in 
+ * row-major format.
+ *
+ * -----------------------------------------------------------------
+ */ 
+{
+    int i = 0;
+
+    if (numrows<=0 || nn <=0) { return 0; }
+
+    TRACE_PRINT("%s [%d] entry jj=%d numrows=%d nn=%d\n", 
+                __FUNCTION__, PANEL->grid->iam, jj, numrows, nn);
+
+    if ( panel_prep >= jj ) {
+        unsigned int x;
+
+        for (i=0; i<numrows; i++) {
+            if (rows[i]>=0) {
+                double *dest = data + i*ld;
+                for (x=0; x<(unsigned int)nn; x++) {
+                    dest[x] = PANEL->pmat->A[INDEX_COL(rows[i], jj+x, PANEL->lda)];
+                }
+            }
+        }
+
+        return 0;
+    }
+
+    while (i<numrows)
+    {
+        if (rows[i] < 0) {
+            i++;
+            continue;
+        }
+
+        int j = 0;
+        while ( (j<16) && ((i+j)<numrows) && (rows[i+j]>=0)) {
+            j++;
+        }
+
+        double *dest = data + i*ld;
+
+        unsigned long long completion_flags;
+
+        hpl_accel_reform_rows_B_to_R(
+            /* IN  (int) Number of rows to copy */                     j,
+            /* IN  (int) Number of values (doubles) per row to copy */ nn,
+            /* OUT (double*) Buffer to receive row-formatted data */   dest,
+            /* IN  (int) Leading dimension of the row buffer */        ld,
+            /* IN  (double*) block-formatted matrix */                 PANEL->pmat->A,
+            /* IN  (int) Leading dimension of matrix [a] */            M_SUB*PANEL->lda,
+            /* IN  (int*) Array of row indices */                      &rows[i],
+            /* IN  (int) Starting [a] block matrix column offset */    jj,
+            /* IN  (unsigned long long *) Completion variable */ &completion_flags);
+
+        wait_for(&completion_flags);
+
+        i+=j;
+    }
+
+    return 0;
+}
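
The while loop above batches accelerator calls over runs of valid indices: up to 16 consecutive non-negative entries of rows[] go into one hpl_accel_reform_rows_B_to_R request, and negative "hole" entries are skipped. A standalone trace of that scan (illustrative only):

#include <stdio.h>

int main(void)
{
    int rows[] = { 3, 5, 9, -1, -1, 2, 4, 6, 8 };
    int numrows = (int)(sizeof(rows) / sizeof(rows[0]));
    int i = 0;
    while (i < numrows) {
        if (rows[i] < 0) { i++; continue; }     /* skip holes */
        int j = 0;
        while (j < 16 && (i + j) < numrows && rows[i + j] >= 0)
            j++;                                /* extend the run */
        printf("batch of %d rows starting at index %d\n", j, i);
        i += j;
    }
    return 0;   /* prints batches of 3 (at 0) and 4 (at 5) */
}
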
Index: src/accel/HPL_accel_rowput.c
===================================================================
RCS file: src/accel/HPL_accel_rowput.c
diff -N src/accel/HPL_accel_rowput.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/accel/HPL_accel_rowput.c	20 Aug 2008 03:57:54 -0000	1.3
@@ -0,0 +1,84 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#include "hpl.h"
+#include "HPL_accel_private.h"
+
+/* ---------------------------------------------------------------- */
+int HPL_accel_rowput
+(
+    HPL_T_panel    *PANEL,    /* Panel structure for current panel */
+    double         *data,     /* area containing the data (in row-major format) to be copied to the accel */
+    int            ld,        /* leading dimension of data (amount to advance from row i to row i+1) */
+    int            numrows,   /* number of rows to be copied to the accelerator matrix */
+    int            *rows,     /* array of local row indices to be copied */
+    int            jj,        /* local column index of first column to be copied to accel */
+    int            nn         /* number of columns of each row to be copied to accel */
+)
+/* 
+ * Purpose
+ * =======
+ *
+ * HPL_accel_rowput copies data from a buffer in row-major format to a block-formatted matrix.
+ *
+ * -----------------------------------------------------------------
+ */ 
+{
+    int i = 0;
+
+    if (numrows<=0 || nn <=0) { return 0; }
+
+    TRACE_PRINT("%s [%d] entry jj=%d numrows=%d nn=%d\n", 
+                __FUNCTION__, PANEL->grid->iam, jj, numrows, nn);
+
+    if ( panel_prep >= jj ) {
+        unsigned int x;
+
+        for (i=0; i<numrows; i++) {
+            if (rows[i]>=0) {
+                double *src = data + i*ld;
+                for (x=0; x<(unsigned int)nn; x++) {
+                    PANEL->pmat->A[INDEX_COL(rows[i], jj+x, PANEL->lda)] = src[x];
+                }
+            }
+        }
+
+        return 0;
+    }
+
+    while (i<numrows)
+    {
+        if (rows[i] < 0) {
+            i++;
+            continue;
+        }
+
+        int j = 0;
+        while ( (j<16) && ((i+j)<numrows) && (rows[i+j]>=0)) {
+            j++;
+        }
+
+        double *src = data + i*ld;
+
+        unsigned long long completion_flags;
+
+        hpl_accel_reform_rows_R_to_B(
+            /* IN  (int) Number of rows to copy */                     j,
+            /* IN  (int) Number of values (doubles) per row to copy */ nn, 
+            /* IN  (double*) Buffer containing row-formatted data */   src,
+            /* IN  (int) Leading dimension of the row buffer */        ld, 
+            /* OUT  (double*) block formatted matrix */                PANEL->pmat->A, 
+            /* IN  (int) Leading dimension of matrix [a] */            M_SUB*PANEL->lda, 
+            /* IN  (int*) Array of row indices */                      &rows[i],
+            /* IN  (int) Starting [a] block matrix column offset */    jj,
+            /* IN  (unsigned long long *) Completion variable */ &completion_flags);
+
+        wait_for(&completion_flags);
+
+        i += j;
+    }
+
+    return 0;
+}
Index: src/accel/HPL_accel_swap.c
===================================================================
RCS file: src/accel/HPL_accel_swap.c
diff -N src/accel/HPL_accel_swap.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/accel/HPL_accel_swap.c	20 Aug 2008 03:57:54 -0000	1.8
@@ -0,0 +1,301 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2008                               */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#include "hpl.h"
+#include "HPL_accel_private.h"
+
+/* ---------------------------------------------------------------- */
+int HPL_accel_swap00N
+(
+    HPL_T_panel    *PANEL,    /* Panel structure for current panel */
+    const int      *IPIV,     /* Pivot vector */
+    const int       j1,       /* Relative index of first column of matrix */
+    const int       nn        /* local number of columns of each row to be swapped */
+)
+/* 
+ * Purpose
+ * =======
+ *
+ * HPL_accel_swap00N swaps rows in a block-row formatted matrix.
+ *
+ * -----------------------------------------------------------------
+ */ 
+{
+    if (nn <=0) { return 0; }
+
+    if ( panel_prep >= PANEL->jj ) {
+        HPL_dlaswp00N( PANEL->jb, nn, Mptr(PANEL->A, 0, j1, PANEL->lda), PANEL->lda, IPIV );
+        return 0;
+    }
+
+    TRACE_PRINT("%s [%d] entry ii=%d jj=%d j1=%d nn=%d\n", 
+                __FUNCTION__, PANEL->grid->iam, PANEL->ii, PANEL->jj, j1, nn);
+
+    int row_indx = INDEX_BLK(PANEL->ii, 0, M_SUB*PANEL->lda);
+
+    unsigned long long completion_flags = 0;
+
+    hpl_accel_swap_rows_B_to_B(
+        /* IN    (int) number of rows to swap */                 PANEL->jb,
+        /* IN    (int) number of columns to swap in each row */  nn,
+        /* INOUT (double*) block-formatted matrix */             PANEL->pmat->A+row_indx,
+        /* IN    (int) leading dimension for matrix */           M_SUB*PANEL->lda,
+        /* IN    (int) array of row indices */                   (int *)IPIV,
+        /* IN    (int) starting column */                        PANEL->jj+j1,
+        /* IN    (unsigned long long *) Completion variable */   &completion_flags);
+
+    wait_for(&completion_flags);
+
+    return 0;
+}
+
+/* ---------------------------------------------------------------- */
+int HPL_accel_swap01T
+(
+    HPL_T_panel    *PANEL,    /* Panel structure for current panel */
+    const int      *LINDXA,   /* row indices of source rows in A */
+    const int      *LINDXAU,  /* row indices of dest rows in A or U */
+    const int       numrows,  /* number of rows to copy */
+    const int       nn        /* local number of columns of each row to be copied */
+)
+/* 
+ * Purpose
+ * =======
+ *
+ * HPL_accel_swap01T copies rows from A into A and into U.
+ *
+ * -----------------------------------------------------------------
+ */ 
+{
+    int i, y;
+
+    if (numrows <= 0 || nn <=0) { return 0; }
+
+    TRACE_PRINT("%s [%d] entry ii=%d jj=%d numrows=%d nn=%d\n", 
+                __FUNCTION__, PANEL->grid->iam, PANEL->ii, PANEL->jj, numrows, nn);
+
+    double *A = PANEL->pmat->A;
+    int lda = M_SUB*PANEL->lda;
+    double *U = PANEL->U;
+    int ldu = PANEL->ldu;
+
+    int blk_row = PANEL->ii, blk_col = PANEL->jj;
+
+    for (i=0; i<numrows; i++) {
+        int src = LINDXA[i];  /* source row in A for copy */
+        int dst = LINDXAU[i];  /* dest row in A or U for copy */
+        if (dst >= 0) {
+            /* Copy source row into U */
+            for (y=0; y<nn; y++) {
+                U[INDEX_ROW(dst, y, ldu)] = A[INDEX_BLK(src+blk_row, y+blk_col, lda)];
+            }
+        } else {
+            dst = -dst;
+            /* Copy source row into A */
+            for (y=0; y<nn; y++) {
+                A[INDEX_BLK(dst+blk_row, y+blk_col, lda)] = A[INDEX_BLK(src+blk_row, y+blk_col, lda)];
+            }
+        }
+    }
+
+    return 0;
+}
+
+/* ---------------------------------------------------------------- */
+int HPL_accel_swap02N
+(
+    HPL_T_panel    *PANEL,    /* Panel structure for current panel */
+    const int      *LINDXA,   /* row indices of source rows in A */
+    const int      *LINDXAU,  /* row indices of dest rows in A or U */
+    const int       numrows,  /* number of rows to copy */
+    double         *W0,       /* array to receive dst row info from LINDXAU */
+    double         *W,        /* workspace to receive source row data */
+    const int       ldw,       /* leading dimension of W0 and W */
+    const int       nn         /* local number of columns of each row to be copied */
+)
+/* 
+ * Purpose
+ * =======
+ *
+ * HPL_accel_swap02N packs the dest row index and src row data into workspace W.
+ *
+ * -----------------------------------------------------------------
+ */ 
+{
+    int i, y;
+
+    if (numrows <= 0 || nn <=0) { return 0; }
+
+    TRACE_PRINT("%s [%d] entry ii=%d jj=%d numrows=%d nn=%d\n", 
+                __FUNCTION__, PANEL->grid->iam, PANEL->ii, PANEL->jj, numrows, nn);
+
+    double *A = PANEL->pmat->A;
+    int lda = M_SUB*PANEL->lda;
+
+    int blk_row = PANEL->ii, blk_col = PANEL->jj;
+
+    for (i=0; i<numrows; i++) {
+        W0[INDEX_ROW(i,0,ldw)] = LINDXAU[i];
+    }
+
+    for (i=0; i<numrows; i++) {
+        /* Copy source row into W */
+        int src = LINDXA[i];  /* source row in A for copy */
+        for (y=0; y<nn; y++) {
+            W[INDEX_ROW(i, y, ldw)] = A[INDEX_BLK(src+blk_row, y+blk_col, lda)];
+        }
+    }
+
+    return 0;
+}
+
+/* ---------------------------------------------------------------- */
+int HPL_accel_swap04T
+(
+    HPL_T_panel    *PANEL,    /* Panel structure for current panel */
+    const int      *LINDXA,   /* row indices of source rows in A */
+    const int      *LINDXAU,  /* row indices of dest rows in A or U */
+    const int       numrows,  /* number of rows to copy from U to A */
+    const int       numrows2, /* number of rows to copy from W to U */
+    double         *W0,       /* array to receive dst row info from LINDXAU */
+    double         *W,        /* workspace to receive source row data */
+    const int       ldw,       /* leading dimension of W0 and W */
+    const int       nn         /* local number of columns of each row to be copied */
+)
+/* 
+ * Purpose
+ * =======
+ *
+ * HPL_accel_swap04T copies rows from U to A and replaces them with rows of W,
+ * and may copy some additional rows from W to U.
+ *
+ * -----------------------------------------------------------------
+ */ 
+{
+    int i, y;
+
+    if (numrows <= 0 || nn <=0) { return 0; }
+
+    TRACE_PRINT("%s [%d] entry ii=%d jj=%d numrows=%d nn=%d\n", 
+                __FUNCTION__, PANEL->grid->iam, PANEL->ii, PANEL->jj, numrows, nn);
+
+    double *A = PANEL->pmat->A;
+    int lda = M_SUB*PANEL->lda;
+    double *U = PANEL->U;
+    int ldu = PANEL->ldu;
+
+    int blk_row = PANEL->ii, blk_col = PANEL->jj;
+
+    for (i=0; i<numrows; i++) {
+        /* Copy row from U to A, and copy row from W to U */
+        int urow = LINDXAU[i]; /* src row in U for copy */
+        int arow = LINDXA[i];  /* dst row in A for copy */
+        for (y=0; y<nn; y++) {
+            A[INDEX_BLK(arow+blk_row, y+blk_col, lda)] = U[INDEX_ROW(urow, y, ldu)];
+            U[INDEX_ROW(urow, y, ldu)] = W[INDEX_ROW(i, y, ldw)];
+        }
+    }
+
+    for (i=numrows; i<numrows2; i++) {
+        /* Copy row from W to U */
+        int urow = W0[INDEX_ROW(i,0,ldw)]; /* dst row in U for copy */
+        for (y=0; y<nn; y++) {
+            U[INDEX_ROW(urow, y, ldu)] = W[INDEX_ROW(i, y, ldw)];
+        }
+    }
+
+    return 0;
+}
+
+/* ---------------------------------------------------------------- */
+int HPL_accel_swap05T
+(
+    HPL_T_panel    *PANEL,    /* Panel structure for current panel */
+    const int      *LINDXA,   /* row indices of source rows in A */
+    const int      *LINDXAU,  /* row indices of dest rows in A or U */
+    const int       numrows,  /* number of rows to copy */
+    const int       nn        /* local number of columns of each row to be copied */
+)
+/* 
+ * Purpose
+ * =======
+ *
+ * HPL_accel_swap05T copies rows from U into A.
+ *
+ * -----------------------------------------------------------------
+ */ 
+{
+    int i, y;
+
+    if (numrows <= 0 || nn <=0) { return 0; }
+
+    TRACE_PRINT("%s [%d] entry ii=%d jj=%d numrows=%d nn=%d\n", 
+                __FUNCTION__, PANEL->grid->iam, PANEL->ii, PANEL->jj, numrows, nn);
+
+    double *A = PANEL->pmat->A;
+    int lda = M_SUB*PANEL->lda;
+    double *U = PANEL->U;
+    int ldu = PANEL->ldu;
+
+    int blk_row = PANEL->ii, blk_col = PANEL->jj;
+
+    for (i=0; i<numrows; i++) {
+        int src = LINDXAU[i];  /* src row in U for copy */
+        int dst = LINDXA[i];  /* dst row in A for copy */
+        /* Copy source row into A */
+        for (y=0; y<nn; y++) {
+            A[INDEX_BLK(dst+blk_row, y+blk_col, lda)] = U[INDEX_ROW(src, y, ldu)];
+        }
+    }
+
+    return 0;
+}
+
+/* ---------------------------------------------------------------- */
+int HPL_accel_swap06T
+(
+    HPL_T_panel    *PANEL,    /* Panel structure for current panel */
+    const int      *LINDXA,   /* row indices of source rows in A */
+    const int       numrows,  /* number of rows to copy */
+    const int       i0,       /* Starting row index in U for copy/swap */
+    const int       nn        /* local number of columns of each row to be copied */
+)
+/* 
+ * Purpose
+ * =======
+ *
+ * HPL_accel_swap06T swaps rows of U with rows of A.
+ *
+ * -----------------------------------------------------------------
+ */ 
+{
+    int i, y;
+
+    if (numrows <= 0 || nn <=0) { return 0; }
+
+    TRACE_PRINT("%s [%d] entry ii=%d jj=%d numrows=%d nn=%d\n", 
+                __FUNCTION__, PANEL->grid->iam, PANEL->ii, PANEL->jj, numrows, nn);
+
+    double *A = PANEL->pmat->A;
+    int lda = M_SUB*PANEL->lda;
+    double *U = PANEL->U;
+    int ldu = PANEL->ldu;
+
+    int blk_row = PANEL->ii, blk_col = PANEL->jj;
+
+    for (i=0; i<numrows; i++) {
+        int src = i0 + i;     /* src row in U for copy */
+        int dst = LINDXA[i];  /* dst row in A for copy */
+        /* Copy source row into A */
+        for (y=0; y<nn; y++) {
+            double tmp = A[INDEX_BLK(dst+blk_row, y+blk_col, lda)];
+            A[INDEX_BLK(dst+blk_row, y+blk_col, lda)] = U[INDEX_ROW(src, y, ldu)];
+            U[INDEX_ROW(src, y, ldu)] = tmp;
+        }
+    }
+
+    return 0;
+}
+
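
On the already-prepped (column-major) path, HPL_accel_swap00N defers to HPL_dlaswp00N. Its effect, per the standard netlib HPL semantics (an assumption here, not taken from this patch), is to swap row i with row IPIV[i] of a column-major panel for each of the jb pivots; a minimal reference:

#include <assert.h>

/* Apply pivots the way HPL_dlaswp00N is documented to: for each row i,
   exchange rows i and ipiv[i] of the m-by-n column-major array a. */
static void laswp00n_ref(int m, int n, double *a, int lda, const int *ipiv)
{
    int i, j;
    for (i = 0; i < m; i++) {
        int p = ipiv[i];
        if (p != i) {
            for (j = 0; j < n; j++) {
                double t = a[j*lda + i];
                a[j*lda + i] = a[j*lda + p];
                a[j*lda + p] = t;
            }
        }
    }
}

int main(void)
{
    double a[4] = { 1.0, 2.0, 3.0, 4.0 };  /* one column, four rows */
    int ipiv[4] = { 2, 1, 2, 3 };          /* row 0 swaps with row 2 */
    laswp00n_ref(4, 1, a, 4, ipiv);
    assert(a[0] == 3.0 && a[2] == 1.0);
    return 0;
}
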
Index: src/auxil/HPL_hpalloc.c
===================================================================
RCS file: src/auxil/HPL_hpalloc.c
diff -N src/auxil/HPL_hpalloc.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/auxil/HPL_hpalloc.c	20 Aug 2008 03:57:54 -0000	1.2
@@ -0,0 +1,225 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007                               */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+/* This file contains source code for a simple huge-page memory allocator.
+ * The supported functions are HPL_hpalloc and HPL_hpfree. The allocator is
+ * intentionally simple, and memory can quickly become fragmented if it is
+ * used as a general-purpose memory allocator.
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+#define MINIMUM_BUFFER_SIZE	(8 + sizeof(hpalloc_hdr_t))
+
+
+typedef struct _hpalloc_hdr {
+  struct _hpalloc_hdr *next;
+  size_t size;
+} hpalloc_hdr_t;
+
+static size_t hpsize=0;			/* size of huge pages in bytes */
+static int hpseq=0;			/* sequence number for huge page allocations */
+static hpalloc_hdr_t *heap=NULL;	/* list of free buffers in the memory heap */
+
+/* get_huge_pagesize
+ * -----------------
+ * Parse /proc/meminfo for the size of huge pages.
+ */
+static size_t get_huge_pagesize()
+{
+  FILE *fp;
+  size_t size;
+  char *line = NULL;
+  size_t len = 0;
+  ssize_t chrs_read;
+
+  if ((fp = fopen("/proc/meminfo", "r"))) {
+    while ((chrs_read = getline(&line, &len, fp)) != -1) {
+      if (sscanf(line, "Hugepagesize:%ld", &size) == 1) {
+	if (strstr(line, "kB")) {
+	  size *= 1024;
+	} else if (strstr(line, "MB")) {
+	  size *= 1024*1024;
+	}
+	break;
+      }
+    }
+    if (line) free(line);
+    fclose(fp);
+  }
+  return (size);
+}
+
+/* allocate_from_heap
+ * ------------------
+ * Allocate a buffer of the specified size from the huge page memory allocator.
+ * If there is insufficient memory, NULL is returned.
+ */
+static void *allocate_from_heap(size_t size) 
+{
+  size_t *size_ptr;
+  hpalloc_hdr_t *ptr = heap;
+  hpalloc_hdr_t *prev = NULL;
+  hpalloc_hdr_t *ptr2;
+  
+  /* Scan the heap looking for a free buffer large enough
+   */
+  while (ptr) {
+    if (ptr->size >= size + sizeof(size_t)) {
+      if (ptr->size >= (size + MINIMUM_BUFFER_SIZE)) {
+	/* Split the buffer in two, allocating off the front */
+	ptr2 = (hpalloc_hdr_t *)((char *)ptr + size + sizeof(size_t));
+	if (prev) prev->next = ptr2;
+	else      heap = ptr2;
+	ptr2->next = ptr->next;
+	ptr2->size = ptr->size - (size + sizeof(size_t));
+      } else {
+        /* Allocate the entire buffer block */
+	if (prev) prev->next = ptr->next;
+	else 	  heap = ptr->next;
+	size = ptr->size - sizeof(size_t);
+      }
+      size_ptr = (size_t *)ptr;
+      *size_ptr = size;
+      return ((void *)(size_ptr+1));
+    }
+    prev = ptr;
+    ptr = ptr->next;
+  }
+  /* Failed to allocate a buffer */
+  return NULL;
+}
+
+
+/* add_to_heap
+ * -----------
+ * Add the memory buffer of <size> bytes beginning at <buffer> to the
+ * heap allocator's free list, coalescing with adjacent free blocks.
+ */
+static void add_to_heap(void *buffer, size_t size) 
+{
+  hpalloc_hdr_t *ptr = heap;
+  hpalloc_hdr_t *prev = NULL;
+  hpalloc_hdr_t *buf;
+
+  buf = (hpalloc_hdr_t *)buffer;
+  buf->size = size;
+
+  /* Scan the heap looking for the appropriate insertion point.
+   */
+  while ((ptr) && (buffer > (void *)ptr)) {
+    prev = ptr;
+    ptr = ptr->next;
+  }
+
+  /* Insert the buffer into heap's free list. Coalesce the
+   * adjacent blocks, before and after.
+   */
+  buf->next = ptr;
+  if (ptr) {
+    if (((char *)buf + buf->size) == (char *)(ptr)) {
+      /* Combine buf and ptr */
+      buf->next = ptr->next;
+      buf->size += ptr->size;
+    }
+  }
+  if (prev) {
+    if (((char *)prev + prev->size) == (char *)(buf)) {
+      /* Combine prev and buf */
+      prev->next = buf->next;
+      prev->size += buf->size;
+    } else {
+      prev->next = buf;
+    }
+  } else {
+    heap = buf;
+  }
+}
+
+#if 1
+/* Diagnostic routines 
+ */
+void dump_heap()
+{
+  hpalloc_hdr_t *ptr = heap;
+  
+  while (ptr) {
+    printf("HEAP %p %lld\n", ptr, (long long int)ptr->size);
+    ptr = ptr->next;
+  }
+}
+#endif
+
+/* HPL_hpalloc
+ * -----------
+ * Allocate a buffer of <size> bytes from the huge page memory heap.
+ */
+void* HPL_hpalloc(size_t size)
+{
+  void *ptr;
+  int fmem;
+  char memfile[100];
+  char msg[100];
+  size_t bufsize;
+
+  if (size == 0) return NULL;
+
+  /* Pad the size to the next double word
+   */
+  size = (size + 7) & ~7;
+
+  /* See if allocation is available on the free list
+   */
+  if ((ptr = allocate_from_heap(size))) return ptr;
+
+  /* Not available, expand the heap and try again 
+   */
+  if (hpsize == 0) {
+    hpsize = get_huge_pagesize();
+    if (hpsize == 0) {
+      perror("Failed locating huge page size");
+      return NULL;
+    }
+  } 
+
+  bufsize = hpsize * (((sizeof(size_t) + size) + hpsize-1) / hpsize);
+  
+  sprintf(memfile, "/huge/linpack_%d_%d", getpid(), hpseq++);
+  if ((fmem = open(memfile, O_CREAT | O_RDWR, 0755)) == -1) {
+    sprintf(msg, "Failed opening file %s", memfile);
+    perror(msg);
+    return NULL;
+  }
+  remove(memfile);
+  if ((ptr = (void *)mmap(0, bufsize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fmem, 0)) == MAP_FAILED) {
+    perror("Failed mmapping hugetlbs file");
+    return NULL;
+  }
+  add_to_heap(ptr, bufsize);
+  return (allocate_from_heap(size));
+}
+
+
+
+/* HPL_hpfree
+ * ----------
+ * Free a buffer previously allocated with HPL_hpalloc.
+ */
+void HPL_hpfree(void *ptr)
+{
+  if (ptr) {
+    ptr = (void *)(((size_t *)ptr)-1);
+    add_to_heap(ptr, *((size_t *)ptr) + sizeof(size_t));
+  }
+}
+
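
A usage sketch for the allocator above. It assumes a hugetlbfs mount at /huge (the path hard-coded in HPL_hpalloc) with enough huge pages reserved via /proc/sys/vm/nr_hugepages; when either is missing, HPL_hpalloc returns NULL and the caller must cope.

#include <stdio.h>
#include <string.h>

void *HPL_hpalloc(size_t size);
void  HPL_hpfree(void *ptr);

int main(void)
{
    double *buf = HPL_hpalloc(1024 * sizeof(double));
    if (buf == NULL) {
        fprintf(stderr, "huge-page allocation failed\n");
        return 1;
    }
    memset(buf, 0, 1024 * sizeof(double));  /* touch the pages */
    HPL_hpfree(buf);    /* returns the block to the allocator's free list */
    return 0;
}
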
Index: src/blas/HPL_dgemm.c
===================================================================
RCS file: /cvsroot/hpl_qs22/src/blas/HPL_dgemm.c,v
retrieving revision 1.1
retrieving revision 1.7
diff -u -r1.1 -r1.7
--- src/blas/HPL_dgemm.c	10 Feb 2008 21:45:51 -0000	1.1
+++ src/blas/HPL_dgemm.c	26 Aug 2008 13:24:26 -0000	1.7
@@ -44,6 +44,9 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
  * ---------------------------------------------------------------------
  */ 
+/* ------------------------------------------------------------------ */
+/* Modifications (C) Copyright IBM Corporation 2008                   */
+/* ------------------------------------------------------------------ */
 /*
  * Include files
  */
@@ -406,6 +409,18 @@
  *
  * ---------------------------------------------------------------------
  */ 
+#ifdef HPL_CALL_ACCEL
+    /* If this call can be performed on the accelerator, invoke 
+       the accelerator DGEMM function. */
+   if ( ((N & (4-1)) == 0) && ((K & (4-1)) == 0)
+       && ORDER == HplColumnMajor 
+       && TRANSA == HplNoTrans && TRANSB == HplNoTrans
+       && ALPHA == -HPL_rone && BETA == HPL_rone )
+   {
+       HPL_accel_dgemmCL(M, N, K, A, LDA, B, LDB, C, LDC);
+       return;
+   }
+#endif
 #ifdef HPL_CALL_CBLAS
    cblas_dgemm( ORDER, TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB,
                 BETA, C, LDC );
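
The accelerator path guarded above is taken only for the shape HPL's trailing update uses: C := C - A*B with no transposes, column-major storage, and N and K multiples of four. A naive reference for exactly that case, useful when validating an accelerated DGEMM (illustrative; not the library's implementation):

static void dgemm_update_ref(int m, int n, int k,
                             const double *a, int lda,
                             const double *b, int ldb,
                             double *c, int ldc)
{
    int i, j, l;
    for (j = 0; j < n; j++)
        for (l = 0; l < k; l++) {
            double blj = b[j*ldb + l];                /* B(l,j) */
            for (i = 0; i < m; i++)
                c[j*ldc + i] -= a[l*lda + i] * blj;   /* C -= A*B */
        }
}
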
Index: src/comm/HPL_allgatherv.c
===================================================================
RCS file: src/comm/HPL_allgatherv.c
diff -N src/comm/HPL_allgatherv.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/comm/HPL_allgatherv.c	7 Aug 2008 13:07:08 -0000	1.2
@@ -0,0 +1,206 @@
+/* ------------------------------------------------------------------ */
+/* (C) Copyright 2007                                                 */
+/* International Business Machines Corporation,                       */
+/*                                                                    */
+/* All Rights Reserved.                                               */
+/* ------------------------------------------------------------------ */
+
+#include "hpl.h"
+
+/* 
+ * Purpose
+ * =======
+ *
+ * HPL_Allgatherv is an API-compatible wrapper for MPI_Allgatherv, to
+ * allow experimentation with various implementations of allgatherv.
+ *
+ * Arguments
+ * =========
+ *
+ * sendbuf (input)           double *
+ *         Address of send buffer for this rank.
+ *
+ * sendcount (input)         int
+ *         Number of elements in send buffer
+ *
+ * datatype (input)          MPI_Datatype
+ *         Datatype of send buffer elements (assumed to be MPI_DOUBLE)
+ *
+ * recvbuf (input/output)    double *
+ *         Address of receive buffer
+ *
+ * recvcounts (input)        int *
+ *         Array specifying the number of elements to receive from each
+ *         participant in the Allgatherv
+ *
+ * displs (input)            int *
+ *         Array specifying the displacement into recvbuf at which the
+ *         data from each task should be placed.
+ *
+ * recvtype (input)          MPI_Datatype
+ *         Datatype of recv buffer elements (assumed to be MPI_DOUBLE)
+ *
+ * comm (input)              MPI_Comm
+ *         MPI Communicator on which the communication should flow.
+ *
+ * ---------------------------------------------------------------------
+ */ 
+
+#define FANCY_ALLGATHER 1
+
+int HPL_Allgatherv ( double *sendbuf, int sendcount, MPI_Datatype datatype, 
+                     double *recvbuf, int *recvcounts, int *displs, 
+                     MPI_Datatype recvtype, MPI_Comm comm )
+{
+    int retval;
+#ifdef FANCY_ALLGATHER
+    int how_many, i, j, leftover, my_rank, receiver, pointer, current_resource;
+    int total_size, average_size, hole, target_displ, recvidx, processor;
+    int *modified_displs;
+    int *modified_sendcount, *modified_recvcounts, *order_displs, *back_displs;
+    double *modified_sendbuf;
+    MPI_Status status_not_used;
+    int send_offset_table[100][100];   /* NOTE: assumes at most 100 ranks */
+    int send_amount_table[100][100];
+
+    MPI_Comm_size(comm, &how_many);
+    MPI_Comm_rank(comm, &my_rank);
+    modified_sendcount = (int*) malloc(sizeof(int)*how_many);
+    modified_recvcounts = (int*) malloc(sizeof(int)*how_many);
+    modified_displs = (int*) malloc(sizeof(int)*how_many);
+    order_displs = (int*) malloc(sizeof(int)*how_many);
+    back_displs = (int*) malloc(sizeof(int)*how_many);
+    for(i = 0; i < how_many; i++)
+    {
+        back_displs[i] = displs[i];
+    }
+    target_displ = 0;
+    for(i = 0; i < how_many; i++)
+    {
+        for(j = 0; j < how_many; j++)
+        {
+            if(displs[j] == target_displ)
+            {
+                order_displs[i] = j;
+                displs[j] = -1;
+                target_displ = target_displ+recvcounts[j];
+                j = how_many;
+            }
+        }
+    }
+    for(i = 0; i < how_many; i++)
+    {
+        for(j = 0; j < how_many; j++)
+        {
+            send_amount_table[i][j] = 0;
+            send_offset_table[i][j] = 0;
+        }
+    }
+    total_size = 0;
+    for(i = 0; i < how_many; i++)
+        total_size += recvcounts[i];
+    average_size = total_size/how_many;
+    leftover = total_size%how_many; // if the rank is < leftover, you get one extra thing to move
+    processor = order_displs[0];
+    if(processor < leftover)
+        hole = average_size+1;
+    else
+        hole = average_size;
+    recvidx = 0;
+    receiver = order_displs[recvidx];
+    pointer = 0; 
+    // Missing ... if sender == receiver, don't send
+    for(i = 0; i < how_many; i++)
+    {
+        current_resource = recvcounts[order_displs[i]];
+        while(current_resource > 0)
+        {
+            if(current_resource >= hole)
+            {
+                current_resource = current_resource - hole;
+                send_amount_table[order_displs[i]][receiver] = hole;
+                send_offset_table[order_displs[i]][receiver] = pointer;
+                    pointer = pointer+hole;    
+                recvidx++;
+                receiver = order_displs[recvidx];
+                if(receiver < leftover)
+                    hole = average_size+1;
+                else if(receiver < how_many)
+                    hole = average_size;
+                                else {hole = 0; break;}
+            }
+            else // if(current_resource > 0)??& if(current_resource < hole)
+            {
+                hole = hole - current_resource;
+                send_amount_table[order_displs[i]][receiver] = current_resource;
+                send_offset_table[order_displs[i]][receiver] = pointer;
+                    pointer = pointer+current_resource;    
+                current_resource = 0;
+            }
+        }
+    }
+
+    for(i = 0; i < how_many; i++) // source
+    { 
+        for(j = 0; j < how_many; j++) // destination
+        { 
+            if((i != j) && (send_amount_table[i][j] != 0))
+            {    
+                if((my_rank == i) ) 
+                { 
+                    MPI_Send((void*)&recvbuf[send_offset_table[i][j]], send_amount_table[i][j], datatype, j, i, comm );
+                }    
+                if((my_rank == j) ) 
+                { 
+                    MPI_Recv((void*)&recvbuf[send_offset_table[i][j]], send_amount_table[i][j], datatype, i, i, comm, &status_not_used ); 
+                } 
+            }
+        }
+    }
+
+    pointer = 0;
+    for(i = 0; i < how_many; i++)
+    {
+        processor = order_displs[i];
+        if(processor < leftover)
+        {
+            modified_recvcounts[processor] = average_size+1;
+            modified_displs[processor] = pointer;
+            pointer = pointer+average_size+1;
+            modified_sendcount[processor] = average_size+1;
+        }
+        else // if(processor >= leftover)
+        {
+            modified_recvcounts[processor] = average_size;
+            modified_displs[processor] = pointer;
+            pointer = pointer+average_size;
+            modified_sendcount[processor] = average_size;
+        }
+    }
+
+    modified_sendbuf = &recvbuf[modified_displs[my_rank]]; // It's "in place" ... so my send_buf is where I would receive if I were to receive my own data
+    retval = MPI_Allgatherv(modified_sendbuf, modified_sendcount[my_rank], datatype, 
+                            recvbuf, modified_recvcounts, modified_displs, recvtype, comm);
+    for(i = 0; i < how_many; i++)
+    {
+        displs[i] = back_displs[i];
+    }
+    free(modified_sendcount);
+    free(modified_recvcounts);
+    free(modified_displs);
+    free(order_displs);
+    free(back_displs);
+#else
+
+    retval = MPI_Allgatherv(sendbuf, sendcount, datatype, 
+                            recvbuf, recvcounts, displs, recvtype, comm);
+
+#endif
+    return retval;
+}
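
A worked check of the rebalancing arithmetic in HPL_Allgatherv: with total = sum(recvcounts) elements over P ranks, the first leftover positions in the displacement order contribute average_size+1 elements and the rest contribute average_size, so every rank sends a near-equal share in the final MPI_Allgatherv. Standalone, no MPI required:

#include <assert.h>

int main(void)
{
    int recvcounts[] = { 5, 1, 3 };   /* uneven per-rank contributions */
    int P = 3, total = 0, sum = 0, i;
    for (i = 0; i < P; i++)
        total += recvcounts[i];
    int average_size = total / P;     /* 3 */
    int leftover = total % P;         /* 0: shares divide evenly */
    for (i = 0; i < P; i++)
        sum += (i < leftover) ? average_size + 1 : average_size;
    assert(sum == total);             /* the shares cover every element */
    return 0;
}
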
Index: src/panel/HPL_pdpanel_free.c
===================================================================
RCS file: /cvsroot/hpl_qs22/src/panel/HPL_pdpanel_free.c,v
retrieving revision 1.1
retrieving revision 1.3
diff -u -r1.1 -r1.3
--- src/panel/HPL_pdpanel_free.c	10 Feb 2008 21:45:51 -0000	1.1
+++ src/panel/HPL_pdpanel_free.c	26 Aug 2008 13:24:26 -0000	1.3
@@ -44,6 +44,9 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
  * ---------------------------------------------------------------------
  */ 
+/* ------------------------------------------------------------------ */
+/* Modifications (C) Copyright IBM Corporation 2008                   */
+/* ------------------------------------------------------------------ */
 #include "hpl.h"
 
 #ifdef STDC_HEADERS
@@ -94,7 +97,11 @@
       vsip_blockdestroy_d( PANEL->Ublock );
 #endif
 
+#ifdef HPL_USE_HUGE_PAGES
+   if( PANEL->WORK  ) HPL_hpfree( PANEL->WORK  );
+#else
    if( PANEL->WORK  ) free( PANEL->WORK  );
+#endif
    if( PANEL->IWORK ) free( PANEL->IWORK );
 
    return( MPI_SUCCESS );
Index: src/panel/HPL_pdpanel_init.c
===================================================================
RCS file: /cvsroot/hpl_qs22/src/panel/HPL_pdpanel_init.c,v
retrieving revision 1.1
retrieving revision 1.9
diff -u -r1.1 -r1.9
--- src/panel/HPL_pdpanel_init.c	10 Feb 2008 21:45:51 -0000	1.1
+++ src/panel/HPL_pdpanel_init.c	26 Aug 2008 13:24:26 -0000	1.9
@@ -44,6 +44,9 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
  * ---------------------------------------------------------------------
  */ 
+/* ------------------------------------------------------------------ */
+/* Modifications (C) Copyright IBM Corporation 2008                   */
+/* ------------------------------------------------------------------ */
 /*
  * Include files
  */
@@ -141,7 +144,7 @@
    size_t                     dalign;
    int                        icurcol, icurrow, ii, itmp1, jj, lwork,
                               ml2, mp, mycol, myrow, nb, npcol, nprow,
-                              nq, nu;
+                              nq, align, nu, uwork;
 /* ..
  * .. Executable Statements ..
  */
@@ -182,10 +185,11 @@
    PANEL->pcol    = icurcol; /* proc col owning 1st col of trailing A */
    PANEL->msgid   = TAG;     /* message id to be used for panel bcast */
 /*
- * Initialize  ldl2 and len to temporary dummy values and Update tag for
- * next panel
+ * Initialize  ldl2, ldu, and len to temporary dummy values and Update
+ * tag for next panel
  */
    PANEL->ldl2    = 0;               /* local leading dim of array L2 */
+   PANEL->ldu     = 0;                /* local leading dim of array U */
    PANEL->len     = 0;           /* length of the buffer to broadcast */
 /*
  * Figure out the exact amount of workspace  needed by the factorization
@@ -201,15 +205,27 @@
  * right  after  L2 (when it exist) so that one can receive a contiguous
  * buffer.
  */
+   align  = ALGO->align;
    dalign = ALGO->align * sizeof( double );
 
    if( npcol == 1 )                             /* P x 1 process grid */
    {                                     /* space for L1, DPIV, DINFO */
       lwork = ALGO->align + ( PANEL->len = JB * JB + JB + 1 );
+      uwork = 0;
       if( nprow > 1 )                                 /* space for U */
-      { nu = nq - JB; lwork += JB * Mmax( 0, nu ); }
+      {
+         nu = Mmax( 0, nq - JB );
+         /* To allow for alignment of either transposed or non-transposed
+            U panels, compute the size by padding both dimensions. */
+         uwork = (((JB+align-1)/align)*align) * (((nu+align-1)/align)*align);
+      }
+      lwork += uwork;
 
-      if( !( PANEL->WORK = (void *)malloc( lwork * sizeof( double ) ) ) )
+#ifdef HPL_USE_HUGE_PAGES
+      if( !( PANEL->WORK = (void *)HPL_hpalloc( 2 * lwork * sizeof( double ) ) ) )
+#else
+      if( !( PANEL->WORK = (void *)malloc( 2 * lwork * sizeof( double ) ) ) )
+#endif
       {
          HPL_pabort( __LINE__, "HPL_pdpanel_init",
                      "Memory allocation failed" );
@@ -220,27 +236,43 @@
  */
       PANEL->L2    = PANEL->A + ( myrow == icurrow ? JB : 0 );
       PANEL->ldl2  = A->ld;
-      PANEL->L1    = (double *)HPL_PTR( PANEL->WORK, dalign );
+      PANEL->L1    = (double *)FIX_4GB_BOUNDARY_CROSSING(
+                         HPL_PTR( PANEL->WORK, dalign ),
+                         JB * JB * sizeof(double) );
       PANEL->DPIV  = PANEL->L1    + JB * JB;
       PANEL->DINFO = PANEL->DPIV + JB;       *(PANEL->DINFO) = 0.0;
-      PANEL->U     = ( nprow > 1 ? PANEL->DINFO + 1: NULL );
+      PANEL->U     = ( nprow > 1 ? (double *)FIX_4GB_BOUNDARY_CROSSING(
+                                       HPL_PTR( PANEL->DINFO + 1, dalign ),
+                                       uwork * sizeof(double) )
+                                 : NULL );
    }
    else
    {                                        /* space for L2, L1, DPIV */
       ml2 = ( myrow == icurrow ? mp - JB : mp ); ml2 = Mmax( 0, ml2 );
       PANEL->len = ml2*JB + ( itmp1 = JB*JB + JB + 1 );
+      /* enforce alignment requirement on L2 panel */
+      ml2 = ((ml2+align-1)/align)*align;
 #ifdef HPL_COPY_L
-      lwork = ALGO->align + PANEL->len;
+      PANEL->ldl2 = Mmax( 1, ml2 );
+      lwork = ALGO->align + PANEL->ldl2*JB + itmp1;
 #else
-      lwork = ALGO->align + ( mycol == icurcol ? itmp1 : PANEL->len );
+      PANEL->ldl2 = ( mycol == icurcol ) ? A->ld : Mmax( 1, ml2 );
+      lwork = ALGO->align + ( mycol == icurcol ? 0 : PANEL->ldl2*JB ) + itmp1;
 #endif
+      uwork = 0;
       if( nprow > 1 )                                 /* space for U */
       { 
-         nu = ( mycol == icurcol ? nq - JB : nq );
-         lwork += JB * Mmax( 0, nu );
+         nu = Mmax( 0, ( mycol == icurcol ? nq - JB : nq) );
+         /* To allow for alignment of either transposed or non-transposed
+            U panels, compute the size by padding both dimensions. */
+         uwork = (((JB+align-1)/align)*align) * (((nu+align-1)/align)*align);
       }
-
-      if( !( PANEL->WORK = (void *)malloc( lwork * sizeof( double ) ) ) )
+      lwork += uwork;
+#ifdef HPL_USE_HUGE_PAGES
+      if( !( PANEL->WORK = (void *)HPL_hpalloc( 2 * lwork * sizeof( double ) ) ) )
+#else
+      if( !( PANEL->WORK = (void *)malloc( 2 * lwork * sizeof( double ) ) ) )
+#endif
       {
          HPL_pabort( __LINE__, "HPL_pdpanel_init",
                      "Memory allocation failed" );
@@ -250,26 +282,36 @@
  * rent process column when HPL_COPY_L is not defined.
  */
 #ifdef HPL_COPY_L
-      PANEL->L2    = (double *)HPL_PTR( PANEL->WORK, dalign );
-      PANEL->ldl2  = Mmax( 1, ml2 );
-      PANEL->L1    = PANEL->L2 + ml2 * JB;
+      PANEL->L2    = (double *)FIX_4GB_BOUNDARY_CROSSING(
+                         HPL_PTR( PANEL->WORK, dalign ),
+                         PANEL->ldl2 * JB * sizeof(double) );
+      PANEL->L1    = (double *)FIX_4GB_BOUNDARY_CROSSING(
+                         PANEL->L2 + PANEL->ldl2 * JB,
+                         JB * JB * sizeof(double) );
 #else
       if( mycol == icurcol )
       {
          PANEL->L2   = PANEL->A + ( myrow == icurrow ? JB : 0 );
-         PANEL->ldl2 = A->ld;
-         PANEL->L1   = (double *)HPL_PTR( PANEL->WORK, dalign );
+         PANEL->L1   = (double *)FIX_4GB_BOUNDARY_CROSSING(
+                           HPL_PTR( PANEL->WORK, dalign ),
+                           JB * JB * sizeof(double) );
       }
       else
       {
-         PANEL->L2   = (double *)HPL_PTR( PANEL->WORK, dalign );
-         PANEL->ldl2 = Mmax( 1, ml2 );
-         PANEL->L1   = PANEL->L2 + ml2 * JB;
+         PANEL->L2   = (double *)FIX_4GB_BOUNDARY_CROSSING(
+                           HPL_PTR( PANEL->WORK, dalign ),
+                           PANEL->ldl2 * JB * sizeof(double) );
+         PANEL->L1   = (double *)FIX_4GB_BOUNDARY_CROSSING(
+                           PANEL->L2 + PANEL->ldl2 * JB,
+                           JB * JB * sizeof(double) );
       } 
 #endif
       PANEL->DPIV  = PANEL->L1   + JB * JB;
       PANEL->DINFO = PANEL->DPIV + JB;     *(PANEL->DINFO) = 0.0;
-      PANEL->U     = ( nprow > 1 ? PANEL->DINFO + 1 : NULL );
+      PANEL->U     = ( nprow > 1 ? (double *)FIX_4GB_BOUNDARY_CROSSING(
+                                       HPL_PTR( PANEL->DINFO + 1, dalign ),
+                                       uwork * sizeof(double) )
+                                 : NULL );
    }
 #ifdef HPL_CALL_VSIPL
    PANEL->Ablock  = A->block;
Index: src/pfact/HPL_pdfact.c
===================================================================
RCS file: /cvsroot/hpl_qs22/src/pfact/HPL_pdfact.c,v
retrieving revision 1.1
retrieving revision 1.3
diff -u -r1.1 -r1.3
--- src/pfact/HPL_pdfact.c	10 Feb 2008 21:45:51 -0000	1.1
+++ src/pfact/HPL_pdfact.c	26 Aug 2008 13:24:26 -0000	1.3
@@ -44,6 +44,9 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
  * ---------------------------------------------------------------------
  */ 
+/* ------------------------------------------------------------------ */
+/* Modifications (C) Copyright IBM Corporation 2008                   */
+/* ------------------------------------------------------------------ */
 /*
  * Include files
  */
@@ -114,6 +117,10 @@
    jb = PANEL->jb; PANEL->n -= jb; PANEL->ja += jb;
 
    if( ( PANEL->grid->mycol != PANEL->pcol ) || ( jb <= 0 ) ) return;
+#ifdef HPL_CALL_ACCEL
+   /* Copy panel data from accel to host */
+   HPL_accel_pangetL(PANEL);
+#endif
 #ifdef HPL_DETAILED_TIMING
    HPL_ptimer( HPL_TIMING_RPFACT );
 #endif
Index: src/pgesv/HPL_pdgesv.c
===================================================================
RCS file: /cvsroot/hpl_qs22/src/pgesv/HPL_pdgesv.c,v
retrieving revision 1.1
retrieving revision 1.5
diff -u -r1.1 -r1.5
--- src/pgesv/HPL_pdgesv.c	10 Feb 2008 21:45:51 -0000	1.1
+++ src/pgesv/HPL_pdgesv.c	26 Aug 2008 13:24:26 -0000	1.5
@@ -44,6 +44,9 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
  * ---------------------------------------------------------------------
  */ 
+/* ------------------------------------------------------------------ */
+/* Modifications (C) Copyright IBM Corporation 2008                   */
+/* ------------------------------------------------------------------ */
 /*
  * Include files
  */
@@ -98,6 +101,10 @@
 
    A->info = 0;
 
+#ifdef HPL_CALL_ACCEL
+   (void) HPL_accel_pgesv_init(GRID, ALGO, A);
+#endif
+
    if( ( ALGO->depth == 0 ) || ( GRID->npcol == 1 ) )
    {
       HPL_pdgesv0(  GRID, ALGO, A );
@@ -106,6 +113,11 @@
    {
       HPL_pdgesvK2( GRID, ALGO, A );
    }
+
+#ifdef HPL_CALL_ACCEL
+   (void) HPL_accel_pgesv_fini(GRID, ALGO, A);
+#endif
+
 /*
  * Solve upper triangular system
  */
Index: src/pgesv/HPL_pdgesvK2.c
===================================================================
RCS file: /cvsroot/hpl_qs22/src/pgesv/HPL_pdgesvK2.c,v
retrieving revision 1.1
retrieving revision 1.5
diff -u -r1.1 -r1.5
--- src/pgesv/HPL_pdgesvK2.c	10 Feb 2008 21:45:51 -0000	1.1
+++ src/pgesv/HPL_pdgesvK2.c	26 Aug 2008 13:24:26 -0000	1.5
@@ -44,6 +44,9 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
  * ---------------------------------------------------------------------
  */ 
+/* ------------------------------------------------------------------ */
+/* Modifications (C) Copyright IBM Corporation 2008                   */
+/* ------------------------------------------------------------------ */
 /*
  * Include files
  */
@@ -176,10 +179,10 @@
          nn = HPL_numrocI( jb, j, nb, nb, mycol, 0, npcol );
          for( k = 0; k < depth; k++ )   /* partial updates 0..depth-1 */
             (void) HPL_pdupdate( NULL, NULL, panel[k], nn );
-         HPL_pdfact(       panel[depth] );    /* factor current panel */
       }
       else { nn = 0; }
-          /* Finish the latest update and broadcast the current panel */
+      HPL_pdfact(       panel[depth] );    /* factor current panel */
+      /* Finish the latest update and broadcast the current panel */
       (void) HPL_binit( panel[depth] );
       HPL_pdupdate( panel[depth], &test, panel[0], nq-nn );
       (void) HPL_bwait( panel[depth] );
Index: src/pgesv/HPL_pdlaswp00N.c
===================================================================
RCS file: /cvsroot/hpl_qs22/src/pgesv/HPL_pdlaswp00N.c,v
retrieving revision 1.1
retrieving revision 1.3
diff -u -r1.1 -r1.3
--- src/pgesv/HPL_pdlaswp00N.c	10 Feb 2008 21:45:51 -0000	1.1
+++ src/pgesv/HPL_pdlaswp00N.c	26 Aug 2008 13:24:26 -0000	1.3
@@ -44,6 +44,9 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
  * ---------------------------------------------------------------------
  */ 
+/* ------------------------------------------------------------------ */
+/* Modifications (C) Copyright IBM Corporation 2008                   */
+/* ------------------------------------------------------------------ */
 /*
  * Include files
  */
@@ -122,9 +125,8 @@
                              mydist, mydis_;
    int                       Cmsgid=MSGID_BEGIN_PFACT, Np2, align,
                              hdim, i, icurrow, *iflag, ipA, ipW, *ipl,
-                             iprow, jb, k, lda, ldW, myrow, n, nprow,
-                             partner, root, size_, usize;
-#define LDU                  jb
+                             iprow, jb, k, lda, ldu, ldW, myrow, n,
+                             nprow, partner, root, size_, usize;
 /* ..
  * .. Executable Statements ..
  */
@@ -144,8 +146,14 @@
    comm  = grid->col_comm; ip2     = (unsigned int)grid->row_ip2;
    hdim  = grid->row_hdim; align   = PANEL->algo->align;
    A     = PANEL->A;       U       = PANEL->U;    iflag = PANEL->IWORK;
-   lda   = PANEL->lda;     icurrow = PANEL->prow; usize = jb * n;
-   ldW   = n + 1;
+   lda   = PANEL->lda;     icurrow = PANEL->prow; ldW   = n + 1;
+
+/*
+ * pad leading dimension of U panel to get proper alignment
+ */
+   ldu = ((jb+align-1)/align)*align;
+   PANEL->ldu = ldu;
+   usize = ldu * n;
 /*
  * Allocate space for temporary W (ldW * jb)
  */
@@ -189,7 +197,7 @@
  */
    if( myrow == icurrow ) 
    {
-      HPL_dlaswp01N( ipA, n, A, lda, U, LDU, lindxA, lindxAU );
+      HPL_dlaswp01N( ipA, n, A, lda, U, ldu, lindxA, lindxAU );
    }
    else
    {
@@ -251,7 +259,7 @@
          (void) HPL_sdrv( U, usize, Cmsgid, W, llen[partner] * ldW,
                           Cmsgid, partner, comm );
          if( llen[partner] > 0 )
-            HPL_dlaswp03N( llen[partner], n, U, LDU, W, W+1, ldW );
+            HPL_dlaswp03N( llen[partner], n, U, ldu, W, W+1, ldW );
       }
       else if( mydist == ip2 )
       {                      /* I recv U for later Bcast, I send my W */
@@ -316,7 +324,7 @@
                (void) HPL_sdrv( U, usize, Cmsgid, Mptr( W, 0, ipW,
                                 ldW ), llen[partner]*ldW, Cmsgid,
                                 partner, comm );
-               HPL_dlaswp03N( llen[partner], n, U, LDU, Mptr( W, 0, ipW,
+               HPL_dlaswp03N( llen[partner], n, U, ldu, Mptr( W, 0, ipW,
                               ldW ), Mptr( W, 1, ipW, ldW ), ldW );
                ipW += llen[partner];
             }
@@ -324,7 +332,7 @@
             {
                (void) HPL_sdrv( W, llen[myrow]*ldW, Cmsgid, U, usize,
                                 Cmsgid, partner, comm );
-               HPL_dlaswp04N( ipA, llen[myrow], n, U, LDU, A, lda, W,
+               HPL_dlaswp04N( ipA, llen[myrow], n, U, ldu, A, lda, W,
                               W+1, ldW, lindxA, lindxAU );
             }
          }
@@ -401,7 +409,7 @@
  * Every process in [ip2..nprow) (relatively to icurrow) grabs its piece
  * of A.
  */
-      HPL_dlaswp05N( ipA, n, A, lda, U, LDU, lindxA, lindxAU );
+      HPL_dlaswp05N( ipA, n, A, lda, U, ldu, lindxA, lindxAU );
    }
 /*
  * If  nprow  is not a power of 2,  proc[i-ip2]  sends  global result to
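
The ldu computation introduced above is the usual round-up-to-a-multiple
idiom: the leading dimension of U is padded from jb up to the next multiple
of the algorithm's alignment.  A self-contained illustration (values
invented):

    #include <assert.h>

    /* Round n up to the nearest multiple of align (align > 0), exactly
       as ldu = ((jb+align-1)/align)*align does in the hunk above. */
    static int round_up(int n, int align)
    {
        return ((n + align - 1) / align) * align;
    }

    int main(void)
    {
        assert(round_up(63, 16) == 64);   /* padded up            */
        assert(round_up(64, 16) == 64);   /* already aligned      */
        assert(round_up( 1, 16) == 16);
        return 0;
    }
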
Index: src/pgesv/HPL_pdlaswp00T.c
===================================================================
RCS file: /cvsroot/hpl_qs22/src/pgesv/HPL_pdlaswp00T.c,v
retrieving revision 1.1
retrieving revision 1.4
diff -u -r1.1 -r1.4
--- src/pgesv/HPL_pdlaswp00T.c	10 Feb 2008 21:45:51 -0000	1.1
+++ src/pgesv/HPL_pdlaswp00T.c	26 Aug 2008 13:24:26 -0000	1.4
@@ -44,6 +44,9 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
  * ---------------------------------------------------------------------
  */ 
+/* ------------------------------------------------------------------ */
+/* Modifications (C) Copyright IBM Corporation 2008                   */
+/* ------------------------------------------------------------------ */
 /*
  * Include files
  */
@@ -122,9 +125,8 @@
                              mydist, mydis_;
    int                       Cmsgid=MSGID_BEGIN_PFACT, Np2, align,
                              hdim, i, icurrow, *iflag, ipA, ipW, *ipl,
-                             iprow, jb, k, lda, ldW, myrow, n, nprow,
-                             partner, root, size_, usize;
-#define LDU                  n
+                             iprow, jb, k, lda, ldu, ldW, myrow, n,
+                             nprow, partner, root, size_, usize;
 /* ..
  * .. Executable Statements ..
  */
@@ -144,8 +146,13 @@
    comm  = grid->col_comm; ip2     = (unsigned int)grid->row_ip2;
    hdim  = grid->row_hdim; align   = PANEL->algo->align;
    A     = PANEL->A;       U       = PANEL->U;    iflag = PANEL->IWORK;
-   lda   = PANEL->lda;     icurrow = PANEL->prow; usize = jb * n;
-   ldW   = n + 1;
+   lda   = PANEL->lda;     icurrow = PANEL->prow; ldW   = n + 1;
+/*
+ * pad leading dimension of U panel to get proper alignment
+ */
+   ldu   = ((n+align-1)/align)*align;
+   PANEL->ldu = ldu;
+   usize = ldu * jb;
 /*
  * Allocate space for temporary W (ldW * jb)
  */
@@ -189,10 +196,20 @@
  */
    if( myrow == icurrow ) 
    {
-      HPL_dlaswp01T( ipA, n, A, lda, U, LDU, lindxA, lindxAU );
+#ifdef HPL_CALL_ACCEL
+      if ( (PANEL->ja % (M_SUB*2)) == 0 )
+          HPL_accel_swap01T( PANEL, lindxA, lindxAU, ipA, n );
+      else
+#endif
+      HPL_dlaswp01T( ipA, n, A, lda, U, ldu, lindxA, lindxAU );
    }
    else
    {
+#ifdef HPL_CALL_ACCEL
+      if ( (PANEL->ja % (M_SUB*2)) == 0 )
+          HPL_accel_swap02N( PANEL, lindxA, lindxAU, ipA, W, W+1, ldW, n );
+      else
+#endif
       HPL_dlaswp02N( ipA, n, A, lda, W, W+1, ldW, lindxA, lindxAU );
    }
 /*
@@ -251,7 +268,7 @@
          (void) HPL_sdrv( U, usize, Cmsgid, W, llen[partner] * ldW,
                           Cmsgid, partner, comm );
          if( llen[partner] > 0 )
-            HPL_dlaswp03T( llen[partner], n, U, LDU, W, W+1, ldW );
+            HPL_dlaswp03T( llen[partner], n, U, ldu, W, W+1, ldW );
       }
       else if( mydist == ip2 )
       {                      /* I recv U for later Bcast, I send my W */
@@ -316,7 +333,7 @@
                (void) HPL_sdrv( U, usize, Cmsgid, Mptr( W, 0, ipW,
                                 ldW ), llen[partner]*ldW, Cmsgid,
                                 partner, comm );
-               HPL_dlaswp03T( llen[partner], n, U, LDU, Mptr( W, 0, ipW,
+               HPL_dlaswp03T( llen[partner], n, U, ldu, Mptr( W, 0, ipW,
                               ldW ), Mptr( W, 1, ipW, ldW ), ldW );
                ipW += llen[partner];
             }
@@ -324,7 +341,12 @@
             {
                (void) HPL_sdrv( W, llen[myrow]*ldW, Cmsgid, U, usize,
                                 Cmsgid, partner, comm );
-               HPL_dlaswp04T( ipA, llen[myrow], n, U, LDU, A, lda, W,
+#ifdef HPL_CALL_ACCEL
+               if ( (PANEL->ja % (M_SUB*2)) == 0 )
+                   HPL_accel_swap04T( PANEL, lindxA, lindxAU, ipA, llen[myrow], W, W+1, ldW, n );
+               else
+#endif
+               HPL_dlaswp04T( ipA, llen[myrow], n, U, ldu, A, lda, W,
                               W+1, ldW, lindxA, lindxAU );
             }
          }
@@ -401,7 +423,12 @@
  * Every process in [ip2..nprow) (relatively to icurrow) grabs its piece
  * of A.
  */
-      HPL_dlaswp05T( ipA, n, A, lda, U, LDU, lindxA, lindxAU );
+#ifdef HPL_CALL_ACCEL
+      if ( (PANEL->ja % (M_SUB*2)) == 0 )
+          HPL_accel_swap05T( PANEL, lindxA, lindxAU, ipA, n );
+      else
+#endif
+      HPL_dlaswp05T( ipA, n, A, lda, U, ldu, lindxA, lindxAU );
    }
 /*
  * If  nprow  is not a power of 2,  proc[i-ip2]  sends  global result to
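
A recurring idiom in the hunks above is the accelerator dispatch guard:
when HPL_CALL_ACCEL is compiled in, the accelerated swap is taken only if
the panel's global column offset PANEL->ja is a multiple of 2*M_SUB (M_SUB
is presumably the accelerator's sub-block size; its definition is elsewhere
in this patch), and otherwise control falls through the dangling else to
the stock host routine.  A compilable toy rendering, with an invented M_SUB
value and placeholder functions:

    #include <stdio.h>

    #define HPL_CALL_ACCEL 1
    #define M_SUB 64                       /* invented placeholder value */

    static void accel_path(int ja) { printf("accel swap, ja=%d\n", ja); }
    static void host_path (int ja) { printf("host swap,  ja=%d\n", ja); }

    static void dispatch(int ja)
    {
    #ifdef HPL_CALL_ACCEL
        if ((ja % (M_SUB * 2)) == 0)       /* accel needs 2*M_SUB alignment */
            accel_path(ja);
        else
    #endif
        host_path(ja);                     /* else-branch, or the only path
                                              when HPL_CALL_ACCEL is off   */
    }

    int main(void) { dispatch(0); dispatch(96); return 0; }
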
Index: src/pgesv/HPL_pdlaswp01N.c
===================================================================
RCS file: /cvsroot/hpl_qs22/src/pgesv/HPL_pdlaswp01N.c,v
retrieving revision 1.1
retrieving revision 1.3
diff -u -r1.1 -r1.3
--- src/pgesv/HPL_pdlaswp01N.c	10 Feb 2008 21:45:51 -0000	1.1
+++ src/pgesv/HPL_pdlaswp01N.c	26 Aug 2008 13:24:26 -0000	1.3
@@ -44,6 +44,9 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
  * ---------------------------------------------------------------------
  */ 
+/* ------------------------------------------------------------------ */
+/* Modifications (C) Copyright IBM Corporation 2008                   */
+/* ------------------------------------------------------------------ */
 /*
  * Include files
  */
@@ -120,8 +123,7 @@
                              * permU;
    static int                equil=-1;
    int                       icurrow, * iflag, * ipA, * ipl, jb, k,
-                             lda, myrow, n, nprow;
-#define LDU                  jb
+                             align, lda, ldu, myrow, n, nprow;
 /* ..
  * .. Executable Statements ..
  */
@@ -142,7 +144,12 @@
  */
    nprow = PANEL->grid->nprow; myrow = PANEL->grid->myrow;
    A     = PANEL->A;   U       = PANEL->U;     iflag  = PANEL->IWORK;
-   lda   = PANEL->lda; icurrow = PANEL->prow;
+   lda   = PANEL->lda; icurrow = PANEL->prow;  align  = PANEL->algo->align;
+/*
+ * pad leading dimension of U panel to get proper alignment
+ */
+   ldu = ((jb+align-1)/align)*align;
+   PANEL->ldu = ldu;
 /*
  * Compute ipID (if not already done for this panel). lindxA and lindxAU
  * are of length at most 2*jb - iplen is of size nprow+1, ipmap, ipmapm1
@@ -178,11 +185,11 @@
  * Copy into U the rows to be spread (local to icurrow)
  */
    if( myrow == icurrow )
-   { HPL_dlaswp01N( *ipA, n, A, lda, U, LDU, lindxA, lindxAU ); }
+   { HPL_dlaswp01N( *ipA, n, A, lda, U, ldu, lindxA, lindxAU ); }
 /*
  * Spread U - optionally probe for column panel
  */
-   HPL_spreadN( PBCST, IFLAG, PANEL, HplRight, n, U, LDU, 0, iplen,
+   HPL_spreadN( PBCST, IFLAG, PANEL, HplRight, n, U, ldu, 0, iplen,
                 ipmap, ipmapm1 );
 /*
  * Local exchange (everywhere but in process row icurrow)
@@ -191,22 +198,22 @@
    {
       k = ipmapm1[myrow];
       HPL_dlaswp06N( iplen[k+1]-iplen[k], n, A, lda, Mptr( U, iplen[k],
-                     0, LDU ), LDU, lindxA );
+                     0, ldu ), ldu, lindxA );
    }
 /*
  * Equilibration
  */
    if( equil != 0 )
-      HPL_equil( PBCST, IFLAG, PANEL, HplNoTrans, n, U, LDU, iplen,
+      HPL_equil( PBCST, IFLAG, PANEL, HplNoTrans, n, U, ldu, iplen,
                  ipmap, ipmapm1, iwork );
 /*
  * Rolling phase
  */
-   HPL_rollN( PBCST, IFLAG, PANEL, n, U, LDU, iplen, ipmap, ipmapm1 );
+   HPL_rollN( PBCST, IFLAG, PANEL, n, U, ldu, iplen, ipmap, ipmapm1 );
 /*
  * Permute U in every process row
  */
-   HPL_dlaswp00N( jb, n, U, LDU, permU );
+   HPL_dlaswp00N( jb, n, U, ldu, permU );
 
 #ifdef HPL_DETAILED_TIMING
    HPL_ptimer( HPL_TIMING_LASWP );
Index: src/pgesv/HPL_pdlaswp01T.c
===================================================================
RCS file: /cvsroot/hpl_qs22/src/pgesv/HPL_pdlaswp01T.c,v
retrieving revision 1.1
retrieving revision 1.4
diff -u -r1.1 -r1.4
--- src/pgesv/HPL_pdlaswp01T.c	10 Feb 2008 21:45:51 -0000	1.1
+++ src/pgesv/HPL_pdlaswp01T.c	26 Aug 2008 13:24:26 -0000	1.4
@@ -44,6 +44,9 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
  * ---------------------------------------------------------------------
  */ 
+/* ------------------------------------------------------------------ */
+/* Modifications (C) Copyright IBM Corporation 2008                   */
+/* ------------------------------------------------------------------ */
 /*
  * Include files
  */
@@ -120,8 +123,7 @@
                              * permU;
    static int                equil=-1;
    int                       icurrow, * iflag, * ipA, * ipl, jb, k,
-                             lda, myrow, n, nprow;
-#define LDU                  n
+                             align, lda, ldu, myrow, n, nprow;
 /* ..
  * .. Executable Statements ..
  */
@@ -142,7 +144,12 @@
  */
    nprow = PANEL->grid->nprow; myrow = PANEL->grid->myrow;
    A     = PANEL->A;   U       = PANEL->U;     iflag  = PANEL->IWORK;
-   lda   = PANEL->lda; icurrow = PANEL->prow;
+   lda   = PANEL->lda; icurrow = PANEL->prow;  align  = PANEL->algo->align;
+/*
+ * pad leading dimension of U panel to get proper alignment
+ */
+   ldu   = ((n+align-1)/align)*align;
+   PANEL->ldu = ldu;
 /*
  * Compute ipID (if not already done for this panel). lindxA and lindxAU
  * are of length at most 2*jb - iplen is of size nprow+1, ipmap, ipmapm1
@@ -178,11 +185,18 @@
  * Copy into U the rows to be spread (local to icurrow)
  */
    if( myrow == icurrow )
-   { HPL_dlaswp01T( *ipA, n, A, lda, U, LDU, lindxA, lindxAU ); }
+   {
+#ifdef HPL_CALL_ACCEL
+      if ( (PANEL->ja % (M_SUB*2)) == 0 )
+          HPL_accel_swap01T( PANEL, lindxA, lindxAU, *ipA, n );
+      else
+#endif
+      HPL_dlaswp01T( *ipA, n, A, lda, U, ldu, lindxA, lindxAU );
+   }
 /*
  * Spread U - optionally probe for column panel
  */
-   HPL_spreadT( PBCST, IFLAG, PANEL, HplRight, n, U, LDU, 0, iplen,
+   HPL_spreadT( PBCST, IFLAG, PANEL, HplRight, n, U, ldu, 0, iplen,
                 ipmap, ipmapm1 );
 /*
  * Local exchange (everywhere but in process row icurrow)
@@ -190,23 +204,28 @@
    if( myrow != icurrow )
    {
       k = ipmapm1[myrow];
+#ifdef HPL_CALL_ACCEL
+      if ( (PANEL->ja % (M_SUB*2)) == 0 )
+          HPL_accel_swap06T( PANEL, lindxA, iplen[k+1]-iplen[k], iplen[k], n );
+      else
+#endif
       HPL_dlaswp06T( iplen[k+1]-iplen[k], n, A, lda, Mptr( U, 0,
-                     iplen[k], LDU ), LDU, lindxA );
+                     iplen[k], ldu ), ldu, lindxA );
    }
 /*
  * Equilibration
  */
    if( equil != 0 )
-      HPL_equil( PBCST, IFLAG, PANEL, HplTrans, n, U, LDU, iplen, ipmap,
+      HPL_equil( PBCST, IFLAG, PANEL, HplTrans, n, U, ldu, iplen, ipmap,
                  ipmapm1, iwork );
 /*
  * Rolling phase
  */
-   HPL_rollT( PBCST, IFLAG, PANEL, n, U, LDU, iplen, ipmap, ipmapm1 );
+   HPL_rollT( PBCST, IFLAG, PANEL, n, U, ldu, iplen, ipmap, ipmapm1 );
 /*
  * Permute U in every process row
  */
-   HPL_dlaswp10N( n, jb, U, LDU, permU );
+   HPL_dlaswp10N( n, jb, U, ldu, permU );
 
 #ifdef HPL_DETAILED_TIMING
    HPL_ptimer( HPL_TIMING_LASWP );
Index: src/pgesv/HPL_pdlaswp03T.c
===================================================================
RCS file: src/pgesv/HPL_pdlaswp03T.c
diff -N src/pgesv/HPL_pdlaswp03T.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/pgesv/HPL_pdlaswp03T.c	20 Aug 2008 18:23:35 -0000	1.8
@@ -0,0 +1,477 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#include "hpl.h"
+
+/* 
+ * Purpose
+ * =======
+ *
+ * HPL_pdlaswp03T is an API-compatible replacement for the 
+ * HPL_pdlaswp0xT functions, which apply the NB row interchanges to
+ * NN columns of the trailing submatrix.  The swap is performed using
+ * the MPI_Allgatherv and MPI_Scatterv collective communication APIs.
+ *
+ * Arguments
+ * =========
+ *
+ * PBCST   (local input/output)          HPL_T_panel *
+ *         On entry,  PBCST  points to the data structure containing the
+ *         panel (to be broadcast) information, or NULL.
+ *
+ * IFLAG   (local output)                int *
+ *         On exit,  IFLAG  indicates  whether or not  the broadcast has
+ *         been completed when PBCST is not NULL on entry.  Otherwise,
+ *         IFLAG is left unchanged.
+ *
+ * PANEL   (local input/output)          HPL_T_panel *
+ *         On entry,  PANEL  points to the data structure containing the
+ *         panel (to be updated) information.
+ *
+ * NN      (local input)                 const int
+ *         On entry, NN specifies  the  local  number  of columns of the
+ *         trailing  submatrix  to be updated  starting  at the  current
+ *         position. NN must be at least zero.
+ *
+ * Note:  PBCST is the panel that has just been factored and must be
+ * broadcast. PANEL is generally some block column in the matrix to the
+ * left of PBCST. The operations of broadcasting PBCST and updating PANEL
+ * are combined to allow the implementation to attempt to overlap them.
+ *
+ * -----------------------------------------------------------------
+ */ 
+
+void HPL_pdlaswp03T
+(
+    HPL_T_panel *                    PBCST,
+    int *                            IFLAG,
+    HPL_T_panel *                    PANEL,
+    const int                        NN
+)
+{
+    int my_prow = PANEL->grid->myrow;
+    int nprow   = PANEL->grid->nprow;
+    int align   = PANEL->algo->align;
+    int jb      = PANEL->jb;
+    void  *vptr = NULL;
+    double *B;           /* Buffer for MPI collectives */
+    int ldb;             /* Leading dimension for B */
+    int j;
+
+    /* There is nothing to update, just return */
+    if ( (NN <= 0) || (PANEL->jb <= 0) ) { return; }
+
+    /* For simplicity, let's just do the bcast up front and get it out of
+       the way. */
+
+    /* TODO:  MDK - Revisit whether this is what we want to do */
+    if ( PBCST != NULL && *IFLAG == HPL_KEEP_TESTING ) {
+        do { (void) HPL_bcast( PBCST, IFLAG ); }
+        while( *IFLAG != HPL_SUCCESS );
+    }
+
+#ifdef HPL_DETAILED_TIMING
+    HPL_ptimer( HPL_TIMING_LASWP );
+#endif
+
+    /* The accelerator uses a leading dimension of N for the B/U buffers.
+       For the non-accelerated case, we must use NN+1 because hpl's local
+       swap routines -- specifically HPL_dlaswp03T -- are designed to work
+       with this size. The extra double is used to hold the 'W0' array
+       passed in to HPL_dlaswp03T. */
+
+    /* Do the pivot on NN columns using MPI collective communication routines.
+       We need an Allgatherv to collect the winners on all processors in
+       the column and a Scatterv to move the losers down into the matrix,
+       and we have to pick which to do first.  We do the Allgatherv
+       first, since the winners are needed for the DTRSM, which could
+       theoretically be done in parallel with the Scatterv of the losers,
+       which aren't needed until we start the DGEMM. */
+
+    /* The pivot row information is contained in PANEL->DPIV, an array of jb
+       pivot indices stored as doubles, where DPIV[j] specifies the global
+       row index of the row to be swapped with row j of the panel. */
+
+    double *my_row_buffer;
+
+    /* Note: The Allgatherv processing is essentially the same on all
+       processor rows. */
+
+    /* PANEL->IWORK has size (4 + 9*JB + 3*NPROW + MAX( 2*JB, NPROW+1 )) * sizeof(int)
+       and is intended to hold information used by pivot processing */
+
+    int  *iflag;    /* iflag indicates if a pivot info has already been
+                       computed by a previous call to this function.
+                       0 => No , 1 => Yes */
+    int *ipl;       /* Length of ipID */
+    int *ipID;      /* Pivot pairs array computed by HPL_pipid.  This array
+                       is at most 4*N (N = PANEL->jb) elements in size. */
+    int *winner_prow; /* Array of winner processor row numbers */
+    int *my_winners;  /* Array containing the local index of winner rows 
+                       on this row of processors */
+    int *my_losers; /* Array containing the local index of rows that will
+                       receive loser rows */
+    int *WtoB;      /* WtoB[j] is the row index of winner j in the 
+                       Allgatherv buffer.
+                       This is computed in two parts -- first, the
+                       offset *within* the block of rows contributed by
+                       the winning row is computed.  Then to this is added
+                       the offset of the block of rows for the prow */
+    int *BtoW;      /* BtoW[j] is the row index in the Allgatherv buffer that 
+                       will hold row j of the block row of winners.
+                       This is computed as the inverse of WtoB. */
+    int *LtoB;      /* LtoB[j] is the row index of loser j in the 
+                       Scatter buffer */
+    int *BtoL;      /* BtoL[j] is the row index in the Scatterv buffer that 
+                       will hold row j of the block row of losers.
+                       This is computed as the inverse of LtoB. */
+    int *prow_cnts; /* Array containing the count of winner rows for each
+                       processor row --  will contribute to U */
+    int *loser_cnts; /* Array containing the count of loser rows for each
+                        processor row */
+    int *my_loser_cnt; /* Count of rows on this processor row that will receive
+                          loser rows */
+    int *prowindx;  /* Array containing the index [0..jb) of the first 
+                       row that each processor row contributes to U.  This
+                       is just the prefix sum of prow_cnts. */
+    int *recvcounts; /* Array of receive counts for MPI_Allgatherv */
+    int *displs;     /* Array of displacements for MPI_Allgatherv */
+
+    int *iwork_free_area = PANEL->IWORK;
+
+    /* Allocate pivot info structures in PANEL->IWORK */
+    iflag        = iwork_free_area;  iwork_free_area++;
+    ipl          = iwork_free_area;  iwork_free_area++;
+    ipID         = iwork_free_area;  iwork_free_area += 4*jb;
+    winner_prow  = iwork_free_area;  iwork_free_area += jb;
+    my_winners   = iwork_free_area;  iwork_free_area += jb;
+    my_losers    = iwork_free_area;  iwork_free_area += jb;
+    WtoB         = iwork_free_area;  iwork_free_area += jb;
+    LtoB         = iwork_free_area;  iwork_free_area += jb;
+    my_loser_cnt = iwork_free_area;  iwork_free_area++;
+    prow_cnts    = iwork_free_area;  iwork_free_area += nprow;
+    prowindx     = iwork_free_area;  iwork_free_area += nprow;
+    recvcounts   = iwork_free_area;  iwork_free_area += nprow;
+    displs       = iwork_free_area;  iwork_free_area += nprow;
+
+    /* Due to space constraints, we use some areas of the IWORK buffer for multiple
+       purposes. */
+
+    BtoW         = ipID;      /* BtoW shares the first jb entries of ipID */
+    BtoL         = ipID+jb;   /* BtoL shares the second jb entries of ipID */
+
+    loser_cnts   = recvcounts; /* loser_cnts shares storage with recvcounts */
+
+    /* Pad leading dimension of U panel to get proper alignment */
+    PANEL->ldu = ((NN+align-1)/align)*align;
+
+#ifdef HPL_CALL_ACCEL
+    ldb = NN;        /* Leading dimension for B */
+#else
+    ldb = NN+1;      /* Leading dimension for B */
+#endif
+
+    /* Allocate another row buffer basically the same size as U -- jb x NN. */
+
+#ifdef HPL_USE_HUGE_PAGES
+    vptr = HPL_hpalloc( (align + jb*ldb) * sizeof(double) );
+#else
+    vptr = malloc( (align + jb*ldb) * sizeof(double) );
+#endif
+
+    if (vptr == NULL) {
+        HPL_pabort( __LINE__, "HPL_pdlaswp03T", "Memory allocation failed." );
+    }
+
+    B = (double *)HPL_PTR(vptr, ((size_t)(align) * sizeof(double)));
+
+    if (*iflag != 2) {  /* pivot data not already computed */
+        /* Initialize pivot_info in PANEL->IWORK */
+        *iflag = 2;
+
+        HPL_pipid(PANEL, ipl, ipID);
+
+        for (j=0; j<nprow; j++) {
+            prow_cnts[j] = 0;
+        }
+
+        /* The first jb entries of ipID are (src, dst) for dst =
+           the jb rows of the current panel */
+        for (j=0; j<jb; j++) {
+            int local_index;
+            /* ipID[2*j] is the global index of winner j.  Determine the processor
+               row that holds this winner (winner_prow).  Also find local_index of this
+               row on that processor row. */
+            Mindxg2lp( local_index, winner_prow[j], ipID[2*j], PANEL->nb, PANEL->nb, 0, nprow );
+            if ( winner_prow[j] == my_prow ) {
+                my_winners[prow_cnts[my_prow]] = local_index;
+            }
+            WtoB[j] = prow_cnts[winner_prow[j]];
+            prow_cnts[winner_prow[j]] ++;
+        }
+
+        prowindx[0] = 0;
+        for (j=1; j<nprow; j++) {
+            prowindx[j] = prowindx[j-1] + prow_cnts[j-1];
+        }
+
+        for (j=0; j<jb; j++) {
+            WtoB[j] += prowindx[winner_prow[j]];
+        }
+
+        /* Initialize LtoB array.  The -1 indicates that row j is not a loser. */
+        for (j=0; j<jb; j++) {
+            LtoB[j] = -1;
+        }
+
+        /* Initialize loser_cnts array.  This array is used to keep some intermediate
+           state for the loop below.  After that, all we need to retain is the count
+           of losers on the current row of processors.  We save that off into its own
+           spot, leaving us free to reuse the storage for loser_cnts. */
+        for (j=0; j<nprow; j++) {
+            loser_cnts[j] = 0;
+        }
+
+        /* Remaining entries of ipID are for dst = elsewhere.  These entries will
+           all have src = this block row. */
+        for (j=jb; j<*ipl/2; j++) {
+            int loser_index =  ipID[2*j] - PANEL->ia;
+            int local_index, loser_prow;
+            /* ipID[2*j+1] is the destination of a loser row.  Determine the processor
+               row that holds this loser (loser_prow). Also find local_index of this
+               row on that processor row. */
+            Mindxg2lp( local_index, loser_prow, ipID[2*j+1], PANEL->nb, PANEL->nb, 0, nprow );
+            if ( loser_prow == my_prow ) {
+                my_losers[loser_cnts[my_prow]] = local_index;
+            }
+            LtoB[loser_index] = prowindx[loser_prow] + loser_cnts[loser_prow];
+            loser_cnts[loser_prow]++;
+        }
+
+        *my_loser_cnt = loser_cnts[my_prow];
+
+        /* At this point, we're done with ipID, so we can overwrite it */
+
+        for (j=0;  j<jb; j++) {
+            BtoW[WtoB[j]] = j;
+        }
+
+        /* Initialize BtoL array.  The -1 indicates the target position does not 
+           receive a loser. */
+        for (j=0; j<jb; j++) {
+            BtoL[j] = -1;
+        }
+
+        for (j=0; j<jb; j++) {
+            if (LtoB[j]>=0) 
+                BtoL[LtoB[j]] = j + PANEL->ii;
+        }
+    }
+
+    /* Collect the winners in B.  Once we have all the winners, we'll move
+       them into the right positions of U. */
+
+    /* Step 1. Copy the winner rows from the matrix storage into 
+       the appropriate position in B for the Allgatherv. */
+
+    my_row_buffer = &(B[prowindx[my_prow]*ldb]);
+
+#ifdef HPL_CALL_ACCEL
+    (void) HPL_accel_rowget (PANEL, my_row_buffer, ldb,
+                             prow_cnts[my_prow], my_winners, PANEL->jj, NN);
+#else
+    for (j=0; j<prow_cnts[my_prow]; j+=8) {
+        int src[8], dest[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
+        int k, row_cnt = prow_cnts[my_prow]-j;
+        if (row_cnt>8) { row_cnt = 8; }
+        for (k=0; k<row_cnt; k++) {
+            /* Make the local index relative to the trailing part of A */
+            src[k] = my_winners[j+k] - PANEL->ii;
+        }
+        HPL_dlaswp01T(
+            /* Number of rows of A to copy */ row_cnt,
+            /* Number of cols of A to copy */ NN,
+            /* Source of data to copy */ PANEL->A,
+            /* leading dimension of A */ PANEL->lda,
+            /* Target of data copy */ my_row_buffer,
+            /* Leading dimension of U (B) (row major) */ ldb,
+            /* Local row indexes of A to be copied */ src,
+            /* Local row indexes of U (B) to receive the data */ dest );
+        my_row_buffer += row_cnt*ldb;
+    }
+#endif
+
+    /* Step 2. Participate in the Allgatherv to collect the winners
+       into every processor in the column. */
+
+    my_row_buffer = &(B[prowindx[my_prow]*ldb]);
+
+    int displ = 0;
+    for (j=0; j<nprow; j++) {
+        displs[j] = displ;
+        recvcounts[j] = prow_cnts[j]*ldb;
+        displ += recvcounts[j];
+    }
+#ifdef HPL_DETAILED_TIMING
+    MPI_Barrier ( PANEL->grid->col_comm );
+    HPL_ptimer( HPL_TIMING_ALLGATHER );
+#endif
+    MPI_Allgatherv(
+        /* IN  (void*) starting address of send buffer */   my_row_buffer,
+        /* IN  (int) number of elements in send buffer */   recvcounts[my_prow],
+        /* IN  (MPI_Datatype) data type of send buffer elems */ MPI_DOUBLE,
+        /* OUT (void*) address of receive buffer */         B,
+        /* IN  (int*) elems to receive from process[j] */   recvcounts,
+        /* IN  (int*) loc in recv buf to store elems from process[j] */ displs,
+        /* IN  (MPI_DATATYPE) data type of recv buffer elems */ MPI_DOUBLE,
+        /* IN  (MPI_Comm) communicator */                   PANEL->grid->col_comm );
+#ifdef HPL_DETAILED_TIMING
+    HPL_ptimer( HPL_TIMING_ALLGATHER );
+#endif
+
+    /* Step 3. Copy the winners from the Allgatherv buffer (B) into their
+       correct positions in U.  The data in row j of B must be copied to
+       row BtoW[j] of U. */
+
+#ifdef HPL_CALL_ACCEL
+
+    (void) HPL_accel_panputU(PANEL, B, ldb, BtoW, NN);
+
+#else
+    /* HPL_dlaswp03T requires a very odd format for the array of target offsets.
+       Firstly, it is an array of doubles rather than an array of ints.  And
+       secondly it has the same leading dimension as the source array. So we
+       really have no choice but to use an extra column in the source array to
+       store these offsets. */
+
+    for (j=0;  j<jb; j++) {
+        B[NN+j*ldb] = (double)BtoW[j];
+    }
+
+    HPL_dlaswp03T(
+        /* IN  (int) Number of cols (rows) to copy */    jb,
+        /* IN  (int) Length of cols (rows) to copy */    NN,
+        /* INOUT (double *) Target of data copy */       PANEL->U,
+        /* IN  (int) Leading dimension of tgt array */   PANEL->ldu,
+        /* IN  (double *) Array of target offsets */     &B[NN],
+        /* IN  (double *) Source of data to copy */      B,
+        /* IN  (int) Leading dimension of src array */   ldb );
+
+#endif
+
+    /* Now the Allgatherv is done ... so we move on to the Scatterv. */
+
+    /* If we are in the top row of the trailing A */
+    if ( my_prow == PANEL->prow ) {
+
+        /* Step 1. Copy the loser rows from the matrix storage into the
+           appropriate position in B for the Scatterv. */
+
+        /* The data in the local buffer will be in row-major, big
+           endian format. */
+
+#ifdef HPL_CALL_ACCEL
+        (void) HPL_accel_rowget (PANEL, B, ldb,
+                                 jb, BtoL, PANEL->jj, NN );
+#else
+        /* This is a little tricky, since we must skip copying any 
+           rows that are not actually losers.  These are indicated
+           by BtoL[j] == -1. */
+
+        int num_losers = *ipl/2 - jb;
+        j = 0;
+        while (num_losers>0)
+        {
+            int src[8], dest[8];
+            int row_cnt = 0;
+
+            while ( (num_losers>0) && (row_cnt < 8) )
+            {
+                if (BtoL[j] != -1) {
+                    src[row_cnt] = BtoL[j] - PANEL->ii;
+                    dest[row_cnt] = j;
+                    num_losers--;
+                    row_cnt++;
+                }
+                j++;
+            }
+
+            HPL_dlaswp01T(
+                /* Number of rows of A to copy */ row_cnt,
+                /* Number of cols of A to copy */ NN,
+                /* Source of data to copy */ PANEL->A,
+                /* leading dimension of A */ PANEL->lda,
+                /* Target of data copy */ B,
+                /* Leading dimension of U (B) (row major) */ ldb,
+                /* Local row indexes of A to be copied */ src,
+                /* Local row indexes of U (B) to receive the data */ dest );
+        }
+#endif
+    }
+
+    /* Step 2. Scatter the loser rows out to their new home processors */
+
+    my_row_buffer = &(B[prowindx[my_prow]*ldb]);
+
+    /* I'm pretty sure that the displs and recvcounts[j] used for the Allgatherv
+       will be exactly the same for the Scatterv, so just reuse them. */
+
+#ifdef HPL_DETAILED_TIMING
+    MPI_Barrier ( PANEL->grid->col_comm );
+    HPL_ptimer( HPL_TIMING_SCATTER );
+#endif
+    MPI_Scatterv(
+        /* IN  (void*) address of send buffer */               B,
+        /* IN  (int*) elems to send to process [j] */          recvcounts,
+        /* IN  (int*) loc in sendbuf holding elems for process[j] */ displs,
+        /* IN  (MPI_DATATYPE) data type of send buffer elems */ MPI_DOUBLE,
+        /* OUT (void*) address of recv buffer */               my_row_buffer,
+        /* IN  (int) number of elements in recv buffer */      recvcounts[my_prow],
+        /* IN  (MPI_DATATYPE) data type of recv buffer elems */ MPI_DOUBLE,
+        /* IN  (int) rank of sending process */                PANEL->prow,
+        /* IN  (MPI_Comm) communicator */                      PANEL->grid->col_comm );
+#ifdef HPL_DETAILED_TIMING
+    HPL_ptimer( HPL_TIMING_SCATTER );
+#endif
+
+    /* Step 3. Copy the losers from the Scatter buffer into the matrix. */
+
+#ifdef HPL_CALL_ACCEL
+    (void) HPL_accel_rowput (PANEL, my_row_buffer, ldb,
+                             *my_loser_cnt, my_losers, PANEL->jj, NN);
+#else
+    for (j=0; j<*my_loser_cnt; j+=8) {
+        int dst[8], src[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
+        int k, row_cnt = *my_loser_cnt-j;
+        if (row_cnt>8) { row_cnt = 8; }
+        for (k=0; k<row_cnt; k++) {
+            /* Make the local index relative to the trailing part of A */
+            dst[k] = my_losers[j+k] - PANEL->ii;
+        }
+        HPL_dlaswp05T(
+             /* IN  (int) Number of cols (rows) of U (B) to copy */ row_cnt,
+             /* IN  (int) Number of rows (cols) of U (B) to copy */ NN,
+             /* OUT (double*) Target of data copy */ PANEL->A,
+             /* IN  (int) leading dimension of A */ PANEL->lda,
+             /* IN  (double*) Source of data to copy */ my_row_buffer,
+             /* IN  (int) Leading dimension of U (B) (row major) */ ldb,
+             /* IN  (int*) Local row indexes of A to receive the data */ dst,
+             /* IN  (int*) Local col (row) indexes of U (B) to be copied */ src);
+        my_row_buffer += row_cnt*ldb;
+    }
+#endif
+
+#ifdef HPL_USE_HUGE_PAGES
+    if ( vptr ) HPL_hpfree( vptr );
+#else
+    if ( vptr ) free( vptr );
+#endif
+
+#ifdef HPL_DETAILED_TIMING
+    HPL_ptimer( HPL_TIMING_LASWP );
+#endif
+
+    return;
+}
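
The new HPL_pdlaswp03T above reduces row pivoting to two collectives on the
process-column communicator: an MPI_Allgatherv that replicates the winner
rows on every process row, and an MPI_Scatterv (rooted at the panel row)
that delivers the loser rows, reusing the same counts and displacements.
A stripped-down, runnable toy of that exchange -- the per-rank row counts
and the row length are invented, and all of the index bookkeeping (WtoB,
BtoL, and friends) is omitted:

    #include <mpi.h>
    #include <stdio.h>
    #include <stdlib.h>

    int main(int argc, char **argv)
    {
        int rank, nprow, total = 0;

        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        MPI_Comm_size(MPI_COMM_WORLD, &nprow);

        const int NN = 4;                       /* invented row length */
        int *counts = malloc(nprow * sizeof *counts);
        int *displs = malloc(nprow * sizeof *displs);

        /* Every rank derives identical counts/displs, as the real code
           does from prow_cnts/prowindx; rank p contributes p+1 rows. */
        for (int p = 0; p < nprow; p++) {
            counts[p] = (p + 1) * NN;
            displs[p] = total;
            total    += counts[p];
        }

        double *mine = malloc(counts[rank] * sizeof *mine);
        double *B    = malloc(total        * sizeof *B);
        for (int i = 0; i < counts[rank]; i++)
            mine[i] = 100.0 * rank + i;         /* stand-in winner rows */

        /* Gather everyone's winner rows into B on every rank. */
        MPI_Allgatherv(mine, counts[rank], MPI_DOUBLE,
                       B, counts, displs, MPI_DOUBLE, MPI_COMM_WORLD);

        /* The panel row (rank 0 here) scatters loser rows back out,
           reusing the very same counts and displacements. */
        MPI_Scatterv(B, counts, displs, MPI_DOUBLE,
                     mine, counts[rank], MPI_DOUBLE, 0, MPI_COMM_WORLD);

        if (rank == 0)
            printf("exchanged %d doubles across %d ranks\n", total, nprow);

        free(mine); free(B); free(counts); free(displs);
        MPI_Finalize();
        return 0;
    }
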
Index: src/pgesv/HPL_pdupdateNN.c
===================================================================
RCS file: /cvsroot/hpl_qs22/src/pgesv/HPL_pdupdateNN.c,v
retrieving revision 1.1
retrieving revision 1.3
diff -u -r1.1 -r1.3
--- src/pgesv/HPL_pdupdateNN.c	10 Feb 2008 21:45:51 -0000	1.1
+++ src/pgesv/HPL_pdupdateNN.c	26 Aug 2008 13:24:26 -0000	1.3
@@ -44,6 +44,9 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
  * ---------------------------------------------------------------------
  */ 
+/* ------------------------------------------------------------------ */
+/* Modifications (C) Copyright IBM Corporation 2008                   */
+/* ------------------------------------------------------------------ */
 /*
  * Include files
  */
@@ -105,11 +108,10 @@
 #ifdef HPL_CALL_VSIPL
    vsip_mview_d              * Av0, * Av1, * Lv0, * Lv1, * Uv0, * Uv1;
 #endif
-   int                       curr, i, iroff, jb, lda, ldl2, mp, n, nb,
-                             nq0, nn, test;
+   int                       curr, i, iroff, jb, lda, ldl2, ldu, mp, n,
+                             nb, nq0, nn, test;
    static int                tswap = 0;
    static HPL_T_SWAP         fswap = HPL_NO_SWP;
-#define LDU                  jb
 /* ..
  * .. Executable Statements ..
  */
@@ -274,7 +276,7 @@
  */
       nq0 = 0; curr = ( PANEL->grid->myrow == PANEL->prow ? 1 : 0 );
       Aptr = PANEL->A; L2ptr = PANEL->L2;  L1ptr = PANEL->L1;
-      Uptr = PANEL->U; ldl2 = PANEL->ldl2;
+      Uptr = PANEL->U; ldl2 = PANEL->ldl2; ldu = PANEL->ldu;
       mp   = PANEL->mp - ( curr != 0 ? jb : 0 );
 #ifdef HPL_CALL_VSIPL
 /*
@@ -288,7 +290,7 @@
  */
       Av0 = vsip_mbind_d( PANEL->Ablock,  0, 1, lda,  lda,  PANEL->pmat->nq );
       Lv0 = vsip_mbind_d( PANEL->L2block, 0, 1, ldl2, ldl2,              jb );
-      Uv0 = vsip_mbind_d( PANEL->Ublock,  0, 1, LDU,  LDU,                n );
+      Uv0 = vsip_mbind_d( PANEL->Ublock,  0, 1, ldu,  ldu,                n );
 /*
  * Create the matrix subviews
  */
@@ -302,7 +304,7 @@
          nn = n - nq0; nn = Mmin( nb, nn );
 
          HPL_dtrsm( HplColumnMajor, HplLeft,  HplLower, HplNoTrans,
-                    HplUnit, jb, nn, HPL_rone, L1ptr, jb, Uptr, LDU );
+                    HplUnit, jb, nn, HPL_rone, L1ptr, jb, Uptr, ldu );
          if( curr != 0 )
          {
 #ifdef HPL_CALL_VSIPL
@@ -321,10 +323,10 @@
             (void) vsip_mdestroy_d( Uv1 );
 #else
             HPL_dgemm( HplColumnMajor, HplNoTrans, HplNoTrans, mp, nn,
-                       jb, -HPL_rone, L2ptr, ldl2, Uptr, LDU, HPL_rone,
+                       jb, -HPL_rone, L2ptr, ldl2, Uptr, ldu, HPL_rone,
                        Mptr( Aptr, jb, 0, lda ), lda );
 #endif
-            HPL_dlacpy( jb, nn, Uptr, LDU, Aptr, lda );
+            HPL_dlacpy( jb, nn, Uptr, ldu, Aptr, lda );
          }
          else
          {
@@ -344,11 +346,11 @@
             (void) vsip_mdestroy_d( Uv1 );
 #else
             HPL_dgemm( HplColumnMajor, HplNoTrans, HplNoTrans, mp, nn,
-                       jb, -HPL_rone, L2ptr, ldl2, Uptr, LDU, HPL_rone,
+                       jb, -HPL_rone, L2ptr, ldl2, Uptr, ldu, HPL_rone,
                        Aptr, lda );
 #endif
          }
-         Uptr = Mptr( Uptr, 0, nn, LDU );
+         Uptr = Mptr( Uptr, 0, nn, ldu );
          Aptr = Mptr( Aptr, 0, nn, lda ); nq0 += nn;
 
          (void) HPL_bcast( PBCST, &test ); 
@@ -359,7 +361,7 @@
       if( ( nn = n - nq0 ) > 0 )
       {
          HPL_dtrsm( HplColumnMajor, HplLeft,  HplLower, HplNoTrans,
-                    HplUnit, jb, nn, HPL_rone, L1ptr, jb, Uptr, LDU );
+                    HplUnit, jb, nn, HPL_rone, L1ptr, jb, Uptr, ldu );
 
          if( curr != 0 )
          {
@@ -379,10 +381,10 @@
             (void) vsip_mdestroy_d( Uv1 );
 #else
             HPL_dgemm( HplColumnMajor, HplNoTrans, HplNoTrans, mp, nn,
-                       jb, -HPL_rone, L2ptr, ldl2, Uptr, LDU, HPL_rone,
+                       jb, -HPL_rone, L2ptr, ldl2, Uptr, ldu, HPL_rone,
                        Mptr( Aptr, jb, 0, lda ), lda );
 #endif
-            HPL_dlacpy( jb, nn, Uptr, LDU, Aptr, lda );
+            HPL_dlacpy( jb, nn, Uptr, ldu, Aptr, lda );
          }
          else
          {
@@ -402,7 +404,7 @@
             (void) vsip_mdestroy_d( Uv1 );
 #else
             HPL_dgemm( HplColumnMajor, HplNoTrans, HplNoTrans, mp, nn,
-                       jb, -HPL_rone, L2ptr, ldl2, Uptr, LDU, HPL_rone,
+                       jb, -HPL_rone, L2ptr, ldl2, Uptr, ldu, HPL_rone,
                        Aptr, lda );
 #endif
          }
Index: src/pgesv/HPL_pdupdateNT.c
===================================================================
RCS file: /cvsroot/hpl_qs22/src/pgesv/HPL_pdupdateNT.c,v
retrieving revision 1.1
retrieving revision 1.16
diff -u -r1.1 -r1.16
--- src/pgesv/HPL_pdupdateNT.c	10 Feb 2008 21:45:51 -0000	1.1
+++ src/pgesv/HPL_pdupdateNT.c	26 Aug 2008 13:24:26 -0000	1.16
@@ -44,10 +44,16 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
  * ---------------------------------------------------------------------
  */ 
+/* ------------------------------------------------------------------ */
+/* Modifications (C) Copyright IBM Corporation 2008                   */
+/* ------------------------------------------------------------------ */
 /*
  * Include files
  */
 #include "hpl.h"
+extern int panel_prep;
+
+#define OVERLAP_DGEMM_AND_BCAST 1
 
 #ifdef STDC_HEADERS
 void HPL_pdupdateNT
@@ -105,11 +111,10 @@
 #ifdef HPL_CALL_VSIPL
    vsip_mview_d              * Av0, * Av1, * Lv0, * Lv1, * Uv0, * Uv1;
 #endif
-   int                       curr, i, iroff, jb, lda, ldl2, mp, n, nb,
-                             nq0, nn, test;
+   int                       curr, i, iroff, jb, lda, ldl2, ldu, mp, n,
+                             nb, nq0, nn, test;
    static int                tswap = 0;
    static HPL_T_SWAP         fswap = HPL_NO_SWP;
-#define LDU                  n
 /* ..
  * .. Executable Statements ..
  */
@@ -133,18 +138,22 @@
 #endif
       return;
    }
+#ifdef OVERLAP_DGEMM_AND_BCAST 
+   test = HPL_KEEP_TESTING;
+#else
 /*
  * Enable/disable the column panel probing mechanism
  */
    (void) HPL_bcast( PBCST, &test );
+#endif
 /*
  * 1 x Q case
  */
    if( PANEL->grid->nprow == 1 )
    {
-      Aptr = PANEL->A;       L2ptr = PANEL->L2;   L1ptr = PANEL->L1;
-      ldl2 = PANEL->ldl2;    dpiv  = PANEL->DPIV; ipiv  = PANEL->IWORK;
-      mp   = PANEL->mp - jb; iroff = PANEL->ii;   nq0   = 0; 
+      Aptr = PANEL->A;       L2ptr = PANEL->L2;     L1ptr = PANEL->L1;
+      ldl2 = PANEL->ldl2;    dpiv  = PANEL->DPIV;   ipiv  = PANEL->IWORK;
+      mp   = PANEL->mp - jb; iroff = PANEL->ii;     nq0   = 0; 
 #ifdef HPL_CALL_VSIPL
 /*
  * Admit the blocks
@@ -162,6 +171,8 @@
       Lv1 = vsip_msubview_d( Lv0, 0, 0, mp, jb );
 #endif
       for( i = 0; i < jb; i++ ) { ipiv[i] = (int)(dpiv[i]) - iroff; }
+
+#ifndef OVERLAP_DGEMM_AND_BCAST 
 /*
  * So far we have not updated anything -  test availability of the panel
  * to be forwarded - If detected forward it and finish the update in one
@@ -175,11 +186,21 @@
  */
 #ifdef HPL_DETAILED_TIMING
          HPL_ptimer( HPL_TIMING_LASWP );
-         HPL_dlaswp00N( jb, nn, Aptr, lda, ipiv );
-         HPL_ptimer( HPL_TIMING_LASWP );
+#endif
+#ifdef HPL_CALL_ACCEL
+         HPL_accel_swap00N( PANEL, ipiv, nq0, nn );
 #else
          HPL_dlaswp00N( jb, nn, Aptr, lda, ipiv );
 #endif
+#ifdef HPL_DETAILED_TIMING
+         HPL_ptimer( HPL_TIMING_LASWP );
+#endif
+
+#ifdef HPL_CALL_ACCEL
+         HPL_accel_dtrsm(PANEL, nq0, nn);
+
+         HPL_accel_dgemm(PANEL, nq0, nn);
+#else
          HPL_dtrsm( HplColumnMajor, HplLeft, HplLower, HplNoTrans,
                     HplUnit, jb, nn, HPL_rone, L1ptr, jb, Aptr, lda );
 #ifdef HPL_CALL_VSIPL
@@ -197,14 +218,17 @@
          (void) vsip_mdestroy_d( Av1 );
          (void) vsip_mdestroy_d( Uv1 );
 #else
+
          HPL_dgemm( HplColumnMajor, HplNoTrans, HplNoTrans, mp, nn,
                     jb, -HPL_rone, L2ptr, ldl2, Aptr, lda, HPL_rone,
                     Mptr( Aptr, jb, 0, lda ), lda );
 #endif
+#endif
          Aptr = Mptr( Aptr, 0, nn, lda ); nq0 += nn; 
 
          (void) HPL_bcast( PBCST, &test ); 
       }
+#endif  /* #ifndef OVERLAP_DGEMM_AND_BCAST  */
 /*
  * The panel has been forwarded at that point, finish the update
  */
@@ -212,11 +236,31 @@
       {
 #ifdef HPL_DETAILED_TIMING
          HPL_ptimer( HPL_TIMING_LASWP );
+#endif
+#ifdef HPL_CALL_ACCEL
+         HPL_accel_swap00N( PANEL, ipiv, nq0, nn );
+#else
          HPL_dlaswp00N( jb, nn, Aptr, lda, ipiv );
+#endif
+#ifdef HPL_DETAILED_TIMING
          HPL_ptimer( HPL_TIMING_LASWP );
+#endif
+#ifdef HPL_CALL_ACCEL
+         HPL_accel_dtrsm(PANEL, nq0, nn);
+
+#ifdef OVERLAP_DGEMM_AND_BCAST 
+         HPL_accel_dgemm_async(PANEL, nq0, nn);
+
+         if ( PBCST != NULL ) {
+             while( test != HPL_SUCCESS )
+             { (void) HPL_bcast( PBCST, &test ); }
+         }
+
+         HPL_accel_dgemm_wait(PANEL);
 #else
-         HPL_dlaswp00N( jb, nn, Aptr, lda, ipiv );
+         HPL_accel_dgemm(PANEL, nq0, nn);
 #endif
+#else
          HPL_dtrsm( HplColumnMajor, HplLeft, HplLower, HplNoTrans,
                     HplUnit, jb, nn, HPL_rone, L1ptr, jb, Aptr, lda );
 #ifdef HPL_CALL_VSIPL
@@ -238,6 +282,7 @@
                     jb, -HPL_rone, L2ptr, ldl2, Aptr, lda, HPL_rone,
                     Mptr( Aptr, jb, 0, lda ), lda );
 #endif
+#endif
       }
 #ifdef HPL_CALL_VSIPL
 /*
@@ -267,6 +312,8 @@
       if( (   fswap == HPL_SWAP01 ) ||
           ( ( fswap == HPL_SW_MIX ) && ( n > tswap ) ) )
       { HPL_pdlaswp01T( PBCST, &test, PANEL, n ); }
+      else if ( fswap == HPL_SWAP03 )
+      { HPL_pdlaswp03T( PBCST, &test, PANEL, n ); }
       else
       { HPL_pdlaswp00T( PBCST, &test, PANEL, n ); }
 /*
@@ -274,7 +321,7 @@
  */
       nq0 = 0; curr = ( PANEL->grid->myrow == PANEL->prow ? 1 : 0 );
       Aptr = PANEL->A; L2ptr = PANEL->L2;  L1ptr = PANEL->L1;
-      Uptr = PANEL->U; ldl2 = PANEL->ldl2;
+      Uptr = PANEL->U; ldl2 = PANEL->ldl2; ldu = PANEL->ldu;
       mp   = PANEL->mp - ( curr != 0 ? jb : 0 );
 #ifdef HPL_CALL_VSIPL
 /*
@@ -288,12 +335,14 @@
  */ 
       Av0 = vsip_mbind_d( PANEL->Ablock,  0, 1, lda,  lda,  PANEL->pmat->nq );
       Lv0 = vsip_mbind_d( PANEL->L2block, 0, 1, ldl2, ldl2,              jb );
-      Uv0 = vsip_mbind_d( PANEL->Ublock,  0, 1, LDU,  LDU,               jb );
+      Uv0 = vsip_mbind_d( PANEL->Ublock,  0, 1, ldu,  ldu,               jb );
 /*
  * Create the matrix subviews
  */
       Lv1 = vsip_msubview_d( Lv0, 0, 0, mp, jb );
 #endif
+
+#ifndef OVERLAP_DGEMM_AND_BCAST 
 /*
  * Broadcast has not occured yet, spliting the computational part
  */
@@ -302,7 +351,7 @@
          nn = n - nq0; nn = Mmin( nb, nn );
 
          HPL_dtrsm( HplColumnMajor, HplRight, HplLower, HplTrans,
-                    HplUnit, nn, jb, HPL_rone, L1ptr, jb, Uptr, LDU );
+                    HplUnit, nn, jb, HPL_rone, L1ptr, jb, Uptr, ldu );
 
          if( curr != 0 )
          {
@@ -322,10 +371,10 @@
             (void) vsip_mdestroy_d( Uv1 );
 #else
             HPL_dgemm( HplColumnMajor, HplNoTrans, HplTrans, mp, nn,
-                       jb, -HPL_rone, L2ptr, ldl2, Uptr, LDU, HPL_rone,
+                       jb, -HPL_rone, L2ptr, ldl2, Uptr, ldu, HPL_rone,
                        Mptr( Aptr, jb, 0, lda ), lda );
 #endif
-            HPL_dlatcpy( jb, nn, Uptr, LDU, Aptr, lda );
+            HPL_dlatcpy( jb, nn, Uptr, ldu, Aptr, lda );
          }
          else
          {
@@ -345,22 +394,39 @@
             (void) vsip_mdestroy_d( Uv1 );
 #else
             HPL_dgemm( HplColumnMajor, HplNoTrans, HplTrans, mp, nn,
-                       jb, -HPL_rone, L2ptr, ldl2, Uptr, LDU, HPL_rone,
+                       jb, -HPL_rone, L2ptr, ldl2, Uptr, ldu, HPL_rone,
                        Aptr, lda );
 #endif
          }
-         Uptr = Mptr( Uptr, nn, 0, LDU );
+         Uptr = Mptr( Uptr, nn, 0, ldu );
          Aptr = Mptr( Aptr, 0, nn, lda ); nq0 += nn;
 
          (void) HPL_bcast( PBCST, &test ); 
       }
+#endif  /* #ifndef OVERLAP_DGEMM_AND_BCAST  */
 /*
  * The panel has been forwarded at that point, finish the update
  */
       if( ( nn = n - nq0 ) > 0 )
       {
+#ifdef HPL_CALL_ACCEL
+         HPL_accel_dtrsm(PANEL, nq0, nn);
+
+#ifdef OVERLAP_DGEMM_AND_BCAST 
+         HPL_accel_dgemm_async(PANEL, nq0, nn);
+
+         if ( PBCST != NULL ) {
+             while( test != HPL_SUCCESS )
+             { (void) HPL_bcast( PBCST, &test ); }
+         }
+
+         HPL_accel_dgemm_wait(PANEL);
+#else
+         HPL_accel_dgemm(PANEL, nq0, nn);
+#endif
+#else
          HPL_dtrsm( HplColumnMajor, HplRight, HplLower, HplTrans,
-                    HplUnit, nn, jb, HPL_rone, L1ptr, jb, Uptr, LDU );
+                    HplUnit, nn, jb, HPL_rone, L1ptr, jb, Uptr, ldu );
 
          if( curr != 0 )
          {
@@ -380,10 +446,10 @@
             (void) vsip_mdestroy_d( Uv1 );
 #else
             HPL_dgemm( HplColumnMajor, HplNoTrans, HplTrans, mp, nn,
-                       jb, -HPL_rone, L2ptr, ldl2, Uptr, LDU, HPL_rone,
+                       jb, -HPL_rone, L2ptr, ldl2, Uptr, ldu, HPL_rone,
                        Mptr( Aptr, jb, 0, lda ), lda );
 #endif
-            HPL_dlatcpy( jb, nn, Uptr, LDU, Aptr, lda );
+            HPL_dlatcpy( jb, nn, Uptr, ldu, Aptr, lda );
          }
          else
          {
@@ -403,10 +469,11 @@
             (void) vsip_mdestroy_d( Uv1 );
 #else
             HPL_dgemm( HplColumnMajor, HplNoTrans, HplTrans, mp, nn,
-                       jb, -HPL_rone, L2ptr, ldl2, Uptr, LDU, HPL_rone,
+                       jb, -HPL_rone, L2ptr, ldl2, Uptr, ldu, HPL_rone,
                        Aptr, lda );
 #endif
          }
+#endif /* !OVERLAP_DGEMM_AND_BCAST */
       }
 #ifdef HPL_CALL_VSIPL
 /*
@@ -428,7 +495,15 @@
 #endif
    }
 
+#ifdef OVERLAP_DGEMM_AND_BCAST 
+    if ( PBCST != NULL ) {
+        while( test != HPL_SUCCESS )
+        { (void) HPL_bcast( PBCST, &test ); }
+    }
+#endif
+
    PANEL->A = Mptr( PANEL->A, 0, n, lda ); PANEL->nq -= n; PANEL->jj += n;
+   PANEL->ja += n;
 /*
  * return the outcome of the probe  (should always be  HPL_SUCCESS,  the
  * panel broadcast is enforced in that routine).
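
The OVERLAP_DGEMM_AND_BCAST path above launches the trailing update
asynchronously on the accelerator, drives the pending panel broadcast to
completion while the DGEMM runs, and only then waits for the DGEMM.  A toy
rendering of that shape, with a pthread standing in for
HPL_accel_dgemm_async/_wait and a counter standing in for the HPL_bcast
probe (none of this is HPL API; link with -lpthread):

    #include <pthread.h>
    #include <stdio.h>

    static void *dgemm_async(void *arg)      /* stand-in for accel DGEMM */
    {
        double *acc = arg;
        for (int i = 0; i < 1000000; i++) *acc += 1.0;
        return NULL;
    }

    static int bcast_done(void)              /* stand-in for HPL_bcast   */
    {
        static int probes = 0;
        return ++probes >= 5;                /* "completes" on probe 5   */
    }

    int main(void)
    {
        double acc = 0.0;
        pthread_t tid;

        pthread_create(&tid, NULL, dgemm_async, &acc); /* ..._dgemm_async */
        while (!bcast_done())                          /* poll broadcast  */
            ;
        pthread_join(tid, NULL);                       /* ..._dgemm_wait  */
        printf("update finished, acc = %g\n", acc);
        return 0;
    }
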
Index: src/pgesv/HPL_pdupdateTN.c
===================================================================
RCS file: /cvsroot/hpl_qs22/src/pgesv/HPL_pdupdateTN.c,v
retrieving revision 1.1
retrieving revision 1.3
diff -u -r1.1 -r1.3
--- src/pgesv/HPL_pdupdateTN.c	10 Feb 2008 21:45:51 -0000	1.1
+++ src/pgesv/HPL_pdupdateTN.c	26 Aug 2008 13:24:26 -0000	1.3
@@ -44,6 +44,9 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
  * ---------------------------------------------------------------------
  */ 
+/* ------------------------------------------------------------------ */
+/* Modifications (C) Copyright IBM Corporation 2008                   */
+/* ------------------------------------------------------------------ */
 /*
  * Include files
  */
@@ -105,11 +108,10 @@
 #ifdef HPL_CALL_VSIPL
    vsip_mview_d              * Av0, * Av1, * Lv0, * Lv1, * Uv0, * Uv1;
 #endif
-   int                       curr, i, iroff, jb, lda, ldl2, mp, n, nb,
-                             nq0, nn, test;
+   int                       curr, i, iroff, jb, lda, ldl2, ldu, mp, n,
+                             nb, nq0, nn, test;
    static int                tswap = 0;
    static HPL_T_SWAP         fswap = HPL_NO_SWP;
-#define LDU                  jb
 /* ..
  * .. Executable Statements ..
  */
@@ -274,7 +276,7 @@
  */
       nq0 = 0; curr = ( PANEL->grid->myrow == PANEL->prow ? 1 : 0 );
       Aptr = PANEL->A; L2ptr = PANEL->L2;  L1ptr = PANEL->L1;
-      Uptr = PANEL->U; ldl2 = PANEL->ldl2;
+      Uptr = PANEL->U; ldl2 = PANEL->ldl2; ldu = PANEL->ldu;
       mp   = PANEL->mp - ( curr != 0 ? jb : 0 );
 #ifdef HPL_CALL_VSIPL
 /*
@@ -288,7 +290,7 @@
  */
       Av0 = vsip_mbind_d( PANEL->Ablock,  0, 1, lda,  lda,  PANEL->pmat->nq );
       Lv0 = vsip_mbind_d( PANEL->L2block, 0, 1, ldl2, ldl2,              jb );
-      Uv0 = vsip_mbind_d( PANEL->Ublock,  0, 1, LDU,  LDU,                n );
+      Uv0 = vsip_mbind_d( PANEL->Ublock,  0, 1, ldu,  ldu,                n );
 /*
  * Create the matrix subviews
  */
@@ -302,7 +304,7 @@
          nn = n - nq0; nn = Mmin( nb, nn );
 
          HPL_dtrsm( HplColumnMajor, HplLeft,  HplUpper, HplTrans,
-                    HplUnit, jb, nn, HPL_rone, L1ptr, jb, Uptr, LDU );
+                    HplUnit, jb, nn, HPL_rone, L1ptr, jb, Uptr, ldu );
 
          if( curr != 0 )
          {
@@ -322,10 +324,10 @@
             (void) vsip_mdestroy_d( Uv1 );
 #else
             HPL_dgemm( HplColumnMajor, HplNoTrans, HplNoTrans, mp, nn,
-                       jb, -HPL_rone, L2ptr, ldl2, Uptr, LDU, HPL_rone,
+                       jb, -HPL_rone, L2ptr, ldl2, Uptr, ldu, HPL_rone,
                        Mptr( Aptr, jb, 0, lda ), lda );
 #endif
-            HPL_dlacpy( jb, nn, Uptr, LDU, Aptr, lda );
+            HPL_dlacpy( jb, nn, Uptr, ldu, Aptr, lda );
          }
          else
          {
@@ -345,11 +347,11 @@
             (void) vsip_mdestroy_d( Uv1 );
 #else
             HPL_dgemm( HplColumnMajor, HplNoTrans, HplNoTrans, mp, nn,
-                       jb, -HPL_rone, L2ptr, ldl2, Uptr, LDU, HPL_rone,
+                       jb, -HPL_rone, L2ptr, ldl2, Uptr, ldu, HPL_rone,
                        Aptr, lda );
 #endif
          }
-         Uptr = Mptr( Uptr, 0, nn, LDU );
+         Uptr = Mptr( Uptr, 0, nn, ldu );
          Aptr = Mptr( Aptr, 0, nn, lda ); nq0 += nn;
 
          (void) HPL_bcast( PBCST, &test ); 
@@ -360,7 +362,7 @@
       if( ( nn = n - nq0 ) > 0 )
       {
          HPL_dtrsm( HplColumnMajor, HplLeft,  HplUpper, HplTrans,
-                    HplUnit, jb, nn, HPL_rone, L1ptr, jb, Uptr, LDU );
+                    HplUnit, jb, nn, HPL_rone, L1ptr, jb, Uptr, ldu );
 
          if( curr != 0 )
          {
@@ -380,10 +382,10 @@
             (void) vsip_mdestroy_d( Uv1 );
 #else
             HPL_dgemm( HplColumnMajor, HplNoTrans, HplNoTrans, mp, nn,
-                       jb, -HPL_rone, L2ptr, ldl2, Uptr, LDU, HPL_rone,
+                       jb, -HPL_rone, L2ptr, ldl2, Uptr, ldu, HPL_rone,
                        Mptr( Aptr, jb, 0, lda ), lda );
 #endif
-            HPL_dlacpy( jb, nn, Uptr, LDU, Aptr, lda );
+            HPL_dlacpy( jb, nn, Uptr, ldu, Aptr, lda );
          }
          else
          {
@@ -403,7 +405,7 @@
             (void) vsip_mdestroy_d( Uv1 );
 #else
             HPL_dgemm( HplColumnMajor, HplNoTrans, HplNoTrans, mp, nn,
-                       jb, -HPL_rone, L2ptr, ldl2, Uptr, LDU, HPL_rone,
+                       jb, -HPL_rone, L2ptr, ldl2, Uptr, ldu, HPL_rone,
                        Aptr, lda );
 #endif
          }
Index: src/pgesv/HPL_pdupdateTT.c
===================================================================
RCS file: /cvsroot/hpl_qs22/src/pgesv/HPL_pdupdateTT.c,v
retrieving revision 1.1
retrieving revision 1.3
diff -u -r1.1 -r1.3
--- src/pgesv/HPL_pdupdateTT.c	10 Feb 2008 21:45:51 -0000	1.1
+++ src/pgesv/HPL_pdupdateTT.c	26 Aug 2008 13:24:26 -0000	1.3
@@ -44,6 +44,9 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
  * ---------------------------------------------------------------------
  */ 
+/* ------------------------------------------------------------------ */
+/* Modifications (C) Copyright IBM Corporation 2008                   */
+/* ------------------------------------------------------------------ */
 /*
  * Include files
  */
@@ -105,11 +108,10 @@
 #ifdef HPL_CALL_VSIPL
    vsip_mview_d              * Av0, * Av1, * Lv0, * Lv1, * Uv0, * Uv1;
 #endif
-   int                       curr, i, iroff, jb, lda, ldl2, mp, n, nb,
-                             nq0, nn, test;
+   int                       curr, i, iroff, jb, lda, ldl2, ldu, mp, n,
+                             nb, nq0, nn, test;
    static int                tswap = 0;
    static HPL_T_SWAP         fswap = HPL_NO_SWP;
-#define LDU                  n
 /* ..
  * .. Executable Statements ..
  */
@@ -274,7 +276,7 @@
  */
       nq0 = 0; curr = ( PANEL->grid->myrow == PANEL->prow ? 1 : 0 );
       Aptr = PANEL->A; L2ptr = PANEL->L2;  L1ptr = PANEL->L1;
-      Uptr = PANEL->U; ldl2 = PANEL->ldl2;
+      Uptr = PANEL->U; ldl2 = PANEL->ldl2; ldu = PANEL->ldu;
       mp   = PANEL->mp - ( curr != 0 ? jb : 0 );
 #ifdef HPL_CALL_VSIPL
 /*
@@ -288,7 +290,7 @@
  */
       Av0 = vsip_mbind_d( PANEL->Ablock,  0, 1, lda,  lda,  PANEL->pmat->nq );
       Lv0 = vsip_mbind_d( PANEL->L2block, 0, 1, ldl2, ldl2,              jb );
-      Uv0 = vsip_mbind_d( PANEL->Ublock,  0, 1, LDU,  LDU,               jb );
+      Uv0 = vsip_mbind_d( PANEL->Ublock,  0, 1, ldu,  ldu,               jb );
 /*
  * Create the matrix subviews
  */
@@ -302,7 +304,7 @@
          nn = n - nq0; nn = Mmin( nb, nn );
 
          HPL_dtrsm( HplColumnMajor, HplRight, HplUpper, HplNoTrans,
-                    HplUnit, nn, jb, HPL_rone, L1ptr, jb, Uptr, LDU );
+                    HplUnit, nn, jb, HPL_rone, L1ptr, jb, Uptr, ldu );
 
          if( curr != 0 )
          {
@@ -322,10 +324,10 @@
             (void) vsip_mdestroy_d( Uv1 );
 #else
             HPL_dgemm( HplColumnMajor, HplNoTrans, HplTrans, mp, nn,
-                       jb, -HPL_rone, L2ptr, ldl2, Uptr, LDU, HPL_rone,
+                       jb, -HPL_rone, L2ptr, ldl2, Uptr, ldu, HPL_rone,
                        Mptr( Aptr, jb, 0, lda ), lda );
 #endif
-            HPL_dlatcpy( jb, nn, Uptr, LDU, Aptr, lda );
+            HPL_dlatcpy( jb, nn, Uptr, ldu, Aptr, lda );
          }
          else
          {
@@ -345,11 +347,11 @@
             (void) vsip_mdestroy_d( Uv1 );
 #else
             HPL_dgemm( HplColumnMajor, HplNoTrans, HplTrans, mp, nn,
-                       jb, -HPL_rone, L2ptr, ldl2, Uptr, LDU, HPL_rone,
+                       jb, -HPL_rone, L2ptr, ldl2, Uptr, ldu, HPL_rone,
                        Aptr, lda );
 #endif
          }
-         Uptr = Mptr( Uptr, nn, 0, LDU );
+         Uptr = Mptr( Uptr, nn, 0, ldu );
          Aptr = Mptr( Aptr, 0, nn, lda ); nq0 += nn;
 
          (void) HPL_bcast( PBCST, &test ); 
@@ -360,7 +362,7 @@
       if( ( nn = n - nq0 ) > 0 )
       {
          HPL_dtrsm( HplColumnMajor, HplRight, HplUpper, HplNoTrans,
-                    HplUnit, nn, jb, HPL_rone, L1ptr, jb, Uptr, LDU );
+                    HplUnit, nn, jb, HPL_rone, L1ptr, jb, Uptr, ldu );
 
          if( curr != 0 )
          {
@@ -380,10 +382,10 @@
             (void) vsip_mdestroy_d( Uv1 );
 #else
             HPL_dgemm( HplColumnMajor, HplNoTrans, HplTrans, mp, nn,
-                       jb, -HPL_rone, L2ptr, ldl2, Uptr, LDU, HPL_rone,
+                       jb, -HPL_rone, L2ptr, ldl2, Uptr, ldu, HPL_rone,
                        Mptr( Aptr, jb, 0, lda ), lda );
 #endif
-            HPL_dlatcpy( jb, nn, Uptr, LDU, Aptr, lda );
+            HPL_dlatcpy( jb, nn, Uptr, ldu, Aptr, lda );
          }
          else
          {
@@ -403,7 +405,7 @@
             (void) vsip_mdestroy_d( Uv1 );
 #else
             HPL_dgemm( HplColumnMajor, HplNoTrans, HplTrans, mp, nn,
-                       jb, -HPL_rone, L2ptr, ldl2, Uptr, LDU, HPL_rone,
+                       jb, -HPL_rone, L2ptr, ldl2, Uptr, ldu, HPL_rone,
                        Aptr, lda );
 #endif
          }
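
[Reviewer note -- not part of the patch] The hunks above carry the heart of
this change: revision 1.1 hard-wired U's leading dimension with a local
"#define LDU n", so U had to be allocated with stride exactly n. The patch
deletes the macro and reads a per-panel stride, PANEL->ldu, which lets the
allocator pad U -- for accelerator DMA alignment, for instance -- without
breaking the update arithmetic. A minimal sketch of why the stride matters,
assuming HPL's usual column-major addressing (MPTR below is illustrative;
HPL's own Mptr macro is equivalent in spirit):

    #include <stddef.h>

    /* Element (i,j) of a column-major matrix with leading dimension ld
     * lives at A + i + (size_t)j * ld, so stepping past nn columns (or,
     * for the transposed U of the TT variant, nn rows) must use the
     * allocated stride ldu, never the logical width n. */
    #define MPTR( A_, i_, j_, ld_ ) \
       ( (A_) + (size_t)(i_) + (size_t)(j_) * (size_t)(ld_) )

    static double *advance_u( double *Uptr, int nn, int ldu, int trans )
    {
       return trans ? MPTR( Uptr, nn, 0, ldu )   /* transposed U: row step */
                    : MPTR( Uptr, 0, nn, ldu );  /* otherwise: column step */
    }
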
Index: src/pgesv/HPL_rollT.c
===================================================================
RCS file: /cvsroot/hpl_qs22/src/pgesv/HPL_rollT.c,v
retrieving revision 1.1
retrieving revision 1.4
diff -u -r1.1 -r1.4
--- src/pgesv/HPL_rollT.c	10 Feb 2008 21:45:51 -0000	1.1
+++ src/pgesv/HPL_rollT.c	26 Aug 2008 13:24:26 -0000	1.4
@@ -44,6 +44,9 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
  * ---------------------------------------------------------------------
  */ 
+/* ------------------------------------------------------------------ */
+/* Modifications (C) Copyright IBM Corporation 2008                   */
+/* ------------------------------------------------------------------ */
 /*
  * Include files
  */
@@ -137,7 +140,7 @@
 /*
  * .. Local Variables ..
  */
-#if 0
+#ifdef HPL_USE_MPI_DATATYPE
    MPI_Datatype               type[2];
 #endif
    MPI_Status                 status;
@@ -182,7 +185,7 @@
  
       if( lengthR > 0 )
       {
-#if 0
+#ifdef HPL_USE_MPI_DATATYPE
          if( ierr == MPI_SUCCESS )
          {
             if( LDU == N )
@@ -209,7 +212,7 @@
  
       if( lengthS > 0 )
       {
-#if 0
+#ifdef HPL_USE_MPI_DATATYPE
          if( ierr == MPI_SUCCESS )
          {
             if( LDU == N )
@@ -240,7 +243,7 @@
       {
          if( ierr == MPI_SUCCESS )
             ierr =   MPI_Wait( &request, &status );
-#if 0
+#ifdef HPL_USE_MPI_DATATYPE
          if( ierr == MPI_SUCCESS )
             ierr =   MPI_Type_free( &type[I_RECV] );
 #endif
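
[Reviewer note -- not part of the patch] In revision 1.1 the derived-datatype
send/receive path in HPL_rollT.c was dead code under "#if 0"; the patch makes
it selectable at compile time instead of deleting it -- e.g. by passing
-DHPL_USE_MPI_DATATYPE through the HPL_OPTS hook of the arch Make files --
presumably because explicit packing wins on some MPI stacks and derived
datatypes on others. A hedged sketch of what the re-enabled path amounts to;
the helper and its parameters are placeholders, not the routine's actual
locals:

    #include <mpi.h>

    /* Send a jb x N column-major panel stored with leading dimension
     * LDU without an explicit pack: one vector type describes the
     * strided layout.  When LDU == N the buffer is contiguous and a
     * plain (jb * N)-double send would do, which is exactly the
     * distinction the guarded code makes. */
    static int send_strided_panel( double *U, int jb, int N, int LDU,
                                   int partner, int tag, MPI_Comm comm )
    {
       MPI_Datatype type;
       int ierr = MPI_Type_vector( N, jb, LDU, MPI_DOUBLE, &type );
       if( ierr == MPI_SUCCESS ) ierr = MPI_Type_commit( &type );
       if( ierr == MPI_SUCCESS )
          ierr = MPI_Send( U, 1, type, partner, tag, comm );
       if( ierr == MPI_SUCCESS ) ierr = MPI_Type_free( &type );
       return ierr;
    }
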
Index: src/pgesv/HPL_spreadT.c
===================================================================
RCS file: /cvsroot/hpl_qs22/src/pgesv/HPL_spreadT.c,v
retrieving revision 1.1
retrieving revision 1.4
diff -u -r1.1 -r1.4
--- src/pgesv/HPL_spreadT.c	10 Feb 2008 21:45:51 -0000	1.1
+++ src/pgesv/HPL_spreadT.c	26 Aug 2008 13:24:26 -0000	1.4
@@ -44,6 +44,9 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
  * ---------------------------------------------------------------------
  */ 
+/* ------------------------------------------------------------------ */
+/* Modifications (C) Copyright IBM Corporation 2008                   */
+/* ------------------------------------------------------------------ */
 /*
  * Include files
  */
@@ -153,7 +156,7 @@
 /*
  * .. Local Variables ..
  */
-#if 0
+#ifdef HPL_USE_MPI_DATATYPE
    MPI_Datatype              type;
 #endif
    MPI_Status                status;
@@ -194,7 +197,7 @@
 
                if( mydist & ip2 )
                {
-#if 0
+#ifdef HPL_USE_MPI_DATATYPE
                   if( ierr == MPI_SUCCESS )
                   {
                      if( LDU == N )
@@ -224,7 +227,7 @@
                }
                else if( partner < nprow )
                {
-#if 0
+#ifdef HPL_USE_MPI_DATATYPE
                   if( ierr == MPI_SUCCESS )
                   {
                      if( LDU == N )
@@ -293,7 +296,7 @@
 
                if( mydist & ip2 )
                {
-#if 0
+#ifdef HPL_USE_MPI_DATATYPE
                   if( ierr == MPI_SUCCESS )
                   {
                      if( LDU == N )
@@ -323,7 +326,7 @@
                }
                else if( partner < nprow )
                {
-#if 0
+#ifdef HPL_USE_MPI_DATATYPE
                   if( ierr == MPI_SUCCESS )
                   {
                      if( LDU == N )
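
[Reviewer note -- not part of the patch] HPL_spreadT.c receives the same
"#if 0" -> "#ifdef HPL_USE_MPI_DATATYPE" treatment in all four hunks; the
spread algorithm itself is untouched. For orientation, the partner
arithmetic visible in the context lines (mydist & ip2, partner < nprow) is
the classic binomial-tree exchange; a minimal sketch, assuming ip2 walks
powers of two (the real routine's loop bounds and direction may differ):

    #include <stdio.h>

    /* Binomial-tree spread over nprow ranks: at each power-of-two step
     * a rank pairs with (mydist ^ ip2) and the tested bit decides the
     * receiving side.  The real code posts MPI traffic where the
     * printf placeholders stand. */
    static void spread_sketch( int mydist, int nprow )
    {
       for( int ip2 = 1; ip2 < nprow; ip2 <<= 1 )
       {
          int partner = mydist ^ ip2;
          if( mydist & ip2 )
             printf( "dist %d receives from %d\n", mydist, partner );
          else if( partner < nprow )
             printf( "dist %d sends to %d\n", mydist, partner );
       }
    }
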
Index: testing/ptest/HPL.dat
===================================================================
RCS file: /cvsroot/hpl_qs22/testing/ptest/HPL.dat,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- testing/ptest/HPL.dat	10 Feb 2008 21:45:52 -0000	1.1
+++ testing/ptest/HPL.dat	27 Apr 2008 23:55:48 -0000	1.2
@@ -23,7 +23,7 @@
 0            BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM)
 1            # of lookahead depth
 0            DEPTHs (>=0)
-2            SWAP (0=bin-exch,1=long,2=mix)
+2            SWAP (0=bin-exch,1=long,2=mix,3=MPI-coll)
 64           swapping threshold
 0            L1 in (0=transposed,1=no-transposed) form
 0            U  in (0=transposed,1=no-transposed) form
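
[Reviewer note -- not part of the patch] Only the legend changes here; the
shipped value stays 2 (mix). Opting into the new swap algorithm is a
one-character edit to that line:

    3            SWAP (0=bin-exch,1=long,2=mix,3=MPI-coll)

which HPL_pdinfo.c below maps to HPL_SWAP03 and reports as "MPI Collectives",
and whose allgather/scatter phases HPL_pdtest.c now times separately.
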
Index: testing/ptest/HPL_pddriver.c
===================================================================
RCS file: /cvsroot/hpl_qs22/testing/ptest/HPL_pddriver.c,v
retrieving revision 1.1
retrieving revision 1.3
diff -u -r1.1 -r1.3
--- testing/ptest/HPL_pddriver.c	10 Feb 2008 21:45:52 -0000	1.1
+++ testing/ptest/HPL_pddriver.c	26 Aug 2008 13:24:26 -0000	1.3
@@ -44,6 +44,9 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
  * ---------------------------------------------------------------------
  */ 
+/* ------------------------------------------------------------------ */
+/* Modifications (C) Copyright IBM Corporation 2008                   */
+/* ------------------------------------------------------------------ */
 /*
  * Include files
  */
@@ -112,6 +115,9 @@
 #endif
    MPI_Comm_rank( MPI_COMM_WORLD, &rank );
    MPI_Comm_size( MPI_COMM_WORLD, &size );
+#ifdef HPL_CALL_ACCEL
+   HPL_accel_init(rank);
+#endif
 /*
  * Read and check validity of test parameters from input file
  *
@@ -280,6 +286,9 @@
       if( ( test.outfp != stdout ) && ( test.outfp != stderr ) )
          (void) fclose( test.outfp );
    }
+#ifdef HPL_CALL_ACCEL
+   HPL_accel_exit(rank);
+#endif
 #ifdef HPL_CALL_VSIPL
    vsip_finalize((void*)0);
 #endif
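
[Reviewer note -- not part of the patch] HPL_accel_init and HPL_accel_exit
come from the accelerator library this change set links in (ACLlib); the
driver simply brackets the whole run with them, right after MPI is queried
and just before VSIPL finalization. The patch guards each call site with
#ifdef HPL_CALL_ACCEL; an alternative it did not take would be no-op stubs,
sketched here, which would keep the call sites unconditional:

    /* Hypothetical fallback stubs -- NOT in the patch; the real
     * prototypes presumably live in the accel library headers. */
    #ifndef HPL_CALL_ACCEL
    #define HPL_accel_init( rank_ ) ( (void)(rank_) )
    #define HPL_accel_exit( rank_ ) ( (void)(rank_) )
    #endif
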
Index: testing/ptest/HPL_pdinfo.c
===================================================================
RCS file: /cvsroot/hpl_qs22/testing/ptest/HPL_pdinfo.c,v
retrieving revision 1.1
retrieving revision 1.7
diff -u -r1.1 -r1.7
--- testing/ptest/HPL_pdinfo.c	10 Feb 2008 21:45:52 -0000	1.1
+++ testing/ptest/HPL_pdinfo.c	26 Aug 2008 13:24:26 -0000	1.7
@@ -44,6 +44,9 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
  * ---------------------------------------------------------------------
  */ 
+/* ------------------------------------------------------------------ */
+/* Modifications (C) Copyright IBM Corporation 2008                   */
+/* ------------------------------------------------------------------ */
 /*
  * Include files
  */
@@ -368,6 +371,15 @@
                        "Value of NB less than 1" );
             error = 1; goto label_error;
          }
+#ifdef HPL_CALL_ACCEL
+         /* Accelerator is hard-coded for NB=128 */
+         if( NB[ i ] != 128 )
+         {
+            HPL_pwarn( stderr, __LINE__, "HPL_pdinfo", 
+                       "Value of NB must be 128 for hybrid architecture" );
+            error = 1; goto label_error;
+         }
+#endif
       }
 /*
  * Process grids, mapping, (>=1) (P, Q)
@@ -565,13 +577,14 @@
          }
       }
 /*
- * Swapping algorithm (0,1 or 2) (FSWAP)
+ * Swapping algorithm (0,1,2 or 3) (FSWAP)
  */
       (void) fgets( line, HPL_LINE_MAX - 2, infp );
       (void) sscanf( line, "%s", num ); j = atoi( num );
       if(      j == 0 ) *FSWAP = HPL_SWAP00;
       else if( j == 1 ) *FSWAP = HPL_SWAP01;
       else if( j == 2 ) *FSWAP = HPL_SW_MIX;
+      else if( j == 3 ) *FSWAP = HPL_SWAP03;
       else              *FSWAP = HPL_SWAP01;
 /*
  * Swapping threshold (>=0) (TSWAP)
@@ -585,12 +598,30 @@
       (void) fgets( line, HPL_LINE_MAX - 2, infp );
       (void) sscanf( line, "%s", num ); *L1NOTRAN = atoi( num );
       if( ( *L1NOTRAN != 0 ) && ( *L1NOTRAN != 1 ) ) *L1NOTRAN = 0; 
+#ifdef HPL_CALL_ACCEL
+      /* Accelerator code paths currently only implemented for L1 no-transposed */
+      if ( *L1NOTRAN != 1 )
+      {
+         HPL_pwarn( stderr, __LINE__, "HPL_pdinfo", 
+                    "L transposed is not currently supported for hybrid architecture" );
+         error = 1; goto label_error;
+      }
+#endif
 /*
  * U  in (no-)transposed form (0 or 1)
  */
       (void) fgets( line, HPL_LINE_MAX - 2, infp );
       (void) sscanf( line, "%s", num ); *UNOTRAN = atoi( num );
       if( ( *UNOTRAN != 0 ) && ( *UNOTRAN != 1 ) ) *UNOTRAN = 0;
+#ifdef HPL_CALL_ACCEL
+      /* Accelerator code paths currently only implemented for U transposed */
+      if( *UNOTRAN != 0 )
+      {
+         HPL_pwarn( stderr, __LINE__, "HPL_pdinfo", 
+                    "U no-transposed is not currently supported for hybrid architecture" );
+         error = 1; goto label_error;
+      }
+#endif
 /*
  * Equilibration (0=no, 1=yes)
  */
@@ -603,6 +634,11 @@
       (void) fgets( line, HPL_LINE_MAX - 2, infp );
       (void) sscanf( line, "%s", num ); *ALIGN = atoi( num );
       if( *ALIGN <= 0 ) *ALIGN = 4;
+#ifdef HPL_CALL_ACCEL
+      /* Accelerator is hard-coded for ALIGN=64 */
+      *ALIGN = 64; 
+#endif
+
 /*
  * Close input file
  */
@@ -703,6 +739,7 @@
       if(      *FSWAP == HPL_SWAP00 ) iwork[j] = 0;
       else if( *FSWAP == HPL_SWAP01 ) iwork[j] = 1;
       else if( *FSWAP == HPL_SW_MIX ) iwork[j] = 2;
+      else if( *FSWAP == HPL_SWAP03 ) iwork[j] = 3;
       j++;
    }
    (void) HPL_broadcast( (void*)iwork, lwork, HPL_INT, 0,
@@ -746,6 +783,7 @@
       if(      iwork[j] == 0 ) *FSWAP = HPL_SWAP00;
       else if( iwork[j] == 1 ) *FSWAP = HPL_SWAP01;
       else if( iwork[j] == 2 ) *FSWAP = HPL_SW_MIX;
+      else if( iwork[j] == 3 ) *FSWAP = HPL_SWAP03;
       j++;
    }
    if( iwork ) free( iwork );
@@ -766,6 +804,20 @@
       HPL_fprintf( TEST->outfp, "%s%s\n",
                    "======================================",
                    "======================================" );
+#ifdef HPL_CALL_ACCEL
+      HPL_fprintf( TEST->outfp, "%s%s\n",
+                   "======================================",
+                   "======================================" );
+      HPL_fprintf( TEST->outfp, "%s%s\n",
+          "Modified for hybrid architectures -- ",
+          " April 30, 2008" );
+      HPL_fprintf( TEST->outfp, "%s%s\n",
+          "by M. Kistler, J. Gunnels, D. Brokenshire, and B. Benton,  ",
+          "IBM Corporation" );
+      HPL_fprintf( TEST->outfp, "%s%s\n",
+                   "======================================",
+                   "======================================" );
+#endif
 
       HPL_fprintf( TEST->outfp, "\n%s\n",
           "An explanation of the input/output parameters follows:" );
@@ -1061,6 +1113,8 @@
          HPL_fprintf( TEST->outfp, " Spread-roll (long)" );
       else if( *FSWAP == HPL_SW_MIX )
          HPL_fprintf( TEST->outfp, " Mix (threshold = %d)", *TSWAP );
+      else if( *FSWAP == HPL_SWAP03 )
+         HPL_fprintf( TEST->outfp, " MPI Collectives" );
 /*
  * L1 storage form
  */
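
[Reviewer note -- not part of the patch] Taken together, the accel-mode
guards added to HPL_pdinfo.c pin the input file to what the Cell kernels
implement: NB = 128, L1 in no-transposed form, U in transposed form, and
ALIGN silently overridden to 64. ALIGN counts doubles, so 64 x 8 bytes =
512 bytes -- a multiple of the 128-byte granularity Cell DMA favors, though
the comments themselves only say "hard-coded". A compact restatement of the
checks as one hypothetical helper (the patch implements them inline with
HPL_pwarn diagnostics):

    static const char *accel_config_error( int nb, int l1notran,
                                           int unotran )
    {
       if( nb != 128 )     return "NB must be 128";
       if( l1notran != 1 ) return "L1 must be in no-transposed form";
       if( unotran != 0 )  return "U must be in transposed form";
       return 0;                        /* configuration is acceptable */
    }
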
Index: testing/ptest/HPL_pdtest.c
===================================================================
RCS file: /cvsroot/hpl_qs22/testing/ptest/HPL_pdtest.c,v
retrieving revision 1.1
retrieving revision 1.9
diff -u -r1.1 -r1.9
--- testing/ptest/HPL_pdtest.c	10 Feb 2008 21:45:52 -0000	1.1
+++ testing/ptest/HPL_pdtest.c	26 Aug 2008 13:24:26 -0000	1.9
@@ -44,6 +44,9 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
  * ---------------------------------------------------------------------
  */ 
+/* ------------------------------------------------------------------ */
+/* Modifications (C) Copyright IBM Corporation 2008                   */
+/* ------------------------------------------------------------------ */
 /*
  * Include files
  */
@@ -160,7 +163,14 @@
 /*
  * Allocate dynamic memory
  */
-   vptr = (void*)malloc( (ALGO->align + (mat.ld+1)*(mat.nq))*sizeof(double) );
+   size_t mem_align = mat.nb*mat.nb;
+   if ( (mem_align % ALGO->align) != 0 ) mem_align *= ALGO->align;
+   size_t mem_size =  (mem_align + (mat.ld+1)*(((mat.nq+mat.nb-1)/mat.nb)*mat.nb))*sizeof(double);
+#ifdef HPL_USE_HUGE_PAGES
+   vptr = (void*)HPL_hpalloc( mem_size );
+#else
+   vptr = (void*)malloc( mem_size );
+#endif
    info[0] = (vptr == NULL); info[1] = myrow; info[2] = mycol;
    (void) HPL_all_reduce( (void *)(info), 3, HPL_INT, HPL_max,
                           GRID->all_comm );
@@ -176,7 +186,8 @@
 /*
  * generate matrix and right-hand-side, [ A | b ] which is N by N+1.
  */
-   mat.A  = (double *)HPL_PTR( vptr,
+   double *xptr  = (double *)HPL_PTR( vptr, mem_align * sizeof(double) );
+   mat.A  = (double *)HPL_PTR( xptr,
                                ((size_t)(ALGO->align) * sizeof(double) ) );
    mat.X  = mat.A + (mat.ld * mat.nq);
    HPL_pdmatgen( GRID, N, N+1, NB, mat.A, mat.ld, HPL_ISEED );
@@ -288,6 +299,27 @@
                       "+ Max aggregated wall time laswp . . : %18.2f\n",
                       HPL_w[HPL_TIMING_LASWP-HPL_TIMING_BEG] );
 /*
+ * Swap allgather time
+ */
+      if( HPL_w[HPL_TIMING_ALLGATHER-HPL_TIMING_BEG] > HPL_rzero )
+         HPL_fprintf( TEST->outfp,
+                      "+ + Max aggregated wall time allgather:%18.2f\n",
+                      HPL_w[HPL_TIMING_ALLGATHER-HPL_TIMING_BEG] );
+/*
+ * Swap scatter time
+ */
+      if( HPL_w[HPL_TIMING_SCATTER-HPL_TIMING_BEG] > HPL_rzero )
+         HPL_fprintf( TEST->outfp,
+                      "+ + Max aggregated wall time scatter : %18.2f\n",
+                      HPL_w[HPL_TIMING_SCATTER-HPL_TIMING_BEG] );
+/*
+ * Accelerator overhead (setup & cleanup)
+ */
+      if( HPL_w[HPL_TIMING_ACCEL_OVERHEAD-HPL_TIMING_BEG] > HPL_rzero )
+         HPL_fprintf( TEST->outfp,
+                      "Max aggregated wall time accel ovhd  : %18.2f\n",
+                      HPL_w[HPL_TIMING_ACCEL_OVERHEAD-HPL_TIMING_BEG] );
+/*
  * Upper triangular system solve
  */
       if( HPL_w[HPL_TIMING_PTRSV-HPL_TIMING_BEG] > HPL_rzero )
@@ -305,7 +337,11 @@
  * Quick return, if I am not interested in checking the computations
  */
    if( TEST->thrsh <= HPL_rzero )
+#ifdef HPL_USE_HUGE_PAGES
+   { (TEST->kpass)++; if( vptr ) HPL_hpfree( vptr ); return; }
+#else
    { (TEST->kpass)++; if( vptr ) free( vptr ); return; }
+#endif
 /*
  * Check info returned by solve
  */
@@ -315,7 +351,11 @@
          HPL_pwarn( TEST->outfp, __LINE__, "HPL_pdtest", "%s %d, %s", 
                     "Error code returned by solve is", mat.info, "skip" );
       (TEST->kskip)++;
+#ifdef HPL_USE_HUGE_PAGES
+      if( vptr ) HPL_hpfree( vptr ); return;
+#else
       if( vptr ) free( vptr ); return;
+#endif
    }
 /*
  * Check computation, re-generate [ A | b ], compute norm 1 and inf of A and x,
@@ -404,7 +444,11 @@
          "||x||_1  . . . . . . . . . . . . . . . . . . . = ", Xnorm1 );
       }
    }
+#ifdef HPL_USE_HUGE_PAGES
+   if( vptr ) HPL_hpfree( vptr );
+#else
    if( vptr ) free( vptr );
+#endif
 /*
  * End of HPL_pdtest
  */
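
[Reviewer note -- not part of the patch] The allocation rework in
HPL_pdtest.c does three things: rounds nq up to whole NB-wide panels,
reserves an NB*NB-sized (and, if necessary, ALIGN-inflated) region in front
of A, and optionally draws the block from huge pages via HPL_hpalloc /
HPL_hpfree, whose implementation lives elsewhere in this change set. With
the accel defaults NB = 128 and ALIGN = 64: 128 * 128 = 16384 doubles and
16384 mod 64 = 0, so mem_align stays 16384 doubles (128 KiB); the
"mem_align *= ALGO->align" branch only fires when NB*NB is not already a
multiple of ALIGN. The three new timing printouts (allgather, scatter, accel
overhead) pair with the HPL_SWAP03 path and the accel hooks above. A hedged
sketch of the sizing plus one plausible huge-page allocator (MAP_HUGETLB is
a later-kernel convenience; code of this vintage more likely mapped a file
on a hugetlbfs mount):

    #define _GNU_SOURCE
    #include <stddef.h>
    #include <sys/mman.h>

    /* Sizing sketch mirroring the patch: pad nq to whole NB panels and
     * prepend an alignment region of at least nb*nb doubles. */
    static size_t accel_mem_size( int ld, int nq, int nb, int align )
    {
       size_t mem_align = (size_t)nb * (size_t)nb;
       if( mem_align % (size_t)align != 0 ) mem_align *= (size_t)align;
       size_t nq_pad = (size_t)( ( nq + nb - 1 ) / nb ) * (size_t)nb;
       return ( mem_align + (size_t)( ld + 1 ) * nq_pad )
              * sizeof( double );
    }

    /* Hypothetical huge-page allocator in the spirit of HPL_hpalloc: */
    static void *hpalloc_sketch( size_t bytes )
    {
       void *p = mmap( NULL, bytes, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0 );
       return ( p == MAP_FAILED ) ? 0 : p;
    }
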
