RAID/s390: add SIMD implementation for raid6 gen/xor

Using vector registers is slightly faster:

raid6: vx128x8  gen() 19705 MB/s
raid6: vx128x8  xor() 11886 MB/s
raid6: using algorithm vx128x8 gen() 19705 MB/s
raid6: .... xor() 11886 MB/s, rmw enabled

vs the software algorithms:

raid6: int64x1  gen()  3018 MB/s
raid6: int64x1  xor()  1429 MB/s
raid6: int64x2  gen()  4661 MB/s
raid6: int64x2  xor()  3143 MB/s
raid6: int64x4  gen()  5392 MB/s
raid6: int64x4  xor()  3509 MB/s
raid6: int64x8  gen()  4441 MB/s
raid6: int64x8  xor()  3207 MB/s
raid6: using algorithm int64x4 gen() 5392 MB/s
raid6: .... xor() 3509 MB/s, rmw enabled

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile
index 3b10a48..667b960 100644
--- a/lib/raid6/Makefile
+++ b/lib/raid6/Makefile
@@ -7,6 +7,7 @@
 raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o
 raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o
 raid6_pq-$(CONFIG_TILEGX) += tilegx8.o
+raid6_pq-$(CONFIG_S390) += s390vx8.o
 
 hostprogs-y	+= mktables
 
@@ -116,6 +117,11 @@
 $(obj)/tilegx8.c:   $(src)/tilegx.uc $(src)/unroll.awk FORCE
 	$(call if_changed,unroll)
 
+targets += s390vx8.c
+$(obj)/s390vx8.c:   UNROLL := 8
+$(obj)/s390vx8.c:   $(src)/s390vx.uc $(src)/unroll.awk FORCE
+	$(call if_changed,unroll)
+
 quiet_cmd_mktable = TABLE   $@
       cmd_mktable = $(obj)/mktables > $@ || ( rm -f $@ && exit 1 )