Skip to content

Commit

Permalink
8319872: AArch64: [vectorapi] Implementation of unsigned (zero extend…
Browse files Browse the repository at this point in the history
…ed) casts

The Vector API defines zero-extend operations [1], which C2 intrinsifies
into `VectorUCastNode` nodes. This patch adds the backend implementation
of `VectorUCastNode` for AArch64.

The microbenchmark shows a significant performance improvement. On my
test machine (SVE, 256-bit), the results are as follows:

  Benchmark                     Before     After       Units   Gain
  VectorZeroExtend.byte2Int     3168.251   243012.399  ops/ms  75.70
  VectorZeroExtend.byte2Long    3212.201   216291.588  ops/ms  66.33
  VectorZeroExtend.byte2Short   3391.968   182655.365  ops/ms  52.85
  VectorZeroExtend.int2Long     1012.197    80448.553  ops/ms  78.48
  VectorZeroExtend.short2Int    1812.471   153416.828  ops/ms  83.65
  VectorZeroExtend.short2Long   1788.382   129794.814  ops/ms  71.58

On other NEON systems, we get a similar performance boost because the
intrinsification now succeeds.

Since `VectorUCastNode` is currently only used for the Vector API's zero
extension, this patch also adds assertions to the node definitions to
clarify their usage.

[TEST]
compiler/vectorapi and jdk/incubator/vector passed on NEON and SVE
machines.

[1] https://github.com/openjdk/jdk/blob/master/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/VectorOperators.java#L726

Change-Id: I10770759f158975ead1eecd3fb63280e563ed5e2
  • Loading branch information
Eric Liu committed Nov 15, 2023
1 parent 4e8c036 commit 2855229
Show file tree
Hide file tree
Showing 7 changed files with 376 additions and 39 deletions.
63 changes: 63 additions & 0 deletions src/hotspot/cpu/aarch64/aarch64_vector.ad
Original file line number Diff line number Diff line change
Expand Up @@ -3748,6 +3748,69 @@ instruct reinterpret_resize_gt128b(vReg dst, vReg src, pReg ptmp, rFlagsReg cr)
ins_pipe(pipe_slow);
%}

// ---------------------------- Vector zero extend --------------------------------

// Unsigned (zero-extending) vector cast from byte lanes to short, int or
// long lanes. Selected for the VectorUCastB2X ideal node.
instruct vzeroExtBtoX(vReg dst, vReg src) %{
match(Set dst (VectorUCastB2X src));
format %{ "vzeroExtBtoX $dst, $src" %}
ins_encode %{
// Element type of the result vector; only integral widening is accepted.
BasicType bt = Matcher::vector_element_basic_type(this);
assert(bt == T_SHORT || bt == T_INT || bt == T_LONG, "must be");
// Byte length of the result vector; it decides between the NEON and SVE paths.
uint length_in_bytes = Matcher::vector_length_in_bytes(this);
if (VM_Version::use_neon_for_vector(length_in_bytes)) {
// 4B to 4S/4I, 8B to 8S
__ neon_vector_extend($dst$$FloatRegister, bt, length_in_bytes,
$src$$FloatRegister, T_BYTE, /* is_unsigned */ true);
} else {
// Anything NEON does not take must be handled by SVE.
assert(UseSVE > 0, "must be sve");
// Translate the result element type into the SVE element-size variant.
Assembler::SIMD_RegVariant size = __ elemType_to_regVariant(bt);
__ sve_vector_extend($dst$$FloatRegister, size,
$src$$FloatRegister, __ B, /* is_unsigned */ true);
}
%}
ins_pipe(pipe_slow);
%}

// Unsigned (zero-extending) vector cast from short lanes to int or long
// lanes. Selected for the VectorUCastS2X ideal node.
instruct vzeroExtStoX(vReg dst, vReg src) %{
match(Set dst (VectorUCastS2X src));
format %{ "vzeroExtStoX $dst, $src" %}
ins_encode %{
// Element type of the result vector; only integral widening is accepted.
BasicType bt = Matcher::vector_element_basic_type(this);
assert(bt == T_INT || bt == T_LONG, "must be");
// Byte length of the result vector; it decides between the NEON and SVE paths.
uint length_in_bytes = Matcher::vector_length_in_bytes(this);
if (VM_Version::use_neon_for_vector(length_in_bytes)) {
// 4S to 4I
// NOTE(review): the destination type is hard-coded to T_INT here although
// the assert above also admits T_LONG. Presumably a T_LONG result never
// takes the NEON path - confirm against the supported vector sizes.
__ neon_vector_extend($dst$$FloatRegister, T_INT, length_in_bytes,
$src$$FloatRegister, T_SHORT, /* is_unsigned */ true);
} else {
// Anything NEON does not take must be handled by SVE.
assert(UseSVE > 0, "must be sve");
// Translate the result element type into the SVE element-size variant.
Assembler::SIMD_RegVariant size = __ elemType_to_regVariant(bt);
__ sve_vector_extend($dst$$FloatRegister, size,
$src$$FloatRegister, __ H, /* is_unsigned */ true);
}
%}
ins_pipe(pipe_slow);
%}

// Unsigned (zero-extending) vector cast from int lanes to long lanes.
// Selected for the VectorUCastI2X ideal node.
instruct vzeroExtItoX(vReg dst, vReg src) %{
match(Set dst (VectorUCastI2X src));
format %{ "vzeroExtItoX $dst, $src" %}
ins_encode %{
// Long is the only result element type accepted for an int source.
assert(Matcher::vector_element_basic_type(this) == T_LONG, "must be");
// Byte length of the result vector; it decides between the NEON and SVE paths.
uint length_in_bytes = Matcher::vector_length_in_bytes(this);
if (VM_Version::use_neon_for_vector(length_in_bytes)) {
// 2I to 2L
__ neon_vector_extend($dst$$FloatRegister, T_LONG, length_in_bytes,
$src$$FloatRegister, T_INT, /* is_unsigned */ true);
} else {
// Anything NEON does not take must be handled by SVE.
assert(UseSVE > 0, "must be sve");
// Source lanes are S sized and result lanes are D sized.
__ sve_vector_extend($dst$$FloatRegister, __ D,
$src$$FloatRegister, __ S, /* is_unsigned */ true);
}
%}
ins_pipe(pipe_slow);
%}

// ------------------------------ Vector cast ----------------------------------

// VectorCastB2X
Expand Down
63 changes: 63 additions & 0 deletions src/hotspot/cpu/aarch64/aarch64_vector_ad.m4
Original file line number Diff line number Diff line change
Expand Up @@ -2320,6 +2320,69 @@ instruct reinterpret_resize_gt128b(vReg dst, vReg src, pReg ptmp, rFlagsReg cr)
ins_pipe(pipe_slow);
%}

// ---------------------------- Vector zero extend --------------------------------

// Unsigned (zero-extending) vector cast from byte lanes to short, int or
// long lanes. Selected for the VectorUCastB2X ideal node.
instruct vzeroExtBtoX(vReg dst, vReg src) %{
match(Set dst (VectorUCastB2X src));
format %{ "vzeroExtBtoX $dst, $src" %}
ins_encode %{
// Element type of the result vector; only integral widening is accepted.
BasicType bt = Matcher::vector_element_basic_type(this);
assert(bt == T_SHORT || bt == T_INT || bt == T_LONG, "must be");
// Byte length of the result vector; it decides between the NEON and SVE paths.
uint length_in_bytes = Matcher::vector_length_in_bytes(this);
if (VM_Version::use_neon_for_vector(length_in_bytes)) {
// 4B to 4S/4I, 8B to 8S
__ neon_vector_extend($dst$$FloatRegister, bt, length_in_bytes,
$src$$FloatRegister, T_BYTE, /* is_unsigned */ true);
} else {
// Anything NEON does not take must be handled by SVE.
assert(UseSVE > 0, "must be sve");
// Translate the result element type into the SVE element-size variant.
Assembler::SIMD_RegVariant size = __ elemType_to_regVariant(bt);
__ sve_vector_extend($dst$$FloatRegister, size,
$src$$FloatRegister, __ B, /* is_unsigned */ true);
}
%}
ins_pipe(pipe_slow);
%}

// Unsigned (zero-extending) vector cast from short lanes to int or long
// lanes. Selected for the VectorUCastS2X ideal node.
instruct vzeroExtStoX(vReg dst, vReg src) %{
match(Set dst (VectorUCastS2X src));
format %{ "vzeroExtStoX $dst, $src" %}
ins_encode %{
// Element type of the result vector; only integral widening is accepted.
BasicType bt = Matcher::vector_element_basic_type(this);
assert(bt == T_INT || bt == T_LONG, "must be");
// Byte length of the result vector; it decides between the NEON and SVE paths.
uint length_in_bytes = Matcher::vector_length_in_bytes(this);
if (VM_Version::use_neon_for_vector(length_in_bytes)) {
// 4S to 4I
// NOTE(review): the destination type is hard-coded to T_INT here although
// the assert above also admits T_LONG. Presumably a T_LONG result never
// takes the NEON path - confirm against the supported vector sizes.
__ neon_vector_extend($dst$$FloatRegister, T_INT, length_in_bytes,
$src$$FloatRegister, T_SHORT, /* is_unsigned */ true);
} else {
// Anything NEON does not take must be handled by SVE.
assert(UseSVE > 0, "must be sve");
// Translate the result element type into the SVE element-size variant.
Assembler::SIMD_RegVariant size = __ elemType_to_regVariant(bt);
__ sve_vector_extend($dst$$FloatRegister, size,
$src$$FloatRegister, __ H, /* is_unsigned */ true);
}
%}
ins_pipe(pipe_slow);
%}

// Unsigned (zero-extending) vector cast from int lanes to long lanes.
// Selected for the VectorUCastI2X ideal node.
instruct vzeroExtItoX(vReg dst, vReg src) %{
match(Set dst (VectorUCastI2X src));
format %{ "vzeroExtItoX $dst, $src" %}
ins_encode %{
// Long is the only result element type accepted for an int source.
assert(Matcher::vector_element_basic_type(this) == T_LONG, "must be");
// Byte length of the result vector; it decides between the NEON and SVE paths.
uint length_in_bytes = Matcher::vector_length_in_bytes(this);
if (VM_Version::use_neon_for_vector(length_in_bytes)) {
// 2I to 2L
__ neon_vector_extend($dst$$FloatRegister, T_LONG, length_in_bytes,
$src$$FloatRegister, T_INT, /* is_unsigned */ true);
} else {
// Anything NEON does not take must be handled by SVE.
assert(UseSVE > 0, "must be sve");
// Source lanes are S sized and result lanes are D sized.
__ sve_vector_extend($dst$$FloatRegister, __ D,
$src$$FloatRegister, __ S, /* is_unsigned */ true);
}
%}
ins_pipe(pipe_slow);
%}

// ------------------------------ Vector cast ----------------------------------

// VectorCastB2X
Expand Down
45 changes: 28 additions & 17 deletions src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1337,29 +1337,33 @@ void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister
subw(dst, rscratch1, dst);
}

typedef void (C2_MacroAssembler::* xtl_insn)(FloatRegister Vd, Assembler::SIMD_Arrangement Ta,
FloatRegister Vn, Assembler::SIMD_Arrangement Tb);

// Extend integer vector src to dst with the same lane count
// but larger element size, e.g. 4B -> 4I
void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
FloatRegister src, BasicType src_bt) {
FloatRegister src, BasicType src_bt, bool is_unsigned) {
xtl_insn ext = is_unsigned ? &C2_MacroAssembler::uxtl : &C2_MacroAssembler::sxtl;
if (src_bt == T_BYTE) {
if (dst_bt == T_SHORT) {
// 4B/8B to 4S/8S
assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
sxtl(dst, T8H, src, T8B);
(this->*ext)(dst, T8H, src, T8B);
} else {
// 4B to 4I
assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
sxtl(dst, T8H, src, T8B);
sxtl(dst, T4S, dst, T4H);
(this->*ext)(dst, T8H, src, T8B);
(this->*ext)(dst, T4S, dst, T4H);
}
} else if (src_bt == T_SHORT) {
// 4S to 4I
assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
sxtl(dst, T4S, src, T4H);
(this->*ext)(dst, T4S, src, T4H);
} else if (src_bt == T_INT) {
// 2I to 2L
assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
sxtl(dst, T2D, src, T2S);
(this->*ext)(dst, T2D, src, T2S);
} else {
ShouldNotReachHere();
}
Expand Down Expand Up @@ -1392,35 +1396,42 @@ void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
}
}

typedef void (C2_MacroAssembler::* unpklo_insn)(FloatRegister Zd, Assembler::SIMD_RegVariant T,
FloatRegister Zn);

void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
FloatRegister src, SIMD_RegVariant src_size) {
FloatRegister src, SIMD_RegVariant src_size,
bool is_unsigned) {
assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");

unpklo_insn unpklo = is_unsigned ? &C2_MacroAssembler::sve_uunpklo : &C2_MacroAssembler::sve_sunpklo;

if (src_size == B) {
switch (dst_size) {
case H:
sve_sunpklo(dst, H, src);
(this->*unpklo)(dst, H, src);
break;
case S:
sve_sunpklo(dst, H, src);
sve_sunpklo(dst, S, dst);
(this->*unpklo)(dst, H, src);
(this->*unpklo)(dst, S, dst);
break;
case D:
sve_sunpklo(dst, H, src);
sve_sunpklo(dst, S, dst);
sve_sunpklo(dst, D, dst);
(this->*unpklo)(dst, H, src);
(this->*unpklo)(dst, S, dst);
(this->*unpklo)(dst, D, dst);
break;
default:
ShouldNotReachHere();
}
} else if (src_size == H) {
if (dst_size == S) {
sve_sunpklo(dst, S, src);
(this->*unpklo)(dst, S, src);
} else { // D
sve_sunpklo(dst, S, src);
sve_sunpklo(dst, D, dst);
(this->*unpklo)(dst, S, src);
(this->*unpklo)(dst, D, dst);
}
} else if (src_size == S) {
sve_sunpklo(dst, D, src);
(this->*unpklo)(dst, D, src);
}
}

Expand Down
4 changes: 2 additions & 2 deletions src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -94,13 +94,13 @@

// Vector cast
void neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
FloatRegister src, BasicType src_bt);
FloatRegister src, BasicType src_bt, bool is_unsigned = false);

void neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes);

void sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
FloatRegister src, SIMD_RegVariant src_size);
FloatRegister src, SIMD_RegVariant src_size, bool is_unsigned = false);

void sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
FloatRegister src, SIMD_RegVariant src_size, FloatRegister tmp);
Expand Down
42 changes: 25 additions & 17 deletions src/hotspot/share/opto/vectornode.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1614,26 +1614,31 @@ class VectorCastD2XNode : public VectorCastNode {
virtual int Opcode() const;
};

class RoundVFNode : public VectorNode {
class VectorCastHF2FNode : public VectorCastNode {
public:
RoundVFNode(Node* in, const TypeVect* vt) :VectorNode(in, vt) {
assert(in->bottom_type()->is_vect()->element_basic_type() == T_FLOAT, "must be float");
VectorCastHF2FNode(Node* in, const TypeVect* vt) : VectorCastNode(in, vt) {
assert(in->bottom_type()->is_vect()->element_basic_type() == T_SHORT, "must be short");
}
virtual int Opcode() const;
};

class VectorUCastB2XNode : public VectorCastNode {
class VectorCastF2HFNode : public VectorCastNode {
public:
VectorUCastB2XNode(Node* in, const TypeVect* vt) : VectorCastNode(in, vt) {
assert(in->bottom_type()->is_vect()->element_basic_type() == T_BYTE, "must be byte");
VectorCastF2HFNode(Node* in, const TypeVect* vt) : VectorCastNode(in, vt) {
assert(in->bottom_type()->is_vect()->element_basic_type() == T_FLOAT, "must be float");
}
virtual int Opcode() const;
};

class RoundVDNode : public VectorNode {
// So far, VectorUCastNode can only be used in Vector API unsigned extensions
// between integral types. E.g., extending byte to float is not supported now.
class VectorUCastB2XNode : public VectorCastNode {
public:
RoundVDNode(Node* in, const TypeVect* vt) : VectorNode(in, vt) {
assert(in->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE, "must be double");
VectorUCastB2XNode(Node* in, const TypeVect* vt) : VectorCastNode(in, vt) {
assert(in->bottom_type()->is_vect()->element_basic_type() == T_BYTE, "must be byte");
assert(vt->element_basic_type() == T_SHORT ||
vt->element_basic_type() == T_INT ||
vt->element_basic_type() == T_LONG, "must be");
}
virtual int Opcode() const;
};
Expand All @@ -1642,30 +1647,33 @@ class VectorUCastS2XNode : public VectorCastNode {
public:
VectorUCastS2XNode(Node* in, const TypeVect* vt) : VectorCastNode(in, vt) {
assert(in->bottom_type()->is_vect()->element_basic_type() == T_SHORT, "must be short");
assert(vt->element_basic_type() == T_INT ||
vt->element_basic_type() == T_LONG, "must be");
}
virtual int Opcode() const;
};

class VectorCastHF2FNode : public VectorCastNode {
class VectorUCastI2XNode : public VectorCastNode {
public:
VectorCastHF2FNode(Node* in, const TypeVect* vt) : VectorCastNode(in, vt) {
assert(in->bottom_type()->is_vect()->element_basic_type() == T_SHORT, "must be short");
VectorUCastI2XNode(Node* in, const TypeVect* vt) : VectorCastNode(in, vt) {
assert(in->bottom_type()->is_vect()->element_basic_type() == T_INT, "must be int");
assert(vt->element_basic_type() == T_LONG, "must be");
}
virtual int Opcode() const;
};

class VectorCastF2HFNode : public VectorCastNode {
class RoundVFNode : public VectorNode {
public:
VectorCastF2HFNode(Node* in, const TypeVect* vt) : VectorCastNode(in, vt) {
RoundVFNode(Node* in, const TypeVect* vt) :VectorNode(in, vt) {
assert(in->bottom_type()->is_vect()->element_basic_type() == T_FLOAT, "must be float");
}
virtual int Opcode() const;
};

class VectorUCastI2XNode : public VectorCastNode {
class RoundVDNode : public VectorNode {
public:
VectorUCastI2XNode(Node* in, const TypeVect* vt) : VectorCastNode(in, vt) {
assert(in->bottom_type()->is_vect()->element_basic_type() == T_INT, "must be int");
RoundVDNode(Node* in, const TypeVect* vt) : VectorNode(in, vt) {
assert(in->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE, "must be double");
}
virtual int Opcode() const;
};
Expand Down
Loading

0 comments on commit 2855229

Please sign in to comment.