8-bit floating-point quarter-tile sum of outer products to half-precision, accumulating
This instruction generates four independent quarter-tile 8-bit floating-point sums of outer products from the sub-matrices in the half-vectors of the one or two first and second source vectors and accumulates the results to the corresponding elements of a 16-bit element ZA tile.
Each of the quarter-tile sums of outer products is generated by multiplying the SVLH÷2 × 2 sub-matrix of 8-bit floating-point values held in the half-vectors of the first source vectors by the 2 × SVLH÷2 sub-matrix of 8-bit floating-point values held in the half-vectors of the second source vectors. Each 16-bit container of the first source vectors holds 2 elements of each row of a SVLH÷2 × 2 sub-matrix. Similarly, each 16-bit container of the second source vectors holds 2 elements of each column of a 2 × SVLH÷2 sub-matrix.
This instruction widens the sub-matrices of 8-bit floating-point values held in the first source vectors to half-precision values and multiplies them by the corresponding widened sub-matrices of 8-bit floating-point values in the second source vectors to half-precision values. The resulting quarter-tile SVLH÷2 × SVLH÷2 half-precision sums of outer products are scaled by 2-UInt(FPMR.LSCALE[3:0]), before being destructively added to the half-precision destination tile. This is equivalent to performing a downscaled 2-way dot product and accumulate to each of the destination tile elements.
The 8-bit floating-point encoding format for the elements of the first source vector and the second source vector is selected by FPMR.F8S1 and FPMR.F8S2 respectively.
This instruction is unpredicated.
It has encodings from 4 classes: Single and multiple vectors , Single vectors , Multiple and single vectors and Multiple vectors
| 31 | 30 | 29 | 28 | 27 | 26 | 25 | 24 | 23 | 22 | 21 | 20 | 19 | 18 | 17 | 16 | 15 | 14 | 13 | 12 | 11 | 10 | 9 | 8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 |
| 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | Zm | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Zn | 0 | 0 | 1 | 0 | 0 | ZAda | ||||
| M | N | ||||||||||||||||||||||||||||||
if !IsFeatureImplemented(FEAT_SME_MOP4) || !IsFeatureImplemented(FEAT_SME_F8F16) then EndOfDecode(Decode_UNDEF); end; let n : integer = UInt('0'::Zn::'0'); let m : integer = UInt('1'::Zm::'0'); let nreg : integer{} = 1; let mreg : integer = 2; let da : integer = UInt(ZAda);
| 31 | 30 | 29 | 28 | 27 | 26 | 25 | 24 | 23 | 22 | 21 | 20 | 19 | 18 | 17 | 16 | 15 | 14 | 13 | 12 | 11 | 10 | 9 | 8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 |
| 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | Zm | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Zn | 0 | 0 | 1 | 0 | 0 | ZAda | ||||
| M | N | ||||||||||||||||||||||||||||||
if !IsFeatureImplemented(FEAT_SME_MOP4) || !IsFeatureImplemented(FEAT_SME_F8F16) then EndOfDecode(Decode_UNDEF); end; let n : integer = UInt('0'::Zn::'0'); let m : integer = UInt('1'::Zm::'0'); let nreg : integer{} = 1; let mreg : integer = 1; let da : integer = UInt(ZAda);
| 31 | 30 | 29 | 28 | 27 | 26 | 25 | 24 | 23 | 22 | 21 | 20 | 19 | 18 | 17 | 16 | 15 | 14 | 13 | 12 | 11 | 10 | 9 | 8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 |
| 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | Zm | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | Zn | 0 | 0 | 1 | 0 | 0 | ZAda | ||||
| M | N | ||||||||||||||||||||||||||||||
if !IsFeatureImplemented(FEAT_SME_MOP4) || !IsFeatureImplemented(FEAT_SME_F8F16) then EndOfDecode(Decode_UNDEF); end; let n : integer = UInt('0'::Zn::'0'); let m : integer = UInt('1'::Zm::'0'); let nreg : integer{} = 2; let mreg : integer = 1; let da : integer = UInt(ZAda);
| 31 | 30 | 29 | 28 | 27 | 26 | 25 | 24 | 23 | 22 | 21 | 20 | 19 | 18 | 17 | 16 | 15 | 14 | 13 | 12 | 11 | 10 | 9 | 8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 |
| 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | Zm | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | Zn | 0 | 0 | 1 | 0 | 0 | ZAda | ||||
| M | N | ||||||||||||||||||||||||||||||
if !IsFeatureImplemented(FEAT_SME_MOP4) || !IsFeatureImplemented(FEAT_SME_F8F16) then EndOfDecode(Decode_UNDEF); end; let n : integer = UInt('0'::Zn::'0'); let m : integer = UInt('1'::Zm::'0'); let nreg : integer{} = 2; let mreg : integer = 2; let da : integer = UInt(ZAda);
| <ZAda> |
Is the name of the ZA tile ZA0-ZA1, encoded in the "ZAda" field. |
| <Zn> |
Is the name of the first source scalable vector register, registers in the range Z0-Z15, encoded as "Zn" times 2. |
| <Zm1> |
Is the name of the first scalable vector register of the second source multi-vector group, in the range Z16-Z31, encoded as "Zm" times 2 plus 16. |
| <Zm2> |
Is the name of the second scalable vector register of the second source multi-vector group, in the range Z16-Z31, encoded as "Zm" times 2 plus 17. |
| <Zm> |
Is the name of the second source scalable vector register, registers in the range Z16-Z31, encoded as "Zm" times 2 plus 16. |
| <Zn1> |
Is the name of the first scalable vector register of the first source multi-vector group, in the range Z0-Z15, encoded as "Zn" times 2. |
| <Zn2> |
Is the name of the second scalable vector register of the first source multi-vector group, in the range Z0-Z15, encoded as "Zn" times 2 plus 1. |
CheckFPMREnabled(); CheckStreamingSVEAndZAEnabled(); let VL : integer{} = CurrentVL(); let hvsize : integer{} = VL DIV 2; let dim : integer{} = hvsize DIV 16; let tilesize : integer{} = 4*dim*dim*16; let op3 : bits(tilesize) = ZAtile{}(da, 16); var result : bits(tilesize); for outprod = 0 to 3 do let row_hv : integer = outprod DIVRM 2; let col_hv : integer = outprod MOD 2; let row_base : integer = row_hv * dim; let col_base : integer = col_hv * dim; let op1 : bits(VL) = Z{}(n + (nreg-1)*col_hv); let op2 : bits(VL) = Z{}(m + (mreg-1)*row_hv); for row = 0 to dim-1 do for col = 0 to dim-1 do let row_idx : integer = row_base + row; let col_idx : integer = col_base + col; let tile_idx : integer = row_idx * dim * 2 + col_idx; var sum : bits(16) = op3[tile_idx*:(16)]; var rowop : bits(16); var colop : bits(16); for i = 0 to 1 do rowop[i*:8] = op1[(2*row_idx + i)*:8]; colop[i*:8] = op2[(2*col_idx + i)*:8]; end; sum = FP8DotAddFP{16, 16}(sum, rowop, colop, FPCR(), FPMR()); result[tile_idx*:(16)] = sum; end; end; end; ZAtile{tilesize}(da, 16) = result;
2026-03_rel 2026-03-26 20:48:11
Copyright © 2010-2026 Arm Limited or its affiliates. All rights reserved. This document is Non-Confidential.