| //===- Utils.cpp - Utilities to support the Linalg dialect ----------------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // This file implements utilities for the Linalg dialect. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "mlir/Dialect/Linalg/Utils/Utils.h" |
| |
| #include "mlir/Analysis/SliceAnalysis.h" |
| #include "mlir/Dialect/Affine/Analysis/AffineStructures.h" |
| #include "mlir/Dialect/Affine/IR/AffineOps.h" |
| #include "mlir/Dialect/Affine/IR/AffineValueMap.h" |
| #include "mlir/Dialect/Affine/LoopUtils.h" |
| #include "mlir/Dialect/Arith/IR/Arith.h" |
| #include "mlir/Dialect/Arith/Utils/Utils.h" |
| #include "mlir/Dialect/Func/IR/FuncOps.h" |
| #include "mlir/Dialect/Linalg/IR/Linalg.h" |
| #include "mlir/Dialect/MemRef/IR/MemRef.h" |
| #include "mlir/Dialect/SCF/IR/SCF.h" |
| #include "mlir/Dialect/Tensor/IR/Tensor.h" |
| #include "mlir/Dialect/Tensor/Utils/Utils.h" |
| #include "mlir/Dialect/Utils/IndexingUtils.h" |
| #include "mlir/Dialect/Utils/StaticValueUtils.h" |
| #include "mlir/IR/AffineExpr.h" |
| #include "mlir/IR/AffineExprVisitor.h" |
| #include "mlir/IR/AffineMap.h" |
| #include "mlir/IR/Matchers.h" |
| #include "mlir/IR/OpImplementation.h" |
| #include "mlir/Pass/Pass.h" |
| #include "llvm/ADT/TypeSwitch.h" |
| #include "llvm/Support/Debug.h" |
| #include <optional> |
| |
| #define DEBUG_TYPE "linalg-utils" |
| |
| using namespace mlir; |
| using namespace presburger; |
| using namespace mlir::affine; |
| using namespace mlir::linalg; |
| using namespace mlir::scf; |
| |
| namespace { |
| |
| // Helper visitor to determine whether an AffineExpr is tiled. |
| // This is achieved by traversing every AffineDimExpr with position `pos` and |
| // checking whether the corresponding `tileSizes[pos]` is non-zero. |
| // This also enforces that only positive coefficients occur in multiplications. |
| // |
| // Example: |
| // `d0 + 2 * d1 + d3` is tiled by [0, 0, 0, 2] but not by [0, 0, 2, 0] |
| // |
| struct TileCheck : public AffineExprVisitor<TileCheck> { |
| TileCheck(ArrayRef<OpFoldResult> tileSizes) : tileSizes(tileSizes) {} |
| |
| void visitDimExpr(AffineDimExpr expr) { |
| isTiled |= !isZeroIndex(tileSizes[expr.getPosition()]); |
| } |
| void visitAffineBinaryOpExpr(AffineBinaryOpExpr expr) { |
| visit(expr.getLHS()); |
| visit(expr.getRHS()); |
| if (expr.getKind() == mlir::AffineExprKind::Mul) |
| assert(cast<AffineConstantExpr>(expr.getRHS()).getValue() > 0 && |
| "nonpositive multiplying coefficient"); |
| } |
| bool isTiled = false; |
| ArrayRef<OpFoldResult> tileSizes; |
| }; |
| |
| } // namespace |
| |
| static bool isTiled(AffineExpr expr, ArrayRef<OpFoldResult> tileSizes) { |
| if (!expr) |
| return false; |
| TileCheck t(tileSizes); |
| t.visit(expr); |
| return t.isTiled; |
| } |
| |
| // Checks whether `map` varies with respect to a non-zero `tileSizes` entry. |
| static bool isTiled(AffineMap map, ArrayRef<OpFoldResult> tileSizes) { |
| if (!map) |
| return false; |
| for (unsigned r = 0; r < map.getNumResults(); ++r) |
| if (isTiled(map.getResult(r), tileSizes)) |
| return true; |
| return false; |
| } |
| |
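| // Illustrative sketch of the region shape matched below (hypothetical IR): |
| //   ^bb0(%a: i32, %b: i32): |
| //     %0 = arith.addi %a, %b : i32 |
| //     linalg.yield %0 : i32 |
| // i.e. a single-block region with two scalar arguments whose only |
| // non-terminator operation is the matched binary op. |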
| std::optional<RegionMatcher::BinaryOpKind> |
| RegionMatcher::matchAsScalarBinaryOp(GenericOp op) { |
| auto ®ion = op.getRegion(); |
| if (!llvm::hasSingleElement(region)) |
| return std::nullopt; |
| |
| Block &block = region.front(); |
| if (block.getNumArguments() != 2 || |
| !block.getArgument(0).getType().isSignlessIntOrFloat() || |
| !block.getArgument(1).getType().isSignlessIntOrFloat()) |
| return std::nullopt; |
| |
| auto &ops = block.getOperations(); |
| if (!llvm::hasSingleElement(block.without_terminator())) |
| return std::nullopt; |
| |
| using mlir::matchers::m_Val; |
| auto a = m_Val(block.getArgument(0)); |
| auto b = m_Val(block.getArgument(1)); |
| |
| auto addPattern = m_Op<linalg::YieldOp>(m_Op<arith::AddIOp>(a, b)); |
| if (addPattern.match(&ops.back())) |
| return BinaryOpKind::IAdd; |
| |
| return std::nullopt; |
| } |
| |
| /// Explicit instantiation of loop nest generator for different loop types. |
| template struct mlir::linalg::GenerateLoopNest<scf::ForOp>; |
| template struct mlir::linalg::GenerateLoopNest<scf::ParallelOp>; |
| template struct mlir::linalg::GenerateLoopNest<AffineForOp>; |
| |
| /// Given a list of subview ranges, extract individual values for the lower |
| /// bounds, upper bounds, and steps, and put them into the corresponding |
| /// vectors. |
| static void unpackRanges(OpBuilder &builder, Location loc, |
| ArrayRef<Range> ranges, SmallVectorImpl<Value> &lbs, |
| SmallVectorImpl<Value> &ubs, |
| SmallVectorImpl<Value> &steps) { |
| for (Range range : ranges) { |
| lbs.emplace_back( |
| getValueOrCreateConstantIndexOp(builder, loc, range.offset)); |
| ubs.emplace_back(getValueOrCreateConstantIndexOp(builder, loc, range.size)); |
| steps.emplace_back( |
| getValueOrCreateConstantIndexOp(builder, loc, range.stride)); |
| } |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // General utilities |
| //===----------------------------------------------------------------------===// |
| |
| namespace mlir { |
| namespace linalg { |
| |
| bool allIndexingsAreProjectedPermutation(LinalgOp op) { |
| return llvm::all_of(op.getIndexingMapsArray(), [](AffineMap m) { |
| return m.isProjectedPermutation(/*allowZeroInResults=*/true); |
| }); |
| } |
| |
| bool hasOnlyScalarElementwiseOp(Region &r) { |
| if (!llvm::hasSingleElement(r)) |
| return false; |
| for (Operation &op : r.front()) { |
| if (!(isa<arith::ConstantOp, func::ConstantOp, tensor::ExtractOp, |
| linalg::YieldOp, linalg::IndexOp, AffineApplyOp>(op) || |
| OpTrait::hasElementwiseMappableTraits(&op)) || |
| llvm::any_of(op.getResultTypes(), |
| [](Type type) { return !type.isIntOrIndexOrFloat(); })) |
| return false; |
| } |
| return true; |
| } |
| |
| bool isElementwise(LinalgOp op) { |
| if (op.getNumLoops() != op.getNumParallelLoops()) |
| return false; |
| |
| if (!allIndexingsAreProjectedPermutation(op)) |
| return false; |
| |
| // TODO: relax the restrictions on indexing map. |
| for (OpOperand &opOperand : op.getDpsInitsMutable()) { |
| if (!op.getMatchingIndexingMap(&opOperand).isPermutation()) |
| return false; |
| } |
| return hasOnlyScalarElementwiseOp(op->getRegion(0)); |
| } |
| |
| bool isParallelIterator(utils::IteratorType iteratorType) { |
| return iteratorType == utils::IteratorType::parallel; |
| } |
| |
| bool isReductionIterator(utils::IteratorType iteratorType) { |
| return iteratorType == utils::IteratorType::reduction; |
| } |
| |
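| // Rough sketch of the pattern matched by `makeComposedPadHighOp` (the names |
| // and ops below are hypothetical): starting from |
| //   %source = tensor.extract_slice %producer ... |
| // the def chain of `%producer` is walked through the init operands of any |
| // intermediate LinalgOps until a high-only tensor.pad is found. If the slice |
| // sizes and the constant padding value match the requested padding, |
| // `%producer` is reused directly; otherwise a fresh pad-high op is created. |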
| Value makeComposedPadHighOp(OpBuilder &b, Location loc, RankedTensorType type, |
| Value source, Value pad, bool nofold) { |
| // Exit if `source` is not defined by an ExtractSliceOp. |
| auto sliceOp = source.getDefiningOp<tensor::ExtractSliceOp>(); |
| if (!sliceOp) |
| return tensor::createPadHighOp(type, source, pad, nofold, loc, b); |
| |
| // Search the `source` use-def chain for padded LinalgOps. |
| Value current = sliceOp.getSource(); |
| while (current) { |
| auto linalgOp = current.getDefiningOp<LinalgOp>(); |
| if (!linalgOp) |
| break; |
| OpResult opResult = cast<OpResult>(current); |
| current = linalgOp.getDpsInitOperand(opResult.getResultNumber())->get(); |
| } |
| auto padOp = current ? current.getDefiningOp<tensor::PadOp>() : nullptr; |
| |
| // Exit if the search fails to match a tensor::PadOp at the end of the matched |
| // LinalgOp sequence. |
| if (!padOp) |
| return tensor::createPadHighOp(type, source, pad, nofold, loc, b); |
| |
| // Exit if the padded result type does not match. |
| if (sliceOp.getSource().getType() != type) |
| return tensor::createPadHighOp(type, source, pad, nofold, loc, b); |
| |
| // Exit if `padOp` is not exclusively high padded, i.e. has a non-zero low pad. |
| if (llvm::any_of(padOp.getMixedLowPad(), [](OpFoldResult ofr) { |
| return getConstantIntValue(ofr) != static_cast<int64_t>(0); |
| })) |
| return tensor::createPadHighOp(type, source, pad, nofold, loc, b); |
| |
| // Exit if `padOpSliceOp`, which defines the slice used by |
| // `padOp`, is rank-reducing. |
| auto padOpSliceOp = padOp.getSource().getDefiningOp<tensor::ExtractSliceOp>(); |
| if (!padOpSliceOp || |
| sliceOp.getMixedSizes().size() != padOpSliceOp.getMixedSizes().size()) |
| return tensor::createPadHighOp(type, source, pad, nofold, loc, b); |
| |
| // Exit if the sizes of `sliceOp` do not match the sizes of the slice padded |
| // by `padOp`. |
| if (llvm::any_of( |
| llvm::zip(sliceOp.getMixedSizes(), padOpSliceOp.getMixedSizes()), |
| [](std::tuple<OpFoldResult, OpFoldResult> it) { |
| return !isEqualConstantIntOrValue(std::get<0>(it), std::get<1>(it)); |
| })) |
| return tensor::createPadHighOp(type, source, pad, nofold, loc, b); |
| |
| // Exit if the padding values do not match. |
| Attribute padOpPadAttr, padAttr; |
| Value padOpPad = padOp.getConstantPaddingValue(); |
| if (!padOpPad || !matchPattern(padOpPad, m_Constant(&padOpPadAttr)) || |
| !matchPattern(pad, m_Constant(&padAttr)) || padOpPadAttr != padAttr) |
| return tensor::createPadHighOp(type, source, pad, nofold, loc, b); |
| |
| // Return the padded result if the padding values and sizes match. |
| return sliceOp.getSource(); |
| } |
| |
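| // Sketch of the op built below for transposeVector = [1, 0] (illustrative |
| // IR; the exact printed form may differ): |
| //   #transposed = affine_map<(d0, d1) -> (d1, d0)> |
| //   #id = affine_map<(d0, d1) -> (d0, d1)> |
| //   linalg.generic {indexing_maps = [#transposed, #id], |
| //                   iterator_types = ["parallel", "parallel"]} |
| //       ins(%inputTensor : tensor<?x?xf32>) |
| //       outs(%outputTensor : tensor<?x?xf32>) { |
| //   ^bb0(%in: f32, %out: f32): |
| //     linalg.yield %in : f32 |
| //   } -> tensor<?x?xf32> |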
| GenericOp makeTransposeOp(OpBuilder &b, Location loc, Value inputTensor, |
| Value outputTensor, |
| ArrayRef<int64_t> transposeVector) { |
| auto resultTensorType = cast<RankedTensorType>(outputTensor.getType()); |
| Type elementType = resultTensorType.getElementType(); |
| |
| assert(isPermutationVector(transposeVector) && |
| "expect transpose vector to be a permutation"); |
| assert(transposeVector.size() == |
| static_cast<size_t>(resultTensorType.getRank()) && |
| "expect transpose vector size to match result tensor rank"); |
| |
| // Compute the transpose and the identity indexing maps. |
| SmallVector<AffineMap> indexingMaps = { |
| inversePermutation(AffineMap::getPermutationMap( |
| SmallVector<unsigned>(transposeVector.begin(), transposeVector.end()), |
| b.getContext())), |
| AffineMap::getMultiDimIdentityMap(transposeVector.size(), |
| b.getContext())}; |
| SmallVector<utils::IteratorType> iteratorTypes(transposeVector.size(), |
| utils::IteratorType::parallel); |
| |
| // Create a GenericOp to transpose `inputTensor` into `outputTensor`. |
| auto transposeOp = |
| b.create<GenericOp>(loc, resultTensorType, inputTensor, outputTensor, |
| indexingMaps, iteratorTypes); |
| Region &body = transposeOp.getRegion(); |
| body.push_back(new Block()); |
| body.front().addArguments({elementType, elementType}, {loc, loc}); |
| |
| // Create the body of the transpose operation. |
| OpBuilder::InsertionGuard g(b); |
| b.setInsertionPointToEnd(&body.front()); |
| b.create<YieldOp>(loc, transposeOp.getRegion().front().getArgument(0)); |
| return transposeOp; |
| } |
| |
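| // Sketch of the copy op built below for 2-D memrefs (illustrative IR; the |
| // exact printed form may differ): |
| //   #id = affine_map<(d0, d1) -> (d0, d1)> |
| //   linalg.generic {indexing_maps = [#id, #id], |
| //                   iterator_types = ["parallel", "parallel"]} |
| //       ins(%from : memref<?x?xf32>) outs(%to : memref<?x?xf32>) { |
| //   ^bb0(%in: f32, %out: f32): |
| //     linalg.yield %in : f32 |
| //   } |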
| GenericOp makeMemRefCopyOp(OpBuilder &b, Location loc, Value from, Value to) { |
| auto memrefTypeTo = cast<MemRefType>(to.getType()); |
| #ifndef NDEBUG |
| auto memrefTypeFrom = cast<MemRefType>(from.getType()); |
| assert(memrefTypeFrom.getRank() == memrefTypeTo.getRank() && |
| "`from` and `to` memref must have the same rank"); |
| #endif // NDEBUG |
| |
| AffineMap id = |
| AffineMap::getMultiDimIdentityMap(memrefTypeTo.getRank(), b.getContext()); |
| SmallVector<utils::IteratorType> iteratorTypes(memrefTypeTo.getRank(), |
| utils::IteratorType::parallel); |
| return b.create<linalg::GenericOp>( |
| loc, |
| /*inputs=*/from, |
| /*outputs=*/to, |
| /*indexingMaps=*/llvm::ArrayRef({id, id}), |
| /*iteratorTypes=*/iteratorTypes, |
| [](OpBuilder &b, Location loc, ValueRange args) { |
| b.create<linalg::YieldOp>(loc, args.front()); |
| }); |
| } |
| |
| /// Specialization to build an scf "for" nest. |
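| /// With pure buffer semantics, a plain loop nest over the unpacked ranges is |
| /// built. With tensor semantics, the DPS inits become the iter_args of the |
| /// generated scf.for nest and the body builder sees the current iter_args in |
| /// place of the original outputs (a summary of the logic below). |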
| template <> |
| void GenerateLoopNest<scf::ForOp>::doit( |
| OpBuilder &b, Location loc, ArrayRef<Range> loopRanges, LinalgOp linalgOp, |
| ArrayRef<utils::IteratorType> iteratorTypes, |
| function_ref<scf::ValueVector(OpBuilder &, Location, ValueRange, |
| ValueRange)> |
| bodyBuilderFn, |
| ArrayRef<linalg::ProcInfo> procInfo) { |
| assert((procInfo.empty() || (procInfo.size() == loopRanges.size())) && |
| "expected as many entries for proc info as number of loops, even if " |
| "they are null entries"); |
| SmallVector<Value> iterArgInitValues; |
| if (!linalgOp.hasPureBufferSemantics()) |
| llvm::append_range(iterArgInitValues, linalgOp.getDpsInits()); |
| SmallVector<Value, 4> lbs, ubs, steps; |
| unpackRanges(b, loc, loopRanges, lbs, ubs, steps); |
| LoopNest loopNest = mlir::scf::buildLoopNest( |
| b, loc, lbs, ubs, steps, iterArgInitValues, |
| [&](OpBuilder &b, Location loc, ValueRange ivs, ValueRange iterArgs) { |
| assert(iterArgs.size() == iterArgInitValues.size() && |
| "expect the number of output tensors and iter args to match"); |
| SmallVector<Value> operandValuesToUse = linalgOp->getOperands(); |
| if (!iterArgs.empty()) { |
| operandValuesToUse = linalgOp.getDpsInputs(); |
| operandValuesToUse.append(iterArgs.begin(), iterArgs.end()); |
| } |
| return bodyBuilderFn(b, loc, ivs, operandValuesToUse); |
| }); |
| |
| if (loopNest.loops.empty() || procInfo.empty()) |
| return; |
| |
| // Map the scf.for loops that correspond to cyclically distributed dimensions |
| // onto their processor ids. |
| for (const auto &loop : llvm::enumerate(loopNest.loops)) { |
| if (procInfo[loop.index()].distributionMethod == |
| DistributionMethod::Cyclic) { |
| mapLoopToProcessorIds(loop.value(), procInfo[loop.index()].procId, |
| procInfo[loop.index()].nprocs); |
| } |
| } |
| } |
| |
| /// Specialization to build an affine "for" nest. |
| template <> |
| void GenerateLoopNest<AffineForOp>::doit( |
| OpBuilder &b, Location loc, ArrayRef<Range> loopRanges, LinalgOp linalgOp, |
| ArrayRef<utils::IteratorType> iteratorTypes, |
| function_ref<scf::ValueVector(OpBuilder &, Location, ValueRange, |
| ValueRange)> |
| bodyBuilderFn, |
| ArrayRef<linalg::ProcInfo> /*procInfo*/) { |
| SmallVector<Value> iterArgInitValues; |
| if (!linalgOp.hasPureBufferSemantics()) |
| llvm::append_range(iterArgInitValues, linalgOp.getDpsInits()); |
| assert(iterArgInitValues.empty() && "unexpected AffineForOp init values"); |
| SmallVector<Value, 4> lbs, ubs, steps; |
| unpackRanges(b, loc, loopRanges, lbs, ubs, steps); |
| |
| // Affine loops require constant steps. |
| SmallVector<int64_t, 4> constantSteps; |
| constantSteps.reserve(steps.size()); |
| for (Value v : steps) { |
| auto constVal = getConstantIntValue(v); |
| assert(constVal.has_value() && "Affine loops require constant steps"); |
| constantSteps.push_back(constVal.value()); |
| } |
| |
| affine::buildAffineLoopNest(b, loc, lbs, ubs, constantSteps, |
| [&](OpBuilder &b, Location loc, ValueRange ivs) { |
| bodyBuilderFn(b, loc, ivs, |
| linalgOp->getOperands()); |
| }); |
| } |
| |
| /// Update the `lb`, `ub` and `step` to get per-processor `lb`, `ub` and `step`. |
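| /// For the cyclic distribution composed below this amounts to (sketch): |
| ///   lb'   = lb + procId * step |
| ///   step' = nprocs * step |
| /// so that processor `procId` executes iterations procId, procId + nprocs, |
| /// ... of the original loop. |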
| void updateBoundsForCyclicDistribution(OpBuilder &b, Location loc, Value procId, |
| Value nprocs, Value &lb, Value &ub, |
| Value &step) { |
| AffineExpr d0, d1; |
| bindDims(b.getContext(), d0, d1); |
| AffineExpr s0 = getAffineSymbolExpr(0, b.getContext()); |
| lb = |
| affine::makeComposedAffineApply(b, loc, d0 + d1 * s0, {lb, procId, step}); |
| step = affine::makeComposedAffineApply(b, loc, d0 * s0, {nprocs, step}); |
| } |
| |
| /// Generates a loop nest consisting of scf.parallel and scf.for, depending |
| /// on the `iteratorTypes`. Consecutive parallel loops create a single |
| /// scf.parallel operation; each sequential loop creates a new scf.for |
| /// operation. The body of the innermost loop is populated by |
| /// `bodyBuilderFn`, which accepts a range of induction variables for all |
| /// loops. `ivStorage` is used to store the partial list of induction |
| /// variables. |
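| /// For example, with no distribution, iterator types |
| /// [parallel, parallel, reduction, parallel] produce a nest resembling: |
| ///   scf.parallel (%i0, %i1) = ... { |
| ///     scf.for %i2 = ... { |
| ///       scf.parallel (%i3) = ... { |
| ///         <body> |
| ///       } |
| ///     } |
| ///   } |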
| // TODO: this function can be made iterative instead. However, it |
| // will have at most as many recursive calls as nested loops, which rarely |
| // exceeds 10. |
| static void generateParallelLoopNest( |
| OpBuilder &b, Location loc, ValueRange lbs, ValueRange ubs, |
| ValueRange steps, ArrayRef<utils::IteratorType> iteratorTypes, |
| ArrayRef<linalg::ProcInfo> procInfo, |
| function_ref<void(OpBuilder &, Location, ValueRange)> bodyBuilderFn, |
| SmallVectorImpl<Value> &ivStorage) { |
| assert(lbs.size() == ubs.size()); |
| assert(lbs.size() == steps.size()); |
| assert(lbs.size() == iteratorTypes.size()); |
| assert(procInfo.empty() || (lbs.size() == procInfo.size())); |
| |
| // If there are no (more) loops to be generated, generate the body and be |
| // done with it. |
| if (iteratorTypes.empty()) { |
| bodyBuilderFn(b, loc, ivStorage); |
| return; |
| } |
| |
| // If there are no outer parallel loops, generate one sequential loop and |
| // recurse. |
| if (!isParallelIterator(iteratorTypes.front())) { |
| LoopNest singleLoop = buildLoopNest( |
| b, loc, lbs.take_front(), ubs.take_front(), steps.take_front(), |
| [&](OpBuilder &b, Location loc, ValueRange ivs) { |
| ivStorage.append(ivs.begin(), ivs.end()); |
| generateParallelLoopNest( |
| b, loc, lbs.drop_front(), ubs.drop_front(), steps.drop_front(), |
| iteratorTypes.drop_front(), |
| procInfo.empty() ? procInfo : procInfo.drop_front(), |
| bodyBuilderFn, ivStorage); |
| }); |
| return; |
| } |
| |
| unsigned nLoops = iteratorTypes.size(); |
| unsigned numProcessed = 0; |
| DistributionMethod distributionMethod = DistributionMethod::None; |
| if (procInfo.empty()) { |
| numProcessed = nLoops - iteratorTypes.drop_while(isParallelIterator).size(); |
| } else { |
| distributionMethod = procInfo.front().distributionMethod; |
| numProcessed = |
| nLoops - procInfo |
| .drop_while([&](linalg::ProcInfo p) { |
| return p.distributionMethod == distributionMethod; |
| }) |
| .size(); |
| } |
| |
| auto remainderProcInfo = |
| procInfo.empty() ? procInfo : procInfo.drop_front(numProcessed); |
| switch (distributionMethod) { |
| case DistributionMethod::None: { |
| // Generate a single parallel loop-nest operation for all outermost |
| // parallel loops and recurse. |
| b.create<scf::ParallelOp>( |
| loc, lbs.take_front(numProcessed), ubs.take_front(numProcessed), |
| steps.take_front(numProcessed), |
| [&](OpBuilder &nestedBuilder, Location nestedLoc, ValueRange localIvs) { |
| ivStorage.append(localIvs.begin(), localIvs.end()); |
| generateParallelLoopNest( |
| nestedBuilder, nestedLoc, lbs.drop_front(numProcessed), |
| ubs.drop_front(numProcessed), steps.drop_front(numProcessed), |
| iteratorTypes.drop_front(numProcessed), remainderProcInfo, |
| bodyBuilderFn, ivStorage); |
| }); |
| return; |
| } |
| case DistributionMethod::Cyclic: { |
| // Generate a single parallel loop-nest operation for all outermost |
| // parallel loops and recurse. |
| b.create<scf::ParallelOp>( |
| loc, lbs.take_front(numProcessed), ubs.take_front(numProcessed), |
| steps.take_front(numProcessed), |
| [&](OpBuilder &nestedBuilder, Location nestedLoc, ValueRange localIvs) { |
| ivStorage.append(localIvs.begin(), localIvs.end()); |
| generateParallelLoopNest( |
| nestedBuilder, nestedLoc, lbs.drop_front(numProcessed), |
| ubs.drop_front(numProcessed), steps.drop_front(numProcessed), |
| iteratorTypes.drop_front(numProcessed), remainderProcInfo, |
| bodyBuilderFn, ivStorage); |
| }); |
| return; |
| } |
| case DistributionMethod::CyclicNumProcsGeNumIters: { |
| // Check (for the processed loops) that the iteration is in-bounds. |
| ArithBuilder ab(b, loc); |
| Value cond = ab.slt(lbs[0], ubs[0]); |
| for (unsigned i = 1; i < numProcessed; ++i) |
| cond = ab._and(cond, ab.slt(lbs[i], ubs[i])); |
| ivStorage.append(lbs.begin(), std::next(lbs.begin(), numProcessed)); |
| b.create<scf::IfOp>(loc, cond, [&](OpBuilder &b, Location loc) { |
| generateParallelLoopNest(b, loc, lbs.drop_front(numProcessed), |
| ubs.drop_front(numProcessed), |
| steps.drop_front(numProcessed), |
| iteratorTypes.drop_front(numProcessed), |
| remainderProcInfo, bodyBuilderFn, ivStorage); |
| b.create<scf::YieldOp>(loc, ValueRange{}); |
| }); |
| return; |
| } |
| case DistributionMethod::CyclicNumProcsEqNumIters: |
| // No check/loops needed here. Set the `%iv` to be the `%lb` and proceed |
| // with inner loop generation. |
| ivStorage.append(lbs.begin(), std::next(lbs.begin(), numProcessed)); |
| generateParallelLoopNest( |
| b, loc, lbs.drop_front(numProcessed), ubs.drop_front(numProcessed), |
| steps.drop_front(numProcessed), iteratorTypes.drop_front(numProcessed), |
| remainderProcInfo, bodyBuilderFn, ivStorage); |
| return; |
| } |
| } |
| |
| /// Specialization for generating a mix of parallel and sequential scf loops. |
| template <> |
| void GenerateLoopNest<scf::ParallelOp>::doit( |
| OpBuilder &b, Location loc, ArrayRef<Range> loopRanges, LinalgOp linalgOp, |
| ArrayRef<utils::IteratorType> iteratorTypes, |
| function_ref<scf::ValueVector(OpBuilder &, Location, ValueRange, |
| ValueRange)> |
| bodyBuilderFn, |
| ArrayRef<linalg::ProcInfo> procInfo) { |
| SmallVector<Value> iterArgInitValues; |
| if (!linalgOp.hasPureBufferSemantics()) |
| llvm::append_range(iterArgInitValues, linalgOp.getDpsInits()); |
| assert(iterArgInitValues.empty() && "unexpected ParallelOp init values"); |
| // This function may be passed more iterator types than ranges. |
| assert(iteratorTypes.size() >= loopRanges.size() && |
| "expected iterator type for all ranges"); |
| assert((procInfo.empty() || (procInfo.size() == loopRanges.size())) && |
| "expected proc information for all loops when present"); |
| iteratorTypes = iteratorTypes.take_front(loopRanges.size()); |
| SmallVector<Value, 8> lbsStorage, ubsStorage, stepsStorage, ivs; |
| unsigned numLoops = iteratorTypes.size(); |
| ivs.reserve(numLoops); |
| lbsStorage.reserve(numLoops); |
| ubsStorage.reserve(numLoops); |
| stepsStorage.reserve(numLoops); |
| |
| // Get the loop lb, ub, and step. |
| unpackRanges(b, loc, loopRanges, lbsStorage, ubsStorage, stepsStorage); |
| |
| // Modify the lb, ub, and step based on the distribution options. |
| for (const auto &it : llvm::enumerate(procInfo)) { |
| if (it.value().distributionMethod != linalg::DistributionMethod::None) { |
| updateBoundsForCyclicDistribution( |
| b, loc, it.value().procId, it.value().nprocs, lbsStorage[it.index()], |
| ubsStorage[it.index()], stepsStorage[it.index()]); |
| } |
| } |
| ValueRange lbs(lbsStorage), ubs(ubsStorage), steps(stepsStorage); |
| generateParallelLoopNest( |
| b, loc, lbs, ubs, steps, iteratorTypes, procInfo, |
| [&](OpBuilder &b, Location loc, ValueRange ivs) { |
| bodyBuilderFn(b, loc, ivs, linalgOp->getOperands()); |
| }, |
| ivs); |
| |
| assert(ivs.size() == iteratorTypes.size() && "did not generate enough loops"); |
| } |
| |
| static Value materializeTiledShape(OpBuilder &builder, Location loc, |
| Value valueToTile, |
| const SliceParameters &sliceParams) { |
| auto shapedType = dyn_cast<ShapedType>(valueToTile.getType()); |
| auto *sliceOp = TypeSwitch<ShapedType, Operation *>(shapedType) |
| .Case([&](MemRefType) { |
| return builder.create<memref::SubViewOp>( |
| loc, valueToTile, sliceParams.offsets, |
| sliceParams.sizes, sliceParams.strides); |
| }) |
| .Case([&](RankedTensorType) { |
| return builder.create<tensor::ExtractSliceOp>( |
| loc, valueToTile, sliceParams.offsets, |
| sliceParams.sizes, sliceParams.strides); |
| }) |
| .Default([](ShapedType) -> Operation * { |
| llvm_unreachable("Unexpected shaped type"); |
| }); |
| return sliceOp->getResult(0); |
| } |
| |
| Value makeTiledShape(OpBuilder &builder, Location loc, Value valueToTile, |
| ArrayRef<OpFoldResult> tileSizes, AffineMap map, |
| ArrayRef<OpFoldResult> lbs, ArrayRef<OpFoldResult> ubs, |
| ArrayRef<OpFoldResult> subShapeSizes, |
| bool omitPartialTileCheck) { |
| SliceParameters sliceParams = |
| computeSliceParameters(builder, loc, valueToTile, tileSizes, map, lbs, |
| ubs, subShapeSizes, omitPartialTileCheck); |
| return materializeTiledShape(builder, loc, valueToTile, sliceParams); |
| } |
| |
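| // Rough sketch of the parameters computed below for a dimension `r` whose |
| // map result `m_r` is tiled (the affine.min is only materialized when a |
| // partial boundary tile is possible): |
| //   offsets[r] = m_r(lbs) |
| //   sizes[r]   = min(m_r(subShapeSizes) + 1, m_r(ubs - 1) + 1 - offsets[r]) |
| //   strides[r] = 1 |
| // Untiled dimensions keep offset 0, the full dimension size, and stride 1. |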
| SliceParameters |
| computeSliceParameters(OpBuilder &builder, Location loc, Value valueToTile, |
| ArrayRef<OpFoldResult> tileSizes, AffineMap map, |
| ArrayRef<OpFoldResult> lbs, ArrayRef<OpFoldResult> ubs, |
| ArrayRef<OpFoldResult> subShapeSizes, |
| bool omitPartialTileCheck) { |
| auto shapedType = dyn_cast<ShapedType>(valueToTile.getType()); |
| assert(shapedType && "only shaped types can be tiled"); |
| ArrayRef<int64_t> shape = shapedType.getShape(); |
| int64_t rank = shapedType.getRank(); |
| |
| // Compute offsets/sizes/strides for the tile. |
| SliceParameters sliceParams; |
| sliceParams.offsets.reserve(rank); |
| sliceParams.sizes.reserve(rank); |
| sliceParams.strides.reserve(rank); |
| for (unsigned r = 0; r < rank; ++r) { |
| LLVM_DEBUG(llvm::dbgs() << "computeSliceParameters: for dim#" << r); |
| if (!isTiled(map.getSubMap({r}), tileSizes)) { |
| sliceParams.offsets.push_back(builder.getIndexAttr(0)); |
| OpFoldResult dim = createFoldedDimOp(builder, loc, valueToTile, r); |
| sliceParams.sizes.push_back(dim); |
| sliceParams.strides.push_back(builder.getIndexAttr(1)); |
| LLVM_DEBUG(llvm::dbgs() << ": not tiled: use size: " << dim << "\n"); |
| continue; |
| } |
| LLVM_DEBUG(llvm::dbgs() << ": tiled: figure out subsize...\n"); |
| |
| // Tiling creates a new slice at the proper index; the slice step is 1 |
| // (i.e. the op does not subsample; stepping occurs in the loop). |
| auto m = map.getSubMap({r}); |
| LLVM_DEBUG(llvm::dbgs() << "computeSliceParameters: submap: " << m << "\n"); |
| IRRewriter rewriter(builder); |
| OpFoldResult offset = makeComposedFoldedAffineApply(rewriter, loc, m, lbs); |
| sliceParams.offsets.push_back(offset); |
| OpFoldResult closedIntSize = |
| makeComposedFoldedAffineApply(rewriter, loc, m, subShapeSizes); |
| // The resulting size needs to be turned back into a half-open interval. |
| AffineExpr s0 = getAffineSymbolExpr(0, builder.getContext()); |
| OpFoldResult size = |
| makeComposedFoldedAffineApply(rewriter, loc, s0 + 1, closedIntSize); |
| LLVM_DEBUG(llvm::dbgs() |
| << "computeSliceParameters: raw size: " << size << "\n"); |
| LLVM_DEBUG(llvm::dbgs() |
| << "computeSliceParameters: new offset: " << offset << "\n"); |
| sliceParams.strides.push_back(builder.getIndexAttr(1)); |
| |
| if (omitPartialTileCheck) { |
| // We statically know that the partial/boundary tile condition is |
| // unnecessary. |
| LLVM_DEBUG(llvm::dbgs() << "makeTiledShape: new size: " << size << "\n"); |
| sliceParams.sizes.push_back(size); |
| continue; |
| } |
| |
| // The size of the subview / extract_slice should be trimmed to avoid |
| // out-of-bounds accesses, unless: |
| // a. We statically know the subshape size divides the shape size evenly. |
| // b. The subshape size is 1. According to the way the loops are set up, |
| // tensors with "0" dimensions would never be constructed. |
| int64_t shapeSize = shape[r]; |
| std::optional<int64_t> sizeCst = getConstantIntValue(size); |
| auto hasTileSizeOne = sizeCst && *sizeCst == 1; |
| auto dividesEvenly = sizeCst && !ShapedType::isDynamic(shapeSize) && |
| ((shapeSize % *sizeCst) == 0); |
| if (!hasTileSizeOne && !dividesEvenly) { |
| LLVM_DEBUG(llvm::dbgs() << "makeTiledShape: shapeSize=" << shapeSize |
| << ", size: " << size |
| << ": make sure in bound with affine.min\n"); |
| |
| AffineExpr dim0, dim1, dim2; |
| bindDims(builder.getContext(), dim0, dim1, dim2); |
| |
| // Get the dimension size for this dimension. We need to first calculate |
| // the max index and then add one. This is important because for |
| // convolution ops, the input window dimension's affine map has the |
| // form `(d0 * s0 + d1)`, where `d0`/`d1` is an output/filter window |
| // dimension and `s0` is the stride. Directly using the dimension size of |
| // the output/filter window dimensions would yield an incorrect result. |
| AffineMap minusOneMap = |
| AffineMap::inferFromExprList({ArrayRef<AffineExpr>{dim0 - 1}}) |
| .front(); |
| AffineMap plusOneMap = |
| AffineMap::inferFromExprList({ArrayRef<AffineExpr>{dim0 + 1}}) |
| .front(); |
| SmallVector<OpFoldResult> maxIndices = |
| llvm::to_vector(llvm::map_range(ubs, [&](OpFoldResult ub) { |
| return makeComposedFoldedAffineApply(rewriter, loc, minusOneMap, |
| {ub}); |
| })); |
| OpFoldResult maxIndex = |
| makeComposedFoldedAffineApply(rewriter, loc, m, maxIndices); |
| OpFoldResult d = |
| makeComposedFoldedAffineApply(rewriter, loc, plusOneMap, {maxIndex}); |
| |
| // Compute min(dim - offset, size) to avoid out-of-bounds accesses. |
| AffineMap minMap = AffineMap::inferFromExprList( |
| {ArrayRef<AffineExpr>{dim1 - dim2, dim0}}) |
| .front(); |
| size = |
| makeComposedFoldedAffineMin(rewriter, loc, minMap, {size, d, offset}); |
| } |
| LLVM_DEBUG(llvm::dbgs() << "makeTiledShape: new size: " << size << "\n"); |
| sliceParams.sizes.push_back(size); |
| } |
| return sliceParams; |
| } |
| |
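| // For example (a sketch), with ivs = [%i, %j] and tileSizes = [4, 0, 8], the |
| // computed offsets are [%i, 0, %j]: each tiled dimension consumes the next |
| // induction variable and untiled dimensions get a zero offset. |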
| SmallVector<OpFoldResult> computeTileOffsets(OpBuilder &b, Location loc, |
| ArrayRef<OpFoldResult> ivs, |
| ArrayRef<OpFoldResult> tileSizes) { |
| SmallVector<OpFoldResult> offsets; |
| for (unsigned idx = 0, idxIvs = 0, e = tileSizes.size(); idx < e; ++idx) { |
| LLVM_DEBUG(llvm::dbgs() << "makeTiledShapes: for loop#" << idx << "\n"); |
| bool isTiled = !isZeroIndex(tileSizes[idx]); |
| offsets.push_back(isTiled ? ivs[idxIvs++] : b.getIndexAttr(0)); |
| LLVM_DEBUG(llvm::dbgs() |
| << "computeTileOffsets: " << offsets.back() << "\n"); |
| } |
| return offsets; |
| } |
| |
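| // For example (a sketch), with tileSizes = [4, 0] and sizeBounds = [%d0, %d1], |
| // the computed sizes are [3, %d1 - 1]: each entry is the tile size (or the |
| // size bound for untiled dimensions) minus one, i.e. a closed-interval size |
| // as consumed by computeSliceParameters. |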
| SmallVector<OpFoldResult> computeTileSizes(OpBuilder &b, Location loc, |
| ArrayRef<OpFoldResult> tileSizes, |
| ArrayRef<OpFoldResult> sizeBounds) { |
| SmallVector<OpFoldResult> sizes; |
| for (unsigned idx = 0, e = tileSizes.size(); idx < e; ++idx) { |
| bool isTiled = !isZeroIndex(tileSizes[idx]); |
| // Before composing, we need to make the range a closed interval. |
| OpFoldResult size = isTiled ? tileSizes[idx] : sizeBounds[idx]; |
| AffineExpr d0 = getAffineDimExpr(0, b.getContext()); |
| IRRewriter rewriter(b); |
| sizes.push_back(makeComposedFoldedAffineApply(rewriter, loc, d0 - 1, size)); |
| LLVM_DEBUG(llvm::dbgs() << "computeTileSizes: " << sizes.back() << "\n"); |
| } |
| return sizes; |
| } |
| |
| SmallVector<Type> getTensorOutputTypes(LinalgOp op, ValueRange operands) { |
| if (op.hasPureBufferSemantics()) |
| return {}; |
| return llvm::to_vector( |
| llvm::map_range(op.getDpsInitsMutable(), [&](OpOperand &opOperand) { |
| return operands[opOperand.getOperandNumber()].getType(); |
| })); |
| } |
| |
| SmallVector<Value> insertSlicesBack(OpBuilder &builder, Location loc, |
| LinalgOp op, ValueRange operands, |
| ValueRange results) { |
| if (op.hasPureBufferSemantics()) |
| return {}; |
| SmallVector<Value> tensorResults; |
| tensorResults.reserve(results.size()); |
| // Insert an insert_slice for each output tensor. |
| unsigned resultIdx = 0; |
| for (OpOperand &opOperand : op.getDpsInitsMutable()) { |
| // TODO: use an interface/adaptor to avoid leaking position in |
| // `tiledOperands`. |
| Value outputTensor = operands[opOperand.getOperandNumber()]; |
| if (auto sliceOp = outputTensor.getDefiningOp<tensor::ExtractSliceOp>()) { |
| Value inserted = builder.create<tensor::InsertSliceOp>( |
| loc, sliceOp.getSource().getType(), results[resultIdx], |
| sliceOp.getSource(), sliceOp.getOffsets(), sliceOp.getSizes(), |
| sliceOp.getStrides(), sliceOp.getStaticOffsets(), |
| sliceOp.getStaticSizes(), sliceOp.getStaticStrides()); |
| tensorResults.push_back(inserted); |
| } else { |
| tensorResults.push_back(results[resultIdx]); |
| } |
| ++resultIdx; |
| } |
| return tensorResults; |
| } |
| |
| SmallVector<std::optional<SliceParameters>> |
| computeAllSliceParameters(OpBuilder &builder, Location loc, LinalgOp linalgOp, |
| ValueRange valuesToTile, ArrayRef<OpFoldResult> ivs, |
| ArrayRef<OpFoldResult> tileSizes, |
| ArrayRef<OpFoldResult> sizeBounds, |
| bool omitPartialTileCheck) { |
| assert(ivs.size() == static_cast<size_t>(llvm::count_if( |
| llvm::make_range(tileSizes.begin(), tileSizes.end()), |
| [](OpFoldResult v) { return !isZeroIndex(v); })) && |
| "expected as many ivs as non-zero sizes"); |
| |
| // Construct (potentially temporary) mins and maxes on which to apply maps |
| // that define tile subshapes. |
| SmallVector<OpFoldResult> lbs = |
| computeTileOffsets(builder, loc, ivs, tileSizes); |
| SmallVector<OpFoldResult> subShapeSizes = |
| computeTileSizes(builder, loc, tileSizes, sizeBounds); |
| |
| assert(static_cast<int64_t>(valuesToTile.size()) <= |
| linalgOp->getNumOperands() && |
| "more value to tile than operands."); |
| SmallVector<std::optional<SliceParameters>> allSliceParams; |
| allSliceParams.reserve(valuesToTile.size()); |
| for (auto [opOperand, val] : |
| llvm::zip(linalgOp->getOpOperands(), valuesToTile)) { |
| Value shapedOp = val; |
| LLVM_DEBUG(llvm::dbgs() << "makeTiledShapes: for operand " << shapedOp); |
| AffineMap map = linalgOp.getMatchingIndexingMap(&opOperand); |
| // Use `opOperand` as is if it is not tiled and not an output tensor. Having |
| // an extract/insert slice pair for all output tensors simplifies follow up |
| // transformations such as padding and bufferization since the |
| // extract/insert slice pairs make the accessed iteration argument |
| // subdomains explicit. |
| |
| Type operandType = opOperand.get().getType(); |
| if (!isTiled(map, tileSizes) && !(isa<RankedTensorType>(operandType) && |
| linalgOp.isDpsInit(&opOperand))) { |
| allSliceParams.push_back(std::nullopt); |
| LLVM_DEBUG(llvm::dbgs() |
| << ": not tiled: use shape: " << operandType << "\n"); |
| continue; |
| } |
| LLVM_DEBUG(llvm::dbgs() << ": tiled: figure out subshape...\n"); |
| |
| allSliceParams.push_back(computeSliceParameters( |
| builder, loc, shapedOp, tileSizes, map, lbs, sizeBounds, subShapeSizes, |
| omitPartialTileCheck)); |
| } |
| |
| return allSliceParams; |
| } |
| |
| SmallVector<Value> makeTiledShapes(OpBuilder &builder, Location loc, |
| LinalgOp linalgOp, ValueRange valuesToTile, |
| ArrayRef<OpFoldResult> ivs, |
| ArrayRef<OpFoldResult> tileSizes, |
| ArrayRef<OpFoldResult> sizeBounds, |
| bool omitPartialTileCheck) { |
| SmallVector<std::optional<SliceParameters>> allSliceParameter = |
| computeAllSliceParameters(builder, loc, linalgOp, valuesToTile, ivs, |
| tileSizes, sizeBounds, omitPartialTileCheck); |
| SmallVector<Value> tiledShapes; |
| for (auto item : llvm::zip(valuesToTile, allSliceParameter)) { |
| Value valueToTile = std::get<0>(item); |
| std::optional<SliceParameters> sliceParams = std::get<1>(item); |
| tiledShapes.push_back( |
| sliceParams.has_value() |
| ? materializeTiledShape(builder, loc, valueToTile, *sliceParams) |
| : valueToTile); |
| } |
| return tiledShapes; |
| } |
| |
| void offsetIndices(OpBuilder &b, LinalgOp linalgOp, |
| ArrayRef<OpFoldResult> offsets) { |
| IRRewriter rewriter(b); |
| offsetIndices(rewriter, linalgOp, offsets); |
| } |
| |
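| // For example (a sketch, IR simplified), offsetting dimension 0 by %off |
| // rewrites |
| //   %i = linalg.index 0 : index |
| // into roughly |
| //   %i  = linalg.index 0 : index |
| //   %i2 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%i, %off) |
| // and redirects all other uses of %i to %i2. |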
| void offsetIndices(RewriterBase &b, LinalgOp linalgOp, |
| ArrayRef<OpFoldResult> offsets) { |
| if (!linalgOp.hasIndexSemantics()) |
| return; |
| |
| for (IndexOp indexOp : linalgOp.getBlock()->getOps<IndexOp>()) { |
| if (indexOp.getDim() >= offsets.size() || !offsets[indexOp.getDim()]) |
| continue; |
| OpBuilder::InsertionGuard guard(b); |
| b.setInsertionPointAfter(indexOp); |
| AffineExpr index, offset; |
| bindDims(b.getContext(), index, offset); |
| OpFoldResult applied = makeComposedFoldedAffineApply( |
| b, indexOp.getLoc(), index + offset, |
| {getAsOpFoldResult(indexOp.getResult()), offsets[indexOp.getDim()]}); |
| Value materialized = |
| getValueOrCreateConstantIndexOp(b, indexOp.getLoc(), applied); |
| b.replaceOpWithIf(indexOp, materialized, [&](OpOperand &use) { |
| return use.getOwner() != materialized.getDefiningOp(); |
| }); |
| } |
| } |
| |
| /// Get the reassociation maps to fold the result of an extract_slice (or the |
| /// source of an insert_slice) operation with the given offsets and sizes to |
| /// its rank-reduced version. This is only done for the cases where the size |
| /// is 1 and the offset is 0. Strictly speaking, the offset 0 is not required |
| /// in general, but non-zero offsets are not handled by the SPIR-V backend at |
| /// this point (and potentially cannot be handled). |
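| /// For example (a sketch), mixedSizes = [1, 128, 1, 32] yields the |
| /// reassociation [[0, 1], [2, 3]]: each static unit dimension is folded into |
| /// the next non-unit dimension, and trailing unit dimensions are folded into |
| /// the last group (e.g. [128, 1] -> [[0, 1]]). |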
| std::optional<SmallVector<ReassociationIndices>> |
| getReassociationMapForFoldingUnitDims(ArrayRef<OpFoldResult> mixedSizes) { |
| SmallVector<ReassociationIndices> reassociation; |
| ReassociationIndices curr; |
| for (const auto &it : llvm::enumerate(mixedSizes)) { |
| auto dim = it.index(); |
| auto size = it.value(); |
| curr.push_back(dim); |
| auto attr = llvm::dyn_cast_if_present<Attribute>(size); |
| if (attr && cast<IntegerAttr>(attr).getInt() == 1) |
| continue; |
| reassociation.emplace_back(ReassociationIndices{}); |
| std::swap(reassociation.back(), curr); |
| } |
| // When the reassociation is not empty, fold the remaining unit dimensions |
| // into its last group. If the reassociation gathered so far is empty, leave |
| // it empty; this will fold everything to a rank-0 tensor. |
| if (!curr.empty() && !reassociation.empty()) |
| reassociation.back().append(curr.begin(), curr.end()); |
| return reassociation; |
| } |
| |
| } // namespace linalg |
| } // namespace mlir |