//===- LowerGpuOpsToROCDLOps.cpp - MLIR GPU to ROCDL lowering passes ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a pass to generate ROCDLIR operations for higher-level
// GPU operations.
//
//===----------------------------------------------------------------------===//

#include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h"
#include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h"
#include "mlir/Dialect/Arith/Transforms/Passes.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Transforms/Passes.h"

#include "mlir/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.h"
#include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h"
#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h"
#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
#include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
#include "mlir/Conversion/LLVMCommon/LoweringOptions.h"
#include "mlir/Conversion/LLVMCommon/Pattern.h"
#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
#include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h"
#include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h"
#include "mlir/Dialect/ControlFlow/IR/ControlFlow.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Transforms/Passes.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
#include "mlir/Dialect/Math/IR/Math.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/BuiltinAttributes.h"
|  | #include "mlir/Pass/Pass.h" | 
|  | #include "mlir/Transforms/DialectConversion.h" | 
|  | #include "mlir/Transforms/GreedyPatternRewriteDriver.h" | 
|  | #include "llvm/Support/FormatVariadic.h" | 
|  |  | 
|  | #include "../GPUCommon/GPUOpsLowering.h" | 
|  | #include "../GPUCommon/IndexIntrinsicsOpLowering.h" | 
|  | #include "../GPUCommon/OpToFuncCallLowering.h" | 
|  |  | 
|  | namespace mlir { | 
|  | #define GEN_PASS_DEF_CONVERTGPUOPSTOROCDLOPS | 
|  | #include "mlir/Conversion/Passes.h.inc" | 
|  | } // namespace mlir | 
|  |  | 
|  | using namespace mlir; | 
|  |  | 
/// Returns true if the given `gpu.func` can be safely called using the bare
/// pointer calling convention.
static bool canBeCalledWithBarePointers(gpu::GPUFuncOp func) {
  bool canBeBare = true;
  for (Type type : func.getArgumentTypes())
    if (auto memrefTy = dyn_cast<BaseMemRefType>(type))
      canBeBare &= LLVMTypeConverter::canConvertToBarePtr(memrefTy);
  return canBeBare;
}

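/// Returns the lane ID of the current thread as an i32 value. With an all-ones
/// mask, mbcnt.lo counts the lanes below the current lane among the lower 32
/// lanes and mbcnt.hi adds the lanes below the current lane among the upper 32
/// lanes, which together yield the lane ID on both wave32 and wave64.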
Value getLaneId(ConversionPatternRewriter &rewriter, Location loc,
                const unsigned indexBitwidth) {
  auto int32Type = IntegerType::get(rewriter.getContext(), 32);
  Value zero = rewriter.createOrFold<arith::ConstantIntOp>(loc, 0, 32);
  Value minus1 = rewriter.createOrFold<arith::ConstantIntOp>(loc, -1, 32);
  Value mbcntLo = rewriter.create<ROCDL::MbcntLoOp>(loc, int32Type,
                                                    ValueRange{minus1, zero});
  Value laneId = rewriter.create<ROCDL::MbcntHiOp>(loc, int32Type,
                                                   ValueRange{minus1, mbcntLo});
  return laneId;
}

namespace {
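/// Lowers gpu.lane_id to the ROCDL mbcnt intrinsics, then truncates or extends
/// the 32-bit result to the index bitwidth used by the type converter.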
struct GPULaneIdOpToROCDL : ConvertOpToLLVMPattern<gpu::LaneIdOp> {
  using ConvertOpToLLVMPattern<gpu::LaneIdOp>::ConvertOpToLLVMPattern;

  LogicalResult
  matchAndRewrite(gpu::LaneIdOp op, gpu::LaneIdOp::Adaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    auto loc = op->getLoc();
    MLIRContext *context = rewriter.getContext();
    // convert to:  %mlo = call @llvm.amdgcn.mbcnt.lo(-1, 0)
    // followed by: %lid = call @llvm.amdgcn.mbcnt.hi(-1, %mlo)

    Type intTy = IntegerType::get(context, 32);
    Value zero = rewriter.createOrFold<arith::ConstantIntOp>(loc, 0, 32);
    Value minus1 = rewriter.createOrFold<arith::ConstantIntOp>(loc, -1, 32);
    Value mbcntLo =
        rewriter.create<ROCDL::MbcntLoOp>(loc, intTy, ValueRange{minus1, zero});
    Value laneId = rewriter.create<ROCDL::MbcntHiOp>(
        loc, intTy, ValueRange{minus1, mbcntLo});
    // Truncate or extend the result depending on the index bitwidth specified
    // by the LLVMTypeConverter options.
    const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth();
    if (indexBitwidth > 32) {
      laneId = rewriter.create<LLVM::SExtOp>(
          loc, IntegerType::get(context, indexBitwidth), laneId);
    } else if (indexBitwidth < 32) {
      laneId = rewriter.create<LLVM::TruncOp>(
          loc, IntegerType::get(context, indexBitwidth), laneId);
    }
    rewriter.replaceOp(op, {laneId});
    return success();
  }
};

struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
  using ConvertOpToLLVMPattern<gpu::ShuffleOp>::ConvertOpToLLVMPattern;

  /// Lowers a shuffle to the corresponding ROCDL ops.
  ///
  /// The `width` argument is used to check whether the source lane is
  /// participating; if it is not, the destination lane is the lane itself.
  ///
  ///  Shuffle with DS Bpermute:
  ///   let shflMode = [xor, up, down, idx]
  ///   let width = 32 (usually the warp size), step = [1, 2, 4, 8, 16, ..., width].
  ///   1. curLaneId = using mbcnt.lo + mbcnt.hi
  ///   2. widthOrZeroIfOutside = (curLaneId + width) & -width
  ///   3. dstLane = shflMode(curLaneId, step)
  ///   4. isActiveSrcLane = dstLane < widthOrZeroIfOutside
  ///   5. dstLane = isActiveSrcLane ? dstLane : curLaneId
  ///   6. dwordAlignedDstLane = dstLane * 4 or dstLane << 2.
  ///   7. bpermute(dwordAlignedDstLane, shfl_value).
  ///
  LogicalResult
  matchAndRewrite(gpu::ShuffleOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    Location loc = op->getLoc();
    // TODO: Add support for non 32-bit shuffle values.
    if (adaptor.getValue().getType().getIntOrFloatBitWidth() != 32)
      return failure();
    const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth();
    Value srcLaneId = getLaneId(rewriter, loc, indexBitwidth);

    auto int32Type = IntegerType::get(rewriter.getContext(), 32);
    Value width = adaptor.getWidth();
    Value zero = rewriter.create<LLVM::ConstantOp>(loc, int32Type, 0);
    Value negwidth = rewriter.create<LLVM::SubOp>(loc, int32Type, zero, width);
    Value add = rewriter.create<LLVM::AddOp>(loc, int32Type, srcLaneId, width);
    Value widthOrZeroIfOutside =
        rewriter.create<LLVM::AndOp>(loc, int32Type, add, negwidth);
    Value dstLane;
    // TODO: Add support for gpu::ShuffleMode::UP and gpu::ShuffleMode::DOWN.
    // TODO: Use ds_swizzle for XOR when step/offsets are constants for better
    // perf.
    switch (op.getMode()) {
    case gpu::ShuffleMode::XOR:
      dstLane = rewriter.create<LLVM::XOrOp>(loc, int32Type, srcLaneId,
                                             adaptor.getOffset());
      break;
    case gpu::ShuffleMode::IDX:
      dstLane = adaptor.getOffset();
      break;
    default:
      return failure();
    }
    Value isActiveSrcLane = rewriter.create<LLVM::ICmpOp>(
        loc, LLVM::ICmpPredicate::slt, dstLane, widthOrZeroIfOutside);
    Value selectDstLane = rewriter.create<LLVM::SelectOp>(loc, isActiveSrcLane,
                                                          dstLane, srcLaneId);
    Value two = rewriter.create<LLVM::ConstantOp>(loc, int32Type, 2);
    Value dwordAlignedDstLane =
        rewriter.create<LLVM::ShlOp>(loc, int32Type, selectDstLane, two);
    Value initShflValue = adaptor.getValue();
    if (adaptor.getValue().getType().isF32()) {
      initShflValue =
          rewriter.create<LLVM::BitcastOp>(loc, int32Type, initShflValue);
    }
    Value shflValue = rewriter.create<ROCDL::DsBpermuteOp>(
        loc, int32Type, dwordAlignedDstLane, initShflValue);
    if (adaptor.getValue().getType().isF32()) {
      shflValue = rewriter.create<LLVM::BitcastOp>(
          loc, adaptor.getValue().getType(), shflValue);
    }
    rewriter.replaceOp(op, {shflValue, isActiveSrcLane});
    return success();
  }
};

/// Import the GPU Ops to ROCDL Patterns.
#include "GPUToROCDL.cpp.inc"

// A pass that replaces all occurrences of GPU device operations with their
// corresponding ROCDL equivalent.
//
// This pass only handles device code and is not meant to be run on GPU host
// code.
struct LowerGpuOpsToROCDLOpsPass
    : public impl::ConvertGpuOpsToROCDLOpsBase<LowerGpuOpsToROCDLOpsPass> {
  LowerGpuOpsToROCDLOpsPass() = default;
  LowerGpuOpsToROCDLOpsPass(const std::string &chipset, unsigned indexBitwidth,
                            bool useBarePtrCallConv,
                            gpu::amd::Runtime runtime) {
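    // Constructor arguments only take effect when the corresponding pass
    // options were not already set on the command line.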
    if (this->chipset.getNumOccurrences() == 0)
      this->chipset = chipset;
    if (this->indexBitwidth.getNumOccurrences() == 0)
      this->indexBitwidth = indexBitwidth;
    if (this->useBarePtrCallConv.getNumOccurrences() == 0)
      this->useBarePtrCallConv = useBarePtrCallConv;
    if (this->runtime.getNumOccurrences() == 0)
      this->runtime = runtime;
  }

  void runOnOperation() override {
    gpu::GPUModuleOp m = getOperation();
    MLIRContext *ctx = m.getContext();

    // Request C wrapper emission.
    for (auto func : m.getOps<func::FuncOp>()) {
      func->setAttr(LLVM::LLVMDialect::getEmitCWrapperAttrName(),
                    UnitAttr::get(ctx));
    }

    FailureOr<amdgpu::Chipset> maybeChipset = amdgpu::Chipset::parse(chipset);
    if (failed(maybeChipset)) {
      emitError(UnknownLoc::get(ctx), "Invalid chipset name: " + chipset);
      return signalPassFailure();
    }

    // Customize the bitwidth used for the device side index computations.
    LowerToLLVMOptions options(
        ctx, DataLayout(cast<DataLayoutOpInterface>(m.getOperation())));
    if (indexBitwidth != kDeriveIndexBitwidthFromDataLayout)
      options.overrideIndexBitwidth(indexBitwidth);

    if (useBarePtrCallConv) {
      options.useBarePtrCallConv = true;
      WalkResult canUseBarePointers =
          m.walk([](gpu::GPUFuncOp func) -> WalkResult {
            if (canBeCalledWithBarePointers(func))
              return WalkResult::advance();
            return WalkResult::interrupt();
          });
      if (canUseBarePointers.wasInterrupted()) {
        emitError(UnknownLoc::get(ctx),
                  "bare pointer calling convention requires all memrefs to "
                  "have static shape and use the identity map");
        return signalPassFailure();
      }
    }

    // Apply in-dialect lowering first. These rewrites expand GPU ops into ops
    // that themselves still need lowering, which a single conversion pass
    // cannot handle.
    {
      RewritePatternSet patterns(ctx);
      populateGpuRewritePatterns(patterns);
      arith::populateExpandBFloat16Patterns(patterns);
      (void)applyPatternsAndFoldGreedily(m, std::move(patterns));
    }

    LLVMTypeConverter converter(ctx, options);
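    // Map GPU memory space attributes to the AMDGPU address spaces:
    // 1 = global, 3 = workgroup (LDS), 5 = private (scratch).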
    populateGpuMemorySpaceAttributeConversions(
        converter, [](gpu::AddressSpace space) {
          switch (space) {
          case gpu::AddressSpace::Global:
            return 1;
          case gpu::AddressSpace::Workgroup:
            return 3;
          case gpu::AddressSpace::Private:
            return 5;
          }
          llvm_unreachable("unknown address space enum value");
          return 0;
        });

    RewritePatternSet llvmPatterns(ctx);

    mlir::arith::populateArithToLLVMConversionPatterns(converter, llvmPatterns);
    populateAMDGPUToROCDLConversionPatterns(converter, llvmPatterns,
                                            *maybeChipset);
    populateVectorToLLVMConversionPatterns(converter, llvmPatterns);
    cf::populateControlFlowToLLVMConversionPatterns(converter, llvmPatterns);
    populateFuncToLLVMConversionPatterns(converter, llvmPatterns);
    populateFinalizeMemRefToLLVMConversionPatterns(converter, llvmPatterns);
    populateGpuToROCDLConversionPatterns(converter, llvmPatterns, runtime);
    LLVMConversionTarget target(getContext());
    configureGpuToROCDLConversionLegality(target);
    if (failed(applyPartialConversion(m, target, std::move(llvmPatterns))))
      signalPassFailure();

    // Manually rewrite known block size attributes so the LLVMIR translation
    // infrastructure can pick them up.
    m.walk([ctx](LLVM::LLVMFuncOp op) {
      if (auto blockSizes = dyn_cast_or_null<DenseI32ArrayAttr>(
              op->removeAttr(gpu::GPUFuncOp::getKnownBlockSizeAttrName()))) {
        op->setAttr(ROCDL::ROCDLDialect::getReqdWorkGroupSizeAttrName(),
                    blockSizes);
        // Also set up the rocdl.flat_work_group_size attribute to prevent
        // conflicting metadata.
        uint32_t flatSize = 1;
        for (uint32_t size : blockSizes.asArrayRef()) {
          flatSize *= size;
        }
        StringAttr flatSizeAttr =
            StringAttr::get(ctx, Twine(flatSize) + "," + Twine(flatSize));
        op->setAttr(ROCDL::ROCDLDialect::getFlatWorkGroupSizeAttrName(),
                    flatSizeAttr);
      }
    });
  }
};

} // namespace

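/// Configures the conversion target for lowering GPU ops to ROCDL: the LLVM
/// and ROCDL dialects are legal, the GPU dialect and the listed LLVM math ops
/// are illegal, and a few GPU module ops stay legal until replacing non-root
/// ops is supported (see the TODO below).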
void mlir::configureGpuToROCDLConversionLegality(ConversionTarget &target) {
  target.addIllegalOp<func::FuncOp>();
  target.addLegalDialect<::mlir::LLVM::LLVMDialect>();
  target.addLegalDialect<ROCDL::ROCDLDialect>();
  target.addIllegalDialect<gpu::GPUDialect>();
  target.addIllegalOp<LLVM::CosOp, LLVM::ExpOp, LLVM::Exp2Op, LLVM::FAbsOp,
                      LLVM::FCeilOp, LLVM::FFloorOp, LLVM::FRemOp, LLVM::LogOp,
                      LLVM::Log10Op, LLVM::Log2Op, LLVM::PowOp, LLVM::SinOp,
                      LLVM::SqrtOp>();

  // TODO: Remove once we support replacing non-root ops.
  target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp, gpu::ModuleEndOp>();
}

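/// Adds patterns that scalarize vector instances of OpTy and lower the scalar
/// op to a call of the given f32/f64 device-library function.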
template <typename OpTy>
static void populateOpPatterns(LLVMTypeConverter &converter,
                               RewritePatternSet &patterns, StringRef f32Func,
                               StringRef f64Func) {
  patterns.add<ScalarizeVectorOpLowering<OpTy>>(converter);
  patterns.add<OpToFuncCallLowering<OpTy>>(converter, f32Func, f64Func);
}

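/// Populates `patterns` with the GPU-to-ROCDL conversion patterns: index
/// intrinsics, gpu.func/gpu.return, printf, dynamic shared memory, shuffle and
/// lane-id lowerings, and math ops mapped to the __ocml_* device library.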
void mlir::populateGpuToROCDLConversionPatterns(
    LLVMTypeConverter &converter, RewritePatternSet &patterns,
    mlir::gpu::amd::Runtime runtime) {
  using mlir::gpu::amd::Runtime;

  populateWithGenerated(patterns);
  patterns
      .add<GPUIndexIntrinsicOpLowering<gpu::ThreadIdOp, ROCDL::ThreadIdXOp,
                                       ROCDL::ThreadIdYOp, ROCDL::ThreadIdZOp>>(
          converter, gpu::GPUFuncOp::getKnownBlockSizeAttrName());
  patterns.add<GPUIndexIntrinsicOpLowering<
      gpu::BlockIdOp, ROCDL::BlockIdXOp, ROCDL::BlockIdYOp, ROCDL::BlockIdZOp>>(
      converter, gpu::GPUFuncOp::getKnownGridSizeAttrName());
  patterns
      .add<GPUIndexIntrinsicOpLowering<gpu::BlockDimOp, ROCDL::BlockDimXOp,
                                       ROCDL::BlockDimYOp, ROCDL::BlockDimZOp>,
           GPUIndexIntrinsicOpLowering<gpu::GridDimOp, ROCDL::GridDimXOp,
                                       ROCDL::GridDimYOp, ROCDL::GridDimZOp>,
           GPUReturnOpLowering>(converter);
  patterns.add<GPUFuncOpLowering>(
      converter,
      /*allocaAddrSpace=*/ROCDL::ROCDLDialect::kPrivateMemoryAddressSpace,
      /*workgroupAddrSpace=*/ROCDL::ROCDLDialect::kSharedMemoryAddressSpace,
      StringAttr::get(&converter.getContext(),
                      ROCDL::ROCDLDialect::getKernelFuncAttrName()));
  if (Runtime::HIP == runtime) {
    patterns.add<GPUPrintfOpToHIPLowering>(converter);
  } else if (Runtime::OpenCL == runtime) {
    // Use address space = 4 to match the OpenCL definition of printf()
    patterns.add<GPUPrintfOpToLLVMCallLowering>(converter, /*addressSpace=*/4);
  }
  // TODO: Add alignment for workgroup memory
  patterns.add<GPUDynamicSharedMemoryOpLowering>(converter);

  patterns.add<GPUShuffleOpLowering, GPULaneIdOpToROCDL>(converter);

  populateOpPatterns<math::AbsFOp>(converter, patterns, "__ocml_fabs_f32",
                                   "__ocml_fabs_f64");
  populateOpPatterns<math::AtanOp>(converter, patterns, "__ocml_atan_f32",
                                   "__ocml_atan_f64");
  populateOpPatterns<math::Atan2Op>(converter, patterns, "__ocml_atan2_f32",
                                    "__ocml_atan2_f64");
  populateOpPatterns<math::CbrtOp>(converter, patterns, "__ocml_cbrt_f32",
                                   "__ocml_cbrt_f64");
  populateOpPatterns<math::CeilOp>(converter, patterns, "__ocml_ceil_f32",
                                   "__ocml_ceil_f64");
  populateOpPatterns<math::CosOp>(converter, patterns, "__ocml_cos_f32",
                                  "__ocml_cos_f64");
  populateOpPatterns<math::ExpOp>(converter, patterns, "__ocml_exp_f32",
                                  "__ocml_exp_f64");
  populateOpPatterns<math::Exp2Op>(converter, patterns, "__ocml_exp2_f32",
                                   "__ocml_exp2_f64");
  populateOpPatterns<math::ExpM1Op>(converter, patterns, "__ocml_expm1_f32",
                                    "__ocml_expm1_f64");
  populateOpPatterns<math::FloorOp>(converter, patterns, "__ocml_floor_f32",
                                    "__ocml_floor_f64");
  populateOpPatterns<arith::RemFOp>(converter, patterns, "__ocml_fmod_f32",
                                    "__ocml_fmod_f64");
  populateOpPatterns<math::LogOp>(converter, patterns, "__ocml_log_f32",
                                  "__ocml_log_f64");
  populateOpPatterns<math::Log10Op>(converter, patterns, "__ocml_log10_f32",
                                    "__ocml_log10_f64");
  populateOpPatterns<math::Log1pOp>(converter, patterns, "__ocml_log1p_f32",
                                    "__ocml_log1p_f64");
  populateOpPatterns<math::Log2Op>(converter, patterns, "__ocml_log2_f32",
                                   "__ocml_log2_f64");
  populateOpPatterns<math::PowFOp>(converter, patterns, "__ocml_pow_f32",
                                   "__ocml_pow_f64");
  populateOpPatterns<math::RsqrtOp>(converter, patterns, "__ocml_rsqrt_f32",
                                    "__ocml_rsqrt_f64");
  populateOpPatterns<math::SinOp>(converter, patterns, "__ocml_sin_f32",
                                  "__ocml_sin_f64");
  populateOpPatterns<math::SqrtOp>(converter, patterns, "__ocml_sqrt_f32",
                                   "__ocml_sqrt_f64");
  populateOpPatterns<math::TanhOp>(converter, patterns, "__ocml_tanh_f32",
                                   "__ocml_tanh_f64");
  populateOpPatterns<math::TanOp>(converter, patterns, "__ocml_tan_f32",
                                  "__ocml_tan_f64");
  populateOpPatterns<math::ErfOp>(converter, patterns, "__ocml_erf_f32",
                                  "__ocml_erf_f64");
}

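/// Creates a pass that lowers GPU device module code to the ROCDL dialect
/// using the given chipset, index bitwidth, calling convention, and runtime.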
std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
mlir::createLowerGpuOpsToROCDLOpsPass(const std::string &chipset,
                                      unsigned indexBitwidth,
                                      bool useBarePtrCallConv,
                                      gpu::amd::Runtime runtime) {
  return std::make_unique<LowerGpuOpsToROCDLOpsPass>(
      chipset, indexBitwidth, useBarePtrCallConv, runtime);
}