mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp - llvm-project - Git at Google

 //===- ConvertKernelFuncToCubin.cpp - MLIR GPU lowering passes ------------===//
 //
 // Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
 // This file implements a pass to convert gpu kernel functions into a
 // corresponding binary blob that can be executed on a CUDA GPU. Currently
 // only translates the function itself but no dependencies.
 //
 //===----------------------------------------------------------------------===//

 #include "mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h"

 #include "mlir/Dialect/GPU/GPUDialect.h"
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/Function.h"
 #include "mlir/IR/Module.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Pass/PassRegistry.h"
 #include "mlir/Support/LogicalResult.h"
 #include "mlir/Target/NVVMIR.h"

 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/TargetSelect.h"
 #include "llvm/Target/TargetMachine.h"

 using namespace mlir;

 namespace {
 // TODO(herhut): Move to shared location.
 static constexpr const char *kCubinAnnotation = "nvvm.cubin";

 /// A pass converting tagged kernel modules to cubin blobs.
 ///
 /// If tagged as a kernel module, each contained function is translated to NVVM
 /// IR and further to PTX. A user provided CubinGenerator compiles the PTX to
 /// GPU binary code, which is then attached as an attribute to the function. The
 /// function body is erased.
 class GpuKernelToCubinPass : public ModulePass<GpuKernelToCubinPass> {
 public:
   GpuKernelToCubinPass(
       CubinGenerator cubinGenerator = compilePtxToCubinForTesting)
       : cubinGenerator(cubinGenerator) {}

   void runOnModule() override {
     ModuleOp module = getModule();
     if (!module.getAttrOfType<UnitAttr>(
             gpu::GPUDialect::getKernelModuleAttrName()) ||
         !module.getName())
       return;

     // Make sure the NVPTX target is initialized.
     LLVMInitializeNVPTXTarget();
     LLVMInitializeNVPTXTargetInfo();
     LLVMInitializeNVPTXTargetMC();
     LLVMInitializeNVPTXAsmPrinter();

     auto llvmModule = translateModuleToNVVMIR(module);
     if (!llvmModule)
       return signalPassFailure();

     // Translate the module to CUBIN and attach the result as attribute to the
     // module.
     if (auto cubinAttr = translateGpuModuleToCubinAnnotation(
             *llvmModule, module.getLoc(), *module.getName()))
       module.setAttr(kCubinAnnotation, cubinAttr);
     else
       signalPassFailure();
   }

 private:
   static OwnedCubin compilePtxToCubinForTesting(const std::string &ptx,
                                                 Location, StringRef);

   std::string translateModuleToPtx(llvm::Module &module,
                                    llvm::TargetMachine &target_machine);

   /// Converts llvmModule to cubin using the user-provided generator. Location
   /// is used for error reporting and name is forwarded to the CUBIN generator
   /// to use in its logging mechanisms.
   OwnedCubin convertModuleToCubin(llvm::Module &llvmModule, Location loc,
                                   StringRef name);

   /// Translates llvmModule to cubin and returns the result as attribute.
   StringAttr translateGpuModuleToCubinAnnotation(llvm::Module &llvmModule,
                                                  Location loc, StringRef name);

   CubinGenerator cubinGenerator;
 };

 } // anonymous namespace

 std::string GpuKernelToCubinPass::translateModuleToPtx(
     llvm::Module &module, llvm::TargetMachine &target_machine) {
   std::string ptx;
   {
     llvm::raw_string_ostream stream(ptx);
     llvm::buffer_ostream pstream(stream);
     llvm::legacy::PassManager codegen_passes;
     target_machine.addPassesToEmitFile(codegen_passes, pstream, nullptr,
                                        llvm::CGFT_AssemblyFile);
     codegen_passes.run(module);
   }

   return ptx;
 }

 OwnedCubin
 GpuKernelToCubinPass::compilePtxToCubinForTesting(const std::string &ptx,
                                                   Location, StringRef) {
   const char data[] = "CUBIN";
   return std::make_unique<std::vector<char>>(data, data + sizeof(data) - 1);
 }

 OwnedCubin GpuKernelToCubinPass::convertModuleToCubin(llvm::Module &llvmModule,
                                                       Location loc,
                                                       StringRef name) {
   std::unique_ptr<llvm::TargetMachine> targetMachine;
   {
     std::string error;
     // TODO(herhut): Make triple configurable.
     constexpr const char *cudaTriple = "nvptx64-nvidia-cuda";
     llvm::Triple triple(cudaTriple);
     const llvm::Target *target =
         llvm::TargetRegistry::lookupTarget("", triple, error);
     if (target == nullptr) {
       emitError(loc, "cannot initialize target triple");
       return {};
     }
     targetMachine.reset(
         target->createTargetMachine(triple.str(), "sm_35", "+ptx60", {}, {}));
   }

   // Set the data layout of the llvm module to match what the ptx target needs.
   llvmModule.setDataLayout(targetMachine->createDataLayout());

   auto ptx = translateModuleToPtx(llvmModule, *targetMachine);

   return cubinGenerator(ptx, loc, name);
 }

 StringAttr GpuKernelToCubinPass::translateGpuModuleToCubinAnnotation(
     llvm::Module &llvmModule, Location loc, StringRef name) {
   auto cubin = convertModuleToCubin(llvmModule, loc, name);
   if (!cubin)
     return {};
   return StringAttr::get({cubin->data(), cubin->size()}, loc->getContext());
 }

 std::unique_ptr<OpPassBase<ModuleOp>>
 mlir::createConvertGPUKernelToCubinPass(CubinGenerator cubinGenerator) {
   return std::make_unique<GpuKernelToCubinPass>(cubinGenerator);
 }

 static PassRegistration<GpuKernelToCubinPass>
     pass("test-kernel-to-cubin",
          "Convert all kernel functions to CUDA cubin blobs");
	//===- ConvertKernelFuncToCubin.cpp - MLIR GPU lowering passes ------------===//
	//
	// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This file implements a pass to convert gpu kernel functions into a
	// corresponding binary blob that can be executed on a CUDA GPU. Currently
	// only translates the function itself but no dependencies.
	//
	//===----------------------------------------------------------------------===//

	#include "mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h"

	#include "mlir/Dialect/GPU/GPUDialect.h"
	#include "mlir/IR/Attributes.h"
	#include "mlir/IR/Builders.h"
	#include "mlir/IR/Function.h"
	#include "mlir/IR/Module.h"
	#include "mlir/Pass/Pass.h"
	#include "mlir/Pass/PassRegistry.h"
	#include "mlir/Support/LogicalResult.h"
	#include "mlir/Target/NVVMIR.h"

	#include "llvm/ADT/Optional.h"
	#include "llvm/ADT/Twine.h"
	#include "llvm/IR/Constants.h"
	#include "llvm/IR/LegacyPassManager.h"
	#include "llvm/IR/Module.h"
	#include "llvm/Support/Error.h"
	#include "llvm/Support/TargetRegistry.h"
	#include "llvm/Support/TargetSelect.h"
	#include "llvm/Target/TargetMachine.h"

	using namespace mlir;

	namespace {
	// TODO(herhut): Move to shared location.
	static constexpr const char *kCubinAnnotation = "nvvm.cubin";

	/// A pass converting tagged kernel modules to cubin blobs.
	///
	/// If tagged as a kernel module, each contained function is translated to NVVM
	/// IR and further to PTX. A user provided CubinGenerator compiles the PTX to
	/// GPU binary code, which is then attached as an attribute to the function. The
	/// function body is erased.
	class GpuKernelToCubinPass : public ModulePass<GpuKernelToCubinPass> {
	public:
	GpuKernelToCubinPass(
	CubinGenerator cubinGenerator = compilePtxToCubinForTesting)
	: cubinGenerator(cubinGenerator) {}

	void runOnModule() override {
	ModuleOp module = getModule();
	if (!module.getAttrOfType<UnitAttr>(
	gpu::GPUDialect::getKernelModuleAttrName()) \|\|
	!module.getName())
	return;

	// Make sure the NVPTX target is initialized.
	LLVMInitializeNVPTXTarget();
	LLVMInitializeNVPTXTargetInfo();
	LLVMInitializeNVPTXTargetMC();
	LLVMInitializeNVPTXAsmPrinter();

	auto llvmModule = translateModuleToNVVMIR(module);
	if (!llvmModule)
	return signalPassFailure();

	// Translate the module to CUBIN and attach the result as attribute to the
	// module.
	if (auto cubinAttr = translateGpuModuleToCubinAnnotation(
	llvmModule, module.getLoc(), module.getName()))
	module.setAttr(kCubinAnnotation, cubinAttr);
	else
	signalPassFailure();
	}

	private:
	static OwnedCubin compilePtxToCubinForTesting(const std::string &ptx,
	Location, StringRef);

	std::string translateModuleToPtx(llvm::Module &module,
	llvm::TargetMachine &target_machine);

	/// Converts llvmModule to cubin using the user-provided generator. Location
	/// is used for error reporting and name is forwarded to the CUBIN generator
	/// to use in its logging mechanisms.
	OwnedCubin convertModuleToCubin(llvm::Module &llvmModule, Location loc,
	StringRef name);

	/// Translates llvmModule to cubin and returns the result as attribute.
	StringAttr translateGpuModuleToCubinAnnotation(llvm::Module &llvmModule,
	Location loc, StringRef name);

	CubinGenerator cubinGenerator;
	};

	} // anonymous namespace

	std::string GpuKernelToCubinPass::translateModuleToPtx(
	llvm::Module &module, llvm::TargetMachine &target_machine) {
	std::string ptx;
	{
	llvm::raw_string_ostream stream(ptx);
	llvm::buffer_ostream pstream(stream);
	llvm::legacy::PassManager codegen_passes;
	target_machine.addPassesToEmitFile(codegen_passes, pstream, nullptr,
	llvm::CGFT_AssemblyFile);
	codegen_passes.run(module);
	}

	return ptx;
	}

	OwnedCubin
	GpuKernelToCubinPass::compilePtxToCubinForTesting(const std::string &ptx,
	Location, StringRef) {
	const char data[] = "CUBIN";
	return std::make_unique<std::vector<char>>(data, data + sizeof(data) - 1);
	}

	OwnedCubin GpuKernelToCubinPass::convertModuleToCubin(llvm::Module &llvmModule,
	Location loc,
	StringRef name) {
	std::unique_ptr<llvm::TargetMachine> targetMachine;
	{
	std::string error;
	// TODO(herhut): Make triple configurable.
	constexpr const char *cudaTriple = "nvptx64-nvidia-cuda";
	llvm::Triple triple(cudaTriple);
	const llvm::Target *target =
	llvm::TargetRegistry::lookupTarget("", triple, error);
	if (target == nullptr) {
	emitError(loc, "cannot initialize target triple");
	return {};
	}
	targetMachine.reset(
	target->createTargetMachine(triple.str(), "sm_35", "+ptx60", {}, {}));
	}

	// Set the data layout of the llvm module to match what the ptx target needs.
	llvmModule.setDataLayout(targetMachine->createDataLayout());

	auto ptx = translateModuleToPtx(llvmModule, *targetMachine);

	return cubinGenerator(ptx, loc, name);
	}

	StringAttr GpuKernelToCubinPass::translateGpuModuleToCubinAnnotation(
	llvm::Module &llvmModule, Location loc, StringRef name) {
	auto cubin = convertModuleToCubin(llvmModule, loc, name);
	if (!cubin)
	return {};
	return StringAttr::get({cubin->data(), cubin->size()}, loc->getContext());
	}

	std::unique_ptr<OpPassBase<ModuleOp>>
	mlir::createConvertGPUKernelToCubinPass(CubinGenerator cubinGenerator) {
	return std::make_unique<GpuKernelToCubinPass>(cubinGenerator);
	}

	static PassRegistration<GpuKernelToCubinPass>
	pass("test-kernel-to-cubin",
	"Convert all kernel functions to CUDA cubin blobs");