AMDGPUPropagateAttributes.cpp 9.92 KB
//===--- AMDGPUPropagateAttributes.cpp --------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief This pass propagates attributes from kernels to the non-entry
/// functions. Most of the library functions were not compiled for specific ABI,
/// yet will be correctly compiled if proper attrbutes are propagated from the
/// caller.
///
/// The pass analyzes call graph and propagates ABI target features through the
/// call graph.
///
/// It can run in two modes: as a function or module pass. A function pass
/// simply propagates attributes. A module pass clones functions if there are
/// callers with different ABI. If a function is clonned all call sites will
/// be updated to use a correct clone.
///
/// A function pass is limited in functionality but can run early in the
/// pipeline. A module pass is more powerful but has to run late, so misses
/// library folding opportunities.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include <string>

#define DEBUG_TYPE "amdgpu-propagate-attributes"

using namespace llvm;

namespace llvm {
extern const SubtargetFeatureKV AMDGPUFeatureKV[AMDGPU::NumSubtargetFeatures-1];
}

namespace {

class AMDGPUPropagateAttributes {
  const FeatureBitset TargetFeatures = {
    AMDGPU::FeatureWavefrontSize16,
    AMDGPU::FeatureWavefrontSize32,
    AMDGPU::FeatureWavefrontSize64
  };

  class Clone{
  public:
    Clone(FeatureBitset FeatureMask, Function *OrigF, Function *NewF) :
      FeatureMask(FeatureMask), OrigF(OrigF), NewF(NewF) {}

    FeatureBitset FeatureMask;
    Function *OrigF;
    Function *NewF;
  };

  const TargetMachine *TM;

  // Clone functions as needed or just set attributes.
  bool AllowClone;

  // Option propagation roots.
  SmallSet<Function *, 32> Roots;

  // Clones of functions with their attributes.
  SmallVector<Clone, 32> Clones;

  // Find a clone with required features.
  Function *findFunction(const FeatureBitset &FeaturesNeeded,
                         Function *OrigF);

  // Clone function F and set NewFeatures on the clone.
  // Cole takes the name of original function.
  Function *cloneWithFeatures(Function &F,
                              const FeatureBitset &NewFeatures);

  // Set new function's features in place.
  void setFeatures(Function &F, const FeatureBitset &NewFeatures);

  std::string getFeatureString(const FeatureBitset &Features) const;

  // Propagate attributes from Roots.
  bool process();

public:
  AMDGPUPropagateAttributes(const TargetMachine *TM, bool AllowClone) :
    TM(TM), AllowClone(AllowClone) {}

  // Use F as a root and propagate its attributes.
  bool process(Function &F);

  // Propagate attributes starting from kernel functions.
  bool process(Module &M);
};

// Allows to propagate attributes early, but no clonning is allowed as it must
// be a function pass to run before any optimizations.
// TODO: We shall only need a one instance of module pass, but that needs to be
// in the linker pipeline which is currently not possible.
class AMDGPUPropagateAttributesEarly : public FunctionPass {
  const TargetMachine *TM;

public:
  static char ID; // Pass identification

  AMDGPUPropagateAttributesEarly(const TargetMachine *TM = nullptr) :
    FunctionPass(ID), TM(TM) {
    initializeAMDGPUPropagateAttributesEarlyPass(
      *PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override;
};

// Allows to propagate attributes with clonning but does that late in the
// pipeline.
class AMDGPUPropagateAttributesLate : public ModulePass {
  const TargetMachine *TM;

public:
  static char ID; // Pass identification

  AMDGPUPropagateAttributesLate(const TargetMachine *TM = nullptr) :
    ModulePass(ID), TM(TM) {
    initializeAMDGPUPropagateAttributesLatePass(
      *PassRegistry::getPassRegistry());
  }

  bool runOnModule(Module &M) override;
};

}  // end anonymous namespace.

char AMDGPUPropagateAttributesEarly::ID = 0;
char AMDGPUPropagateAttributesLate::ID = 0;

INITIALIZE_PASS(AMDGPUPropagateAttributesEarly,
                "amdgpu-propagate-attributes-early",
                "Early propagate attributes from kernels to functions",
                false, false)
INITIALIZE_PASS(AMDGPUPropagateAttributesLate,
                "amdgpu-propagate-attributes-late",
                "Late propagate attributes from kernels to functions",
                false, false)

Function *
AMDGPUPropagateAttributes::findFunction(const FeatureBitset &FeaturesNeeded,
                                        Function *OrigF) {
  // TODO: search for clone's clones.
  for (Clone &C : Clones)
    if (C.OrigF == OrigF && FeaturesNeeded == C.FeatureMask)
      return C.NewF;

  return nullptr;
}

bool AMDGPUPropagateAttributes::process(Module &M) {
  for (auto &F : M.functions())
    if (AMDGPU::isEntryFunctionCC(F.getCallingConv()))
      Roots.insert(&F);

  return process();
}

bool AMDGPUPropagateAttributes::process(Function &F) {
  Roots.insert(&F);
  return process();
}

bool AMDGPUPropagateAttributes::process() {
  bool Changed = false;
  SmallSet<Function *, 32> NewRoots;
  SmallSet<Function *, 32> Replaced;

  if (Roots.empty())
    return false;
  Module &M = *(*Roots.begin())->getParent();

  do {
    Roots.insert(NewRoots.begin(), NewRoots.end());
    NewRoots.clear();

    for (auto &F : M.functions()) {
      if (F.isDeclaration() || Roots.count(&F) || Roots.count(&F))
        continue;

      const FeatureBitset &CalleeBits =
        TM->getSubtargetImpl(F)->getFeatureBits();
      SmallVector<std::pair<CallBase *, Function *>, 32> ToReplace;

      for (User *U : F.users()) {
        Instruction *I = dyn_cast<Instruction>(U);
        if (!I)
          continue;
        CallBase *CI = dyn_cast<CallBase>(I);
        if (!CI)
          continue;
        Function *Caller = CI->getCaller();
        if (!Caller)
          continue;
        if (!Roots.count(Caller))
          continue;

        const FeatureBitset &CallerBits =
          TM->getSubtargetImpl(*Caller)->getFeatureBits() & TargetFeatures;

        if (CallerBits == (CalleeBits  & TargetFeatures)) {
          NewRoots.insert(&F);
          continue;
        }

        Function *NewF = findFunction(CallerBits, &F);
        if (!NewF) {
          FeatureBitset NewFeatures((CalleeBits & ~TargetFeatures) |
                                    CallerBits);
          if (!AllowClone) {
            // This may set different features on different iteartions if
            // there is a contradiction in callers' attributes. In this case
            // we rely on a second pass running on Module, which is allowed
            // to clone.
            setFeatures(F, NewFeatures);
            NewRoots.insert(&F);
            Changed = true;
            break;
          }

          NewF = cloneWithFeatures(F, NewFeatures);
          Clones.push_back(Clone(CallerBits, &F, NewF));
          NewRoots.insert(NewF);
        }

        ToReplace.push_back(std::make_pair(CI, NewF));
        Replaced.insert(&F);

        Changed = true;
      }

      while (!ToReplace.empty()) {
        auto R = ToReplace.pop_back_val();
        R.first->setCalledFunction(R.second);
      }
    }
  } while (!NewRoots.empty());

  for (Function *F : Replaced) {
    if (F->use_empty())
      F->eraseFromParent();
  }

  return Changed;
}

Function *
AMDGPUPropagateAttributes::cloneWithFeatures(Function &F,
                                             const FeatureBitset &NewFeatures) {
  LLVM_DEBUG(dbgs() << "Cloning " << F.getName() << '\n');

  ValueToValueMapTy dummy;
  Function *NewF = CloneFunction(&F, dummy);
  setFeatures(*NewF, NewFeatures);

  // Swap names. If that is the only clone it will retain the name of now
  // dead value.
  if (F.hasName()) {
    std::string NewName = NewF->getName();
    NewF->takeName(&F);
    F.setName(NewName);

    // Name has changed, it does not need an external symbol.
    F.setVisibility(GlobalValue::DefaultVisibility);
    F.setLinkage(GlobalValue::InternalLinkage);
  }

  return NewF;
}

void AMDGPUPropagateAttributes::setFeatures(Function &F,
                                            const FeatureBitset &NewFeatures) {
  std::string NewFeatureStr = getFeatureString(NewFeatures);

  LLVM_DEBUG(dbgs() << "Set features "
                    << getFeatureString(NewFeatures & TargetFeatures)
                    << " on " << F.getName() << '\n');

  F.removeFnAttr("target-features");
  F.addFnAttr("target-features", NewFeatureStr);
}

std::string
AMDGPUPropagateAttributes::getFeatureString(const FeatureBitset &Features) const
{
  std::string Ret;
  for (const SubtargetFeatureKV &KV : AMDGPUFeatureKV) {
    if (Features[KV.Value])
      Ret += (StringRef("+") + KV.Key + ",").str();
    else if (TargetFeatures[KV.Value])
      Ret += (StringRef("-") + KV.Key + ",").str();
  }
  Ret.pop_back(); // Remove last comma.
  return Ret;
}

bool AMDGPUPropagateAttributesEarly::runOnFunction(Function &F) {
  if (!TM || !AMDGPU::isEntryFunctionCC(F.getCallingConv()))
    return false;

  return AMDGPUPropagateAttributes(TM, false).process(F);
}

bool AMDGPUPropagateAttributesLate::runOnModule(Module &M) {
  if (!TM)
    return false;

  return AMDGPUPropagateAttributes(TM, true).process(M);
}

FunctionPass
*llvm::createAMDGPUPropagateAttributesEarlyPass(const TargetMachine *TM) {
  return new AMDGPUPropagateAttributesEarly(TM);
}

ModulePass
*llvm::createAMDGPUPropagateAttributesLatePass(const TargetMachine *TM) {
  return new AMDGPUPropagateAttributesLate(TM);
}