Cluster pod kernel args: Inline the inner function by default

Use option -no-inline-pod-inner-function to avoid inlining, i.e. to get
the old behaviour.  That option is deprecated and is intended to be
removed after experimentation.
diff --git a/lib/ClusterPodKernelArgumentsPass.cpp b/lib/ClusterPodKernelArgumentsPass.cpp
index dd0a4c2..230096d 100644
--- a/lib/ClusterPodKernelArgumentsPass.cpp
+++ b/lib/ClusterPodKernelArgumentsPass.cpp
@@ -34,7 +34,9 @@
 #include <llvm/IR/Metadata.h>
 #include <llvm/IR/Module.h>
 #include <llvm/Pass.h>
+#include <llvm/Support/CommandLine.h>
 #include <llvm/Support/raw_ostream.h>
+#include <llvm/Transforms/Utils/Cloning.h>
 
 #include "ArgKind.h"
 
@@ -42,6 +44,12 @@
 
 #define DEBUG_TYPE "clusterpodkernelargs"
 
+// TODO(dneto): Remove this after experimentation.
+static llvm::cl::opt<bool> no_inline_pod_fn(
+    "no-inline-pod-inner-function", llvm::cl::init(false),
+    llvm::cl::desc("DEPRECATED. Avoid inlining the inner function created by "
+                   "clustering pod kernel args"));
+
 namespace {
 struct ClusterPodKernelArgumentsPass : public ModulePass {
   static char ID;
@@ -81,6 +89,8 @@
     }
   }
 
+  SmallVector<CallInst*, 8> CallList;
+
   for (Function* F : WorkList) {
     Changed = true;
 
@@ -245,9 +255,17 @@
 
     auto Call = Builder.CreateCall(F, CalleeArgs);
     Call->setCallingConv(F->getCallingConv());
+    CallList.push_back(Call);
 
     Builder.CreateRetVoid();
   }
 
+  if (!no_inline_pod_fn) {
+    for (CallInst *C : CallList) {
+      InlineFunctionInfo info;
+      Changed |= InlineFunction(C, info);
+    }
+  }
+
   return Changed;
 }