Add an LLVM IR library for more builtin functions (#715)

* The IR library is generated from libclc (see README.md for
  instructions to rebuild it)
  * it is included as a header in the compiler and linked with the
    source
* Added the library's license text to LICENSE due to the addition of the
  LLVM IR library
* Added a new compiler option `-cl-native-math` for the fastest math
  implementations
  * no precision guarantees
  * Causes a new pass to strip some of the library functions when a
    lower precision, faster alternative is available (e.g. for fma)
  * this option implies `-cl-fast-relaxed-math`
* Updated docs and readme
* Updated LLVM
* Updated tests
diff --git a/lib/Compiler.cpp b/lib/Compiler.cpp
index 3af1044..592b6f9 100644
--- a/lib/Compiler.cpp
+++ b/lib/Compiler.cpp
@@ -19,12 +19,15 @@
 #include "clang/Frontend/FrontendPluginRegistry.h"
 #include "clang/Frontend/TextDiagnosticPrinter.h"
 #include "clang/Lex/PreprocessorOptions.h"
+#include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Verifier.h"
+#include "llvm/IRReader/IRReader.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/LinkAllPasses.h"
+#include "llvm/Linker/Linker.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorOr.h"
@@ -38,8 +41,10 @@
 #include "clspv/Option.h"
 #include "clspv/Passes.h"
 #include "clspv/Sampler.h"
+#include "clspv/clspv_builtin_library.h"
 #include "clspv/opencl_builtins_header.h"
 
+#include "Builtins.h"
 #include "FrontendPlugin.h"
 #include "Passes.h"
 
@@ -164,6 +169,21 @@
         "Emit LLVM IR to the given file after parsing and stop compilation."),
     llvm::cl::value_desc("filename"));
 
+namespace {
+// Exposes a caller-supplied byte range as an llvm::MemoryBuffer.
+struct OpenCLBuiltinMemoryBuffer final : public llvm::MemoryBuffer {
+  OpenCLBuiltinMemoryBuffer(const void *data, uint64_t data_length) {
+    const char *const begin = static_cast<const char *>(data);
+    init(begin, begin + data_length, /*RequiresNullTerminator=*/true);
+  }
+  // Nothing to release: the wrapped bytes are never freed by this class.
+  ~OpenCLBuiltinMemoryBuffer() override = default;
+  llvm::MemoryBuffer::BufferKind getBufferKind() const override {
+    return llvm::MemoryBuffer::MemoryBuffer_Malloc;
+  }
+};
+} // namespace
+
 // Populates |SamplerMapEntries| with data from the input sampler map. Returns 0
 // if successful.
 int ParseSamplerMap(const std::string &sampler_map,
@@ -428,17 +448,20 @@
   instance.getCodeGenOpts().LessPreciseFPMAD =
       cl_mad_enable || cl_unsafe_math_optimizations;
   // cl_no_signed_zeros ignored for now!
-  instance.getLangOpts().UnsafeFPMath =
-      cl_unsafe_math_optimizations || cl_fast_relaxed_math;
-  instance.getLangOpts().FiniteMathOnly =
-      cl_finite_math_only || cl_fast_relaxed_math;
-  instance.getLangOpts().FastRelaxedMath = cl_fast_relaxed_math;
+  instance.getLangOpts().UnsafeFPMath = cl_unsafe_math_optimizations ||
+                                        cl_fast_relaxed_math ||
+                                        clspv::Option::NativeMath();
+  instance.getLangOpts().FiniteMathOnly = cl_finite_math_only ||
+                                          cl_fast_relaxed_math ||
+                                          clspv::Option::NativeMath();
+  instance.getLangOpts().FastRelaxedMath =
+      cl_fast_relaxed_math || clspv::Option::NativeMath();
 
   // Preprocessor options
   if (!clspv::Option::ImageSupport()) {
     instance.getPreprocessorOpts().addMacroUndef("__IMAGE_SUPPORT__");
   }
-  if (cl_fast_relaxed_math) {
+  if (cl_fast_relaxed_math || clspv::Option::NativeMath()) {
     instance.getPreprocessorOpts().addMacroDef("__FAST_RELAXED_MATH__");
   }
 
@@ -494,19 +517,6 @@
   instance.getPreprocessorOpts().addRemappedFile(overiddenInputFilename,
                                                  memory_buffer.release());
 
-  struct OpenCLBuiltinMemoryBuffer final : public llvm::MemoryBuffer {
-    OpenCLBuiltinMemoryBuffer(const void *data, uint64_t data_length) {
-      const char *dataCasted = reinterpret_cast<const char *>(data);
-      init(dataCasted, dataCasted + data_length, true);
-    }
-
-    virtual llvm::MemoryBuffer::BufferKind getBufferKind() const override {
-      return llvm::MemoryBuffer::MemoryBuffer_Malloc;
-    }
-
-    virtual ~OpenCLBuiltinMemoryBuffer() override {}
-  };
-
   std::unique_ptr<llvm::MemoryBuffer> openCLBuiltinMemoryBuffer(
       new OpenCLBuiltinMemoryBuffer(opencl_builtins_header_data,
                                     opencl_builtins_header_size - 1));
@@ -599,6 +609,7 @@
     break;
   }
 
+  pm->add(clspv::createNativeMathPass());
   pm->add(clspv::createZeroInitializeAllocasPass());
   pm->add(clspv::createAddFunctionAttributesPass());
   pm->add(clspv::createAutoPodArgsPass());
@@ -844,6 +855,28 @@
   return 0;
 }
 
+// Links the embedded builtin library into |module|; returns false on failure.
+bool LinkBuiltinLibrary(llvm::Module *module) {
+  std::unique_ptr<llvm::MemoryBuffer> buffer(new OpenCLBuiltinMemoryBuffer(
+      clspv_builtin_library_data, clspv_builtin_library_size - 1));
+  llvm::SMDiagnostic Err;
+  auto library = llvm::parseIR(*buffer, Err, module->getContext());
+  if (!library) {
+    llvm::errs() << "Failed to parse builtins library\n";
+    return false;
+  }
+  // TODO: when clang generates builtins using the generic address space,
+  // different builtins are used for pointer-based builtins. Need to do some
+  // work to ensure they are kept around.
+  // Affects: modf, remquo, lgamma_r, frexp
+  // Linker::linkInModule returns true on error; report it instead of ignoring.
+  if (llvm::Linker(*module).linkInModule(std::move(library), 0)) {
+    llvm::errs() << "Failed to link builtins library\n";
+    return false;
+  }
+  return true;
+}
+
 } // namespace
 
 namespace clspv {
@@ -935,6 +968,10 @@
     return GenerateIRFile(&pm, *module, IROutputFile);
   }
 
+  if (!LinkBuiltinLibrary(module.get())) {
+    return -1;
+  }
+
   // Otherwise, populate the pass manager and run the regular passes.
   if (auto error = PopulatePassManager(&pm, &binaryStream, &SamplerMapEntries))
     return error;
@@ -1032,6 +1069,10 @@
 
   std::unique_ptr<llvm::Module> module(action.takeModule());
 
+  if (!LinkBuiltinLibrary(module.get())) {
+    return -1;
+  }
+
   // Optimize.
   // Create a memory buffer for temporarily writing the result.
   SmallVector<char, 10000> binary;