tensorflow
diff --git a/‎third_party/xla/xla/backends/cpu/codegen/ir_compiler.cc
Lines changed: 1 addition & 1 deletion b/‎third_party/xla/xla/backends/cpu/codegen/ir_compiler.cc
Lines changed: 1 addition & 1 deletion
diff --git a/‎third_party/xla/xla/codegen/BUILD
Lines changed: 1 addition & 0 deletions b/‎third_party/xla/xla/codegen/BUILD
Lines changed: 1 addition & 0 deletions
diff --git a/‎third_party/xla/xla/codegen/emitters/transforms/BUILD
Lines changed: 1 addition & 0 deletions b/‎third_party/xla/xla/codegen/emitters/transforms/BUILD
Lines changed: 1 addition & 0 deletions
diff --git a/‎third_party/xla/xla/codegen/emitters/transforms/lower_xla_math_lib.cc
Lines changed: 29 additions & 1 deletion b/‎third_party/xla/xla/codegen/emitters/transforms/lower_xla_math_lib.cc
Lines changed: 29 additions & 1 deletion
diff --git a/‎third_party/xla/xla/codegen/emitters/transforms/tests/lower_xla_math_lib.mlir
Lines changed: 29 additions & 0 deletions b/‎third_party/xla/xla/codegen/emitters/transforms/tests/lower_xla_math_lib.mlir
Lines changed: 29 additions & 0 deletions
diff --git a/‎third_party/xla/xla/codegen/math/BUILD
Lines changed: 9 additions & 2 deletions b/‎third_party/xla/xla/codegen/math/BUILD
Lines changed: 9 additions & 2 deletions
diff --git a/‎third_party/xla/xla/codegen/math/intrinsic.h
Lines changed: 1 addition & 0 deletions b/‎third_party/xla/xla/codegen/math/intrinsic.h
Lines changed: 1 addition & 0 deletions
diff --git a/‎third_party/xla/xla/codegen/math/rsqrt.cc
Lines changed: 76 additions & 41 deletions b/‎third_party/xla/xla/codegen/math/rsqrt.cc
Lines changed: 76 additions & 41 deletions
@@ -328,7 +328,7 @@ llvm::Error IrCompiler::RunIrPasses(llvm::Module& module,
       std::make_unique<llvm::TargetLibraryInfoImpl>(target_triple);
   target_library_info_impl->addVectorizableFunctions(
       PolynomialApproximationsVectorization());
-  codegen::MathFunctionLib math_lib;
+  codegen::MathFunctionLib math_lib(target_machine);
   target_library_info_impl->addVectorizableFunctions(math_lib.Vectorizations());
 
   fam.registerPass(
 
@@ -192,6 +192,7 @@ cc_library(
         "//xla/codegen/math:intrinsic",
         "//xla/codegen/math:ldexp",
         "//xla/codegen/math:log1p",
+        "//xla/codegen/math:rsqrt",
         "//xla/codegen/math:string_interner",
         "//xla/codegen/math:vec_name_mangler",
         "//xla/service/llvm_ir:llvm_util",
 
@@ -79,6 +79,7 @@ cc_library(
         "//xla/codegen/math:fptrunc",
         "//xla/codegen/math:intrinsic",
         "//xla/codegen/math:log1p",
+        "//xla/codegen/math:rsqrt",
         "//xla/hlo/analysis:indexing_analysis",
         "//xla/mlir_hlo",
         "//xla/mlir_hlo:map_mhlo_to_scalar_op",
 
@@ -37,6 +37,7 @@ limitations under the License.
 #include "xla/codegen/math/fptrunc.h"
 #include "xla/codegen/math/intrinsic.h"
 #include "xla/codegen/math/log1p.h"
+#include "xla/codegen/math/rsqrt.h"
 
 namespace xla {
 namespace emitters {
@@ -216,6 +217,32 @@ class LowerTruncF32BF16FPattern
   mlir::ModuleOp& module_op_;
 };
 
+class RsqrtPattern : public mlir::OpRewritePattern<mlir::math::RsqrtOp> {
+ public:
+  RsqrtPattern(mlir::MLIRContext* context, mlir::ModuleOp& module_op)
+      : OpRewritePattern(context), module_op_(module_op) {}
+
+  mlir::LogicalResult matchAndRewrite(
+      mlir::math::RsqrtOp op, mlir::PatternRewriter& rewriter) const override {
+    // Don't change if not f32 or f64:
+    auto src_type = op.getOperand().getType();
+    if (!mlir::isa<mlir::Float32Type>(src_type) &&
+        !mlir::isa<mlir::Float64Type>(src_type)) {
+      return rewriter.notifyMatchFailure(op, "Not f32 or f64");
+    }
+
+    mlir::ImplicitLocOpBuilder b(op.getLoc(), rewriter);
+    auto rsqrt_decl = codegen::intrinsics::Rsqrt::GetOrInsertDeclaration(
+        rewriter, module_op_, Type::TypeFromIrType(op.getOperand().getType()));
+    auto call_op = b.create<mlir::func::CallOp>(rsqrt_decl, op.getOperand());
+    rewriter.replaceOp(op, call_op->getResults());
+    return mlir::success();
+  }
+
+ private:
+  mlir::ModuleOp& module_op_;
+};
+
 class LowerXlaMathLibPass
     : public impl::LowerXlaMathLibPassBase<LowerXlaMathLibPass> {
  public:
@@ -226,7 +253,8 @@ class LowerXlaMathLibPass
     mlir::ModuleOp module_op = getOperation();
     mlir::RewritePatternSet patterns(&getContext());
     patterns.add<LowerExpOpPattern, LowerLog1pPattern, LowerErfPattern,
-                 LowerTruncF32BF16FPattern>(&getContext(), module_op);
+                 LowerTruncF32BF16FPattern, RsqrtPattern>(&getContext(),
+                                                          module_op);
 
     if (mlir::failed(
             mlir::applyPatternsGreedily(module_op, std::move(patterns)))) {
 
@@ -78,3 +78,32 @@ module {
 // CHECK-NOT: math.erf
 // CHECK: %[[ERF_CALL:.*]] = call @erf
 // CHECK: return %[[ERF_CALL]]
+
+
+// -----
+
+module {
+  func.func @rsqrt(%arg0: f32) -> f32 {
+    %ret = math.rsqrt %arg0 : f32
+    return %ret : f32
+  }
+}
+
+// CHECK-LABEL: @local_xla.rsqrt.f32
+// CHECK-NOT: math.rsqrt
+// CHECK: %[[RSQRT_CALL:.*]] = call @local_xla.rsqrt.f32
+// CHECK: return %[[RSQRT_CALL]]
+
+// -----
+
+module {
+  func.func @rsqrt(%arg0: f64) -> f64 {
+    %ret = math.rsqrt %arg0 : f64
+    return %ret : f64
+  }
+}
+
+// CHECK-LABEL: @local_xla.rsqrt.f64
+// CHECK-NOT: math.rsqrt
+// CHECK: %[[RSQRT_CALL:.*]] = call @local_xla.rsqrt.f64
+// CHECK: return %[[RSQRT_CALL]]
@@ -35,6 +35,7 @@ cc_library(
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
         "@llvm-project//llvm:Support",
+        "@llvm-project//llvm:Target",
         "@llvm-project//llvm:ir_headers",
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
@@ -125,14 +126,18 @@ cc_library(
         "//xla/service/cpu:orc_jit_memory_mapper",
         "//xla/service/llvm_ir:llvm_util",
         "//xla/tsl/util:safe_reinterpret_cast",
+        "@com_google_absl//absl/base",
         "@com_google_absl//absl/base:dynamic_annotations",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/log:check",
         "@llvm-project//llvm:ExecutionEngine",
         "@llvm-project//llvm:JITLink",
+        "@llvm-project//llvm:MC",
         "@llvm-project//llvm:OrcJIT",  # buildcleaner: keep
         "@llvm-project//llvm:Support",
+        "@llvm-project//llvm:Target",
+        "@llvm-project//llvm:TargetParser",
         "@llvm-project//llvm:ir_headers",
     ] + if_llvm_aarch64_available([
         "@llvm-project//llvm:AArch64AsmParser",  # fixdeps: keep
@@ -306,14 +311,13 @@ cc_library(
     deps = [
         ":intrinsic",
         "//xla:xla_data_proto_cc",
-        "//xla/service/llvm_ir:llvm_util",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/log:check",
-        "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
         "@llvm-project//llvm:Core",  # buildcleaner: keep
         "@llvm-project//llvm:Support",
+        "@llvm-project//llvm:Target",
         "@llvm-project//llvm:ir_headers",
     ],
 )
@@ -358,7 +362,9 @@ xla_cc_test(
         ":test_matchers",
         "//xla:shape_util",
         "//xla:xla_data_proto_cc",
+        "@com_google_absl//absl/log",
         "@com_google_googletest//:gtest_main",
+        "@eigen_archive//:eigen3",
         "@llvm-project//llvm:JITLink",
         "@llvm-project//llvm:Support",
         "@llvm-project//llvm:TargetParser",
@@ -379,6 +385,7 @@ xla_cc_test(
         "//xla:xla_data_proto_cc",
         "//xla/tsl/platform:test_benchmark",
         "//xla/tsl/platform:test_main",
+        "@llvm-project//llvm:Target",
         "@llvm-project//llvm:ir_headers",
     ],
 )
@@ -29,6 +29,7 @@ limitations under the License.
 #include "absl/strings/str_join.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Module.h"
+#include "llvm/Target/TargetMachine.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinOps.h"
 
@@ -16,12 +16,11 @@ limitations under the License.
 #include "xla/codegen/math/rsqrt.h"
 
 #include <cstddef>
+#include <vector>
 
 #include "absl/log/check.h"
 #include "absl/log/log.h"
-#include "absl/status/status.h"
 #include "absl/status/statusor.h"
-#include "absl/strings/str_cat.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/IR/Argument.h"
 #include "llvm/IR/BasicBlock.h"
@@ -38,8 +37,8 @@ limitations under the License.
 #include "llvm/IR/Value.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/TypeSize.h"
+#include "llvm/Target/TargetMachine.h"
 #include "xla/codegen/math/intrinsic.h"
-#include "xla/service/llvm_ir/llvm_util.h"
 #include "xla/xla_data.pb.h"
 
 namespace xla::codegen::intrinsics {
@@ -68,19 +67,19 @@ static llvm::Value* NewtonRaphsonRsqrtIteration(llvm::IRBuilder<>& builder,
 
 struct RsqrtIntrinsic {
   llvm::Intrinsic::ID id;
-  int mask_bits;  // Some avx512 calls require masks.
-  bool needs_insert_element;
+  int mask_bits;                  // Some avx512 calls require masks.
+  int needs_insert_element_size;  // Some avx512 calls require padding.
 
   static RsqrtIntrinsic ForF32(size_t num_elements) {
     switch (num_elements) {
       case 1:
-        return {llvm::Intrinsic::x86_sse_rsqrt_ss, 0, true};
+        return {llvm::Intrinsic::x86_sse_rsqrt_ss, 0, 4};
       case 4:
-        return {llvm::Intrinsic::x86_sse_rsqrt_ps, 0, false};
+        return {llvm::Intrinsic::x86_sse_rsqrt_ps, 0, 0};
       case 8:
-        return {llvm::Intrinsic::x86_avx_rsqrt_ps_256, 0, false};
+        return {llvm::Intrinsic::x86_avx_rsqrt_ps_256, 0, 0};
       case 16:
-        return {llvm::Intrinsic::x86_avx512_rsqrt14_ps_512, 16, false};
+        return {llvm::Intrinsic::x86_avx512_rsqrt14_ps_512, 16, 0};
       default:
         LOG(FATAL) << "Unsupported vector width for rsqrt: " << num_elements;
     }
@@ -89,12 +88,18 @@ struct RsqrtIntrinsic {
   static RsqrtIntrinsic ForF64(size_t num_elements) {
     // We assume AVX512 is available for F64.
     switch (num_elements) {
+      case 1:
+        // Assuming AVX512 is available.
+        // We don't use x86_avx512_rsqrt14_sd because it also requires padding
+        // into <2 x double> vectors and it takes an additional source vector
+        // for the upper bits of the result.
+        return {llvm::Intrinsic::x86_avx512_rsqrt14_pd_128, 8, 2};
       case 2:
-        return {llvm::Intrinsic::x86_avx512_rsqrt14_pd_128, 8, false};
+        return {llvm::Intrinsic::x86_avx512_rsqrt14_pd_128, 8, 0};
       case 4:
-        return {llvm::Intrinsic::x86_avx512_rsqrt14_pd_256, 8, false};
+        return {llvm::Intrinsic::x86_avx512_rsqrt14_pd_256, 8, 0};
       case 8:
-        return {llvm::Intrinsic::x86_avx512_rsqrt14_pd_512, 8, false};
+        return {llvm::Intrinsic::x86_avx512_rsqrt14_pd_512, 8, 0};
       default:
         LOG(FATAL) << "Unsupported vector width for rsqrt: " << num_elements;
     }
@@ -106,36 +111,39 @@ struct RsqrtIntrinsic {
         llvm::Intrinsic::getOrInsertDeclaration(module, id);
 
     llvm::Value* y_approx;
-    if (needs_insert_element) {
+    std::vector<llvm::Value*> args = {x};
+    if (needs_insert_element_size > 0) {
+      // Pad into a vector of size `needs_insert_element_size`.
       llvm::Type* sse_vec_type = llvm::VectorType::get(
-          x->getType()->getScalarType(), llvm::ElementCount::getFixed(4));
+          x->getType()->getScalarType(),
+          llvm::ElementCount::getFixed(needs_insert_element_size));
       llvm::Value* vec_x = llvm::UndefValue::get(sse_vec_type);
       vec_x = builder.CreateInsertElement(vec_x, x, builder.getInt32(0));
-      llvm::Value* approx_vec =
-          builder.CreateCall(rsqrt_intrinsic, {vec_x}, "y_approx.vec");
-      y_approx = builder.CreateExtractElement(approx_vec, builder.getInt32(0),
-                                              "y_approx");
-    } else if (mask_bits > 0) {
-      llvm::Value* dest = llvm::ConstantFP::get(x->getType(), 0.0);
+      args[0] = vec_x;
+    }
+    if (mask_bits > 0) {
+      llvm::Value* src = llvm::ConstantFP::get(args[0]->getType(), 0.0);
       llvm::Value* mask = llvm::ConstantInt::get(
           builder.getContext(), llvm::APInt(mask_bits, -1, true));
-      y_approx =
-          builder.CreateCall(rsqrt_intrinsic, {x, dest, mask}, "y_approx");
-
-    } else {
-      y_approx = builder.CreateCall(rsqrt_intrinsic, {x}, "y_approx");
+      args.push_back(src);
+      args.push_back(mask);
+    }
+    y_approx = builder.CreateCall(rsqrt_intrinsic, args, "y_approx");
+    if (needs_insert_element_size > 0) {
+      // Extract the result from the padded vector.
+      y_approx = builder.CreateExtractElement(y_approx, builder.getInt32(0),
+                                              "y_approx");
     }
     return y_approx;
   }
 };
 
-absl::StatusOr<llvm::Function*> Rsqrt::CreateDefinition(llvm::Module* module,
-                                                        Type type) {
+absl::StatusOr<llvm::Function*> Rsqrt::CreateDefinition(
+    llvm::Module* module, llvm::TargetMachine* target_machine, Type type) {
+  CHECK(type.element_type() == F64 || type.element_type() == F32)
+      << type.name();
   llvm::Type* input_type = Type::TypeToIrType(type, module->getContext());
   CHECK(input_type != nullptr);
-  CHECK(input_type->isFloatingPointTy() || input_type->isVectorTy());
-  CHECK(input_type->getScalarType()->isFloatTy() ||
-        input_type->getScalarType()->isDoubleTy());
 
   llvm::LLVMContext& context = module->getContext();
   llvm::IRBuilder<> builder(context);
@@ -151,30 +159,44 @@ absl::StatusOr<llvm::Function*> Rsqrt::CreateDefinition(llvm::Module* module,
       module->getOrInsertFunction(Rsqrt::Name(type), function_type)
           .getCallee());
 
-  llvm::Argument* input_x_arg = func->getArg(0);
-  input_x_arg->setName("x");
+  llvm::Argument* x = func->getArg(0);
+  x->setName("x");
   llvm::BasicBlock* entry_bb = llvm::BasicBlock::Create(context, "entry", func);
-  llvm::Value* x = input_x_arg;
   builder.SetInsertPoint(entry_bb);
+
+  if ((type.element_type() == F64 &&
+       !target_machine->getTargetFeatureString().contains("+avx512f")) ||
+      !target_machine->getTargetFeatureString().contains("+avx")) {
+    LOG_EVERY_N(INFO, 1000)
+        << "avx not available, falling back to 1 / sqrt(x) for " << type.name();
+    // We can't use the same approximation algorithm for F64 without AVX512 or
+    // anything non-x86 and without avx.
+    llvm::Value* one = llvm::ConstantFP::get(input_type, 1.0);
+    llvm::Value* sqrt_x =
+        builder.CreateUnaryIntrinsic(llvm::Intrinsic::sqrt, x);
+    llvm::Value* inv_sqrt_x = builder.CreateFDiv(one, sqrt_x, "inv_sqrt_x");
+    builder.CreateRet(inv_sqrt_x);
+    return func;
+  }
+
   RsqrtIntrinsic rsqrt_intrinsic = input_type->getScalarType()->isFloatTy()
                                        ? RsqrtIntrinsic::ForF32(num_elements)
                                        : RsqrtIntrinsic::ForF64(num_elements);
   llvm::Value* y_approx = rsqrt_intrinsic.CreateCall(builder, x);
 
   llvm::Value* refined_result =
-      NewtonRaphsonRsqrtIteration(builder, input_x_arg, y_approx, input_type);
+      NewtonRaphsonRsqrtIteration(builder, x, y_approx, input_type);
   if (input_type->getScalarType()->isDoubleTy()) {
     // Do an additional refinement step for F64.
-    refined_result = NewtonRaphsonRsqrtIteration(builder, input_x_arg,
-                                                 refined_result, input_type);
+    refined_result =
+        NewtonRaphsonRsqrtIteration(builder, x, refined_result, input_type);
   }
 
-  // Create a mask for special cases (denormals and infinities) to fall back
-  // to the intrinsic's result, matching Eigen's behavior.
   const llvm::fltSemantics& semantics =
       input_type->getScalarType()->getFltSemantics();
-  llvm::Constant* flt_min = llvm::ConstantFP::get(
-      input_type, llvm::APFloat::getSmallestNormalized(semantics));
+  llvm::APFloat flt_min_val = llvm::APFloat::getSmallestNormalized(semantics);
+  llvm::Constant* flt_min = llvm::ConstantFP::get(input_type, flt_min_val);
+
   llvm::Constant* inf =
       llvm::ConstantFP::get(input_type, llvm::APFloat::getInf(semantics));
 
@@ -183,11 +205,24 @@ absl::StatusOr<llvm::Function*> Rsqrt::CreateDefinition(llvm::Module* module,
   llvm::Value* use_hw_approx_mask =
       builder.CreateOr(lt_min_mask, inf_mask, "use_hw_approx_mask");
 
+  if (type.element_type() == F64) {
+    // If the input is very large, the result should be 0.
+    // This is effectively calculating 1.0 / FLT_MIN, which is the threshold
+    // where rsqrt(x*x) would be close to the smallest representable number.
+    llvm::APFloat rsqrt_is_zero_val = llvm::APFloat(semantics, 1);
+    rsqrt_is_zero_val.divide(flt_min_val, llvm::APFloat::rmTowardZero);
+    llvm::Constant* rsqrt_is_zero =
+        llvm::ConstantFP::get(input_type, rsqrt_is_zero_val);
+    llvm::Value* rsqrt_is_zero_mask =
+        builder.CreateFCmpOGE(x, rsqrt_is_zero, "rsqrt_is_zero_mask");
+    use_hw_approx_mask =
+        builder.CreateOr(use_hw_approx_mask, rsqrt_is_zero_mask);
+  }
+
   // If input is normal and finite, use the refined result. Otherwise, use the
   // raw hardware approximation.
   llvm::Value* result = builder.CreateSelect(use_hw_approx_mask, y_approx,
                                              refined_result, "result");
-
   builder.CreateRet(result);
   return func;
 }