Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 0 additions & 40 deletions cmake/gen_cmake_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,46 +52,6 @@
if "CUDA" in enabled_backends:
cmake_config_str += f"set(USE_THRUST ON)\n"

# FlashInfer related
use_flashInfer = False # pylint: disable=invalid-name
if "CUDA" in enabled_backends:
while True:
user_input = input(
"Use FlashInfer? (need CUDA w/ compute capability 80;86;89;90) (y/n): "
)
if user_input in ["yes", "Y", "y"]:
cmake_config_str += "set(USE_FLASHINFER ON)\n"
cmake_config_str += "set(FLASHINFER_ENABLE_FP8 OFF)\n"
cmake_config_str += "set(FLASHINFER_ENABLE_BF16 OFF)\n"
cmake_config_str += "set(FLASHINFER_GEN_GROUP_SIZES 1 4 6 8)\n"
cmake_config_str += "set(FLASHINFER_GEN_PAGE_SIZES 16)\n"
cmake_config_str += "set(FLASHINFER_GEN_HEAD_DIMS 128)\n"
cmake_config_str += "set(FLASHINFER_GEN_KV_LAYOUTS 0 1)\n"
cmake_config_str += "set(FLASHINFER_GEN_POS_ENCODING_MODES 0 1)\n"
cmake_config_str += 'set(FLASHINFER_GEN_ALLOW_FP16_QK_REDUCTIONS "false")\n'
cmake_config_str += 'set(FLASHINFER_GEN_CASUALS "false" "true")\n'
use_flashInfer = True # pylint: disable=invalid-name
break
elif user_input in ["no", "N", "n"]:
cmake_config_str += "set(USE_FLASHINFER OFF)\n"
break
else:
print(f"Invalid input: {use_flashInfer}. Please input again.")
else:
cmake_config_str += "set(USE_FLASHINFER OFF)\n"

if use_flashInfer:
while True:
user_input = input("Enter your CUDA compute capability: ")
if user_input in ["80", "86", "89", "90", "100", "120"]:
cmake_config_str += f"set(FLASHINFER_CUDA_ARCHITECTURES {user_input})\n"
cmake_config_str += f"set(CMAKE_CUDA_ARCHITECTURES {user_input})\n"
break
else:
print(
f"Invalid input: {user_input}. FlashInfer requires 80, 86, 89, 90, 100 or 120"
)

print("\nWriting the following configuration to config.cmake...")
print(cmake_config_str)

Expand Down
7 changes: 0 additions & 7 deletions docs/install/mlc_llm.rst
Original file line number Diff line number Diff line change
Expand Up @@ -210,13 +210,6 @@ This step is useful when you want to make modification or obtain a specific vers
# build mlc_llm libraries
cmake .. && cmake --build . --parallel $(nproc) && cd ..

.. note::
If you are using CUDA and your compute capability is above 80, then it is required to build with
``set(USE_FLASHINFER ON)``. Otherwise, you may run into ``Cannot find Function`` issue during
runtime.

To check your CUDA compute capability, you can use ``nvidia-smi --query-gpu=compute_cap --format=csv``.

**Step 3. Install via Python.** We recommend that you install ``mlc_llm`` as a Python package, giving you
access to ``mlc_llm.compile``, ``mlc_llm.MLCEngine``, and the CLI.
There are two ways to do so:
Expand Down
10 changes: 0 additions & 10 deletions docs/install/tvm.rst
Original file line number Diff line number Diff line change
Expand Up @@ -203,9 +203,6 @@ While it is generally recommended to always use the prebuilt TVM Unity, if you r
echo "set(USE_METAL OFF)" >> config.cmake
echo "set(USE_VULKAN OFF)" >> config.cmake
echo "set(USE_OPENCL OFF)" >> config.cmake
# FlashInfer related, requires CUDA w/ compute capability 80;86;89;90
echo "set(USE_FLASHINFER OFF)" >> config.cmake
echo "set(FLASHINFER_CUDA_ARCHITECTURES YOUR_CUDA_COMPUTE_CAPABILITY_HERE)" >> config.cmake
echo "set(CMAKE_CUDA_ARCHITECTURES YOUR_CUDA_COMPUTE_CAPABILITY_HERE)" >> config.cmake

.. note::
Expand All @@ -217,13 +214,6 @@ While it is generally recommended to always use the prebuilt TVM Unity, if you r
- ``RelWithDebInfo`` sets ``-O2 -g -DNDEBUG`` (recommended)
- ``Release`` sets ``-O3 -DNDEBUG``

.. note::
If you are using CUDA and your compute capability is above 80, then it is required to build with
``set(USE_FLASHINFER ON)``. Otherwise, you may run into ``Cannot find Function`` issue during
runtime.

To check your CUDA compute capability, you can use ``nvidia-smi --query-gpu=compute_cap --format=csv``.

Once ``config.cmake`` is edited accordingly, kick off build with the commands below:

.. code-block:: bash
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ dependencies = [
"apache-tvm-ffi",
"datasets",
"fastapi",
"flashinfer-python==0.2.14; sys_platform == 'linux'",
"flashinfer-python==0.3.1; sys_platform == 'linux'",
"ml_dtypes>=0.5.1",
"openai",
"pandas",
Expand Down
2 changes: 1 addition & 1 deletion python/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
apache-tvm-ffi
datasets
fastapi
flashinfer-python==0.2.14
flashinfer-python==0.3.1
ml_dtypes>=0.5.1
openai
pandas
Expand Down