diff --git a/cmake/gen_cmake_config.py b/cmake/gen_cmake_config.py
index b44f47f247..1d8b6f0944 100644
--- a/cmake/gen_cmake_config.py
+++ b/cmake/gen_cmake_config.py
@@ -52,46 +52,6 @@ if "CUDA" in enabled_backends:
         cmake_config_str += f"set(USE_THRUST ON)\n"
 
-    # FlashInfer related
-    use_flashInfer = False  # pylint: disable=invalid-name
-    if "CUDA" in enabled_backends:
-        while True:
-            user_input = input(
-                "Use FlashInfer? (need CUDA w/ compute capability 80;86;89;90) (y/n): "
-            )
-            if user_input in ["yes", "Y", "y"]:
-                cmake_config_str += "set(USE_FLASHINFER ON)\n"
-                cmake_config_str += "set(FLASHINFER_ENABLE_FP8 OFF)\n"
-                cmake_config_str += "set(FLASHINFER_ENABLE_BF16 OFF)\n"
-                cmake_config_str += "set(FLASHINFER_GEN_GROUP_SIZES 1 4 6 8)\n"
-                cmake_config_str += "set(FLASHINFER_GEN_PAGE_SIZES 16)\n"
-                cmake_config_str += "set(FLASHINFER_GEN_HEAD_DIMS 128)\n"
-                cmake_config_str += "set(FLASHINFER_GEN_KV_LAYOUTS 0 1)\n"
-                cmake_config_str += "set(FLASHINFER_GEN_POS_ENCODING_MODES 0 1)\n"
-                cmake_config_str += 'set(FLASHINFER_GEN_ALLOW_FP16_QK_REDUCTIONS "false")\n'
-                cmake_config_str += 'set(FLASHINFER_GEN_CASUALS "false" "true")\n'
-                use_flashInfer = True  # pylint: disable=invalid-name
-                break
-            elif user_input in ["no", "N", "n"]:
-                cmake_config_str += "set(USE_FLASHINFER OFF)\n"
-                break
-            else:
-                print(f"Invalid input: {use_flashInfer}. Please input again.")
-    else:
-        cmake_config_str += "set(USE_FLASHINFER OFF)\n"
-
-    if use_flashInfer:
-        while True:
-            user_input = input("Enter your CUDA compute capability: ")
-            if user_input in ["80", "86", "89", "90", "100", "120"]:
-                cmake_config_str += f"set(FLASHINFER_CUDA_ARCHITECTURES {user_input})\n"
-                cmake_config_str += f"set(CMAKE_CUDA_ARCHITECTURES {user_input})\n"
-                break
-            else:
-                print(
-                    f"Invalid input: {user_input}. FlashInfer requires 80, 86, 89, 90, 100 or 120"
-                )
-
     print("\nWriting the following configuration to config.cmake...")
     print(cmake_config_str)
diff --git a/docs/install/mlc_llm.rst b/docs/install/mlc_llm.rst
index 2df981cc25..27cea0211e 100644
--- a/docs/install/mlc_llm.rst
+++ b/docs/install/mlc_llm.rst
@@ -210,13 +210,6 @@ This step is useful when you want to make modification or obtain a specific vers
         # build mlc_llm libraries
         cmake .. && cmake --build . --parallel $(nproc) && cd ..
 
-.. note::
-    If you are using CUDA and your compute capability is above 80, then it is require to build with
-    ``set(USE_FLASHINFER ON)``. Otherwise, you may run into ``Cannot find Function`` issue during
-    runtime.
-
-    To check your CUDA compute capability, you can use ``nvidia-smi --query-gpu=compute_cap --format=csv``.
-
 **Step 3. Install via Python.** We recommend that you install ``mlc_llm`` as a Python package, giving you access to
 ``mlc_llm.compile``, ``mlc_llm.MLCEngine``, and the CLI.
 There are two ways to do so:
diff --git a/docs/install/tvm.rst b/docs/install/tvm.rst
index d2729237fe..c6ffa9a369 100644
--- a/docs/install/tvm.rst
+++ b/docs/install/tvm.rst
@@ -203,9 +203,6 @@ While it is generally recommended to always use the prebuilt TVM Unity, if you r
       echo "set(USE_METAL OFF)" >> config.cmake
       echo "set(USE_VULKAN OFF)" >> config.cmake
       echo "set(USE_OPENCL OFF)" >> config.cmake
-      # FlashInfer related, requires CUDA w/ compute capability 80;86;89;90
-      echo "set(USE_FLASHINFER OFF)" >> config.cmake
-      echo "set(FLASHINFER_CUDA_ARCHITECTURES YOUR_CUDA_COMPUTE_CAPABILITY_HERE)" >> config.cmake
       echo "set(CMAKE_CUDA_ARCHITECTURES YOUR_CUDA_COMPUTE_CAPABILITY_HERE)" >> config.cmake
 
   .. note::
@@ -217,13 +214,6 @@ While it is generally recommended to always use the prebuilt TVM Unity, if you r
     - ``RelWithDebInfo`` sets ``-O2 -g -DNDEBUG`` (recommended)
     - ``Release`` sets ``-O3 -DNDEBUG``
 
-.. note::
-    If you are using CUDA and your compute capability is above 80, then it is require to build with
-    ``set(USE_FLASHINFER ON)``. Otherwise, you may run into ``Cannot find Function`` issue during
-    runtime.
-
-    To check your CUDA compute capability, you can use ``nvidia-smi --query-gpu=compute_cap --format=csv``.
-
 Once ``config.cmake`` is edited accordingly, kick off build with the commands below:
 
 .. code-block:: bash
diff --git a/pyproject.toml b/pyproject.toml
index ad1edcb1db..38cd74f6dc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -38,7 +38,7 @@ dependencies = [
     "apache-tvm-ffi",
     "datasets",
     "fastapi",
-    "flashinfer-python==0.2.14; sys_platform == 'linux'",
+    "flashinfer-python==0.3.1; sys_platform == 'linux'",
     "ml_dtypes>=0.5.1",
     "openai",
     "pandas",
diff --git a/python/requirements.txt b/python/requirements.txt
index 33b8a37e88..6d092b1bc5 100644
--- a/python/requirements.txt
+++ b/python/requirements.txt
@@ -1,7 +1,7 @@
 apache-tvm-ffi
 datasets
 fastapi
-flashinfer-python==0.2.14
+flashinfer-python==0.3.1
 ml_dtypes>=0.5.1
 openai
 pandas