Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 0 additions & 40 deletions cmake/gen_cmake_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,46 +52,6 @@
if "CUDA" in enabled_backends:
cmake_config_str += f"set(USE_THRUST ON)\n"

# FlashInfer related
use_flashInfer = False # pylint: disable=invalid-name
if "CUDA" in enabled_backends:
while True:
user_input = input(
"Use FlashInfer? (need CUDA w/ compute capability 80;86;89;90) (y/n): "
)
if user_input in ["yes", "Y", "y"]:
cmake_config_str += "set(USE_FLASHINFER ON)\n"
cmake_config_str += "set(FLASHINFER_ENABLE_FP8 OFF)\n"
cmake_config_str += "set(FLASHINFER_ENABLE_BF16 OFF)\n"
cmake_config_str += "set(FLASHINFER_GEN_GROUP_SIZES 1 4 6 8)\n"
cmake_config_str += "set(FLASHINFER_GEN_PAGE_SIZES 16)\n"
cmake_config_str += "set(FLASHINFER_GEN_HEAD_DIMS 128)\n"
cmake_config_str += "set(FLASHINFER_GEN_KV_LAYOUTS 0 1)\n"
cmake_config_str += "set(FLASHINFER_GEN_POS_ENCODING_MODES 0 1)\n"
cmake_config_str += 'set(FLASHINFER_GEN_ALLOW_FP16_QK_REDUCTIONS "false")\n'
cmake_config_str += 'set(FLASHINFER_GEN_CASUALS "false" "true")\n'
use_flashInfer = True # pylint: disable=invalid-name
break
elif user_input in ["no", "N", "n"]:
cmake_config_str += "set(USE_FLASHINFER OFF)\n"
break
else:
print(f"Invalid input: {use_flashInfer}. Please input again.")
else:
cmake_config_str += "set(USE_FLASHINFER OFF)\n"

if use_flashInfer:
while True:
user_input = input("Enter your CUDA compute capability: ")
if user_input in ["80", "86", "89", "90", "100", "120"]:
cmake_config_str += f"set(FLASHINFER_CUDA_ARCHITECTURES {user_input})\n"
cmake_config_str += f"set(CMAKE_CUDA_ARCHITECTURES {user_input})\n"
break
else:
print(
f"Invalid input: {user_input}. FlashInfer requires 80, 86, 89, 90, 100 or 120"
)

print("\nWriting the following configuration to config.cmake...")
print(cmake_config_str)

Expand Down
7 changes: 0 additions & 7 deletions docs/install/mlc_llm.rst
Original file line number Diff line number Diff line change
Expand Up @@ -210,13 +210,6 @@ This step is useful when you want to make modification or obtain a specific vers
# build mlc_llm libraries
cmake .. && cmake --build . --parallel $(nproc) && cd ..

.. note::
If you are using CUDA and your compute capability is above 80, then it is required to build with
``set(USE_FLASHINFER ON)``. Otherwise, you may run into ``Cannot find Function`` issue during
runtime.

To check your CUDA compute capability, you can use ``nvidia-smi --query-gpu=compute_cap --format=csv``.

**Step 3. Install via Python.** We recommend that you install ``mlc_llm`` as a Python package, giving you
access to ``mlc_llm.compile``, ``mlc_llm.MLCEngine``, and the CLI.
There are two ways to do so:
Expand Down
10 changes: 0 additions & 10 deletions docs/install/tvm.rst
Original file line number Diff line number Diff line change
Expand Up @@ -203,9 +203,6 @@ While it is generally recommended to always use the prebuilt TVM Unity, if you r
echo "set(USE_METAL OFF)" >> config.cmake
echo "set(USE_VULKAN OFF)" >> config.cmake
echo "set(USE_OPENCL OFF)" >> config.cmake
# FlashInfer related, requires CUDA w/ compute capability 80;86;89;90
echo "set(USE_FLASHINFER OFF)" >> config.cmake
echo "set(FLASHINFER_CUDA_ARCHITECTURES YOUR_CUDA_COMPUTE_CAPABILITY_HERE)" >> config.cmake
echo "set(CMAKE_CUDA_ARCHITECTURES YOUR_CUDA_COMPUTE_CAPABILITY_HERE)" >> config.cmake

.. note::
Expand All @@ -217,13 +214,6 @@ While it is generally recommended to always use the prebuilt TVM Unity, if you r
- ``RelWithDebInfo`` sets ``-O2 -g -DNDEBUG`` (recommended)
- ``Release`` sets ``-O3 -DNDEBUG``

.. note::
If you are using CUDA and your compute capability is above 80, then it is required to build with
``set(USE_FLASHINFER ON)``. Otherwise, you may run into ``Cannot find Function`` issue during
runtime.

To check your CUDA compute capability, you can use ``nvidia-smi --query-gpu=compute_cap --format=csv``.

Once ``config.cmake`` is edited accordingly, kick off build with the commands below:

.. code-block:: bash
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ dependencies = [
"apache-tvm-ffi",
"datasets",
"fastapi",
"flashinfer-python==0.2.14; sys_platform == 'linux'",
"flashinfer-python==0.3.1; sys_platform == 'linux'",
"ml_dtypes>=0.5.1",
"openai",
"pandas",
Expand Down
2 changes: 1 addition & 1 deletion python/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
apache-tvm-ffi
datasets
fastapi
flashinfer-python==0.2.14
flashinfer-python==0.3.1
ml_dtypes>=0.5.1
openai
pandas
Expand Down