%global pypi_name triton

# For testing
%bcond_with test

# For debugging
%bcond_with debug

# Reduce link memory pressure
%global _lto_cflags %nil

%global __cmake_in_source_build 1

# So pre releases can be tried
%bcond_with gitcommit
%if %{with gitcommit}
# The top of tree ~3/25/24
%global commit0 c3bcae7c6c75f29d1e627eaf92f0d5414abe793b

# from cmake/llvm-hash
%global commit1 4017f04e310454ccced4c404a23f7698eec735ca

%global pypi_version 3.0.0
%else

%global pypi_version 3.1.0

# The sdist does not contain enough to do the build
# Fetch top of release/3.1.x at 12/31/24
%global commit0 cf34004b8a67d290a962da166f5aa2fc66751326
%global shortcommit0 %(c=%{commit0}; echo ${c:0:7})

# Do no use the prebuilt llvm
# This commit should come from trition/cmake/llvm-hash.txt
%global commit1 10dc3a8e916d73291269e5e2b82dd22681489aa1
%endif

%global shortcommit0 %(c=%{commit0}; echo ${c:0:7})
%global shortcommit1 %(c=%{commit1}; echo ${c:0:7})

# The llvm build has its LLVM_PARALLEL_COMPILE|LINK_JOBS switches
# Triton uses the envionment variable MAX_JOBS for both.
%global _smp_mflags %{nil}

# Build clang with clang is easier
%global toolchain clang

# Compression type and level for source/binary package payloads.
#  "w7T0.xzdio"         xz level 7 using %%{getncpus} threads
%define _source_payload         w7T0.xzdio
%define _binary_payload         w7T0.xzdio

Name:           python-%{pypi_name}
Version:        %{pypi_version}
Release:        1%{?dist}
Summary:        A language and compiler for custom Deep Learning operations

License:        MIT AND Apache-2.0 AND BSD-3-Clause AND BSD-2-Clause
# Main license is MIT
# llvm is Apache-2.0, BSD-3-Clause AND BSD-2-Clause

URL:            https://github.com/openai/triton/
Source0:        %{url}/archive/%{commit0}/triton-%{shortcommit0}.tar.gz
%if %{without local}
Source1:        https://github.com/llvm/llvm-project/archive/%{commit1}.tar.gz
Source2:        https://github.com/pybind/pybind11/archive/refs/tags/v2.11.1.tar.gz
%endif

# Can not download things
# Can not use git on a tarball
Patch1:         0001-Prepare-triton-setup-for-fedora.patch

# GPUs really only work on x86_64
ExclusiveArch:  x86_64

BuildRequires:  ccache
BuildRequires:  clang
BuildRequires:  cmake
BuildRequires:  ninja-build
BuildRequires:  python3-devel
BuildRequires:  python3dist(filelock)
BuildRequires:  python3dist(numpy)
BuildRequires:  python3dist(pip)
BuildRequires:  python3dist(pytest)
BuildRequires:  python3dist(pybind11)
BuildRequires:  python3dist(setuptools)
BuildRequires:  python3dist(wheel)
BuildRequires:  rocm-compilersupport-macros
BuildRequires:  zlib-devel

# Triton uses a custom snapshot of the in development llvm
# Because of instablity of the llvm api, we must use the one
# triton uses.  llvm is statically built and none of the
# llvm headers or libraries are distributed directly.
# From llvm.spec's license
#   Apache-2.0 WITH LLVM-exception OR NCSA
Provides:       bundled(llvm-project) = 19.0.0~pre20240214.g%{shortcommit1}
# From pybind11.spec's license
#   BSD-3-Clause
Provides:       bundled(pybind11) = 2.11.1

%global _description %{expand:
Triton is a language and compiler for writing highly efficient custom
Deep-Learning primitives. The aim of Triton is to provide an open-source
environment to write fast code at higher productivity than CUDA, but
also with higher flexibility than other existing DSLs. }

%description %_description

%package -n     python3-%{pypi_name}
Summary:        %{summary}

Requires:  rocm-hip-devel
Requires:  rocm-lld
Requires:  rocm-runtime-devel
Requires:  roctracer-devel

%description -n python3-%{pypi_name} %_description

%prep
%autosetup -p1 -n triton-%{commit0}
%if %{without local}
tar xf %{SOURCE1}
tar xf %{SOURCE2}
%endif

# Remove bundled egg-info
rm -rf %{pypi_name}.egg-info

# Path to rocm compiler is not /opt/rocm/llvm
sed --i -e 's@/opt/rocm/llvm/bin@%{rocmllvm_bindir}@' third_party/amd/backend/compiler.py

# Not building proton, so the profiler package is never there
sed -i -e "/triton\/profiler/d" python/setup.py

# Logic for the backends is a little broken, give it some help
cp -r third_party/{amd,nvidia} python/triton/backends

# rm llvm-project bits we do not need
rm -rf llvm-project-%{commit1}/{bolt,clang,compiler-rt,flang,libc,libclc,libcxx,libcxxabi,libunwind,lld,lldb,llvm-libgcc,openmp,polly,pst,runtimes,utils}

# disable -Werror
sed -i -e 's@-Werror @ @' CMakeLists.txt

# For debugging
%if %{with debug}
sed -i -e 's@${CMAKE_C_FLAGS} -D__STDC_FORMAT_MACROS @-O1 -g -D__STDC_FORMAT_MACROS @' CMakeLists.txt
%endif

%if %{without test}
# no knob to turn off downloading of googletest
sed -i -e 's@add_subdirectory(unittest)@#add_subdirectory(unittest)@' CMakeLists.txt
%else
# E   ValueError: option names {'--device'} already added
sed -i -e 's@--device@--ddevice@' python/test/unit/operators/conftest.py
# performance is only nvidia
rm python/test/regression/test_performance.py
# E   ModuleNotFoundError: No module named 'triton.common'
rm python/test/backend/test_device_backend.py
%endif

%build

# Real cores, No hyperthreading
COMPILE_JOBS=`cat /proc/cpuinfo | grep -m 1 'cpu cores' | awk '{ print $4 }'`
if [ ${COMPILE_JOBS}x = x ]; then
    COMPILE_JOBS=1
fi
# Try again..
if [ ${COMPILE_JOBS} = 1 ]; then
    COMPILE_JOBS=`lscpu | grep '^CPU(s)' | awk '{ print $2 }'`
    if [ ${COMPILE_JOBS}x = x ]; then
        COMPILE_JOBS=4
    fi
fi

# Take into account memmory usage per core, do not thrash real memory
BUILD_MEM=2
MEM_KB=0
MEM_KB=`cat /proc/meminfo | grep MemTotal | awk '{ print $2 }'`
MEM_MB=`eval "expr ${MEM_KB} / 1024"`
MEM_GB=`eval "expr ${MEM_MB} / 1024"`
COMPILE_JOBS_MEM=`eval "expr 1 + ${MEM_GB} / ${BUILD_MEM}"`
if [ "$COMPILE_JOBS_MEM" -lt "$COMPILE_JOBS" ]; then
    COMPILE_JOBS=$COMPILE_JOBS_MEM
fi
LINK_MEM=32
LINK_JOBS=`eval "expr 1 + ${MEM_GB} / ${LINK_MEM}"`

cd llvm-project-%{commit1}

%cmake -G Ninja \
       -DBUILD_SHARED_LIBS=OFF \
       -DCMAKE_BUILD_TYPE=RELEASE \
       -DCMAKE_INSTALL_PREFIX=$PWD/install \
       -DLLVM_PARALLEL_COMPILE_JOBS=$COMPILE_JOBS \
       -DLLVM_PARALLEL_LINK_JOBS=$LINK_JOBS \
       -DCMAKE_LINKER=lld \
       -DLLVM_BUILD_UTILS=ON \
       -DLLVM_BUILD_TOOLS=ON \
       -DLLVM_ENABLE_ASSERTIONS=OFF \
       -DMLIR_ENABLE_BINDINGS_PYTHON=ON \
       -DLLVM_ENABLE_PROJECTS=mlir \
       -DLLVM_INSTALL_UTILS=ON \
       -DLLVM_TARGETS_TO_BUILD="host;NVPTX;AMDGPU" \
       -DLLVM_ENABLE_TERMINFO=OFF \
       -DBUILD_SHARED_LIBS=OFF \
       llvm
%cmake_build
%cmake_build -t install

export LLVM_SYSPATH=$PWD/install
cd ..

cd pybind11-2.11.1
%cmake -G Ninja \
       -DBUILD_SHARED_LIBS=OFF \
       -DCMAKE_BUILD_TYPE=Release \
       -DCMAKE_INSTALL_PREFIX=$PWD/install \
       -DPYBIND11_TEST=OFF

%cmake_build
%cmake_build -t install

export PYBIND11_SYSPATH=$PWD/install
cd ..

export PATH=$LLVM_SYSPATH/bin:$PATH

%if %{with debug}
export DEBUG=1
%else
# Needs to be the same as llvm
export RELEASE=1
%endif

export CC=clang
export CXX=clang++
export MAX_JOBS=$COMPILE_JOBS
export TRITON_BUILD_WITH_CLANG_LD=ON
export TRITON_BUILD_PROTON=OFF
export TRITON_CODEGEN_AMD=ON
export TRITON_CODEGEN_NVIDIA=ON

cd python
%py3_build

%install

cd python
%py3_install

# empty files
rm %{buildroot}%{python3_sitearch}/triton/compiler/make_launcher.py

# Remove all the amd headers
rm -rf %{buildroot}%{python3_sitearch}/triton/backends/amd/include/*

%check
%py3_check_import %{pypi_name}
%if %{with test}
# Unit tests download so are not suitable for mock
cd python
%pytest
%endif

%files -n python3-%{pypi_name}
%{python3_sitearch}/%{pypi_name}
%{python3_sitearch}/%{pypi_name}*.egg-info

%changelog
* Sat Jan 4 2025 Tom Rix <trix@redhat.com> 3.1.0-1
- Inital release