1_Square_matrix_multiplication_
1_Square_matrix_multiplication_
#include <torch/extension.h> #include <cuda.h> #include <cuda_runtime.h> #include <c10/cuda/CUDAException.h> #define TILE_SIZE 16 #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor") #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") #define CHECK_INPUT(x) CHE...
import torch import torch.nn as nn class Model(nn.Module): """ Simple model that performs a single square matrix multiplication (C = A * B) """ def __init__(self): super(Model, self).__init__() def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor: """ Perfo...
import torch import torch.nn as nn import torch.nn.functional as F def module_fn(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor: """ Performs a single square matrix multiplication (C = A * B). Args: A (torch.Tensor): Input matrix A of shape (N, N). B (torch.Tensor): Input matrix B of s...
1_Square_matrix_multiplication_
block_size_optimized_base
#include <torch/extension.h> #include <cuda.h> #include <cuda_runtime.h> #include <c10/cuda/CUDAException.h> #define TILE_SIZE 32 #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor") #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") #define CHECK_INPUT(x) CHE...
import torch import torch.nn as nn class Model(nn.Module): """ Simple model that performs a single square matrix multiplication (C = A * B) """ def __init__(self): super(Model, self).__init__() def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor: """ Perfo...
import torch import torch.nn as nn import torch.nn.functional as F def module_fn(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor: """ Performs a single square matrix multiplication (C = A * B). Args: A (torch.Tensor): Input matrix A of shape (N, N). B (torch.Tensor): Input matrix B of s...
{'metrics': {'Executed Ipc Active': {'unit': 'inst/cycle', 'avg_value': 1.7100000000000002, 'variance': 4.930380657631324e-32, 'n': 5}, 'Executed Ipc Elapsed': {'unit': 'inst/cycle', 'avg_value': 1.67, 'variance': 0.0, 'n': 5}, 'Issue Slots Busy': {'unit': '%', 'avg_value': 42.712, 'variance': 9.599999999996181e-05, 'n...
{'aten::to': {'cpu_time_total': 548763.7420000026, 'device_time_total': 3551.694999999949, 'self_cpu_time_total': 64.78500000503846, 'self_device_time_total': 0.0, 'cpu_memory_usage': 0, 'device_memory_usage': 0, 'self_cpu_memory_usage': 0, 'self_device_memory_usage': 0}, 'aten::_to_copy': {'cpu_time_total': 548698.956...
{'stdout': '/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_1/b1_s0_block_size_optimized/base/base.cu:9:35: warning: macro argument should be enclosed in parentheses [bugprone-macro-parentheses]\n 9 | #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor...
1_Square_matrix_multiplication_
#include <torch/extension.h> #include <cuda.h> #include <cuda_runtime.h> #include <c10/cuda/CUDAException.h> #define TILE_SIZE 32 #define THREAD_STRIDE 4 #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor") #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") #d...
import torch import torch.nn as nn class Model(nn.Module): """ Simple model that performs a single square matrix multiplication (C = A * B) """ def __init__(self): super(Model, self).__init__() def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor: """ Perfo...
import torch import torch.nn as nn import torch.nn.functional as F def module_fn(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor: """ Performs a single square matrix multiplication (C = A * B). Args: A (torch.Tensor): Input matrix A of shape (N, N). B (torch.Tensor): Input matrix B of s...
{'metrics': {'Executed Ipc Active': {'unit': 'inst/cycle', 'avg_value': 0.72, 'variance': 0.0, 'n': 5}, 'Executed Ipc Elapsed': {'unit': 'inst/cycle', 'avg_value': 0.7, 'variance': 0.0, 'n': 5}, 'Issue Slots Busy': {'unit': '%', 'avg_value': 18.023999999999997, 'variance': 2.4000000000007502e-05, 'n': 5}, 'Issued Ipc A...
{'aten::to': {'cpu_time_total': 307489.68499999825, 'device_time_total': 3612.935000000056, 'self_cpu_time_total': 60.040999999211635, 'self_device_time_total': 0.0, 'cpu_memory_usage': 0, 'device_memory_usage': 0, 'self_cpu_memory_usage': 0, 'self_device_memory_usage': 0}, 'aten::zeros': {'cpu_time_total': 131007.1739...
{'stdout': '/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_1/b1_s1_stride_loop_matmul/base/base.cu:10:35: warning: macro argument should be enclosed in parentheses [bugprone-macro-parentheses]\n 10 | #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor"...
1_Square_matrix_multiplication_
tiled_coalesced_unroll_base
#include <torch/extension.h> #include <cuda.h> #include <cuda_runtime.h> #include <c10/cuda/CUDAException.h> #define TILE_SIZE 32 #define PADDING 1 #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor") #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") #define ...
import torch import torch.nn as nn class Model(nn.Module): """ Simple model that performs a single square matrix multiplication (C = A * B) """ def __init__(self): super(Model, self).__init__() def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor: """ Perfo...
import torch import torch.nn as nn import torch.nn.functional as F def module_fn(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor: """ Performs a single square matrix multiplication (C = A * B). Args: A (torch.Tensor): Input matrix A of shape (N, N). B (torch.Tensor): Input matrix B of s...
{'metrics': {'Executed Ipc Active': {'unit': 'inst/cycle', 'avg_value': 1.64, 'variance': 0.0, 'n': 5}, 'Executed Ipc Elapsed': {'unit': 'inst/cycle', 'avg_value': 1.6, 'variance': 0.0, 'n': 5}, 'Issue Slots Busy': {'unit': '%', 'avg_value': 40.91199999999999, 'variance': 0.00017600000000000387, 'n': 5}, 'Issued Ipc Ac...
{'aten::to': {'cpu_time_total': 482610.54600000015, 'device_time_total': 3558.23199999996, 'self_cpu_time_total': 49.90800000115996, 'self_device_time_total': 0.0, 'cpu_memory_usage': 0, 'device_memory_usage': 0, 'self_cpu_memory_usage': 0, 'self_device_memory_usage': 0}, 'aten::zeros': {'cpu_time_total': 340885.230999...
{'stdout': '/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_1/b1_s2_tiled_coalesced_unroll/base/base.cu:10:35: warning: macro argument should be enclosed in parentheses [bugprone-macro-parentheses]\n 10 | #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA ten...
1_Square_matrix_multiplication_
#include <torch/extension.h> #include <cuda.h> #include <cuda_runtime.h> #include <c10/cuda/CUDAException.h> #define TILE_SIZE 32 #define FULL_MASK 0xffffffff #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor") #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous...
import torch import torch.nn as nn class Model(nn.Module): """ Simple model that performs a single square matrix multiplication (C = A * B) """ def __init__(self): super(Model, self).__init__() def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor: """ Perfo...
import torch import torch.nn as nn import torch.nn.functional as F def module_fn(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor: """ Performs a single square matrix multiplication (C = A * B). Args: A (torch.Tensor): Input matrix A of shape (N, N). B (torch.Tensor): Input matrix B of s...
1_Square_matrix_multiplication_
vectorized_coalesced_base_base
#include <torch/extension.h> #include <cuda.h> #include <cuda_runtime.h> #include <c10/cuda/CUDAException.h> #define TILE_SIZE 32 #define VECTOR_SIZE 4 // Using float4 for vectorized loads #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor") #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contig...
import torch import torch.nn as nn class Model(nn.Module): """ Simple model that performs a single square matrix multiplication (C = A * B) """ def __init__(self): super(Model, self).__init__() def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor: """ Perfo...
import torch import torch.nn as nn import torch.nn.functional as F def module_fn(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor: """ Performs a single square matrix multiplication (C = A * B). Args: A (torch.Tensor): Input matrix A of shape (N, N). B (torch.Tensor): Input matrix B of s...
1_Square_matrix_multiplication_
aligned_vectorized_matmul_base_base
#include <torch/extension.h> #include <cuda.h> #include <cuda_runtime.h> #include <c10/cuda/CUDAException.h> #define TILE_SIZE 32 #define VECTOR_SIZE 4 // float4 is 128-bits #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor") #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " mu...
import torch import torch.nn as nn class Model(nn.Module): """ Simple model that performs a single square matrix multiplication (C = A * B) """ def __init__(self): super(Model, self).__init__() def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor: """ Perfo...
import torch import torch.nn as nn import torch.nn.functional as F def module_fn(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor: """ Performs a single square matrix multiplication (C = A * B). Args: A (torch.Tensor): Input matrix A of shape (N, N). B (torch.Tensor): Input matrix B of s...
{'metrics': {'Executed Ipc Active': {'unit': 'inst/cycle', 'avg_value': 1.7880000000000003, 'variance': 1.600000000000003e-05, 'n': 5}, 'Executed Ipc Elapsed': {'unit': 'inst/cycle', 'avg_value': 1.7399999999999998, 'variance': 4.930380657631324e-32, 'n': 5}, 'Issue Slots Busy': {'unit': '%', 'avg_value': 44.6379999999...
{'aten::to': {'cpu_time_total': 370214.6159999991, 'device_time_total': 3090.668000000005, 'self_cpu_time_total': 43.96999999956461, 'self_device_time_total': 0.0, 'cpu_memory_usage': 0, 'device_memory_usage': 0, 'self_cpu_memory_usage': 0, 'self_device_memory_usage': 0}, 'aten::zeros': {'cpu_time_total': 255396.391000...
{'stdout': '/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_1/b2_s1_aligned_vectorized_matmul_base/base/base.cu:10:35: warning: macro argument should be enclosed in parentheses [bugprone-macro-parentheses]\n 10 | #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a ...
1_Square_matrix_multiplication_
block_64_optimized_base_base
#include <torch/extension.h> #include <cuda.h> #include <cuda_runtime.h> #include <c10/cuda/CUDAException.h> #define TILE_SIZE 64 #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor") #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") #define CHECK_INPUT(x) CHE...
import torch import torch.nn as nn class Model(nn.Module): """ Simple model that performs a single square matrix multiplication (C = A * B) """ def __init__(self): super(Model, self).__init__() def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor: """ Perfo...
import torch import torch.nn as nn import torch.nn.functional as F def module_fn(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor: """ Performs a single square matrix multiplication (C = A * B). Args: A (torch.Tensor): Input matrix A of shape (N, N). B (torch.Tensor): Input matrix B of s...
Using /var/tmp/torch_extensions_b5b11049 as PyTorch extensions root... Creating extension directory /var/tmp/torch_extensions_b5b11049/Square_matrix_multiplication_... Detected CUDA files, patching ldflags Emitting ninja build file /var/tmp/torch_extensions_b5b11049/Square_matrix_multiplication_/build.ninja... /home/ro...
1_Square_matrix_multiplication_
shared_memory_reuse_optimization_base_base
#include <torch/extension.h> #include <cuda.h> #include <cuda_runtime.h> #include <c10/cuda/CUDAException.h> #define TILE_SIZE 32 #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor") #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") #define CHECK_INPUT(x) CHEC...
import torch import torch.nn as nn class Model(nn.Module): """ Simple model that performs a single square matrix multiplication (C = A * B) """ def __init__(self): super(Model, self).__init__() def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor: """ Perfo...
import torch import torch.nn as nn import torch.nn.functional as F def module_fn(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor: """ Performs a single square matrix multiplication (C = A * B). Args: A (torch.Tensor): Input matrix A of shape (N, N). B (torch.Tensor): Input matrix B of s...
{'metrics': {'Executed Ipc Active': {'unit': 'inst/cycle', 'avg_value': 1.7100000000000002, 'variance': 4.930380657631324e-32, 'n': 5}, 'Executed Ipc Elapsed': {'unit': 'inst/cycle', 'avg_value': 1.67, 'variance': 0.0, 'n': 5}, 'Issue Slots Busy': {'unit': '%', 'avg_value': 42.726, 'variance': 0.00010399999999999841, '...
{'aten::zeros': {'cpu_time_total': 335019.7750000423, 'device_time_total': 32525.17199995462, 'self_cpu_time_total': 9757.13300013123, 'self_device_time_total': 0.0, 'cpu_memory_usage': 0, 'device_memory_usage': 0, 'self_cpu_memory_usage': 0, 'self_device_memory_usage': 0}, 'aten::zero_': {'cpu_time_total': 9381759.222...
{'stdout': '/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_1/b2_s3_shared_memory_reuse_optimization_base/base/base.cu:8:35: warning: macro argument should be enclosed in parentheses [bugprone-macro-parentheses]\n 8 | #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must...
1_Square_matrix_multiplication_
efficient_thread_block_mapping_base_base
#include <torch/extension.h> #include <cuda.h> #include <cuda_runtime.h> #include <c10/cuda/CUDAException.h> #define TILE_SIZE 32 #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor") #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") #define CHECK_INPUT(x) CHE...
import torch import torch.nn as nn class Model(nn.Module): """ Simple model that performs a single square matrix multiplication (C = A * B) """ def __init__(self): super(Model, self).__init__() def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor: """ Perfo...
import torch import torch.nn as nn import torch.nn.functional as F def module_fn(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor: """ Performs a single square matrix multiplication (C = A * B). Args: A (torch.Tensor): Input matrix A of shape (N, N). B (torch.Tensor): Input matrix B of s...
{'metrics': {'Executed Ipc Active': {'unit': 'inst/cycle', 'avg_value': 1.7100000000000002, 'variance': 4.930380657631324e-32, 'n': 5}, 'Executed Ipc Elapsed': {'unit': 'inst/cycle', 'avg_value': 1.67, 'variance': 0.0, 'n': 5}, 'Issue Slots Busy': {'unit': '%', 'avg_value': 42.70399999999999, 'variance': 0.000103999999...
{'aten::to': {'cpu_time_total': 548404.8910000011, 'device_time_total': 3166.0639999998966, 'self_cpu_time_total': 40.870999999693595, 'self_device_time_total': 0.0, 'cpu_memory_usage': 0, 'device_memory_usage': 0, 'self_cpu_memory_usage': 0, 'self_device_memory_usage': 0}, 'aten::_to_copy': {'cpu_time_total': 548364.0...
{'stdout': '/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_1/b3_s0_efficient_thread_block_mapping_base/base/base.cu:9:35: warning: macro argument should be enclosed in parentheses [bugprone-macro-parentheses]\n 9 | #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must b...
1_Square_matrix_multiplication_
workload_balancing_optimization_base
#include <torch/extension.h> #include <cuda.h> #include <cuda_runtime.h> #include <c10/cuda/CUDAException.h> #define TILE_SIZE 32 #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor") #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") #define CHECK_INPUT(x) CHEC...
import torch import torch.nn as nn class Model(nn.Module): """ Simple model that performs a single square matrix multiplication (C = A * B) """ def __init__(self): super(Model, self).__init__() def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor: """ Perfo...
import torch import torch.nn as nn import torch.nn.functional as F def module_fn(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor: """ Performs a single square matrix multiplication (C = A * B). Args: A (torch.Tensor): Input matrix A of shape (N, N). B (torch.Tensor): Input matrix B of s...
{'metrics': {'Executed Ipc Active': {'unit': 'inst/cycle', 'avg_value': 1.684, 'variance': 2.400000000000004e-05, 'n': 5}, 'Executed Ipc Elapsed': {'unit': 'inst/cycle', 'avg_value': 1.65, 'variance': 0.0, 'n': 5}, 'Issue Slots Busy': {'unit': '%', 'avg_value': 42.118, 'variance': 0.0003759999999999811, 'n': 5}, 'Issue...
{'aten::to': {'cpu_time_total': 352792.8509999998, 'device_time_total': 3158.569999999949, 'self_cpu_time_total': 45.03099999832921, 'self_device_time_total': 0.0, 'cpu_memory_usage': 0, 'device_memory_usage': 0, 'self_cpu_memory_usage': 0, 'self_device_memory_usage': 0}, 'aten::zeros': {'cpu_time_total': 337755.530999...
{'stdout': '/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_1/b3_s1_workload_balancing_optimization/base/base.cu:8:35: warning: macro argument should be enclosed in parentheses [bugprone-macro-parentheses]\n 8 | #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a ...
1_Square_matrix_multiplication_
double_buffered_tile_matmul_base
#include <torch/extension.h> #include <cuda.h> #include <cuda_runtime.h> #include <c10/cuda/CUDAException.h> #define TILE_SIZE 32 #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor") #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") #define CHECK_INPUT(x) CHEC...
import torch import torch.nn as nn class Model(nn.Module): """ Simple model that performs a single square matrix multiplication (C = A * B) """ def __init__(self): super(Model, self).__init__() def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor: """ Perfo...
import torch import torch.nn as nn import torch.nn.functional as F def module_fn(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor: """ Performs a single square matrix multiplication (C = A * B). Args: A (torch.Tensor): Input matrix A of shape (N, N). B (torch.Tensor): Input matrix B of s...
1_Square_matrix_multiplication_
atomic_reduction_optimization_base_base
#include <torch/extension.h> #include <cuda.h> #include <cuda_runtime.h> #include <c10/cuda/CUDAException.h> #define TILE_SIZE 32 #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor") #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") #define CHECK_INPUT(x) CHE...
import torch import torch.nn as nn class Model(nn.Module): """ Simple model that performs a single square matrix multiplication (C = A * B) """ def __init__(self): super(Model, self).__init__() def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor: """ Perfo...
import torch import torch.nn as nn import torch.nn.functional as F def module_fn(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor: """ Performs a single square matrix multiplication (C = A * B). Args: A (torch.Tensor): Input matrix A of shape (N, N). B (torch.Tensor): Input matrix B of s...
{'metrics': {'Executed Ipc Active': {'unit': 'inst/cycle', 'avg_value': 1.7100000000000002, 'variance': 4.930380657631324e-32, 'n': 5}, 'Executed Ipc Elapsed': {'unit': 'inst/cycle', 'avg_value': 1.67, 'variance': 0.0, 'n': 5}, 'Issue Slots Busy': {'unit': '%', 'avg_value': 42.71, 'variance': 0.00027999999999997385, 'n...
{'aten::to': {'cpu_time_total': 566101.8920000023, 'device_time_total': 3327.2480000000214, 'self_cpu_time_total': 46.712000002968125, 'self_device_time_total': 0.0, 'cpu_memory_usage': 0, 'device_memory_usage': 0, 'self_cpu_memory_usage': 0, 'self_device_memory_usage': 0}, 'aten::_to_copy': {'cpu_time_total': 566055.1...
{'stdout': '/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_1/b3_s3_atomic_reduction_optimization_base/base/base.cu:9:35: warning: macro argument should be enclosed in parentheses [bugprone-macro-parentheses]\n 9 | #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be...
1_Square_matrix_multiplication_
#include <torch/extension.h> #include <cuda.h> #include <cuda_runtime.h> #include <c10/cuda/CUDAException.h> #define TILE_SIZE 32 #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor") #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") #define CHECK_INPUT(x) CHEC...
import torch import torch.nn as nn class Model(nn.Module): """ Simple model that performs a single square matrix multiplication (C = A * B) """ def __init__(self): super(Model, self).__init__() def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor: """ Perfo...
import torch import torch.nn as nn import torch.nn.functional as F def module_fn(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor: """ Performs a single square matrix multiplication (C = A * B). Args: A (torch.Tensor): Input matrix A of shape (N, N). B (torch.Tensor): Input matrix B of s...
{'metrics': {'Executed Ipc Active': {'unit': 'inst/cycle', 'avg_value': 1.67, 'variance': 0.0, 'n': 5}, 'Executed Ipc Elapsed': {'unit': 'inst/cycle', 'avg_value': 1.6299999999999997, 'variance': 4.930380657631324e-32, 'n': 5}, 'Issue Slots Busy': {'unit': '%', 'avg_value': 41.715999999999994, 'variance': 0.00010399999...
{'aten::to': {'cpu_time_total': 589877.3080000015, 'device_time_total': 3269.1049999999814, 'self_cpu_time_total': 47.723000000580214, 'self_device_time_total': 0.0, 'cpu_memory_usage': 0, 'device_memory_usage': 0, 'self_cpu_memory_usage': 0, 'self_device_memory_usage': 0}, 'aten::_to_copy': {'cpu_time_total': 589829.5...
{'stdout': '/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_1/b4_s0_tiled_opt/base/base.cu:8:35: warning: macro argument should be enclosed in parentheses [bugprone-macro-parentheses]\n 8 | #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")\n |...