Skip to content

Commit

Permalink
Write special OpenCL kernel when left side matrix has only 1 row.
Browse files Browse the repository at this point in the history
This speeds things up quite a bit. I think I had ~570ms on Vicuna-13B
before this and now around ~440ms.
  • Loading branch information
Noeda committed Apr 5, 2023
1 parent 4a1e295 commit 746fc56
Show file tree
Hide file tree
Showing 3 changed files with 162 additions and 4 deletions.
13 changes: 13 additions & 0 deletions src/benches/benchmark.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,22 @@ pub fn opencl_benchmarks(c: &mut Criterion) {
let mut mul_left1 = Tensor::random(4096, 11000, TensorDType::Float16);
let mut mul_right1 = Tensor::random(1, 11000, TensorDType::Float16);
let mut mul_target1 = Tensor::zeros(4096, 1, TensorDType::Float16);
let mut mul_target2 = Tensor::zeros(1, 4096, TensorDType::Float16);
mul_left1.to_gpu_inplace(&cl).unwrap();
mul_right1.to_gpu_inplace(&cl).unwrap();
mul_target1.to_gpu_inplace(&cl).unwrap();
mul_target2.to_gpu_inplace(&cl).unwrap();

c.bench_function(
"1x11000 to 4096x11000 matrix multiplication transposed on OpenCL",
|b| {
b.iter(|| {
mul_target2
.matrix_mul_inplace_transposed(black_box(&mul_right1), black_box(&mul_left1));
mul_target2.finish();
})
},
);

c.bench_function(
"4096x11000 to 1x11000 matrix multiplication transposed on OpenCL",
Expand Down
51 changes: 49 additions & 2 deletions src/tensor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ use rand::Rng;
use rayon::prelude::*;
use std::alloc::Layout;
use std::io::{Read, Seek};
use std::path::{PathBuf};
use std::path::PathBuf;
#[cfg(feature = "opencl")]
use std::sync::{Arc, RwLock};
use thiserror::Error;
Expand Down Expand Up @@ -3116,10 +3116,57 @@ mod tests {

#[cfg(feature = "opencl")]
#[test]
fn gpu_matrix_mul_vector_transposed_is_close_to_cpu_matrix_mul_vector_transposed() {
fn gpu_matrix_mul_vector_transposed_is_close_to_cpu_matrix_mul_vector_transposed_1() {
    let cl = OpenCL::new(false, 0).unwrap();
    let mut rng = rand::thread_rng();

    // Exercises the src.rows == 1 case, i.e. the specialized one-row
    // OpenCL kernel path, against the CPU f32 reference implementation.
    for _trial in 0..300 {
        let a = rng.gen_range(1..=300);
        let b = rng.gen_range(1..=300);

        // Random f16 operands: (1 x a) * transposed(b x a) -> (1 x b).
        let left = Tensor::random(1, a, TensorDType::Float16);
        let right = Tensor::random(b, a, TensorDType::Float16);
        let target = Tensor::random(1, b, TensorDType::Float16);

        // GPU copies of the same operands.
        let mut left_gpu = left.clone();
        let mut right_gpu = right.clone();
        let mut target_gpu = target.clone();
        left_gpu.to_gpu_inplace(&cl).unwrap();
        right_gpu.to_gpu_inplace(&cl).unwrap();
        target_gpu.to_gpu_inplace(&cl).unwrap();

        // CPU reference runs in f32 for accuracy.
        let left = left.to_f32();
        let right = right.to_f32();
        let mut target = target.to_f32();

        target.matrix_mul_inplace_transposed(&left, &right);
        target_gpu.matrix_mul_inplace_transposed(&left_gpu, &right_gpu);
        target_gpu.to_cpu_inplace().unwrap();

        assert_eq!(target.rows(), target_gpu.rows());
        assert_eq!(target.cols(), target_gpu.cols());

        // Element-wise comparison with a loose epsilon, since the GPU
        // accumulates from f16 inputs.
        for row in 0..target.rows {
            for col in 0..target.cols {
                assert_relative_eq!(
                    target.get_f32(row, col),
                    target_gpu.get_f32(row, col),
                    epsilon = 1e-2,
                );
            }
        }
    }
}

#[cfg(feature = "opencl")]
#[test]
fn gpu_matrix_mul_vector_transposed_is_close_to_cpu_matrix_mul_vector_transposed_2() {
let cl = OpenCL::new(false, 0).unwrap();
let mut rng = rand::thread_rng();

// other.rows == 1

for _trial in 0..300 {
let a = rng.gen_range(1..=300);
let b = rng.gen_range(1..=300);
Expand Down
102 changes: 100 additions & 2 deletions src/tensor_opencl_support.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ use thiserror::Error;
struct Programs {
matrix_mul_transposed_f16_program: Program,
matrix_mul_transposed_f16: Kernel,
matrix_mul_transposed_one_row_f16_program: Program,
matrix_mul_transposed_one_row_f16: Kernel,
matrix_mul_transposed_f16_cpu_optimized_program: Program,
matrix_mul_transposed_f16_cpu_optimized: Kernel,
silu_f16_program: Program,
Expand Down Expand Up @@ -324,12 +326,23 @@ impl OpenCLTensor {
// 2 = GPU optimized vector multiply (other.rows == 1)
const CPU: u8 = 0;
const GPU: u8 = 1;
let strategy: u8 = if self.cl.is_cpu_device { CPU } else { GPU };
const GPU2: u8 = 2;
let strategy: u8 = if self.cl.is_cpu_device {
CPU
} else {
if src.rows == 1 {
GPU2
} else {
GPU
}
};

let prg = if strategy == CPU {
&prg.matrix_mul_transposed_f16_cpu_optimized
} else {
} else if strategy == GPU {
&prg.matrix_mul_transposed_f16
} else {
&prg.matrix_mul_transposed_one_row_f16
};
prg.set_arg(0, self.buf.clone())?;
prg.set_arg(1, src.buf.clone())?;
Expand Down Expand Up @@ -369,6 +382,14 @@ impl OpenCLTensor {
.local_work_size([16, 16])
.enew(&mut event);
b.enq()?;
} else if strategy == GPU2 {
let b = prg
.cmd()
.queue(&self.queue)
.global_work_size([cols16 as usize, 1])
.local_work_size([16, 1])
.enew(&mut event);
b.enq()?;
} else {
let b = prg
.cmd()
Expand Down Expand Up @@ -428,6 +449,22 @@ fn make_programs(ctx: &Context, queue: &Queue) -> Result<Programs, OpenCLError>
.arg(&0)
.queue(queue.clone())
.build()?;
let matrix_mul_transposed_one_row_f16_program =
make_program_with_src(ctx, MATRIX_MUL_TRANSPOSED_F16_ONE_ROW_SRC)?;
let matrix_mul_transposed_one_row_f16 = Kernel::builder()
.program(&matrix_mul_transposed_one_row_f16_program)
.name("matrix_mul_transposed_one_row_f16")
.arg(None::<&Buffer<u16>>)
.arg(None::<&Buffer<u16>>)
.arg(None::<&Buffer<u16>>)
.arg(&0)
.arg(&0)
.arg(&0)
.arg(&0)
.arg(&0)
.arg(&0)
.queue(queue.clone())
.build()?;
let silu_f16_program = make_program_with_src(ctx, SILU_F16_SRC)?;
let silu_f16 = Kernel::builder()
.program(&silu_f16_program)
Expand Down Expand Up @@ -459,6 +496,8 @@ fn make_programs(ctx: &Context, queue: &Queue) -> Result<Programs, OpenCLError>
Ok(Programs {
matrix_mul_transposed_f16_program,
matrix_mul_transposed_f16,
matrix_mul_transposed_one_row_f16_program,
matrix_mul_transposed_one_row_f16,
matrix_mul_transposed_f16_cpu_optimized_program,
matrix_mul_transposed_f16_cpu_optimized,
silu_f16_program,
Expand Down Expand Up @@ -517,6 +556,65 @@ __kernel void matrix_mul_transposed_f16(
}
"#;

// OpenCL kernel source for the specialized transposed matrix multiply used
// when the left operand has exactly one row (a row-vector times a matrix).
// Dispatched with global size [cols16, 1] and local size [16, 1]: one
// work-item per output column, in groups of 16 columns.
//
// The fast branch handles work-groups whose 16 columns are all in range;
// the slow branch zero-pads `righttile` for out-of-range columns. Both
// branches accumulate 16-wide tiles over the shared dimension.
//
// NOTE(review): the loads `vload_half(t * 16 + local_x, left)` and the
// `righttile` loads in the fast branch are not guarded against
// `t * 16 + local_x >= shared_sz` — this looks like it assumes buffer
// column capacities are padded up to a multiple of 16; confirm against the
// tensor allocation code.
//
// NOTE(review): `left_cols_capacity`, `ncols_capacity` and `nrows` are
// accepted but unused — presumably kept so all matmul kernels share one
// argument signature; verify against the kernel-argument setup code.
const MATRIX_MUL_TRANSPOSED_F16_ONE_ROW_SRC: &str = r#"
__kernel void matrix_mul_transposed_one_row_f16(
__global half *tgt,
__global const half *left,
__global const half *right,
const int left_cols_capacity,
const int right_cols_capacity,
const int ncols_capacity,
const int nrows,
const int ncols, // size of target
const int shared_sz
) {
// assertions:
// nrows == 1
// left_rows == 1
__local float lefttile[16];
__local float righttile[16][16];
const int global_x = get_global_id(0);
const int local_x = get_local_id(0);
const int num_tiles = (shared_sz + 15) / 16;
const int x_tile = (global_x / 16) * 16;
float sum = 0.0f;
if (x_tile + 15 < ncols) {
for (int t = 0; t < num_tiles; ++t) {
lefttile[local_x] = vload_half(t * 16 + local_x, left);
for (int k = 0; k < 16; ++k) {
righttile[k][local_x] = vload_half(t * 16 + local_x + (x_tile + k) * right_cols_capacity, right);
}
barrier(CLK_LOCAL_MEM_FENCE);
for (int k = 0; k < 16; ++k) {
sum += lefttile[k] * righttile[local_x][k];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
} else {
for (int t = 0; t < num_tiles; ++t) {
lefttile[local_x] = vload_half(t * 16 + local_x, left);
for (int k = 0; k < 16; ++k) {
if (x_tile + k >= ncols) {
righttile[k][local_x] = 0.0f;
} else {
righttile[k][local_x] = vload_half(t * 16 + local_x + (x_tile + k) * right_cols_capacity, right);
}
}
barrier(CLK_LOCAL_MEM_FENCE);
for (int k = 0; k < 16; ++k) {
sum += lefttile[k] * righttile[local_x][k];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
}
if (global_x < ncols) {
vstore_half(sum, global_x, (__global half*) tgt);
}
}"#;

const MATRIX_MUL_TRANSPOSED_F16_CPU_OPTIMIZED_SRC: &str = r#"
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
Expand Down

0 comments on commit 746fc56

Please sign in to comment.