Add Intel MKL link settings and an MMDEBUG matmul trace build (Makefile, run.c).

Review notes (text ahead of the first "diff --git" line is not part of the patch):
 1. The patch is laid out as one diff record per line again. Blank context
    lines and leading indentation were re-derived from the hunk line counts;
    re-check both against the target tree before applying.
 2. run.c main(): a failed freopen() leaves stderr closed, so the failure is
    now reported on stdout rather than through perror().
 3. NOTE(review): dLogFile is declared right after an #else but is closed
    after two #endif lines. Confirm the other preprocessor branch is never
    built with -D MMDEBUG, otherwise dLogFile is undeclared at the fclose().

diff --git a/Makefile b/Makefile
index 0f50602a..d859b48f 100644
--- a/Makefile
+++ b/Makefile
@@ -4,6 +4,10 @@ BLIS_PREFIX = /usr/local
 BLIS_INC = $(BLIS_PREFIX)/include/blis
 BLIS_LIB = $(BLIS_PREFIX)/lib/libblis.a
 
+# MKL
+MKL_PREFIX = /opt/intel
+MKL_INC = $(MKL_PREFIX)/mkl/include
+MKL_LIB = $(MKL_PREFIX)/mkl/lib/intel64
 
 #OpenBLAS
 OPENBLAS_PREFIX = /usr/include
@@ -13,10 +17,6 @@ OPENBLAS_INC = $(OPENBLAS_PREFIX)/openblas
 MOD_PATH = out/model.bin
 TOK_PATH = tokenizer.bin
 
-# -L${MKLROOT}/lib/intel64 -lmkl_rt -Wl,--no-as-needed -lpthread -lm -ldl
-# -m64 -I"${MKLROOT}/include"
-
-
 # choose your compiler, e.g. gcc/clang
 # example override to clang: make run CC=clang
 
@@ -64,6 +64,10 @@ run_cc_gnu: ## - Optimized Generic linux distro build
 runq_cc_gnu: ## - Same for quantized build
 	$(CC) -Ofast -march=native -mtune=native -std=gnu11 -o run runq.c -lm
 
+.PHONY: run_cc_mmdebug
+run_cc_mmdebug: ## - ***NEW*** Matmul Debug Log build (Warning: Huge Logs)
+	$(CC) -D MMDEBUG -Ofast -march=native -mtune=native run.c -lm -o run
+
 ##@ Accelerated Builds
 # additionally compiles with OpenMP, allowing multithreaded runs
 # make sure to also enable multiple threads when running, e.g.:
@@ -133,12 +137,12 @@ runq_cc_blis: ## - Same for quantized build
 ##@ ---> x86_64
 # amd64 (x86_64) / Intel Mac (WIP) Do not use!
 .PHONY: run_cc_mkl
-run_cc_mkl: ## - OpenMP + Intel MKL CBLAS build (x86_64 / intel Mac) (WIP)
-	$(CC) -D MKL -D OPENMP -Ofast -fopenmp -march=native -mtune=native run.c -lm -lblis -o run
+run_cc_mkl: ## - ***NEW*** OpenMP + Intel MKL CBLAS build (x86_64 / intel Mac)
+	$(CC) -D MKL -D OPENMP -Ofast -fopenmp -march=native -mtune=native -I$(MKL_INC) -L$(MKL_LIB) run.c -lmkl_rt -lpthread -lm -o run
 
 .PHONY: runq_cc_mkl
 runq_cc_mkl: ## - Same for quantized build
-	$(CC) -D MKL -D OPENMP -Ofast -fopenmp -march=native -mtune=native runq.c -lm -lblis -o run
+	$(CC) -D MKL -D OPENMP -Ofast -fopenmp -march=native -mtune=native -I$(MKL_INC) -L$(MKL_LIB) runq.c -lmkl_rt -lpthread -lm -o run
 
 ##@ ---> ARM64 / aarch64
 .PHONY: run_cc_armpl
diff --git a/run.c b/run.c
index 8a17170b..571d1d69 100644
--- a/run.c
+++ b/run.c
@@ -442,6 +442,48 @@ void avx_matmul(float* xout, const float* x, const float* w, int n, int d) {
     }
 }
 #endif
+
+#ifdef MMDEBUG
+void debug_matmul(float* xout, float* x, float* w, int n, int d) {
+    // W (d,n) @ x (n,) -> xout (d,)
+    // by far the most amount of time is spent inside this little function
+
+    // Print input values to stderr
+    fprintf(stderr, "<<<<<<< Input x: >>>>>>> ");
+    for (int i = 0; i < n; i++) {
+        fprintf(stderr, "%f ", x[i]);
+    }
+    fprintf(stderr, "\n");
+
+    fprintf(stderr, "<<<<<<< Input w: >>>>>>> ");
+    for (int i = 0; i < d; i++) {
+        for (int j = 0; j < n; j++) {
+            fprintf(stderr, "%f ", w[i * n + j]);
+        }
+        // fprintf(stderr, "\n");
+    }
+    fprintf(stderr, "\n");
+
+    int i;
+    #ifdef ACCEL
+    ACCEL(i)
+    #endif
+    for (i = 0; i < d; i++) {
+        float val = 0.0f;
+        for (int j = 0; j < n; j++) {
+            val += w[i * n + j] * x[j];
+        }
+        xout[i] = val;
+    }
+
+    // Print output values to stderr
+    fprintf(stderr, "<<<<<<< Output xout: >>>>>>> ");
+    for (int i = 0; i < d; i++) {
+        fprintf(stderr, "%f ", xout[i]);
+    }
+    fprintf(stderr, "\n");
+}
+#endif // END L2E Addition
 
 void matmul(float* xout, float* x, float* w, int n, int d) {
 
@@ -453,6 +495,8 @@ void matmul(float* xout, float* x, float* w, int n, int d) {
     cblas_sgemv(CblasRowMajor, CblasNoTrans, d, n, 1.0f, w, n, x, 1, 0.0f, xout, 1);
     #elif defined(ACCELAVX)
     avx_matmul(xout, x, w, n, d);
+    #elif defined(MMDEBUG)
+    debug_matmul(xout, x, w, n, d);
     #else
     #ifdef ACCEL
     ACCEL(i) // OMP/OACC Macro
@@ -1250,6 +1294,14 @@ int main(int argc, char *argv[]) {
     fflush(stdout);
     inprompt(prompt); // read prompt
     #else
+    #ifdef MMDEBUG
+    FILE* dLogFile = freopen("debug_matmul.log", "w", stderr);
+    if (dLogFile == NULL) {
+        // A failed freopen() leaves stderr closed, so report on stdout instead
+        fprintf(stdout, "freopen: could not open debug_matmul.log\n");
+        return 1;
+    }
+    #endif // END L2E Addition
     // poor man's C argparse so we can override the defaults above from the command line
     if (argc >= 2) { checkpoint_path = argv[1]; } else { error_usage(); }
 
@@ -1316,6 +1368,9 @@ int main(int argc, char *argv[]) {
     } // end of loop
     #endif
     #endif
+    #ifdef MMDEBUG
+    fclose(dLogFile);
+    #endif // END L2E Addition
     return 0;
 }
 