Skip to content

Commit

Permalink
Added OPENBLAS, CLBLAST GPU and APE prompt support
Browse files Browse the repository at this point in the history
Added BLAS support:

+ OpenBLAS
+ CLBlast (GPU)

CLBlast is considerably slower. Needs investigation.

Added APE binary prompt support

Usage:

Ape run:
$   run.com

Baremetal Boot:
$  qemu-system-x86_64 -serial stdio -hda run.com
(input is broken on baremetal)

Updated Makefile

Usage:
make runopenblas
make runclblast
  • Loading branch information
trholding committed Jul 31, 2023
1 parent 08fab7e commit 36413d9
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 0 deletions.
8 changes: 8 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,14 @@ rungnu:
runompgnu:
$(CC) -Ofast -fopenmp -std=gnu11 run.c -lm -o run

# Build run.c with the CLBLAST macro defined (selects the BLAS code paths in
# run.c) and link against libclblast in addition to libm.
.PHONY: runclblast
runclblast: run.c
$(CC) -D CLBLAST -Ofast -fopenmp -march=native run.c -lm -lclblast -o run

# Build run.c with the OPENBLAS macro defined (selects the BLAS code paths in
# run.c) and link against libcblas in addition to libm.
.PHONY: runopenblas
runopenblas: run.c
$(CC) -D OPENBLAS -Ofast -fopenmp -march=native run.c -lm -lcblas -o run

.PHONY: cosmorun
cosmorun:
cosmocc -Ofast -D COSMO_BLINK -D COSMO_METAL -D COSMO_ZIP run.c -lm -o run.com
Expand Down
31 changes: 31 additions & 0 deletions run.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,19 @@ __static_yoink("vga_console");
__static_yoink("zipos");
#endif

// ----------------------------------------------------------------------------
// BLAS Support

#ifdef CLBLAST
#include <clblast_netlib_c.h>
#define BLAS
#endif

#ifdef OPENBLAS
#include <cblas.h>
#define BLAS
#endif

// ----------------------------------------------------------------------------
// Standard Headers

Expand Down Expand Up @@ -181,17 +194,25 @@ void checkpoint_init_weights(TransformerWeights *w, Config* p, float* f, int sha
// neural net blocks

// Element-wise accumulate: a[i] += b[i] for every i in [0, size).
//   a    - destination vector, updated in place
//   b    - vector added into a
//   size - number of elements processed from each vector
void accum(float *a, float *b, int size) {
#ifdef BLAS
    // saxpy computes y := alpha*x + y; with alpha = 1 that is a += b.
    // incX and incY are integer strides, so pass integer literals instead of
    // relying on an implicit float-to-int conversion of 1.0f.
    cblas_saxpy(size, 1.0f, b, 1, a, 1);
#else
    for (int i = 0; i < size; i++) {
        a[i] += b[i];
    }
#endif
}

void rmsnorm(float* o, float* x, float* weight, int size) {
// calculate sum of squares
float ss = 0.0f;
#ifdef BLAS
ss = cblas_sdot(size, x, 1.0f, x, 1.0f);
#else
for (int j = 0; j < size; j++) {
ss += x[j] * x[j];
}
#endif
ss /= size;
ss += 1e-5f;
ss = 1.0f / sqrtf(ss);
Expand Down Expand Up @@ -224,6 +245,9 @@ void softmax(float* x, int size) {
void matmul(float* xout, float* x, float* w, int n, int d) {
// W (d,n) @ x (n,) -> xout (d,)
// by far the most amount of time is spent inside this little function
#ifdef BLAS
cblas_sgemv(CblasRowMajor, CblasNoTrans, d, n, 1, w, n, x, 1, 0, xout, 1);
#else
int i;
#pragma omp parallel for private(i)
for (i = 0; i < d; i++) {
Expand All @@ -233,6 +257,7 @@ void matmul(float* xout, float* x, float* w, int n, int d) {
}
xout[i] = val;
}
#endif
}

void transformer(int token, int pos, Config* p, RunState* s, TransformerWeights* w) {
Expand Down Expand Up @@ -491,6 +516,12 @@ int main(int argc, char *argv[]) {
// we read the embedded checkpoint from within the executable
// 'checkpoint' is necessary arg
checkpoint = "/zip/out/model.bin" ;
buffertokens=32;
char promptbuffer[1024]; // Buffer for prompt
printf("LLAMA2 Prompt: ");
fflush(stdout);
gets(promptbuffer); // Read prompt
prompt=promptbuffer; // Set prompt
#else
if (argc < 2) {
printf("Usage: %s <checkpoint_file> [temperature] [steps] [prompt] [buffer_tokens]\n", argv[0]);
Expand Down

1 comment on commit 36413d9

@trholding
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Used cblas example from here: karpathy#7 (comment)

Please sign in to comment.