
llama.prg

A Harbour library based on llama.cpp

  1. Quick review
  2. Compiling library
  3. Compiling samples
  4. Functions list
  5. Tests
  6. Model parameters
  7. Links

Quick review

The main goal of the llama.prg project is to make it possible to create Harbour applications which can interact with local LLaMA language models. The project provides the llama, ggml and whisper libraries, which may be linked to your application. Under Windows it requires the 64-bit MSVC compiler; under Linux/Unix, the standard 64-bit GNU C compiler.

Compiling library

The preferred method to build the llama library and the samples is HwBuilder - my utility for building programs written in Harbour. An appropriate project file, llamalib.hwprj, is provided. Llamalib.hwprj and the other hwprj files assume that there is a section for the 64-bit MSVC compiler in your copy of hwbuild.ini; you need to tune it:

[C_COMPILER_6]
id=msvc64
family=msvc
...

If you prefer not to use special utilities, you can build the library with a bat file like the following:

Windows

@echo off
if not exist lib md lib
if not exist obj md obj
if not exist obj\msvc64 md obj\msvc64
if not exist obj\whisper md obj\whisper
if not exist obj\whisper\msvc64 md obj\whisper\msvc64

call "c:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Auxiliary\Build\vcvarsall.bat" amd64

set CFLAGS=/TP /W3 /nologo /EHsc /c -Illama.cpp\common -Illama.cpp -DLOG_DISABLE_LOGS=1 /W3 /WX- /diagnostics:column /O2 /Ob2 /D _MBCS /D WIN32 /D _WINDOWS /D NDEBUG /D _CRT_SECURE_NO_WARNINGS /Gm- /MD /GS /arch:AVX2 /fp:precise /Zc:wchar_t /Zc:forScope /Zc:inline /std:c11 /external:W3 /Gd /TC /errorReport:queue
set CXXFLAGS=/TP /W3 /nologo /EHsc /c -Illama.cpp\common -Illama.cpp -DLOG_DISABLE_LOGS=1

set HB_PATH=c:\harbour
set OBJ=obj\msvc64
set OBJLIST=%OBJ%\hllama.obj %OBJ%\hllava.obj %OBJ%\llama.obj  %OBJ%\common.obj %OBJ%\sampling.obj %OBJ%\grammar-parser.obj %OBJ%\build-info.obj %OBJ%\unicode.obj %OBJ%\unicode-data.obj %OBJ%\clip.obj %OBJ%\llava.obj

cl.exe  %CXXFLAGS% /I%HB_PATH%\include /Fo%OBJ%\hllama.obj source\hllama.cpp
cl.exe  %CXXFLAGS% /I%HB_PATH%\include /Illama.cpp/llava /Fo%OBJ%\hllava.obj source\hllava.cpp
cl.exe  %CXXFLAGS% /Fo%OBJ%\llama.obj llama.cpp\llama.cpp
cl.exe  %CXXFLAGS% /Fo%OBJ%\unicode.obj llama.cpp\unicode.cpp
cl.exe  %CXXFLAGS% /Fo%OBJ%\unicode-data.obj llama.cpp\unicode-data.cpp

cl.exe %CFLAGS% /Fo%OBJ%\ggml.obj llama.cpp\ggml.c
cl.exe %CFLAGS% /Fo%OBJ%\ggml-alloc.obj llama.cpp\ggml-alloc.c
cl.exe %CFLAGS% /Fo%OBJ%\ggml-backend.obj llama.cpp\ggml-backend.c
cl.exe %CFLAGS% /Fo%OBJ%\ggml-quants.obj llama.cpp\ggml-quants.c
cl.exe %CFLAGS% /I%HB_PATH%\include /Fo%OBJ%\hcommon.obj source\hcommon.c

cl.exe %CXXFLAGS% /Fo%OBJ%\common.obj llama.cpp\common\common.cpp
cl.exe %CXXFLAGS% /Fo%OBJ%\sampling.obj llama.cpp\common\sampling.cpp
cl.exe %CXXFLAGS% /Fo%OBJ%\grammar-parser.obj llama.cpp\common\grammar-parser.cpp
cl.exe %CXXFLAGS% /Fo%OBJ%\build-info.obj llama.cpp\common\build-info.cpp

cl.exe %CXXFLAGS% /Fo%OBJ%\clip.obj llama.cpp\llava\clip.cpp
cl.exe %CXXFLAGS% /Fo%OBJ%\llava.obj llama.cpp\llava\llava.cpp

lib /out:lib\llama.lib %OBJLIST%

set OBJLIST=%OBJ%\ggml.obj %OBJ%\ggml-alloc.obj %OBJ%\ggml-backend.obj %OBJ%\ggml-quants.obj %OBJ%\hcommon.obj
lib /out:lib\ggml.lib %OBJLIST%

set CFLAGS=/TP /W3 /nologo /EHsc /c -Illama.cpp\whisper -Illama.cpp -DLOG_DISABLE_LOGS=1 /W3 /WX- /diagnostics:column /O2 /Ob2 /D _MBCS /D WIN32 /D _WINDOWS /D NDEBUG /D _CRT_SECURE_NO_WARNINGS /Gm- /MD /GS /arch:AVX2 /fp:precise /Zc:wchar_t /Zc:forScope /Zc:inline /std:c11 /external:W3 /Gd /TC /errorReport:queue
set CXXFLAGS=/TP /W3 /nologo /EHsc /c -Illama.cpp\whisper -Illama.cpp -DLOG_DISABLE_LOGS=1

set OBJ=obj\whisper\msvc64
set OBJLIST=%OBJ%\hwhisper.obj %OBJ%\whisper.obj %OBJ%\common.obj %OBJ%\grammar-parser.obj

cl.exe %CXXFLAGS% /I%HB_PATH%\include /Fo%OBJ%\hwhisper.obj source\hwhisper.cpp
cl.exe %CXXFLAGS% /Fo%OBJ%\whisper.obj llama.cpp\whisper\whisper.cpp
cl.exe %CXXFLAGS% /Fo%OBJ%\common.obj llama.cpp\whisper\common.cpp
cl.exe %CXXFLAGS% /Fo%OBJ%\grammar-parser.obj llama.cpp\whisper\grammar-parser.cpp

lib /out:lib\whisper.lib %OBJLIST%

Of course, you need to adjust the paths to match your system.

Linux

#!/bin/bash
if ! [ -e lib ]; then
   mkdir lib
   chmod a+w+r+x lib
fi
if ! [ -e obj ]; then
   mkdir obj
   chmod a+w+r+x obj
fi
if ! [ -e obj/gcc ]; then
   mkdir obj/gcc
   chmod a+w+r+x obj/gcc
fi
if ! [ -e obj/whisper ]; then
   mkdir obj/whisper
   chmod a+w+r+x obj/whisper
fi
if ! [ -e obj/whisper/gcc ]; then
   mkdir obj/whisper/gcc
   chmod a+w+r+x obj/whisper/gcc
fi

# Set your Harbour path here
export HRB_DIR=/home/guest/apps/harbour
export CFLAGS="-c -Wall -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Illama.cpp -Illama.cpp/common -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DLOG_DISABLE_LOGS=1 -c -I$HRB_DIR/include"
export CXXFLAGS="-c -Wall -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Illama.cpp -Illama.cpp/common -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DLOG_DISABLE_LOGS=1 -c -xc++ -I$HRB_DIR/include"
export OBJ=obj/gcc
export OBJLIST="$OBJ/hllama.o $OBJ/hllava.o $OBJ/llama.o $OBJ/common.o $OBJ/sampling.o $OBJ/grammar-parser.o $OBJ/build-info.o $OBJ/unicode.o $OBJ/unicode-data.o $OBJ/clip.o $OBJ/llava.o"

gcc $CXXFLAGS -o$OBJ/hllama.o source/hllama.cpp
gcc $CXXFLAGS -Illama.cpp/llava -o$OBJ/hllava.o source/hllava.cpp
gcc $CFLAGS -o$OBJ/hcommon.o source/hcommon.c
gcc $CXXFLAGS -o$OBJ/llama.o llama.cpp/llama.cpp
gcc $CXXFLAGS -o$OBJ/unicode.o llama.cpp/unicode.cpp
gcc $CXXFLAGS -o$OBJ/unicode-data.o llama.cpp/unicode-data.cpp
gcc $CFLAGS -o$OBJ/ggml.o llama.cpp/ggml.c
gcc $CFLAGS -o$OBJ/ggml-alloc.o llama.cpp/ggml-alloc.c
gcc $CFLAGS -o$OBJ/ggml-backend.o llama.cpp/ggml-backend.c
gcc $CFLAGS -o$OBJ/ggml-quants.o llama.cpp/ggml-quants.c
gcc $CXXFLAGS -o$OBJ/common.o llama.cpp/common/common.cpp
gcc $CXXFLAGS -o$OBJ/sampling.o llama.cpp/common/sampling.cpp
gcc $CXXFLAGS -o$OBJ/grammar-parser.o llama.cpp/common/grammar-parser.cpp
gcc $CXXFLAGS -o$OBJ/build-info.o llama.cpp/common/build-info.cpp
gcc $CXXFLAGS -o$OBJ/clip.o llama.cpp/llava/clip.cpp
gcc $CXXFLAGS -o$OBJ/llava.o llama.cpp/llava/llava.cpp

ar rc lib/libllama.a $OBJLIST

ar rc lib/libggml.a $OBJ/ggml.o $OBJ/ggml-alloc.o $OBJ/ggml-backend.o $OBJ/ggml-quants.o $OBJ/hcommon.o

export CFLAGS="-c -Wall -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Illama.cpp -Illama.cpp/whisper -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DLOG_DISABLE_LOGS=1 -c -I$HRB_DIR/include"
export CXXFLAGS="-c -Wall -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Illama.cpp -Illama.cpp/whisper -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DLOG_DISABLE_LOGS=1 -c -xc++ -I$HRB_DIR/include"
export OBJ=obj/whisper/gcc

gcc $CXXFLAGS -o$OBJ/hwhisper.o source/hwhisper.cpp
gcc $CXXFLAGS -o$OBJ/whisper.o llama.cpp/whisper/whisper.cpp
gcc $CXXFLAGS -o$OBJ/common.o llama.cpp/whisper/common.cpp
gcc $CXXFLAGS -o$OBJ/grammar-parser.o llama.cpp/whisper/grammar-parser.cpp

ar rc lib/libwhisper.a $OBJ/whisper.o $OBJ/common.o $OBJ/grammar-parser.o $OBJ/hwhisper.o

Compiling samples

It is better to use HwBuilder to build the sample applications (test1.hwprj is provided), but you may use the following bat file:

Windows

@echo off
call "c:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Auxiliary\Build\vcvarsall.bat" amd64

set HB_PATH=c:\harbour
set HB_LIBS=gtwvt.lib hbvm.lib hbrtl.lib gtgui.lib gtwin.lib hbcpage.lib hblang.lib hbrdd.lib hbmacro.lib hbpp.lib rddntx.lib rddcdx.lib rddfpt.lib hbsix.lib hbcommon.lib hbct.lib hbcplr.lib hbpcre.lib hbzlib.lib
set LLAMA_LIBS=llama.lib ggml.lib
set VC_LIBS=ucrt.lib user32.lib gdi32.lib comdlg32.lib shell32.lib comctl32.lib winspool.lib advapi32.lib winmm.lib ws2_32.lib iphlpapi.lib OleAut32.Lib Ole32.Lib

%HB_PATH%\bin\harbour -n -q -i%HB_PATH%\include test1.prg

cl.exe /TP /W3 /nologo /c /I%HB_PATH%\include /Fotest1.obj test1.c

link /NODEFAULTLIB:libucrt.lib /NODEFAULTLIB:msvcrt.lib /INCREMENTAL:NO /NOLOGO /SUBSYSTEM:CONSOLE /TLBID:1 /MANIFEST /MANIFESTUAC:"level='asInvoker' uiAccess='false'" /manifest:embed /DYNAMICBASE /NXCOMPAT /MACHINE:X64 /LIBPATH:%HB_PATH%\lib\win\msvc64 /LIBPATH:lib test1.obj %HB_LIBS% %LLAMA_LIBS% %VC_LIBS%

Linux

#!/bin/bash
# Set your Harbour path here
export HRB_DIR=/home/guest/apps/harbour

$HRB_DIR/bin/linux/gcc/harbour -n -q -i$HRB_DIR/include test1.prg
gcc -c -Wall -I$HRB_DIR/include -otest1.o test1.c
gcc  test1.o -otest1 -L$HRB_DIR/lib/linux/gcc -Llib -Wl,--start-group -lgttrm -lhbvm -lhbrtl -lgtcgi -lgttrm -lhbcpage -lhblang -lhbrdd -lhbmacro -lhbpp -lrddntx -lrddcdx -lrddfpt -lhbsix -lhbcommon -lhbct -lhbcplr -lllama -lggml -lpthread -lm -lz -lpcre -ldl -Wl,--end-group -fPIC -O3 -Wall -lstdc++ -shared-libgcc

Functions list

llm_set_params( cParams ) --> nSuccess

The function sets the model parameters. The parameter list cParams is a string of name=value pairs separated by the ~ character:

name1=value1~name2=value2~…

See the list of parameters in the Model parameters section of this manual.

Return value nSuccess is 0 if the function has completed successfully.
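
For example, a minimal sketch that sets the context size and temperature (see the Model parameters chapter for the meaning of c and temp; the values here are only illustrative):

// Set context size and temperature before opening the model
IF llm_set_params( "c=2048~temp=0.7" ) == 0
   ? "Parameters set"
ELSE
   ? "llm_set_params() failed"
ENDIF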

llm_open_model( cModelName ) --> nSuccess

The function opens the AI model cModelName.

Return value nSuccess is 0 if the function has completed successfully.

llm_create_context() --> nSuccess

The function creates the dialog context.

Return value nSuccess is 0 if the function has completed successfully.

llm_init_prompt( cPrompt )

llm_ask( cQuestion )

llm_getnexttoken() --> xResult

llm_close_model()

Closes the previously opened model.
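
A typical calling sequence, sketched below, follows the order in which these functions are listed. Treating a non-string return value of llm_getnexttoken() as the end of the answer is an assumption of this sketch; see test1.prg and test2.prg for the authoritative usage.

// Minimal dialogue sketch: open a model, ask one question, collect the answer token by token
FUNCTION AskOnce( cModelName, cQuestion )

   LOCAL xToken, cAnswer := ""

   IF llm_open_model( cModelName ) == 0 .AND. llm_create_context() == 0
      llm_ask( cQuestion )
      xToken := llm_getnexttoken()
      DO WHILE Valtype( xToken ) == "C"   // assumption: a non-string value signals the end of generation
         cAnswer += xToken
         xToken := llm_getnexttoken()
      ENDDO
      llm_close_model()
   ENDIF

RETURN cAnswer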

llm_print_timings()

Prints some timing statistics of the dialogue.

llm_rediron( n, cFile ) --> handle

Redirects output (n = 1 - stdout, n = 2 - stderr) to the file cFile and returns a file handle.

llm_rediroff( n, handle )

Cancels output redirection (n = 1 - stdout, n = 2 - stderr); handle is the file handle returned by llm_rediron().
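
A small fragment (meant to be placed inside a function) that uses these two calls to hide the llama.cpp log output on stderr while a model is loaded; the file and model names are just placeholders:

// Hide stderr output while the model is loaded, then restore it
LOCAL nHandle := llm_rediron( 2, "llama_stderr.log" )
llm_open_model( "model.gguf" )        // placeholder model name
llm_rediroff( 2, nHandle )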

llm_clip_open( cModel, cMmprojModel, cImage ) --> nSuccess

llm_clip_setimage( cPrompt, cSysPrompt )

llm_clip_getnexttoken() --> xResult

llm_clip_closeimage()

llm_clip_close()
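
The clip (llava) functions are not described above; the sketch below only assumes that they follow the same pattern as the text-only functions, with placeholder model and mmproj names. See test3.prg for the real usage.

// Describe an image (sketch only; see test3.prg)
FUNCTION DescribeImage( cImage )

   LOCAL xToken

   // "model.gguf" and "mmproj.gguf" are placeholder file names
   IF llm_clip_open( "model.gguf", "mmproj.gguf", cImage ) == 0
      llm_clip_setimage( "What is in this picture?", "" )   // empty system prompt
      xToken := llm_clip_getnexttoken()
      DO WHILE Valtype( xToken ) == "C"                     // assumption: a non-string value ends the output
         ?? xToken
         xToken := llm_clip_getnexttoken()
      ENDDO
      llm_clip_closeimage()
      llm_clip_close()
   ENDIF

RETURN Nil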

llm_whisper_print_usage()

llm_whisper_set_params( cParams ) --> nSuccess

The function sets the model parameters. The parameter list cParams is a string of name=value pairs separated by the ~ character:

name1=value1~name2=value2~…

Return value nSuccess is 0 if the function has completed successfully.

llm_whisper_open_model( cModel ) --> nSuccess

The function opens AI model cModel.

Return value nSuccess is 0 if the function has completed successfully.

llm_whisper_close_model()

Closes the previously opened model.

llm_whisper_recognize( cWavFile, [lStringBuffer] ) --> nSuccess

Starts the process of recognizing a wav file. cWavFile is a file name or a buffer with the file content (in the latter case lStringBuffer must be set to .T.).

Return value nSuccess is 0 if the function has completed successfully.

llm_whisper_setcallback( cCallbackName )

Sets a callback function named cCallbackName, which may output the recognition results.

llm_whisper_abort()

Aborts the recognition process.

llm_whisper_print_timings()
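
Putting the whisper functions together, a recognition run might look like the sketch below. The assumption that the callback receives a fragment of recognized text as a string parameter is mine, and the model file name is a placeholder; see test4.prg and test5.prg for the real usage.

// Recognize a 16 kHz wav file, printing results from a callback (sketch only)
FUNCTION Transcribe( cWavFile )

   IF llm_whisper_open_model( "ggml-base.bin" ) == 0      // placeholder model name
      llm_whisper_setcallback( "WHISPEROUT" )
      IF llm_whisper_recognize( cWavFile ) == 0
         llm_whisper_print_timings()
      ENDIF
      llm_whisper_close_model()
   ENDIF

RETURN Nil

FUNCTION WhisperOut( cText )   // assumption: receives a recognized text fragment

   ? cText

RETURN Nil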

The functions llm_create_context_0(), llm_ask_0() and llm_getnexttoken_0() are deprecated.

Tests

test1.prg

This test is based on the simple.cpp example of the llama.cpp package. The context is created anew for every question.

This test uses a models.ini file, which should contain a list of models with full paths. Each model entry may be followed by a list of parameters for that model. The available parameters are listed in the Model parameters chapter.

Parameters may also be passed on the command line as one quoted string:

test1.exe "c=2048 temp=0.3 penalize-nl=1"

test2.prg

This test is based on the main.cpp example of the llama.cpp package. The context is created once for all questions, so it is possible to have a dialogue with a model as a series of related questions and answers.

This test uses a models.ini file; see the corresponding notes for test1.prg. As with test1.prg, parameters can be passed on the command line as a single quoted string.

test3.prg

This test is based on the main.cpp example of the llama.cpp package. It analyses an image file and answers a question about it. If the question is empty, it prints information about the image content.

test4.prg

This test is based on the main.cpp example of the whisper package. It recognizes a wav audio file. At the moment, only 16 kHz wav files are supported. To convert audio to this format, you may use the ffmpeg utility:

ffmpeg -i my.mp3 -acodec pcm_s16le -ar 16000 my.wav

To compile test4.prg, you need to replace the llama library with the whisper library in the build script (whisper.lib instead of llama.lib on Windows, -lwhisper instead of -lllama on Linux).

test5.prg

This test records audio input from the microphone and recognizes it - a kind of dictation.

demo/

The demo/ directory contains a sample which uses the technology described in my Ext project here on gitflic. extcli.prg, extsrv.prg and fconnect.prg are borrowed from that project; llama_exsrv.prg is the "server" program and test_exllm.prg the "client". One of the reasons for using this technology is that a program which uses llama.lib calls (llama_exsrv.prg) must be compiled with 64-bit MSVC, while the "client" test_exllm.prg, which runs llama_exsrv to gain access to AI models, may be compiled with any other C compiler, including 32-bit ones.

Model parameters

Below is a list of the parameters which may currently be used in llama.prg. I use the same abbreviations as in the main llama.cpp example. The descriptions are borrowed from llama.cpp/examples/main/README.md. An example combining several of them in a single llm_set_params() call follows the list.

  • c - (default: 512) This is the --ctx-size option, which allows you to set the size of the prompt context used by the LLaMA models during text generation. A larger context size helps the model to better comprehend and generate responses for longer input or conversations.

  • n - (default: 128) This is the --n-predict option, which controls the number of tokens the model generates in response to the input prompt. By adjusting this value, you can influence the length of the generated text. A higher value will result in longer text, while a lower value will produce shorter text.

    A value of -1 will enable infinite text generation, even though we have a finite context window. When the context window is full, some of the earlier tokens (half of the tokens after --n-keep) will be discarded. The context must then be re-evaluated before generation can resume. On large models and/or large context windows, this will result in a significant pause in output.

  • temp - (default: 0.8) Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model’s output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.

  • repeat-penalty - (default: 1.1) Control the repetition of token sequences in the generated text. The repeat-penalty option helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient.

  • top-k - (default: 40) Limit the next token selection to the K most probable tokens. Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top-k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text.

  • top-p - (default: 0.9) Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P. Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top-p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text.

  • n-keep - The --keep option allows users to retain the original prompt when the model runs out of context, ensuring a connection to the initial instruction or conversation topic is maintained. It is the number of tokens from the initial prompt to retain when the model resets its internal context. By default, this value is set to 0 (meaning no tokens are kept). Use -1 to retain all tokens from the initial prompt.

  • tb - --threads N: Set the number of threads to use during generation. For optimal performance, it is recommended to set this value to the number of physical CPU cores your system has (as opposed to the logical number of cores). Using the correct number of threads can greatly improve performance.

  • penalize-nl - --no-penalize-nl: the value 0 disables penalization of newline tokens when applying the repeat penalty.

  • min-p - (default: 0.05) Sets a minimum base probability threshold for token selection. The Min-P sampling method was designed as an alternative to Top-P, and aims to ensure a balance of quality and variety. The parameter p represents the minimum probability for a token to be considered, relative to the probability of the most likely token. For example, with p=0.05 and the most likely token having a probability of 0.9, logits with a value less than 0.045 are filtered out.
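
As an illustration, several of these parameters may be combined in a single llm_set_params() call using the ~ separator described above (the values here are arbitrary; the defaults are listed in the descriptions):

// Arbitrary illustrative values
llm_set_params( "c=2048~n=256~temp=0.7~repeat-penalty=1.1~top-k=40~top-p=0.9~n-keep=-1~penalize-nl=0" )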

Links

Project web page

llama.cpp

HwBuilder

Ext
