The execution, how to run the LLM inference engine
Execution Manager
The execution manager manages so-called executors, which implement all the primitive vector operations for one set of hardware and one kind of data type. For example, here is an abridged version of the float executor for the CPU implementation:
public class OzAIFloatCPUExec : OzAICPUExecutor
{
/// <summary>
/// Adds <paramref name="src1"/> and <paramref name="src2"/> element by element and stores
/// the results in <paramref name="dst"/>.
/// </summary>
/// <param name="src1">First operand; its <c>Length</c> determines how many elements are processed.</param>
/// <param name="src2">Second operand, read starting at its own <c>Offset</c>.</param>
/// <param name="dst">Destination range, written starting at its own <c>Offset</c>.</param>
/// <param name="error">Null on success; otherwise a description of the failure.</param>
/// <returns>True on success; false if an operand is not a float vector or an exception was thrown.</returns>
public override bool Add(OzAIVectorRange src1, OzAIVectorRange src2, OzAIVectorRange dst, out string error)
{
    try
    {
        var lhsVec = src1.Vector as OzAIFloatVec_CSharp;
        var rhsVec = src2.Vector as OzAIFloatVec_CSharp;
        var outVec = dst.Vector as OzAIFloatVec_CSharp;
        // Report a wrong operand type explicitly instead of letting the failed cast
        // surface later as an opaque NullReferenceException message.
        if (lhsVec == null || rhsVec == null || outVec == null)
        {
            error = "Failed to perform addition: all operands must be OzAIFloatVec_CSharp vectors.";
            return false;
        }

        var lhs = lhsVec.Values;
        var rhs = rhsVec.Values;
        var result = outVec.Values;
        var lhsIndex = src1.Offset;
        var rhsIndex = src2.Offset;
        var resultIndex = dst.Offset;

        // NOTE(review): src2.Length and dst.Length are not compared with src1.Length;
        // a shorter range is only detected if indexing runs past the underlying storage.
        var count = src1.Length;
        for (ulong i = 0; i < count; i++)
        {
            result[resultIndex++] = lhs[lhsIndex++] + rhs[rhsIndex++];
        }
    }
    catch (Exception ex)
    {
        error = "Failed to perform addition: " + ex.Message;
        return false;
    }

    error = null;
    return true;
}
/// <summary>
/// Divides every element of <paramref name="src"/> by the value of <paramref name="scalar"/>
/// and stores the quotients in <paramref name="dst"/>.
/// </summary>
/// <param name="src">Dividend range; its <c>Length</c> determines how many elements are processed.</param>
/// <param name="scalar">Divisor, read once from its vector at <c>scalar.Offset</c> before the loop.</param>
/// <param name="dst">Destination range, written starting at its own <c>Offset</c>.</param>
/// <param name="error">Null on success; otherwise a description of the failure.</param>
/// <returns>True on success; false if an operand is not a float vector or an exception was thrown.</returns>
public override bool Div(OzAIVectorRange src, OzAIScalar scalar, OzAIVectorRange dst, out string error)
{
    try
    {
        var divisorVec = scalar.Vector as OzAIFloatVec_CSharp;
        var inputVec = src.Vector as OzAIFloatVec_CSharp;
        var outputVec = dst.Vector as OzAIFloatVec_CSharp;
        // Report a wrong operand type explicitly instead of letting the failed cast
        // surface later as an opaque NullReferenceException message.
        if (divisorVec == null || inputVec == null || outputVec == null)
        {
            error = "Failed to perform division: all operands must be OzAIFloatVec_CSharp vectors.";
            return false;
        }

        // The divisor is loop-invariant, so it is fetched a single time.
        var divisor = divisorVec.Values[scalar.Offset];
        var input = inputVec.Values;
        var output = outputVec.Values;
        var inputIndex = src.Offset;
        var outputIndex = dst.Offset;

        // NOTE(review): dst.Length is not compared with src.Length.
        var count = src.Length;
        for (ulong i = 0; i < count; i++)
        {
            output[outputIndex++] = input[inputIndex++] / divisor;
        }
    }
    catch (Exception ex)
    {
        error = "Failed to perform division: " + ex.Message;
        return false;
    }

    error = null;
    return true;
}
…
/// <summary>
/// Adds every element of <paramref name="src"/> onto the scalar slot referenced by
/// <paramref name="dst"/>.
/// </summary>
/// <param name="src">Range whose elements are summed; its <c>Length</c> is the element count.</param>
/// <param name="dst">Scalar that receives the sum, located at <c>dst.Offset</c> in its vector.</param>
/// <param name="error">Null on success; otherwise a description of the failure.</param>
/// <returns>True on success; false if an operand is not a float vector or an exception was thrown.</returns>
public override bool Sum(OzAIVectorRange src, OzAIScalar dst, out string error)
{
    try
    {
        var inputVec = src.Vector as OzAIFloatVec_CSharp;
        var totalVec = dst.Vector as OzAIFloatVec_CSharp;
        // Report a wrong operand type explicitly instead of letting the failed cast
        // surface later as an opaque NullReferenceException message.
        if (inputVec == null || totalVec == null)
        {
            error = "Failed to perform summation: all operands must be OzAIFloatVec_CSharp vectors.";
            return false;
        }

        var input = inputVec.Values;
        var total = totalVec.Values;
        var inputIndex = src.Offset;
        var totalIndex = dst.Offset;

        // NOTE(review): the destination is accumulated into, not overwritten, so whatever
        // value it already holds is included in the result. Confirm callers zero it first.
        var count = src.Length;
        for (ulong i = 0; i < count; i++)
        {
            total[totalIndex] += input[inputIndex++];
        }
    }
    catch (Exception ex)
    {
        error = "Failed to perform summation: " + ex.Message;
        return false;
    }

    error = null;
    return true;
}
/// <summary>
/// Applies <c>x / (1 + exp(-x))</c> to each element of <paramref name="src"/> and stores the
/// results in <paramref name="dst"/>.
/// </summary>
/// <param name="src">Input range, read starting at its own <c>Offset</c>.</param>
/// <param name="dst">Destination range; its <c>Length</c> determines how many elements are processed.</param>
/// <param name="error">Null on success; otherwise a description of the failure.</param>
/// <returns>True on success; false if an operand is not a float vector or an exception was thrown.</returns>
public override bool Swish1(OzAIVectorRange src, OzAIVectorRange dst, out string error)
{
    try
    {
        var inputVec = src.Vector as OzAIFloatVec_CSharp;
        var outputVec = dst.Vector as OzAIFloatVec_CSharp;
        // Report a wrong operand type explicitly instead of letting the failed cast
        // surface later as an opaque NullReferenceException message.
        if (inputVec == null || outputVec == null)
        {
            error = "Failed to perform swish with beta = 1: all operands must be OzAIFloatVec_CSharp vectors.";
            return false;
        }

        var input = inputVec.Values;
        var output = outputVec.Values;
        var inputIndex = src.Offset;
        var outputIndex = dst.Offset;

        // NOTE(review): the element count comes from dst here, whereas Add, Div and Sum
        // take it from the source range. Kept as is; confirm this is intended.
        var count = dst.Length;
        for (ulong i = 0; i < count; i++)
        {
            var x = input[inputIndex++];
            var expNegX = MathF.Exp(-x);
            output[outputIndex++] = x / (1 + expNegX);
        }
    }
    catch (Exception ex)
    {
        error = "Failed to perform swish with beta = 1: " + ex.Message;
        return false;
    }

    error = null;
    return true;
}
}
One may notice that vector ranges are used instead of whole vectors; this increases the parallelisability of the operations down to the sub-vector level. The threading is handled by the CPU executor superclass as follows:
/// <summary>
/// Base class for CPU executors. Owns one worker thread that runs the operations handed to
/// <see cref="Add"/> and reports their combined outcome through <see cref="AwaitAll"/>.
/// </summary>
/// <remarks>
/// The task list, <c>_run</c>, <c>_success</c> and <c>_currentError</c> are shared between the
/// submitting thread and the worker thread; every access to them, and every transition of the
/// two events that depends on them, happens under <c>_gate</c>.
/// </remarks>
public abstract partial class OzAICPUExecutor : OzAIExecutor
{
    // Guards _tasks, _run, _success, _currentError and the idle/busy event transitions.
    readonly object _gate = new object();

    OzAIProcMode _mode;
    Thread _execThread;
    // Signalled while there may be work (or a stop request) for the worker to look at.
    ManualResetEvent _process;
    // Signalled while no submitted work is outstanding.
    ManualResetEvent _done;
    bool _run;
    List<OzAIOperation> _tasks;
    string _currentError;
    bool _success = true;

    /// <summary>Stores the processing mode, creates the events and starts the worker thread.</summary>
    /// <param name="mode">Processing mode kept for this executor.</param>
    /// <param name="error">Always null; nothing in this method reports a failure.</param>
    /// <returns>Always true.</returns>
    public override bool Start(OzAIProcMode mode, out string error)
    {
        _mode = mode;
        _tasks = new List<OzAIOperation>();
        _process = new ManualResetEvent(false);
        _done = new ManualResetEvent(true);
        _run = true;
        // A background thread cannot keep the process alive if Stop() is never reached.
        _execThread = new Thread(execute) { IsBackground = true };
        _execThread.Start();
        error = null;
        return true;
    }

    /// <summary>Waits for outstanding work to finish, then asks the worker thread to exit.</summary>
    public override void Stop()
    {
        if (_done == null)
            return; // Start() was never called, so there is no worker to stop.

        _done.WaitOne();
        lock (_gate)
        {
            _run = false;
            _process.Set(); // wake the worker so it can observe the stop request
        }
    }

    /// <summary>
    /// Worker loop: waits for work, runs the queued operations in submission order and
    /// signals <c>_done</c> once the queue is empty. Returns after <see cref="Stop"/>.
    /// </summary>
    void execute()
    {
        while (true)
        {
            _process.WaitOne();

            List<OzAIOperation> batch;
            lock (_gate)
            {
                if (_tasks.Count == 0)
                {
                    // Going idle and Add() take the same lock, so "done" can never be
                    // signalled while a freshly added task is still waiting.
                    _process.Reset();
                    _done.Set();
                    if (!_run)
                        return;
                    continue;
                }

                // Take the whole queue so the operations run outside the lock.
                batch = _tasks;
                _tasks = new List<OzAIOperation>();
            }

            foreach (var op in batch)
            {
                if (perform(op, out var opError))
                    continue;

                lock (_gate)
                {
                    // Keep the first failure until AwaitAll() collects it.
                    if (_success)
                    {
                        _currentError = opError;
                        _success = false;
                    }
                    // A failure abandons everything still pending, but the worker stays
                    // alive so that later batches can still be served.
                    _tasks.Clear();
                }
                break;
            }
        }
    }

    /// <summary>
    /// Blocks until no submitted work is outstanding, then returns and clears the recorded outcome.
    /// </summary>
    /// <param name="error">Error of the first failed operation since the previous call, or null.</param>
    /// <returns>False if any operation failed since the previous call; otherwise true.</returns>
    public override bool AwaitAll(out string error)
    {
        _done.WaitOne();
        lock (_gate)
        {
            error = _currentError;
            _currentError = null;
            var succeeded = _success;
            _success = true;
            return succeeded;
        }
    }

    /// <summary>Dispatches one operation to the bulk routine that matches its type.</summary>
    /// <param name="op">Operation to run.</param>
    /// <param name="error">Error text from the bulk routine, or a message for an unknown type.</param>
    /// <returns>True if the bulk routine succeeded; false otherwise.</returns>
    bool perform(OzAIOperation op, out string error)
    {
        switch (op.Type)
        {
            case OzAIOperationType.Addition:
                if (!BulckAdd(op, out error))
                    return false;
                break;
            case OzAIOperationType.Div:
                if (!BulckDiv(op, out error))
                    return false;
            …
            case OzAIOperationType.SoftMax:
                if (!BulckSoftMax(op, out error))
                    return false;
                break;
            case OzAIOperationType.Sum:
                if (!BulckSum(op, out error))
                    return false;
                break;
            case OzAIOperationType.Swish1:
                if (!BulckSwish1(op, out error))
                    return false;
                break;
            default:
                error = $"Unknown operation: {op}.";
                return false;
        }
        return true;
    }

    /// <summary>Queues an operation for the worker thread and marks the executor as busy.</summary>
    /// <param name="operation">Operation to run.</param>
    public override void Add(OzAIOperation operation)
    {
        lock (_gate)
        {
            _done.Reset();
            _tasks.Add(operation);
            _process.Set();
        }
    }
}
OzAIOperation (representing a primitive AI operation) is currently effectively hidden behind the extensive wrapper functions that the exec manager provides to create operations on the user's behalf. Without those wrappers, the user would have to instantiate the type of operation they want and pass it to the .Add function of the exec manager. The exec manager would then break the operation down and send it to the different executors in the form of bulk operations, each of which is completed individually by one thread. Afterwards, each thread awaits the others so that the operation as a whole is complete. All executors boil down to the abstract class below:
/// <summary>
/// Common contract of all executors: start, receive operations, wait for them, stop.
/// </summary>
public abstract class OzAIExecutor
{
    /// <summary>Starts the executor for the given processing mode.</summary>
    /// <param name="mode">Processing mode the executor should run with.</param>
    /// <param name="error">Description of the failure when false is returned.</param>
    /// <returns>True if the executor started; false otherwise.</returns>
    public abstract bool Start(OzAIProcMode mode, out string error);

    /// <summary>Hands an operation to the executor.</summary>
    /// <param name="operation">Operation to execute.</param>
    public abstract void Add(OzAIOperation operation);

    /// <summary>Waits for the operations handed over so far and reports their outcome.</summary>
    /// <param name="error">Description of the failure when false is returned.</param>
    /// <returns>True if the awaited operations succeeded; false otherwise.</returns>
    public abstract bool AwaitAll(out string error);

    /// <summary>Stops the executor.</summary>
    public abstract void Stop();
}
For each data type encountered, the exec manager creates the respective executors and starts them. It then adds a task to each executor whenever one arrives from the arch components. The code for this is below:
public partial class OzAIExecManager
{
// NOTE(review): the generic type arguments of the two dictionaries below appear to have been
// lost when this listing was extracted; as written the declarations are not valid C#.
// Main executor per data type: filled in createExecs, looked up in getMain.
Dictionary _main;
// All executors per data type (the main one included): filled in createExecs, looked up in getExecs.
Dictionary> _executors;
// Processing mode passed to Init; remains null until Init has been called.
OzAIProcMode _mode;
// CPU settings obtained from _mode in Init; provides DefaultProcType and ThreadCount.
OzAICPUSettings _cpu;
/// <summary>Returns the processing mode this manager was initialised with.</summary>
/// <param name="mode">The stored processing mode, or null if none has been set.</param>
/// <param name="error">Null on success; otherwise the reason no mode is available.</param>
/// <returns>True if a processing mode has been set; false otherwise.</returns>
public bool GetProcMode(out OzAIProcMode mode, out string error)
{
    bool initialized = !(_mode == null);
    mode = initialized ? _mode : null;
    error = initialized ? null : "Execution Manager not initialized.";
    return initialized;
}
/// <summary>
/// Stores the processing mode, resets both executor tables, reads the CPU settings from the
/// mode and creates the executors for the default processing data type.
/// </summary>
/// <param name="mode">Processing mode to use; also the source of the CPU settings.</param>
/// <param name="error">Null on success; otherwise the reason initialisation failed.</param>
/// <returns>True on success; false if the CPU settings or the default executors are unavailable.</returns>
public bool Init(OzAIProcMode mode, out string error)
{
_mode = mode;
// NOTE(review): the generic type arguments of the next two constructor calls appear to have
// been lost when this listing was extracted.
_executors = new Dictionary>();
_main = new Dictionary();
if (!_mode.GetCPUSettings(out _cpu, out error))
return false;
// Create the executors for the default data type up front; other types are created on demand.
var defaultType = _cpu.DefaultProcType;
if (!createExecs(defaultType, out error))
{
error = "Could not create executors for given default vector data type: " + error;
return false;
}
error = null;
return true;
}
/// <summary>
/// Looks up the main executor for the data type of the given vectors, creating the executors
/// for that data type first if none are registered yet.
/// </summary>
/// <param name="vecs">Vectors taking part in an operation; the first one determines the data type.</param>
/// <param name="res">The main executor on success; null on failure.</param>
/// <param name="error">Null on success; otherwise the reason the lookup failed.</param>
/// <param name="checkDtypes">When true, every vector must report the same data type as the first.</param>
/// <returns>True on success; false on a data type mismatch or if executor creation fails.</returns>
// NOTE(review): the element type of the List parameter appears to have been lost when this
// listing was extracted.
bool getMain(List vecs, out OzAIExecutor res, out string error, bool checkDtypes = true)
{
res = null;
// NOTE(review): indexing element 0 throws if vecs is empty; there is no guard for that here.
var dType = vecs[0].GetNumType();
if (checkDtypes)
{
foreach (var vec in vecs)
{
if (dType != vec.GetNumType())
{
error = "Data types of provided vectors do not agree!";
return false;
}
}
}
// Executors for a data type are created the first time that type is requested.
if (!_main.ContainsKey(dType))
{
if (!createExecs(dType, out error))
{
error = "Could not create executors for given vector data type: " + error;
return false;
}
}
res = _main[dType];
error = null;
return true;
}
/// <summary>
/// Looks up the full list of executors for the data type of the given vectors, creating the
/// executors for that data type first if none are registered yet.
/// </summary>
/// <param name="vecs">Vectors taking part in an operation; the first one determines the data type.</param>
/// <param name="res">The executor list on success; null on failure.</param>
/// <param name="error">Null on success; otherwise the reason the lookup failed.</param>
/// <param name="checkDtypes">When true, every vector must report the same data type as the first.</param>
/// <returns>True on success; false on a data type mismatch or if executor creation fails.</returns>
// NOTE(review): the element types of both List parameters appear to have been lost when this
// listing was extracted.
bool getExecs(List vecs, out List res, out string error, bool checkDtypes = true)
{
res = null;
// NOTE(review): indexing element 0 throws if vecs is empty; there is no guard for that here.
var dType = vecs[0].GetNumType();
if (checkDtypes)
{
foreach (var vec in vecs)
{
if (dType != vec.GetNumType())
{
error = "Data types of provided vectors do not agree!";
return false;
}
}
}
// Executors for a data type are created the first time that type is requested.
if (!_executors.ContainsKey(dType))
{
if (!createExecs(dType, out error))
{
error = "Could not create executors for given vector data type: " + error;
return false;
}
}
res = _executors[dType];
error = null;
return true;
}
/// <summary>
/// Creates and starts the CPU executors for one data type: a main executor plus one more for
/// each additional thread up to <c>_cpu.ThreadCount</c>, then registers them.
/// </summary>
/// <param name="dType">Data type the executors are created for.</param>
/// <param name="error">Error reported by the failing create or start call; not assigned here on success.</param>
/// <returns>True if every executor was created and started; false otherwise.</returns>
/// <remarks>
/// Nothing is registered in <c>_main</c> or <c>_executors</c> unless every executor was created
/// and started, so a failed attempt leaves both tables untouched and can simply be retried.
/// Executors that were already started before a failure are not stopped here.
/// </remarks>
bool createExecs(OzAINumType dType, out string error)
{
    var execs = new List<OzAIExecutor>();

    // The first executor doubles as the "main" executor for this data type.
    if (!dType.CreateCPUExec(out var mainExec, out error))
        return false;
    OzAIExecutor mainAsExecutor = mainExec;
    if (!mainAsExecutor.Start(_mode, out error))
        return false;
    execs.Add(mainAsExecutor);

    // One further executor per additional configured thread.
    for (int i = 1; i < _cpu.ThreadCount; i++)
    {
        if (!dType.CreateCPUExec(out var workerExec, out error))
            return false;
        OzAIExecutor worker = workerExec;
        if (!worker.Start(_mode, out error))
            return false;
        execs.Add(worker);
    }

    // Register only a complete set, keeping _main and _executors consistent with each other.
    _main.Add(dType, mainExec);
    _executors.Add(dType, execs);
    return true;
}
/// <summary>Stops every registered executor when the manager is finalized.</summary>
~OzAIExecManager()
{
    // Init may never have been called, in which case there is nothing to stop.
    if (_executors == null)
        return;

    foreach (var item in _executors)
    {
        foreach (var exec in item.Value)
        {
            try
            {
                exec.Stop();
            }
            catch (Exception)
            {
                // An exception escaping a finalizer terminates the process, so stopping is
                // best effort: move on to the remaining executors.
            }
        }
    }
}
}
The rest of the code for converting between function calls and operations is
repetitive and convoluted; therefore, it is not presented here. However, based on
the description above, the reader can imagine a function that verifies the
operation and calls the .Add function on the relevant executors after
converting the vectors into vector ranges.