Skip to content

Commit eba2cec

Browse files
committed
Add Ollama embedding hardware, memory, performance and execution options
- Add Embedding-specific properties - These properties include all from the OllamaChat options except numCtx, f16KV, logitsAll - Update test Signed-off-by: Ilayaperumal Gopinathan <ilayaperumal.gopinathan@broadcom.com>
1 parent f908aa1 commit eba2cec

File tree

2 files changed

+213
-3
lines changed

2 files changed

+213
-3
lines changed

models/spring-ai-ollama/src/main/java/org/springframework/ai/ollama/api/OllamaEmbeddingOptions.java

Lines changed: 211 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,80 @@ public class OllamaEmbeddingOptions implements EmbeddingOptions {
7878
@JsonProperty("truncate")
7979
private Boolean truncate;
8080

81+
// @formatter:off
82+
83+
/**
84+
* Whether to use NUMA. (Default: false)
85+
*/
86+
@JsonProperty("numa")
87+
private Boolean useNUMA;
88+
89+
/**
90+
* Prompt processing maximum batch size. (Default: 512)
91+
*/
92+
@JsonProperty("num_batch")
93+
private Integer numBatch;
94+
95+
/**
96+
* The number of layers to send to the GPU(s). On macOS, it defaults to 1
97+
* to enable metal support, 0 to disable.
98+
* (Default: -1, which indicates that numGPU should be set dynamically)
99+
*/
100+
@JsonProperty("num_gpu")
101+
private Integer numGPU;
102+
103+
/**
104+
* When using multiple GPUs this option controls which GPU is used
105+
* for small tensors for which the overhead of splitting the computation
106+
* across all GPUs is not worthwhile. The GPU in question will use slightly
107+
* more VRAM to store a scratch buffer for temporary results.
108+
* By default, GPU 0 is used.
109+
*/
110+
@JsonProperty("main_gpu")
111+
private Integer mainGPU;
112+
113+
/**
114+
* (Default: false)
115+
*/
116+
@JsonProperty("low_vram")
117+
private Boolean lowVRAM;
118+
119+
/**
120+
* Load only the vocabulary, not the weights.
121+
*/
122+
@JsonProperty("vocab_only")
123+
private Boolean vocabOnly;
124+
125+
/**
126+
* By default, models are mapped into memory, which allows the system to load only the necessary parts
127+
* of the model as needed. However, if the model is larger than your total amount of RAM or if your system is low
128+
* on available memory, using mmap might increase the risk of pageouts, negatively impacting performance.
129+
* Disabling mmap results in slower load times but may reduce pageouts if you're not using mlock.
130+
* Note that if the model is larger than the total amount of RAM, turning off mmap would prevent
131+
* the model from loading at all.
132+
* (Default: null)
133+
*/
134+
@JsonProperty("use_mmap")
135+
private Boolean useMMap;
136+
137+
/**
138+
* Lock the model in memory, preventing it from being swapped out when memory-mapped.
139+
* This can improve performance but trades away some of the advantages of memory-mapping
140+
* by requiring more RAM to run and potentially slowing down load times as the model loads into RAM.
141+
* (Default: false)
142+
*/
143+
@JsonProperty("use_mlock")
144+
private Boolean useMLock;
145+
146+
/**
147+
* Set the number of threads to use during generation. For optimal performance, it is recommended to set this value
148+
* to the number of physical CPU cores your system has (as opposed to the logical number of cores).
149+
* Using the correct number of threads can greatly improve performance.
150+
* By default, Ollama will detect this value for optimal performance.
151+
*/
152+
@JsonProperty("num_thread")
153+
private Integer numThread;
154+
81155
public static Builder builder() {
82156
return new Builder();
83157
}
@@ -93,19 +167,37 @@ public static Map<String, Object> filterNonSupportedFields(Map<String, Object> o
93167
.collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
94168
}
95169

96-
public static OllamaEmbeddingOptions fromOptions(OllamaOptions fromOptions) {
170+
public static OllamaEmbeddingOptions fromOptions(OllamaEmbeddingOptions fromOptions) {
97171
return builder()
98172
.model(fromOptions.getModel())
99173
.keepAlive(fromOptions.getKeepAlive())
100174
.truncate(fromOptions.getTruncate())
175+
.useNUMA(fromOptions.getUseNUMA())
176+
.numBatch(fromOptions.getNumBatch())
177+
.numGPU(fromOptions.getNumGPU())
178+
.mainGPU(fromOptions.getMainGPU())
179+
.lowVRAM(fromOptions.getLowVRAM())
180+
.vocabOnly(fromOptions.getVocabOnly())
181+
.useMMap(fromOptions.getUseMMap())
182+
.useMLock(fromOptions.getUseMLock())
183+
.numThread(fromOptions.getNumThread())
101184
.build();
102185
}
103186

104-
public static OllamaEmbeddingOptions fromOptions(OllamaEmbeddingOptions fromOptions) {
187+
public static OllamaEmbeddingOptions fromOptions(OllamaOptions fromOptions) {
105188
return builder()
106189
.model(fromOptions.getModel())
107190
.keepAlive(fromOptions.getKeepAlive())
108191
.truncate(fromOptions.getTruncate())
192+
.useNUMA(fromOptions.getUseNUMA())
193+
.numBatch(fromOptions.getNumBatch())
194+
.numGPU(fromOptions.getNumGPU())
195+
.mainGPU(fromOptions.getMainGPU())
196+
.lowVRAM(fromOptions.getLowVRAM())
197+
.vocabOnly(fromOptions.getVocabOnly())
198+
.useMMap(fromOptions.getUseMMap())
199+
.useMLock(fromOptions.getUseMLock())
200+
.numThread(fromOptions.getNumThread())
109201
.build();
110202
}
111203

@@ -137,6 +229,78 @@ public void setTruncate(Boolean truncate) {
137229
this.truncate = truncate;
138230
}
139231

232+
public Boolean getUseNUMA() {
233+
return this.useNUMA;
234+
}
235+
236+
public void setUseNUMA(Boolean useNUMA) {
237+
this.useNUMA = useNUMA;
238+
}
239+
240+
public Integer getNumBatch() {
241+
return this.numBatch;
242+
}
243+
244+
public void setNumBatch(Integer numBatch) {
245+
this.numBatch = numBatch;
246+
}
247+
248+
public Integer getNumGPU() {
249+
return this.numGPU;
250+
}
251+
252+
public void setNumGPU(Integer numGPU) {
253+
this.numGPU = numGPU;
254+
}
255+
256+
public Integer getMainGPU() {
257+
return this.mainGPU;
258+
}
259+
260+
public void setMainGPU(Integer mainGPU) {
261+
this.mainGPU = mainGPU;
262+
}
263+
264+
public Boolean getLowVRAM() {
265+
return this.lowVRAM;
266+
}
267+
268+
public void setLowVRAM(Boolean lowVRAM) {
269+
this.lowVRAM = lowVRAM;
270+
}
271+
272+
public Boolean getVocabOnly() {
273+
return this.vocabOnly;
274+
}
275+
276+
public void setVocabOnly(Boolean vocabOnly) {
277+
this.vocabOnly = vocabOnly;
278+
}
279+
280+
public Boolean getUseMMap() {
281+
return this.useMMap;
282+
}
283+
284+
public void setUseMMap(Boolean useMMap) {
285+
this.useMMap = useMMap;
286+
}
287+
288+
public Boolean getUseMLock() {
289+
return this.useMLock;
290+
}
291+
292+
public void setUseMLock(Boolean useMLock) {
293+
this.useMLock = useMLock;
294+
}
295+
296+
public Integer getNumThread() {
297+
return this.numThread;
298+
}
299+
300+
public void setNumThread(Integer numThread) {
301+
this.numThread = numThread;
302+
}
303+
140304
@Override
141305
@JsonIgnore
142306
public Integer getDimensions() {
@@ -198,6 +362,51 @@ public Builder truncate(Boolean truncate) {
198362
return this;
199363
}
200364

365+
public Builder useNUMA(Boolean useNUMA) {
366+
this.options.useNUMA = useNUMA;
367+
return this;
368+
}
369+
370+
public Builder numBatch(Integer numBatch) {
371+
this.options.numBatch = numBatch;
372+
return this;
373+
}
374+
375+
public Builder numGPU(Integer numGPU) {
376+
this.options.numGPU = numGPU;
377+
return this;
378+
}
379+
380+
public Builder mainGPU(Integer mainGPU) {
381+
this.options.mainGPU = mainGPU;
382+
return this;
383+
}
384+
385+
public Builder lowVRAM(Boolean lowVRAM) {
386+
this.options.lowVRAM = lowVRAM;
387+
return this;
388+
}
389+
390+
public Builder vocabOnly(Boolean vocabOnly) {
391+
this.options.vocabOnly = vocabOnly;
392+
return this;
393+
}
394+
395+
public Builder useMMap(Boolean useMMap) {
396+
this.options.useMMap = useMMap;
397+
return this;
398+
}
399+
400+
public Builder useMLock(Boolean useMLock) {
401+
this.options.useMLock = useMLock;
402+
return this;
403+
}
404+
405+
public Builder numThread(Integer numThread) {
406+
this.options.numThread = numThread;
407+
return this;
408+
}
409+
201410
public OllamaEmbeddingOptions build() {
202411
return this.options;
203412
}

models/spring-ai-ollama/src/test/java/org/springframework/ai/ollama/OllamaEmbeddingRequestTests.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,8 @@ public class OllamaEmbeddingRequestTests {
4343
public void setUp() {
4444
this.embeddingModel = OllamaEmbeddingModel.builder()
4545
.ollamaApi(OllamaApi.builder().build())
46-
.defaultOptions(OllamaEmbeddingOptions.builder().model("DEFAULT_MODEL").build())
46+
.defaultOptions(
47+
OllamaEmbeddingOptions.builder().model("DEFAULT_MODEL").mainGPU(11).useMMap(true).numGPU(1).build())
4748
.build();
4849
}
4950

0 commit comments

Comments
 (0)