@@ -78,6 +78,80 @@ public class OllamaEmbeddingOptions implements EmbeddingOptions {
78
78
@JsonProperty("truncate")
private Boolean truncate;

// @formatter:off

/**
 * Whether to use NUMA. (Default: false)
 */
@JsonProperty("numa")
private Boolean useNUMA;

/**
 * Prompt processing maximum batch size. (Default: 512)
 */
@JsonProperty("num_batch")
private Integer numBatch;

/**
 * The number of layers to send to the GPU(s). On macOS, it defaults to 1
 * to enable metal support, 0 to disable.
 * (Default: -1, which indicates that numGPU should be set dynamically)
 */
@JsonProperty("num_gpu")
private Integer numGPU;

/**
 * When using multiple GPUs this option controls which GPU is used
 * for small tensors for which the overhead of splitting the computation
 * across all GPUs is not worthwhile. The GPU in question will use slightly
 * more VRAM to store a scratch buffer for temporary results.
 * By default, GPU 0 is used.
 */
@JsonProperty("main_gpu")
private Integer mainGPU;

/**
 * Enable low-VRAM mode. (Default: false)
 */
@JsonProperty("low_vram")
private Boolean lowVRAM;

/**
 * Load only the vocabulary, not the weights.
 */
@JsonProperty("vocab_only")
private Boolean vocabOnly;

/**
 * By default, models are mapped into memory, which allows the system to load only the necessary parts
 * of the model as needed. However, if the model is larger than your total amount of RAM or if your system is low
 * on available memory, using mmap might increase the risk of pageouts, negatively impacting performance.
 * Disabling mmap results in slower load times but may reduce pageouts if you're not using mlock.
 * Note that if the model is larger than the total amount of RAM, turning off mmap would prevent
 * the model from loading at all.
 * (Default: null)
 */
@JsonProperty("use_mmap")
private Boolean useMMap;

/**
 * Lock the model in memory, preventing it from being swapped out when memory-mapped.
 * This can improve performance but trades away some of the advantages of memory-mapping
 * by requiring more RAM to run and potentially slowing down load times as the model loads into RAM.
 * (Default: false)
 */
@JsonProperty("use_mlock")
private Boolean useMLock;

/**
 * Set the number of threads to use during generation. For optimal performance, it is recommended to set this value
 * to the number of physical CPU cores your system has (as opposed to the logical number of cores).
 * Using the correct number of threads can greatly improve performance.
 * By default, Ollama will detect this value for optimal performance.
 */
@JsonProperty("num_thread")
private Integer numThread;

81
155
/**
 * Returns a new {@link Builder} for fluently constructing an
 * {@link OllamaEmbeddingOptions} instance.
 * @return a fresh, empty builder
 */
public static Builder builder() {
	return new Builder();
}
@@ -93,19 +167,37 @@ public static Map<String, Object> filterNonSupportedFields(Map<String, Object> o
93
167
.collect (Collectors .toMap (Map .Entry ::getKey , Map .Entry ::getValue ));
94
168
}
95
169
96
- public static OllamaEmbeddingOptions fromOptions (OllamaOptions fromOptions ) {
170
+ public static OllamaEmbeddingOptions fromOptions (OllamaEmbeddingOptions fromOptions ) {
97
171
return builder ()
98
172
.model (fromOptions .getModel ())
99
173
.keepAlive (fromOptions .getKeepAlive ())
100
174
.truncate (fromOptions .getTruncate ())
175
+ .useNUMA (fromOptions .getUseNUMA ())
176
+ .numBatch (fromOptions .getNumBatch ())
177
+ .numGPU (fromOptions .getNumGPU ())
178
+ .mainGPU (fromOptions .getMainGPU ())
179
+ .lowVRAM (fromOptions .getLowVRAM ())
180
+ .vocabOnly (fromOptions .getVocabOnly ())
181
+ .useMMap (fromOptions .getUseMMap ())
182
+ .useMLock (fromOptions .getUseMLock ())
183
+ .numThread (fromOptions .getNumThread ())
101
184
.build ();
102
185
}
103
186
104
- public static OllamaEmbeddingOptions fromOptions (OllamaEmbeddingOptions fromOptions ) {
187
+ public static OllamaEmbeddingOptions fromOptions (OllamaOptions fromOptions ) {
105
188
return builder ()
106
189
.model (fromOptions .getModel ())
107
190
.keepAlive (fromOptions .getKeepAlive ())
108
191
.truncate (fromOptions .getTruncate ())
192
+ .useNUMA (fromOptions .getUseNUMA ())
193
+ .numBatch (fromOptions .getNumBatch ())
194
+ .numGPU (fromOptions .getNumGPU ())
195
+ .mainGPU (fromOptions .getMainGPU ())
196
+ .lowVRAM (fromOptions .getLowVRAM ())
197
+ .vocabOnly (fromOptions .getVocabOnly ())
198
+ .useMMap (fromOptions .getUseMMap ())
199
+ .useMLock (fromOptions .getUseMLock ())
200
+ .numThread (fromOptions .getNumThread ())
109
201
.build ();
110
202
}
111
203
@@ -137,6 +229,78 @@ public void setTruncate(Boolean truncate) {
137
229
this .truncate = truncate ;
138
230
}
139
231
232
+ public Boolean getUseNUMA () {
233
+ return this .useNUMA ;
234
+ }
235
+
236
+ public void setUseNUMA (Boolean useNUMA ) {
237
+ this .useNUMA = useNUMA ;
238
+ }
239
+
240
+ public Integer getNumBatch () {
241
+ return this .numBatch ;
242
+ }
243
+
244
+ public void setNumBatch (Integer numBatch ) {
245
+ this .numBatch = numBatch ;
246
+ }
247
+
248
+ public Integer getNumGPU () {
249
+ return this .numGPU ;
250
+ }
251
+
252
+ public void setNumGPU (Integer numGPU ) {
253
+ this .numGPU = numGPU ;
254
+ }
255
+
256
+ public Integer getMainGPU () {
257
+ return this .mainGPU ;
258
+ }
259
+
260
+ public void setMainGPU (Integer mainGPU ) {
261
+ this .mainGPU = mainGPU ;
262
+ }
263
+
264
+ public Boolean getLowVRAM () {
265
+ return this .lowVRAM ;
266
+ }
267
+
268
+ public void setLowVRAM (Boolean lowVRAM ) {
269
+ this .lowVRAM = lowVRAM ;
270
+ }
271
+
272
+ public Boolean getVocabOnly () {
273
+ return this .vocabOnly ;
274
+ }
275
+
276
+ public void setVocabOnly (Boolean vocabOnly ) {
277
+ this .vocabOnly = vocabOnly ;
278
+ }
279
+
280
+ public Boolean getUseMMap () {
281
+ return this .useMMap ;
282
+ }
283
+
284
+ public void setUseMMap (Boolean useMMap ) {
285
+ this .useMMap = useMMap ;
286
+ }
287
+
288
+ public Boolean getUseMLock () {
289
+ return this .useMLock ;
290
+ }
291
+
292
+ public void setUseMLock (Boolean useMLock ) {
293
+ this .useMLock = useMLock ;
294
+ }
295
+
296
+ public Integer getNumThread () {
297
+ return this .numThread ;
298
+ }
299
+
300
+ public void setNumThread (Integer numThread ) {
301
+ this .numThread = numThread ;
302
+ }
303
+
140
304
@ Override
141
305
@ JsonIgnore
142
306
public Integer getDimensions () {
@@ -198,6 +362,51 @@ public Builder truncate(Boolean truncate) {
198
362
return this ;
199
363
}
200
364
365
+ public Builder useNUMA (Boolean useNUMA ) {
366
+ this .options .useNUMA = useNUMA ;
367
+ return this ;
368
+ }
369
+
370
+ public Builder numBatch (Integer numBatch ) {
371
+ this .options .numBatch = numBatch ;
372
+ return this ;
373
+ }
374
+
375
+ public Builder numGPU (Integer numGPU ) {
376
+ this .options .numGPU = numGPU ;
377
+ return this ;
378
+ }
379
+
380
+ public Builder mainGPU (Integer mainGPU ) {
381
+ this .options .mainGPU = mainGPU ;
382
+ return this ;
383
+ }
384
+
385
+ public Builder lowVRAM (Boolean lowVRAM ) {
386
+ this .options .lowVRAM = lowVRAM ;
387
+ return this ;
388
+ }
389
+
390
+ public Builder vocabOnly (Boolean vocabOnly ) {
391
+ this .options .vocabOnly = vocabOnly ;
392
+ return this ;
393
+ }
394
+
395
+ public Builder useMMap (Boolean useMMap ) {
396
+ this .options .useMMap = useMMap ;
397
+ return this ;
398
+ }
399
+
400
+ public Builder useMLock (Boolean useMLock ) {
401
+ this .options .useMLock = useMLock ;
402
+ return this ;
403
+ }
404
+
405
+ public Builder numThread (Integer numThread ) {
406
+ this .options .numThread = numThread ;
407
+ return this ;
408
+ }
409
+
201
410
/**
 * Returns the configured {@link OllamaEmbeddingOptions}.
 * Note: this returns the builder's internal instance directly (no defensive
 * copy), so further mutations through this builder affect the returned object.
 */
public OllamaEmbeddingOptions build() {
	return this.options;
}
0 commit comments