Skip to content

Commit eba2cec

Browse files
committed
Add Ollama embedding hardware, memory, performance and execution options
- Add Embedding-specific properties - These properties include all from the OllamaChat options except numCtx, f16KV, logitsAll - Update test Signed-off-by: Ilayaperumal Gopinathan <ilayaperumal.gopinathan@broadcom.com>
1 parent f908aa1 commit eba2cec

File tree

2 files changed

+213
-3
lines changed

2 files changed

+213
-3
lines changed

models/spring-ai-ollama/src/main/java/org/springframework/ai/ollama/api/OllamaEmbeddingOptions.java

Lines changed: 211 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,80 @@ public class OllamaEmbeddingOptions implements EmbeddingOptions {
7878
@JsonProperty("truncate")
7979
private Boolean truncate;
8080

81+
// @formatter:off
82+
83+
/**
84+
* Whether to use NUMA. (Default: false)
85+
*/
86+
@JsonProperty("numa")
87+
private Boolean useNUMA;
88+
89+
/**
90+
* Prompt processing maximum batch size. (Default: 512)
91+
*/
92+
@JsonProperty("num_batch")
93+
private Integer numBatch;
94+
95+
/**
96+
* The number of layers to send to the GPU(s). On macOS, it defaults to 1
97+
* to enable metal support, 0 to disable.
98+
* (Default: -1, which indicates that numGPU should be set dynamically)
99+
*/
100+
@JsonProperty("num_gpu")
101+
private Integer numGPU;
102+
103+
/**
104+
* When using multiple GPUs this option controls which GPU is used
105+
* for small tensors for which the overhead of splitting the computation
106+
* across all GPUs is not worthwhile. The GPU in question will use slightly
107+
* more VRAM to store a scratch buffer for temporary results.
108+
* By default, GPU 0 is used.
109+
*/
110+
@JsonProperty("main_gpu")
111+
private Integer mainGPU;
112+
113+
/**
114+
* (Default: false)
115+
*/
116+
@JsonProperty("low_vram")
117+
private Boolean lowVRAM;
118+
119+
/**
120+
* Load only the vocabulary, not the weights.
121+
*/
122+
@JsonProperty("vocab_only")
123+
private Boolean vocabOnly;
124+
125+
/**
126+
* By default, models are mapped into memory, which allows the system to load only the necessary parts
127+
* of the model as needed. However, if the model is larger than your total amount of RAM or if your system is low
128+
* on available memory, using mmap might increase the risk of pageouts, negatively impacting performance.
129+
* Disabling mmap results in slower load times but may reduce pageouts if you're not using mlock.
130+
* Note that if the model is larger than the total amount of RAM, turning off mmap would prevent
131+
* the model from loading at all.
132+
* (Default: null)
133+
*/
134+
@JsonProperty("use_mmap")
135+
private Boolean useMMap;
136+
137+
/**
138+
* Lock the model in memory, preventing it from being swapped out when memory-mapped.
139+
* This can improve performance but trades away some of the advantages of memory-mapping
140+
* by requiring more RAM to run and potentially slowing down load times as the model loads into RAM.
141+
* (Default: false)
142+
*/
143+
@JsonProperty("use_mlock")
144+
private Boolean useMLock;
145+
146+
/**
147+
* Set the number of threads to use during generation. For optimal performance, it is recommended to set this value
148+
* to the number of physical CPU cores your system has (as opposed to the logical number of cores).
149+
* Using the correct number of threads can greatly improve performance.
150+
* By default, Ollama will detect this value for optimal performance.
151+
*/
152+
@JsonProperty("num_thread")
153+
private Integer numThread;
154+
81155
public static Builder builder() {
82156
return new Builder();
83157
}
@@ -93,19 +167,37 @@ public static Map<String, Object> filterNonSupportedFields(Map<String, Object> o
93167
.collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
94168
}
95169

96-
public static OllamaEmbeddingOptions fromOptions(OllamaOptions fromOptions) {
170+
public static OllamaEmbeddingOptions fromOptions(OllamaEmbeddingOptions fromOptions) {
97171
return builder()
98172
.model(fromOptions.getModel())
99173
.keepAlive(fromOptions.getKeepAlive())
100174
.truncate(fromOptions.getTruncate())
175+
.useNUMA(fromOptions.getUseNUMA())
176+
.numBatch(fromOptions.getNumBatch())
177+
.numGPU(fromOptions.getNumGPU())
178+
.mainGPU(fromOptions.getMainGPU())
179+
.lowVRAM(fromOptions.getLowVRAM())
180+
.vocabOnly(fromOptions.getVocabOnly())
181+
.useMMap(fromOptions.getUseMMap())
182+
.useMLock(fromOptions.getUseMLock())
183+
.numThread(fromOptions.getNumThread())
101184
.build();
102185
}
103186

104-
public static OllamaEmbeddingOptions fromOptions(OllamaEmbeddingOptions fromOptions) {
187+
public static OllamaEmbeddingOptions fromOptions(OllamaOptions fromOptions) {
105188
return builder()
106189
.model(fromOptions.getModel())
107190
.keepAlive(fromOptions.getKeepAlive())
108191
.truncate(fromOptions.getTruncate())
192+
.useNUMA(fromOptions.getUseNUMA())
193+
.numBatch(fromOptions.getNumBatch())
194+
.numGPU(fromOptions.getNumGPU())
195+
.mainGPU(fromOptions.getMainGPU())
196+
.lowVRAM(fromOptions.getLowVRAM())
197+
.vocabOnly(fromOptions.getVocabOnly())
198+
.useMMap(fromOptions.getUseMMap())
199+
.useMLock(fromOptions.getUseMLock())
200+
.numThread(fromOptions.getNumThread())
109201
.build();
110202
}
111203

@@ -137,6 +229,78 @@ public void setTruncate(Boolean truncate) {
137229
this.truncate = truncate;
138230
}
139231

232+
public Boolean getUseNUMA() {
233+
return this.useNUMA;
234+
}
235+
236+
public void setUseNUMA(Boolean useNUMA) {
237+
this.useNUMA = useNUMA;
238+
}
239+
240+
public Integer getNumBatch() {
241+
return this.numBatch;
242+
}
243+
244+
public void setNumBatch(Integer numBatch) {
245+
this.numBatch = numBatch;
246+
}
247+
248+
public Integer getNumGPU() {
249+
return this.numGPU;
250+
}
251+
252+
public void setNumGPU(Integer numGPU) {
253+
this.numGPU = numGPU;
254+
}
255+
256+
public Integer getMainGPU() {
257+
return this.mainGPU;
258+
}
259+
260+
public void setMainGPU(Integer mainGPU) {
261+
this.mainGPU = mainGPU;
262+
}
263+
264+
public Boolean getLowVRAM() {
265+
return this.lowVRAM;
266+
}
267+
268+
public void setLowVRAM(Boolean lowVRAM) {
269+
this.lowVRAM = lowVRAM;
270+
}
271+
272+
public Boolean getVocabOnly() {
273+
return this.vocabOnly;
274+
}
275+
276+
public void setVocabOnly(Boolean vocabOnly) {
277+
this.vocabOnly = vocabOnly;
278+
}
279+
280+
public Boolean getUseMMap() {
281+
return this.useMMap;
282+
}
283+
284+
public void setUseMMap(Boolean useMMap) {
285+
this.useMMap = useMMap;
286+
}
287+
288+
public Boolean getUseMLock() {
289+
return this.useMLock;
290+
}
291+
292+
public void setUseMLock(Boolean useMLock) {
293+
this.useMLock = useMLock;
294+
}
295+
296+
public Integer getNumThread() {
297+
return this.numThread;
298+
}
299+
300+
public void setNumThread(Integer numThread) {
301+
this.numThread = numThread;
302+
}
303+
140304
@Override
141305
@JsonIgnore
142306
public Integer getDimensions() {
@@ -198,6 +362,51 @@ public Builder truncate(Boolean truncate) {
198362
return this;
199363
}
200364

365+
public Builder useNUMA(Boolean useNUMA) {
366+
this.options.useNUMA = useNUMA;
367+
return this;
368+
}
369+
370+
public Builder numBatch(Integer numBatch) {
371+
this.options.numBatch = numBatch;
372+
return this;
373+
}
374+
375+
public Builder numGPU(Integer numGPU) {
376+
this.options.numGPU = numGPU;
377+
return this;
378+
}
379+
380+
public Builder mainGPU(Integer mainGPU) {
381+
this.options.mainGPU = mainGPU;
382+
return this;
383+
}
384+
385+
public Builder lowVRAM(Boolean lowVRAM) {
386+
this.options.lowVRAM = lowVRAM;
387+
return this;
388+
}
389+
390+
public Builder vocabOnly(Boolean vocabOnly) {
391+
this.options.vocabOnly = vocabOnly;
392+
return this;
393+
}
394+
395+
public Builder useMMap(Boolean useMMap) {
396+
this.options.useMMap = useMMap;
397+
return this;
398+
}
399+
400+
public Builder useMLock(Boolean useMLock) {
401+
this.options.useMLock = useMLock;
402+
return this;
403+
}
404+
405+
public Builder numThread(Integer numThread) {
406+
this.options.numThread = numThread;
407+
return this;
408+
}
409+
201410
public OllamaEmbeddingOptions build() {
202411
return this.options;
203412
}

models/spring-ai-ollama/src/test/java/org/springframework/ai/ollama/OllamaEmbeddingRequestTests.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,8 @@ public class OllamaEmbeddingRequestTests {
4343
public void setUp() {
4444
this.embeddingModel = OllamaEmbeddingModel.builder()
4545
.ollamaApi(OllamaApi.builder().build())
46-
.defaultOptions(OllamaEmbeddingOptions.builder().model("DEFAULT_MODEL").build())
46+
.defaultOptions(
47+
OllamaEmbeddingOptions.builder().model("DEFAULT_MODEL").mainGPU(11).useMMap(true).numGPU(1).build())
4748
.build();
4849
}
4950

0 commit comments

Comments
 (0)