JohnSnowLabs · DevinTDHa · Oct 20, 2025 · Oct 14, 2025
diff --git a/python/sparknlp/annotator/seq2seq/auto_gguf_model.py b/python/sparknlp/annotator/seq2seq/auto_gguf_model.py
@@ -12,12 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Contains classes for the AutoGGUFModel."""
-from typing import List, Dict
-
 from sparknlp.common import *
 
 
-class AutoGGUFModel(AnnotatorModel, HasBatchedAnnotate, HasLlamaCppProperties):
+class AutoGGUFModel(AnnotatorModel, HasBatchedAnnotate, HasLlamaCppProperties, CompletionPostProcessing):
  """
  Annotator that uses the llama.cpp library to generate text completions with large language
  models.
@@ -243,7 +241,6 @@ class AutoGGUFModel(AnnotatorModel, HasBatchedAnnotate, HasLlamaCppProperties):
  inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
  outputAnnotatorType = AnnotatorType.DOCUMENT
 
-
  @keyword_only
  def __init__(self, classname="com.johnsnowlabs.nlp.annotators.seq2seq.AutoGGUFModel", java_model=None):
  super(AutoGGUFModel, self).__init__(

diff --git a/python/sparknlp/annotator/seq2seq/auto_gguf_vision_model.py b/python/sparknlp/annotator/seq2seq/auto_gguf_vision_model.py
@@ -15,7 +15,7 @@
 from sparknlp.common import *
 
 
-class AutoGGUFVisionModel(AnnotatorModel, HasBatchedAnnotate, HasLlamaCppProperties):
+class AutoGGUFVisionModel(AnnotatorModel, HasBatchedAnnotate, HasLlamaCppProperties, CompletionPostProcessing):
  """Multimodal annotator that uses the llama.cpp library to generate text completions with large
  language models. It supports ingesting images for captioning.
 

diff --git a/python/sparknlp/common/__init__.py b/python/sparknlp/common/__init__.py
@@ -23,3 +23,4 @@
 from sparknlp.common.utils import *
 from sparknlp.common.annotator_type import *
 from sparknlp.common.match_strategy import *
+from sparknlp.common.completion_post_processing import *
diff --git a/python/sparknlp/common/completion_post_processing.py b/python/sparknlp/common/completion_post_processing.py
@@ -0,0 +1,37 @@
+# Copyright 2017-2025 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from pyspark.ml.param import Param, Params, TypeConverters
+
+
+class CompletionPostProcessing:
+ removeThinkingTag = Param(
+ Params._dummy(),
+ "removeThinkingTag",
+ "Set a thinking tag (e.g. think) to be removed from output. Will match <TAG>...</TAG>",
+ typeConverter=TypeConverters.toString,
+ )
+
+ def setRemoveThinkingTag(self, value: str):
+ """Set a thinking tag (e.g. `think`) to be removed from output.
+ Will produce the regex: `(?s)<$TAG>.+?</$TAG>`
+ """
+ self._set(removeThinkingTag=value)
+ return self
+
+ def getRemoveThinkingTag(self):
+ """Get the thinking tag to be removed from output."""
+ value = None
+ if self.removeThinkingTag in self._paramMap:
+ value = self._paramMap[self.removeThinkingTag]
+ return value
diff --git a/python/test/annotator/seq2seq/auto_gguf_model_test.py b/python/test/annotator/seq2seq/auto_gguf_model_test.py
@@ -257,6 +257,7 @@ def runTest(self):
  model_writer.save(model_path)
  AutoGGUFModel.load(model_path)
 
+
  model_path = "file:///tmp/autogguf_spark_nlp"
  AutoGGUFModel.load(model_path)
 
@@ -293,3 +294,40 @@ def runTest(self):
 
  print(f"Freed RAM after closing the model: {ramChange} MB")
  assert (ramChange < -100, "Freed RAM should be greater than 100 MB")
+
+
+@pytest.mark.slow
+class AutoGGUFModelThinkingTagTestSpec(unittest.TestCase):
+ def setUp(self):
+ self.spark = SparkContextForTest.spark
+
+ def runTest(self):
+ document_assembler = (
+ DocumentAssembler().setInputCol("text").setOutputCol("document")
+ )
+
+ think_tag = "think"
+
+ model = (
+ AutoGGUFModel.loadSavedModel("models/Qwen3-8B-Q4_K_M.gguf", self.spark)
+ .setInputCols(["document"])
+ .setOutputCol("completions")
+ .setRemoveThinkingTag(think_tag)
+ .setNPredict(500)
+ .setTemperature(0.1)
+ )
+
+ data = self.spark.createDataFrame(
+ [("What is the meaning of life? Think shortly step by step.",)],
+ ["text"]
+ )
+
+ pipeline = Pipeline(stages=[document_assembler, model])
+ result = pipeline.fit(data).transform(data)
+
+ completions = result.select("completions").collect()
+ completion = completions[0][0][0].result
+
+ print(completion)
+ assert f"<{think_tag}>" not in completion
+ assert f"</{think_tag}>" not in completion
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModel.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModel.scala
@@ -124,7 +124,8 @@ class AutoGGUFModel(override val uid: String)
  with HasEngine
  with HasLlamaCppModelProperties
  with HasLlamaCppInferenceProperties
- with HasProtectedParams {
+ with HasProtectedParams
+ with CompletionPostProcessing {
 
  override val outputAnnotatorType: AnnotatorType = AnnotatorType.DOCUMENT
  override val inputAnnotatorTypes: Array[AnnotatorType] = Array(AnnotatorType.DOCUMENT)
@@ -204,7 +205,8 @@ class AutoGGUFModel(override val uid: String)
  inferenceParams,
  getSystemPrompt,
  annotationsText)
- (results, Map.empty)
+ val resultsCleaned = processCompletions(results)
+ (resultsCleaned, Map.empty)
  } catch {
  case e: LlamaException =>
  logger.error("Error in llama.cpp batch completion", e)

diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFVisionModel.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFVisionModel.scala
@@ -157,7 +157,8 @@ class AutoGGUFVisionModel(override val uid: String)
  with HasEngine
  with HasLlamaCppModelProperties
  with HasLlamaCppInferenceProperties
- with HasProtectedParams {
+ with HasProtectedParams
+ with CompletionPostProcessing {
  override val inputAnnotatorTypes: Array[AnnotatorType] =
  Array(AnnotatorType.IMAGE, AnnotatorType.DOCUMENT)
  override val outputAnnotatorType: AnnotatorType = AnnotatorType.DOCUMENT
@@ -242,14 +243,14 @@ class AutoGGUFVisionModel(override val uid: String)
  .zip(base64EncodedImages)
  .map { case (prompt, base64Image) =>
  try {
- (
- LlamaExtensions.completeImage(
-  model,
-  getInferenceParameters,
-  getSystemPrompt,
-  prompt,
-  base64Image),
-   Map.empty[String, String])
+ val results = LlamaExtensions.completeImage(
+ model,
+ getInferenceParameters,
+ getSystemPrompt,
+ prompt,
+ base64Image)
+ val resultsCleaned = processCompletions(Array(results)).head
+ (resultsCleaned, Map.empty[String, String])
  } catch {
  case e: LlamaException =>
  logger.error("Error in llama.cpp image batch completion", e)

diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/CompletionPostProcessing.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/CompletionPostProcessing.scala
@@ -0,0 +1,31 @@
+package com.johnsnowlabs.nlp.annotators.seq2seq
+
+import org.apache.spark.ml.param.{Param, Params}
+
+private[nlp] trait CompletionPostProcessing {
+  this: Params =>
+
+  /** Name of a thinking tag (e.g. `think`) whose `<TAG>...</TAG>` block is removed from each
+    * completion. No removal happens while the parameter is unset.
+    *
+    * @group param
+    */
+  val removeThinkingTag =
+    new Param[String](
+      this,
+      "removeThinkingTag",
+      "Set a thinking tag (e.g. think) to be removed from output. Will match <TAG>...</TAG>")
+
+  /** Set a thinking tag (e.g. `think`) to be removed from output. The tag name is matched
+    * literally with the regex `(?s)<TAG>.*?</TAG>`.
+    *
+    * @group setParam
+    */
+  def setRemoveThinkingTag(value: String): this.type = set(removeThinkingTag, value)
+
+  /** Thinking tag to be removed from output, or `None` if it was never set.
+    *
+    * @group getParam
+    */
+  def getRemoveThinkingTag: Option[String] = get(removeThinkingTag)
+
+  /** Removes the first `<TAG>...</TAG>` block from every completion and trims the result.
+    *
+    * @param results
+    *   Raw completions
+    * @return
+    *   The cleaned completions, or `results` unchanged when no thinking tag is set
+    */
+  protected def processCompletions(results: Array[String]): Array[String] = {
+    getRemoveThinkingTag match {
+      case Some(thinkingTag) =>
+        // Quote the user supplied tag so regex metacharacters in it are matched literally
+        // instead of altering (or breaking) the pattern.
+        val literalTag = java.util.regex.Pattern.quote(thinkingTag)
+        // Compile once for the whole batch. (?s) lets `.` span newlines and `.*?` stops at
+        // the first closing tag.
+        val thinkingBlock =
+          java.util.regex.Pattern.compile(s"(?s)<$literalTag>.*?</$literalTag>")
+        results.map(text => thinkingBlock.matcher(text).replaceFirst("").trim)
+      case None => results
+    }
+  }
+}
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModelTest.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModelTest.scala
@@ -252,6 +252,27 @@ class AutoGGUFModelTest extends AnyFlatSpec {
  assert(ramChange < -100, "Freed RAM should be greater than 100 MB")
  }
 
+ it should "be able to remove thinking tags" taggedAs SlowTest in {
+   // Tag whose <tag>...</tag> block must not show up in the completion
+   val tagName = "think"
+
+   val prompts =
+     Seq("What is the meaning of life? Think shortly step by step.").toDF("text")
+
+   val loadedModel =
+     AutoGGUFModel.loadSavedModel("models/Qwen3-8B-Q4_K_M.gguf", ResourceHelper.spark)
+   val model = loadedModel
+     .setInputCols("document")
+     .setOutputCol("completions")
+     .setRemoveThinkingTag(tagName)
+     .setNPredict(500)
+     .setTemperature(0.1f)
+
+   val stages = Array(documentAssembler, model)
+   val fitted = new Pipeline().setStages(stages).fit(prompts)
+   val transformed = fitted.transform(prompts)
+
+   val annotations = Annotation.collect(transformed, "completions").flatten
+   val completion = annotations.head.result
+   println(completion)
+
+   // Neither the opening nor the closing tag may be left in the text
+   val tagMarkers = Seq(s"<$tagName>", s"</$tagName>")
+   assert(!tagMarkers.exists(marker => completion.contains(marker)))
+ }
+
 // it should "benchmark" taggedAs SlowTest in {
 // val model = AutoGGUFModel
 // .loadSavedModel("models/gemma-3-4b-it-qat-Q4_K_M.gguf", ResourceHelper.spark)

diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFVisionModelTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFVisionModelTestSpec.scala
@@ -156,4 +156,34 @@ class AutoGGUFVisionModelTestSpec extends AnyFlatSpec {
  println("Freed RAM after closing the model: " + ramChange + " MB")
  assert(ramChange < -100, "Freed RAM should be greater than 100 MB")
  }
+
+ it should "be able to remove thinking tags" taggedAs SlowTest in {
+   // Tag whose <tag>...</tag> block must not show up in the completion
+   val tagName = "think"
+
+   val loadedModel = AutoGGUFVisionModel.loadSavedModel(
+     "models/SmolVLM-256M-Instruct-Q8_0.gguf",
+     "models/mmproj-SmolVLM-256M-Instruct-Q8_0.gguf",
+     ResourceHelper.spark)
+   val model = loadedModel
+     .setInputCols("caption_document", "image_assembler")
+     .setOutputCol("completions")
+     .setRemoveThinkingTag(tagName)
+     .setNPredict(500)
+     .setTemperature(0.1f)
+
+   val stages = Array(documentAssembler, imageAssembler, model)
+   val pipeline = new Pipeline().setStages(stages)
+
+   // A single image row, with its caption replaced by a prompt that asks for reasoning
+   val thinkingPrompt =
+     lit("What is the meaning of life? Think real hard and relate it to the image.")
+   val dataThinking = data.limit(1).withColumn("caption", thinkingPrompt)
+
+   dataThinking.select("caption").show(false)
+
+   val transformed = pipeline.fit(dataThinking).transform(dataThinking)
+
+   val annotations = Annotation.collect(transformed, "completions").flatten
+   val completion = annotations.head.result
+   println(completion)
+
+   // Neither the opening nor the closing tag may be left in the text
+   val tagMarkers = Seq(s"<$tagName>", s"</$tagName>")
+   assert(!tagMarkers.exists(marker => completion.contains(marker)))
+ }
 }