Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;
import org.springframework.ai.transformer.splitter.TextSplitter;
import org.xml.sax.ContentHandler;

import org.springframework.ai.document.Document;
Expand Down Expand Up @@ -85,6 +86,8 @@ public class TikaDocumentReader implements DocumentReader {
*/
private final ExtractedTextFormatter textFormatter;

private TextSplitter textSplitter = new NoOpTextSplitter();

/**
* Constructor initializing the reader with a given resource URL.
* @param resourceUrl URL to the resource
Expand Down Expand Up @@ -135,6 +138,14 @@ public TikaDocumentReader(Resource resource, ContentHandler contentHandler, Extr
this.textFormatter = textFormatter;
}

/**
 * Sets the text splitter that {@code get()} applies to the extracted text before the
 * resulting chunks are converted into documents.
 * @param textSplitter Text splitter; must not be {@code null}
 * @throws IllegalArgumentException if {@code textSplitter} is {@code null}
 */
public void setTextSplitter(TextSplitter textSplitter) {
	// Fail fast: a null splitter would otherwise only surface later, when get()
	// dereferences it and rethrows the NullPointerException wrapped in a RuntimeException.
	if (textSplitter == null) {
		throw new IllegalArgumentException("textSplitter must not be null");
	}
	this.textSplitter = textSplitter;
}

/**
* Extracts and returns the list of documents from the resource.
* @return List of extracted {@link Document}
Expand All @@ -143,7 +154,11 @@ public TikaDocumentReader(Resource resource, ContentHandler contentHandler, Extr
public List<Document> get() {
try (InputStream stream = this.resource.getInputStream()) {
this.parser.parse(stream, this.handler, this.metadata, this.context);
return List.of(toDocument(this.handler.toString()));
return this.textSplitter.splitText(this.handler.toString())
.stream()
.filter(StringUtils::hasText)
.map(this::toDocument)
.toList();
}
catch (Exception e) {
throw new RuntimeException(e);
Expand Down Expand Up @@ -181,4 +196,13 @@ private String resourceName() {
}
}

/**
 * Splitter that performs no splitting: the input text is handed back unchanged as the
 * only element of the returned list.
 */
private static final class NoOpTextSplitter extends TextSplitter {

	/**
	 * Wraps the given text in a single-element list.
	 * @param text the text to pass through
	 * @return an unmodifiable list whose only element is {@code text}
	 */
	@Override
	public List<String> splitText(String text) {
		final List<String> singleChunk = List.of(text);
		return singleChunk;
	}

}

}
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.CsvSource;
import org.springframework.ai.transformer.splitter.TokenTextSplitter;

import static org.assertj.core.api.Assertions.assertThat;

Expand Down Expand Up @@ -46,4 +47,26 @@ public void testDocx(String resourceUri, String resourceName, String contentSnip
assertThat(doc.getContent()).contains(contentSnipped);
}

/**
 * Reads each listed resource through a {@link TikaDocumentReader} configured with a
 * {@link TokenTextSplitter} and checks the number of produced documents, the source
 * metadata of the first document and that its content contains the expected snippet.
 *
 * NOTE(review): several snippet columns contain unquoted commas; since
 * {@code @CsvSource} splits on commas, {@code contentSnipped} presumably receives
 * only the text up to the first comma - confirm, and consider single-quoting them.
 * NOTE(review): the last row reads a remote URL, so it needs network access and its
 * expected document count presumably depends on the live page content.
 */
@ParameterizedTest
@CsvSource({ "classpath:/word-sample.docx,word-sample.docx,3,This document has embedded the Ubuntu font family.",
		"classpath:/word-sample.doc,word-sample.doc,3,The paper size is set to Letter, which is 8 ½ x 11.",
		"classpath:/sample2.pdf,sample2.pdf,3,put all source .tex files in one directory, then chdir to the directory",
		"classpath:/sample.ppt,sample.ppt,1,Sed ipsum tortor, fringilla a consectetur eget, cursus posuere sem.",
		"classpath:/sample.pptx,sample.pptx,1,Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
		"https://docs.spring.io/spring-ai/reference/,https://docs.spring.io/spring-ai/reference/,2,help set up essential dependencies and classes." })
public void testDocsWithTextSplitter(String resourceUri, String resourceName, int documentCount,
		String contentSnipped) {

	// Reader under test, switched from the default splitter to token-based splitting.
	TikaDocumentReader tikaReader = new TikaDocumentReader(resourceUri);
	tikaReader.setTextSplitter(new TokenTextSplitter());

	var chunks = tikaReader.get();
	assertThat(chunks).hasSize(documentCount);

	// Only the first chunk is inspected for metadata and content.
	var firstChunk = chunks.get(0);
	var firstChunkMetadata = firstChunk.getMetadata();

	assertThat(firstChunkMetadata).containsKeys(TikaDocumentReader.METADATA_SOURCE);
	assertThat(firstChunkMetadata.get(TikaDocumentReader.METADATA_SOURCE)).isEqualTo(resourceName);
	assertThat(firstChunk.getContent()).contains(contentSnipped);
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,6 @@ private List<Document> createDocuments(List<String> texts, List<ContentFormatter
return documents;
}

protected abstract List<String> splitText(String text);
/**
 * Splits the supplied text into a list of string chunks. The chunking strategy is
 * left to each concrete subclass.
 * @param text the text to split
 * @return the chunks produced from {@code text}
 */
public abstract List<String> splitText(String text);

}
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ public TokenTextSplitter(boolean keepSeparator) {
private final Encoding encoding = registry.getEncoding(EncodingType.CL100K_BASE);

@Override
protected List<String> splitText(String text) {
public List<String> splitText(String text) {
// Delegates to split(...), passing defaultChunkSize as the second argument.
return split(text, defaultChunkSize);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ public class TextSplitterTests {
static TextSplitter testTextSplitter = new TextSplitter() {

@Override
protected List<String> splitText(String text) {
public List<String> splitText(String text) {
int chuckSize = text.length() / 2;

List<String> chunks = new ArrayList<>();
Expand Down