Skip to content

Commit 499ab3e

Browse files
ericbottard authored and ilayaperumalg committed
Add support for multiple documents in MarkdownDocumentReader.
Closes #2715
Signed-off-by: Eric Bottard <eric.bottard@broadcom.com>
1 parent 0cea262 commit 499ab3e

File tree

6 files changed

+126
-20
lines changed

6 files changed

+126
-20
lines changed

document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/MarkdownDocumentReader.java

Lines changed: 44 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright 2023-2024 the original author or authors.
2+
* Copyright 2023-2025 the original author or authors.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -37,8 +37,8 @@
3737
import org.springframework.ai.document.Document;
3838
import org.springframework.ai.document.DocumentReader;
3939
import org.springframework.ai.reader.markdown.config.MarkdownDocumentReaderConfig;
40-
import org.springframework.core.io.DefaultResourceLoader;
4140
import org.springframework.core.io.Resource;
41+
import org.springframework.core.io.support.PathMatchingResourcePatternResolver;
4242

4343
/**
4444
* Reads the given Markdown resource and groups headers, paragraphs, or text divided by
@@ -51,9 +51,9 @@
5151
public class MarkdownDocumentReader implements DocumentReader {
5252

5353
/**
54-
* The resource points to the Markdown document.
54+
* The resources read by this document reader.
5555
*/
56-
private final Resource markdownResource;
56+
private final Resource[] markdownResources;
5757

5858
/**
5959
* Configuration to a parsing process.
@@ -67,48 +67,72 @@ public class MarkdownDocumentReader implements DocumentReader {
6767

6868
/**
6969
* Create a new {@link MarkdownDocumentReader} instance.
70-
* @param markdownResource the resource to read
70+
* @param markdownResources the resources to read, will be resolved via
71+
* {@link PathMatchingResourcePatternResolver}
7172
*/
72-
public MarkdownDocumentReader(String markdownResource) {
73-
this(new DefaultResourceLoader().getResource(markdownResource), MarkdownDocumentReaderConfig.defaultConfig());
73+
public MarkdownDocumentReader(String markdownResources) {
74+
this(markdownResources, MarkdownDocumentReaderConfig.defaultConfig());
7475
}
7576

7677
/**
7778
* Create a new {@link MarkdownDocumentReader} instance.
78-
* @param markdownResource the resource to read
79+
* @param markdownResources the resources to read, will be resolved via
80+
* {@link PathMatchingResourcePatternResolver}
7981
* @param config the configuration to use
8082
*/
81-
public MarkdownDocumentReader(String markdownResource, MarkdownDocumentReaderConfig config) {
82-
this(new DefaultResourceLoader().getResource(markdownResource), config);
83+
public MarkdownDocumentReader(String markdownResources, MarkdownDocumentReaderConfig config) {
84+
this(resolveResources(markdownResources), config);
8385
}
8486

8587
/**
86-
* Create a new {@link MarkdownDocumentReader} instance.
88+
* Create a new {@link MarkdownDocumentReader} instance using a single
89+
* {@link Resource}.
8790
* @param markdownResource the resource to read
8891
*/
8992
public MarkdownDocumentReader(Resource markdownResource, MarkdownDocumentReaderConfig config) {
90-
this.markdownResource = markdownResource;
93+
this(List.of(markdownResource), config);
94+
}
95+
96+
/**
97+
* Create a new {@link MarkdownDocumentReader} instance using already resolved
98+
* {@link Resource resources}.
99+
* @param markdownResources the resources to read
100+
*/
101+
public MarkdownDocumentReader(List<Resource> markdownResources, MarkdownDocumentReaderConfig config) {
102+
this.markdownResources = markdownResources.toArray(new Resource[0]);
91103
this.config = config;
92104
this.parser = Parser.builder().build();
93105
}
94106

107+
private static List<Resource> resolveResources(String markdownResources) {
108+
try {
109+
return List.of(new PathMatchingResourcePatternResolver().getResources(markdownResources));
110+
}
111+
catch (IOException e) {
112+
throw new RuntimeException(e);
113+
}
114+
}
115+
95116
/**
96117
* Extracts and returns a list of documents from the resource.
97118
* @return List of extracted {@link Document}
98119
*/
99120
@Override
100121
public List<Document> get() {
101-
try (var input = this.markdownResource.getInputStream()) {
102-
Node node = this.parser.parseReader(new InputStreamReader(input));
103-
122+
List<Document> documents = new ArrayList<>();
123+
for (Resource markdownResource : this.markdownResources) {
104124
DocumentVisitor documentVisitor = new DocumentVisitor(this.config);
105-
node.accept(documentVisitor);
125+
try (var input = markdownResource.getInputStream()) {
126+
Node node = this.parser.parseReader(new InputStreamReader(input));
106127

107-
return documentVisitor.getDocuments();
108-
}
109-
catch (IOException e) {
110-
throw new RuntimeException(e);
128+
node.accept(documentVisitor);
129+
documents.addAll(documentVisitor.getDocuments());
130+
}
131+
catch (IOException e) {
132+
throw new RuntimeException(e);
133+
}
111134
}
135+
return documents;
112136
}
113137

114138
/**

document-readers/markdown-reader/src/test/java/org/springframework/ai/reader/markdown/MarkdownDocumentReaderTest.java

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,10 +28,49 @@
2828
import static org.assertj.core.groups.Tuple.tuple;
2929

3030
/**
31+
* Unit tests for {@link MarkdownDocumentReader}.
32+
*
3133
* @author Piotr Olaszewski
34+
* @author shown.Ji
35+
* @author Eric Bottard
3236
*/
3337
class MarkdownDocumentReaderTest {
3438

39+
@Test
40+
void testDirPathSingle() {
41+
MarkdownDocumentReader reader = new MarkdownDocumentReader("classpath:/dir-test-1/*.md");
42+
43+
List<Document> documents = reader.get();
44+
45+
assertThat(documents).hasSize(2)
46+
.extracting(Document::getMetadata, Document::getText)
47+
.containsOnly(tuple(Map.of(),
48+
"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur diam eros, laoreet sit amet cursus vitae, varius sed nisi. Cras sit amet quam quis velit commodo porta consectetur id nisi. Phasellus tincidunt pulvinar augue."),
49+
tuple(Map.of("category", "blockquote"),
50+
"Proin vel laoreet leo, sed luctus augue. Sed et ligula commodo, commodo lacus at, consequat turpis. Maecenas eget sapien odio. Maecenas urna lectus, pellentesque in accumsan aliquam, congue eu libero. Ut rhoncus nec justo a porttitor. Pellentesque auctor pharetra eros, viverra sodales lorem aliquet id. Curabitur semper nisi vel sem interdum suscipit."));
51+
}
52+
53+
@Test
54+
void testDirPathMultiple() {
55+
MarkdownDocumentReader reader = new MarkdownDocumentReader("classpath:/dir-test-2/*.md");
56+
List<Document> documents = reader.get();
57+
58+
assertThat(documents).hasSize(6)
59+
.extracting(Document::getMetadata, Document::getText)
60+
.containsOnly(tuple(Map.of("category", "header_1", "title", "This is a fancy header name"),
61+
"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt velit non bibendum gravida. Cras accumsan tincidunt ornare. Donec hendrerit consequat tellus blandit accumsan. Aenean aliquam metus at arcu elementum dignissim."),
62+
tuple(Map.of("category", "header_3", "title", "Header 3"),
63+
"Aenean eu leo eu nibh tristique posuere quis quis massa."),
64+
tuple(Map.of("category", "header_1", "title", "Header 1a"),
65+
"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur diam eros, laoreet sit amet cursus vitae, varius sed nisi. Cras sit amet quam quis velit commodo porta consectetur id nisi. Phasellus tincidunt pulvinar augue."),
66+
tuple(Map.of("category", "header_1", "title", "Header 1b"),
67+
"Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia curae; Etiam lobortis risus libero, sed sollicitudin risus cursus in. Morbi enim metus, ornare vel lacinia eget, venenatis vel nibh."),
68+
tuple(Map.of("category", "header_2", "title", "Header 2b"),
69+
"Proin vel laoreet leo, sed luctus augue. Sed et ligula commodo, commodo lacus at, consequat turpis. Maecenas eget sapien odio. Maecenas urna lectus, pellentesque in accumsan aliquam, congue eu libero."),
70+
tuple(Map.of("category", "header_2", "title", "Header 2c"),
71+
"Ut rhoncus nec justo a porttitor. Pellentesque auctor pharetra eros, viverra sodales lorem aliquet id. Curabitur semper nisi vel sem interdum suscipit."));
72+
}
73+
3574
@Test
3675
void testOnlyHeadersWithParagraphs() {
3776
MarkdownDocumentReader reader = new MarkdownDocumentReader("classpath:/only-headers.md");
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur diam eros, laoreet sit amet cursus vitae, varius sed
2+
nisi. Cras sit amet quam quis velit commodo porta consectetur id nisi. Phasellus tincidunt pulvinar augue.
3+
4+
> Proin vel laoreet leo, sed luctus augue. Sed et ligula commodo, commodo lacus at, consequat turpis. Maecenas eget
5+
> sapien odio. Maecenas urna lectus, pellentesque in accumsan aliquam, congue eu libero. Ut rhoncus nec justo a
6+
> porttitor. Pellentesque auctor pharetra eros, viverra sodales lorem aliquet id. Curabitur semper nisi vel sem interdum
7+
> suscipit.
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur diam eros, laoreet sit amet cursus vitae, varius sed
2+
nisi. Cras sit amet quam quis velit commodo porta consectetur id nisi. Phasellus tincidunt pulvinar augue.
3+
4+
> Proin vel laoreet leo, sed luctus augue. Sed et ligula commodo, commodo lacus at, consequat turpis. Maecenas eget
5+
> sapien odio. Maecenas urna lectus, pellentesque in accumsan aliquam, congue eu libero. Ut rhoncus nec justo a
6+
> porttitor. Pellentesque auctor pharetra eros, viverra sodales lorem aliquet id. Curabitur semper nisi vel sem interdum
7+
> suscipit.
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# Header 1a
2+
3+
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur diam eros, laoreet sit amet cursus vitae, varius sed
4+
nisi. Cras sit amet quam quis velit commodo porta consectetur id nisi. Phasellus tincidunt pulvinar augue.
5+
6+
# Header 1b
7+
8+
Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia curae; Etiam lobortis risus libero, sed
9+
sollicitudin risus cursus in. Morbi enim metus, ornare vel lacinia eget, venenatis vel nibh.
10+
11+
## Header 2b
12+
13+
Proin vel laoreet leo, sed luctus augue. Sed et ligula commodo, commodo lacus at, consequat turpis. Maecenas eget sapien
14+
odio. Maecenas urna lectus, pellentesque in accumsan aliquam, congue eu libero.
15+
16+
# Header 1c
17+
18+
## Header 2c
19+
20+
Ut rhoncus nec justo a porttitor. Pellentesque auctor pharetra eros, viverra sodales lorem aliquet id. Curabitur semper nisi vel sem interdum suscipit.
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# This is a fancy header name
2+
3+
Lorem ipsum dolor sit amet, **consectetur adipiscing elit**. Donec tincidunt velit non bibendum gravida. Cras accumsan
4+
tincidunt ornare. Donec hendrerit consequat tellus *blandit* accumsan. Aenean aliquam metus at ***arcu elementum***
5+
dignissim.
6+
7+
### Header 3
8+
9+
Aenean eu leo eu nibh tristique _posuere quis quis massa_.

0 commit comments

Comments
 (0)