Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,15 @@

package org.springframework.ai.reader.markdown;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URISyntaxException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

import org.commonmark.node.AbstractVisitor;
import org.commonmark.node.BlockQuote;
Expand All @@ -33,27 +38,37 @@
import org.commonmark.node.Text;
import org.commonmark.node.ThematicBreak;
import org.commonmark.parser.Parser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.springframework.ai.document.Document;
import org.springframework.ai.document.DocumentReader;
import org.springframework.ai.reader.markdown.config.MarkdownDocumentReaderConfig;
import org.springframework.core.io.DefaultResourceLoader;
import org.springframework.core.io.Resource;
import org.springframework.util.Assert;

/**
* Reads the given Markdown resource and groups headers, paragraphs, or text divided by
* horizontal lines (depending on the
* {@link MarkdownDocumentReaderConfig#horizontalRuleCreateDocument} configuration) into
* {@link Document}s.
* {@link Document}s. Currently, only Markdown resource files on the classpath are
* supported; a directory path may also be given, in which case the Markdown files it
* contains are read. For usage examples, see
* {@link org.springframework.ai.reader.markdown.MarkdownDocumentReaderTest#testDirPathSingle()}
* and
* {@link org.springframework.ai.reader.markdown.MarkdownDocumentReaderTest#testMultipleMarkdownFiles()}.
*
* @author Piotr Olaszewski
* @author shown.Ji
*/
public class MarkdownDocumentReader implements DocumentReader {

private final static Logger logger = LoggerFactory.getLogger(MarkdownDocumentReader.class);

/**
* The resource points to the Markdown document.
*/
private final Resource markdownResource;
private final List<Resource> markdownResources;

/**
* Configuration to a parsing process.
Expand All @@ -67,27 +82,56 @@ public class MarkdownDocumentReader implements DocumentReader {

/**
* Create a new {@link MarkdownDocumentReader} instance.
* @param markdownResource the resource to read
* @param markdownResourcePath the markdown file resource path to read
*/
public MarkdownDocumentReader(String markdownResource) {
this(new DefaultResourceLoader().getResource(markdownResource), MarkdownDocumentReaderConfig.defaultConfig());
public MarkdownDocumentReader(String markdownResourcePath) {
this(loadResources(loadResourcePaths(markdownResourcePath)), MarkdownDocumentReaderConfig.defaultConfig());
}

/**
* Create a new {@link MarkdownDocumentReader} instance.
* @param markdownResource the resource to read
* @param markdownResourcePath the resource path
* @param config the configuration to use
*/
public MarkdownDocumentReader(String markdownResource, MarkdownDocumentReaderConfig config) {
this(new DefaultResourceLoader().getResource(markdownResource), config);
public MarkdownDocumentReader(String markdownResourcePath, MarkdownDocumentReaderConfig config) {
this(loadResources(loadResourcePaths(markdownResourcePath)), config);
}

/**
* Create a new {@link MarkdownDocumentReader} instance that reads several Markdown
* resources using {@link MarkdownDocumentReaderConfig#defaultConfig()}.
* <p>
* Each entry is handed to the resource loader as-is; unlike the single-path
* constructors, entries are not expanded, so a directory entry is not replaced by the
* files it contains.
* @param markdownResourcePaths the resource paths to read
*/
public MarkdownDocumentReader(List<String> markdownResourcePaths) {
this(loadResources(markdownResourcePaths), MarkdownDocumentReaderConfig.defaultConfig());
}

/**
* Create a new {@link MarkdownDocumentReader} instance that reads a single resource
* using {@link MarkdownDocumentReaderConfig#defaultConfig()}.
* @param markdownResource the Markdown resource to read
*/
public MarkdownDocumentReader(Resource markdownResource) {
this(markdownResource, MarkdownDocumentReaderConfig.defaultConfig());
}

/**
* Create a new {@link MarkdownDocumentReader} instance.
* @param markdownResource the markdown file resource to read
* @param config the configuration to use
*/
public MarkdownDocumentReader(Resource markdownResource, MarkdownDocumentReaderConfig config) {
this.markdownResource = markdownResource;
this(List.of(markdownResource), config);
}

/**
 * Create a new {@link MarkdownDocumentReader} instance that reads several resources.
 * <p>
 * The given list is copied, so changes the caller makes to it afterwards do not affect
 * what this reader returns.
 * @param markdownResource the Markdown resources to read; must not be {@code null} or
 * empty
 * @param config the configuration to use
 * @throws IllegalArgumentException if {@code markdownResource} is {@code null} or empty
 */
public MarkdownDocumentReader(List<Resource> markdownResource, MarkdownDocumentReaderConfig config) {
	Assert.notEmpty(markdownResource, "Markdown resource must not be empty");

	// Snapshot the list: the field is final, but the caller's list may still be mutable.
	this.markdownResources = new ArrayList<>(markdownResource);
	this.config = config;
	this.parser = Parser.builder().build();
}
Expand All @@ -98,17 +142,35 @@ public MarkdownDocumentReader(Resource markdownResource, MarkdownDocumentReaderC
*/
@Override
public List<Document> get() {
try (var input = this.markdownResource.getInputStream()) {
Node node = this.parser.parseReader(new InputStreamReader(input));

DocumentVisitor documentVisitor = new DocumentVisitor(this.config);
node.accept(documentVisitor);
return this.markdownResources.stream()
.flatMap(markdownResource -> getDocuments(markdownResource).stream())
.collect(Collectors.toList());
}

/**
 * Parse one Markdown resource and return the documents extracted from it.
 * <p>
 * The content is decoded as UTF-8 rather than with the platform default charset, so
 * the result does not depend on the machine the reader runs on.
 * @param markdownResource the resource to parse
 * @return the documents collected by the {@link DocumentVisitor}
 * @throws RuntimeException wrapping the {@link IOException} if the resource is a
 * missing file or cannot be read
 */
private List<Document> getDocuments(Resource markdownResource) {
	try {
		// Fail with a message that names the file instead of a generic stream error.
		if (markdownResource.isFile() && !markdownResource.exists()) {
			throw new FileNotFoundException("Resource does not exist: " + markdownResource.getFilename());
		}

		logger.debug("Attempting to read resource: {}", markdownResource.getDescription());
		try (InputStream input = markdownResource.getInputStream()) {
			Node node = this.parser.parseReader(new InputStreamReader(input, StandardCharsets.UTF_8));

			DocumentVisitor documentVisitor = new DocumentVisitor(this.config);
			node.accept(documentVisitor);

			return documentVisitor.getDocuments();
		}
	}
	catch (IOException e) {
		logger.error("Error reading markdown resource: {}", e.getMessage(), e);
		throw new RuntimeException("Error reading markdown resource", e);
	}
}

/**
Expand Down Expand Up @@ -247,4 +309,61 @@ private void translateLineBreakToSpace() {

}

/**
 * Resolve each of the given locations to a {@link Resource}.
 * @param markdownResourcePaths the resource locations to resolve
 * @return the resolved resources, in the order of the given locations
 */
private static List<Resource> loadResources(List<String> markdownResourcePaths) {
	DefaultResourceLoader loader = new DefaultResourceLoader();
	List<Resource> resolved = new ArrayList<>(markdownResourcePaths.size());
	for (String location : markdownResourcePaths) {
		resolved.add(loader.getResource(location));
	}
	return resolved;
}

/**
 * Expand a single location into the Markdown resource locations it denotes.
 * <p>
 * A location that names a directory expands to the {@code .md} files directly inside
 * it (sub-directories are not searched), sorted by path name. A location that names a
 * single {@code .md} file yields just that file. Anything else yields an empty list.
 * <p>
 * Locations prefixed with {@code classpath:} are looked up from the classpath root;
 * all other locations are treated as file-system paths and returned as {@code file:}
 * URLs, because the resource loader would treat a bare absolute path as a classpath
 * location.
 * @param resourcePath markdown resource path
 * @return a list of resource locations, possibly empty
 */
private static List<String> loadResourcePaths(String resourcePath) {
	final String classpathPrefix = "classpath:";
	List<String> resources = new ArrayList<>();

	if (resourcePath.startsWith(classpathPrefix)) {
		// Strip only the leading prefix; String#replace would remove later occurrences too.
		String location = resourcePath.substring(classpathPrefix.length());
		// Class#getResource resolves a name without a leading slash against the class's
		// package; anchor it at the classpath root instead.
		String absoluteLocation = location.startsWith("/") ? location : "/" + location;
		URL resourceURL = MarkdownDocumentReader.class.getResource(absoluteLocation);
		if (resourceURL == null) {
			return resources;
		}

		File file = fileSystemLocation(resourceURL);
		if (file == null) {
			// Not on the file system (for example inside a jar): a directory cannot be
			// listed here, but a single resource can still be passed on unchanged.
			if (resourcePath.endsWith(".md")) {
				resources.add(resourcePath);
			}
		}
		else if (file.isDirectory()) {
			String directory = absoluteLocation.endsWith("/")
					? absoluteLocation.substring(0, absoluteLocation.length() - 1) : absoluteLocation;
			for (File mdFile : markdownFilesIn(file)) {
				// Keep the directory in the location so the file is loaded from where it
				// was found, not from a same-named file at the classpath root.
				resources.add(classpathPrefix + directory + "/" + mdFile.getName());
			}
		}
		else if (file.exists() && file.getName().endsWith(".md")) {
			resources.add(resourcePath);
		}
	}
	else {
		File file = new File(resourcePath);
		if (file.isDirectory()) {
			for (File mdFile : markdownFilesIn(file)) {
				resources.add(mdFile.toURI().toString());
			}
		}
		else if (file.exists() && file.getName().endsWith(".md")) {
			resources.add(file.toURI().toString());
		}
	}

	return resources;
}

/**
 * Convert a {@code file:} URL to a {@link File}, decoding percent-escapes such as
 * {@code %20}.
 * @param url the URL to convert
 * @return the file, or {@code null} if the URL does not use the {@code file} protocol
 */
private static File fileSystemLocation(URL url) {
	if (!"file".equals(url.getProtocol())) {
		return null;
	}
	try {
		return new File(url.toURI());
	}
	catch (URISyntaxException | IllegalArgumentException ex) {
		// Fall back to the raw path when the URL is not a well-formed hierarchical URI.
		return new File(url.getFile());
	}
}

/**
 * List the regular files named {@code *.md} directly inside the given directory.
 * @param directory the directory to list
 * @return the matching files sorted by path name; empty if the directory cannot be
 * listed
 */
private static File[] markdownFilesIn(File directory) {
	File[] files = directory.listFiles((dir, name) -> name.endsWith(".md") && new File(dir, name).isFile());
	if (files == null) {
		return new File[0];
	}
	// File#listFiles gives no ordering guarantee; sort so results are reproducible.
	Arrays.sort(files);
	return files;
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,67 @@

/**
* @author Piotr Olaszewski
* @author shown.Ji
*/
class MarkdownDocumentReaderTest {

@Test
void testDirPathSingle() {
	// Expected (metadata, text) pairs: one plain paragraph and one blockquote.
	var paragraph = tuple(Map.of(),
			"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur diam eros, laoreet sit amet cursus vitae, varius sed nisi. Cras sit amet quam quis velit commodo porta consectetur id nisi. Phasellus tincidunt pulvinar augue.");
	var blockquote = tuple(Map.of("category", "blockquote"),
			"Proin vel laoreet leo, sed luctus augue. Sed et ligula commodo, commodo lacus at, consequat turpis. Maecenas eget sapien odio. Maecenas urna lectus, pellentesque in accumsan aliquam, congue eu libero. Ut rhoncus nec justo a porttitor. Pellentesque auctor pharetra eros, viverra sodales lorem aliquet id. Curabitur semper nisi vel sem interdum suscipit.");

	// A directory location is given instead of a single file.
	List<Document> parsed = new MarkdownDocumentReader("classpath:/dir-test-1").get();

	assertThat(parsed).hasSize(2)
		.extracting(Document::getMetadata, Document::getText)
		.containsOnly(paragraph, blockquote);
}

@Test
void testDirPathMultiple() {
	// Expected (metadata, text) pairs, independent of the order in which files are read.
	var fancyHeader = tuple(Map.of("category", "header_1", "title", "This is a fancy header name"),
			"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt velit non bibendum gravida. Cras accumsan tincidunt ornare. Donec hendrerit consequat tellus blandit accumsan. Aenean aliquam metus at arcu elementum dignissim.");
	var header3 = tuple(Map.of("category", "header_3", "title", "Header 3"),
			"Aenean eu leo eu nibh tristique posuere quis quis massa.");
	var header1a = tuple(Map.of("category", "header_1", "title", "Header 1a"),
			"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur diam eros, laoreet sit amet cursus vitae, varius sed nisi. Cras sit amet quam quis velit commodo porta consectetur id nisi. Phasellus tincidunt pulvinar augue.");
	var header1b = tuple(Map.of("category", "header_1", "title", "Header 1b"),
			"Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia curae; Etiam lobortis risus libero, sed sollicitudin risus cursus in. Morbi enim metus, ornare vel lacinia eget, venenatis vel nibh.");
	var header2b = tuple(Map.of("category", "header_2", "title", "Header 2b"),
			"Proin vel laoreet leo, sed luctus augue. Sed et ligula commodo, commodo lacus at, consequat turpis. Maecenas eget sapien odio. Maecenas urna lectus, pellentesque in accumsan aliquam, congue eu libero.");
	var header2c = tuple(Map.of("category", "header_2", "title", "Header 2c"),
			"Ut rhoncus nec justo a porttitor. Pellentesque auctor pharetra eros, viverra sodales lorem aliquet id. Curabitur semper nisi vel sem interdum suscipit.");

	// A directory location is given instead of a single file.
	List<Document> parsed = new MarkdownDocumentReader("classpath:/dir-test-2").get();

	assertThat(parsed).hasSize(6)
		.extracting(Document::getMetadata, Document::getText)
		.containsOnly(fancyHeader, header3, header1a, header1b, header2b, header2c);
}

@Test
void testMultipleMarkdownFiles() {
	// Expected (metadata, text) pairs from both files combined.
	var fancyHeader = tuple(Map.of("category", "header_1", "title", "This is a fancy header name"),
			"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt velit non bibendum gravida. Cras accumsan tincidunt ornare. Donec hendrerit consequat tellus blandit accumsan. Aenean aliquam metus at arcu elementum dignissim.");
	var header3 = tuple(Map.of("category", "header_3", "title", "Header 3"),
			"Aenean eu leo eu nibh tristique posuere quis quis massa.");
	var header1a = tuple(Map.of("category", "header_1", "title", "Header 1a"),
			"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur diam eros, laoreet sit amet cursus vitae, varius sed nisi. Cras sit amet quam quis velit commodo porta consectetur id nisi. Phasellus tincidunt pulvinar augue.");
	var header1b = tuple(Map.of("category", "header_1", "title", "Header 1b"),
			"Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia curae; Etiam lobortis risus libero, sed sollicitudin risus cursus in. Morbi enim metus, ornare vel lacinia eget, venenatis vel nibh.");
	var header2b = tuple(Map.of("category", "header_2", "title", "Header 2b"),
			"Proin vel laoreet leo, sed luctus augue. Sed et ligula commodo, commodo lacus at, consequat turpis. Maecenas eget sapien odio. Maecenas urna lectus, pellentesque in accumsan aliquam, congue eu libero.");
	var header2c = tuple(Map.of("category", "header_2", "title", "Header 2c"),
			"Ut rhoncus nec justo a porttitor. Pellentesque auctor pharetra eros, viverra sodales lorem aliquet id. Curabitur semper nisi vel sem interdum suscipit.");

	// Two explicit classpath locations are passed as a list.
	List<String> locations = List.of("classpath:/only-headers.md", "classpath:/with-formatting.md");
	List<Document> parsed = new MarkdownDocumentReader(locations).get();

	assertThat(parsed).hasSize(6)
		.extracting(Document::getMetadata, Document::getText)
		.containsOnly(fancyHeader, header3, header1a, header1b, header2b, header2c);
}

@Test
void testOnlyHeadersWithParagraphs() {
MarkdownDocumentReader reader = new MarkdownDocumentReader("classpath:/only-headers.md");
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur diam eros, laoreet sit amet cursus vitae, varius sed
nisi. Cras sit amet quam quis velit commodo porta consectetur id nisi. Phasellus tincidunt pulvinar augue.

> Proin vel laoreet leo, sed luctus augue. Sed et ligula commodo, commodo lacus at, consequat turpis. Maecenas eget
> sapien odio. Maecenas urna lectus, pellentesque in accumsan aliquam, congue eu libero. Ut rhoncus nec justo a
> porttitor. Pellentesque auctor pharetra eros, viverra sodales lorem aliquet id. Curabitur semper nisi vel sem interdum
> suscipit.

Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur diam eros, laoreet sit amet cursus vitae, varius sed
nisi. Cras sit amet quam quis velit commodo porta consectetur id nisi. Phasellus tincidunt pulvinar augue.

> Proin vel laoreet leo, sed luctus augue. Sed et ligula commodo, commodo lacus at, consequat turpis. Maecenas eget
> sapien odio. Maecenas urna lectus, pellentesque in accumsan aliquam, congue eu libero. Ut rhoncus nec justo a
> porttitor. Pellentesque auctor pharetra eros, viverra sodales lorem aliquet id. Curabitur semper nisi vel sem interdum
> suscipit.

Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Header 1a

Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur diam eros, laoreet sit amet cursus vitae, varius sed
nisi. Cras sit amet quam quis velit commodo porta consectetur id nisi. Phasellus tincidunt pulvinar augue.

# Header 1b

Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia curae; Etiam lobortis risus libero, sed
sollicitudin risus cursus in. Morbi enim metus, ornare vel lacinia eget, venenatis vel nibh.

## Header 2b

Proin vel laoreet leo, sed luctus augue. Sed et ligula commodo, commodo lacus at, consequat turpis. Maecenas eget sapien
odio. Maecenas urna lectus, pellentesque in accumsan aliquam, congue eu libero.

# Header 1c

## Header 2c

Ut rhoncus nec justo a porttitor. Pellentesque auctor pharetra eros, viverra sodales lorem aliquet id. Curabitur semper nisi vel sem interdum suscipit.
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# This is a fancy header name

Lorem ipsum dolor sit amet, **consectetur adipiscing elit**. Donec tincidunt velit non bibendum gravida. Cras accumsan
tincidunt ornare. Donec hendrerit consequat tellus *blandit* accumsan. Aenean aliquam metus at ***arcu elementum***
dignissim.

### Header 3

Aenean eu leo eu nibh tristique _posuere quis quis massa_.