I have the following Java code that generates the keywords for our search index:
```java
@Entity
@Indexed(index = "process")
@Table(name = "process")
public class Process extends BaseTemplateBean {

    // [... a lot of unrelated stuff ...]

    @Transient
    private transient IndexingKeyworder indexingKeyworder;

    // [... a bigger lot of unrelated stuff ...]

    @Transient
    @FullTextField(name = "search")
    @IndexingDependency(reindexOnUpdate = ReindexOnUpdate.NO)
    public String getKeywordsForFreeSearch() {
        return initializeKeywords().getSearch();
    }

    @Transient
    @FullTextField(name = "searchTitle")
    @IndexingDependency(reindexOnUpdate = ReindexOnUpdate.NO)
    public String getKeywordsForSearchingInTitle() {
        return initializeKeywords().getSearchTitle();
    }

    @Transient
    @FullTextField(name = "searchProject")
    @IndexingDependency(reindexOnUpdate = ReindexOnUpdate.NO)
    public String getKeywordsForSearchingByProjectName() {
        return initializeKeywords().getSearchProject();
    }

    @Transient
    @FullTextField(name = "searchBatch")
    @IndexingDependency(reindexOnUpdate = ReindexOnUpdate.NO)
    public String getKeywordsForAssignmentToBatches() {
        return initializeKeywords().getSearchBatch();
    }

    @Transient
    @FullTextField(name = "searchTask")
    @IndexingDependency(reindexOnUpdate = ReindexOnUpdate.NO)
    public String getKeywordsForSearchingForTaskInformation() {
        return initializeKeywords().getSearchTask();
    }

    private IndexingKeyworder initializeKeywords() {
        if (this.indexingKeyworder == null) {
            this.indexingKeyworder = new IndexingKeyworder(this);
        }
        return this.indexingKeyworder;
    }
}
```
This code uses the following helper class, which is written so that none of the work is ever done twice:
```java
class IndexingKeyworder {
    private static final String PSEUDOWORD_TASK_AUTOMATIC = "automatic";
    private static final String PSEUDOWORD_TASK_DONE = "closed";
    private static final String PSEUDOWORD_TASK_DONE_PROCESSING_USER = "closeduser";
    private static final String ANY_METADATA_MARKER = "mdWrap";
    private static final char VALUE_SEPARATOR = 'q';

    private static final Pattern TITLE_GROUPS_PATTERN = Pattern.compile("[\\p{IsLetter}\\p{Digit}]+");
    private static final Pattern METADATA_PATTERN = Pattern.compile("name=\"([^\"]+)\">([^<]*)<", Pattern.DOTALL);
    private static final Pattern METADATA_SECTIONS_PATTERN = Pattern.compile("<mets:dmdSec.*?o>(.*?)</kitodo:k",
            Pattern.DOTALL);
    private static final Pattern RULESET_KEY_PATTERN = Pattern.compile("key id=\"([^\"]+)\">(.*?)</key>",
            Pattern.DOTALL);
    private static final Pattern RULESET_LABEL_PATTERN = Pattern.compile("<label[^>]*>([^<]+)", Pattern.DOTALL);

    // must be static: shared across all instances, and accessed from the static getRulesetLabelMap(String)
    private static final Map<String, Map<String, Collection<String>>> rulesetCache = new HashMap<>();

    private Set<String> titleKeywords = Collections.emptySet();
    private Set<String> projectKeywords = Collections.emptySet();
    private Set<String> batchKeywords = Collections.emptySet();
    private Set<String> taskKeywords = Collections.emptySet();
    private Set<String> taskPseudoKeywords = Collections.emptySet();
    private Set<String> metadataKeywords = Collections.emptySet();
    private Set<String> metadataPseudoKeywords = Collections.emptySet();
    private String processId = null;
    private Set<String> commentKeywords = Collections.emptySet();

    public IndexingKeyworder(Process process) {
        this.titleKeywords = filterMinLength(initTitleKeywords(process.getTitle()));
        this.projectKeywords = filterMinLength(initSimpleKeywords(Objects.nonNull(process.getProject())
                ? process.getProject().getTitle() : ""));
        this.batchKeywords = filterMinLength(initBatchKeywords(process.getBatches()));
        var taskKeywords = initTaskKeywords(process.getTasksUnmodified());
        this.taskKeywords = filterMinLength(taskKeywords.getLeft());
        this.taskPseudoKeywords = filterMinLength(taskKeywords.getRight());
        var metadataKeywords = initMetadataKeywords(process);
        this.metadataKeywords = filterMinLength(metadataKeywords.getLeft());
        this.metadataPseudoKeywords = filterMinLength(metadataKeywords.getRight());
        this.processId = process.getId().toString();
        this.commentKeywords = filterMinLength(initCommentKeywords(process.getComments()));
    }

    private static Set<String> initTitleKeywords(String processTitle) {
        Set<String> tokens = new HashSet<>();
        Matcher matcher = TITLE_GROUPS_PATTERN.matcher(processTitle);
        while (matcher.find()) {
            String normalized = normalize(matcher.group());
            final int length = normalized.length();
            for (int end = 1; end <= length; end++) {
                tokens.add(normalized.substring(0, end));
            }
            for (int beginning = length - 1; beginning >= 0; beginning--) {
                tokens.add(normalized.substring(beginning, length));
            }
        }
        return tokens;
    }

    private static Set<String> initSimpleKeywords(String input) {
        Set<String> tokens = new HashSet<>();
        for (String term : splitValues(input)) {
            tokens.add(normalize(term));
        }
        return tokens;
    }

    private static Set<String> initBatchKeywords(Collection<Batch> batches) {
        if (batches.isEmpty()) {
            return Collections.emptySet();
        }
        Set<String> tokens = new HashSet<>();
        for (Batch batch : batches) {
            String optionalTitle = batch.getTitle();
            if (StringUtils.isNotBlank(optionalTitle)) {
                tokens.addAll(initSimpleKeywords(optionalTitle));
            }
        }
        return tokens;
    }

    private static Pair<Set<String>, Set<String>> initTaskKeywords(Collection<Task> tasks) {
        Set<String> taskKeywords = new HashSet<>();
        Set<String> taskPseudoKeywords = new HashSet<>();
        for (Task task : tasks) {
            for (String token : splitValues(task.getTitle())) {
                String term = normalize(token);
                taskKeywords.add(term);
                if (task.isTypeAutomatic()) {
                    taskKeywords.add(PSEUDOWORD_TASK_AUTOMATIC + VALUE_SEPARATOR + term);
                }
                TaskStatus taskStatus = task.getProcessingStatus();
                if (Objects.isNull(taskStatus)) {
                    continue;
                }
                if (Objects.equals(taskStatus, TaskStatus.DONE)) {
                    taskPseudoKeywords.add(PSEUDOWORD_TASK_DONE);
                    taskPseudoKeywords.add(PSEUDOWORD_TASK_DONE + VALUE_SEPARATOR + term);
                    User closedUser = task.getProcessingUser();
                    if (Objects.isNull(closedUser)) {
                        continue;
                    }
                    if (StringUtils.isNotBlank(closedUser.getName())) {
                        taskPseudoKeywords.add(PSEUDOWORD_TASK_DONE_PROCESSING_USER + VALUE_SEPARATOR
                                + normalize(closedUser.getName()));
                    }
                    if (StringUtils.isNotBlank(closedUser.getSurname())) {
                        taskPseudoKeywords.add(PSEUDOWORD_TASK_DONE_PROCESSING_USER + VALUE_SEPARATOR
                                + normalize(closedUser.getSurname()));
                    }
                } else {
                    String taskKeyword = taskStatus.toString().toLowerCase();
                    taskPseudoKeywords.add(taskKeyword);
                    taskPseudoKeywords.add(taskKeyword + VALUE_SEPARATOR + term);
                }
            }
        }
        return Pair.of(taskKeywords, taskPseudoKeywords);
    }

    private static Pair<Set<String>, Set<String>> initMetadataKeywords(Process process) {
        final Pair<Set<String>, Set<String>> emptyResult = Pair.of(Collections.emptySet(), Collections.emptySet());
        try {
            String processId = Integer.toString(process.getId());
            Path path = Paths.get(KitodoConfig.getKitodoDataDirectory(), processId, "meta.xml");
            if (!Files.isReadable(path)) {
                return emptyResult;
            }
            // the metadata file is read exactly once per process
            String metaXml = FileUtils.readFileToString(path.toFile(), StandardCharsets.UTF_8);
            if (!metaXml.contains(ANY_METADATA_MARKER)) {
                return emptyResult;
            }
            Set<String> metadataKeywords = new HashSet<>();
            Set<String> metadataPseudoKeywords = new HashSet<>();
            Map<String, Collection<String>> rulesetLabelMap = getRulesetLabelMap(process.getRuleset().getFile());
            Matcher metadataSectionsMatcher = METADATA_SECTIONS_PATTERN.matcher(metaXml);
            while (metadataSectionsMatcher.find()) {
                Matcher keyMatcher = METADATA_PATTERN.matcher(metadataSectionsMatcher.group(1));
                while (keyMatcher.find()) {
                    String key = normalize(keyMatcher.group(1));
                    String valueString = keyMatcher.group(2);
                    for (String singleValue : splitValues(valueString)) {
                        String value = normalize(singleValue);
                        metadataKeywords.add(value);
                        metadataPseudoKeywords.add(key + VALUE_SEPARATOR + value);
                        metadataPseudoKeywords.add(key);
                        for (String label : rulesetLabelMap.getOrDefault(key, Collections.emptyList())) {
                            metadataPseudoKeywords.add(label + VALUE_SEPARATOR + value);
                            metadataPseudoKeywords.add(label);
                        }
                    }
                }
            }
            return Pair.of(metadataKeywords, metadataPseudoKeywords);
        } catch (IOException | RuntimeException e) {
            return emptyResult;
        }
    }

    private static Map<String, Collection<String>> getRulesetLabelMap(String file) {
        Map<String, Collection<String>> rulesetLabelMap = rulesetCache.get(file);
        if (Objects.nonNull(rulesetLabelMap)) {
            return rulesetLabelMap;
        }
        try {
            File rulesetFile = Paths.get(KitodoConfig.getParameter("directory.rulesets"), file).toFile();
            String ruleset = FileUtils.readFileToString(rulesetFile, StandardCharsets.UTF_8);
            rulesetLabelMap = new HashMap<>();
            Matcher keysMatcher = RULESET_KEY_PATTERN.matcher(ruleset);
            while (keysMatcher.find()) {
                String key = normalize(keysMatcher.group(1));
                Matcher labelMatcher = RULESET_LABEL_PATTERN.matcher(keysMatcher.group(2));
                Set<String> labels = new HashSet<>();
                while (labelMatcher.find()) {
                    labels.add(normalize(labelMatcher.group(1)));
                }
                rulesetLabelMap.put(key, labels);
            }
            rulesetCache.put(file, rulesetLabelMap);
            return rulesetLabelMap;
        } catch (IOException | RuntimeException e) {
            return Collections.emptyMap();
        }
    }

    private static Set<String> initCommentKeywords(List<Comment> comments) {
        Set<String> tokens = new HashSet<>();
        for (Comment comment : comments) {
            String message = comment.getMessage();
            if (StringUtils.isNotBlank(message)) {
                tokens.addAll(initSimpleKeywords(message));
            }
        }
        return tokens;
    }

    private static String normalize(String string) {
        return string.toLowerCase().replaceAll("[\0-/:-`{-¿]", "");
    }

    private static List<String> splitValues(String value) {
        String initializedValue = value != null ? value : "";
        return Arrays.asList(initializedValue.split("[ ,\\-._]+"));
    }

    private static Set<String> filterMinLength(Set<String> tokens) {
        tokens.removeIf(token -> token.length() < 3);
        return tokens;
    }

    public String getSearch() {
        Set<String> freeKeywords = new HashSet<>();
        freeKeywords.addAll(titleKeywords);
        freeKeywords.addAll(projectKeywords);
        freeKeywords.addAll(batchKeywords);
        freeKeywords.addAll(taskKeywords);
        freeKeywords.addAll(metadataKeywords);
        freeKeywords.addAll(metadataPseudoKeywords);
        if (Objects.nonNull(processId)) {
            freeKeywords.add(processId);
        }
        freeKeywords.addAll(commentKeywords);
        return String.join(" ", freeKeywords);
    }

    public String getSearchTitle() {
        return String.join(" ", titleKeywords);
    }

    public String getSearchProject() {
        return String.join(" ", projectKeywords);
    }

    public String getSearchBatch() {
        return String.join(" ", batchKeywords);
    }

    public String getSearchTask() {
        Set<String> allTaskKeywords = new HashSet<>();
        allTaskKeywords.addAll(taskKeywords);
        allTaskKeywords.addAll(taskPseudoKeywords);
        return String.join(" ", allTaskKeywords);
    }
}
```

(Two small fixes compared to what I first pasted: the `METADATA_SECTIONS_PATTERN` line had lost the `P` of `Pattern.compile`, and `rulesetCache` has to be `static`, because it is shared across all instances and accessed from the static `getRulesetLabelMap(String)`.)
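Since the character class in `normalize` is hard to read: `[\0-/:-`{-¿]` covers U+0000–U+002F, U+003A–U+0060 and U+007B–U+00BF, i.e. everything in that range except the digits and the ASCII letters; characters from U+00C0 upwards (accented letters etc.) are kept. A standalone illustration (the input string is made up, and the result is my own hand derivation):

```java
// Standalone illustration of what normalize() keeps and drops.
public class NormalizeDemo {
    public static void main(String[] args) {
        // lower-case first, then strip the punctuation/control ranges
        String normalized = "Ære-Værge 12!".toLowerCase().replaceAll("[\0-/:-`{-¿]", "");
        System.out.println(normalized); // æreværge12
    }
}
```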
I have now been accused of not using the Hibernate Search framework properly, and told that this functionality needs to be provided via its annotations. I am very open to not reinventing existing functionality, so how can I do that?
The following requirements need to be preserved for performance reasons:
- the `String` `metaXml` must only be read once!
- the `rulesetLabelMap` must only be created once per ruleset file and then come from the cache! (500k objects can be processed with the same file!)
- all tokens must be normalized!
- the special title rule must be followed: the title is cut at characters that are neither letters nor digits, and the resulting parts must be searchable from the front or from the back, but not both combined, with a minimum of 3 characters (see the sketch after this list)
- for the metadata, search terms must be generated using the ruleset (see the Java code above)
- the various search terms must be included in the joint search ("search"), but not all of them (see the Java code above), and each must also be searchable separately
- none of the calculations may be performed twice!
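To make the title rule concrete, here is a minimal, self-contained sketch of just that rule (extracted by hand from `initTitleKeywords` and `filterMinLength` above; the title "Newspaper_1908" is an invented example, `normalize` is simplified to `toLowerCase()`, and the 3-character minimum is folded directly into the loop bounds):

```java
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

// Minimal reproduction of the title rule only; not the production class.
public class TitleRuleDemo {
    private static final Pattern GROUPS = Pattern.compile("[\\p{IsLetter}\\p{Digit}]+");

    public static void main(String[] args) {
        Set<String> tokens = new HashSet<>();
        Matcher matcher = GROUPS.matcher("Newspaper_1908");
        while (matcher.find()) {
            String word = matcher.group().toLowerCase();
            for (int end = 3; end <= word.length(); end++) {
                tokens.add(word.substring(0, end));   // prefixes: "new", "news", ..., "newspaper"
            }
            for (int begin = 0; begin <= word.length() - 3; begin++) {
                tokens.add(word.substring(begin));    // suffixes: "per", "aper", ..., "newspaper"
            }
        }
        // contains "news" and "paper", but no infix such as "spap"
        System.out.println(tokens);
    }
}
```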
Please let me know how I can implement this with the annotations offered by Hibernate Search, with a guarantee that none of the requirements above are lost!
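For what it is worth, my own first guess from reading the Hibernate Search 6 documentation on binders is a custom `TypeBinder` along the following lines. This is an untested sketch: the class names and the `"standard"` analyzer are placeholders, and I am assuming that `useRootOnly()` replaces my `@IndexingDependency(reindexOnUpdate = ReindexOnUpdate.NO)` annotations:

```java
import org.hibernate.search.engine.backend.document.DocumentElement;
import org.hibernate.search.engine.backend.document.IndexFieldReference;
import org.hibernate.search.mapper.pojo.bridge.TypeBridge;
import org.hibernate.search.mapper.pojo.bridge.binding.TypeBindingContext;
import org.hibernate.search.mapper.pojo.bridge.mapping.programmatic.TypeBinder;
import org.hibernate.search.mapper.pojo.bridge.runtime.TypeBridgeWriteContext;

// Untested sketch: declare all keyword fields in one binder, so that a single
// IndexingKeyworder instance per document can feed all of them.
public class ProcessKeywordsBinder implements TypeBinder {

    @Override
    public void bind(TypeBindingContext context) {
        context.dependencies().useRootOnly(); // assumption: replaces @IndexingDependency
        IndexFieldReference<String> search = context.indexSchemaElement()
                .field("search", f -> f.asString().analyzer("standard")).toReference();
        IndexFieldReference<String> searchTitle = context.indexSchemaElement()
                .field("searchTitle", f -> f.asString().analyzer("standard")).toReference();
        // ... searchProject, searchBatch and searchTask declared the same way ...
        context.bridge(Process.class, new ProcessKeywordsBridge(search, searchTitle));
    }

    private static class ProcessKeywordsBridge implements TypeBridge<Process> {
        private final IndexFieldReference<String> search;
        private final IndexFieldReference<String> searchTitle;

        ProcessKeywordsBridge(IndexFieldReference<String> search, IndexFieldReference<String> searchTitle) {
            this.search = search;
            this.searchTitle = searchTitle;
        }

        @Override
        public void write(DocumentElement target, Process process, TypeBridgeWriteContext context) {
            // one keyworder per indexed document, so nothing is calculated twice
            IndexingKeyworder keywords = new IndexingKeyworder(process);
            target.addValue(search, keywords.getSearch());
            target.addValue(searchTitle, keywords.getSearchTitle());
        }
    }
}
```

The entity would then presumably carry `@TypeBinding(binder = @TypeBinderRef(type = ProcessKeywordsBinder.class))` instead of the five transient getters. Is that the intended approach, and does it keep the single-read and ruleset-cache guarantees?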