Skip to content

Commit 48ba1ae

Browse files
authored
feat(compression): Add LZ77 and LZ78 algorithms (TheAlgorithms#6910)
* feat(compression): Add LZ77 and LZ78 algorithms * Resolve Spotbugs warning in LZ78 by using Trie structure * fix code style
1 parent f66da5e commit 48ba1ae

File tree

4 files changed

+822
-0
lines changed

4 files changed

+822
-0
lines changed
Lines changed: 168 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,168 @@
1+
package com.thealgorithms.compression;
2+
3+
import java.util.ArrayList;
4+
import java.util.List;
5+
6+
/**
7+
* An implementation of the Lempel-Ziv 77 (LZ77) compression algorithm.
8+
* <p>
9+
* LZ77 is a lossless data compression algorithm that works by finding repeated
10+
* occurrences of data in a sliding window. It replaces subsequent occurrences
11+
* with references (offset, length) to the first occurrence within the window.
12+
* </p>
13+
* <p>
14+
* This implementation uses a simple sliding window and lookahead buffer approach.
15+
* Output format is a sequence of tuples (offset, length, next_character).
16+
* </p>
17+
* <p>
18+
* Time Complexity: O(n*W) in this naive implementation, where n is the input length
19+
* and W is the window size, due to the search for the longest match. More advanced
20+
* data structures (like suffix trees) can improve this.
21+
* </p>
22+
* <p>
23+
* References:
24+
* <ul>
25+
* <li><a href="https://en.wikipedia.org/wiki/LZ77_and_LZ78#LZ77">Wikipedia: LZ77</a></li>
26+
* </ul>
27+
* </p>
28+
*/
29+
public final class LZ77 {

    private static final int DEFAULT_WINDOW_SIZE = 4096;
    private static final int DEFAULT_LOOKAHEAD_BUFFER_SIZE = 16;

    /**
     * Sentinel stored in {@link Token#nextChar()} of the final token when the input
     * ends exactly at the end of a match, i.e. there is no literal left to emit.
     * The token format has no separate "no literal" flag, so the NUL character
     * (U+0000) is reserved and cannot appear in the text being compressed.
     */
    private static final char END_OF_STREAM = '\u0000';

    /**
     * Private constructor to prevent instantiation of this utility class.
     */
    private LZ77() {
    }

    /**
     * Represents a token in the LZ77 compressed output.
     *
     * @param offset   distance back from the current output position to the start of the match
     *                 (0 when there is no match)
     * @param length   number of characters to copy from the match (0 when there is no match)
     * @param nextChar the literal character following the match, or the NUL character (U+0000)
     *                 when the input ended together with the match
     */
    public record Token(int offset, int length, char nextChar) {
    }

    /**
     * Compresses the input text using the LZ77 algorithm.
     * <p>
     * For every position the longest match inside the sliding window is searched; on ties the
     * candidate that starts earliest in the window wins. Matches may overlap the current
     * position, which encodes runs such as "aaaa" as a single token.
     * </p>
     *
     * @param text The input string to compress. A {@code null} input yields an empty list.
     *             The text must not contain the NUL character (U+0000), which is reserved
     *             as the end-of-stream marker.
     * @param windowSize The size of the sliding window (search buffer). Must be positive.
     * @param lookaheadBufferSize The size of the lookahead buffer, i.e. the maximum match
     *                            length. Must be positive.
     * @return A list of {@link Token} objects representing the compressed data.
     * @throws IllegalArgumentException if windowSize or lookaheadBufferSize are not positive,
     *                                  or if the text contains the reserved NUL character.
     */
    public static List<Token> compress(String text, int windowSize, int lookaheadBufferSize) {
        if (text == null) {
            return new ArrayList<>();
        }
        if (windowSize <= 0 || lookaheadBufferSize <= 0) {
            throw new IllegalArgumentException("Window size and lookahead buffer size must be positive.");
        }
        // A literal NUL would be indistinguishable from the end-of-stream marker: the encoder
        // could not advance past it and the decoder would drop it. Reject it up front.
        if (text.indexOf(END_OF_STREAM) >= 0) {
            throw new IllegalArgumentException("Input text must not contain the reserved NUL character (U+0000).");
        }

        final int textLength = text.length();
        List<Token> compressedOutput = new ArrayList<>();
        int currentPosition = 0;

        while (currentPosition < textLength) {
            int searchBufferStart = Math.max(0, currentPosition - windowSize);
            // Computed in long so that a huge lookahead size cannot overflow int and
            // produce a negative bound (which would silently disable matching).
            int lookaheadEnd = (int) Math.min((long) currentPosition + lookaheadBufferSize, textLength);
            int maxMatchLength = lookaheadEnd - currentPosition;

            int bestMatchDistance = 0;
            int bestMatchLength = 0;

            for (int candidate = searchBufferStart; candidate < currentPosition; candidate++) {
                int candidateLength = matchLength(text, candidate, currentPosition, lookaheadEnd);
                // Strictly greater: the earliest candidate wins among equally long matches.
                if (candidateLength > bestMatchLength) {
                    bestMatchLength = candidateLength;
                    bestMatchDistance = currentPosition - candidate;
                    if (bestMatchLength == maxMatchLength) {
                        break; // the whole lookahead is matched; no candidate can do better
                    }
                }
            }

            int matchEnd = currentPosition + bestMatchLength;
            // Decide by position, not by character value, whether a literal follows the match.
            boolean inputExhausted = matchEnd >= textLength;
            char nextChar = inputExhausted ? END_OF_STREAM : text.charAt(matchEnd);

            compressedOutput.add(new Token(bestMatchDistance, bestMatchLength, nextChar));

            // Skip the match, plus the literal when one was emitted.
            currentPosition = inputExhausted ? matchEnd : matchEnd + 1;
        }

        return compressedOutput;
    }

    /**
     * Counts how many characters starting at {@code candidateStart} match the lookahead
     * buffer starting at {@code currentPosition}.
     * <p>
     * The source index wraps around with period {@code currentPosition - candidateStart},
     * so a match may extend past the current position (overlapping match).
     * </p>
     *
     * @param text            the text being compressed
     * @param candidateStart  start of the candidate match; must be less than currentPosition
     * @param currentPosition start of the lookahead buffer
     * @param lookaheadEnd    exclusive end of the lookahead buffer
     * @return the length of the match, between 0 and {@code lookaheadEnd - currentPosition}
     */
    private static int matchLength(String text, int candidateStart, int currentPosition, int lookaheadEnd) {
        int distance = currentPosition - candidateStart;
        int length = 0;
        while (currentPosition + length < lookaheadEnd && text.charAt(candidateStart + (length % distance)) == text.charAt(currentPosition + length)) {
            length++;
        }
        return length;
    }

    /**
     * Compresses the input text using the LZ77 algorithm with default buffer sizes
     * (window of 4096 characters, lookahead of 16 characters).
     *
     * @param text The input string to compress. A {@code null} input yields an empty list.
     *             The text must not contain the NUL character (U+0000).
     * @return A list of {@link Token} objects representing the compressed data.
     * @throws IllegalArgumentException if the text contains the reserved NUL character.
     */
    public static List<Token> compress(String text) {
        return compress(text, DEFAULT_WINDOW_SIZE, DEFAULT_LOOKAHEAD_BUFFER_SIZE);
    }

    /**
     * Decompresses a list of LZ77 tokens back into the original string.
     *
     * @param compressedData The list of {@link Token} objects. A {@code null} list yields
     *                       an empty string.
     * @return The original, uncompressed string.
     * @throws IndexOutOfBoundsException if a token with a positive length refers to a position
     *                                   outside the text decoded so far.
     */
    public static String decompress(List<Token> compressedData) {
        if (compressedData == null) {
            return "";
        }

        StringBuilder decompressedText = new StringBuilder();

        for (Token token : compressedData) {
            int startIndex = decompressedText.length() - token.offset();

            // Copy one character at a time: when length > offset the copy reads characters
            // that were appended earlier in this same loop (overlapping match).
            for (int i = 0; i < token.length(); i++) {
                decompressedText.append(decompressedText.charAt(startIndex + i));
            }

            // The end-of-stream marker means "no literal follows the match".
            if (token.nextChar() != END_OF_STREAM) {
                decompressedText.append(token.nextChar());
            }
        }

        return decompressedText.toString();
    }
}
Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
package com.thealgorithms.compression;
2+
3+
import java.util.ArrayList;
4+
import java.util.HashMap;
5+
import java.util.List;
6+
import java.util.Map;
7+
8+
/**
9+
* An implementation of the Lempel-Ziv 78 (LZ78) compression algorithm.
10+
* <p>
11+
* LZ78 is a dictionary-based lossless data compression algorithm. It processes
12+
* input data sequentially, building a dictionary of phrases encountered so far.
13+
* It outputs pairs (dictionary_index, next_character), representing
14+
* the longest match found in the dictionary plus the character that follows it.
15+
* </p>
16+
* <p>
17+
* This implementation builds the dictionary dynamically during compression.
18+
* The dictionary index 0 represents the empty string (no prefix).
19+
* </p>
20+
* <p>
21+
* Time Complexity: O(n) on average for compression and decompression, assuming
22+
* efficient dictionary lookups (using a HashMap), where n is the
23+
* length of the input string.
24+
* </p>
25+
* <p>
26+
* References:
27+
* <ul>
28+
* <li><a href="https://en.wikipedia.org/wiki/LZ77_and_LZ78#LZ78">Wikipedia: LZ78</a></li>
29+
* </ul>
30+
* </p>
31+
*/
32+
public final class LZ78 {

    /**
     * Sentinel stored in {@link Token#nextChar()} of the final token when the input ends
     * in the middle of a phrase that is already in the dictionary. The token format has
     * no separate flag for this, so the NUL character (U+0000) is reserved and cannot
     * appear in the text being compressed.
     */
    private static final char END_OF_STREAM = '\u0000';

    /**
     * Private constructor to prevent instantiation of this utility class.
     */
    private LZ78() {
    }

    /**
     * Represents a token in the LZ78 compressed output.
     *
     * @param index    dictionary index of the longest matching prefix; 0 is the empty string
     * @param nextChar the character following the prefix, or the NUL character (U+0000)
     *                 when the input ended together with the prefix
     */
    public record Token(int index, char nextChar) {
    }

    /**
     * A node in the dictionary trie structure.
     * Each node represents one dictionary phrase; its children extend that phrase by one character.
     */
    private static final class TrieNode {
        private final Map<Character, TrieNode> children = new HashMap<>();
        /** Dictionary index of the phrase ending at this node; 0 for the root (empty phrase). */
        private final int index;

        TrieNode(int index) {
            this.index = index;
        }
    }

    /**
     * Compresses the input text using the LZ78 algorithm.
     *
     * @param text The input string to compress. A {@code null} or empty input yields an
     *             empty list. The text must not contain the NUL character (U+0000), which
     *             is reserved as the end-of-stream marker.
     * @return A list of {@link Token} objects representing the compressed data.
     * @throws IllegalArgumentException if the text contains the reserved NUL character.
     */
    public static List<Token> compress(String text) {
        if (text == null || text.isEmpty()) {
            return new ArrayList<>();
        }
        // The decoder interprets NUL as "no character, no new dictionary entry"; a literal NUL
        // would therefore be dropped and desynchronise the dictionary numbering.
        if (text.indexOf(END_OF_STREAM) >= 0) {
            throw new IllegalArgumentException("Input text must not contain the reserved NUL character (U+0000).");
        }

        List<Token> compressedOutput = new ArrayList<>();
        TrieNode root = new TrieNode(0);
        TrieNode currentNode = root;
        int nextDictionaryIndex = 1;

        for (int i = 0; i < text.length(); i++) {
            char currentChar = text.charAt(i);
            TrieNode child = currentNode.children.get(currentChar);

            if (child != null) {
                // The phrase read so far is still in the dictionary: keep extending it.
                currentNode = child;
                continue;
            }

            // Output: (index of longest matching prefix, current character), then register
            // the extended phrase and start a new phrase from the root.
            compressedOutput.add(new Token(currentNode.index, currentChar));
            currentNode.children.put(currentChar, new TrieNode(nextDictionaryIndex++));
            currentNode = root;
        }

        // The input ended inside a known phrase: emit it without a following character.
        if (currentNode != root) {
            compressedOutput.add(new Token(currentNode.index, END_OF_STREAM));
        }

        return compressedOutput;
    }

    /**
     * Decompresses a list of LZ78 tokens back into the original string.
     *
     * @param compressedData The list of {@link Token} objects. A {@code null} or empty list
     *                       yields an empty string.
     * @return The original, uncompressed string.
     * @throws IllegalArgumentException if a token references a dictionary index that has not
     *                                  been defined by the preceding tokens.
     */
    public static String decompress(List<Token> compressedData) {
        if (compressedData == null || compressedData.isEmpty()) {
            return "";
        }

        StringBuilder decompressedText = new StringBuilder();
        // Position in the list is the dictionary index; index 0 is the empty phrase.
        List<String> dictionary = new ArrayList<>();
        dictionary.add("");

        for (Token token : compressedData) {
            int index = token.index();
            if (index < 0 || index >= dictionary.size()) {
                throw new IllegalArgumentException("Invalid dictionary index " + index + "; expected a value between 0 and " + (dictionary.size() - 1) + ".");
            }
            String prefix = dictionary.get(index);

            if (token.nextChar() == END_OF_STREAM) {
                // Trailing known phrase: nothing to append after it and no new entry.
                decompressedText.append(prefix);
            } else {
                String currentPhrase = prefix + token.nextChar();
                decompressedText.append(currentPhrase);
                dictionary.add(currentPhrase);
            }
        }

        return decompressedText.toString();
    }
}

0 commit comments

Comments
 (0)