Skip to content

Commit 6337aba

Browse files
committed
Added pre-processing and post-processing options to HtmlPolicyBuilder so that clients are no longer tempted to do search/replace on sanitized output
1 parent bb7f71e commit 6337aba

File tree

8 files changed

+371
-74
lines changed

8 files changed

+371
-74
lines changed

src/main/java/org/owasp/html/HtmlPolicyBuilder.java

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,10 @@ public class HtmlPolicyBuilder {
172172
private final Set<String> skipIfEmpty = Sets.newLinkedHashSet(
173173
DEFAULT_SKIP_IF_EMPTY);
174174
private final Map<String, Boolean> textContainers = Maps.newLinkedHashMap();
175+
private HtmlStreamEventProcessor postprocessor =
176+
HtmlStreamEventProcessor.Processors.IDENTITY;
177+
private HtmlStreamEventProcessor preprocessor =
178+
HtmlStreamEventProcessor.Processors.IDENTITY;
175179
private boolean requireRelNofollowOnLinks;
176180

177181
/**
@@ -445,6 +449,34 @@ public HtmlPolicyBuilder allowStyling(CssSchema whitelist) {
445449
return this;
446450
}
447451

452+
/**
453+
* Inserts a pre-processor into the pipeline between the lexer and the policy.
454+
* Pre-processors receive HTML events before the policy, so the policy will
455+
* be applied to anything they add.
456+
* Pre-processors are not in the TCB since they cannot bypass the policy.
457+
*/
458+
public HtmlPolicyBuilder withPreprocessor(HtmlStreamEventProcessor pp) {
459+
this.preprocessor = HtmlStreamEventProcessor.Processors.compose(
460+
this.preprocessor, pp);
461+
return this;
462+
}
463+
464+
/**
465+
* Inserts a post-processor into the pipeline between the policy and the
466+
* output sink.
467+
* Post-processors can insert events into the stream that are not vetted
468+
* by the policy, so they are in the TCB.
469+
* <p>
470+
* Try doing what you want with a pre-processor instead of a post-processor
471+
* but if you're thinking of doing search/replace on a sanitized string, then
472+
* definitely use either a pre or post-processor instead.
473+
*/
474+
public HtmlPolicyBuilder withPostprocessor(HtmlStreamEventProcessor pp) {
475+
this.postprocessor = HtmlStreamEventProcessor.Processors.compose(
476+
this.postprocessor, pp);
477+
return this;
478+
}
479+
448480
/**
449481
* Names of attributes from HTML 4 whose values are URLs.
450482
* Other attributes, e.g. <code>style</code> may contain URLs even though
@@ -499,7 +531,8 @@ public PolicyFactory toFactory() {
499531
}
500532
}
501533
return new PolicyFactory(compilePolicies(), textContainerSet.build(),
502-
ImmutableMap.copyOf(globalAttrPolicies));
534+
ImmutableMap.copyOf(globalAttrPolicies),
535+
preprocessor, postprocessor);
503536
}
504537

505538
// Speed up subsequent builds by caching the compiled policies.

src/main/java/org/owasp/html/HtmlSanitizer.java

Lines changed: 64 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -99,33 +99,41 @@ public interface Policy extends HtmlStreamEventReceiver {
9999
* {@link HtmlStreamRenderer} after filtering.
100100
* {@link HtmlPolicyBuilder} provides an easy way to create policies.
101101
*/
102-
public static void sanitize(@Nullable String html, final Policy policy) {
103-
if (html == null) { html = ""; }
104-
105-
TagBalancingHtmlStreamEventReceiver balancer
106-
= new TagBalancingHtmlStreamEventReceiver(policy);
107-
108-
// According to Opera the maximum table nesting depth seen in the wild is
109-
// 795, but 99.99% of documents have a table nesting depth of less than 22.
110-
// Since each table has a nesting depth of 4 (incl. TBODY), this leads to a
111-
// document depth of 90 (incl. HTML & BODY).
112-
// Obviously table nesting depth is not the same as whole document depth,
113-
// but it is the best proxy I have available.
114-
// See http://devfiles.myopera.com/articles/590/maxtabledepth-url.htm for
115-
// the original data.
102+
public static void sanitize(
103+
@Nullable String html, final Policy policy) {
104+
sanitize(html, policy, HtmlStreamEventProcessor.Processors.IDENTITY);
105+
}
116106

117-
// Webkit defines the maximum HTML parser tree depth as 512.
118-
// http://trac.webkit.org/browser/trunk/Source/WebCore/page/Settings.h#L408
119-
// static const unsigned defaultMaximumHTMLParserDOMTreeDepth = 512;
107+
/**
108+
* Sanitizes the given HTML by applying the given policy to it.
109+
*
110+
* <p>
111+
* This method is not in the TCB.
112+
*
113+
* <p>
114+
* This method has no return value since policies are assumed to render things
115+
* they accept and do nothing on things they reject.
116+
* Use {@link HtmlStreamRenderer} to render content to an output buffer.
117+
*
118+
* @param html A snippet of HTML to sanitize. {@code null} is treated as the
119+
* empty string and will not result in a {@code NullPointerException}.
120+
* @param policy The Policy that will receive events based on the tokens in
121+
* HTML. Typically, this policy ends up routing the events to an
122+
* {@link HtmlStreamRenderer} after filtering.
123+
* {@link HtmlPolicyBuilder} provides an easy way to create policies.
124+
* @param preprocessor A processor that may wrap the policy to reinterpret
125+
* parse events.
126+
* Since the policy encapsulates its output buffer, this is not in the
127+
* policy's TCB.
128+
*/
129+
public static void sanitize(
130+
@Nullable String html, final Policy policy,
131+
HtmlStreamEventProcessor preprocessor) {
132+
if (html == null) { html = ""; }
120133

121-
// The first number gives us a lower bound on the nesting depth we allow,
122-
// 90, and the second gives us an upper bound: 512.
123-
// We do not want to bump right up against that limit.
124-
// 256 is substantially larger than the lower bound and well clear of the
125-
// upper bound.
126-
balancer.setNestingLimit(256);
134+
HtmlStreamEventReceiver receiver = initializePolicy(policy, preprocessor);
127135

128-
balancer.openDocument();
136+
receiver.openDocument();
129137

130138
HtmlLexer lexer = new HtmlLexer(html);
131139
// Use a linked list so that policies can use Iterator.remove() in an O(1)
@@ -135,16 +143,16 @@ public static void sanitize(@Nullable String html, final Policy policy) {
135143
HtmlToken token = lexer.next();
136144
switch (token.type) {
137145
case TEXT:
138-
balancer.text(
146+
receiver.text(
139147
Encoding.decodeHtml(html.substring(token.start, token.end)));
140148
break;
141149
case UNESCAPED:
142-
balancer.text(Encoding.stripBannedCodeunits(
150+
receiver.text(Encoding.stripBannedCodeunits(
143151
html.substring(token.start, token.end)));
144152
break;
145153
case TAGBEGIN:
146154
if (html.charAt(token.start + 1) == '/') { // A close tag.
147-
balancer.closeTag(HtmlLexer.canonicalName(
155+
receiver.closeTag(HtmlLexer.canonicalName(
148156
html.substring(token.start + 2, token.end)));
149157
while (lexer.hasNext()
150158
&& lexer.next().type != HtmlTokenType.TAGEND) {
@@ -182,7 +190,7 @@ public static void sanitize(@Nullable String html, final Policy policy) {
182190
if (!attrsReadyForName) {
183191
attrs.add(attrs.getLast());
184192
}
185-
balancer.openTag(
193+
receiver.openTag(
186194
HtmlLexer.canonicalName(
187195
html.substring(token.start + 1, token.end)),
188196
attrs);
@@ -195,7 +203,7 @@ public static void sanitize(@Nullable String html, final Policy policy) {
195203
}
196204
}
197205

198-
balancer.closeDocument();
206+
receiver.closeDocument();
199207
}
200208

201209
private static String stripQuotes(String encodedAttributeValue) {
@@ -216,4 +224,31 @@ private static String stripQuotes(String encodedAttributeValue) {
216224
return encodedAttributeValue;
217225
}
218226

227+
228+
private static HtmlStreamEventReceiver initializePolicy(
229+
Policy policy, HtmlStreamEventProcessor preprocessor) {
230+
TagBalancingHtmlStreamEventReceiver balancer
231+
= new TagBalancingHtmlStreamEventReceiver(policy);
232+
233+
// According to Opera the maximum table nesting depth seen in the wild is
234+
// 795, but 99.99% of documents have a table nesting depth of less than 22.
235+
// Since each table has a nesting depth of 4 (incl. TBODY), this leads to a
236+
// document depth of 90 (incl. HTML & BODY).
237+
// Obviously table nesting depth is not the same as whole document depth,
238+
// but it is the best proxy I have available.
239+
// See http://devfiles.myopera.com/articles/590/maxtabledepth-url.htm for
240+
// the original data.
241+
242+
// Webkit defines the maximum HTML parser tree depth as 512.
243+
// http://trac.webkit.org/browser/trunk/Source/WebCore/page/Settings.h#L408
244+
// static const unsigned defaultMaximumHTMLParserDOMTreeDepth = 512;
245+
246+
// The first number gives us a lower bound on the nesting depth we allow,
247+
// 90, and the second gives us an upper bound: 512.
248+
// We do not want to bump right up against that limit.
249+
// 256 is substantially larger than the lower bound and well clear of the
250+
// upper bound.
251+
balancer.setNestingLimit(256);
252+
return preprocessor.wrap(balancer);
253+
}
219254
}
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
package org.owasp.html;
2+
3+
/**
4+
* Wraps an event receiver (sink) so that user code can pre-process or post-process HTML stream events.
5+
*/
6+
public interface HtmlStreamEventProcessor {
7+
/**
8+
* @param sink an HTML stream event receiver that can take events from a
9+
* sanitizer policy to build a safe output on an appropriate buffer.
10+
* @return an HTML stream event receiver that can take events from a
11+
* sanitizer policy to build a safe output on an appropriate buffer by
12+
* sending events to sink.
13+
*/
14+
HtmlStreamEventReceiver wrap(HtmlStreamEventReceiver sink);
15+
16+
/** The identity processor and a helper for composing processors. */
17+
public static final class Processors {
18+
/**
19+
* A processor that returns the sink without wrapping it to do any
20+
* additional work.
21+
*/
22+
public static final HtmlStreamEventProcessor IDENTITY =
23+
new HtmlStreamEventProcessor() {
24+
25+
public HtmlStreamEventReceiver wrap(HtmlStreamEventReceiver sink) {
26+
return sink;
27+
}
28+
29+
@Override
30+
public String toString() {
31+
return "[identity]";
32+
}
33+
};
34+
35+
/**
36+
* @return a processor that wraps its input in f and then wraps the result in g, i.e. {@code g.wrap(f.wrap(sink))}.
37+
*/
38+
public static HtmlStreamEventProcessor compose(
39+
final HtmlStreamEventProcessor g, final HtmlStreamEventProcessor f) {
40+
if (f == IDENTITY) { return g; }
41+
if (g == IDENTITY) { return f; }
42+
return new HtmlStreamEventProcessor() {
43+
public HtmlStreamEventReceiver wrap(HtmlStreamEventReceiver sink) {
44+
return g.wrap(f.wrap(sink));
45+
}
46+
@Override
47+
public String toString() {
48+
return "(" + g + " \u2218 " + f + ")";
49+
}
50+
};
51+
}
52+
}
53+
}

src/main/java/org/owasp/html/PolicyFactory.java

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -56,20 +56,26 @@ public final class PolicyFactory
5656
private final ImmutableMap<String, ElementAndAttributePolicies> policies;
5757
private final ImmutableMap<String, AttributePolicy> globalAttrPolicies;
5858
private final ImmutableSet<String> textContainers;
59+
private final HtmlStreamEventProcessor preprocessor;
60+
private final HtmlStreamEventProcessor postprocessor;
5961

6062
PolicyFactory(
6163
ImmutableMap<String, ElementAndAttributePolicies> policies,
6264
ImmutableSet<String> textContainers,
63-
ImmutableMap<String, AttributePolicy> globalAttrPolicies) {
65+
ImmutableMap<String, AttributePolicy> globalAttrPolicies,
66+
HtmlStreamEventProcessor preprocessor,
67+
HtmlStreamEventProcessor postprocessor) {
6468
this.policies = policies;
6569
this.textContainers = textContainers;
6670
this.globalAttrPolicies = globalAttrPolicies;
71+
this.preprocessor = preprocessor;
72+
this.postprocessor = postprocessor;
6773
}
6874

6975
/** Produces a sanitizer that emits tokens to {@code out}. */
7076
public HtmlSanitizer.Policy apply(@Nonnull HtmlStreamEventReceiver out) {
7177
return new ElementAndAttributePolicyBasedSanitizerPolicy(
72-
out, policies, textContainers);
78+
postprocessor.wrap(out), policies, textContainers);
7379
}
7480

7581
/**
@@ -120,8 +126,11 @@ public <CTX> String sanitize(
120126
StringBuilder out = new StringBuilder(html.length());
121127
HtmlSanitizer.sanitize(
122128
html,
123-
apply(HtmlStreamRenderer.create(out, Handler.DO_NOTHING),
124-
listener, context));
129+
apply(
130+
HtmlStreamRenderer.create(out, Handler.DO_NOTHING),
131+
listener,
132+
context),
133+
preprocessor);
125134
return out.toString();
126135
}
127136

@@ -193,6 +202,14 @@ public PolicyFactory and(PolicyFactory f) {
193202
}
194203
allGlobalAttrPolicies = ab.build();
195204
}
196-
return new PolicyFactory(b.build(), allTextContainers, allGlobalAttrPolicies);
205+
HtmlStreamEventProcessor compositionOfPreprocessors
206+
= HtmlStreamEventProcessor.Processors.compose(
207+
this.preprocessor, f.preprocessor);
208+
HtmlStreamEventProcessor compositionOfPostprocessors
209+
= HtmlStreamEventProcessor.Processors.compose(
210+
this.postprocessor, f.postprocessor);
211+
return new PolicyFactory(
212+
b.build(), allTextContainers, allGlobalAttrPolicies,
213+
compositionOfPreprocessors, compositionOfPostprocessors);
197214
}
198215
}

0 commit comments

Comments
 (0)