1616 * Normalized input.
1717 *
1818 * @typedef Schema
19- * Sanitization configuration.
19+ * Schema that defines what nodes and properties are allowed.
20+ *
21+ * The default schema is `defaultSchema`, which follows how GitHub sanitizes.
22+ * If any top-level key is missing in the given schema, the corresponding
23+ * value of the default schema is used.
24+ *
25+ * To extend the standard schema with a few changes, clone `defaultSchema`
26+ * like so:
27+ *
28+ * ```js
29+ * import {h} from 'hastscript'
30+ * import deepmerge from 'deepmerge' // Or clone with `structuredClone` in modern JS and then modify the copy.
31+ * import {sanitize, defaultSchema} from 'hast-util-sanitize'
32+ *
33+ * const schema = deepmerge(defaultSchema, {attributes: {'*': ['className']}})
34+ *
35+ * const tree = sanitize(h('div', {className: ['foo']}), schema)
36+ *
37+ * // `tree` still has `className`.
38+ * console.log(tree)
39+ * // {
40+ * // type: 'element',
41+ * // tagName: 'div',
42+ * // properties: {className: ['foo']},
43+ * // children: []
44+ * // }
45+ * ```
2046 * @property {Attributes | null | undefined } [attributes]
21- * Map of tag names to allowed properties.
47+ * Map of tag names to allowed *property names*.
48+ *
49+ * The special key `'*'` as a tag name defines property names allowed on all
50+ * elements.
51+ *
52+ * The special value `'data*'` as a property name can be used to allow all
53+ * `data` properties.
54+ *
55+ * For example:
56+ *
57+ * ```js
58+ * attributes: {
59+ * a: ['href'],
60+ * img: ['src', 'longDesc'],
61+ * // …
62+ * '*': [
63+ * 'abbr',
64+ * 'accept',
65+ * 'acceptCharset',
66+ * // …
67+ * 'vSpace',
68+ * 'width',
69+ * 'itemProp'
70+ * ]
71+ * }
72+ * ```
73+ *
74+ * Instead of a single string, which allows any *property value* of that
75+ * property name, it’s also possible to provide an array to allow several
76+ * values.
77+ * For example, `input: ['type']` allows the `type` attribute set to any
78+ * value on inputs.
79+ * But `input: [['type', 'checkbox', 'radio']]` allows `type` only when set
80+ * to one of the allowed values (`'checkbox'` or `'radio'`).
81+ *
82+ * You can also use regexes, so for example `span: [['className', /^hljs-/]]`
83+ * allows any class that starts with `hljs-` on `span` elements.
84+ *
85+ * This is how the default GitHub schema allows only disabled checkbox
86+ * inputs:
2287 *
23- * The special `'*'` key defines property names allowed on all elements.
88+ * ```js
89+ * attributes: {
90+ * // …
91+ * input: [
92+ * ['type', 'checkbox'],
93+ * ['disabled', true]
94+ * ]
95+ * // …
96+ * }
97+ * ```
98+ *
99+ * The `attributes` field also plays well with properties that accept space- or
100+ * comma-separated values, such as `class`.
101+ * Say you wanted to allow certain classes on `span` elements for syntax
102+ * highlighting, that can be done like this:
103+ *
104+ * ```js
105+ * // …
106+ * span: [
107+ * ['className', 'token', 'number', 'operator']
108+ * ]
109+ * // …
110+ * ```
24111 * @property {Record<string, Record<string, PropertyValue>> | null | undefined } [required]
25- * Map of tag names to required property names and their default property value.
112+ * Map of tag names to required *property names* and their default *property
113+ * value*.
114+ *
115+ * If the defined keys do not exist in an element’s properties, they are added
116+ * and set to the specified value.
117+ *
118+ * Note that properties are first checked based on the schema at `attributes`,
119+ * so properties could be removed by that step and then added again through
120+ * `required`.
121+ *
122+ * For example:
123+ *
124+ * ```js
125+ * required: {
126+ * input: {type: 'checkbox', disabled: true}
127+ * }
128+ * ```
26129 * @property {Array<string> | null | undefined } [tagNames]
27130 * List of allowed tag names.
131+ *
132+ * For example:
133+ *
134+ * ```js
135+ * tagNames: [
136+ * 'h1',
137+ * 'h2',
138+ * 'h3',
139+ * // …
140+ * 'strike',
141+ * 'summary',
142+ * 'details'
143+ * ]
144+ * ```
28145 * @property {Record<string, Array<string>> | null | undefined } [protocols]
29- * Map of protocols to allow in property values.
146+ * Map of *property names* to allowed protocols.
147+ *
148+ * The listed property names can be set to URLs that are local (relative to
149+ * the current website, such as `this`, `#this`, `/this`, or `?this`) or
150+ * remote (such as `https://example.com`), in which case they must have a
151+ * protocol that is allowed here.
152+ *
153+ * For example:
154+ *
155+ * ```js
156+ * protocols: {
157+ * href: ['http', 'https', 'mailto'],
158+ * // …
159+ * longDesc: ['http', 'https']
160+ * }
161+ * ```
30162 * @property {Record<string, Array<string>> | null | undefined } [ancestors]
31- * Map of tag names to their required ancestor elements.
163+ * Map of tag names to a list of tag names which are required ancestors.
164+ *
165+ * Elements with these tag names will be ignored if they occur outside of one
166+ * of their allowed ancestors.
167+ *
168+ * For example:
169+ *
170+ * ```js
171+ * ancestors: {
172+ * li: ['ol', 'ul'],
173+ * // …
174+ * tr: ['table']
175+ * }
176+ * ```
32177 * @property {Array<string> | null | undefined } [clobber]
33- * List of allowed property names which can clobber.
178+ * List of *property names* that clobber (`Array<string>`).
179+ *
180+ * For example:
181+ *
182+ * ```js
183+ * clobber: ['name', 'id']
184+ * ```
34185 * @property {string | null | undefined } [clobberPrefix]
35- * Prefix to use before potentially clobbering property names.
186+ * Prefix to use before clobbering properties.
187+ *
188+ * For example:
189+ *
190+ * ```js
191+ * clobberPrefix: 'user-content-'
192+ * ```
36193 * @property {Array<string> | null | undefined } [strip]
37- * Names of elements to strip from the tree.
38- * @property {boolean | null | undefined } [allowComments]
39- * Whether to allow comments.
40- * @property {boolean | null | undefined } [allowDoctypes]
41- * Whether to allow doctypes.
194+ * List of tag names to strip from the tree.
195+ *
196+ * By default, unsafe elements are replaced by their children.
197+ * Some elements should however be entirely stripped from the tree.
198+ *
199+ * For example:
200+ *
201+ * ```js
202+ * strip: ['script']
203+ * ```
204+ * @property {boolean | null | undefined } [allowComments=false]
205+ * Whether to allow comment nodes.
206+ *
207+ * For example:
208+ *
209+ * ```js
210+ * allowComments: true
211+ * ```
212+ * @property {boolean | null | undefined } [allowDoctypes=false]
213+ * Whether to allow doctype nodes.
214+ *
215+ * ```js
216+ * allowDoctypes: true
217+ * ```
42218 *
43219 * @typedef {(schema: Schema, value: any, node: any, stack: Array<string>) => unknown } Handler
44220 * @typedef {Record<string, Handler> } NodeDefinition
@@ -65,12 +241,14 @@ const nodeSchema = {
65241}
66242
67243/**
68- * Utility to sanitize a tree
244+ * Sanitize a tree.
69245 *
70246 * @param {Node } node
71- * Hast tree to sanitize
247+ * Tree to clean.
72248 * @param {Schema | null | undefined } [schema]
73- * Schema defining how to sanitize - defaults to Github style sanitation
249+ * Schema defining how to sanitize.
250+ * @returns {Node }
251+ * New, sanitized, tree.
74252 */
75253export function sanitize ( node , schema ) {
76254 /** @type {Node } */
@@ -420,19 +598,23 @@ function handlePropertyValue(schema, value, prop, definition) {
420598 * @returns {boolean }
421599 */
422600function safeProtocol ( schema , value , prop ) {
601+ const protocols =
602+ schema . protocols && own . call ( schema . protocols , prop )
603+ ? schema . protocols [ prop ] . concat ( )
604+ : [ ]
605+
606+ // Not listed.
607+ if ( protocols . length === 0 ) {
608+ return true
609+ }
610+
423611 const url = String ( value )
424612 const colon = url . indexOf ( ':' )
425613 const questionMark = url . indexOf ( '?' )
426614 const numberSign = url . indexOf ( '#' )
427615 const slash = url . indexOf ( '/' )
428- const protocols =
429- schema . protocols && own . call ( schema . protocols , prop )
430- ? schema . protocols [ prop ] . concat ( )
431- : [ ]
432- let index = - 1
433616
434617 if (
435- protocols . length === 0 ||
436618 colon < 0 ||
437619 // If the first colon is after a `?`, `#`, or `/`, it’s not a protocol.
438620 ( slash > - 1 && colon > slash ) ||
@@ -442,6 +624,8 @@ function safeProtocol(schema, value, prop) {
442624 return true
443625 }
444626
627+ let index = - 1
628+
445629 while ( ++ index < protocols . length ) {
446630 if (
447631 colon === protocols [ index ] . length &&
0 commit comments