001/*
002 * Copyright 2002-2019 the original author or authors.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *      https://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017package org.springframework.web.util;
018
019import org.springframework.util.Assert;
020
021/**
022 * Utility class for HTML escaping.
023 *
024 * <p>Escapes and unescapes based on the W3C HTML 4.01 recommendation, handling
025 * character entity references.
026 *
027 * <p>Reference:
028 * <a href="https://www.w3.org/TR/html4/charset.html">https://www.w3.org/TR/html4/charset.html</a>
029 *
030 * <p>For a comprehensive set of String escaping utilities, consider
031 * <a href="https://commons.apache.org/proper/commons-text/">Apache Commons Text</a>
032 * and its {@code StringEscapeUtils} class. We do not use that class here in order
033 * to avoid a runtime dependency on Commons Text just for HTML escaping. Furthermore,
034 * Spring's HTML escaping is more flexible and 100% HTML 4.0 compliant.
035 *
036 * @author Juergen Hoeller
037 * @author Martin Kersten
038 * @author Craig Andrews
039 * @since 01.03.2003
040 */
041public abstract class HtmlUtils {
042
043        /**
044         * Shared instance of pre-parsed HTML character entity references.
045         */
046        private static final HtmlCharacterEntityReferences characterEntityReferences =
047                        new HtmlCharacterEntityReferences();
048
049
050        /**
051         * Turn special characters into HTML character references.
052         * <p>Handles complete character set defined in HTML 4.01 recommendation.
053         * <p>Escapes all special characters to their corresponding
054         * entity reference (e.g. {@code &lt;}).
055         * <p>Reference:
056         * <a href="https://www.w3.org/TR/html4/sgml/entities.html">
057         * https://www.w3.org/TR/html4/sgml/entities.html
058         * </a>
059         * @param input the (unescaped) input string
060         * @return the escaped string
061         */
062        public static String htmlEscape(String input) {
063                return htmlEscape(input, WebUtils.DEFAULT_CHARACTER_ENCODING);
064        }
065
066        /**
067         * Turn special characters into HTML character references.
068         * <p>Handles complete character set defined in HTML 4.01 recommendation.
069         * <p>Escapes all special characters to their corresponding
070         * entity reference (e.g. {@code &lt;}) at least as required by the
071         * specified encoding. In other words, if a special character does
072         * not have to be escaped for the given encoding, it may not be.
073         * <p>Reference:
074         * <a href="https://www.w3.org/TR/html4/sgml/entities.html">
075         * https://www.w3.org/TR/html4/sgml/entities.html
076         * </a>
077         * @param input the (unescaped) input string
078         * @param encoding the name of a supported {@link java.nio.charset.Charset charset}
079         * @return the escaped string
080         * @since 4.1.2
081         */
082        public static String htmlEscape(String input, String encoding) {
083                Assert.notNull(input, "Input is required");
084                Assert.notNull(encoding, "Encoding is required");
085                StringBuilder escaped = new StringBuilder(input.length() * 2);
086                for (int i = 0; i < input.length(); i++) {
087                        char character = input.charAt(i);
088                        String reference = characterEntityReferences.convertToReference(character, encoding);
089                        if (reference != null) {
090                                escaped.append(reference);
091                        }
092                        else {
093                                escaped.append(character);
094                        }
095                }
096                return escaped.toString();
097        }
098
099        /**
100         * Turn special characters into HTML character references.
101         * <p>Handles complete character set defined in HTML 4.01 recommendation.
102         * <p>Escapes all special characters to their corresponding numeric
103         * reference in decimal format (&#<i>Decimal</i>;).
104         * <p>Reference:
105         * <a href="https://www.w3.org/TR/html4/sgml/entities.html">
106         * https://www.w3.org/TR/html4/sgml/entities.html
107         * </a>
108         * @param input the (unescaped) input string
109         * @return the escaped string
110         */
111        public static String htmlEscapeDecimal(String input) {
112                return htmlEscapeDecimal(input, WebUtils.DEFAULT_CHARACTER_ENCODING);
113        }
114
115        /**
116         * Turn special characters into HTML character references.
117         * <p>Handles complete character set defined in HTML 4.01 recommendation.
118         * <p>Escapes all special characters to their corresponding numeric
119         * reference in decimal format (&#<i>Decimal</i>;) at least as required by the
120         * specified encoding. In other words, if a special character does
121         * not have to be escaped for the given encoding, it may not be.
122         * <p>Reference:
123         * <a href="https://www.w3.org/TR/html4/sgml/entities.html">
124         * https://www.w3.org/TR/html4/sgml/entities.html
125         * </a>
126         * @param input the (unescaped) input string
127         * @param encoding the name of a supported {@link java.nio.charset.Charset charset}
128         * @return the escaped string
129         * @since 4.1.2
130         */
131        public static String htmlEscapeDecimal(String input, String encoding) {
132                Assert.notNull(input, "Input is required");
133                Assert.notNull(encoding, "Encoding is required");
134                StringBuilder escaped = new StringBuilder(input.length() * 2);
135                for (int i = 0; i < input.length(); i++) {
136                        char character = input.charAt(i);
137                        if (characterEntityReferences.isMappedToReference(character, encoding)) {
138                                escaped.append(HtmlCharacterEntityReferences.DECIMAL_REFERENCE_START);
139                                escaped.append((int) character);
140                                escaped.append(HtmlCharacterEntityReferences.REFERENCE_END);
141                        }
142                        else {
143                                escaped.append(character);
144                        }
145                }
146                return escaped.toString();
147        }
148
149        /**
150         * Turn special characters into HTML character references.
151         * <p>Handles complete character set defined in HTML 4.01 recommendation.
152         * <p>Escapes all special characters to their corresponding numeric
153         * reference in hex format (&#x<i>Hex</i>;).
154         * <p>Reference:
155         * <a href="https://www.w3.org/TR/html4/sgml/entities.html">
156         * https://www.w3.org/TR/html4/sgml/entities.html
157         * </a>
158         * @param input the (unescaped) input string
159         * @return the escaped string
160         */
161        public static String htmlEscapeHex(String input) {
162                return htmlEscapeHex(input, WebUtils.DEFAULT_CHARACTER_ENCODING);
163        }
164
165        /**
166         * Turn special characters into HTML character references.
167         * <p>Handles complete character set defined in HTML 4.01 recommendation.
168         * <p>Escapes all special characters to their corresponding numeric
169         * reference in hex format (&#x<i>Hex</i>;) at least as required by the
170         * specified encoding. In other words, if a special character does
171         * not have to be escaped for the given encoding, it may not be.
172         * <p>Reference:
173         * <a href="https://www.w3.org/TR/html4/sgml/entities.html">
174         * https://www.w3.org/TR/html4/sgml/entities.html
175         * </a>
176         * @param input the (unescaped) input string
177         * @param encoding the name of a supported {@link java.nio.charset.Charset charset}
178         * @return the escaped string
179         * @since 4.1.2
180         */
181        public static String htmlEscapeHex(String input, String encoding) {
182                Assert.notNull(input, "Input is required");
183                Assert.notNull(encoding, "Encoding is required");
184                StringBuilder escaped = new StringBuilder(input.length() * 2);
185                for (int i = 0; i < input.length(); i++) {
186                        char character = input.charAt(i);
187                        if (characterEntityReferences.isMappedToReference(character, encoding)) {
188                                escaped.append(HtmlCharacterEntityReferences.HEX_REFERENCE_START);
189                                escaped.append(Integer.toString(character, 16));
190                                escaped.append(HtmlCharacterEntityReferences.REFERENCE_END);
191                        }
192                        else {
193                                escaped.append(character);
194                        }
195                }
196                return escaped.toString();
197        }
198
199        /**
200         * Turn HTML character references into their plain text UNICODE equivalent.
201         * <p>Handles complete character set defined in HTML 4.01 recommendation
202         * and all reference types (decimal, hex, and entity).
203         * <p>Correctly converts the following formats:
204         * <blockquote>
205         * &amp;#<i>Entity</i>; - <i>(Example: &amp;amp;) case sensitive</i>
206         * &amp;#<i>Decimal</i>; - <i>(Example: &amp;#68;)</i><br>
207         * &amp;#x<i>Hex</i>; - <i>(Example: &amp;#xE5;) case insensitive</i><br>
208         * </blockquote>
209         * <p>Gracefully handles malformed character references by copying original
210         * characters as is when encountered.
211         * <p>Reference:
212         * <a href="https://www.w3.org/TR/html4/sgml/entities.html">
213         * https://www.w3.org/TR/html4/sgml/entities.html
214         * </a>
215         * @param input the (escaped) input string
216         * @return the unescaped string
217         */
218        public static String htmlUnescape(String input) {
219                return new HtmlCharacterEntityDecoder(characterEntityReferences, input).decode();
220        }
221
222}