001///////////////////////////////////////////////////////////////////////////////////////////////
002// checkstyle: Checks Java source code and other text files for adherence to a set of rules.
003// Copyright (C) 2001-2026 the original author or authors.
004//
005// This library is free software; you can redistribute it and/or
006// modify it under the terms of the GNU Lesser General Public
007// License as published by the Free Software Foundation; either
008// version 2.1 of the License, or (at your option) any later version.
009//
010// This library is distributed in the hope that it will be useful,
011// but WITHOUT ANY WARRANTY; without even the implied warranty of
012// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
013// Lesser General Public License for more details.
014//
015// You should have received a copy of the GNU Lesser General Public
016// License along with this library; if not, write to the Free Software
017// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
018///////////////////////////////////////////////////////////////////////////////////////////////
019
020package com.puppycrawl.tools.checkstyle.checks;
021
022import java.util.ArrayList;
023import java.util.HashMap;
024import java.util.List;
025import java.util.Map;
026import java.util.regex.Matcher;
027import java.util.regex.Pattern;
028
029import com.puppycrawl.tools.checkstyle.FileStatefulCheck;
030import com.puppycrawl.tools.checkstyle.api.AbstractCheck;
031import com.puppycrawl.tools.checkstyle.api.DetailAST;
032import com.puppycrawl.tools.checkstyle.api.TokenTypes;
033import com.puppycrawl.tools.checkstyle.utils.CheckUtil;
034
035/**
036 * <div>
037 * Restricts using
038 * <a href = "https://docs.oracle.com/javase/specs/jls/se11/html/jls-3.html#jls-3.3">
039 * Unicode escapes</a>
040 * (such as &#92;u221e). It is possible to allow using escapes for
041 * <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
042 * non-printable, control characters</a>.
043 * Also, this check can be configured to allow using escapes
044 * if trail comment is present. By the option it is possible to
045 * allow using escapes if literal contains only them.
046 * </div>
047 *
048 * @since 5.8
049 */
050@FileStatefulCheck
051public class AvoidEscapedUnicodeCharactersCheck
052    extends AbstractCheck {
053
054    /**
055     * A key is pointing to the warning message text in "messages.properties"
056     * file.
057     */
058    public static final String MSG_KEY = "forbid.escaped.unicode.char";
059
060    /** Regular expression for Unicode chars. */
061    private static final Pattern UNICODE_REGEXP = Pattern.compile("\\\\u+[a-fA-F\\d]{4}");
062
063    /**
064     * Regular expression Unicode control characters.
065     *
066     * @see <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
067     *     Appendix:Control characters</a>
068     */
069    private static final Pattern UNICODE_CONTROL = Pattern.compile("\\\\u+"
070            + "(00[0-1][\\dA-Fa-f]"
071            + "|00[8-9][\\dA-Fa-f]"
072            + "|00[aA][dD]"
073            + "|034[fF]"
074            + "|070[fF]"
075            + "|180[eE]"
076            + "|200[b-fB-F]"
077            + "|202[a-eA-E]"
078            + "|206[0-4a-fA-F]"
079            + "|[fF]{3}[9a-bA-B]"
080            + "|[fF][eE][fF]{2})");
081
082    /**
083     * Regular expression for all escaped chars.
084     * See <a href="https://docs.oracle.com/javase/specs/jls/se15/html/jls-3.html#jls-3.10.7">
085     * EscapeSequence</a>
086     */
087    private static final Pattern ALL_ESCAPED_CHARS = Pattern.compile("^("
088            + UNICODE_REGEXP.pattern()
089            + "|\""
090            + "|'"
091            + "|\\\\"
092            + "|\\\\b"
093            + "|\\\\f"
094            + "|\\\\n"
095            + "|\\R"
096            + "|\\\\r"
097            + "|\\\\s"
098            + "|\\\\t"
099            + ")+$");
100
101    /** Regular expression for escaped backslash. */
102    private static final Pattern ESCAPED_BACKSLASH = Pattern.compile("\\\\\\\\");
103
104    /** Regular expression for non-printable unicode chars. */
105    private static final Pattern NON_PRINTABLE_CHARS = Pattern.compile("\\\\u0000"
106            + "|\\\\u0009"
107            + "|\\\\u000[bB]"
108            + "|\\\\u000[cC]"
109            + "|\\\\u0020"
110            + "|\\\\u007[fF]"
111            + "|\\\\u0085"
112            + "|\\\\u009[fF]"
113            + "|\\\\u00[aA]0"
114            + "|\\\\u00[aA][dD]"
115            + "|\\\\u04[fF]9"
116            + "|\\\\u05[bB][eE]"
117            + "|\\\\u05[dD]0"
118            + "|\\\\u05[eE][aA]"
119            + "|\\\\u05[fF]3"
120            + "|\\\\u05[fF]4"
121            + "|\\\\u0600"
122            + "|\\\\u0604"
123            + "|\\\\u061[cC]"
124            + "|\\\\u06[dD]{2}"
125            + "|\\\\u06[fF]{2}"
126            + "|\\\\u070[fF]"
127            + "|\\\\u0750"
128            + "|\\\\u077[fF]"
129            + "|\\\\u0[eE]00"
130            + "|\\\\u0[eE]7[fF]"
131            + "|\\\\u1680"
132            + "|\\\\u180[eE]"
133            + "|\\\\u1[eE]00"
134            + "|\\\\u2000"
135            + "|\\\\u2001"
136            + "|\\\\u2002"
137            + "|\\\\u2003"
138            + "|\\\\u2004"
139            + "|\\\\u2005"
140            + "|\\\\u2006"
141            + "|\\\\u2007"
142            + "|\\\\u2008"
143            + "|\\\\u2009"
144            + "|\\\\u200[aA]"
145            + "|\\\\u200[fF]"
146            + "|\\\\u2025"
147            + "|\\\\u2028"
148            + "|\\\\u2029"
149            + "|\\\\u202[fF]"
150            + "|\\\\u205[fF]"
151            + "|\\\\u2064"
152            + "|\\\\u2066"
153            + "|\\\\u2067"
154            + "|\\\\u2068"
155            + "|\\\\u2069"
156            + "|\\\\u206[aA]"
157            + "|\\\\u206[fF]"
158            + "|\\\\u20[aA][fF]"
159            + "|\\\\u2100"
160            + "|\\\\u213[aA]"
161            + "|\\\\u3000"
162            + "|\\\\u[dD]800"
163            + "|\\\\u[fF]8[fF]{2}"
164            + "|\\\\u[fF][bB]50"
165            + "|\\\\u[fF][dD][fF]{2}"
166            + "|\\\\u[fF][eE]70"
167            + "|\\\\u[fF][eE][fF]{2}"
168            + "|\\\\u[fF]{2}0[eE]"
169            + "|\\\\u[fF]{2}61"
170            + "|\\\\u[fF]{2}[dD][cC]"
171            + "|\\\\u[fF]{3}9"
172            + "|\\\\u[fF]{3}[aA]"
173            + "|\\\\u[fF]{3}[bB]"
174            + "|\\\\u[fF]{4}");
175
176    /**
177     * Map of Pending Violations.
178     * Key: Line number of the violation.
179     * Value: List of literal AST nodes on that line pending validation.
180     */
181    private final Map<Integer, List<DetailAST>> pendingViolations = new HashMap<>();
182
183    /** Allow use escapes for non-printable, control characters. */
184    private boolean allowEscapesForControlCharacters;
185
186    /** Allow use escapes if trail comment is present. */
187    private boolean allowByTailComment;
188
189    /** Allow if all characters in literal are escaped. */
190    private boolean allowIfAllCharactersEscaped;
191
192    /** Allow use escapes for non-printable, whitespace characters. */
193    private boolean allowNonPrintableEscapes;
194
195    /**
196     * Setter to allow use escapes for non-printable, control characters.
197     *
198     * @param allow user's value.
199     * @since 5.8
200     */
201    public final void setAllowEscapesForControlCharacters(boolean allow) {
202        allowEscapesForControlCharacters = allow;
203    }
204
205    /**
206     * Setter to allow use escapes if trail comment is present.
207     *
208     * @param allow user's value.
209     * @since 5.8
210     */
211    public final void setAllowByTailComment(boolean allow) {
212        allowByTailComment = allow;
213    }
214
215    /**
216     * Setter to allow if all characters in literal are escaped.
217     *
218     * @param allow user's value.
219     * @since 5.8
220     */
221    public final void setAllowIfAllCharactersEscaped(boolean allow) {
222        allowIfAllCharactersEscaped = allow;
223    }
224
225    /**
226     * Setter to allow use escapes for non-printable, whitespace characters.
227     *
228     * @param allow user's value.
229     * @since 5.8
230     */
231    public final void setAllowNonPrintableEscapes(boolean allow) {
232        allowNonPrintableEscapes = allow;
233    }
234
235    @Override
236    public int[] getDefaultTokens() {
237        return getRequiredTokens();
238    }
239
240    @Override
241    public int[] getAcceptableTokens() {
242        return getRequiredTokens();
243    }
244
245    @Override
246    public int[] getRequiredTokens() {
247        return new int[] {
248            TokenTypes.STRING_LITERAL,
249            TokenTypes.CHAR_LITERAL,
250            TokenTypes.TEXT_BLOCK_CONTENT,
251            TokenTypes.SINGLE_LINE_COMMENT,
252            TokenTypes.BLOCK_COMMENT_BEGIN,
253        };
254    }
255
256    @Override
257    public boolean isCommentNodesRequired() {
258        return true;
259    }
260
261    @Override
262    public void beginTree(DetailAST rootAST) {
263        pendingViolations.clear();
264    }
265
266    @Override
267    public void visitToken(DetailAST ast) {
268        if (ast.getType() == TokenTypes.SINGLE_LINE_COMMENT
269                || ast.getType() == TokenTypes.BLOCK_COMMENT_BEGIN) {
270            checkComment(ast);
271        }
272        else {
273            checkLiteral(ast);
274        }
275    }
276
277    @Override
278    public void finishTree(DetailAST rootAST) {
279        for (List<DetailAST> asts : pendingViolations.values()) {
280            for (DetailAST ast : asts) {
281                log(ast, MSG_KEY);
282            }
283        }
284    }
285
286    /**
287     * Checks if the literal has Unicode char and should be reported.
288     * If violation is found, it is added to pendingViolations.
289     *
290     * @param ast literal token.
291     */
292    private void checkLiteral(DetailAST ast) {
293        final String literal =
294            CheckUtil.stripIndentAndInitialNewLineFromTextBlock(ast.getText());
295
296        if (hasUnicodeChar(literal) && !(isAllCharactersEscaped(literal)
297                || allowEscapesForControlCharacters
298                        && isOnlyUnicodeValidChars(literal, UNICODE_CONTROL)
299                || allowNonPrintableEscapes
300                        && isOnlyUnicodeValidChars(literal, NON_PRINTABLE_CHARS))) {
301
302            if (allowByTailComment) {
303                int lineNo = ast.getLineNo();
304                if (ast.getType() == TokenTypes.TEXT_BLOCK_CONTENT) {
305                    lineNo = ast.getNextSibling().getLineNo();
306                }
307                pendingViolations.computeIfAbsent(lineNo, key -> new ArrayList<>()).add(ast);
308            }
309            else {
310                log(ast, MSG_KEY);
311            }
312        }
313    }
314
315    /**
316     * Checks if a comment clears any pending violations on the same line.
317     *
318     * @param comment comment token.
319     */
320    private void checkComment(DetailAST comment) {
321        if (isTrailingComment(comment)) {
322            pendingViolations.remove(comment.getLineNo());
323        }
324    }
325
326    /**
327     * Checks if a comment is trailing (has no code after it on the same line).
328     *
329     * @param commentNode the comment AST node
330     * @return true if it is trailing
331     */
332    private static boolean isTrailingComment(DetailAST commentNode) {
333        final DetailAST nextSibling = commentNode.getNextSibling();
334        return nextSibling == null || nextSibling.getLineNo() != commentNode.getLineNo();
335    }
336
337    /**
338     * Checks if literal has Unicode chars.
339     *
340     * @param literal String literal.
341     * @return true if literal has Unicode chars.
342     */
343    private static boolean hasUnicodeChar(String literal) {
344        final String literalWithoutEscapedBackslashes =
345                ESCAPED_BACKSLASH.matcher(literal).replaceAll("");
346        return UNICODE_REGEXP.matcher(literalWithoutEscapedBackslashes).find();
347    }
348
349    /**
350     * Check if String literal contains Unicode control chars.
351     *
352     * @param literal String literal.
353     * @param pattern RegExp for valid characters.
354     * @return true, if String literal contains Unicode control chars.
355     */
356    private static boolean isOnlyUnicodeValidChars(String literal, Pattern pattern) {
357        final int unicodeMatchesCounter =
358                countMatches(UNICODE_REGEXP, literal);
359        final int unicodeValidMatchesCounter =
360                countMatches(pattern, literal);
361        return unicodeMatchesCounter - unicodeValidMatchesCounter == 0;
362    }
363
364    /**
365     * Count regexp matches into String literal.
366     *
367     * @param pattern pattern.
368     * @param target String literal.
369     * @return count of regexp matches.
370     */
371    private static int countMatches(Pattern pattern, String target) {
372        int matcherCounter = 0;
373        final Matcher matcher = pattern.matcher(target);
374        while (matcher.find()) {
375            matcherCounter++;
376        }
377        return matcherCounter;
378    }
379
380    /**
381     * Checks if all characters in String literal is escaped.
382     *
383     * @param literal current literal.
384     * @return true if all characters in String literal is escaped.
385     */
386    private boolean isAllCharactersEscaped(String literal) {
387        return allowIfAllCharactersEscaped
388                && ALL_ESCAPED_CHARS.matcher(literal).find();
389    }
390}