View Javadoc
1   ///////////////////////////////////////////////////////////////////////////////////////////////
2   // checkstyle: Checks Java source code and other text files for adherence to a set of rules.
3   // Copyright (C) 2001-2026 the original author or authors.
4   //
5   // This library is free software; you can redistribute it and/or
6   // modify it under the terms of the GNU Lesser General Public
7   // License as published by the Free Software Foundation; either
8   // version 2.1 of the License, or (at your option) any later version.
9   //
10  // This library is distributed in the hope that it will be useful,
11  // but WITHOUT ANY WARRANTY; without even the implied warranty of
12  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  // Lesser General Public License for more details.
14  //
15  // You should have received a copy of the GNU Lesser General Public
16  // License along with this library; if not, write to the Free Software
17  // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18  ///////////////////////////////////////////////////////////////////////////////////////////////
19  
20  package com.puppycrawl.tools.checkstyle.checks;
21  
22  import java.util.ArrayList;
23  import java.util.HashMap;
24  import java.util.List;
25  import java.util.Map;
26  import java.util.regex.Matcher;
27  import java.util.regex.Pattern;
28  
29  import com.puppycrawl.tools.checkstyle.FileStatefulCheck;
30  import com.puppycrawl.tools.checkstyle.api.AbstractCheck;
31  import com.puppycrawl.tools.checkstyle.api.DetailAST;
32  import com.puppycrawl.tools.checkstyle.api.TokenTypes;
33  import com.puppycrawl.tools.checkstyle.utils.CheckUtil;
34  
35  /**
36   * <div>
37   * Restricts using
38   * <a href = "https://docs.oracle.com/javase/specs/jls/se11/html/jls-3.html#jls-3.3">
39   * Unicode escapes</a>
40   * (such as &#92;u221e). It is possible to allow using escapes for
41   * <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
42   * non-printable, control characters</a>.
43   * Also, this check can be configured to allow using escapes
44   * if trail comment is present. By the option it is possible to
45   * allow using escapes if literal contains only them.
46   * </div>
47   *
48   * @since 5.8
49   */
50  @FileStatefulCheck
51  public class AvoidEscapedUnicodeCharactersCheck
52      extends AbstractCheck {
53  
54      /**
55       * A key is pointing to the warning message text in "messages.properties"
56       * file.
57       */
58      public static final String MSG_KEY = "forbid.escaped.unicode.char";
59  
60      /** Regular expression for Unicode chars. */
61      private static final Pattern UNICODE_REGEXP = Pattern.compile("\\\\u+[a-fA-F\\d]{4}");
62  
63      /**
64       * Regular expression Unicode control characters.
65       *
66       * @see <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
67       *     Appendix:Control characters</a>
68       */
69      private static final Pattern UNICODE_CONTROL = Pattern.compile("\\\\u+"
70              + "(00[0-1][\\dA-Fa-f]"
71              + "|00[8-9][\\dA-Fa-f]"
72              + "|00[aA][dD]"
73              + "|034[fF]"
74              + "|070[fF]"
75              + "|180[eE]"
76              + "|200[b-fB-F]"
77              + "|202[a-eA-E]"
78              + "|206[0-4a-fA-F]"
79              + "|[fF]{3}[9a-bA-B]"
80              + "|[fF][eE][fF]{2})");
81  
82      /**
83       * Regular expression for all escaped chars.
84       * See <a href="https://docs.oracle.com/javase/specs/jls/se15/html/jls-3.html#jls-3.10.7">
85       * EscapeSequence</a>
86       */
87      private static final Pattern ALL_ESCAPED_CHARS = Pattern.compile("^("
88              + UNICODE_REGEXP.pattern()
89              + "|\""
90              + "|'"
91              + "|\\\\"
92              + "|\\\\b"
93              + "|\\\\f"
94              + "|\\\\n"
95              + "|\\R"
96              + "|\\\\r"
97              + "|\\\\s"
98              + "|\\\\t"
99              + ")+$");
100 
101     /** Regular expression for escaped backslash. */
102     private static final Pattern ESCAPED_BACKSLASH = Pattern.compile("\\\\\\\\");
103 
104     /** Regular expression for non-printable unicode chars. */
105     private static final Pattern NON_PRINTABLE_CHARS = Pattern.compile("\\\\u0000"
106             + "|\\\\u0009"
107             + "|\\\\u000[bB]"
108             + "|\\\\u000[cC]"
109             + "|\\\\u0020"
110             + "|\\\\u007[fF]"
111             + "|\\\\u0085"
112             + "|\\\\u009[fF]"
113             + "|\\\\u00[aA]0"
114             + "|\\\\u00[aA][dD]"
115             + "|\\\\u04[fF]9"
116             + "|\\\\u05[bB][eE]"
117             + "|\\\\u05[dD]0"
118             + "|\\\\u05[eE][aA]"
119             + "|\\\\u05[fF]3"
120             + "|\\\\u05[fF]4"
121             + "|\\\\u0600"
122             + "|\\\\u0604"
123             + "|\\\\u061[cC]"
124             + "|\\\\u06[dD]{2}"
125             + "|\\\\u06[fF]{2}"
126             + "|\\\\u070[fF]"
127             + "|\\\\u0750"
128             + "|\\\\u077[fF]"
129             + "|\\\\u0[eE]00"
130             + "|\\\\u0[eE]7[fF]"
131             + "|\\\\u1680"
132             + "|\\\\u180[eE]"
133             + "|\\\\u1[eE]00"
134             + "|\\\\u2000"
135             + "|\\\\u2001"
136             + "|\\\\u2002"
137             + "|\\\\u2003"
138             + "|\\\\u2004"
139             + "|\\\\u2005"
140             + "|\\\\u2006"
141             + "|\\\\u2007"
142             + "|\\\\u2008"
143             + "|\\\\u2009"
144             + "|\\\\u200[aA]"
145             + "|\\\\u200[fF]"
146             + "|\\\\u2025"
147             + "|\\\\u2028"
148             + "|\\\\u2029"
149             + "|\\\\u202[fF]"
150             + "|\\\\u205[fF]"
151             + "|\\\\u2064"
152             + "|\\\\u2066"
153             + "|\\\\u2067"
154             + "|\\\\u2068"
155             + "|\\\\u2069"
156             + "|\\\\u206[aA]"
157             + "|\\\\u206[fF]"
158             + "|\\\\u20[aA][fF]"
159             + "|\\\\u2100"
160             + "|\\\\u213[aA]"
161             + "|\\\\u3000"
162             + "|\\\\u[dD]800"
163             + "|\\\\u[fF]8[fF]{2}"
164             + "|\\\\u[fF][bB]50"
165             + "|\\\\u[fF][dD][fF]{2}"
166             + "|\\\\u[fF][eE]70"
167             + "|\\\\u[fF][eE][fF]{2}"
168             + "|\\\\u[fF]{2}0[eE]"
169             + "|\\\\u[fF]{2}61"
170             + "|\\\\u[fF]{2}[dD][cC]"
171             + "|\\\\u[fF]{3}9"
172             + "|\\\\u[fF]{3}[aA]"
173             + "|\\\\u[fF]{3}[bB]"
174             + "|\\\\u[fF]{4}");
175 
176     /**
177      * Map of Pending Violations.
178      * Key: Line number of the violation.
179      * Value: List of literal AST nodes on that line pending validation.
180      */
181     private final Map<Integer, List<DetailAST>> pendingViolations = new HashMap<>();
182 
183     /** Allow use escapes for non-printable, control characters. */
184     private boolean allowEscapesForControlCharacters;
185 
186     /** Allow use escapes if trail comment is present. */
187     private boolean allowByTailComment;
188 
189     /** Allow if all characters in literal are escaped. */
190     private boolean allowIfAllCharactersEscaped;
191 
192     /** Allow use escapes for non-printable, whitespace characters. */
193     private boolean allowNonPrintableEscapes;
194 
195     /**
196      * Setter to allow use escapes for non-printable, control characters.
197      *
198      * @param allow user's value.
199      * @since 5.8
200      */
201     public final void setAllowEscapesForControlCharacters(boolean allow) {
202         allowEscapesForControlCharacters = allow;
203     }
204 
205     /**
206      * Setter to allow use escapes if trail comment is present.
207      *
208      * @param allow user's value.
209      * @since 5.8
210      */
211     public final void setAllowByTailComment(boolean allow) {
212         allowByTailComment = allow;
213     }
214 
215     /**
216      * Setter to allow if all characters in literal are escaped.
217      *
218      * @param allow user's value.
219      * @since 5.8
220      */
221     public final void setAllowIfAllCharactersEscaped(boolean allow) {
222         allowIfAllCharactersEscaped = allow;
223     }
224 
225     /**
226      * Setter to allow use escapes for non-printable, whitespace characters.
227      *
228      * @param allow user's value.
229      * @since 5.8
230      */
231     public final void setAllowNonPrintableEscapes(boolean allow) {
232         allowNonPrintableEscapes = allow;
233     }
234 
235     @Override
236     public int[] getDefaultTokens() {
237         return getRequiredTokens();
238     }
239 
240     @Override
241     public int[] getAcceptableTokens() {
242         return getRequiredTokens();
243     }
244 
245     @Override
246     public int[] getRequiredTokens() {
247         return new int[] {
248             TokenTypes.STRING_LITERAL,
249             TokenTypes.CHAR_LITERAL,
250             TokenTypes.TEXT_BLOCK_CONTENT,
251             TokenTypes.SINGLE_LINE_COMMENT,
252             TokenTypes.BLOCK_COMMENT_BEGIN,
253         };
254     }
255 
256     @Override
257     public boolean isCommentNodesRequired() {
258         return true;
259     }
260 
261     @Override
262     public void beginTree(DetailAST rootAST) {
263         pendingViolations.clear();
264     }
265 
266     @Override
267     public void visitToken(DetailAST ast) {
268         if (ast.getType() == TokenTypes.SINGLE_LINE_COMMENT
269                 || ast.getType() == TokenTypes.BLOCK_COMMENT_BEGIN) {
270             checkComment(ast);
271         }
272         else {
273             checkLiteral(ast);
274         }
275     }
276 
277     @Override
278     public void finishTree(DetailAST rootAST) {
279         for (List<DetailAST> asts : pendingViolations.values()) {
280             for (DetailAST ast : asts) {
281                 log(ast, MSG_KEY);
282             }
283         }
284     }
285 
286     /**
287      * Checks if the literal has Unicode char and should be reported.
288      * If violation is found, it is added to pendingViolations.
289      *
290      * @param ast literal token.
291      */
292     private void checkLiteral(DetailAST ast) {
293         final String literal =
294             CheckUtil.stripIndentAndInitialNewLineFromTextBlock(ast.getText());
295 
296         if (hasUnicodeChar(literal) && !(isAllCharactersEscaped(literal)
297                 || allowEscapesForControlCharacters
298                         && isOnlyUnicodeValidChars(literal, UNICODE_CONTROL)
299                 || allowNonPrintableEscapes
300                         && isOnlyUnicodeValidChars(literal, NON_PRINTABLE_CHARS))) {
301 
302             if (allowByTailComment) {
303                 int lineNo = ast.getLineNo();
304                 if (ast.getType() == TokenTypes.TEXT_BLOCK_CONTENT) {
305                     lineNo = ast.getNextSibling().getLineNo();
306                 }
307                 pendingViolations.computeIfAbsent(lineNo, key -> new ArrayList<>()).add(ast);
308             }
309             else {
310                 log(ast, MSG_KEY);
311             }
312         }
313     }
314 
315     /**
316      * Checks if a comment clears any pending violations on the same line.
317      *
318      * @param comment comment token.
319      */
320     private void checkComment(DetailAST comment) {
321         if (isTrailingComment(comment)) {
322             pendingViolations.remove(comment.getLineNo());
323         }
324     }
325 
326     /**
327      * Checks if a comment is trailing (has no code after it on the same line).
328      *
329      * @param commentNode the comment AST node
330      * @return true if it is trailing
331      */
332     private static boolean isTrailingComment(DetailAST commentNode) {
333         final DetailAST nextSibling = commentNode.getNextSibling();
334         return nextSibling == null || nextSibling.getLineNo() != commentNode.getLineNo();
335     }
336 
337     /**
338      * Checks if literal has Unicode chars.
339      *
340      * @param literal String literal.
341      * @return true if literal has Unicode chars.
342      */
343     private static boolean hasUnicodeChar(String literal) {
344         final String literalWithoutEscapedBackslashes =
345                 ESCAPED_BACKSLASH.matcher(literal).replaceAll("");
346         return UNICODE_REGEXP.matcher(literalWithoutEscapedBackslashes).find();
347     }
348 
349     /**
350      * Check if String literal contains Unicode control chars.
351      *
352      * @param literal String literal.
353      * @param pattern RegExp for valid characters.
354      * @return true, if String literal contains Unicode control chars.
355      */
356     private static boolean isOnlyUnicodeValidChars(String literal, Pattern pattern) {
357         final int unicodeMatchesCounter =
358                 countMatches(UNICODE_REGEXP, literal);
359         final int unicodeValidMatchesCounter =
360                 countMatches(pattern, literal);
361         return unicodeMatchesCounter - unicodeValidMatchesCounter == 0;
362     }
363 
364     /**
365      * Count regexp matches into String literal.
366      *
367      * @param pattern pattern.
368      * @param target String literal.
369      * @return count of regexp matches.
370      */
371     private static int countMatches(Pattern pattern, String target) {
372         int matcherCounter = 0;
373         final Matcher matcher = pattern.matcher(target);
374         while (matcher.find()) {
375             matcherCounter++;
376         }
377         return matcherCounter;
378     }
379 
380     /**
381      * Checks if all characters in String literal is escaped.
382      *
383      * @param literal current literal.
384      * @return true if all characters in String literal is escaped.
385      */
386     private boolean isAllCharactersEscaped(String literal) {
387         return allowIfAllCharactersEscaped
388                 && ALL_ESCAPED_CHARS.matcher(literal).find();
389     }
390 }