View Javadoc
1   ///////////////////////////////////////////////////////////////////////////////////////////////
2   // checkstyle: Checks Java source code and other text files for adherence to a set of rules.
3   // Copyright (C) 2001-2024 the original author or authors.
4   //
5   // This library is free software; you can redistribute it and/or
6   // modify it under the terms of the GNU Lesser General Public
7   // License as published by the Free Software Foundation; either
8   // version 2.1 of the License, or (at your option) any later version.
9   //
10  // This library is distributed in the hope that it will be useful,
11  // but WITHOUT ANY WARRANTY; without even the implied warranty of
12  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  // Lesser General Public License for more details.
14  //
15  // You should have received a copy of the GNU Lesser General Public
16  // License along with this library; if not, write to the Free Software
17  // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18  ///////////////////////////////////////////////////////////////////////////////////////////////
19  
20  package com.puppycrawl.tools.checkstyle.checks;
21  
22  import java.util.Arrays;
23  import java.util.List;
24  import java.util.Map;
25  import java.util.regex.Matcher;
26  import java.util.regex.Pattern;
27  
28  import com.puppycrawl.tools.checkstyle.FileStatefulCheck;
29  import com.puppycrawl.tools.checkstyle.api.AbstractCheck;
30  import com.puppycrawl.tools.checkstyle.api.DetailAST;
31  import com.puppycrawl.tools.checkstyle.api.TextBlock;
32  import com.puppycrawl.tools.checkstyle.api.TokenTypes;
33  import com.puppycrawl.tools.checkstyle.utils.CheckUtil;
34  import com.puppycrawl.tools.checkstyle.utils.CodePointUtil;
35  
36  /**
37   * <p>
38   * Restricts using
39   * <a href = "https://docs.oracle.com/javase/specs/jls/se11/html/jls-3.html#jls-3.3">
40   * Unicode escapes</a>
41   * (such as &#92;u221e). It is possible to allow using escapes for
42   * <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
43   * non-printable, control characters</a>.
44   * Also, this check can be configured to allow using escapes
45   * if a trailing comment is present. Another option makes it possible to
46   * allow escapes if the literal consists only of escaped characters.
47   * </p>
48   * <ul>
49   * <li>
50   * Property {@code allowByTailComment} - Allow use escapes if trail comment is present.
51   * Type is {@code boolean}.
52   * Default value is {@code false}.
53   * </li>
54   * <li>
55   * Property {@code allowEscapesForControlCharacters} - Allow use escapes for
56   * non-printable, control characters.
57   * Type is {@code boolean}.
58   * Default value is {@code false}.
59   * </li>
60   * <li>
61   * Property {@code allowIfAllCharactersEscaped} - Allow if all characters in literal are escaped.
62   * Type is {@code boolean}.
63   * Default value is {@code false}.
64   * </li>
65   * <li>
66   * Property {@code allowNonPrintableEscapes} - Allow use escapes for
67   * non-printable, whitespace characters.
68   * Type is {@code boolean}.
69   * Default value is {@code false}.
70   * </li>
71   * </ul>
72   * <p>
73   * Parent is {@code com.puppycrawl.tools.checkstyle.TreeWalker}
74   * </p>
75   * <p>
76   * Violation Message Keys:
77   * </p>
78   * <ul>
79   * <li>
80   * {@code forbid.escaped.unicode.char}
81   * </li>
82   * </ul>
83   *
84   * @since 5.8
85   */
86  @FileStatefulCheck
87  public class AvoidEscapedUnicodeCharactersCheck
88      extends AbstractCheck {
89  
90      /**
91       * Key of the violation message text in the "messages.properties"
92       * file; it is passed to {@code log} when a violation is reported.
93       */
94      public static final String MSG_KEY = "forbid.escaped.unicode.char";
95  
96      /**
         * Regular expression for a Unicode escape: a backslash, one or more
         * {@code u} characters and exactly four hexadecimal digits.
         */
97      private static final Pattern UNICODE_REGEXP = Pattern.compile("\\\\u+[a-fA-F\\d]{4}");
98  
99      /**
100      * Regular expression for Unicode escapes of control characters.
         * The code points matched are 0000-001F, 0080-009F, 00AD, 034F, 070F, 180E,
         * 200B-200F, 202A-202E, 2060-2064, 206A-206F, FFF9-FFFB and FEFF.
         * One or more {@code u} characters are accepted after the backslash.
101      *
102      * @see <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
103      *     Appendix:Control characters</a>
104      */
105     private static final Pattern UNICODE_CONTROL = Pattern.compile("\\\\u+"
106             + "(00[0-1][\\dA-Fa-f]"
107             + "|00[8-9][\\dA-Fa-f]"
108             + "|00[aA][dD]"
109             + "|034[fF]"
110             + "|070[fF]"
111             + "|180[eE]"
112             + "|200[b-fB-F]"
113             + "|202[a-eA-E]"
114             + "|206[0-4a-fA-F]"
115             + "|[fF]{3}[9a-bA-B]"
116             + "|[fF][eE][fF]{2})");
117 
118     /**
119      * Regular expression for a literal that consists only of escaped chars:
         * Unicode escapes, quote characters, backslashes, line breaks and the
         * {@code b}, {@code f}, {@code n}, {@code r}, {@code s}, {@code t} escapes.
         * The expression is anchored, so one or more of these alternatives must
         * cover the whole input.
120      * See <a href="https://docs.oracle.com/javase/specs/jls/se15/html/jls-3.html#jls-3.10.7">
121      * EscapeSequence</a>
122      */
123     private static final Pattern ALL_ESCAPED_CHARS = Pattern.compile("^("
124             + UNICODE_REGEXP.pattern()
125             + "|\""
126             + "|'"
127             + "|\\\\"
128             + "|\\\\b"
129             + "|\\\\f"
130             + "|\\\\n"
131             + "|\\R"
132             + "|\\\\r"
133             + "|\\\\s"
134             + "|\\\\t"
135             + ")+$");
136 
137     /** Regular expression for an escaped backslash, i.e. two consecutive backslashes. */
138     private static final Pattern ESCAPED_BACKSLASH = Pattern.compile("\\\\\\\\");
139 
140     /** Regular expression for non-printable unicode chars. */
141     private static final Pattern NON_PRINTABLE_CHARS = Pattern.compile("\\\\u0000"
142             + "|\\\\u0009"
143             + "|\\\\u000[bB]"
144             + "|\\\\u000[cC]"
145             + "|\\\\u0020"
146             + "|\\\\u007[fF]"
147             + "|\\\\u0085"
148             + "|\\\\u009[fF]"
149             + "|\\\\u00[aA]0"
150             + "|\\\\u00[aA][dD]"
151             + "|\\\\u04[fF]9"
152             + "|\\\\u05[bB][eE]"
153             + "|\\\\u05[dD]0"
154             + "|\\\\u05[eE][aA]"
155             + "|\\\\u05[fF]3"
156             + "|\\\\u05[fF]4"
157             + "|\\\\u0600"
158             + "|\\\\u0604"
159             + "|\\\\u061[cC]"
160             + "|\\\\u06[dD]{2}"
161             + "|\\\\u06[fF]{2}"
162             + "|\\\\u070[fF]"
163             + "|\\\\u0750"
164             + "|\\\\u077[fF]"
165             + "|\\\\u0[eE]00"
166             + "|\\\\u0[eE]7[fF]"
167             + "|\\\\u1680"
168             + "|\\\\u180[eE]"
169             + "|\\\\u1[eE]00"
170             + "|\\\\u2000"
171             + "|\\\\u2001"
172             + "|\\\\u2002"
173             + "|\\\\u2003"
174             + "|\\\\u2004"
175             + "|\\\\u2005"
176             + "|\\\\u2006"
177             + "|\\\\u2007"
178             + "|\\\\u2008"
179             + "|\\\\u2009"
180             + "|\\\\u200[aA]"
181             + "|\\\\u200[fF]"
182             + "|\\\\u2025"
183             + "|\\\\u2028"
184             + "|\\\\u2029"
185             + "|\\\\u202[fF]"
186             + "|\\\\u205[fF]"
187             + "|\\\\u2064"
188             + "|\\\\u2066"
189             + "|\\\\u2067"
190             + "|\\\\u2068"
191             + "|\\\\u2069"
192             + "|\\\\u206[aA]"
193             + "|\\\\u206[fF]"
194             + "|\\\\u20[aA][fF]"
195             + "|\\\\u2100"
196             + "|\\\\u213[aA]"
197             + "|\\\\u3000"
198             + "|\\\\u[dD]800"
199             + "|\\\\u[fF]8[fF]{2}"
200             + "|\\\\u[fF][bB]50"
201             + "|\\\\u[fF][dD][fF]{2}"
202             + "|\\\\u[fF][eE]70"
203             + "|\\\\u[fF][eE][fF]{2}"
204             + "|\\\\u[fF]{2}0[eE]"
205             + "|\\\\u[fF]{2}61"
206             + "|\\\\u[fF]{2}[dD][cC]"
207             + "|\\\\u[fF]{3}9"
208             + "|\\\\u[fF]{3}[aA]"
209             + "|\\\\u[fF]{3}[bB]"
210             + "|\\\\u[fF]{4}");
211 
212     /** Cpp style (single-line) comments, looked up by line number; set in beginTree. */
213     private Map<Integer, TextBlock> singlelineComments;
214     /** C style (block) comments, looked up by line number; set in beginTree. */
215     private Map<Integer, List<TextBlock>> blockComments;
216 
217     /** Allow use escapes for non-printable, control characters. */
218     private boolean allowEscapesForControlCharacters;
219 
220     /** Allow use escapes if trail comment is present. */
221     private boolean allowByTailComment;
222 
223     /** Allow if all characters in literal are escaped. */
224     private boolean allowIfAllCharactersEscaped;
225 
226     /** Allow use escapes for non-printable, whitespace characters. */
227     private boolean allowNonPrintableEscapes;
228 
229     /**
230      * Setter to allow use escapes for non-printable, control characters.
231      *
232      * @param allow user's value.
233      * @since 5.8
234      */
235     public final void setAllowEscapesForControlCharacters(boolean allow) {
236         allowEscapesForControlCharacters = allow;
237     }
238 
239     /**
240      * Setter to allow use escapes if trail comment is present.
241      *
242      * @param allow user's value.
243      * @since 5.8
244      */
245     public final void setAllowByTailComment(boolean allow) {
246         allowByTailComment = allow;
247     }
248 
249     /**
250      * Setter to allow if all characters in literal are escaped.
251      *
252      * @param allow user's value.
253      * @since 5.8
254      */
255     public final void setAllowIfAllCharactersEscaped(boolean allow) {
256         allowIfAllCharactersEscaped = allow;
257     }
258 
259     /**
260      * Setter to allow use escapes for non-printable, whitespace characters.
261      *
262      * @param allow user's value.
263      * @since 5.8
264      */
265     public final void setAllowNonPrintableEscapes(boolean allow) {
266         allowNonPrintableEscapes = allow;
267     }
268 
269     @Override
270     public int[] getDefaultTokens() {
            // The default tokens are exactly the required ones.
271         return getRequiredTokens();
272     }
273 
274     @Override
275     public int[] getAcceptableTokens() {
            // The acceptable tokens are exactly the required ones.
276         return getRequiredTokens();
277     }
278 
279     @Override
280     public int[] getRequiredTokens() {
            // The check inspects string literals, char literals and text block contents.
281         return new int[] {
282             TokenTypes.STRING_LITERAL,
283             TokenTypes.CHAR_LITERAL,
284             TokenTypes.TEXT_BLOCK_CONTENT,
285         };
286     }
287 
288     // suppress deprecation until https://github.com/checkstyle/checkstyle/issues/11166
289     @SuppressWarnings("deprecation")
290     @Override
291     public void beginTree(DetailAST rootAST) {
            // Store the comments of the file being processed; hasTrailComment
            // consults both maps by line number.
292         singlelineComments = getFileContents().getSingleLineComments();
293         blockComments = getFileContents().getBlockComments();
294     }
295 
296     @Override
297     public void visitToken(DetailAST ast) {
298         final String literal =
299             CheckUtil.stripIndentAndInitialNewLineFromTextBlock(ast.getText());
300 
301         if (hasUnicodeChar(literal) && !(allowByTailComment && hasTrailComment(ast)
302                 || isAllCharactersEscaped(literal)
303                 || allowEscapesForControlCharacters
304                         && isOnlyUnicodeValidChars(literal, UNICODE_CONTROL)
305                 || allowNonPrintableEscapes
306                         && isOnlyUnicodeValidChars(literal, NON_PRINTABLE_CHARS))) {
307             log(ast, MSG_KEY);
308         }
309     }
310 
311     /**
312      * Checks if literal has Unicode chars.
313      *
314      * @param literal String literal.
315      * @return true if literal has Unicode chars.
316      */
317     private static boolean hasUnicodeChar(String literal) {
318         final String literalWithoutEscapedBackslashes =
319                 ESCAPED_BACKSLASH.matcher(literal).replaceAll("");
320         return UNICODE_REGEXP.matcher(literalWithoutEscapedBackslashes).find();
321     }
322 
323     /**
324      * Check if String literal contains Unicode control chars.
325      *
326      * @param literal String literal.
327      * @param pattern RegExp for valid characters.
328      * @return true, if String literal contains Unicode control chars.
329      */
330     private static boolean isOnlyUnicodeValidChars(String literal, Pattern pattern) {
331         final int unicodeMatchesCounter =
332                 countMatches(UNICODE_REGEXP, literal);
333         final int unicodeValidMatchesCounter =
334                 countMatches(pattern, literal);
335         return unicodeMatchesCounter - unicodeValidMatchesCounter == 0;
336     }
337 
338     /**
339      * Check if trail comment is present after ast token.
340      *
341      * @param ast current token.
342      * @return true if trail comment is present after ast token.
343      */
344     private boolean hasTrailComment(DetailAST ast) {
345         int lineNo = ast.getLineNo();
346 
347         // Since the trailing comment in the case of text blocks must follow the """ delimiter,
348         // we need to look for it after TEXT_BLOCK_LITERAL_END.
349         if (ast.getType() == TokenTypes.TEXT_BLOCK_CONTENT) {
350             lineNo = ast.getNextSibling().getLineNo();
351         }
352         boolean result = false;
353         if (singlelineComments.containsKey(lineNo)) {
354             result = true;
355         }
356         else {
357             final List<TextBlock> commentList = blockComments.get(lineNo);
358             if (commentList != null) {
359                 final TextBlock comment = commentList.get(commentList.size() - 1);
360                 final int[] codePoints = getLineCodePoints(lineNo - 1);
361                 result = isTrailingBlockComment(comment, codePoints);
362             }
363         }
364         return result;
365     }
366 
367     /**
368      * Whether the C style comment is trailing.
369      *
370      * @param comment the comment to check.
371      * @param codePoints the first line of the comment, in unicode code points
372      * @return true if the comment is trailing.
373      */
374     private static boolean isTrailingBlockComment(TextBlock comment, int... codePoints) {
375         return comment.getText().length != 1
376             || CodePointUtil.isBlank(Arrays.copyOfRange(codePoints,
377                 comment.getEndColNo() + 1, codePoints.length));
378     }
379 
380     /**
381      * Count regexp matches into String literal.
382      *
383      * @param pattern pattern.
384      * @param target String literal.
385      * @return count of regexp matches.
386      */
387     private static int countMatches(Pattern pattern, String target) {
388         int matcherCounter = 0;
389         final Matcher matcher = pattern.matcher(target);
390         while (matcher.find()) {
391             matcherCounter++;
392         }
393         return matcherCounter;
394     }
395 
396     /**
397      * Checks if all characters in String literal is escaped.
398      *
399      * @param literal current literal.
400      * @return true if all characters in String literal is escaped.
401      */
402     private boolean isAllCharactersEscaped(String literal) {
403         return allowIfAllCharactersEscaped
404                 && ALL_ESCAPED_CHARS.matcher(literal).find();
405     }
406 
407 }