///////////////////////////////////////////////////////////////////////////////////////////////
// checkstyle: Checks Java source code and other text files for adherence to a set of rules.
// Copyright (C) 2001-2024 the original author or authors.
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
///////////////////////////////////////////////////////////////////////////////////////////////
19  
20  package com.puppycrawl.tools.checkstyle.checks;
21  
22  import java.util.Arrays;
23  import java.util.List;
24  import java.util.Map;
25  import java.util.regex.Matcher;
26  import java.util.regex.Pattern;
27  
28  import com.puppycrawl.tools.checkstyle.FileStatefulCheck;
29  import com.puppycrawl.tools.checkstyle.api.AbstractCheck;
30  import com.puppycrawl.tools.checkstyle.api.DetailAST;
31  import com.puppycrawl.tools.checkstyle.api.TextBlock;
32  import com.puppycrawl.tools.checkstyle.api.TokenTypes;
33  import com.puppycrawl.tools.checkstyle.utils.CheckUtil;
34  import com.puppycrawl.tools.checkstyle.utils.CodePointUtil;
35  
36  /**
37   * <div>
38   * Restricts using
39   * <a href = "https://docs.oracle.com/javase/specs/jls/se11/html/jls-3.html#jls-3.3">
40   * Unicode escapes</a>
41   * (such as &#92;u221e). It is possible to allow using escapes for
42   * <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
43   * non-printable, control characters</a>.
44   * Also, this check can be configured to allow using escapes
45   * if trail comment is present. By the option it is possible to
46   * allow using escapes if literal contains only them.
47   * </div>
48   *
49   * <ul>
50   * <li>
51   * Property {@code allowByTailComment} - Allow use escapes if trail comment is present.
52   * Type is {@code boolean}.
53   * Default value is {@code false}.
54   * </li>
55   * <li>
56   * Property {@code allowEscapesForControlCharacters} - Allow use escapes for
57   * non-printable, control characters.
58   * Type is {@code boolean}.
59   * Default value is {@code false}.
60   * </li>
61   * <li>
62   * Property {@code allowIfAllCharactersEscaped} - Allow if all characters in literal are escaped.
63   * Type is {@code boolean}.
64   * Default value is {@code false}.
65   * </li>
66   * <li>
67   * Property {@code allowNonPrintableEscapes} - Allow use escapes for
68   * non-printable, whitespace characters.
69   * Type is {@code boolean}.
70   * Default value is {@code false}.
71   * </li>
72   * </ul>
73   *
74   * <p>
75   * Parent is {@code com.puppycrawl.tools.checkstyle.TreeWalker}
76   * </p>
77   *
78   * <p>
79   * Violation Message Keys:
80   * </p>
81   * <ul>
82   * <li>
83   * {@code forbid.escaped.unicode.char}
84   * </li>
85   * </ul>
86   *
87   * @since 5.8
88   */
89  @FileStatefulCheck
90  public class AvoidEscapedUnicodeCharactersCheck
91      extends AbstractCheck {
92  
93      /**
94       * A key is pointing to the warning message text in "messages.properties"
95       * file.
96       */
97      public static final String MSG_KEY = "forbid.escaped.unicode.char";
98  
99      /** Regular expression for Unicode chars. */
100     private static final Pattern UNICODE_REGEXP = Pattern.compile("\\\\u+[a-fA-F\\d]{4}");
101 
102     /**
103      * Regular expression Unicode control characters.
104      *
105      * @see <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
106      *     Appendix:Control characters</a>
107      */
108     private static final Pattern UNICODE_CONTROL = Pattern.compile("\\\\u+"
109             + "(00[0-1][\\dA-Fa-f]"
110             + "|00[8-9][\\dA-Fa-f]"
111             + "|00[aA][dD]"
112             + "|034[fF]"
113             + "|070[fF]"
114             + "|180[eE]"
115             + "|200[b-fB-F]"
116             + "|202[a-eA-E]"
117             + "|206[0-4a-fA-F]"
118             + "|[fF]{3}[9a-bA-B]"
119             + "|[fF][eE][fF]{2})");
120 
121     /**
122      * Regular expression for all escaped chars.
123      * See <a href="https://docs.oracle.com/javase/specs/jls/se15/html/jls-3.html#jls-3.10.7">
124      * EscapeSequence</a>
125      */
126     private static final Pattern ALL_ESCAPED_CHARS = Pattern.compile("^("
127             + UNICODE_REGEXP.pattern()
128             + "|\""
129             + "|'"
130             + "|\\\\"
131             + "|\\\\b"
132             + "|\\\\f"
133             + "|\\\\n"
134             + "|\\R"
135             + "|\\\\r"
136             + "|\\\\s"
137             + "|\\\\t"
138             + ")+$");
139 
140     /** Regular expression for escaped backslash. */
141     private static final Pattern ESCAPED_BACKSLASH = Pattern.compile("\\\\\\\\");
142 
143     /** Regular expression for non-printable unicode chars. */
144     private static final Pattern NON_PRINTABLE_CHARS = Pattern.compile("\\\\u0000"
145             + "|\\\\u0009"
146             + "|\\\\u000[bB]"
147             + "|\\\\u000[cC]"
148             + "|\\\\u0020"
149             + "|\\\\u007[fF]"
150             + "|\\\\u0085"
151             + "|\\\\u009[fF]"
152             + "|\\\\u00[aA]0"
153             + "|\\\\u00[aA][dD]"
154             + "|\\\\u04[fF]9"
155             + "|\\\\u05[bB][eE]"
156             + "|\\\\u05[dD]0"
157             + "|\\\\u05[eE][aA]"
158             + "|\\\\u05[fF]3"
159             + "|\\\\u05[fF]4"
160             + "|\\\\u0600"
161             + "|\\\\u0604"
162             + "|\\\\u061[cC]"
163             + "|\\\\u06[dD]{2}"
164             + "|\\\\u06[fF]{2}"
165             + "|\\\\u070[fF]"
166             + "|\\\\u0750"
167             + "|\\\\u077[fF]"
168             + "|\\\\u0[eE]00"
169             + "|\\\\u0[eE]7[fF]"
170             + "|\\\\u1680"
171             + "|\\\\u180[eE]"
172             + "|\\\\u1[eE]00"
173             + "|\\\\u2000"
174             + "|\\\\u2001"
175             + "|\\\\u2002"
176             + "|\\\\u2003"
177             + "|\\\\u2004"
178             + "|\\\\u2005"
179             + "|\\\\u2006"
180             + "|\\\\u2007"
181             + "|\\\\u2008"
182             + "|\\\\u2009"
183             + "|\\\\u200[aA]"
184             + "|\\\\u200[fF]"
185             + "|\\\\u2025"
186             + "|\\\\u2028"
187             + "|\\\\u2029"
188             + "|\\\\u202[fF]"
189             + "|\\\\u205[fF]"
190             + "|\\\\u2064"
191             + "|\\\\u2066"
192             + "|\\\\u2067"
193             + "|\\\\u2068"
194             + "|\\\\u2069"
195             + "|\\\\u206[aA]"
196             + "|\\\\u206[fF]"
197             + "|\\\\u20[aA][fF]"
198             + "|\\\\u2100"
199             + "|\\\\u213[aA]"
200             + "|\\\\u3000"
201             + "|\\\\u[dD]800"
202             + "|\\\\u[fF]8[fF]{2}"
203             + "|\\\\u[fF][bB]50"
204             + "|\\\\u[fF][dD][fF]{2}"
205             + "|\\\\u[fF][eE]70"
206             + "|\\\\u[fF][eE][fF]{2}"
207             + "|\\\\u[fF]{2}0[eE]"
208             + "|\\\\u[fF]{2}61"
209             + "|\\\\u[fF]{2}[dD][cC]"
210             + "|\\\\u[fF]{3}9"
211             + "|\\\\u[fF]{3}[aA]"
212             + "|\\\\u[fF]{3}[bB]"
213             + "|\\\\u[fF]{4}");
214 
215     /** Cpp style comments. */
216     private Map<Integer, TextBlock> singlelineComments;
217     /** C style comments. */
218     private Map<Integer, List<TextBlock>> blockComments;
219 
220     /** Allow use escapes for non-printable, control characters. */
221     private boolean allowEscapesForControlCharacters;
222 
223     /** Allow use escapes if trail comment is present. */
224     private boolean allowByTailComment;
225 
226     /** Allow if all characters in literal are escaped. */
227     private boolean allowIfAllCharactersEscaped;
228 
229     /** Allow use escapes for non-printable, whitespace characters. */
230     private boolean allowNonPrintableEscapes;
231 
232     /**
233      * Setter to allow use escapes for non-printable, control characters.
234      *
235      * @param allow user's value.
236      * @since 5.8
237      */
238     public final void setAllowEscapesForControlCharacters(boolean allow) {
239         allowEscapesForControlCharacters = allow;
240     }
241 
242     /**
243      * Setter to allow use escapes if trail comment is present.
244      *
245      * @param allow user's value.
246      * @since 5.8
247      */
248     public final void setAllowByTailComment(boolean allow) {
249         allowByTailComment = allow;
250     }
251 
252     /**
253      * Setter to allow if all characters in literal are escaped.
254      *
255      * @param allow user's value.
256      * @since 5.8
257      */
258     public final void setAllowIfAllCharactersEscaped(boolean allow) {
259         allowIfAllCharactersEscaped = allow;
260     }
261 
262     /**
263      * Setter to allow use escapes for non-printable, whitespace characters.
264      *
265      * @param allow user's value.
266      * @since 5.8
267      */
268     public final void setAllowNonPrintableEscapes(boolean allow) {
269         allowNonPrintableEscapes = allow;
270     }
271 
272     @Override
273     public int[] getDefaultTokens() {
274         return getRequiredTokens();
275     }
276 
277     @Override
278     public int[] getAcceptableTokens() {
279         return getRequiredTokens();
280     }
281 
282     @Override
283     public int[] getRequiredTokens() {
284         return new int[] {
285             TokenTypes.STRING_LITERAL,
286             TokenTypes.CHAR_LITERAL,
287             TokenTypes.TEXT_BLOCK_CONTENT,
288         };
289     }
290 
291     // suppress deprecation until https://github.com/checkstyle/checkstyle/issues/11166
292     @SuppressWarnings("deprecation")
293     @Override
294     public void beginTree(DetailAST rootAST) {
295         singlelineComments = getFileContents().getSingleLineComments();
296         blockComments = getFileContents().getBlockComments();
297     }
298 
299     @Override
300     public void visitToken(DetailAST ast) {
301         final String literal =
302             CheckUtil.stripIndentAndInitialNewLineFromTextBlock(ast.getText());
303 
304         if (hasUnicodeChar(literal) && !(allowByTailComment && hasTrailComment(ast)
305                 || isAllCharactersEscaped(literal)
306                 || allowEscapesForControlCharacters
307                         && isOnlyUnicodeValidChars(literal, UNICODE_CONTROL)
308                 || allowNonPrintableEscapes
309                         && isOnlyUnicodeValidChars(literal, NON_PRINTABLE_CHARS))) {
310             log(ast, MSG_KEY);
311         }
312     }
313 
314     /**
315      * Checks if literal has Unicode chars.
316      *
317      * @param literal String literal.
318      * @return true if literal has Unicode chars.
319      */
320     private static boolean hasUnicodeChar(String literal) {
321         final String literalWithoutEscapedBackslashes =
322                 ESCAPED_BACKSLASH.matcher(literal).replaceAll("");
323         return UNICODE_REGEXP.matcher(literalWithoutEscapedBackslashes).find();
324     }
325 
326     /**
327      * Check if String literal contains Unicode control chars.
328      *
329      * @param literal String literal.
330      * @param pattern RegExp for valid characters.
331      * @return true, if String literal contains Unicode control chars.
332      */
333     private static boolean isOnlyUnicodeValidChars(String literal, Pattern pattern) {
334         final int unicodeMatchesCounter =
335                 countMatches(UNICODE_REGEXP, literal);
336         final int unicodeValidMatchesCounter =
337                 countMatches(pattern, literal);
338         return unicodeMatchesCounter - unicodeValidMatchesCounter == 0;
339     }
340 
341     /**
342      * Check if trail comment is present after ast token.
343      *
344      * @param ast current token.
345      * @return true if trail comment is present after ast token.
346      */
347     private boolean hasTrailComment(DetailAST ast) {
348         int lineNo = ast.getLineNo();
349 
350         // Since the trailing comment in the case of text blocks must follow the """ delimiter,
351         // we need to look for it after TEXT_BLOCK_LITERAL_END.
352         if (ast.getType() == TokenTypes.TEXT_BLOCK_CONTENT) {
353             lineNo = ast.getNextSibling().getLineNo();
354         }
355         boolean result = false;
356         if (singlelineComments.containsKey(lineNo)) {
357             result = true;
358         }
359         else {
360             final List<TextBlock> commentList = blockComments.get(lineNo);
361             if (commentList != null) {
362                 final TextBlock comment = commentList.get(commentList.size() - 1);
363                 final int[] codePoints = getLineCodePoints(lineNo - 1);
364                 result = isTrailingBlockComment(comment, codePoints);
365             }
366         }
367         return result;
368     }
369 
370     /**
371      * Whether the C style comment is trailing.
372      *
373      * @param comment the comment to check.
374      * @param codePoints the first line of the comment, in unicode code points
375      * @return true if the comment is trailing.
376      */
377     private static boolean isTrailingBlockComment(TextBlock comment, int... codePoints) {
378         return comment.getText().length != 1
379             || CodePointUtil.isBlank(Arrays.copyOfRange(codePoints,
380                 comment.getEndColNo() + 1, codePoints.length));
381     }
382 
383     /**
384      * Count regexp matches into String literal.
385      *
386      * @param pattern pattern.
387      * @param target String literal.
388      * @return count of regexp matches.
389      */
390     private static int countMatches(Pattern pattern, String target) {
391         int matcherCounter = 0;
392         final Matcher matcher = pattern.matcher(target);
393         while (matcher.find()) {
394             matcherCounter++;
395         }
396         return matcherCounter;
397     }
398 
399     /**
400      * Checks if all characters in String literal is escaped.
401      *
402      * @param literal current literal.
403      * @return true if all characters in String literal is escaped.
404      */
405     private boolean isAllCharactersEscaped(String literal) {
406         return allowIfAllCharactersEscaped
407                 && ALL_ESCAPED_CHARS.matcher(literal).find();
408     }
409 
410 }