View Javadoc
1   ///////////////////////////////////////////////////////////////////////////////////////////////
2   // checkstyle: Checks Java source code and other text files for adherence to a set of rules.
3   // Copyright (C) 2001-2025 the original author or authors.
4   //
5   // This library is free software; you can redistribute it and/or
6   // modify it under the terms of the GNU Lesser General Public
7   // License as published by the Free Software Foundation; either
8   // version 2.1 of the License, or (at your option) any later version.
9   //
10  // This library is distributed in the hope that it will be useful,
11  // but WITHOUT ANY WARRANTY; without even the implied warranty of
12  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  // Lesser General Public License for more details.
14  //
15  // You should have received a copy of the GNU Lesser General Public
16  // License along with this library; if not, write to the Free Software
17  // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18  ///////////////////////////////////////////////////////////////////////////////////////////////
19  
20  package com.puppycrawl.tools.checkstyle.checks;
21  
22  import java.util.Arrays;
23  import java.util.List;
24  import java.util.Map;
25  import java.util.regex.Matcher;
26  import java.util.regex.Pattern;
27  
28  import com.puppycrawl.tools.checkstyle.FileStatefulCheck;
29  import com.puppycrawl.tools.checkstyle.api.AbstractCheck;
30  import com.puppycrawl.tools.checkstyle.api.DetailAST;
31  import com.puppycrawl.tools.checkstyle.api.TextBlock;
32  import com.puppycrawl.tools.checkstyle.api.TokenTypes;
33  import com.puppycrawl.tools.checkstyle.utils.CheckUtil;
34  import com.puppycrawl.tools.checkstyle.utils.CodePointUtil;
35  
36  /**
37   * <div>
38   * Restricts using
39   * <a href = "https://docs.oracle.com/javase/specs/jls/se11/html/jls-3.html#jls-3.3">
40   * Unicode escapes</a>
41   * (such as &#92;u221e). It is possible to allow using escapes for
42   * <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
43   * non-printable, control characters</a>.
44   * Also, this check can be configured to allow using escapes
45   * if trail comment is present. By the option it is possible to
46   * allow using escapes if literal contains only them.
47   * </div>
48   *
49   * @since 5.8
50   */
51  @FileStatefulCheck
52  public class AvoidEscapedUnicodeCharactersCheck
53      extends AbstractCheck {
54  
55      /**
56       * A key is pointing to the warning message text in "messages.properties"
57       * file.
58       */
59      public static final String MSG_KEY = "forbid.escaped.unicode.char";
60  
61      /** Regular expression for Unicode chars. */
62      private static final Pattern UNICODE_REGEXP = Pattern.compile("\\\\u+[a-fA-F\\d]{4}");
63  
64      /**
65       * Regular expression Unicode control characters.
66       *
67       * @see <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
68       *     Appendix:Control characters</a>
69       */
70      private static final Pattern UNICODE_CONTROL = Pattern.compile("\\\\u+"
71              + "(00[0-1][\\dA-Fa-f]"
72              + "|00[8-9][\\dA-Fa-f]"
73              + "|00[aA][dD]"
74              + "|034[fF]"
75              + "|070[fF]"
76              + "|180[eE]"
77              + "|200[b-fB-F]"
78              + "|202[a-eA-E]"
79              + "|206[0-4a-fA-F]"
80              + "|[fF]{3}[9a-bA-B]"
81              + "|[fF][eE][fF]{2})");
82  
83      /**
84       * Regular expression for all escaped chars.
85       * See <a href="https://docs.oracle.com/javase/specs/jls/se15/html/jls-3.html#jls-3.10.7">
86       * EscapeSequence</a>
87       */
88      private static final Pattern ALL_ESCAPED_CHARS = Pattern.compile("^("
89              + UNICODE_REGEXP.pattern()
90              + "|\""
91              + "|'"
92              + "|\\\\"
93              + "|\\\\b"
94              + "|\\\\f"
95              + "|\\\\n"
96              + "|\\R"
97              + "|\\\\r"
98              + "|\\\\s"
99              + "|\\\\t"
100             + ")+$");
101 
102     /** Regular expression for escaped backslash. */
103     private static final Pattern ESCAPED_BACKSLASH = Pattern.compile("\\\\\\\\");
104 
105     /** Regular expression for non-printable unicode chars. */
106     private static final Pattern NON_PRINTABLE_CHARS = Pattern.compile("\\\\u0000"
107             + "|\\\\u0009"
108             + "|\\\\u000[bB]"
109             + "|\\\\u000[cC]"
110             + "|\\\\u0020"
111             + "|\\\\u007[fF]"
112             + "|\\\\u0085"
113             + "|\\\\u009[fF]"
114             + "|\\\\u00[aA]0"
115             + "|\\\\u00[aA][dD]"
116             + "|\\\\u04[fF]9"
117             + "|\\\\u05[bB][eE]"
118             + "|\\\\u05[dD]0"
119             + "|\\\\u05[eE][aA]"
120             + "|\\\\u05[fF]3"
121             + "|\\\\u05[fF]4"
122             + "|\\\\u0600"
123             + "|\\\\u0604"
124             + "|\\\\u061[cC]"
125             + "|\\\\u06[dD]{2}"
126             + "|\\\\u06[fF]{2}"
127             + "|\\\\u070[fF]"
128             + "|\\\\u0750"
129             + "|\\\\u077[fF]"
130             + "|\\\\u0[eE]00"
131             + "|\\\\u0[eE]7[fF]"
132             + "|\\\\u1680"
133             + "|\\\\u180[eE]"
134             + "|\\\\u1[eE]00"
135             + "|\\\\u2000"
136             + "|\\\\u2001"
137             + "|\\\\u2002"
138             + "|\\\\u2003"
139             + "|\\\\u2004"
140             + "|\\\\u2005"
141             + "|\\\\u2006"
142             + "|\\\\u2007"
143             + "|\\\\u2008"
144             + "|\\\\u2009"
145             + "|\\\\u200[aA]"
146             + "|\\\\u200[fF]"
147             + "|\\\\u2025"
148             + "|\\\\u2028"
149             + "|\\\\u2029"
150             + "|\\\\u202[fF]"
151             + "|\\\\u205[fF]"
152             + "|\\\\u2064"
153             + "|\\\\u2066"
154             + "|\\\\u2067"
155             + "|\\\\u2068"
156             + "|\\\\u2069"
157             + "|\\\\u206[aA]"
158             + "|\\\\u206[fF]"
159             + "|\\\\u20[aA][fF]"
160             + "|\\\\u2100"
161             + "|\\\\u213[aA]"
162             + "|\\\\u3000"
163             + "|\\\\u[dD]800"
164             + "|\\\\u[fF]8[fF]{2}"
165             + "|\\\\u[fF][bB]50"
166             + "|\\\\u[fF][dD][fF]{2}"
167             + "|\\\\u[fF][eE]70"
168             + "|\\\\u[fF][eE][fF]{2}"
169             + "|\\\\u[fF]{2}0[eE]"
170             + "|\\\\u[fF]{2}61"
171             + "|\\\\u[fF]{2}[dD][cC]"
172             + "|\\\\u[fF]{3}9"
173             + "|\\\\u[fF]{3}[aA]"
174             + "|\\\\u[fF]{3}[bB]"
175             + "|\\\\u[fF]{4}");
176 
177     /** Cpp style comments. */
178     private Map<Integer, TextBlock> singlelineComments;
179     /** C style comments. */
180     private Map<Integer, List<TextBlock>> blockComments;
181 
182     /** Allow use escapes for non-printable, control characters. */
183     private boolean allowEscapesForControlCharacters;
184 
185     /** Allow use escapes if trail comment is present. */
186     private boolean allowByTailComment;
187 
188     /** Allow if all characters in literal are escaped. */
189     private boolean allowIfAllCharactersEscaped;
190 
191     /** Allow use escapes for non-printable, whitespace characters. */
192     private boolean allowNonPrintableEscapes;
193 
194     /**
195      * Setter to allow use escapes for non-printable, control characters.
196      *
197      * @param allow user's value.
198      * @since 5.8
199      */
200     public final void setAllowEscapesForControlCharacters(boolean allow) {
201         allowEscapesForControlCharacters = allow;
202     }
203 
204     /**
205      * Setter to allow use escapes if trail comment is present.
206      *
207      * @param allow user's value.
208      * @since 5.8
209      */
210     public final void setAllowByTailComment(boolean allow) {
211         allowByTailComment = allow;
212     }
213 
214     /**
215      * Setter to allow if all characters in literal are escaped.
216      *
217      * @param allow user's value.
218      * @since 5.8
219      */
220     public final void setAllowIfAllCharactersEscaped(boolean allow) {
221         allowIfAllCharactersEscaped = allow;
222     }
223 
224     /**
225      * Setter to allow use escapes for non-printable, whitespace characters.
226      *
227      * @param allow user's value.
228      * @since 5.8
229      */
230     public final void setAllowNonPrintableEscapes(boolean allow) {
231         allowNonPrintableEscapes = allow;
232     }
233 
234     @Override
235     public int[] getDefaultTokens() {
236         return getRequiredTokens();
237     }
238 
239     @Override
240     public int[] getAcceptableTokens() {
241         return getRequiredTokens();
242     }
243 
244     @Override
245     public int[] getRequiredTokens() {
246         return new int[] {
247             TokenTypes.STRING_LITERAL,
248             TokenTypes.CHAR_LITERAL,
249             TokenTypes.TEXT_BLOCK_CONTENT,
250         };
251     }
252 
253     // suppress deprecation until https://github.com/checkstyle/checkstyle/issues/11166
254     @Override
255     @SuppressWarnings("deprecation")
256     public void beginTree(DetailAST rootAST) {
257         singlelineComments = getFileContents().getSingleLineComments();
258         blockComments = getFileContents().getBlockComments();
259     }
260 
261     @Override
262     public void visitToken(DetailAST ast) {
263         final String literal =
264             CheckUtil.stripIndentAndInitialNewLineFromTextBlock(ast.getText());
265 
266         if (hasUnicodeChar(literal) && !(allowByTailComment && hasTrailComment(ast)
267                 || isAllCharactersEscaped(literal)
268                 || allowEscapesForControlCharacters
269                         && isOnlyUnicodeValidChars(literal, UNICODE_CONTROL)
270                 || allowNonPrintableEscapes
271                         && isOnlyUnicodeValidChars(literal, NON_PRINTABLE_CHARS))) {
272             log(ast, MSG_KEY);
273         }
274     }
275 
276     /**
277      * Checks if literal has Unicode chars.
278      *
279      * @param literal String literal.
280      * @return true if literal has Unicode chars.
281      */
282     private static boolean hasUnicodeChar(String literal) {
283         final String literalWithoutEscapedBackslashes =
284                 ESCAPED_BACKSLASH.matcher(literal).replaceAll("");
285         return UNICODE_REGEXP.matcher(literalWithoutEscapedBackslashes).find();
286     }
287 
288     /**
289      * Check if String literal contains Unicode control chars.
290      *
291      * @param literal String literal.
292      * @param pattern RegExp for valid characters.
293      * @return true, if String literal contains Unicode control chars.
294      */
295     private static boolean isOnlyUnicodeValidChars(String literal, Pattern pattern) {
296         final int unicodeMatchesCounter =
297                 countMatches(UNICODE_REGEXP, literal);
298         final int unicodeValidMatchesCounter =
299                 countMatches(pattern, literal);
300         return unicodeMatchesCounter - unicodeValidMatchesCounter == 0;
301     }
302 
303     /**
304      * Check if trail comment is present after ast token.
305      *
306      * @param ast current token.
307      * @return true if trail comment is present after ast token.
308      */
309     private boolean hasTrailComment(DetailAST ast) {
310         int lineNo = ast.getLineNo();
311 
312         // Since the trailing comment in the case of text blocks must follow the """ delimiter,
313         // we need to look for it after TEXT_BLOCK_LITERAL_END.
314         if (ast.getType() == TokenTypes.TEXT_BLOCK_CONTENT) {
315             lineNo = ast.getNextSibling().getLineNo();
316         }
317         boolean result = false;
318         if (singlelineComments.containsKey(lineNo)) {
319             result = true;
320         }
321         else {
322             final List<TextBlock> commentList = blockComments.get(lineNo);
323             if (commentList != null) {
324                 final TextBlock comment = commentList.get(commentList.size() - 1);
325                 final int[] codePoints = getLineCodePoints(lineNo - 1);
326                 result = isTrailingBlockComment(comment, codePoints);
327             }
328         }
329         return result;
330     }
331 
332     /**
333      * Whether the C style comment is trailing.
334      *
335      * @param comment the comment to check.
336      * @param codePoints the first line of the comment, in unicode code points
337      * @return true if the comment is trailing.
338      */
339     private static boolean isTrailingBlockComment(TextBlock comment, int... codePoints) {
340         return comment.getText().length != 1
341             || CodePointUtil.isBlank(Arrays.copyOfRange(codePoints,
342                 comment.getEndColNo() + 1, codePoints.length));
343     }
344 
345     /**
346      * Count regexp matches into String literal.
347      *
348      * @param pattern pattern.
349      * @param target String literal.
350      * @return count of regexp matches.
351      */
352     private static int countMatches(Pattern pattern, String target) {
353         int matcherCounter = 0;
354         final Matcher matcher = pattern.matcher(target);
355         while (matcher.find()) {
356             matcherCounter++;
357         }
358         return matcherCounter;
359     }
360 
361     /**
362      * Checks if all characters in String literal is escaped.
363      *
364      * @param literal current literal.
365      * @return true if all characters in String literal is escaped.
366      */
367     private boolean isAllCharactersEscaped(String literal) {
368         return allowIfAllCharactersEscaped
369                 && ALL_ESCAPED_CHARS.matcher(literal).find();
370     }
371 
372 }