001////////////////////////////////////////////////////////////////////////////////
002// checkstyle: Checks Java source code for adherence to a set of rules.
003// Copyright (C) 2001-2021 the original author or authors.
004//
005// This library is free software; you can redistribute it and/or
006// modify it under the terms of the GNU Lesser General Public
007// License as published by the Free Software Foundation; either
008// version 2.1 of the License, or (at your option) any later version.
009//
010// This library is distributed in the hope that it will be useful,
011// but WITHOUT ANY WARRANTY; without even the implied warranty of
012// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
013// Lesser General Public License for more details.
014//
015// You should have received a copy of the GNU Lesser General Public
016// License along with this library; if not, write to the Free Software
017// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
018////////////////////////////////////////////////////////////////////////////////
019
020package com.puppycrawl.tools.checkstyle.checks;
021
022import java.util.List;
023import java.util.Map;
024import java.util.regex.Matcher;
025import java.util.regex.Pattern;
026
027import com.puppycrawl.tools.checkstyle.FileStatefulCheck;
028import com.puppycrawl.tools.checkstyle.api.AbstractCheck;
029import com.puppycrawl.tools.checkstyle.api.DetailAST;
030import com.puppycrawl.tools.checkstyle.api.TextBlock;
031import com.puppycrawl.tools.checkstyle.api.TokenTypes;
032import com.puppycrawl.tools.checkstyle.utils.CheckUtil;
033import com.puppycrawl.tools.checkstyle.utils.CommonUtil;
034
035/**
036 * <p>
037 * Restricts using
038 * <a href = "https://docs.oracle.com/javase/specs/jls/se11/html/jls-3.html#jls-3.3">
039 * Unicode escapes</a>
040 * (such as &#92;u221e). It is possible to allow using escapes for
041 * <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
042 * non-printable, control characters</a>.
043 * Also, this check can be configured to allow using escapes
044 * if trail comment is present. By the option it is possible to
045 * allow using escapes if literal contains only them.
046 * </p>
047 * <ul>
048 * <li>
049 * Property {@code allowEscapesForControlCharacters} - Allow use escapes for
050 * non-printable, control characters.
051 * Type is {@code boolean}.
052 * Default value is {@code false}.
053 * </li>
054 * <li>
055 * Property {@code allowByTailComment} - Allow use escapes if trail comment is present.
056 * Type is {@code boolean}.
057 * Default value is {@code false}.
058 * </li>
059 * <li>
060 * Property {@code allowIfAllCharactersEscaped} - Allow if all characters in literal are escaped.
061 * Type is {@code boolean}.
062 * Default value is {@code false}.
063 * </li>
064 * <li>
065 * Property {@code allowNonPrintableEscapes} - Allow use escapes for
066 * non-printable, whitespace characters.
067 * Type is {@code boolean}.
068 * Default value is {@code false}.
069 * </li>
070 * </ul>
071 * <p>
072 * To configure the check:
073 * </p>
074 * <pre>
075 * &lt;module name="AvoidEscapedUnicodeCharacters"/&gt;
076 * </pre>
077 * <p>
078 * Examples of using Unicode:</p>
079 * <pre>
080 * String unitAbbrev = "μs";     // OK, perfectly clear even without a comment.
081 * String unitAbbrev = "&#92;u03bcs";// violation, the reader has no idea what this is.
082 * return '&#92;ufeff' + content;    // OK, an example of non-printable,
083 *                               // control characters (byte order mark).
084 * </pre>
085 * <p>
086 * An example of how to configure the check to allow using escapes
087 * for non-printable, control characters:
088 * </p>
089 * <pre>
090 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
091 *   &lt;property name="allowEscapesForControlCharacters" value="true"/&gt;
092 * &lt;/module&gt;
093 * </pre>
094 * <p>
095 * Example of using escapes for non-printable, control characters:
096 * </p>
097 * <pre>
098 * String unitAbbrev = "μs";      // OK, a normal String
099 * String unitAbbrev = "&#92;u03bcs"; // violation, "&#92;u03bcs" is a printable character.
100 * return '&#92;ufeff' + content;     // OK, non-printable control character.
101 * </pre>
102 * <p>
103 * An example of how to configure the check to allow using escapes
104 * if trail comment is present:
105 * </p>
106 * <pre>
107 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
108 *   &lt;property name="allowByTailComment" value="true"/&gt;
109 * &lt;/module&gt;
110 * </pre>
111 * <p>Example of using escapes if trail comment is present:
112 * </p>
113 * <pre>
114 * String unitAbbrev = "μs";      // OK, a normal String
115 * String unitAbbrev = "&#92;u03bcs"; // OK, Greek letter mu, "s"
116 * return '&#92;ufeff' + content;
117 * // -----^--------------------- violation, comment is not used within same line.
118 * </pre>
119 * <p>
120 * An example of how to configure the check to allow if
121 * all characters in literal are escaped.
122 * </p>
123 * <pre>
124 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
125 *   &lt;property name="allowIfAllCharactersEscaped" value="true"/&gt;
126 * &lt;/module&gt;
127 * </pre>
128 * <p>Example of using escapes if all characters in literal are escaped:</p>
129 * <pre>
130 * String unitAbbrev = "μs";      // OK, a normal String
131 * String unitAbbrev = "&#92;u03bcs"; // violation, not all characters are escaped ('s').
132 * String unitAbbrev = "&#92;u03bc&#92;u03bc&#92;u03bc"; // OK
133 * String unitAbbrev = "&#92;u03bc&#92;u03bcs";// violation, not all characters are escaped ('s').
134 * return '&#92;ufeff' + content;          // OK, all control characters are escaped
135 * </pre>
136 * <p>An example of how to configure the check to allow using escapes
137 * for non-printable whitespace characters:
138 * </p>
139 * <pre>
140 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
141 *   &lt;property name="allowNonPrintableEscapes" value="true"/&gt;
142 * &lt;/module&gt;
143 * </pre>
144 * <p>Example of using escapes for non-printable whitespace characters:</p>
145 * <pre>
146 * String unitAbbrev = "μs";       // OK, a normal String
147 * String unitAbbrev1 = "&#92;u03bcs"; // violation, printable escape character.
148 * String unitAbbrev2 = "&#92;u03bc&#92;u03bc&#92;u03bc"; // violation, printable escape character.
149 * String unitAbbrev3 = "&#92;u03bc&#92;u03bcs";// violation, printable escape character.
150 * return '&#92;ufeff' + content;           // OK, non-printable escape character.
151 * </pre>
152 * <p>
153 * Parent is {@code com.puppycrawl.tools.checkstyle.TreeWalker}
154 * </p>
155 * <p>
156 * Violation Message Keys:
157 * </p>
158 * <ul>
159 * <li>
160 * {@code forbid.escaped.unicode.char}
161 * </li>
162 * </ul>
163 *
164 * @since 5.8
165 */
166@FileStatefulCheck
167public class AvoidEscapedUnicodeCharactersCheck
168    extends AbstractCheck {
169
170    /**
171     * A key is pointing to the warning message text in "messages.properties"
172     * file.
173     */
174    public static final String MSG_KEY = "forbid.escaped.unicode.char";
175
176    /** Regular expression for Unicode chars. */
177    private static final Pattern UNICODE_REGEXP = Pattern.compile("\\\\u+[a-fA-F0-9]{4}");
178
179    /**
180     * Regular expression Unicode control characters.
181     *
182     * @see <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
183     *     Appendix:Control characters</a>
184     */
185    private static final Pattern UNICODE_CONTROL = Pattern.compile("\\\\u+"
186            + "(00[0-1][0-9A-Fa-f]"
187            + "|00[8-9][0-9A-Fa-f]"
188            + "|00[aA][dD]"
189            + "|034[fF]"
190            + "|070[fF]"
191            + "|180[eE]"
192            + "|200[b-fB-F]"
193            + "|202[a-eA-E]"
194            + "|206[0-4a-fA-F]"
195            + "|[fF]{3}[9a-bA-B]"
196            + "|[fF][eE][fF]{2})");
197
198    /**
199     * Regular expression for all escaped chars.
200     * See "EscapeSequence" at
201     * https://docs.oracle.com/javase/specs/jls/se15/html/jls-3.html#jls-3.10.7
202     */
203    private static final Pattern ALL_ESCAPED_CHARS = Pattern.compile("^("
204            + UNICODE_REGEXP.pattern()
205            + "|\""
206            + "|'"
207            + "|\\\\"
208            + "|\\\\b"
209            + "|\\\\f"
210            + "|\\\\n"
211            + "|\\R"
212            + "|\\\\r"
213            + "|\\\\s"
214            + "|\\\\t"
215            + ")+$");
216
217    /** Regular expression for escaped backslash. */
218    private static final Pattern ESCAPED_BACKSLASH = Pattern.compile("\\\\\\\\");
219
220    /** Regular expression for non-printable unicode chars. */
221    private static final Pattern NON_PRINTABLE_CHARS = Pattern.compile("\\\\u0000"
222            + "|\\\\u0009"
223            + "|\\\\u000[bB]"
224            + "|\\\\u000[cC]"
225            + "|\\\\u0020"
226            + "|\\\\u007[fF]"
227            + "|\\\\u0085"
228            + "|\\\\u009[fF]"
229            + "|\\\\u00[aA]0"
230            + "|\\\\u00[aA][dD]"
231            + "|\\\\u04[fF]9"
232            + "|\\\\u05[bB][eE]"
233            + "|\\\\u05[dD]0"
234            + "|\\\\u05[eE][aA]"
235            + "|\\\\u05[fF]3"
236            + "|\\\\u05[fF]4"
237            + "|\\\\u0600"
238            + "|\\\\u0604"
239            + "|\\\\u061[cC]"
240            + "|\\\\u06[dD]{2}"
241            + "|\\\\u06[fF]{2}"
242            + "|\\\\u070[fF]"
243            + "|\\\\u0750"
244            + "|\\\\u077[fF]"
245            + "|\\\\u0[eE]00"
246            + "|\\\\u0[eE]7[fF]"
247            + "|\\\\u1680"
248            + "|\\\\u180[eE]"
249            + "|\\\\u1[eE]00"
250            + "|\\\\u2000"
251            + "|\\\\u2001"
252            + "|\\\\u2002"
253            + "|\\\\u2003"
254            + "|\\\\u2004"
255            + "|\\\\u2005"
256            + "|\\\\u2006"
257            + "|\\\\u2007"
258            + "|\\\\u2008"
259            + "|\\\\u2009"
260            + "|\\\\u200[aA]"
261            + "|\\\\u200[fF]"
262            + "|\\\\u2025"
263            + "|\\\\u2028"
264            + "|\\\\u2029"
265            + "|\\\\u202[fF]"
266            + "|\\\\u205[fF]"
267            + "|\\\\u2064"
268            + "|\\\\u2066"
269            + "|\\\\u2067"
270            + "|\\\\u2068"
271            + "|\\\\u2069"
272            + "|\\\\u206[aA]"
273            + "|\\\\u206[fF]"
274            + "|\\\\u20[aA][fF]"
275            + "|\\\\u2100"
276            + "|\\\\u213[aA]"
277            + "|\\\\u3000"
278            + "|\\\\u[dD]800"
279            + "|\\\\u[fF]8[fF]{2}"
280            + "|\\\\u[fF][bB]50"
281            + "|\\\\u[fF][dD][fF]{2}"
282            + "|\\\\u[fF][eE]70"
283            + "|\\\\u[fF][eE][fF]{2}"
284            + "|\\\\u[fF]{2}0[eE]"
285            + "|\\\\u[fF]{2}61"
286            + "|\\\\u[fF]{2}[dD][cC]"
287            + "|\\\\u[fF]{3}9"
288            + "|\\\\u[fF]{3}[aA]"
289            + "|\\\\u[fF]{3}[bB]"
290            + "|\\\\u[fF]{4}");
291
292    /** Cpp style comments. */
293    private Map<Integer, TextBlock> singlelineComments;
294    /** C style comments. */
295    private Map<Integer, List<TextBlock>> blockComments;
296
297    /** Allow use escapes for non-printable, control characters. */
298    private boolean allowEscapesForControlCharacters;
299
300    /** Allow use escapes if trail comment is present. */
301    private boolean allowByTailComment;
302
303    /** Allow if all characters in literal are escaped. */
304    private boolean allowIfAllCharactersEscaped;
305
306    /** Allow use escapes for non-printable, whitespace characters. */
307    private boolean allowNonPrintableEscapes;
308
309    /**
310     * Setter to allow use escapes for non-printable, control characters.
311     *
312     * @param allow user's value.
313     */
314    public final void setAllowEscapesForControlCharacters(boolean allow) {
315        allowEscapesForControlCharacters = allow;
316    }
317
318    /**
319     * Setter to allow use escapes if trail comment is present.
320     *
321     * @param allow user's value.
322     */
323    public final void setAllowByTailComment(boolean allow) {
324        allowByTailComment = allow;
325    }
326
327    /**
328     * Setter to allow if all characters in literal are escaped.
329     *
330     * @param allow user's value.
331     */
332    public final void setAllowIfAllCharactersEscaped(boolean allow) {
333        allowIfAllCharactersEscaped = allow;
334    }
335
336    /**
337     * Setter to allow use escapes for non-printable, whitespace characters.
338     *
339     * @param allow user's value.
340     */
341    public final void setAllowNonPrintableEscapes(boolean allow) {
342        allowNonPrintableEscapes = allow;
343    }
344
345    @Override
346    public int[] getDefaultTokens() {
347        return getRequiredTokens();
348    }
349
350    @Override
351    public int[] getAcceptableTokens() {
352        return getRequiredTokens();
353    }
354
355    @Override
356    public int[] getRequiredTokens() {
357        return new int[] {
358            TokenTypes.STRING_LITERAL,
359            TokenTypes.CHAR_LITERAL,
360            TokenTypes.TEXT_BLOCK_CONTENT,
361        };
362    }
363
364    @Override
365    public void beginTree(DetailAST rootAST) {
366        singlelineComments = getFileContents().getSingleLineComments();
367        blockComments = getFileContents().getBlockComments();
368    }
369
370    @Override
371    public void visitToken(DetailAST ast) {
372        final String literal =
373            CheckUtil.stripIndentAndInitialNewLineFromTextBlock(ast.getText());
374
375        if (hasUnicodeChar(literal) && !(allowByTailComment && hasTrailComment(ast)
376                || isAllCharactersEscaped(literal)
377                || allowEscapesForControlCharacters
378                        && isOnlyUnicodeValidChars(literal, UNICODE_CONTROL)
379                || allowNonPrintableEscapes
380                        && isOnlyUnicodeValidChars(literal, NON_PRINTABLE_CHARS))) {
381            log(ast, MSG_KEY);
382        }
383    }
384
385    /**
386     * Checks if literal has Unicode chars.
387     *
388     * @param literal String literal.
389     * @return true if literal has Unicode chars.
390     */
391    private static boolean hasUnicodeChar(String literal) {
392        final String literalWithoutEscapedBackslashes =
393                ESCAPED_BACKSLASH.matcher(literal).replaceAll("");
394        return UNICODE_REGEXP.matcher(literalWithoutEscapedBackslashes).find();
395    }
396
397    /**
398     * Check if String literal contains Unicode control chars.
399     *
400     * @param literal String literal.
401     * @param pattern RegExp for valid characters.
402     * @return true, if String literal contains Unicode control chars.
403     */
404    private static boolean isOnlyUnicodeValidChars(String literal, Pattern pattern) {
405        final int unicodeMatchesCounter =
406                countMatches(UNICODE_REGEXP, literal);
407        final int unicodeValidMatchesCounter =
408                countMatches(pattern, literal);
409        return unicodeMatchesCounter - unicodeValidMatchesCounter == 0;
410    }
411
412    /**
413     * Check if trail comment is present after ast token.
414     *
415     * @param ast current token.
416     * @return true if trail comment is present after ast token.
417     */
418    private boolean hasTrailComment(DetailAST ast) {
419        int lineNo = ast.getLineNo();
420
421        // Since the trailing comment in the case of text blocks must follow the """ delimiter,
422        // we need to look for it after TEXT_BLOCK_LITERAL_END.
423        if (ast.getType() == TokenTypes.TEXT_BLOCK_CONTENT) {
424            lineNo = ast.getNextSibling().getLineNo();
425        }
426        boolean result = false;
427        if (singlelineComments.containsKey(lineNo)) {
428            result = true;
429        }
430        else {
431            final List<TextBlock> commentList = blockComments.get(lineNo);
432            if (commentList != null) {
433                final TextBlock comment = commentList.get(commentList.size() - 1);
434                final String line = getLines()[lineNo - 1];
435                result = isTrailingBlockComment(comment, line);
436            }
437        }
438        return result;
439    }
440
441    /**
442     * Whether the C style comment is trailing.
443     *
444     * @param comment the comment to check.
445     * @param line the line where the comment starts.
446     * @return true if the comment is trailing.
447     */
448    private static boolean isTrailingBlockComment(TextBlock comment, String line) {
449        return comment.getText().length != 1
450            || CommonUtil.isBlank(line.substring(comment.getEndColNo() + 1));
451    }
452
453    /**
454     * Count regexp matches into String literal.
455     *
456     * @param pattern pattern.
457     * @param target String literal.
458     * @return count of regexp matches.
459     */
460    private static int countMatches(Pattern pattern, String target) {
461        int matcherCounter = 0;
462        final Matcher matcher = pattern.matcher(target);
463        while (matcher.find()) {
464            matcherCounter++;
465        }
466        return matcherCounter;
467    }
468
469    /**
470     * Checks if all characters in String literal is escaped.
471     *
472     * @param literal current literal.
473     * @return true if all characters in String literal is escaped.
474     */
475    private boolean isAllCharactersEscaped(String literal) {
476        return allowIfAllCharactersEscaped
477                && ALL_ESCAPED_CHARS.matcher(literal).find();
478    }
479
480}