///////////////////////////////////////////////////////////////////////////////////////////////
// checkstyle: Checks Java source code and other text files for adherence to a set of rules.
// Copyright (C) 2001-2026 the original author or authors.
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
///////////////////////////////////////////////////////////////////////////////////////////////

package com.puppycrawl.tools.checkstyle.checks;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.puppycrawl.tools.checkstyle.FileStatefulCheck;
import com.puppycrawl.tools.checkstyle.api.AbstractCheck;
import com.puppycrawl.tools.checkstyle.api.DetailAST;
import com.puppycrawl.tools.checkstyle.api.TokenTypes;
import com.puppycrawl.tools.checkstyle.utils.CheckUtil;

/**
 * <div>
 * Restricts using
 * <a href = "https://docs.oracle.com/javase/specs/jls/se11/html/jls-3.html#jls-3.3">
 * Unicode escapes</a>
 * (such as &#92;u221e).
 * It is possible to allow using escapes for
 * <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
 * non-printable, control characters</a>.
 * Also, this check can be configured to allow using escapes
 * if a trailing comment is present. An option also makes it possible to
 * allow using escapes if the literal contains only them.
 * </div>
 *
 * @since 5.8
 */
@FileStatefulCheck
public class AvoidEscapedUnicodeCharactersCheck
    extends AbstractCheck {

    /**
     * A key is pointing to the warning message text in "messages.properties"
     * file.
     */
    public static final String MSG_KEY = "forbid.escaped.unicode.char";

    /** Regular expression for Unicode chars. */
    private static final Pattern UNICODE_REGEXP = Pattern.compile("\\\\u+[a-fA-F\\d]{4}");

    /**
     * Regular expression for Unicode control characters.
     *
     * @see <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
     *     Appendix:Control characters</a>
     */
    private static final Pattern UNICODE_CONTROL = Pattern.compile("\\\\u+"
            + "(00[0-1][\\dA-Fa-f]"
            + "|00[8-9][\\dA-Fa-f]"
            + "|00[aA][dD]"
            + "|034[fF]"
            + "|070[fF]"
            + "|180[eE]"
            + "|200[b-fB-F]"
            + "|202[a-eA-E]"
            + "|206[0-4a-fA-F]"
            + "|[fF]{3}[9a-bA-B]"
            + "|[fF][eE][fF]{2})");

    /**
     * Regular expression for all escaped chars.
     * See <a href="https://docs.oracle.com/javase/specs/jls/se15/html/jls-3.html#jls-3.10.7">
     * EscapeSequence</a>
     */
    private static final Pattern ALL_ESCAPED_CHARS = Pattern.compile("^("
            + UNICODE_REGEXP.pattern()
            + "|\""
            + "|'"
            + "|\\\\"
            + "|\\\\b"
            + "|\\\\f"
            + "|\\\\n"
            + "|\\R"
            + "|\\\\r"
            + "|\\\\s"
            + "|\\\\t"
            + ")+$");

    /** Regular expression for escaped backslash. */
    private static final Pattern ESCAPED_BACKSLASH = Pattern.compile("\\\\\\\\");

    /**
     * Regular expression for non-printable unicode chars.
     * Note: each alternative accepts exactly one 'u' after the backslash, whereas
     * {@link #UNICODE_REGEXP} and {@link #UNICODE_CONTROL} accept one or more.
     */
    private static final Pattern NON_PRINTABLE_CHARS = Pattern.compile("\\\\u0000"
            + "|\\\\u0009"
            + "|\\\\u000[bB]"
            + "|\\\\u000[cC]"
            + "|\\\\u0020"
            + "|\\\\u007[fF]"
            + "|\\\\u0085"
            + "|\\\\u009[fF]"
            + "|\\\\u00[aA]0"
            + "|\\\\u00[aA][dD]"
            + "|\\\\u04[fF]9"
            + "|\\\\u05[bB][eE]"
            + "|\\\\u05[dD]0"
            + "|\\\\u05[eE][aA]"
            + "|\\\\u05[fF]3"
            + "|\\\\u05[fF]4"
            + "|\\\\u0600"
            + "|\\\\u0604"
            + "|\\\\u061[cC]"
            + "|\\\\u06[dD]{2}"
            + "|\\\\u06[fF]{2}"
            + "|\\\\u070[fF]"
            + "|\\\\u0750"
            + "|\\\\u077[fF]"
            + "|\\\\u0[eE]00"
            + "|\\\\u0[eE]7[fF]"
            + "|\\\\u1680"
            + "|\\\\u180[eE]"
            + "|\\\\u1[eE]00"
            + "|\\\\u2000"
            + "|\\\\u2001"
            + "|\\\\u2002"
            + "|\\\\u2003"
            + "|\\\\u2004"
            + "|\\\\u2005"
            + "|\\\\u2006"
            + "|\\\\u2007"
            + "|\\\\u2008"
            + "|\\\\u2009"
            + "|\\\\u200[aA]"
            + "|\\\\u200[fF]"
            + "|\\\\u2025"
            + "|\\\\u2028"
            + "|\\\\u2029"
            + "|\\\\u202[fF]"
            + "|\\\\u205[fF]"
            + "|\\\\u2064"
            + "|\\\\u2066"
            + "|\\\\u2067"
            + "|\\\\u2068"
            + "|\\\\u2069"
            + "|\\\\u206[aA]"
            + "|\\\\u206[fF]"
            + "|\\\\u20[aA][fF]"
            + "|\\\\u2100"
            + "|\\\\u213[aA]"
            + "|\\\\u3000"
            + "|\\\\u[dD]800"
            + "|\\\\u[fF]8[fF]{2}"
            + "|\\\\u[fF][bB]50"
            + "|\\\\u[fF][dD][fF]{2}"
            + "|\\\\u[fF][eE]70"
            + "|\\\\u[fF][eE][fF]{2}"
            + "|\\\\u[fF]{2}0[eE]"
            + "|\\\\u[fF]{2}61"
            + "|\\\\u[fF]{2}[dD][cC]"
            + "|\\\\u[fF]{3}9"
            + "|\\\\u[fF]{3}[aA]"
            + "|\\\\u[fF]{3}[bB]"
            + "|\\\\u[fF]{4}");

    /**
     * Map of pending violations.
     * Key: line number of the violation.
     * Value: list of literal AST nodes on that line pending validation.
     */
    private final Map<Integer, List<DetailAST>> pendingViolations = new HashMap<>();

    /** Allow use escapes for non-printable, control characters. */
    private boolean allowEscapesForControlCharacters;

    /** Allow use escapes if trail comment is present. */
    private boolean allowByTailComment;

    /** Allow if all characters in literal are escaped. */
    private boolean allowIfAllCharactersEscaped;

    /** Allow use escapes for non-printable, whitespace characters. */
    private boolean allowNonPrintableEscapes;

    /**
     * Setter to allow use escapes for non-printable, control characters.
     *
     * @param allow user's value.
     * @since 5.8
     */
    public final void setAllowEscapesForControlCharacters(boolean allow) {
        allowEscapesForControlCharacters = allow;
    }

    /**
     * Setter to allow use escapes if trail comment is present.
     *
     * @param allow user's value.
     * @since 5.8
     */
    public final void setAllowByTailComment(boolean allow) {
        allowByTailComment = allow;
    }

    /**
     * Setter to allow if all characters in literal are escaped.
     *
     * @param allow user's value.
     * @since 5.8
     */
    public final void setAllowIfAllCharactersEscaped(boolean allow) {
        allowIfAllCharactersEscaped = allow;
    }

    /**
     * Setter to allow use escapes for non-printable, whitespace characters.
     *
     * @param allow user's value.
     * @since 5.8
     */
    public final void setAllowNonPrintableEscapes(boolean allow) {
        allowNonPrintableEscapes = allow;
    }

    @Override
    public int[] getDefaultTokens() {
        return getRequiredTokens();
    }

    @Override
    public int[] getAcceptableTokens() {
        return getRequiredTokens();
    }

    @Override
    public int[] getRequiredTokens() {
        return new int[] {
            TokenTypes.STRING_LITERAL,
            TokenTypes.CHAR_LITERAL,
            TokenTypes.TEXT_BLOCK_CONTENT,
            TokenTypes.SINGLE_LINE_COMMENT,
            TokenTypes.BLOCK_COMMENT_BEGIN,
        };
    }

    @Override
    public boolean isCommentNodesRequired() {
        return true;
    }

    @Override
    public void beginTree(DetailAST rootAST) {
        pendingViolations.clear();
    }

    @Override
    public void visitToken(DetailAST ast) {
        if (ast.getType() == TokenTypes.SINGLE_LINE_COMMENT
                || ast.getType() == TokenTypes.BLOCK_COMMENT_BEGIN) {
            checkComment(ast);
        }
        else {
            checkLiteral(ast);
        }
    }

    /**
     * Logs every violation that is still pending, i.e. that was not cleared by a
     * trailing comment on its line, and then empties the pending map so that the
     * stored AST nodes are not retained after the tree has been processed.
     *
     * @param rootAST the root of the tree.
     */
    @Override
    public void finishTree(DetailAST rootAST) {
        for (List<DetailAST> asts : pendingViolations.values()) {
            for (DetailAST ast : asts) {
                log(ast, MSG_KEY);
            }
        }
        pendingViolations.clear();
    }

    /**
     * Checks if the literal has Unicode char and should be reported.
     * When {@code allowByTailComment} is enabled the violation is stored in
     * {@code pendingViolations} instead of being logged immediately, so that a
     * trailing comment visited later can still clear it.
     *
     * @param ast literal token.
     */
    private void checkLiteral(DetailAST ast) {
        final String literal =
            CheckUtil.stripIndentAndInitialNewLineFromTextBlock(ast.getText());

        if (hasUnicodeChar(literal) && !(isAllCharactersEscaped(literal)
                || allowEscapesForControlCharacters
                        && isOnlyUnicodeValidChars(literal, UNICODE_CONTROL)
                || allowNonPrintableEscapes
                        && isOnlyUnicodeValidChars(literal, NON_PRINTABLE_CHARS))) {

            if (allowByTailComment) {
                int lineNo = ast.getLineNo();
                if (ast.getType() == TokenTypes.TEXT_BLOCK_CONTENT) {
                    // For text block content the violation is keyed by the line of
                    // the node that follows it (presumably the closing delimiter),
                    // since that is the line a trailing comment would share.
                    lineNo = ast.getNextSibling().getLineNo();
                }
                pendingViolations.computeIfAbsent(lineNo, key -> new ArrayList<>()).add(ast);
            }
            else {
                log(ast, MSG_KEY);
            }
        }
    }

    /**
     * Clears the violations pending on the comment's line if the comment is trailing.
     *
     * @param comment comment token.
     */
    private void checkComment(DetailAST comment) {
        if (isTrailingComment(comment)) {
            pendingViolations.remove(comment.getLineNo());
        }
    }

    /**
     * Checks if a comment is trailing, that is, it has no next sibling
     * that starts on the same line.
     *
     * @param commentNode the comment AST node
     * @return true if it is trailing
     */
    private static boolean isTrailingComment(DetailAST commentNode) {
        final DetailAST nextSibling = commentNode.getNextSibling();
        return nextSibling == null || nextSibling.getLineNo() != commentNode.getLineNo();
    }

    /**
     * Checks if literal has Unicode chars.
     *
     * @param literal String literal.
     * @return true if literal has Unicode chars.
     */
    private static boolean hasUnicodeChar(String literal) {
        return UNICODE_REGEXP.matcher(removeEscapedBackslashes(literal)).find();
    }

    /**
     * Checks if every Unicode escape in the literal matches the given pattern.
     * Escaped backslashes are removed before counting, consistently with
     * {@link #hasUnicodeChar(String)}: a backslash that is itself escaped cannot
     * start a Unicode escape, so text following it must not be counted as one.
     *
     * @param literal String literal.
     * @param pattern RegExp for valid characters.
     * @return true, if the number of Unicode escapes equals the number of
     *     escapes matched by {@code pattern}.
     */
    private static boolean isOnlyUnicodeValidChars(String literal, Pattern pattern) {
        final String literalWithoutEscapedBackslashes = removeEscapedBackslashes(literal);
        final int unicodeMatchesCounter =
                countMatches(UNICODE_REGEXP, literalWithoutEscapedBackslashes);
        final int unicodeValidMatchesCounter =
                countMatches(pattern, literalWithoutEscapedBackslashes);
        return unicodeMatchesCounter - unicodeValidMatchesCounter == 0;
    }

    /**
     * Removes every escaped backslash (a pair of backslashes) from the literal.
     *
     * @param literal String literal.
     * @return the literal without escaped backslashes.
     */
    private static String removeEscapedBackslashes(String literal) {
        return ESCAPED_BACKSLASH.matcher(literal).replaceAll("");
    }

    /**
     * Counts regexp matches in a String literal.
     *
     * @param pattern pattern.
     * @param target String literal.
     * @return count of regexp matches.
     */
    private static int countMatches(Pattern pattern, String target) {
        int matcherCounter = 0;
        final Matcher matcher = pattern.matcher(target);
        while (matcher.find()) {
            matcherCounter++;
        }
        return matcherCounter;
    }

    /**
     * Checks if all characters in the String literal are escaped.
     * Always returns false when {@code allowIfAllCharactersEscaped} is disabled.
     *
     * @param literal current literal.
     * @return true if the option is enabled and all characters in the String
     *     literal are escaped.
     */
    private boolean isAllCharactersEscaped(String literal) {
        return allowIfAllCharactersEscaped
                && ALL_ESCAPED_CHARS.matcher(literal).find();
    }

}