///////////////////////////////////////////////////////////////////////////////////////////////
// checkstyle: Checks Java source code and other text files for adherence to a set of rules.
// Copyright (C) 2001-2026 the original author or authors.
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
///////////////////////////////////////////////////////////////////////////////////////////////

package com.puppycrawl.tools.checkstyle.checks.coding;

import java.util.HashSet;
import java.util.Objects;
import java.util.Set;

import com.puppycrawl.tools.checkstyle.StatelessCheck;
import com.puppycrawl.tools.checkstyle.api.AbstractCheck;
import com.puppycrawl.tools.checkstyle.api.DetailAST;
import com.puppycrawl.tools.checkstyle.api.TokenTypes;
import com.puppycrawl.tools.checkstyle.utils.CommonUtil;

/**
 * <div>
 * Checks that specified symbols (by Unicode code points or ranges) are not used in code.
 * By default, blocks common symbol ranges.
 * </div>
 *
 * <p>
 * Rationale: This check helps prevent emoji symbols and special characters in code
 * (commonly added by AI tools), enforce coding standards, or forbid specific Unicode characters.
 * </p>
 *
 * <p>
 * Default ranges cover:
 * </p>
 * <ul>
 * <li>U+2190–U+27BF: Arrows, Mathematical Operators, Box Drawing, Geometric Shapes,
 * Miscellaneous Symbols, and Dingbats</li>
 * <li>U+1F600–U+1F64F: Emoticons</li>
 * <li>U+1F680–U+1F6FF: Transport and Map Symbols</li>
 * <li>U+1F700–U+10FFFF: Alchemical Symbols and other pictographic symbols</li>
 * </ul>
 *
 * <p>
 * For a complete list of Unicode characters and ranges, see:
 * <a href="https://en.wikipedia.org/wiki/List_of_Unicode_characters">
 * List of Unicode characters</a>
 * </p>
 *
 * <ul>
 * <li>
 * Property {@code symbolCodes} - Specify the symbols to check for, as Unicode code points
 * or ranges. Format: comma-separated list of hex codes or ranges
 * (e.g., {@code "0x2705, 0x1F600-0x1F64F"}). To allow only ASCII characters,
 * use {@code "0x0080-0x10FFFF"}.
 * Type is {@code java.lang.String}.
 * Default value is {@code "0x2190-0x27BF, 0x1F600-0x1F64F, 0x1F680-0x1F6FF, 0x1F700-0x10FFFF"}.
 * </li>
 * </ul>
 *
 * @since 13.4.0
 */
@StatelessCheck
public class IllegalSymbolCheck extends AbstractCheck {

    /**
     * A key is pointing to the warning message text in "messages.properties" file.
     */
    public static final String MSG_KEY = "illegal.symbol";

    /** Separator used for defining ranges. */
    private static final String RANGE_SEPARATOR = "-";

    /** Default symbol codes to check for. */
    private static final String DEFAULT_ILLEGAL_CODES =
        "0x2190-0x27BF, 0x1F600-0x1F64F, 0x1F680-0x1F6FF, 0x1F700-0x10FFFF";

    /** Set of individual Unicode code points to disallow. */
    private final Set<Integer> singleCodePoints = new HashSet<>();

    /** Set of Unicode ranges to disallow. */
    private final Set<CodePointRange> codePointRanges = new HashSet<>();

    /** Specify the symbols to check for, as Unicode code points or ranges. */
    private String symbolCodes = DEFAULT_ILLEGAL_CODES;

    /**
     * Setter to specify the symbols to check for.
     *
     * <p>
     * The new specification replaces any previously configured symbols. If the value
     * is malformed, the previously configured symbols are left untouched.
     * A {@code null} value is treated as an empty specification.
     * </p>
     *
     * @param symbols the symbols specification
     * @throws IllegalArgumentException if the format is invalid
     * @since 13.4.0
     */
    public void setSymbolCodes(String symbols) {
        final String specification = Objects.requireNonNullElse(symbols, "");
        // Parse before storing, so a malformed value cannot leave the raw property
        // and the parsed sets describing two different configurations.
        parseSymbolCodes(specification);
        symbolCodes = specification;
    }

    /**
     * Initializes the check after all properties are set.
     *
     * <p>
     * Ensures that the {@code symbolCodes} property is parsed
     * for both default configuration and custom user configuration.
     * Parsing replaces the stored code points and ranges, so repeating it is harmless.
     * </p>
     *
     * @throws IllegalArgumentException if the configured symbol format is invalid
     */
    @Override
    public void init() {
        parseSymbolCodes(symbolCodes);
    }

    @Override
    public int[] getDefaultTokens() {
        return new int[] {
            TokenTypes.COMMENT_CONTENT,
        };
    }

    @Override
    public int[] getAcceptableTokens() {
        return new int[] {
            TokenTypes.COMMENT_CONTENT,
            TokenTypes.STRING_LITERAL,
            TokenTypes.CHAR_LITERAL,
            TokenTypes.TEXT_BLOCK_CONTENT,
            TokenTypes.IDENT,
        };
    }

    @Override
    public int[] getRequiredTokens() {
        return CommonUtil.EMPTY_INT_ARRAY;
    }

    @Override
    public boolean isCommentNodesRequired() {
        return true;
    }

    @Override
    public void visitToken(DetailAST ast) {
        // Only the first illegal symbol of a token is reported, giving at most
        // one violation per token no matter how many illegal symbols it holds.
        ast.getText().codePoints()
            .filter(this::isIllegalSymbol)
            .findFirst()
            .ifPresent(codePoint -> log(ast, MSG_KEY, Character.toString(codePoint)));
    }

    /**
     * Parses a symbol specification and replaces the contents of
     * {@code singleCodePoints} and {@code codePointRanges} with the result.
     *
     * <p>
     * Entries are separated by commas; blank entries are skipped. Parsing is done into
     * local sets that are swapped in only when the whole specification is valid, so a
     * failure never leaves the check partially configured, and a repeated call never
     * accumulates symbols from an earlier specification.
     * </p>
     *
     * @param specification comma-separated list of code points and ranges
     * @throws IllegalArgumentException if format is invalid
     */
    private void parseSymbolCodes(String specification) {
        final Set<Integer> parsedCodePoints = new HashSet<>();
        final Set<CodePointRange> parsedRanges = new HashSet<>();

        for (String part : specification.split(",", -1)) {
            final String trimmed = part.trim();
            if (!trimmed.isEmpty()) {
                try {
                    if (trimmed.contains(RANGE_SEPARATOR)) {
                        parsedRanges.add(parseRange(trimmed));
                    }
                    else {
                        parsedCodePoints.add(parseCodePoint(trimmed));
                    }
                }
                catch (NumberFormatException exception) {
                    throw new IllegalArgumentException(
                        "Invalid symbol code format: " + trimmed, exception);
                }
            }
        }

        // Reached only when every entry parsed successfully.
        singleCodePoints.clear();
        singleCodePoints.addAll(parsedCodePoints);
        codePointRanges.clear();
        codePointRanges.addAll(parsedRanges);
    }

    /**
     * Determines whether a code point is illegal, i.e. it is either listed
     * individually or falls inside one of the configured ranges.
     *
     * @param codePoint Unicode code point
     * @return true if illegal; false otherwise
     */
    private boolean isIllegalSymbol(int codePoint) {
        return singleCodePoints.contains(codePoint)
            || codePointRanges.stream().anyMatch(range -> range.contains(codePoint));
    }

    /**
     * Parses a Unicode range of the form {@code start-end}.
     *
     * @param rangeStr range definition string (already trimmed by caller)
     * @return the parsed range, with both bounds inclusive
     * @throws IllegalArgumentException if the string does not consist of exactly two
     *     non-blank parts, or if the start is greater than the end
     * @throws NumberFormatException if either bound is not a valid hexadecimal code
     */
    private static CodePointRange parseRange(String rangeStr) {
        // Limit -1 keeps trailing empty parts, so input such as "0x41-" is rejected
        // below instead of being silently read as a one-element split.
        final String[] parts = rangeStr.split(RANGE_SEPARATOR, -1);
        if (parts.length != 2
                || CommonUtil.isBlank(parts[0])
                || CommonUtil.isBlank(parts[1])) {
            throw new IllegalArgumentException(
                "Invalid range format: " + rangeStr);
        }

        final int start = parseCodePoint(parts[0].trim());
        final int end = parseCodePoint(parts[1].trim());

        if (start > end) {
            throw new IllegalArgumentException(
                "Range start must be <= end: " + rangeStr);
        }

        return new CodePointRange(start, end);
    }

    /**
     * Parses a Unicode code point from a trimmed string.
     * Supported formats: {@code 0x1234}, {@code \\u1234}, {@code U+1234}, or plain hex.
     *
     * @param str input string (already trimmed by caller)
     * @return parsed code point
     * @throws NumberFormatException if invalid format
     */
    private static int parseCodePoint(String str) {
        final int hexRadix = 16;
        final int result;

        // Every recognised prefix is exactly two characters long.
        final boolean hasPrefix =
            str.startsWith("\\u")
                || str.startsWith("0x")
                || str.startsWith("0X")
                || str.startsWith("U+")
                || str.startsWith("u+");

        if (hasPrefix) {
            result = Integer.parseInt(str.substring(2), hexRadix);
        }
        else {
            result = Integer.parseInt(str, hexRadix);
        }
        return result;
    }

    /**
     * Represents a Unicode code point range.
     *
     * @param start range start (inclusive)
     * @param end range end (inclusive)
     */
    private record CodePointRange(int start, int end) {

        /**
         * Checks if code point is within range.
         *
         * @param codePoint code point to test
         * @return true if within range; false otherwise
         */
        private boolean contains(int codePoint) {
            return codePoint >= start && codePoint <= end;
        }
    }
}