1 ///////////////////////////////////////////////////////////////////////////////////////////////
2 // checkstyle: Checks Java source code and other text files for adherence to a set of rules.
3 // Copyright (C) 2001-2026 the original author or authors.
4 //
5 // This library is free software; you can redistribute it and/or
6 // modify it under the terms of the GNU Lesser General Public
7 // License as published by the Free Software Foundation; either
8 // version 2.1 of the License, or (at your option) any later version.
9 //
10 // This library is distributed in the hope that it will be useful,
11 // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 // Lesser General Public License for more details.
14 //
15 // You should have received a copy of the GNU Lesser General Public
16 // License along with this library; if not, write to the Free Software
17 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 ///////////////////////////////////////////////////////////////////////////////////////////////
19
20 package com.puppycrawl.tools.checkstyle.checks.coding;
21
22 import java.util.HashSet;
23 import java.util.Objects;
24 import java.util.Set;
25
26 import com.puppycrawl.tools.checkstyle.StatelessCheck;
27 import com.puppycrawl.tools.checkstyle.api.AbstractCheck;
28 import com.puppycrawl.tools.checkstyle.api.DetailAST;
29 import com.puppycrawl.tools.checkstyle.api.TokenTypes;
30 import com.puppycrawl.tools.checkstyle.utils.CommonUtil;
31
32 /**
33 * <div>
34 * Checks that specified symbols (by Unicode code points or ranges) are not used in code.
35 * By default, blocks common symbol ranges.
36 * </div>
37 *
38 * <p>
39 * Rationale: This check helps prevent emoji symbols and special characters in code
40 * (commonly added by AI tools), enforce coding standards, or forbid specific Unicode characters.
41 * </p>
42 *
43 * <p>
44 * Default ranges cover:
45 * </p>
46 * <ul>
47 * <li>U+2190–U+27BF: Arrows, Mathematical Operators, Box Drawing, Geometric Shapes,
48 * Miscellaneous Symbols, and Dingbats</li>
49 * <li>U+1F600–U+1F64F: Emoticons</li>
50 * <li>U+1F680–U+1F6FF: Transport and Map Symbols</li>
51 * <li>U+1F700–U+10FFFF: Alchemical Symbols and other pictographic symbols</li>
52 * </ul>
53 *
54 * <p>
55 * For a complete list of Unicode characters and ranges, see:
56 * <a href="https://en.wikipedia.org/wiki/List_of_Unicode_characters">
57 * List of Unicode characters</a>
58 * </p>
59 *
60 * <ul>
61 * <li>
62 * Property {@code symbolCodes} - Specify the symbols to check for, as Unicode code points
63 * or ranges. Format: comma-separated list of hex codes or ranges
64 * (e.g., {@code "0x2705, 0x1F600-0x1F64F"}). To allow only ASCII characters,
65 * use {@code "0x0080-0x10FFFF"}.
66 * Type is {@code java.lang.String}.
67 * Default value is {@code "0x2190-0x27BF, 0x1F600-0x1F64F, 0x1F680-0x1F6FF, 0x1F700-0x10FFFF"}.
68 * </li>
69 * </ul>
70 *
71 * @since 13.4.0
72 */
73 @StatelessCheck
74 public class IllegalSymbolCheck extends AbstractCheck {
75
76 /**
77 * A key is pointing to the warning message text in "messages.properties" file.
78 */
79 public static final String MSG_KEY = "illegal.symbol";
80
81 /** Separator used for defining ranges. */
82 private static final String RANGE_SEPARATOR = "-";
83
84 /** Default symbol codes to check for. */
85 private static final String DEFAULT_ILLEGAL_CODES =
86 "0x2190-0x27BF, 0x1F600-0x1F64F, 0x1F680-0x1F6FF, 0x1F700-0x10FFFF";
87
88 /** Set of individual Unicode code points to disallow. */
89 private final Set<Integer> singleCodePoints = new HashSet<>();
90
91 /** Set of Unicode ranges to disallow. */
92 private final Set<CodePointRange> codePointRanges = new HashSet<>();
93
94 /** Specify the symbols to check for, as Unicode code points or ranges. */
95 private String symbolCodes = DEFAULT_ILLEGAL_CODES;
96
97 /**
98 * Setter to specify the symbols to check for.
99 *
100 * @param symbols the symbols specification
101 * @throws IllegalArgumentException if the format is invalid
102 * @since 13.4.0
103 */
104 public void setSymbolCodes(String symbols) {
105 symbolCodes = Objects.requireNonNullElse(symbols, "");
106 parseSymbolCodes();
107 }
108
109 /**
110 * Initializes the check after all properties are set.
111 *
112 * <p>
113 * Ensures that the {@code symbolCodes} property is parsed
114 * for both default configuration and custom user configuration.
115 * </p>
116 *
117 * @throws IllegalArgumentException if the configured symbol format is invalid
118 */
119 @Override
120 public void init() {
121 parseSymbolCodes();
122 }
123
124 @Override
125 public int[] getDefaultTokens() {
126 return new int[] {
127 TokenTypes.COMMENT_CONTENT,
128 };
129 }
130
131 @Override
132 public int[] getAcceptableTokens() {
133 return new int[] {
134 TokenTypes.COMMENT_CONTENT,
135 TokenTypes.STRING_LITERAL,
136 TokenTypes.CHAR_LITERAL,
137 TokenTypes.TEXT_BLOCK_CONTENT,
138 TokenTypes.IDENT,
139 };
140 }
141
142 @Override
143 public int[] getRequiredTokens() {
144 return CommonUtil.EMPTY_INT_ARRAY;
145 }
146
147 @Override
148 public boolean isCommentNodesRequired() {
149 return true;
150 }
151
152 @Override
153 public void visitToken(DetailAST ast) {
154 ast.getText().codePoints()
155 .filter(this::isIllegalSymbol)
156 .findFirst()
157 .ifPresent(codePoint -> log(ast, MSG_KEY, Character.toString(codePoint)));
158 }
159
160 /**
161 * Parses the configured symbolCodes string into singleCodePoints and codePointRanges.
162 *
163 * @throws IllegalArgumentException if format is invalid
164 */
165 private void parseSymbolCodes() {
166 for (String part : symbolCodes.split(",", -1)) {
167 final String trimmed = part.trim();
168 if (!trimmed.isEmpty()) {
169 try {
170 if (trimmed.contains(RANGE_SEPARATOR)) {
171 parseRange(trimmed);
172 }
173 else {
174 singleCodePoints.add(parseCodePoint(trimmed));
175 }
176 }
177 catch (NumberFormatException exception) {
178 throw new IllegalArgumentException(
179 "Invalid symbol code format: " + trimmed, exception);
180 }
181 }
182 }
183 }
184
185 /**
186 * Determines whether a code point is illegal.
187 *
188 * @param codePoint Unicode code point
189 * @return true if illegal; false otherwise
190 */
191 private boolean isIllegalSymbol(int codePoint) {
192 boolean illegal = singleCodePoints.contains(codePoint);
193
194 for (CodePointRange range : codePointRanges) {
195 if (range.contains(codePoint)) {
196 illegal = true;
197 break;
198 }
199 }
200
201 return illegal;
202 }
203
204 /**
205 * Parses and stores a Unicode range.
206 *
207 * @param rangeStr range definition string (already trimmed by caller)
208 * @throws IllegalArgumentException if format is invalid
209 */
210 private void parseRange(String rangeStr) {
211 final String[] parts = rangeStr.split(RANGE_SEPARATOR, -1);
212 if (parts.length != 2
213 || CommonUtil.isBlank(parts[0])
214 || CommonUtil.isBlank(parts[1])) {
215 throw new IllegalArgumentException(
216 "Invalid range format: " + rangeStr);
217 }
218
219 final int start = parseCodePoint(parts[0].trim());
220 final int end = parseCodePoint(parts[1].trim());
221
222 if (start > end) {
223 throw new IllegalArgumentException(
224 "Range start must be <= end: " + rangeStr);
225 }
226
227 codePointRanges.add(new CodePointRange(start, end));
228 }
229
230 /**
231 * Parses a Unicode code point from a trimmed string.
232 * Supported formats: {@code 0x1234}, {@code \\u1234}, {@code U+1234}, or plain hex.
233 *
234 * @param str input string (already trimmed by caller)
235 * @return parsed code point
236 * @throws NumberFormatException if invalid format
237 */
238 private static int parseCodePoint(String str) {
239 final int hexRadix = 16;
240 final int result;
241
242 final boolean hasPrefix =
243 str.startsWith("\\u")
244 || str.startsWith("0x")
245 || str.startsWith("0X")
246 || str.startsWith("U+")
247 || str.startsWith("u+");
248
249 if (hasPrefix) {
250 result = Integer.parseInt(str.substring(2), hexRadix);
251 }
252 else {
253 result = Integer.parseInt(str, hexRadix);
254 }
255 return result;
256 }
257
258 /**
259 * Represents a Unicode code point range.
260 *
261 * @param start range start (inclusive)
262 * @param end range end (inclusive)
263 */
264 private record CodePointRange(int start, int end) {
265
266 /**
267 * Checks if code point is within range.
268 *
269 * @param codePoint code point to test
270 * @return true if within range; false otherwise
271 */
272 private boolean contains(int codePoint) {
273 return codePoint >= start && codePoint <= end;
274 }
275 }
276 }