View Javadoc
1   ///////////////////////////////////////////////////////////////////////////////////////////////
2   // checkstyle: Checks Java source code and other text files for adherence to a set of rules.
3   // Copyright (C) 2001-2026 the original author or authors.
4   //
5   // This library is free software; you can redistribute it and/or
6   // modify it under the terms of the GNU Lesser General Public
7   // License as published by the Free Software Foundation; either
8   // version 2.1 of the License, or (at your option) any later version.
9   //
10  // This library is distributed in the hope that it will be useful,
11  // but WITHOUT ANY WARRANTY; without even the implied warranty of
12  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  // Lesser General Public License for more details.
14  //
15  // You should have received a copy of the GNU Lesser General Public
16  // License along with this library; if not, write to the Free Software
17  // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18  ///////////////////////////////////////////////////////////////////////////////////////////////
19  
20  package com.puppycrawl.tools.checkstyle.checks.coding;
21  
22  import java.util.HashSet;
23  import java.util.Objects;
24  import java.util.Set;
25  
26  import com.puppycrawl.tools.checkstyle.StatelessCheck;
27  import com.puppycrawl.tools.checkstyle.api.AbstractCheck;
28  import com.puppycrawl.tools.checkstyle.api.DetailAST;
29  import com.puppycrawl.tools.checkstyle.api.TokenTypes;
30  import com.puppycrawl.tools.checkstyle.utils.CommonUtil;
31  
32  /**
33   * <div>
34   * Checks that specified symbols (by Unicode code points or ranges) are not used in code.
35   * By default, blocks common symbol ranges.
36   * </div>
37   *
38   * <p>
39   * Rationale: This check helps prevent emoji symbols and special characters in code
40   * (commonly added by AI tools), enforce coding standards, or forbid specific Unicode characters.
41   * </p>
42   *
43   * <p>
44   * Default ranges cover:
45   * </p>
46   * <ul>
47   * <li>U+2190–U+27BF: Arrows, Mathematical Operators, Box Drawing, Geometric Shapes,
48   * Miscellaneous Symbols, and Dingbats</li>
49   * <li>U+1F600–U+1F64F: Emoticons</li>
50   * <li>U+1F680–U+1F6FF: Transport and Map Symbols</li>
51   * <li>U+1F700–U+10FFFF: Alchemical Symbols and other pictographic symbols</li>
52   * </ul>
53   *
54   * <p>
55   * For a complete list of Unicode characters and ranges, see:
56   * <a href="https://en.wikipedia.org/wiki/List_of_Unicode_characters">
57   * List of Unicode characters</a>
58   * </p>
59   *
60   * <ul>
61   * <li>
62   * Property {@code symbolCodes} - Specify the symbols to check for, as Unicode code points
63   * or ranges. Format: comma-separated list of hex codes or ranges
64   * (e.g., {@code "0x2705, 0x1F600-0x1F64F"}). To allow only ASCII characters,
65   * use {@code "0x0080-0x10FFFF"}.
66   * Type is {@code java.lang.String}.
67   * Default value is {@code "0x2190-0x27BF, 0x1F600-0x1F64F, 0x1F680-0x1F6FF, 0x1F700-0x10FFFF"}.
68   * </li>
69   * </ul>
70   *
71   * @since 13.4.0
72   */
73  @StatelessCheck
74  public class IllegalSymbolCheck extends AbstractCheck {
75  
76      /**
77       * A key is pointing to the warning message text in "messages.properties" file.
78       */
79      public static final String MSG_KEY = "illegal.symbol";
80  
81      /** Separator used for defining ranges. */
82      private static final String RANGE_SEPARATOR = "-";
83  
84      /** Default symbol codes to check for. */
85      private static final String DEFAULT_ILLEGAL_CODES =
86          "0x2190-0x27BF, 0x1F600-0x1F64F, 0x1F680-0x1F6FF, 0x1F700-0x10FFFF";
87  
88      /** Set of individual Unicode code points to disallow. */
89      private final Set<Integer> singleCodePoints = new HashSet<>();
90  
91      /** Set of Unicode ranges to disallow. */
92      private final Set<CodePointRange> codePointRanges = new HashSet<>();
93  
94      /** Specify the symbols to check for, as Unicode code points or ranges. */
95      private String symbolCodes = DEFAULT_ILLEGAL_CODES;
96  
97      /**
98       * Setter to specify the symbols to check for.
99       *
100      * @param symbols the symbols specification
101      * @throws IllegalArgumentException if the format is invalid
102      * @since 13.4.0
103      */
104     public void setSymbolCodes(String symbols) {
105         symbolCodes = Objects.requireNonNullElse(symbols, "");
106         parseSymbolCodes();
107     }
108 
109     /**
110      * Initializes the check after all properties are set.
111      *
112      * <p>
113      * Ensures that the {@code symbolCodes} property is parsed
114      * for both default configuration and custom user configuration.
115      * </p>
116      *
117      * @throws IllegalArgumentException if the configured symbol format is invalid
118      */
119     @Override
120     public void init() {
121         parseSymbolCodes();
122     }
123 
124     @Override
125     public int[] getDefaultTokens() {
126         return new int[] {
127             TokenTypes.COMMENT_CONTENT,
128         };
129     }
130 
131     @Override
132     public int[] getAcceptableTokens() {
133         return new int[] {
134             TokenTypes.COMMENT_CONTENT,
135             TokenTypes.STRING_LITERAL,
136             TokenTypes.CHAR_LITERAL,
137             TokenTypes.TEXT_BLOCK_CONTENT,
138             TokenTypes.IDENT,
139         };
140     }
141 
142     @Override
143     public int[] getRequiredTokens() {
144         return CommonUtil.EMPTY_INT_ARRAY;
145     }
146 
147     @Override
148     public boolean isCommentNodesRequired() {
149         return true;
150     }
151 
152     @Override
153     public void visitToken(DetailAST ast) {
154         ast.getText().codePoints()
155             .filter(this::isIllegalSymbol)
156             .findFirst()
157             .ifPresent(codePoint -> log(ast, MSG_KEY, Character.toString(codePoint)));
158     }
159 
160     /**
161      * Parses the configured symbolCodes string into singleCodePoints and codePointRanges.
162      *
163      * @throws IllegalArgumentException if format is invalid
164      */
165     private void parseSymbolCodes() {
166         for (String part : symbolCodes.split(",", -1)) {
167             final String trimmed = part.trim();
168             if (!trimmed.isEmpty()) {
169                 try {
170                     if (trimmed.contains(RANGE_SEPARATOR)) {
171                         parseRange(trimmed);
172                     }
173                     else {
174                         singleCodePoints.add(parseCodePoint(trimmed));
175                     }
176                 }
177                 catch (NumberFormatException exception) {
178                     throw new IllegalArgumentException(
179                             "Invalid symbol code format: " + trimmed, exception);
180                 }
181             }
182         }
183     }
184 
185     /**
186      * Determines whether a code point is illegal.
187      *
188      * @param codePoint Unicode code point
189      * @return true if illegal; false otherwise
190      */
191     private boolean isIllegalSymbol(int codePoint) {
192         boolean illegal = singleCodePoints.contains(codePoint);
193 
194         for (CodePointRange range : codePointRanges) {
195             if (range.contains(codePoint)) {
196                 illegal = true;
197                 break;
198             }
199         }
200 
201         return illegal;
202     }
203 
204     /**
205      * Parses and stores a Unicode range.
206      *
207      * @param rangeStr range definition string (already trimmed by caller)
208      * @throws IllegalArgumentException if format is invalid
209      */
210     private void parseRange(String rangeStr) {
211         final String[] parts = rangeStr.split(RANGE_SEPARATOR, -1);
212         if (parts.length != 2
213                 || CommonUtil.isBlank(parts[0])
214                 || CommonUtil.isBlank(parts[1])) {
215             throw new IllegalArgumentException(
216                     "Invalid range format: " + rangeStr);
217         }
218 
219         final int start = parseCodePoint(parts[0].trim());
220         final int end = parseCodePoint(parts[1].trim());
221 
222         if (start > end) {
223             throw new IllegalArgumentException(
224                     "Range start must be <= end: " + rangeStr);
225         }
226 
227         codePointRanges.add(new CodePointRange(start, end));
228     }
229 
230     /**
231      * Parses a Unicode code point from a trimmed string.
232      * Supported formats: {@code 0x1234}, {@code \\u1234}, {@code U+1234}, or plain hex.
233      *
234      * @param str input string (already trimmed by caller)
235      * @return parsed code point
236      * @throws NumberFormatException if invalid format
237      */
238     private static int parseCodePoint(String str) {
239         final int hexRadix = 16;
240         final int result;
241 
242         final boolean hasPrefix =
243                 str.startsWith("\\u")
244                         || str.startsWith("0x")
245                         || str.startsWith("0X")
246                         || str.startsWith("U+")
247                         || str.startsWith("u+");
248 
249         if (hasPrefix) {
250             result = Integer.parseInt(str.substring(2), hexRadix);
251         }
252         else {
253             result = Integer.parseInt(str, hexRadix);
254         }
255         return result;
256     }
257 
258     /**
259      * Represents a Unicode code point range.
260      *
261      * @param start range start (inclusive)
262      * @param end range end (inclusive)
263      */
264     private record CodePointRange(int start, int end) {
265 
266         /**
267          * Checks if code point is within range.
268          *
269          * @param codePoint code point to test
270          * @return true if within range; false otherwise
271          */
272         private boolean contains(int codePoint) {
273             return codePoint >= start && codePoint <= end;
274         }
275     }
276 }