Source code

001///////////////////////////////////////////////////////////////////////////////////////////////
002// checkstyle: Checks Java source code and other text files for adherence to a set of rules.
003// Copyright (C) 2001-2026 the original author or authors.
004//
005// This library is free software; you can redistribute it and/or
006// modify it under the terms of the GNU Lesser General Public
007// License as published by the Free Software Foundation; either
008// version 2.1 of the License, or (at your option) any later version.
009//
010// This library is distributed in the hope that it will be useful,
011// but WITHOUT ANY WARRANTY; without even the implied warranty of
012// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
013// Lesser General Public License for more details.
014//
015// You should have received a copy of the GNU Lesser General Public
016// License along with this library; if not, write to the Free Software
017// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
018///////////////////////////////////////////////////////////////////////////////////////////////
019
020package com.puppycrawl.tools.checkstyle.checks.coding;
021
022import java.util.HashSet;
023import java.util.Objects;
024import java.util.Set;
025
026import com.puppycrawl.tools.checkstyle.StatelessCheck;
027import com.puppycrawl.tools.checkstyle.api.AbstractCheck;
028import com.puppycrawl.tools.checkstyle.api.DetailAST;
029import com.puppycrawl.tools.checkstyle.api.TokenTypes;
030import com.puppycrawl.tools.checkstyle.utils.CommonUtil;
031
032/**
033 * <div>
034 * Checks that specified symbols (by Unicode code points or ranges) are not used in code.
035 * By default, blocks common symbol ranges.
036 * </div>
037 *
038 * <p>
039 * Rationale: This check helps prevent emoji symbols and special characters in code
040 * (commonly added by AI tools), enforce coding standards, or forbid specific Unicode characters.
041 * </p>
042 *
043 * <p>
044 * Default ranges cover:
045 * </p>
046 * <ul>
047 * <li>U+2190–U+27BF: Arrows, Mathematical Operators, Box Drawing, Geometric Shapes,
048 * Miscellaneous Symbols, and Dingbats</li>
049 * <li>U+1F600–U+1F64F: Emoticons</li>
050 * <li>U+1F680–U+1F6FF: Transport and Map Symbols</li>
051 * <li>U+1F700–U+10FFFF: Alchemical Symbols and other pictographic symbols</li>
052 * </ul>
053 *
054 * <p>
055 * For a complete list of Unicode characters and ranges, see:
056 * <a href="https://en.wikipedia.org/wiki/List_of_Unicode_characters">
057 * List of Unicode characters</a>
058 * </p>
059 *
060 * <ul>
061 * <li>
062 * Property {@code symbolCodes} - Specify the symbols to check for, as Unicode code points
063 * or ranges. Format: comma-separated list of hex codes or ranges
064 * (e.g., {@code "0x2705, 0x1F600-0x1F64F"}). To allow only ASCII characters,
065 * use {@code "0x0080-0x10FFFF"}.
066 * Type is {@code java.lang.String}.
067 * Default value is {@code "0x2190-0x27BF, 0x1F600-0x1F64F, 0x1F680-0x1F6FF, 0x1F700-0x10FFFF"}.
068 * </li>
069 * </ul>
070 *
071 * @since 13.4.0
072 */
073@StatelessCheck
074public class IllegalSymbolCheck extends AbstractCheck {
075
076    /**
077     * A key is pointing to the warning message text in "messages.properties" file.
078     */
079    public static final String MSG_KEY = "illegal.symbol";
080
081    /** Separator used for defining ranges. */
082    private static final String RANGE_SEPARATOR = "-";
083
084    /** Default symbol codes to check for. */
085    private static final String DEFAULT_ILLEGAL_CODES =
086        "0x2190-0x27BF, 0x1F600-0x1F64F, 0x1F680-0x1F6FF, 0x1F700-0x10FFFF";
087
088    /** Set of individual Unicode code points to disallow. */
089    private final Set<Integer> singleCodePoints = new HashSet<>();
090
091    /** Set of Unicode ranges to disallow. */
092    private final Set<CodePointRange> codePointRanges = new HashSet<>();
093
094    /** Specify the symbols to check for, as Unicode code points or ranges. */
095    private String symbolCodes = DEFAULT_ILLEGAL_CODES;
096
097    /**
098     * Setter to specify the symbols to check for.
099     *
100     * @param symbols the symbols specification
101     * @throws IllegalArgumentException if the format is invalid
102     * @since 13.4.0
103     */
104    public void setSymbolCodes(String symbols) {
105        symbolCodes = Objects.requireNonNullElse(symbols, "");
106        parseSymbolCodes();
107    }
108
109    /**
110     * Initializes the check after all properties are set.
111     *
112     * <p>
113     * Ensures that the {@code symbolCodes} property is parsed
114     * for both default configuration and custom user configuration.
115     * </p>
116     *
117     * @throws IllegalArgumentException if the configured symbol format is invalid
118     */
119    @Override
120    public void init() {
121        parseSymbolCodes();
122    }
123
124    @Override
125    public int[] getDefaultTokens() {
126        return new int[] {
127            TokenTypes.COMMENT_CONTENT,
128        };
129    }
130
131    @Override
132    public int[] getAcceptableTokens() {
133        return new int[] {
134            TokenTypes.COMMENT_CONTENT,
135            TokenTypes.STRING_LITERAL,
136            TokenTypes.CHAR_LITERAL,
137            TokenTypes.TEXT_BLOCK_CONTENT,
138            TokenTypes.IDENT,
139        };
140    }
141
142    @Override
143    public int[] getRequiredTokens() {
144        return CommonUtil.EMPTY_INT_ARRAY;
145    }
146
147    @Override
148    public boolean isCommentNodesRequired() {
149        return true;
150    }
151
152    @Override
153    public void visitToken(DetailAST ast) {
154        ast.getText().codePoints()
155            .filter(this::isIllegalSymbol)
156            .findFirst()
157            .ifPresent(codePoint -> log(ast, MSG_KEY, Character.toString(codePoint)));
158    }
159
160    /**
161     * Parses the configured symbolCodes string into singleCodePoints and codePointRanges.
162     *
163     * @throws IllegalArgumentException if format is invalid
164     */
165    private void parseSymbolCodes() {
166        for (String part : symbolCodes.split(",", -1)) {
167            final String trimmed = part.trim();
168            if (!trimmed.isEmpty()) {
169                try {
170                    if (trimmed.contains(RANGE_SEPARATOR)) {
171                        parseRange(trimmed);
172                    }
173                    else {
174                        singleCodePoints.add(parseCodePoint(trimmed));
175                    }
176                }
177                catch (NumberFormatException exception) {
178                    throw new IllegalArgumentException(
179                            "Invalid symbol code format: " + trimmed, exception);
180                }
181            }
182        }
183    }
184
185    /**
186     * Determines whether a code point is illegal.
187     *
188     * @param codePoint Unicode code point
189     * @return true if illegal; false otherwise
190     */
191    private boolean isIllegalSymbol(int codePoint) {
192        boolean illegal = singleCodePoints.contains(codePoint);
193
194        for (CodePointRange range : codePointRanges) {
195            if (range.contains(codePoint)) {
196                illegal = true;
197                break;
198            }
199        }
200
201        return illegal;
202    }
203
204    /**
205     * Parses and stores a Unicode range.
206     *
207     * @param rangeStr range definition string (already trimmed by caller)
208     * @throws IllegalArgumentException if format is invalid
209     */
210    private void parseRange(String rangeStr) {
211        final String[] parts = rangeStr.split(RANGE_SEPARATOR, -1);
212        if (parts.length != 2
213                || CommonUtil.isBlank(parts[0])
214                || CommonUtil.isBlank(parts[1])) {
215            throw new IllegalArgumentException(
216                    "Invalid range format: " + rangeStr);
217        }
218
219        final int start = parseCodePoint(parts[0].trim());
220        final int end = parseCodePoint(parts[1].trim());
221
222        if (start > end) {
223            throw new IllegalArgumentException(
224                    "Range start must be <= end: " + rangeStr);
225        }
226
227        codePointRanges.add(new CodePointRange(start, end));
228    }
229
230    /**
231     * Parses a Unicode code point from a trimmed string.
232     * Supported formats: {@code 0x1234}, {@code \\u1234}, {@code U+1234}, or plain hex.
233     *
234     * @param str input string (already trimmed by caller)
235     * @return parsed code point
236     * @throws NumberFormatException if invalid format
237     */
238    private static int parseCodePoint(String str) {
239        final int hexRadix = 16;
240        final int result;
241
242        final boolean hasPrefix =
243                str.startsWith("\\u")
244                        || str.startsWith("0x")
245                        || str.startsWith("0X")
246                        || str.startsWith("U+")
247                        || str.startsWith("u+");
248
249        if (hasPrefix) {
250            result = Integer.parseInt(str.substring(2), hexRadix);
251        }
252        else {
253            result = Integer.parseInt(str, hexRadix);
254        }
255        return result;
256    }
257
258    /**
259     * Represents a Unicode code point range.
260     *
261     * @param start range start (inclusive)
262     * @param end range end (inclusive)
263     */
264    private record CodePointRange(int start, int end) {
265
266        /**
267         * Checks if code point is within range.
268         *
269         * @param codePoint code point to test
270         * @return true if within range; false otherwise
271         */
272        private boolean contains(int codePoint) {
273            return codePoint >= start && codePoint <= end;
274        }
275    }
276}