Source code

001///////////////////////////////////////////////////////////////////////////////////////////////
002// checkstyle: Checks Java source code and other text files for adherence to a set of rules.
003// Copyright (C) 2001-2026 the original author or authors.
004//
005// This library is free software; you can redistribute it and/or
006// modify it under the terms of the GNU Lesser General Public
007// License as published by the Free Software Foundation; either
008// version 2.1 of the License, or (at your option) any later version.
009//
010// This library is distributed in the hope that it will be useful,
011// but WITHOUT ANY WARRANTY; without even the implied warranty of
012// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
013// Lesser General Public License for more details.
014//
015// You should have received a copy of the GNU Lesser General Public
016// License along with this library; if not, write to the Free Software
017// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
018///////////////////////////////////////////////////////////////////////////////////////////////
019
020package com.puppycrawl.tools.checkstyle.site;
021
022import java.io.File;
023import java.io.IOException;
024import java.io.PrintWriter;
025import java.nio.charset.StandardCharsets;
026import java.nio.file.Files;
027import java.nio.file.Path;
028import java.util.ArrayList;
029import java.util.Arrays;
030import java.util.HashSet;
031import java.util.LinkedHashMap;
032import java.util.LinkedHashSet;
033import java.util.List;
034import java.util.Locale;
035import java.util.Map;
036import java.util.Set;
037import java.util.regex.Matcher;
038import java.util.regex.Pattern;
039import java.util.stream.Collectors;
040
041import javax.xml.parsers.DocumentBuilder;
042import javax.xml.parsers.DocumentBuilderFactory;
043import javax.xml.parsers.ParserConfigurationException;
044
045import org.w3c.dom.Document;
046import org.w3c.dom.Element;
047import org.w3c.dom.NodeList;
048import org.xml.sax.SAXException;
049
050/**
051 * Generates {@code search-index.json} from the Checkstyle XDoc source files.
052 *
053 * <p>This is a plain Java {@code main()} class - no Maven plugin API required.
054 * It is invoked by {@code exec-maven-plugin} during the {@code process-classes}
055 * phase so the index is ready when Maven Site copies static resources.</p>
056 *
057 * <p>Output is written as a JSON file. The search widget fetches this file
058 * using the fetch API and parses it to populate the search index.</p>
059 *
060 * <h2>Key design decisions</h2>
061 * <ul>
062 *   <li><b>No duplicates.</b> Only plain {@code .xml} files are processed for
063 *       check/filter/filefilter directories. The {@code .xml.template} and
064 *       {@code .xml.vm} siblings are pre-render source files that would produce
065 *       identical URLs and duplicate entries. A secondary URL-keyed dedup guard
066 *       is also applied across the entire output list.</li>
067 *
068 *   <li><b>Identifiable example titles.</b> Both {@code -config} and
069 *       {@code -code} example paragraphs are indexed.  Their titles use the
070 *       pattern {@code "<CheckName>: Example1 [config]"} and
071 *       {@code "<CheckName>: Example1 [code]"} so users can distinguish a
072 *       configuration snippet from its matching Java code example in search
073 *       results.</li>
074 *
075 *   <li><b>Full general-page indexing.</b> Each meaningful {@code <section>}
076 *       in general documentation pages (e.g. {@code config_system_properties},
077 *       {@code writingchecks}, {@code cmdline}) is indexed as its own entry
078 *       with the full section text used for keyword extraction - not just the
079 *       first sentence. This makes page-internal headings discoverable.</li>
080 *
081 *   <li><b>Disambiguated generic titles.</b> Structural section names that are
082 *       repeated across many pages (e.g. "Overview", "Debug", "Contributing")
083 *       are prefixed with the page title, yielding e.g.
084 *       "Eclipse IDE: Debug" instead of a bare "Debug" that collides with
085 *       "IntelliJ IDE: Debug".</li>
086 *
087 *   <li><b>Junk pages excluded.</b> Release notes, auto-generated style
088 *       coverage reports and bare category aggregator stubs are skipped.</li>
089 * </ul>
090 *
091 * <p>Usage (called by exec-maven-plugin in pom.xml):</p>
092 * <pre>
093 *   java SearchIndexGenerator &lt;xdocsDir&gt; &lt;outputFilePath&gt;
094 *   java SearchIndexGenerator src/site/xdoc target/site/search-index.json
095 * </pre>
096 */
097public final class SearchIndexGenerator {
098
099    /** String literal for checks directory. */
100    private static final String CHECKS = "checks";
101
102    /** String literal for comma. */
103    private static final String COMMA_STR = ",";
104
105    /** String literal for space. */
106    private static final String SPACE = " ";
107
108    /** Character literal for space. */
109    private static final char SPACE_CHAR = ' ';
110
111    /** String literal for colon separator used in disambiguated titles. */
112    private static final String TITLE_SEPARATOR = ": ";
113
114    /** String literal for ellipsis. */
115    private static final String ELLIPSIS = "...";
116
117    /** String literal for external general entities feature. */
118    private static final String EXTERNAL_GENERAL_ENTITIES =
119            "http://xml.org/sax/features/external-general-entities";
120
121    /** String literal for external parameter entities feature. */
122    private static final String EXTERNAL_PARAMETER_ENTITIES =
123            "http://xml.org/sax/features/external-parameter-entities";
124
125    /** String literal for General category. */
126    private static final String GENERAL = "General";
127
128    /** String literal for Example document type. */
129    private static final String EXAMPLE_TYPE = "Example";
130
131    /** String literal for Property document type. */
132    private static final String PROPERTY_TYPE = "Property";
133
134    /** String literal for subsection element. */
135    private static final String SUBSECTION = "subsection";
136
137    /** String literal for name attribute. */
138    private static final String NAME_ATTR = "name";
139
140    /** String literal for id attribute. */
141    private static final String ID_ATTR = "id";
142
143    /** String literal for index.xml. */
144    private static final String INDEX_XML = "index.xml";
145
146    /** Constant for the filters directory. */
147    private static final String FILTERS_DIR = "filters";
148
149    /** Constant for the filefilters directory. */
150    private static final String FILEFILTERS_DIR = "filefilters";
151
152    /** Constant for the index file name. */
153    private static final String INDEX_HTML = "index.html";
154
155    /** String literal for Content. */
156    private static final String CONTENT = "Content";
157
158    /** String literal for the Examples subsection name. */
159    private static final String EXAMPLES_SUBSECTION = "examples";
160
161    /** String literal for body element. */
162    private static final String BODY = "body";
163
164    /** String literal for section element. */
165    private static final String SECTION = "section";
166
167    /** String literal for title element. */
168    private static final String TITLE = "title";
169
170    /** String literal for description element. */
171    private static final String DESCRIPTION = "description";
172
173    /** String literal for anchor separator. */
174    private static final String ANCHOR_SEPARATOR = "#";
175
176    /** String literal for path separator in URLs. */
177    private static final String PATH_SEPARATOR = "/";
178
179    /** String literal for the Properties subsection name fragment. */
180    private static final String PROPERTIES_FRAGMENT = "propert";
181
182    /** Exception message prefix used when an XDoc file fails to parse. */
183    private static final String PARSE_FAILURE_MSG = "Failed to parse XDoc file: ";
184
185    /**
186     * Suffix label appended to example titles for configuration snippets.
187     * Yields e.g. "AnnotationLocation: Example1 [config]".
188     */
189    private static final String EXAMPLE_LABEL_CONFIG = " [config]";
190
191    /**
192     * Suffix label appended to example titles for Java code examples.
193     * Yields e.g. "AnnotationLocation: Example1 [code]".
194     */
195    private static final String EXAMPLE_LABEL_CODE = " [code]";
196
197    /** Magic number for minimum word length. */
198    private static final int MIN_WORD_LENGTH = 2;
199
200    /** Magic number for maximum keywords. */
201    private static final int MAX_KEYWORDS = 15;
202
203    /** Magic number for maximum description length. */
204    private static final int MAX_DESCRIPTION_LENGTH = 150;
205
206    /** Whitespace pattern. */
207    private static final Pattern WHITESPACE = Pattern.compile("\\s+");
208
209    /** Non-alphanumeric pattern. */
210    private static final Pattern NON_ALPHANUMERIC = Pattern.compile("[^a-z0-9]+");
211
212    /**
213     * Matches only plain {@code .xml} files (not {@code .xml.vm} or
214     * {@code .xml.template}).  Used when scanning check/filter/filefilter
215     * directories to avoid processing pre-render source templates and
216     * producing duplicate index entries.
217     */
218    private static final Pattern PLAIN_XML = Pattern.compile("\\.xml$");
219
220    /**
221     * Matches {@code .xml}, {@code .xml.vm} and {@code .xml.template}.
222     * Used only for URL building (stripping the extension to produce a
223     * {@code .html} path) and for the general-pages scanner where we
224     * want to exclude templates by name rather than by extension.
225     */
226    private static final Pattern DOC_EXTENSION =
227            Pattern.compile("\\.xml$|\\.xml\\.vm$|\\.xml\\.template$");
228
229    /**
230     * Matches {@code config_<category>.xml} files that redirect to check category pages.
231     * Captures the category name (e.g. "metrics" from "config_metrics.xml") in group 1.
232     */
233    private static final Pattern CONFIG_CATEGORY =
234          Pattern.compile("^config_(.+)\\.xml$");
235
236    /**
237     * Matches an example paragraph {@code id} attribute that has a suffix of
238     * either {@code -config} or {@code -code}, capturing the base label
239     * (e.g. "Example1") in group 1 and the type ("config" or "code") in
240     * group 2.
241     *
242     * <p>Example ids found in XDoc source:</p>
243     * <ul>
244     *   <li>{@code id="Example1-config"} -&gt; label "Example1", type "config"</li>
245     *   <li>{@code id="Example1-code"}   -&gt; label "Example1", type "code"</li>
246     * </ul>
247     */
248    private static final Pattern EXAMPLE_PARAGRAPH_ID =
249            Pattern.compile("^(Example\\d+)-(config|code)$");
250
251    /**
252     * Generic section/subsection names that are structurally repeated across
253     * many unrelated general pages (IDE setup guides, writing-* guides, etc).
254     * On their own they are meaningless in search results ("Debug" appears
255     * identically in eclipse.xml, idea.xml, and netbeans.xml) so when one of
256     * these is used as a section title it is always disambiguated with the
257     * source page's own title, e.g. "Eclipse IDE: Debug".
258     */
259    private static final Set<String> GENERIC_SECTION_NAMES = new HashSet<>(Arrays.asList(
260            "overview", DESCRIPTION, EXAMPLES_SUBSECTION, "example", "debug",
261            "contributing", "limitations", "parameters", "installation"
262    ));
263
264    /**
265     * Display names for the check category subdirectories under
266     * {@code checks/}, keyed by lowercase directory name. Every directory
267     * that exists under {@code checks/} must have an entry here -
268     * {@link #processChecksDirectory} fails fast if one is missing, so a
269     * contributor adding a new category is forced to register its display
270     * name instead of getting a guessed-at label.
271     */
272    private static final Map<String, String> CHECKS_CATEGORY_DISPLAY_NAMES = new LinkedHashMap<>();
273
274    static {
275        CHECKS_CATEGORY_DISPLAY_NAMES.put("annotation", "Annotations");
276        CHECKS_CATEGORY_DISPLAY_NAMES.put("blocks", "Block Checks");
277        CHECKS_CATEGORY_DISPLAY_NAMES.put("coding", "Coding");
278        CHECKS_CATEGORY_DISPLAY_NAMES.put("design", "Class Design");
279        CHECKS_CATEGORY_DISPLAY_NAMES.put("header", "Headers");
280        CHECKS_CATEGORY_DISPLAY_NAMES.put("imports", "Imports");
281        CHECKS_CATEGORY_DISPLAY_NAMES.put("javadoc", "Javadoc Comments");
282        CHECKS_CATEGORY_DISPLAY_NAMES.put("metrics", "Metrics");
283        CHECKS_CATEGORY_DISPLAY_NAMES.put("misc", "Miscellaneous");
284        CHECKS_CATEGORY_DISPLAY_NAMES.put("modifier", "Modifiers");
285        CHECKS_CATEGORY_DISPLAY_NAMES.put("naming", "Naming Conventions");
286        CHECKS_CATEGORY_DISPLAY_NAMES.put("regexp", "Regexp");
287        CHECKS_CATEGORY_DISPLAY_NAMES.put("sizes", "Size Violations");
288        CHECKS_CATEGORY_DISPLAY_NAMES.put("whitespace", "Whitespace");
289    }
290
291    /** Stop words: too generic to be useful as search keywords. */
292    private static final Set<String> STOP_WORDS = new HashSet<>(Arrays.asList(
293            "a", "an", "the", "and", "or", "of", "to", "in", "is", "it",
294            "that", "this", "for", "on", "with", "are", "be", "by", "at",
295            "as", "if", "its", "from", "which", "whether", "can", "will",
296            "has", "have", "not", "also", "only", "any", "all", "each",
297            "more", "than", "when", "then", "into", "such", "use", "used",
298            "check", CHECKS, "checkstyle"
299    ));
300
301    /** Accumulated search index entries. */
302    private List<SearchIndexEntry> entries;
303
304    /** Deduplication guard for URLs. */
305    private Set<String> seenUrls;
306
307    /** Prevent instantiation. */
308    private SearchIndexGenerator() {
309    }
310
311    /**
312     * Main entry point called by exec-maven-plugin.
313     *
314     * @param args args[0] = path to src/xdocs, args[1] = path to target/site
315     * @throws IOException on file write failure
316     * @throws IllegalArgumentException if args are missing
317     * @throws IllegalStateException if xdocsDir is missing
318     * @noinspectionreason UseOfSystemOutOrSystemErr - main method of a CLI utility
319     */
320    public static void main(String... args) throws IOException {
321        new SearchIndexGenerator().execute(args);
322    }
323
324    /**
325     * Internal execution method to avoid static context for the logger.
326     *
327     * @param args args[0] = path to src/xdocs, args[1] = output file path
328     * @throws IOException on file write failure
329     * @throws IllegalArgumentException if args are missing
330     * @throws IllegalStateException if xdocsDir is missing
331     */
332    private void execute(String... args) throws IOException {
333        if (args.length < 2) {
334            throw new IllegalArgumentException(
335                    "Usage: SearchIndexGenerator <xdocsDir> <outputFilePath>");
336        }
337
338        final Path xdocsPath = Path.of(args[0]);
339        final Path outputFilePath = Path.of(args[1]);
340        final File xdocsDir = xdocsPath.toFile();
341
342        if (!Files.exists(xdocsPath)) {
343            final String error = "[SearchIndex] ERROR: xdocsDir not found: "
344                    + xdocsPath.toAbsolutePath();
345            throw new IllegalStateException(error);
346        }
347
348        seenUrls = new LinkedHashSet<>();
349        entries = new ArrayList<>();
350
351        final Path checksPath = xdocsPath.resolve(CHECKS);
352        if (Files.exists(checksPath)) {
353            processChecksDirectory(checksPath.toFile(), xdocsDir);
354        }
355
356        final Path filtersPath = xdocsPath.resolve(FILTERS_DIR);
357        if (Files.exists(filtersPath)) {
358            processDirectory(filtersPath.toFile(), xdocsDir,
359                    "Filters", "Filter");
360        }
361
362        final Path fileFiltersPath = xdocsPath.resolve(FILEFILTERS_DIR);
363        if (Files.exists(fileFiltersPath)) {
364            processDirectory(fileFiltersPath.toFile(), xdocsDir,
365                    "File Filters", "File Filter");
366        }
367
368        processGeneralPages(xdocsDir);
369        writeJson(entries, outputFilePath);
370
371    }
372
373    /**
374     * Walks {@code src/xdocs/checks/} and processes each category subdirectory.
375     *
376     * <p>Every directory found here must have a corresponding entry in
377     * {@link #CHECKS_CATEGORY_DISPLAY_NAMES}; an unmapped directory likely
378     * means a new check category was added without registering its display
379     * name, so this fails fast rather than guessing a label from the
380     * directory name.</p>
381     *
382     * @param checksDir the checks root directory
383     * @param xdocsDir  the xdocs root (used for URL building)
384     * @throws IllegalStateException if {@code checksDir} cannot be listed, or
385     *         if one of its subdirectories has no entry in
386     *         {@link #CHECKS_CATEGORY_DISPLAY_NAMES}
387     */
388    private void processChecksDirectory(File checksDir, File xdocsDir) {
389        final File[] categoryDirs = checksDir.listFiles(File::isDirectory);
390        if (categoryDirs == null) {
391            throw new IllegalStateException(
392                    "Unable to list check category directories under: " + checksDir);
393        }
394
395        Arrays.sort(categoryDirs);
396        for (File categoryDir : categoryDirs) {
397            final String dirName = categoryDir.getName().toLowerCase(Locale.ROOT);
398            final String category = CHECKS_CATEGORY_DISPLAY_NAMES.get(dirName);
399            if (category == null) {
400                throw new IllegalStateException(
401                        "No display name registered for check category directory '"
402                                + dirName + "' in CHECKS_CATEGORY_DISPLAY_NAMES. "
403                                + "Please add one.");
404            }
405            processDirectory(categoryDir, xdocsDir, category, "Check");
406        }
407    }
408
409    /**
410     * Processes all <b>plain</b> {@code .xml} files in a directory
411     * (non-recursive). {@code index.xml} files and any file whose name ends
412     * with {@code .xml.template} or {@code .xml.vm} are skipped.
413     *
414     * <p>Skipping templates is critical: every check page has a sibling
415     * {@code *.xml.template} file that resolves to the <em>same</em> HTML
416     * URL. Without this filter both files would be processed, producing two
417     * identical (or near-identical) main entries plus doubled example and
418     * property entries for every check.</p>
419     *
420     * <p>For each plain {@code .xml} file, the main check/filter entry,
421     * per-example entries (both config and code), and per-property entries
422     * are added.</p>
423     *
424     * @param dir      directory to scan
425     * @param xdocsDir xdocs root (used for URL building)
426     * @param category category label for all entries in this directory
427     * @param type     document type ("Check", "Filter", "File Filter")
428     */
429    private void processDirectory(File dir, File xdocsDir,
430                                  String category, String type) {
431        final File[] xmlFiles = dir.listFiles(file -> {
432            return file.isFile()
433                    && PLAIN_XML.matcher(file.getName()).find()
434                    && !INDEX_XML.equals(file.getName());
435        });
436
437        if (xmlFiles != null) {
438            Arrays.sort(xmlFiles);
439            for (File xmlFile : xmlFiles) {
440                processXmlFile(xmlFile, xdocsDir, category, type);
441            }
442        }
443    }
444
445    /**
446     * Parses a single check/filter XDoc file and adds its main, example, and
447     * property entries to the index.
448     *
449     * <p>A parse failure here means the source XDoc itself is malformed,
450     * which is a real problem with the documentation rather than something
451     * safe to skip - so this fails the build instead of logging a warning
452     * and silently continuing.</p>
453     *
454     * @param xmlFile  the XDoc source file to process
455     * @param xdocsDir xdocs root (used for URL building)
456     * @param category category label for entries from this file
457     * @param type     document type ("Check", "Filter", "File Filter")
458     * @throws IllegalStateException if {@code xmlFile} cannot be parsed
459     */
460    private void processXmlFile(File xmlFile, File xdocsDir, String category, String type) {
461        try {
462            final Document doc = parseXml(xmlFile);
463            final String baseUrl = buildUrl(xmlFile, xdocsDir);
464
465            addIfNew(buildMainEntry(doc, xmlFile, category, type, baseUrl));
466
467            for (SearchIndexEntry entry : extractExampleEntries(doc, baseUrl, category)) {
468                addIfNew(entry);
469            }
470            for (SearchIndexEntry entry : extractPropertyEntries(doc, baseUrl, category)) {
471                addIfNew(entry);
472            }
473        }
474        catch (IOException | SAXException | ParserConfigurationException exception) {
475            throw new IllegalStateException(PARSE_FAILURE_MSG + xmlFile, exception);
476        }
477    }
478
479    /**
480     * Adds entries for the top-level general documentation pages.
481     *
482     * <p>Each remaining page is indexed per top-level {@code <section>},
483     * using the section's full text content for keyword extraction so
484     * page-internal headings are fully discoverable. Generic structural
485     * section names (see {@link #GENERIC_SECTION_NAMES}) are disambiguated
486     * by prefixing the page's own title.</p>
487     *
488     * @param xdocsDir the xdocs root directory
489     */
490    private void processGeneralPages(File xdocsDir) {
491        final File[] xmlFiles = xdocsDir.listFiles(file -> {
492            final String name = file.getName();
493            return file.isFile()
494                    && PLAIN_XML.matcher(name).find();
495        });
496
497        if (xmlFiles != null) {
498            Arrays.sort(xmlFiles);
499            for (File xmlFile : xmlFiles) {
500                processGeneralPage(xmlFile);
501            }
502        }
503    }
504
505    /**
506     * Parses a single general-documentation XDoc page and adds its
507     * per-section entries to the index.
508     *
509     * <p>A parse failure here means the source XDoc itself is malformed, so
510     * this fails the build instead of logging a warning and continuing.</p>
511     *
512     * @param xmlFile the XDoc source file to process
513     * @throws IllegalStateException if {@code xmlFile} cannot be parsed
514     */
515    private void processGeneralPage(File xmlFile) {
516        try {
517            for (SearchIndexEntry entry : buildGeneralPageEntries(xmlFile)) {
518                addIfNew(entry);
519            }
520        }
521        catch (IOException | SAXException | ParserConfigurationException exception) {
522            throw new IllegalStateException(PARSE_FAILURE_MSG + xmlFile, exception);
523        }
524    }
525
526    /**
527     * Builds the main search entry representing an entire check/filter document.
528     *
529     * @param doc      the parsed XDoc document
530     * @param xmlFile  the source file
531     * @param category category label for this file's entry
532     * @param type     document type ("Check", "Filter", etc.)
533     * @param baseUrl  the page url without anchor
534     * @return an entry representing the document
535     */
536    private static SearchIndexEntry buildMainEntry(Document doc, File xmlFile,
537                                                   String category, String type,
538                                                   String baseUrl) {
539        final Element body = requireBody(doc, xmlFile.toString());
540        final NodeList sections = body.getElementsByTagName(SECTION);
541
542        final String title = extractTitle(doc, xmlFile, sections);
543        final String description = extractAggregateDescription(sections);
544        final String keywords = extractAggregateKeywords(title, sections);
545
546        return new SearchIndexEntry(title, baseUrl, category, type, description, keywords);
547    }
548
549    /**
550     * Builds one search entry per top-level {@code <section>} in a general
551     * documentation page, using each section's full text for keyword
552     * extraction so that page-internal content is fully discoverable.
553     *
554     * <p>Generic structural section names (see {@link #GENERIC_SECTION_NAMES})
555     * are disambiguated as {@code "<page title>: <section name>"} to avoid
556     * collisions across pages (e.g. "Eclipse IDE: Debug" vs
557     * "IntelliJ IDE: Debug").</p>
558     *
559     * @param xmlFile the XDoc source file to parse
560     * @return list of entries, one per top-level section found
561     * @throws ParserConfigurationException on XML parser setup failure
562     * @throws SAXException on XML parse error
563     * @throws IOException on file read failure
564     */
565    private static List<SearchIndexEntry> buildGeneralPageEntries(File xmlFile)
566            throws ParserConfigurationException, SAXException, IOException {
567        final List<SearchIndexEntry> results = new ArrayList<>();
568        final Document doc = parseXml(xmlFile);
569        final Element body = requireBody(doc, xmlFile.toString());
570        final NodeList sections = body.getElementsByTagName(SECTION);
571        final String pageUrl = resolvePageUrl(xmlFile, xmlFile.getParentFile());
572        final String pageTitle = derivePageTitle(doc, xmlFile);
573
574        if (sections.getLength() == 0) {
575            final String fullText = WHITESPACE.matcher(body.getTextContent())
576                    .replaceAll(SPACE).trim();
577            final String description = extractFirstSentenceOrTruncated(fullText);
578            final String keywords = extractKeywordsFromText(
579                    pageTitle + SPACE + fullText);
580            results.add(new SearchIndexEntry(
581                    pageTitle, pageUrl, GENERAL, GENERAL, description, keywords));
582        }
583        else {
584            for (int index = 0; index < sections.getLength(); index++) {
585                final Element section = (Element) sections.item(index);
586                if (body.equals(section.getParentNode())) {
587                    final String sectionName = section.getAttribute(NAME_ATTR).trim();
588                    if (!sectionName.isEmpty() && !CONTENT.equalsIgnoreCase(sectionName)) {
589
590                        final String entryTitle = disambiguateTitle(sectionName, pageTitle);
591                        final String anchor = doxiaAnchorFor(sectionName);
592                        final String url = pageUrl + ANCHOR_SEPARATOR + anchor;
593
594                        final String sectionText = WHITESPACE.matcher(section.getTextContent())
595                                .replaceAll(SPACE).trim();
596                        final String description = extractFirstSentenceOrTruncated(sectionText);
597                        final String keywords = extractKeywordsFromText(
598                                pageTitle + SPACE + sectionName + SPACE + sectionText);
599
600                        results.add(new SearchIndexEntry(
601                                entryTitle, url, GENERAL, GENERAL, description, keywords));
602                    }
603                }
604            }
605        }
606
607        return results;
608    }
609
610    /**
611     * Extracts per-example search entries from a check/filter document.
612     *
613     * <p>Both {@code -config} and {@code -code} example paragraphs are
614     * indexed so users can find both the configuration snippet and the
615     * corresponding Java code example independently in search results.</p>
616     *
617     * <p>Titles use the pattern {@code "<CheckName>: Example1 [config]"} and
618     * {@code "<CheckName>: Example1 [code]"} to make the type immediately
619     * visible in search result listings without needing to open the page.</p>
620     *
621     * <p>Confirmed XDoc template structure for the Examples subsection:</p>
622     * <pre>
623     *   &lt;p id="Example1-config"&gt;To configure the check...&lt;/p&gt;
624     *   &lt;macro name="example"&gt;&lt;param name="type" value="config"/&gt;&lt;/macro&gt;
625     *   &lt;p id="Example1-code"&gt;Example:&lt;/p&gt;
626     *   &lt;macro name="example"&gt;&lt;param name="type" value="code"/&gt;&lt;/macro&gt;
627     * </pre>
628     *
629     * @param doc      the parsed XDoc document
630     * @param baseUrl  the page url without anchor
631     * @param category category label
632     * @return list of per-example entries (both config and code); empty if
633     *         none found
634     */
635    private static List<SearchIndexEntry> extractExampleEntries(Document doc,
636                                                                String baseUrl,
637                                                                String category) {
638        final List<SearchIndexEntry> exampleEntries = new ArrayList<>();
639        final Element body = requireBody(doc, baseUrl);
640        final NodeList sections = body.getElementsByTagName(SECTION);
641
642        for (int sectionIdx = 0; sectionIdx < sections.getLength(); sectionIdx++) {
643            final Element section = (Element) sections.item(sectionIdx);
644            final String checkName = section.getAttribute(NAME_ATTR).trim();
645            final Element examplesSubsection =
646                    findSubsectionByPrefix(section, EXAMPLES_SUBSECTION);
647
648            if (examplesSubsection == null) {
649                continue;
650            }
651
652            final NodeList paragraphs = examplesSubsection.getElementsByTagName("p");
653
654            for (int paragraphIndex = 0; paragraphIndex < paragraphs.getLength();
655                 paragraphIndex++) {
656                final Element paragraph = (Element) paragraphs.item(paragraphIndex);
657                final SearchIndexEntry entry = buildExampleEntry(
658                        paragraph, checkName, baseUrl, category);
659                if (entry != null) {
660                    exampleEntries.add(entry);
661                }
662            }
663        }
664
665        return exampleEntries;
666    }
667
668    /**
669     * Builds a single example entry from a paragraph element.
670     *
671     * @param paragraph the paragraph element containing the example
672     * @param checkName the name of the check
673     * @param baseUrl the base URL for the page
674     * @param category the category label
675     * @return a SearchIndexEntry if the paragraph matches the example pattern,
676     *         null otherwise
677     */
678    private static SearchIndexEntry buildExampleEntry(Element paragraph,
679                                                       String checkName,
680                                                       String baseUrl,
681                                                       String category) {
682        final String id = paragraph.getAttribute(ID_ATTR);
683        final Matcher matcher = EXAMPLE_PARAGRAPH_ID.matcher(id);
684        SearchIndexEntry result = null;
685
686        if (matcher.matches()) {
687            final String exampleLabel = matcher.group(1);
688            final String exampleType = matcher.group(2);
689
690            final String labelSuffix;
691            if ("config".equals(exampleType)) {
692                labelSuffix = EXAMPLE_LABEL_CONFIG;
693            }
694            else {
695                labelSuffix = EXAMPLE_LABEL_CODE;
696            }
697
698            final String introText = WHITESPACE
699                    .matcher(paragraph.getTextContent())
700                    .replaceAll(SPACE).trim();
701
702            final String title = checkName + TITLE_SEPARATOR
703                    + exampleLabel + labelSuffix;
704            final String url = baseUrl + ANCHOR_SEPARATOR + id;
705            final String description =
706                    truncate(introText, MAX_DESCRIPTION_LENGTH);
707            final String keywords = extractKeywordsFromText(
708                    checkName + SPACE + exampleLabel
709                            + SPACE + exampleType + SPACE + introText);
710
711            result = new SearchIndexEntry(
712                    title, url, category, EXAMPLE_TYPE,
713                    description, keywords);
714        }
715
716        return result;
717    }
718
719    /**
720     * Extracts per-property search entries from a check/filter document.
721     *
722     * <p>Each row of the Properties table is indexed under the title
723     * {@code "<CheckName>: <propertyName>"} and linked to the property's
724     * own anchor on the page.</p>
725     *
726     * @param doc      the parsed XDoc document
727     * @param baseUrl  the page url without anchor
728     * @param category category label
729     * @return list of per-property entries; empty if none found
730     */
731    private static List<SearchIndexEntry> extractPropertyEntries(Document doc,
732                                                                 String baseUrl,
733                                                                 String category) {
734        final List<SearchIndexEntry> propertyEntries = new ArrayList<>();
735        final Element body = requireBody(doc, baseUrl);
736        final NodeList sections = body.getElementsByTagName(SECTION);
737
738        for (int sectionIdx = 0; sectionIdx < sections.getLength(); sectionIdx++) {
739            final Element section = (Element) sections.item(sectionIdx);
740            final Element propertiesSubsection =
741                    findSubsectionByPrefix(section, PROPERTIES_FRAGMENT);
742
743            if (propertiesSubsection != null) {
744                final String checkName = section.getAttribute(NAME_ATTR).trim();
745                extractPropertiesFromRows(propertiesSubsection, checkName, baseUrl,
746                        category, propertyEntries);
747            }
748        }
749
750        return propertyEntries;
751    }
752
753    /**
754     * Extracts property entries from table rows and adds them to the list.
755     *
756     * @param propertiesSubsection the properties subsection element
757     * @param checkName the check name
758     * @param baseUrl the page url without anchor
759     * @param category category label
760     * @param propertyEntries the list to add entries to
761     */
762    private static void extractPropertiesFromRows(Element propertiesSubsection,
763                                                  String checkName,
764                                                  String baseUrl,
765                                                  String category,
766                                                  List<SearchIndexEntry> propertyEntries) {
767        final NodeList rows = propertiesSubsection.getElementsByTagName("tr");
768
769        for (int rowIdx = 1; rowIdx < rows.getLength(); rowIdx++) {
770            final Element row = (Element) rows.item(rowIdx);
771            final NodeList cells = row.getElementsByTagName("td");
772            if (cells.getLength() >= 2) {
773                processPropertyRow(cells, checkName, baseUrl, category, propertyEntries);
774            }
775        }
776    }
777
778    /**
779     * Processes a single property row and adds an entry if valid.
780     *
781     * @param cells the table cells
782     * @param checkName the check name
783     * @param baseUrl the page url without anchor
784     * @param category category label
785     * @param propertyEntries the list to add entries to
786     */
787    private static void processPropertyRow(NodeList cells,
788                                           String checkName,
789                                           String baseUrl,
790                                           String category,
791                                           List<SearchIndexEntry> propertyEntries) {
792        final String propName = WHITESPACE
793                .matcher(cells.item(0).getTextContent())
794                .replaceAll(SPACE).trim();
795
796        if (!propName.isEmpty()) {
797            final String propDesc = WHITESPACE
798                    .matcher(cells.item(1).getTextContent())
799                    .replaceAll(SPACE).trim();
800
801            final String title = checkName + TITLE_SEPARATOR + propName;
802            final String url = baseUrl + ANCHOR_SEPARATOR + propName;
803            final String description = truncate(propDesc, MAX_DESCRIPTION_LENGTH);
804            final String keywords = extractKeywordsFromText(
805                    checkName + SPACE + propName + SPACE + propDesc);
806
807            propertyEntries.add(new SearchIndexEntry(
808                    title, url, category, PROPERTY_TYPE,
809                    description, keywords));
810        }
811    }
812
813    /**
814     * Adds an entry to the output list only if its URL has not been seen
815     * before. This is a secondary guard that catches any duplicates that
816     * slip through the primary filter (only processing plain {@code .xml}
817     * files), e.g. if a check has the same example paragraph id repeated
818     * across two sections.
819     *
820     * @param entry the entry to conditionally add
821     */
822    private void addIfNew(SearchIndexEntry entry) {
823        if (seenUrls.add(entry.url())) {
824            entries.add(entry);
825        }
826    }
827
828    /**
829     * Finds a subsection within a section whose lowercased name contains the
830     * given fragment (e.g. "examples" or "propert" to match "Properties").
831     *
832     * @param section  the section to search
833     * @param fragment lowercase fragment to match against the subsection name
834     * @return the matching subsection element, or {@code null} if not found
835     */
836    private static Element findSubsectionByPrefix(Element section, String fragment) {
837        final NodeList subsections = section.getElementsByTagName(SUBSECTION);
838        Element result = null;
839        for (int index = 0; index < subsections.getLength(); index++) {
840            final Element sub = (Element) subsections.item(index);
841            if (sub.getAttribute(NAME_ATTR).trim()
842                    .toLowerCase(Locale.ROOT).contains(fragment)) {
843                result = sub;
844                break;
845            }
846        }
847        return result;
848    }
849
850    /**
851     * Parses the XML file into a Document with external entity resolution
852     * disabled for security.
853     *
854     * @param xmlFile the XDoc source file
855     * @return the parsed Document
856     * @throws ParserConfigurationException on XML parser setup failure
857     * @throws SAXException on XML parse error
858     * @throws IOException on file read failure
859     */
860    private static Document parseXml(File xmlFile)
861            throws ParserConfigurationException, SAXException, IOException {
862        final DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
863        factory.setFeature(EXTERNAL_GENERAL_ENTITIES, false);
864        factory.setFeature(EXTERNAL_PARAMETER_ENTITIES, false);
865
866        final DocumentBuilder builder = factory.newDocumentBuilder();
867        builder.setErrorHandler(null);
868
869        final Document doc = builder.parse(xmlFile);
870        doc.getDocumentElement().normalize();
871        return doc;
872    }
873
874    /**
875     * Returns the document's {@code <body>} element, failing fast if it is
876     * absent. Every XDoc page processed by this generator is expected to
877     * have one; its absence indicates a malformed source file that should
878     * be fixed rather than silently skipped or producing an empty entry.
879     *
880     * @param doc        the parsed document
881     * @param identifier file path or URL used to identify the source in the
882     *                   error message
883     * @return the body element
884     * @throws IllegalStateException if {@code doc} has no {@code <body>} element
885     */
886    private static Element requireBody(Document doc, String identifier) {
887        final NodeList bodies = doc.getElementsByTagName(BODY);
888        if (bodies.getLength() == 0) {
889            throw new IllegalStateException(
890                    "XDoc file is missing a <body> element: " + identifier);
891        }
892        final Element body = (Element) bodies.item(0);
893        if (body == null) {
894            throw new IllegalStateException(
895                    "XDoc file has a null <body> element: " + identifier);
896        }
897        return body;
898    }
899
900    /**
901     * Extracts the document title from the {@code <title>} element, falling
902     * back to the first non-empty, non-"Content" section name, and finally
903     * to a capitalised version of the file name.
904     *
905     * @param doc      the document
906     * @param xmlFile  the source file
907     * @param sections the list of sections
908     * @return the title string, never empty
909     */
910    private static String extractTitle(Document doc, File xmlFile, NodeList sections) {
911        final NodeList titles = doc.getElementsByTagName(TITLE);
912        String title = "";
913        if (titles.getLength() > 0) {
914            title = titles.item(0).getTextContent().trim();
915        }
916
917        if ((title.isEmpty() || CONTENT.equalsIgnoreCase(title))
918                && sections.getLength() > 0) {
919            final String firstSection =
920                    ((Element) sections.item(0)).getAttribute(NAME_ATTR).trim();
921            if (!firstSection.isEmpty() && !CONTENT.equalsIgnoreCase(firstSection)) {
922                title = firstSection;
923            }
924        }
925
926        if (title.isEmpty() || CONTENT.equalsIgnoreCase(title)) {
927            final String name =
928                    xmlFile.getName().replaceFirst(DOC_EXTENSION.pattern(), "");
929            title = capitalise(name.replace('_', ' '));
930        }
931        return title;
932    }
933
934    /**
935     * Aggregates description from sections, taking the first non-empty
936     * Description subsection found across all sections in the document.
937     *
938     * @param sections list of sections
939     * @return description string, possibly empty
940     */
941    private static String extractAggregateDescription(NodeList sections) {
942        String description = "";
943        for (int index = 0; index < sections.getLength(); index++) {
944            description = extractDescription((Element) sections.item(index));
945            if (!description.isEmpty()) {
946                break;
947            }
948        }
949        return description;
950    }
951
952    /**
953     * Aggregates keywords from sections using all section text so that the
954     * main check entry is discoverable by any term in the document.
955     *
956     * @param title    the document title
957     * @param sections list of sections
958     * @return keywords string
959     */
960    private static String extractAggregateKeywords(String title, NodeList sections) {
961        final StringBuilder keywordSource = new StringBuilder(title);
962        for (int index = 0; index < sections.getLength(); index++) {
963            final Element section = (Element) sections.item(index);
964            keywordSource.append(SPACE_CHAR)
965                .append(section.getAttribute(NAME_ATTR))
966                .append(SPACE_CHAR)
967                .append(section.getTextContent());
968        }
969        return extractKeywordsFromText(keywordSource.toString());
970    }
971
972    /**
973     * Extracts the first sentence of the Description subsection.
974     * Returns an empty string if no Description subsection is found.
975     *
976     * @param section the {@code <section>} element to search
977     * @return first sentence of the description, or empty string
978     */
979    private static String extractDescription(Element section) {
980        final Element sub = findSubsectionByPrefix(section, DESCRIPTION);
981        String result = "";
982        if (sub != null) {
983            final String text = WHITESPACE.matcher(sub.getTextContent())
984                    .replaceAll(SPACE).trim();
985            result = extractFirstSentenceOrTruncated(text);
986        }
987        return result;
988    }
989
990    /**
991     * Derives a fallback page title from the document's {@code <title>}
992     * element or, failing that, from the filename.
993     *
994     * @param doc     the parsed document
995     * @param xmlFile the source file
996     * @return a non-empty title string
997     */
998    private static String derivePageTitle(Document doc, File xmlFile) {
999        final NodeList titles = doc.getElementsByTagName(TITLE);
1000        String title = "";
1001        if (titles.getLength() > 0) {
1002            title = titles.item(0).getTextContent().trim();
1003        }
1004        if (title.isEmpty()) {
1005            final String name =
1006                    xmlFile.getName().replaceFirst(DOC_EXTENSION.pattern(), "");
1007            title = capitalise(name.replace('_', ' '));
1008        }
1009        return title;
1010    }
1011
1012    /**
1013     * Disambiguates a section title when it is a generic, structurally
1014     * repeated header (see {@link #GENERIC_SECTION_NAMES}).
1015     * Non-generic section names are returned unchanged.
1016     *
1017     * @param sectionName the raw section name
1018     * @param pageTitle   the owning page's own title
1019     * @return either {@code sectionName} unchanged, or
1020     *         {@code "<pageTitle>: <sectionName>"} if generic
1021     */
1022    private static String disambiguateTitle(String sectionName, String pageTitle) {
1023        final String result;
1024        if (GENERIC_SECTION_NAMES.contains(sectionName.toLowerCase(Locale.ROOT))) {
1025            result = pageTitle + TITLE_SEPARATOR + sectionName;
1026        }
1027        else {
1028            result = sectionName;
1029        }
1030        return result;
1031    }
1032
1033    /**
1034     * Converts a Doxia {@code <section name="...">} value into the anchor id
1035     * Doxia generates for it in the rendered HTML by replacing runs of
1036     * whitespace with single underscores.
1037     *
1038     * @param sectionName the raw {@code name} attribute value
1039     * @return the anchor id Doxia would render for this section name
1040     */
1041    private static String doxiaAnchorFor(String sectionName) {
1042        return WHITESPACE.matcher(sectionName.trim()).replaceAll("_");
1043    }
1044
1045    /**
1046     * Returns the first sentence of the given text (up to and including the
1047     * first period), or the text truncated to {@link #MAX_DESCRIPTION_LENGTH}
1048     * with an ellipsis if no period is found within range.
1049     *
1050     * @param text the source text, already whitespace-normalised
1051     * @return first sentence or truncated text
1052     */
1053    private static String extractFirstSentenceOrTruncated(String text) {
1054        final String result;
1055        final int dot = text.indexOf('.');
1056        if (dot > 0) {
1057            result = text.substring(0, dot + 1).trim();
1058        }
1059        else {
1060            result = truncate(text, MAX_DESCRIPTION_LENGTH);
1061        }
1062        return result;
1063    }
1064
1065    /**
1066     * Truncates text to the given max length, appending an ellipsis if
1067     * truncation occurred.
1068     *
1069     * @param text      the text to truncate
1070     * @param maxLength maximum length before truncation
1071     * @return original text if short enough, otherwise truncated with ellipsis
1072     */
1073    private static String truncate(String text, int maxLength) {
1074        final String result;
1075        if (text.length() > maxLength) {
1076            result = text.substring(0, maxLength) + ELLIPSIS;
1077        }
1078        else {
1079            result = text;
1080        }
1081        return result;
1082    }
1083
1084    /**
1085     * Builds the root-relative URL for an XDoc file, without any anchor.
1086     * Always uses forward slashes regardless of OS.
1087     *
1088     * @param xmlFile  the source XDoc file
1089     * @param xdocsDir the xdocs root directory
1090     * @return root-relative URL string with no anchor
1091     */
1092    private static String buildUrl(File xmlFile, File xdocsDir) {
1093        return xdocsDir.toPath()
1094                .relativize(xmlFile.toPath())
1095                .toString()
1096                .replace(File.separatorChar, '/')
1097                .replaceFirst(DOC_EXTENSION.pattern(), ".html");
1098    }
1099
1100    /**
1101     * Resolves the correct URL for a general page file. For {@code config_<category>.xml} files
1102     * that redirect to check category pages, maps to {@code checks/<category>/index.html} instead
1103     * of the file path.
1104     *
1105     * @param xmlFile  the source XDoc file
1106     * @param xdocsDir the xdocs root directory
1107     * @return the resolved URL
1108     */
1109    private static String resolvePageUrl(File xmlFile, File xdocsDir) {
1110        String url = buildUrl(xmlFile, xdocsDir);
1111        final Matcher matcher = CONFIG_CATEGORY.matcher(xmlFile.getName());
1112        if (matcher.find()) {
1113            final String category = matcher.group(1);
1114            if (CHECKS_CATEGORY_DISPLAY_NAMES.containsKey(category)) {
1115                url = CHECKS + PATH_SEPARATOR + category + PATH_SEPARATOR + INDEX_HTML;
1116            }
1117            else if (FILTERS_DIR.equals(category) || FILEFILTERS_DIR.equals(category)) {
1118                url = category + PATH_SEPARATOR + INDEX_HTML;
1119            }
1120        }
1121        return url;
1122    }
1123
1124    /**
1125     * Extracts keywords from free-form text by splitting on non-word
1126     * characters and filtering short and stop words.
1127     *
1128     * @param text input text
1129     * @return comma-separated keyword string (up to {@link #MAX_KEYWORDS} words)
1130     */
1131    private static String extractKeywordsFromText(String text) {
1132        String result = "";
1133        if (text != null && !text.isEmpty()) {
1134            result = NON_ALPHANUMERIC.splitAsStream(text.toLowerCase(Locale.ROOT))
1135                    .filter(word -> {
1136                        return word.length() >= MIN_WORD_LENGTH
1137                                && !STOP_WORDS.contains(word);
1138                    })
1139                    .distinct()
1140                    .limit(MAX_KEYWORDS)
1141                    .collect(Collectors.joining(COMMA_STR));
1142        }
1143        return result;
1144    }
1145
1146    /**
1147     * Writes all index entries to the output file.
1148     *
1149     * @param indexEntries the list of entries to serialise
1150     * @param outputFilePath the full path to the output file
1151     * @throws IOException on file write failure
1152     */
1153    private static void writeJson(List<SearchIndexEntry> indexEntries, Path outputFilePath)
1154            throws IOException {
1155
1156        final Path outputPath = outputFilePath.getParent();
1157        if (outputPath != null) {
1158            Files.createDirectories(outputPath);
1159        }
1160
1161        try (PrintWriter writer = new PrintWriter(Files.newBufferedWriter(
1162                outputFilePath, StandardCharsets.UTF_8))) {
1163            writer.println("[");
1164
1165            final int size = indexEntries.size();
1166            for (int index = 0; index < size; index++) {
1167                final String comma;
1168                if (index < size - 1) {
1169                    comma = COMMA_STR;
1170                }
1171                else {
1172                    comma = "";
1173                }
1174                writer.println("  " + indexEntries.get(index).toJson() + comma);
1175            }
1176            writer.println("]");
1177        }
1178    }
1179
1180    /**
1181     * Capitalises the first character of a string.
1182     *
1183     * @param input the string to capitalise
1184     * @return string with first character uppercased, or input unchanged if
1185     *         empty
1186     */
1187    private static String capitalise(String input) {
1188        String result = input;
1189        if (input != null && !input.isEmpty()) {
1190            result = Character.toUpperCase(input.charAt(0)) + input.substring(1);
1191        }
1192        return result;
1193    }
1194}