001///////////////////////////////////////////////////////////////////////////////////////////////
002// checkstyle: Checks Java source code and other text files for adherence to a set of rules.
003// Copyright (C) 2001-2026 the original author or authors.
004//
005// This library is free software; you can redistribute it and/or
006// modify it under the terms of the GNU Lesser General Public
007// License as published by the Free Software Foundation; either
008// version 2.1 of the License, or (at your option) any later version.
009//
010// This library is distributed in the hope that it will be useful,
011// but WITHOUT ANY WARRANTY; without even the implied warranty of
012// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
013// Lesser General Public License for more details.
014//
015// You should have received a copy of the GNU Lesser General Public
016// License along with this library; if not, write to the Free Software
017// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
018///////////////////////////////////////////////////////////////////////////////////////////////
019
020package com.puppycrawl.tools.checkstyle.site;
021
022import java.io.File;
023import java.io.IOException;
024import java.io.PrintWriter;
025import java.nio.charset.StandardCharsets;
026import java.nio.file.Files;
027import java.nio.file.Path;
028import java.util.ArrayList;
029import java.util.Arrays;
030import java.util.HashSet;
031import java.util.LinkedHashMap;
032import java.util.LinkedHashSet;
033import java.util.List;
034import java.util.Locale;
035import java.util.Map;
036import java.util.Set;
037import java.util.logging.Level;
038import java.util.logging.Logger;
039import java.util.regex.Matcher;
040import java.util.regex.Pattern;
041import java.util.stream.Collectors;
042
043import javax.xml.parsers.DocumentBuilder;
044import javax.xml.parsers.DocumentBuilderFactory;
045import javax.xml.parsers.ParserConfigurationException;
046
047import org.w3c.dom.Document;
048import org.w3c.dom.Element;
049import org.w3c.dom.NodeList;
050import org.xml.sax.SAXException;
051
052/**
053 * Generates {@code search-index.json} from the Checkstyle XDoc source files.
054 *
055 * <p>This is a plain Java {@code main()} class - no Maven plugin API required.
056 * It is invoked by {@code exec-maven-plugin} during the {@code process-classes}
057 * phase so the index is ready when Maven Site copies static resources.</p>
058 *
059 * <p>Output is written as a JSON file. The search widget fetches this file
060 * using the fetch API and parses it to populate the search index.</p>
061 *
062 * <h2>Key design decisions</h2>
063 * <ul>
064 *   <li><b>No duplicates.</b> Only plain {@code .xml} files are processed for
065 *       check/filter/filefilter directories. The {@code .xml.template} and
066 *       {@code .xml.vm} siblings are pre-render source files that would produce
067 *       identical URLs and duplicate entries. A secondary URL-keyed dedup guard
068 *       is also applied across the entire output list.</li>
069 *
070 *   <li><b>Identifiable example titles.</b> Both {@code -config} and
071 *       {@code -code} example paragraphs are indexed.  Their titles use the
072 *       pattern {@code "<CheckName>: Example1 [config]"} and
073 *       {@code "<CheckName>: Example1 [code]"} so users can distinguish a
074 *       configuration snippet from its matching Java code example in search
075 *       results.</li>
076 *
077 *   <li><b>Full general-page indexing.</b> Each meaningful {@code <section>}
078 *       in general documentation pages (e.g. {@code config_system_properties},
079 *       {@code writingchecks}, {@code cmdline}) is indexed as its own entry
080 *       with the full section text used for keyword extraction - not just the
081 *       first sentence. This makes page-internal headings discoverable.</li>
082 *
083 *   <li><b>Disambiguated generic titles.</b> Structural section names that are
084 *       repeated across many pages (e.g. "Overview", "Debug", "Contributing")
085 *       are prefixed with the page title, yielding e.g.
086 *       "Eclipse IDE: Debug" instead of a bare "Debug" that collides with
087 *       "IntelliJ IDE: Debug".</li>
088 *
089 *   <li><b>Junk pages excluded.</b> Release notes, auto-generated style
090 *       coverage reports and bare category aggregator stubs are skipped.</li>
091 * </ul>
092 *
093 * <p>Usage (called by exec-maven-plugin in pom.xml):</p>
094 * <pre>
095 *   java SearchIndexGenerator &lt;xdocsDir&gt; &lt;outputFilePath&gt;
096 *   java SearchIndexGenerator src/site/xdoc target/site/search-index.json
097 * </pre>
098 */
099public final class SearchIndexGenerator {
100
    /**
     * Name of the XDoc subdirectory that holds the check pages. Also registered
     * as a stop word in {@link #STOP_WORDS}.
     */
    private static final String CHECKS = "checks";

    /** String literal for comma. */
    private static final String COMMA_STR = ",";

    /**
     * Single space. Replaces whitespace runs when text is normalised and joins
     * text fragments before keyword extraction.
     */
    private static final String SPACE = " ";

    /** Character literal for space. */
    private static final char SPACE_CHAR = ' ';

    /**
     * Separator placed between a check name and an example label in entry titles,
     * e.g. "AnnotationLocation: Example1 [config]".
     */
    private static final String TITLE_SEPARATOR = ": ";

    /** String literal for ellipsis. */
    private static final String ELLIPSIS = "...";

    /**
     * SAX feature identifier for external general entities.
     * NOTE(review): presumably switched off on the parser factory in
     * {@code parseXml} - confirm there.
     */
    private static final String EXTERNAL_GENERAL_ENTITIES =
            "http://xml.org/sax/features/external-general-entities";

    /**
     * SAX feature identifier for external parameter entities.
     * NOTE(review): presumably switched off on the parser factory in
     * {@code parseXml} - confirm there.
     */
    private static final String EXTERNAL_PARAMETER_ENTITIES =
            "http://xml.org/sax/features/external-parameter-entities";

    /** Label used as both category and document type of general-page entries. */
    private static final String GENERAL = "General";

    /** Document type assigned to per-example entries. */
    private static final String EXAMPLE_TYPE = "Example";

    /** String literal for Property document type. */
    private static final String PROPERTY_TYPE = "Property";

    /** String literal for subsection element. */
    private static final String SUBSECTION = "subsection";

    /** Attribute holding the display name of a {@code section} element. */
    private static final String NAME_ATTR = "name";

    /** Attribute holding the anchor id of an example paragraph. */
    private static final String ID_ATTR = "id";

    /** File name that {@code processDirectory} leaves out of the index. */
    private static final String INDEX_XML = "index.xml";

    /**
     * Section name that is not indexed on general pages; compared
     * case-insensitively.
     */
    private static final String CONTENT = "Content";

    /**
     * Prefix handed to {@code findSubsectionByPrefix} to locate the Examples
     * subsection. Also listed in {@link #GENERIC_SECTION_NAMES}.
     */
    private static final String EXAMPLES_SUBSECTION = "examples";

    /** String literal for body element. */
    private static final String BODY = "body";

    /** String literal for section element. */
    private static final String SECTION = "section";

    /** String literal for title element. */
    private static final String TITLE = "title";

    /**
     * String literal for description element. Also listed in
     * {@link #GENERIC_SECTION_NAMES}.
     */
    private static final String DESCRIPTION = "description";

    /** Separator between a page URL and the anchor within that page. */
    private static final String ANCHOR_SEPARATOR = "#";

    /**
     * Prefix handed to {@code findSubsectionByPrefix} to locate the Properties
     * subsection.
     */
    private static final String PROPERTIES_FRAGMENT = "propert";

    /**
     * Log message for skipping files: {@code {0}} is the file name and
     * {@code {1}} the exception message.
     */
    private static final String SKIPPING_MSG = "[SearchIndex] WARN: skipping {0} - {1}";

    /**
     * Suffix label appended to example titles for configuration snippets.
     * Yields e.g. "AnnotationLocation: Example1 [config]".
     */
    private static final String EXAMPLE_LABEL_CONFIG = " [config]";

    /**
     * Suffix label appended to example titles for Java code examples.
     * Yields e.g. "AnnotationLocation: Example1 [code]".
     */
    private static final String EXAMPLE_LABEL_CODE = " [code]";
186
    /**
     * Minimum word length for a keyword.
     * NOTE(review): whether the bound is inclusive is decided where it is
     * applied, outside this part of the file - confirm there.
     */
    private static final int MIN_WORD_LENGTH = 2;

    /** Maximum number of keywords. */
    private static final int MAX_KEYWORDS = 15;

    /** Maximum description length; handed to {@code truncate} for example descriptions. */
    private static final int MAX_DESCRIPTION_LENGTH = 150;

    /** One or more whitespace characters; used to collapse text content to single spaces. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /**
     * One or more characters other than lower-case ASCII letters and digits.
     * NOTE(review): upper-case letters count as non-alphanumeric here, so the
     * input is presumably lower-cased first - verify at the call site.
     */
    private static final Pattern NON_ALPHANUMERIC = Pattern.compile("[^a-z0-9]+");

    /**
     * Matches only plain {@code .xml} files (not {@code .xml.vm} or
     * {@code .xml.template}).  Used when scanning check/filter/filefilter
     * directories to avoid processing pre-render source templates and
     * producing duplicate index entries.
     *
     * <p>Applied with {@code Matcher.find()} against the bare file name, so
     * only the end of the name is inspected.</p>
     */
    private static final Pattern PLAIN_XML = Pattern.compile("\\.xml$");

    /**
     * Matches {@code .xml}, {@code .xml.vm} and {@code .xml.template}.
     * Used only for URL building (stripping the extension to produce a
     * {@code .html} path) and for the general-pages scanner where we
     * want to exclude templates by name rather than by extension.
     */
    private static final Pattern DOC_EXTENSION =
            Pattern.compile("\\.xml$|\\.xml\\.vm$|\\.xml\\.template$");

    /**
     * Matches {@code config_<category>.xml} files that redirect to check category pages.
     * Captures the category name (e.g. "metrics" from "config_metrics.xml") in group 1.
     */
    private static final Pattern CONFIG_CATEGORY =
          Pattern.compile("^config_(.+)\\.xml$");

    /**
     * Matches an example paragraph {@code id} attribute that has a suffix of
     * either {@code -config} or {@code -code}, capturing the base label
     * (e.g. "Example1") in group 1 and the type ("config" or "code") in
     * group 2.
     *
     * <p>The whole id has to match; ids with any other shape produce no
     * example entry.</p>
     *
     * <p>Example ids found in XDoc source:</p>
     * <ul>
     *   <li>{@code id="Example1-config"} -&gt; label "Example1", type "config"</li>
     *   <li>{@code id="Example1-code"}   -&gt; label "Example1", type "code"</li>
     * </ul>
     */
    private static final Pattern EXAMPLE_PARAGRAPH_ID =
            Pattern.compile("^(Example\\d+)-(config|code)$");
    /**
     * Generic section/subsection names that are structurally repeated across
     * many unrelated general pages (IDE setup guides, writing-* guides, etc).
     * On their own they are meaningless in search results ("Debug" appears
     * identically in eclipse.xml, idea.xml, and netbeans.xml) so when one of
     * these is used as a section title it is always disambiguated with the
     * source page's own title, e.g. "Eclipse IDE: Debug".
     *
     * <p>All members are lower-case.
     * NOTE(review): lookups presumably lower-case the section name first -
     * verify in {@code disambiguateTitle}.</p>
     */
    private static final Set<String> GENERIC_SECTION_NAMES = new HashSet<>(Arrays.asList(
            "overview", DESCRIPTION, EXAMPLES_SUBSECTION, "example", "debug",
            "contributing", "limitations", "parameters", "installation"
    ));

    /**
     * Category mapping: XDoc subdirectory name to display label.
     * Keys are lower-case; {@code processChecksDirectory} lower-cases the
     * directory name before the lookup and falls back to the capitalised
     * directory name for directories missing from this map.
     */
    private static final Map<String, String> CATEGORY_MAP = new LinkedHashMap<>();

    // Populated once at class-initialisation time; insertion order is preserved.
    static {
        CATEGORY_MAP.put("annotation", "Annotation");
        CATEGORY_MAP.put("blocks", "Block Checks");
        CATEGORY_MAP.put("coding", "Coding");
        CATEGORY_MAP.put("design", "Class Design");
        CATEGORY_MAP.put("header", "Headers");
        CATEGORY_MAP.put("imports", "Imports");
        CATEGORY_MAP.put("javadoc", "Javadoc Comments");
        CATEGORY_MAP.put("metrics", "Metrics");
        CATEGORY_MAP.put("misc", "Miscellaneous");
        CATEGORY_MAP.put("modifier", "Modifiers");
        CATEGORY_MAP.put("naming", "Naming Conventions");
        CATEGORY_MAP.put("regexp", "Regexp");
        CATEGORY_MAP.put("sizes", "Size Violations");
        CATEGORY_MAP.put("whitespace", "Whitespace");
    }

    /**
     * Stop words: too generic to be useful as search keywords.
     * Besides common English function words this covers the project's own
     * vocabulary ("check", "checks", "checkstyle"). All members are lower-case.
     */
    private static final Set<String> STOP_WORDS = new HashSet<>(Arrays.asList(
            "a", "an", "the", "and", "or", "of", "to", "in", "is", "it",
            "that", "this", "for", "on", "with", "are", "be", "by", "at",
            "as", "if", "its", "from", "which", "whether", "can", "will",
            "has", "have", "not", "also", "only", "any", "all", "each",
            "more", "than", "when", "then", "into", "such", "use", "used",
            "check", CHECKS, "checkstyle"
    ));
283
    /** Logger for this class; an instance field, hence the work runs in {@code execute}. */
    private final Logger logger = Logger.getLogger(getClass().getName());

    /** Accumulated search index entries; assigned a fresh list at the start of a run. */
    private List<SearchIndexEntry> entries;

    /** Deduplication guard for URLs; assigned a fresh set at the start of a run. */
    private Set<String> seenUrls;

    /** Private constructor: the only visible instantiation is in {@code main}. */
    private SearchIndexGenerator() {
    }
296
297    /**
298     * Main entry point called by exec-maven-plugin.
299     *
300     * @param args args[0] = path to src/xdocs, args[1] = path to target/site
301     * @throws IOException on file write failure
302     * @throws IllegalArgumentException if args are missing
303     * @throws IllegalStateException if xdocsDir is missing
304     * @noinspectionreason UseOfSystemOutOrSystemErr - main method of a CLI utility
305     */
306    public static void main(String... args) throws IOException {
307        new SearchIndexGenerator().execute(args);
308    }
309
310    /**
311     * Internal execution method to avoid static context for the logger.
312     *
313     * @param args args[0] = path to src/xdocs, args[1] = output file path
314     * @throws IOException on file write failure
315     * @throws IllegalArgumentException if args are missing
316     * @throws IllegalStateException if xdocsDir is missing
317     */
318    private void execute(String... args) throws IOException {
319        if (args.length < 2) {
320            throw new IllegalArgumentException(
321                    "Usage: SearchIndexGenerator <xdocsDir> <outputFilePath>");
322        }
323
324        final Path xdocsPath = Path.of(args[0]);
325        final Path outputFilePath = Path.of(args[1]);
326        final File xdocsDir = xdocsPath.toFile();
327
328        if (!Files.exists(xdocsPath)) {
329            final String error = "[SearchIndex] ERROR: xdocsDir not found: "
330                    + xdocsPath.toAbsolutePath();
331            throw new IllegalStateException(error);
332        }
333
334        if (logger.isLoggable(Level.INFO)) {
335            logger.log(Level.INFO, "[SearchIndex] Reading XDocs from: {0}", xdocsPath);
336        }
337
338        seenUrls = new LinkedHashSet<>();
339        entries = new ArrayList<>();
340
341        final Path checksPath = xdocsPath.resolve(CHECKS);
342        if (Files.exists(checksPath)) {
343            processChecksDirectory(checksPath.toFile(), xdocsDir);
344        }
345
346        final Path filtersPath = xdocsPath.resolve("filters");
347        if (Files.exists(filtersPath)) {
348            processDirectory(filtersPath.toFile(), xdocsDir,
349                    "Filters", "Filter");
350        }
351
352        final Path fileFiltersPath = xdocsPath.resolve("filefilters");
353        if (Files.exists(fileFiltersPath)) {
354            processDirectory(fileFiltersPath.toFile(), xdocsDir,
355                    "File Filters", "File Filter");
356        }
357
358        processGeneralPages(xdocsDir);
359        writeJson(entries, outputFilePath);
360
361        if (logger.isLoggable(Level.INFO)) {
362            logger.log(Level.INFO, "[SearchIndex] Done - {0} entries indexed.",
363                    entries.size());
364        }
365    }
366
367    /**
368     * Walks {@code src/xdocs/checks/} and processes each category subdirectory.
369     *
370     * @param checksDir the checks root directory
371     * @param xdocsDir  the xdocs root (used for URL building)
372     */
373    private void processChecksDirectory(File checksDir, File xdocsDir) {
374
375        final File[] categoryDirs = checksDir.listFiles(File::isDirectory);
376        if (categoryDirs != null) {
377            for (File categoryDir : categoryDirs) {
378                final String dirName = categoryDir.getName().toLowerCase(Locale.ROOT);
379                final String category = CATEGORY_MAP.getOrDefault(dirName,
380                        capitalise(dirName));
381                processDirectory(categoryDir, xdocsDir, category, "Check");
382            }
383        }
384    }
385
386    /**
387     * Processes all <b>plain</b> {@code .xml} files in a directory
388     * (non-recursive). {@code index.xml} files and any file whose name ends
389     * with {@code .xml.template} or {@code .xml.vm} are skipped.
390     *
391     * <p>Skipping templates is critical: every check page has a sibling
392     * {@code *.xml.template} file that resolves to the <em>same</em> HTML
393     * URL. Without this filter both files would be processed, producing two
394     * identical (or near-identical) main entries plus doubled example and
395     * property entries for every check.</p>
396     *
397     * <p>For each plain {@code .xml} file, the main check/filter entry,
398     * per-example entries (both config and code), and per-property entries
399     * are added.</p>
400     *
401     * @param dir      directory to scan
402     * @param xdocsDir xdocs root (used for URL building)
403     * @param category category label for all entries in this directory
404     * @param type     document type ("Check", "Filter", "File Filter")
405     */
406    private void processDirectory(File dir, File xdocsDir,
407                                  String category, String type) {
408
409        final File[] xmlFiles = dir.listFiles(file -> {
410            return file.isFile()
411                    && PLAIN_XML.matcher(file.getName()).find()
412                    && !INDEX_XML.equals(file.getName());
413        });
414
415        if (xmlFiles != null) {
416            Arrays.sort(xmlFiles);
417            for (File xmlFile : xmlFiles) {
418                try {
419                    final Document doc = parseXml(xmlFile);
420                    final String baseUrl = buildUrl(xmlFile, xdocsDir);
421
422                    addIfNew(buildMainEntry(doc, xmlFile, category, type, baseUrl));
423
424                    for (SearchIndexEntry entry
425                            : extractExampleEntries(doc, baseUrl, category)) {
426                        addIfNew(entry);
427                    }
428                    for (SearchIndexEntry entry
429                            : extractPropertyEntries(doc, baseUrl, category)) {
430                        addIfNew(entry);
431                    }
432                }
433                catch (IOException | SAXException | ParserConfigurationException exception) {
434                    if (logger.isLoggable(Level.WARNING)) {
435                        logger.log(Level.WARNING, SKIPPING_MSG,
436                                new Object[] {xmlFile.getName(), exception.getMessage()});
437                    }
438                }
439            }
440        }
441    }
442
443    /**
444     * Adds entries for the top-level general documentation pages.
445     *
446     * <p>Each remaining page is indexed per top-level {@code <section>},
447     * using the section's full text content for keyword extraction so
448     * page-internal headings are fully discoverable. Generic structural
449     * section names (see {@link #GENERIC_SECTION_NAMES}) are disambiguated
450     * by prefixing the page's own title.</p>
451     *
452     * @param xdocsDir the xdocs root directory
453     */
454    private void processGeneralPages(File xdocsDir) {
455
456        final File[] xmlFiles = xdocsDir.listFiles(file -> {
457            final String name = file.getName();
458            return file.isFile()
459                    && PLAIN_XML.matcher(name).find();
460        });
461
462        if (xmlFiles != null) {
463            Arrays.sort(xmlFiles);
464            for (File xmlFile : xmlFiles) {
465                try {
466                    for (SearchIndexEntry entry : buildGeneralPageEntries(xmlFile)) {
467                        addIfNew(entry);
468                    }
469                }
470                catch (IOException | SAXException | ParserConfigurationException exception) {
471                    if (logger.isLoggable(Level.WARNING)) {
472                        logger.log(Level.WARNING, SKIPPING_MSG,
473                                new Object[] {xmlFile.getName(), exception.getMessage()});
474                    }
475                }
476            }
477        }
478    }
479
480    /**
481     * Builds the main search entry representing an entire check/filter document.
482     *
483     * @param doc      the parsed XDoc document
484     * @param xmlFile  the source file
485     * @param category category label for this file's entry
486     * @param type     document type ("Check", "Filter", etc.)
487     * @param baseUrl  the page url without anchor
488     * @return an entry representing the document
489     */
490    private static SearchIndexEntry buildMainEntry(Document doc, File xmlFile,
491                                                   String category, String type,
492                                                   String baseUrl) {
493        final NodeList bodies = doc.getElementsByTagName(BODY);
494        final Element body = (Element) bodies.item(0);
495        final NodeList sections = body.getElementsByTagName(SECTION);
496
497        final String title = extractTitle(doc, xmlFile, sections);
498        final String description = extractAggregateDescription(sections);
499        final String keywords = extractAggregateKeywords(title, sections);
500
501        return new SearchIndexEntry(title, baseUrl, category, type, description, keywords);
502    }
503
504    /**
505     * Builds one search entry per top-level {@code <section>} in a general
506     * documentation page, using each section's full text for keyword
507     * extraction so that page-internal content is fully discoverable.
508     *
509     * <p>Generic structural section names (see {@link #GENERIC_SECTION_NAMES})
510     * are disambiguated as {@code "<page title>: <section name>"} to avoid
511     * collisions across pages (e.g. "Eclipse IDE: Debug" vs
512     * "IntelliJ IDE: Debug").</p>
513     *
514     * @param xmlFile the XDoc source file to parse
515     * @return list of entries, one per top-level section found
516     * @throws ParserConfigurationException on XML parser setup failure
517     * @throws SAXException on XML parse error
518     * @throws IOException on file read failure
519     */
520    private static List<SearchIndexEntry> buildGeneralPageEntries(File xmlFile)
521            throws ParserConfigurationException, SAXException, IOException {
522
523        final List<SearchIndexEntry> results = new ArrayList<>();
524        final Document doc = parseXml(xmlFile);
525
526        final NodeList bodies = doc.getElementsByTagName(BODY);
527        if (bodies.getLength() != 0) {
528            final Element body = (Element) bodies.item(0);
529            final NodeList sections = body.getElementsByTagName(SECTION);
530            final String pageUrl = resolvePageUrl(xmlFile, xmlFile.getParentFile());
531            final String pageTitle = derivePageTitle(doc, xmlFile);
532
533            if (sections.getLength() == 0) {
534                final String fullText = WHITESPACE.matcher(body.getTextContent())
535                        .replaceAll(SPACE).trim();
536                final String description = extractFirstSentenceOrTruncated(fullText);
537                final String keywords = extractKeywordsFromText(
538                        pageTitle + SPACE + fullText);
539                results.add(new SearchIndexEntry(
540                        pageTitle, pageUrl, GENERAL, GENERAL, description, keywords));
541            }
542            else {
543                for (int index = 0; index < sections.getLength(); index++) {
544                    final Element section = (Element) sections.item(index);
545                    if (body.equals(section.getParentNode())) {
546                        final String sectionName = section.getAttribute(NAME_ATTR).trim();
547                        if (!sectionName.isEmpty() && !CONTENT.equalsIgnoreCase(sectionName)) {
548
549                            final String entryTitle = disambiguateTitle(sectionName, pageTitle);
550                            final String anchor = doxiaAnchorFor(sectionName);
551                            final String url = pageUrl + ANCHOR_SEPARATOR + anchor;
552
553                            final String sectionText = WHITESPACE.matcher(section.getTextContent())
554                                    .replaceAll(SPACE).trim();
555                            final String description = extractFirstSentenceOrTruncated(sectionText);
556                            final String keywords = extractKeywordsFromText(
557                                    pageTitle + SPACE + sectionName + SPACE + sectionText);
558
559                            results.add(new SearchIndexEntry(
560                                    entryTitle, url, GENERAL, GENERAL, description, keywords));
561                        }
562                    }
563                }
564            }
565        }
566
567        return results;
568    }
569
570    /**
571     * Extracts per-example search entries from a check/filter document.
572     *
573     * <p>Both {@code -config} and {@code -code} example paragraphs are
574     * indexed so users can find both the configuration snippet and the
575     * corresponding Java code example independently in search results.</p>
576     *
577     * <p>Titles use the pattern {@code "<CheckName>: Example1 [config]"} and
578     * {@code "<CheckName>: Example1 [code]"} to make the type immediately
579     * visible in search result listings without needing to open the page.</p>
580     *
581     * <p>Confirmed XDoc template structure for the Examples subsection:</p>
582     * <pre>
583     *   &lt;p id="Example1-config"&gt;To configure the check...&lt;/p&gt;
584     *   &lt;macro name="example"&gt;&lt;param name="type" value="config"/&gt;&lt;/macro&gt;
585     *   &lt;p id="Example1-code"&gt;Example:&lt;/p&gt;
586     *   &lt;macro name="example"&gt;&lt;param name="type" value="code"/&gt;&lt;/macro&gt;
587     * </pre>
588     *
589     * @param doc      the parsed XDoc document
590     * @param baseUrl  the page url without anchor
591     * @param category category label
592     * @return list of per-example entries (both config and code); empty if
593     *         none found
594     */
595    private static List<SearchIndexEntry> extractExampleEntries(Document doc,
596                                                                String baseUrl,
597                                                                String category) {
598
599        final List<SearchIndexEntry> exampleEntries = new ArrayList<>();
600        final NodeList bodies = doc.getElementsByTagName(BODY);
601        if (bodies.getLength() != 0) {
602            final Element body = (Element) bodies.item(0);
603            final NodeList sections = body.getElementsByTagName(SECTION);
604
605            for (int sectionIdx = 0; sectionIdx < sections.getLength(); sectionIdx++) {
606                final Element section = (Element) sections.item(sectionIdx);
607                final String checkName = section.getAttribute(NAME_ATTR).trim();
608                final Element examplesSubsection =
609                        findSubsectionByPrefix(section, EXAMPLES_SUBSECTION);
610
611                if (examplesSubsection == null) {
612                    continue;
613                }
614
615                final NodeList paragraphs = examplesSubsection.getElementsByTagName("p");
616
617                for (int paragraphIndex = 0; paragraphIndex < paragraphs.getLength();
618                        paragraphIndex++) {
619                    final Element paragraph = (Element) paragraphs.item(paragraphIndex);
620                    final SearchIndexEntry entry = buildExampleEntry(
621                            paragraph, checkName, baseUrl, category);
622                    if (entry != null) {
623                        exampleEntries.add(entry);
624                    }
625                }
626            }
627        }
628
629        return exampleEntries;
630    }
631
632    /**
633     * Builds a single example entry from a paragraph element.
634     *
635     * @param paragraph the paragraph element containing the example
636     * @param checkName the name of the check
637     * @param baseUrl the base URL for the page
638     * @param category the category label
639     * @return a SearchIndexEntry if the paragraph matches the example pattern,
640     *         null otherwise
641     */
642    private static SearchIndexEntry buildExampleEntry(Element paragraph,
643                                                       String checkName,
644                                                       String baseUrl,
645                                                       String category) {
646        final String id = paragraph.getAttribute(ID_ATTR);
647        final Matcher matcher = EXAMPLE_PARAGRAPH_ID.matcher(id);
648        SearchIndexEntry result = null;
649
650        if (matcher.matches()) {
651            final String exampleLabel = matcher.group(1);
652            final String exampleType = matcher.group(2);
653
654            final String labelSuffix;
655            if ("config".equals(exampleType)) {
656                labelSuffix = EXAMPLE_LABEL_CONFIG;
657            }
658            else {
659                labelSuffix = EXAMPLE_LABEL_CODE;
660            }
661
662            final String introText = WHITESPACE
663                    .matcher(paragraph.getTextContent())
664                    .replaceAll(SPACE).trim();
665
666            final String title = checkName + TITLE_SEPARATOR
667                    + exampleLabel + labelSuffix;
668            final String url = baseUrl + ANCHOR_SEPARATOR + id;
669            final String description =
670                    truncate(introText, MAX_DESCRIPTION_LENGTH);
671            final String keywords = extractKeywordsFromText(
672                    checkName + SPACE + exampleLabel
673                            + SPACE + exampleType + SPACE + introText);
674
675            result = new SearchIndexEntry(
676                    title, url, category, EXAMPLE_TYPE,
677                    description, keywords);
678        }
679
680        return result;
681    }
682
683    /**
684     * Extracts per-property search entries from a check/filter document.
685     *
686     * <p>Each row of the Properties table is indexed under the title
687     * {@code "<CheckName>: <propertyName>"} and linked to the property's
688     * own anchor on the page.</p>
689     *
690     * @param doc      the parsed XDoc document
691     * @param baseUrl  the page url without anchor
692     * @param category category label
693     * @return list of per-property entries; empty if none found
694     */
695    private static List<SearchIndexEntry> extractPropertyEntries(Document doc,
696                                                                 String baseUrl,
697                                                                 String category) {
698
699        final List<SearchIndexEntry> propertyEntries = new ArrayList<>();
700        final NodeList bodies = doc.getElementsByTagName(BODY);
701        if (bodies.getLength() != 0) {
702            final Element body = (Element) bodies.item(0);
703            final NodeList sections = body.getElementsByTagName(SECTION);
704
705            for (int sectionIdx = 0; sectionIdx < sections.getLength(); sectionIdx++) {
706                final Element section = (Element) sections.item(sectionIdx);
707                final Element propertiesSubsection =
708                        findSubsectionByPrefix(section, PROPERTIES_FRAGMENT);
709
710                if (propertiesSubsection != null) {
711                    final String checkName = section.getAttribute(NAME_ATTR).trim();
712                    extractPropertiesFromRows(propertiesSubsection, checkName, baseUrl,
713                            category, propertyEntries);
714                }
715            }
716        }
717
718        return propertyEntries;
719    }
720
721    /**
722     * Extracts property entries from table rows and adds them to the list.
723     *
724     * @param propertiesSubsection the properties subsection element
725     * @param checkName the check name
726     * @param baseUrl the page url without anchor
727     * @param category category label
728     * @param propertyEntries the list to add entries to
729     */
730    private static void extractPropertiesFromRows(Element propertiesSubsection,
731                                                  String checkName,
732                                                  String baseUrl,
733                                                  String category,
734                                                  List<SearchIndexEntry> propertyEntries) {
735        final NodeList rows = propertiesSubsection.getElementsByTagName("tr");
736
737        for (int rowIdx = 1; rowIdx < rows.getLength(); rowIdx++) {
738            final Element row = (Element) rows.item(rowIdx);
739            final NodeList cells = row.getElementsByTagName("td");
740            if (cells.getLength() >= 2) {
741                processPropertyRow(cells, checkName, baseUrl, category, propertyEntries);
742            }
743        }
744    }
745
746    /**
747     * Processes a single property row and adds an entry if valid.
748     *
749     * @param cells the table cells
750     * @param checkName the check name
751     * @param baseUrl the page url without anchor
752     * @param category category label
753     * @param propertyEntries the list to add entries to
754     */
755    private static void processPropertyRow(NodeList cells,
756                                           String checkName,
757                                           String baseUrl,
758                                           String category,
759                                           List<SearchIndexEntry> propertyEntries) {
760        final String propName = WHITESPACE
761                .matcher(cells.item(0).getTextContent())
762                .replaceAll(SPACE).trim();
763
764        if (!propName.isEmpty()) {
765            final String propDesc = WHITESPACE
766                    .matcher(cells.item(1).getTextContent())
767                    .replaceAll(SPACE).trim();
768
769            final String title = checkName + TITLE_SEPARATOR + propName;
770            final String url = baseUrl + ANCHOR_SEPARATOR + propName;
771            final String description = truncate(propDesc, MAX_DESCRIPTION_LENGTH);
772            final String keywords = extractKeywordsFromText(
773                    checkName + SPACE + propName + SPACE + propDesc);
774
775            propertyEntries.add(new SearchIndexEntry(
776                    title, url, category, PROPERTY_TYPE,
777                    description, keywords));
778        }
779    }
780
781    /**
782     * Adds an entry to the output list only if its URL has not been seen
783     * before. This is a secondary guard that catches any duplicates that
784     * slip through the primary filter (only processing plain {@code .xml}
785     * files), e.g. if a check has the same example paragraph id repeated
786     * across two sections.
787     *
788     * @param entry the entry to conditionally add
789     */
790    private void addIfNew(SearchIndexEntry entry) {
791        if (seenUrls.add(entry.url())) {
792            entries.add(entry);
793        }
794    }
795
796    /**
797     * Finds a subsection within a section whose lowercased name contains the
798     * given fragment (e.g. "examples" or "propert" to match "Properties").
799     *
800     * @param section  the section to search
801     * @param fragment lowercase fragment to match against the subsection name
802     * @return the matching subsection element, or {@code null} if not found
803     */
804    private static Element findSubsectionByPrefix(Element section, String fragment) {
805        final NodeList subsections = section.getElementsByTagName(SUBSECTION);
806        Element result = null;
807        for (int index = 0; index < subsections.getLength(); index++) {
808            final Element sub = (Element) subsections.item(index);
809            if (sub.getAttribute(NAME_ATTR).trim()
810                    .toLowerCase(Locale.ROOT).contains(fragment)) {
811                result = sub;
812                break;
813            }
814        }
815        return result;
816    }
817
818    /**
819     * Parses the XML file into a Document with external entity resolution
820     * disabled for security.
821     *
822     * @param xmlFile the XDoc source file
823     * @return the parsed Document
824     * @throws ParserConfigurationException on XML parser setup failure
825     * @throws SAXException on XML parse error
826     * @throws IOException on file read failure
827     */
828    private static Document parseXml(File xmlFile)
829            throws ParserConfigurationException, SAXException, IOException {
830        final DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
831        factory.setFeature(EXTERNAL_GENERAL_ENTITIES, false);
832        factory.setFeature(EXTERNAL_PARAMETER_ENTITIES, false);
833
834        final DocumentBuilder builder = factory.newDocumentBuilder();
835        builder.setErrorHandler(null);
836
837        final Document doc = builder.parse(xmlFile);
838        doc.getDocumentElement().normalize();
839        return doc;
840    }
841
842    /**
843     * Extracts the document title from the {@code <title>} element, falling
844     * back to the first non-empty, non-"Content" section name, and finally
845     * to a capitalised version of the file name.
846     *
847     * @param doc      the document
848     * @param xmlFile  the source file
849     * @param sections the list of sections
850     * @return the title string, never empty
851     */
852    private static String extractTitle(Document doc, File xmlFile, NodeList sections) {
853        final NodeList titles = doc.getElementsByTagName(TITLE);
854        String title = "";
855        if (titles.getLength() > 0) {
856            title = titles.item(0).getTextContent().trim();
857        }
858
859        if ((title.isEmpty() || CONTENT.equalsIgnoreCase(title))
860                && sections.getLength() > 0) {
861            final String firstSection =
862                    ((Element) sections.item(0)).getAttribute(NAME_ATTR).trim();
863            if (!firstSection.isEmpty() && !CONTENT.equalsIgnoreCase(firstSection)) {
864                title = firstSection;
865            }
866        }
867
868        if (title.isEmpty() || CONTENT.equalsIgnoreCase(title)) {
869            final String name =
870                    xmlFile.getName().replaceFirst(DOC_EXTENSION.pattern(), "");
871            title = capitalise(name.replace('_', ' '));
872        }
873        return title;
874    }
875
876    /**
877     * Aggregates description from sections, taking the first non-empty
878     * Description subsection found across all sections in the document.
879     *
880     * @param sections list of sections
881     * @return description string, possibly empty
882     */
883    private static String extractAggregateDescription(NodeList sections) {
884        String description = "";
885        for (int index = 0; index < sections.getLength(); index++) {
886            description = extractDescription((Element) sections.item(index));
887            if (!description.isEmpty()) {
888                break;
889            }
890        }
891        return description;
892    }
893
894    /**
895     * Aggregates keywords from sections using all section text so that the
896     * main check entry is discoverable by any term in the document.
897     *
898     * @param title    the document title
899     * @param sections list of sections
900     * @return keywords string
901     */
902    private static String extractAggregateKeywords(String title, NodeList sections) {
903        final StringBuilder keywordSource = new StringBuilder(title);
904        for (int index = 0; index < sections.getLength(); index++) {
905            final Element section = (Element) sections.item(index);
906            keywordSource.append(SPACE_CHAR)
907                .append(section.getAttribute(NAME_ATTR))
908                .append(SPACE_CHAR)
909                .append(section.getTextContent());
910        }
911        return extractKeywordsFromText(keywordSource.toString());
912    }
913
914    /**
915     * Extracts the first sentence of the Description subsection.
916     * Returns an empty string if no Description subsection is found.
917     *
918     * @param section the {@code <section>} element to search
919     * @return first sentence of the description, or empty string
920     */
921    private static String extractDescription(Element section) {
922        final Element sub = findSubsectionByPrefix(section, DESCRIPTION);
923        String result = "";
924        if (sub != null) {
925            final String text = WHITESPACE.matcher(sub.getTextContent())
926                    .replaceAll(SPACE).trim();
927            result = extractFirstSentenceOrTruncated(text);
928        }
929        return result;
930    }
931
932    /**
933     * Derives a fallback page title from the document's {@code <title>}
934     * element or, failing that, from the filename.
935     *
936     * @param doc     the parsed document
937     * @param xmlFile the source file
938     * @return a non-empty title string
939     */
940    private static String derivePageTitle(Document doc, File xmlFile) {
941        final NodeList titles = doc.getElementsByTagName(TITLE);
942        String title = "";
943        if (titles.getLength() > 0) {
944            title = titles.item(0).getTextContent().trim();
945        }
946        if (title.isEmpty()) {
947            final String name =
948                    xmlFile.getName().replaceFirst(DOC_EXTENSION.pattern(), "");
949            title = capitalise(name.replace('_', ' '));
950        }
951        return title;
952    }
953
954    /**
955     * Disambiguates a section title when it is a generic, structurally
956     * repeated header (see {@link #GENERIC_SECTION_NAMES}).
957     * Non-generic section names are returned unchanged.
958     *
959     * @param sectionName the raw section name
960     * @param pageTitle   the owning page's own title
961     * @return either {@code sectionName} unchanged, or
962     *         {@code "<pageTitle>: <sectionName>"} if generic
963     */
964    private static String disambiguateTitle(String sectionName, String pageTitle) {
965        final String result;
966        if (GENERIC_SECTION_NAMES.contains(sectionName.toLowerCase(Locale.ROOT))) {
967            result = pageTitle + TITLE_SEPARATOR + sectionName;
968        }
969        else {
970            result = sectionName;
971        }
972        return result;
973    }
974
975    /**
976     * Converts a Doxia {@code <section name="...">} value into the anchor id
977     * Doxia generates for it in the rendered HTML by replacing runs of
978     * whitespace with single underscores.
979     *
980     * @param sectionName the raw {@code name} attribute value
981     * @return the anchor id Doxia would render for this section name
982     */
983    private static String doxiaAnchorFor(String sectionName) {
984        return WHITESPACE.matcher(sectionName.trim()).replaceAll("_");
985    }
986
987    /**
988     * Returns the first sentence of the given text (up to and including the
989     * first period), or the text truncated to {@link #MAX_DESCRIPTION_LENGTH}
990     * with an ellipsis if no period is found within range.
991     *
992     * @param text the source text, already whitespace-normalised
993     * @return first sentence or truncated text
994     */
995    private static String extractFirstSentenceOrTruncated(String text) {
996        final String result;
997        final int dot = text.indexOf('.');
998        if (dot > 0) {
999            result = text.substring(0, dot + 1).trim();
1000        }
1001        else {
1002            result = truncate(text, MAX_DESCRIPTION_LENGTH);
1003        }
1004        return result;
1005    }
1006
1007    /**
1008     * Truncates text to the given max length, appending an ellipsis if
1009     * truncation occurred.
1010     *
1011     * @param text      the text to truncate
1012     * @param maxLength maximum length before truncation
1013     * @return original text if short enough, otherwise truncated with ellipsis
1014     */
1015    private static String truncate(String text, int maxLength) {
1016        final String result;
1017        if (text.length() > maxLength) {
1018            result = text.substring(0, maxLength) + ELLIPSIS;
1019        }
1020        else {
1021            result = text;
1022        }
1023        return result;
1024    }
1025
1026    /**
1027     * Builds the root-relative URL for an XDoc file, without any anchor.
1028     * Always uses forward slashes regardless of OS.
1029     *
1030     * @param xmlFile  the source XDoc file
1031     * @param xdocsDir the xdocs root directory
1032     * @return root-relative URL string with no anchor
1033     */
1034    private static String buildUrl(File xmlFile, File xdocsDir) {
1035        return xdocsDir.toPath()
1036                .relativize(xmlFile.toPath())
1037                .toString()
1038                .replace(File.separatorChar, '/')
1039                .replaceFirst(DOC_EXTENSION.pattern(), ".html");
1040    }
1041
1042    /**
1043     * Resolves the correct URL for a general page file. For {@code config_<category>.xml} files
1044     * that redirect to check category pages, maps to {@code checks/<category>/index.html} instead
1045     * of the file path.
1046     *
1047     * @param xmlFile  the source XDoc file
1048     * @param xdocsDir the xdocs root directory
1049     * @return the resolved URL
1050     */
1051    private static String resolvePageUrl(File xmlFile, File xdocsDir) {
1052        String url = buildUrl(xmlFile, xdocsDir);
1053        final Matcher matcher = CONFIG_CATEGORY.matcher(xmlFile.getName());
1054        if (matcher.find()) {
1055            final String category = matcher.group(1);
1056            if (CATEGORY_MAP.containsKey(category)) {
1057                url = "checks/" + category + "/index.html";
1058            }
1059        }
1060        return url;
1061    }
1062
1063    /**
1064     * Extracts keywords from free-form text by splitting on non-word
1065     * characters and filtering short and stop words.
1066     *
1067     * @param text input text
1068     * @return comma-separated keyword string (up to {@link #MAX_KEYWORDS} words)
1069     */
1070    private static String extractKeywordsFromText(String text) {
1071        String result = "";
1072        if (text != null && !text.isEmpty()) {
1073            result = NON_ALPHANUMERIC.splitAsStream(text.toLowerCase(Locale.ROOT))
1074                    .filter(word -> {
1075                        return word.length() >= MIN_WORD_LENGTH
1076                                && !STOP_WORDS.contains(word);
1077                    })
1078                    .distinct()
1079                    .limit(MAX_KEYWORDS)
1080                    .collect(Collectors.joining(COMMA_STR));
1081        }
1082        return result;
1083    }
1084
1085    /**
1086     * Writes all index entries to the output file.
1087     *
1088     * @param indexEntries the list of entries to serialise
1089     * @param outputFilePath the full path to the output file
1090     * @throws IOException on file write failure
1091     */
1092    private void writeJson(List<SearchIndexEntry> indexEntries, Path outputFilePath)
1093            throws IOException {
1094
1095        final Path outputPath = outputFilePath.getParent();
1096        if (outputPath != null) {
1097            Files.createDirectories(outputPath);
1098        }
1099
1100        try (PrintWriter writer = new PrintWriter(Files.newBufferedWriter(
1101                outputFilePath, StandardCharsets.UTF_8))) {
1102            writer.println("[");
1103
1104            final int size = indexEntries.size();
1105            for (int index = 0; index < size; index++) {
1106                final String comma;
1107                if (index < size - 1) {
1108                    comma = COMMA_STR;
1109                }
1110                else {
1111                    comma = "";
1112                }
1113                writer.println("  " + indexEntries.get(index).toJson() + comma);
1114            }
1115            writer.println("]");
1116        }
1117
1118        if (logger.isLoggable(Level.INFO)) {
1119            logger.log(Level.INFO,
1120                    "[SearchIndex] Written: {0}", outputFilePath.toAbsolutePath());
1121        }
1122    }
1123
1124    /**
1125     * Capitalises the first character of a string.
1126     *
1127     * @param input the string to capitalise
1128     * @return string with first character uppercased, or input unchanged if
1129     *         empty
1130     */
1131    private static String capitalise(String input) {
1132        String result = input;
1133        if (input != null && !input.isEmpty()) {
1134            result = Character.toUpperCase(input.charAt(0)) + input.substring(1);
1135        }
1136        return result;
1137    }
1138}