001/////////////////////////////////////////////////////////////////////////////////////////////// 002// checkstyle: Checks Java source code and other text files for adherence to a set of rules. 003// Copyright (C) 2001-2026 the original author or authors. 004// 005// This library is free software; you can redistribute it and/or 006// modify it under the terms of the GNU Lesser General Public 007// License as published by the Free Software Foundation; either 008// version 2.1 of the License, or (at your option) any later version. 009// 010// This library is distributed in the hope that it will be useful, 011// but WITHOUT ANY WARRANTY; without even the implied warranty of 012// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 013// Lesser General Public License for more details. 014// 015// You should have received a copy of the GNU Lesser General Public 016// License along with this library; if not, write to the Free Software 017// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 018/////////////////////////////////////////////////////////////////////////////////////////////// 019 020package com.puppycrawl.tools.checkstyle.site; 021 022import java.io.File; 023import java.io.IOException; 024import java.io.PrintWriter; 025import java.nio.charset.StandardCharsets; 026import java.nio.file.Files; 027import java.nio.file.Path; 028import java.util.ArrayList; 029import java.util.Arrays; 030import java.util.HashSet; 031import java.util.LinkedHashMap; 032import java.util.LinkedHashSet; 033import java.util.List; 034import java.util.Locale; 035import java.util.Map; 036import java.util.Set; 037import java.util.logging.Level; 038import java.util.logging.Logger; 039import java.util.regex.Matcher; 040import java.util.regex.Pattern; 041import java.util.stream.Collectors; 042 043import javax.xml.parsers.DocumentBuilder; 044import javax.xml.parsers.DocumentBuilderFactory; 045import javax.xml.parsers.ParserConfigurationException; 046 047import 
org.w3c.dom.Document; 048import org.w3c.dom.Element; 049import org.w3c.dom.NodeList; 050import org.xml.sax.SAXException; 051 052/** 053 * Generates {@code search-index.json} from the Checkstyle XDoc source files. 054 * 055 * <p>This is a plain Java {@code main()} class - no Maven plugin API required. 056 * It is invoked by {@code exec-maven-plugin} during the {@code process-classes} 057 * phase so the index is ready when Maven Site copies static resources.</p> 058 * 059 * <p>Output is written as a JSON file. The search widget fetches this file 060 * using the fetch API and parses it to populate the search index.</p> 061 * 062 * <h2>Key design decisions</h2> 063 * <ul> 064 * <li><b>No duplicates.</b> Only plain {@code .xml} files are processed for 065 * check/filter/filefilter directories. The {@code .xml.template} and 066 * {@code .xml.vm} siblings are pre-render source files that would produce 067 * identical URLs and duplicate entries. A secondary URL-keyed dedup guard 068 * is also applied across the entire output list.</li> 069 * 070 * <li><b>Identifiable example titles.</b> Both {@code -config} and 071 * {@code -code} example paragraphs are indexed. Their titles use the 072 * pattern {@code "<CheckName>: Example1 [config]"} and 073 * {@code "<CheckName>: Example1 [code]"} so users can distinguish a 074 * configuration snippet from its matching Java code example in search 075 * results.</li> 076 * 077 * <li><b>Full general-page indexing.</b> Each meaningful {@code <section>} 078 * in general documentation pages (e.g. {@code config_system_properties}, 079 * {@code writingchecks}, {@code cmdline}) is indexed as its own entry 080 * with the full section text used for keyword extraction - not just the 081 * first sentence. This makes page-internal headings discoverable.</li> 082 * 083 * <li><b>Disambiguated generic titles.</b> Structural section names that are 084 * repeated across many pages (e.g. 
"Overview", "Debug", "Contributing") 085 * are prefixed with the page title, yielding e.g. 086 * "Eclipse IDE: Debug" instead of a bare "Debug" that collides with 087 * "IntelliJ IDE: Debug".</li> 088 * 089 * <li><b>Junk pages excluded.</b> Release notes, auto-generated style 090 * coverage reports and bare category aggregator stubs are skipped.</li> 091 * </ul> 092 * 093 * <p>Usage (called by exec-maven-plugin in pom.xml):</p> 094 * <pre> 095 * java SearchIndexGenerator <xdocsDir> <outputFilePath> 096 * java SearchIndexGenerator src/site/xdoc target/site/search-index.json 097 * </pre> 098 */ 099public final class SearchIndexGenerator { 100 101 /** String literal for checks directory. */ 102 private static final String CHECKS = "checks"; 103 104 /** String literal for comma. */ 105 private static final String COMMA_STR = ","; 106 107 /** String literal for space. */ 108 private static final String SPACE = " "; 109 110 /** Character literal for space. */ 111 private static final char SPACE_CHAR = ' '; 112 113 /** String literal for colon separator used in disambiguated titles. */ 114 private static final String TITLE_SEPARATOR = ": "; 115 116 /** String literal for ellipsis. */ 117 private static final String ELLIPSIS = "..."; 118 119 /** String literal for external general entities feature. */ 120 private static final String EXTERNAL_GENERAL_ENTITIES = 121 "http://xml.org/sax/features/external-general-entities"; 122 123 /** String literal for external parameter entities feature. */ 124 private static final String EXTERNAL_PARAMETER_ENTITIES = 125 "http://xml.org/sax/features/external-parameter-entities"; 126 127 /** String literal for General category. */ 128 private static final String GENERAL = "General"; 129 130 /** String literal for Example document type. */ 131 private static final String EXAMPLE_TYPE = "Example"; 132 133 /** String literal for Property document type. 
*/
    private static final String PROPERTY_TYPE = "Property";

    /** Tag name of the XDoc {@code <subsection>} element. */
    private static final String SUBSECTION = "subsection";

    /** Name of the {@code name} attribute carried by sections and subsections. */
    private static final String NAME_ATTR = "name";

    /** Name of the {@code id} attribute, read from example paragraphs. */
    private static final String ID_ATTR = "id";

    /** File name of directory index pages, which are skipped when scanning a directory. */
    private static final String INDEX_XML = "index.xml";

    /**
     * Placeholder title/section name. It is compared case-insensitively and
     * treated as "no meaningful name".
     */
    private static final String CONTENT = "Content";

    /** Lowercase fragment used to locate the Examples subsection by name. */
    private static final String EXAMPLES_SUBSECTION = "examples";

    /** Tag name of the XDoc {@code <body>} element. */
    private static final String BODY = "body";

    /** Tag name of the XDoc {@code <section>} element. */
    private static final String SECTION = "section";

    /** Tag name of the XDoc {@code <title>} element. */
    private static final String TITLE = "title";

    /** Lowercase fragment used to locate the Description subsection by name. */
    private static final String DESCRIPTION = "description";

    /** Separator placed between a page URL and an in-page anchor. */
    private static final String ANCHOR_SEPARATOR = "#";

    /**
     * Lowercase fragment used to locate the Properties subsection by name.
     * It is a fragment rather than the full word because matching is done
     * with a "contains" test.
     */
    private static final String PROPERTIES_FRAGMENT = "propert";

    /**
     * Log message pattern for a skipped file: {@code {0}} is the file name,
     * {@code {1}} is the exception message.
     */
    private static final String SKIPPING_MSG = "[SearchIndex] WARN: skipping {0} - {1}";

    /**
     * Suffix label appended to example titles for configuration snippets.
     * Yields e.g. "AnnotationLocation: Example1 [config]".
     */
    private static final String EXAMPLE_LABEL_CONFIG = " [config]";

    /**
     * Suffix label appended to example titles for Java code examples.
     * Yields e.g. "AnnotationLocation: Example1 [code]".
     */
    private static final String EXAMPLE_LABEL_CODE = " [code]";

    /**
     * Minimum word length. Presumably words shorter than this are ignored
     * during keyword extraction; the consumer is not in this part of the file.
     */
    private static final int MIN_WORD_LENGTH = 2;

    /**
     * Maximum number of keywords. Presumably the cap applied per entry during
     * keyword extraction; the consumer is not in this part of the file.
     */
    private static final int MAX_KEYWORDS = 15;

    /** Maximum length passed to {@code truncate} when building entry descriptions. */
    private static final int MAX_DESCRIPTION_LENGTH = 150;

    /** One or more whitespace characters; used to collapse text runs to a single space. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /** One or more characters that are not a lowercase ASCII letter or a digit. */
    private static final Pattern NON_ALPHANUMERIC = Pattern.compile("[^a-z0-9]+");

    /**
     * Matches names ending in plain {@code .xml} (not {@code .xml.vm} or
     * {@code .xml.template}). Used by the directory scanners so that
     * pre-render source templates are not processed and do not produce
     * duplicate index entries.
     */
    private static final Pattern PLAIN_XML = Pattern.compile("\\.xml$");

    /**
     * Matches a trailing {@code .xml}, {@code .xml.vm} or {@code .xml.template}.
     * Used to strip the document extension from a file name, for example when
     * a fallback title is derived from the file name.
     */
    private static final Pattern DOC_EXTENSION =
        Pattern.compile("\\.xml$|\\.xml\\.vm$|\\.xml\\.template$");

    /**
     * Matches {@code config_<category>.xml} files that redirect to check category pages.
     * Captures the category name (e.g. "metrics" from "config_metrics.xml") in group 1.
     */
    private static final Pattern CONFIG_CATEGORY =
        Pattern.compile("^config_(.+)\\.xml$");

    /**
     * Matches an example paragraph {@code id} attribute that has a suffix of
     * either {@code -config} or {@code -code}, capturing the base label
     * (e.g. "Example1") in group 1 and the type ("config" or "code") in
     * group 2.
     *
     * <p>Example ids found in XDoc source:</p>
     * <ul>
     *   <li>{@code id="Example1-config"} -&gt; label "Example1", type "config"</li>
     *   <li>{@code id="Example1-code"} -&gt; label "Example1", type "code"</li>
     * </ul>
     */
    private static final Pattern EXAMPLE_PARAGRAPH_ID =
        Pattern.compile("^(Example\\d+)-(config|code)$");

    /**
     * Generic section/subsection names that are structurally repeated across
     * many unrelated general pages (IDE setup guides, writing-* guides, etc).
     * On their own they are meaningless in search results ("Debug" appears
     * identically in eclipse.xml, idea.xml, and netbeans.xml) so when one of
     * these is used as a section title it is always disambiguated with the
     * source page's own title, e.g. "Eclipse IDE: Debug".
     * All names are stored in lower case.
     */
    private static final Set<String> GENERIC_SECTION_NAMES = new HashSet<>(Arrays.asList(
        "overview", DESCRIPTION, EXAMPLES_SUBSECTION, "example", "debug",
        "contributing", "limitations", "parameters", "installation"
    ));

    /**
     * Category mapping: lowercase XDoc subdirectory name to display label.
     * Directories without a mapping fall back to a capitalised directory name.
     */
    private static final Map<String, String> CATEGORY_MAP = new LinkedHashMap<>();

    static {
        CATEGORY_MAP.put("annotation", "Annotation");
        CATEGORY_MAP.put("blocks", "Block Checks");
        CATEGORY_MAP.put("coding", "Coding");
        CATEGORY_MAP.put("design", "Class Design");
        CATEGORY_MAP.put("header", "Headers");
        CATEGORY_MAP.put("imports", "Imports");
        CATEGORY_MAP.put("javadoc", "Javadoc Comments");
        CATEGORY_MAP.put("metrics", "Metrics");
        CATEGORY_MAP.put("misc", "Miscellaneous");
        CATEGORY_MAP.put("modifier", "Modifiers");
        CATEGORY_MAP.put("naming", "Naming Conventions");
        CATEGORY_MAP.put("regexp", "Regexp");
        CATEGORY_MAP.put("sizes", "Size Violations");
        CATEGORY_MAP.put("whitespace", "Whitespace");
    }

    /** Stop words: too generic to be useful as search keywords.
*/ 275 private static final Set<String> STOP_WORDS = new HashSet<>(Arrays.asList( 276 "a", "an", "the", "and", "or", "of", "to", "in", "is", "it", 277 "that", "this", "for", "on", "with", "are", "be", "by", "at", 278 "as", "if", "its", "from", "which", "whether", "can", "will", 279 "has", "have", "not", "also", "only", "any", "all", "each", 280 "more", "than", "when", "then", "into", "such", "use", "used", 281 "check", CHECKS, "checkstyle" 282 )); 283 284 /** Logger for this class. */ 285 private final Logger logger = Logger.getLogger(getClass().getName()); 286 287 /** Accumulated search index entries. */ 288 private List<SearchIndexEntry> entries; 289 290 /** Deduplication guard for URLs. */ 291 private Set<String> seenUrls; 292 293 /** Prevent instantiation. */ 294 private SearchIndexGenerator() { 295 } 296 297 /** 298 * Main entry point called by exec-maven-plugin. 299 * 300 * @param args args[0] = path to src/xdocs, args[1] = path to target/site 301 * @throws IOException on file write failure 302 * @throws IllegalArgumentException if args are missing 303 * @throws IllegalStateException if xdocsDir is missing 304 * @noinspectionreason UseOfSystemOutOrSystemErr - main method of a CLI utility 305 */ 306 public static void main(String... args) throws IOException { 307 new SearchIndexGenerator().execute(args); 308 } 309 310 /** 311 * Internal execution method to avoid static context for the logger. 312 * 313 * @param args args[0] = path to src/xdocs, args[1] = output file path 314 * @throws IOException on file write failure 315 * @throws IllegalArgumentException if args are missing 316 * @throws IllegalStateException if xdocsDir is missing 317 */ 318 private void execute(String... 
args) throws IOException { 319 if (args.length < 2) { 320 throw new IllegalArgumentException( 321 "Usage: SearchIndexGenerator <xdocsDir> <outputFilePath>"); 322 } 323 324 final Path xdocsPath = Path.of(args[0]); 325 final Path outputFilePath = Path.of(args[1]); 326 final File xdocsDir = xdocsPath.toFile(); 327 328 if (!Files.exists(xdocsPath)) { 329 final String error = "[SearchIndex] ERROR: xdocsDir not found: " 330 + xdocsPath.toAbsolutePath(); 331 throw new IllegalStateException(error); 332 } 333 334 if (logger.isLoggable(Level.INFO)) { 335 logger.log(Level.INFO, "[SearchIndex] Reading XDocs from: {0}", xdocsPath); 336 } 337 338 seenUrls = new LinkedHashSet<>(); 339 entries = new ArrayList<>(); 340 341 final Path checksPath = xdocsPath.resolve(CHECKS); 342 if (Files.exists(checksPath)) { 343 processChecksDirectory(checksPath.toFile(), xdocsDir); 344 } 345 346 final Path filtersPath = xdocsPath.resolve("filters"); 347 if (Files.exists(filtersPath)) { 348 processDirectory(filtersPath.toFile(), xdocsDir, 349 "Filters", "Filter"); 350 } 351 352 final Path fileFiltersPath = xdocsPath.resolve("filefilters"); 353 if (Files.exists(fileFiltersPath)) { 354 processDirectory(fileFiltersPath.toFile(), xdocsDir, 355 "File Filters", "File Filter"); 356 } 357 358 processGeneralPages(xdocsDir); 359 writeJson(entries, outputFilePath); 360 361 if (logger.isLoggable(Level.INFO)) { 362 logger.log(Level.INFO, "[SearchIndex] Done - {0} entries indexed.", 363 entries.size()); 364 } 365 } 366 367 /** 368 * Walks {@code src/xdocs/checks/} and processes each category subdirectory. 
369 * 370 * @param checksDir the checks root directory 371 * @param xdocsDir the xdocs root (used for URL building) 372 */ 373 private void processChecksDirectory(File checksDir, File xdocsDir) { 374 375 final File[] categoryDirs = checksDir.listFiles(File::isDirectory); 376 if (categoryDirs != null) { 377 for (File categoryDir : categoryDirs) { 378 final String dirName = categoryDir.getName().toLowerCase(Locale.ROOT); 379 final String category = CATEGORY_MAP.getOrDefault(dirName, 380 capitalise(dirName)); 381 processDirectory(categoryDir, xdocsDir, category, "Check"); 382 } 383 } 384 } 385 386 /** 387 * Processes all <b>plain</b> {@code .xml} files in a directory 388 * (non-recursive). {@code index.xml} files and any file whose name ends 389 * with {@code .xml.template} or {@code .xml.vm} are skipped. 390 * 391 * <p>Skipping templates is critical: every check page has a sibling 392 * {@code *.xml.template} file that resolves to the <em>same</em> HTML 393 * URL. Without this filter both files would be processed, producing two 394 * identical (or near-identical) main entries plus doubled example and 395 * property entries for every check.</p> 396 * 397 * <p>For each plain {@code .xml} file, the main check/filter entry, 398 * per-example entries (both config and code), and per-property entries 399 * are added.</p> 400 * 401 * @param dir directory to scan 402 * @param xdocsDir xdocs root (used for URL building) 403 * @param category category label for all entries in this directory 404 * @param type document type ("Check", "Filter", "File Filter") 405 */ 406 private void processDirectory(File dir, File xdocsDir, 407 String category, String type) { 408 409 final File[] xmlFiles = dir.listFiles(file -> { 410 return file.isFile() 411 && PLAIN_XML.matcher(file.getName()).find() 412 && !INDEX_XML.equals(file.getName()); 413 }); 414 415 if (xmlFiles != null) { 416 Arrays.sort(xmlFiles); 417 for (File xmlFile : xmlFiles) { 418 try { 419 final Document doc = parseXml(xmlFile); 
420 final String baseUrl = buildUrl(xmlFile, xdocsDir); 421 422 addIfNew(buildMainEntry(doc, xmlFile, category, type, baseUrl)); 423 424 for (SearchIndexEntry entry 425 : extractExampleEntries(doc, baseUrl, category)) { 426 addIfNew(entry); 427 } 428 for (SearchIndexEntry entry 429 : extractPropertyEntries(doc, baseUrl, category)) { 430 addIfNew(entry); 431 } 432 } 433 catch (IOException | SAXException | ParserConfigurationException exception) { 434 if (logger.isLoggable(Level.WARNING)) { 435 logger.log(Level.WARNING, SKIPPING_MSG, 436 new Object[] {xmlFile.getName(), exception.getMessage()}); 437 } 438 } 439 } 440 } 441 } 442 443 /** 444 * Adds entries for the top-level general documentation pages. 445 * 446 * <p>Each remaining page is indexed per top-level {@code <section>}, 447 * using the section's full text content for keyword extraction so 448 * page-internal headings are fully discoverable. Generic structural 449 * section names (see {@link #GENERIC_SECTION_NAMES}) are disambiguated 450 * by prefixing the page's own title.</p> 451 * 452 * @param xdocsDir the xdocs root directory 453 */ 454 private void processGeneralPages(File xdocsDir) { 455 456 final File[] xmlFiles = xdocsDir.listFiles(file -> { 457 final String name = file.getName(); 458 return file.isFile() 459 && PLAIN_XML.matcher(name).find(); 460 }); 461 462 if (xmlFiles != null) { 463 Arrays.sort(xmlFiles); 464 for (File xmlFile : xmlFiles) { 465 try { 466 for (SearchIndexEntry entry : buildGeneralPageEntries(xmlFile)) { 467 addIfNew(entry); 468 } 469 } 470 catch (IOException | SAXException | ParserConfigurationException exception) { 471 if (logger.isLoggable(Level.WARNING)) { 472 logger.log(Level.WARNING, SKIPPING_MSG, 473 new Object[] {xmlFile.getName(), exception.getMessage()}); 474 } 475 } 476 } 477 } 478 } 479 480 /** 481 * Builds the main search entry representing an entire check/filter document. 
482 * 483 * @param doc the parsed XDoc document 484 * @param xmlFile the source file 485 * @param category category label for this file's entry 486 * @param type document type ("Check", "Filter", etc.) 487 * @param baseUrl the page url without anchor 488 * @return an entry representing the document 489 */ 490 private static SearchIndexEntry buildMainEntry(Document doc, File xmlFile, 491 String category, String type, 492 String baseUrl) { 493 final NodeList bodies = doc.getElementsByTagName(BODY); 494 final Element body = (Element) bodies.item(0); 495 final NodeList sections = body.getElementsByTagName(SECTION); 496 497 final String title = extractTitle(doc, xmlFile, sections); 498 final String description = extractAggregateDescription(sections); 499 final String keywords = extractAggregateKeywords(title, sections); 500 501 return new SearchIndexEntry(title, baseUrl, category, type, description, keywords); 502 } 503 504 /** 505 * Builds one search entry per top-level {@code <section>} in a general 506 * documentation page, using each section's full text for keyword 507 * extraction so that page-internal content is fully discoverable. 508 * 509 * <p>Generic structural section names (see {@link #GENERIC_SECTION_NAMES}) 510 * are disambiguated as {@code "<page title>: <section name>"} to avoid 511 * collisions across pages (e.g. 
"Eclipse IDE: Debug" vs 512 * "IntelliJ IDE: Debug").</p> 513 * 514 * @param xmlFile the XDoc source file to parse 515 * @return list of entries, one per top-level section found 516 * @throws ParserConfigurationException on XML parser setup failure 517 * @throws SAXException on XML parse error 518 * @throws IOException on file read failure 519 */ 520 private static List<SearchIndexEntry> buildGeneralPageEntries(File xmlFile) 521 throws ParserConfigurationException, SAXException, IOException { 522 523 final List<SearchIndexEntry> results = new ArrayList<>(); 524 final Document doc = parseXml(xmlFile); 525 526 final NodeList bodies = doc.getElementsByTagName(BODY); 527 if (bodies.getLength() != 0) { 528 final Element body = (Element) bodies.item(0); 529 final NodeList sections = body.getElementsByTagName(SECTION); 530 final String pageUrl = resolvePageUrl(xmlFile, xmlFile.getParentFile()); 531 final String pageTitle = derivePageTitle(doc, xmlFile); 532 533 if (sections.getLength() == 0) { 534 final String fullText = WHITESPACE.matcher(body.getTextContent()) 535 .replaceAll(SPACE).trim(); 536 final String description = extractFirstSentenceOrTruncated(fullText); 537 final String keywords = extractKeywordsFromText( 538 pageTitle + SPACE + fullText); 539 results.add(new SearchIndexEntry( 540 pageTitle, pageUrl, GENERAL, GENERAL, description, keywords)); 541 } 542 else { 543 for (int index = 0; index < sections.getLength(); index++) { 544 final Element section = (Element) sections.item(index); 545 if (body.equals(section.getParentNode())) { 546 final String sectionName = section.getAttribute(NAME_ATTR).trim(); 547 if (!sectionName.isEmpty() && !CONTENT.equalsIgnoreCase(sectionName)) { 548 549 final String entryTitle = disambiguateTitle(sectionName, pageTitle); 550 final String anchor = doxiaAnchorFor(sectionName); 551 final String url = pageUrl + ANCHOR_SEPARATOR + anchor; 552 553 final String sectionText = WHITESPACE.matcher(section.getTextContent()) 554 
.replaceAll(SPACE).trim(); 555 final String description = extractFirstSentenceOrTruncated(sectionText); 556 final String keywords = extractKeywordsFromText( 557 pageTitle + SPACE + sectionName + SPACE + sectionText); 558 559 results.add(new SearchIndexEntry( 560 entryTitle, url, GENERAL, GENERAL, description, keywords)); 561 } 562 } 563 } 564 } 565 } 566 567 return results; 568 } 569 570 /** 571 * Extracts per-example search entries from a check/filter document. 572 * 573 * <p>Both {@code -config} and {@code -code} example paragraphs are 574 * indexed so users can find both the configuration snippet and the 575 * corresponding Java code example independently in search results.</p> 576 * 577 * <p>Titles use the pattern {@code "<CheckName>: Example1 [config]"} and 578 * {@code "<CheckName>: Example1 [code]"} to make the type immediately 579 * visible in search result listings without needing to open the page.</p> 580 * 581 * <p>Confirmed XDoc template structure for the Examples subsection:</p> 582 * <pre> 583 * <p id="Example1-config">To configure the check...</p> 584 * <macro name="example"><param name="type" value="config"/></macro> 585 * <p id="Example1-code">Example:</p> 586 * <macro name="example"><param name="type" value="code"/></macro> 587 * </pre> 588 * 589 * @param doc the parsed XDoc document 590 * @param baseUrl the page url without anchor 591 * @param category category label 592 * @return list of per-example entries (both config and code); empty if 593 * none found 594 */ 595 private static List<SearchIndexEntry> extractExampleEntries(Document doc, 596 String baseUrl, 597 String category) { 598 599 final List<SearchIndexEntry> exampleEntries = new ArrayList<>(); 600 final NodeList bodies = doc.getElementsByTagName(BODY); 601 if (bodies.getLength() != 0) { 602 final Element body = (Element) bodies.item(0); 603 final NodeList sections = body.getElementsByTagName(SECTION); 604 605 for (int sectionIdx = 0; sectionIdx < sections.getLength(); sectionIdx++) { 
606 final Element section = (Element) sections.item(sectionIdx); 607 final String checkName = section.getAttribute(NAME_ATTR).trim(); 608 final Element examplesSubsection = 609 findSubsectionByPrefix(section, EXAMPLES_SUBSECTION); 610 611 if (examplesSubsection == null) { 612 continue; 613 } 614 615 final NodeList paragraphs = examplesSubsection.getElementsByTagName("p"); 616 617 for (int paragraphIndex = 0; paragraphIndex < paragraphs.getLength(); 618 paragraphIndex++) { 619 final Element paragraph = (Element) paragraphs.item(paragraphIndex); 620 final SearchIndexEntry entry = buildExampleEntry( 621 paragraph, checkName, baseUrl, category); 622 if (entry != null) { 623 exampleEntries.add(entry); 624 } 625 } 626 } 627 } 628 629 return exampleEntries; 630 } 631 632 /** 633 * Builds a single example entry from a paragraph element. 634 * 635 * @param paragraph the paragraph element containing the example 636 * @param checkName the name of the check 637 * @param baseUrl the base URL for the page 638 * @param category the category label 639 * @return a SearchIndexEntry if the paragraph matches the example pattern, 640 * null otherwise 641 */ 642 private static SearchIndexEntry buildExampleEntry(Element paragraph, 643 String checkName, 644 String baseUrl, 645 String category) { 646 final String id = paragraph.getAttribute(ID_ATTR); 647 final Matcher matcher = EXAMPLE_PARAGRAPH_ID.matcher(id); 648 SearchIndexEntry result = null; 649 650 if (matcher.matches()) { 651 final String exampleLabel = matcher.group(1); 652 final String exampleType = matcher.group(2); 653 654 final String labelSuffix; 655 if ("config".equals(exampleType)) { 656 labelSuffix = EXAMPLE_LABEL_CONFIG; 657 } 658 else { 659 labelSuffix = EXAMPLE_LABEL_CODE; 660 } 661 662 final String introText = WHITESPACE 663 .matcher(paragraph.getTextContent()) 664 .replaceAll(SPACE).trim(); 665 666 final String title = checkName + TITLE_SEPARATOR 667 + exampleLabel + labelSuffix; 668 final String url = baseUrl + 
ANCHOR_SEPARATOR + id; 669 final String description = 670 truncate(introText, MAX_DESCRIPTION_LENGTH); 671 final String keywords = extractKeywordsFromText( 672 checkName + SPACE + exampleLabel 673 + SPACE + exampleType + SPACE + introText); 674 675 result = new SearchIndexEntry( 676 title, url, category, EXAMPLE_TYPE, 677 description, keywords); 678 } 679 680 return result; 681 } 682 683 /** 684 * Extracts per-property search entries from a check/filter document. 685 * 686 * <p>Each row of the Properties table is indexed under the title 687 * {@code "<CheckName>: <propertyName>"} and linked to the property's 688 * own anchor on the page.</p> 689 * 690 * @param doc the parsed XDoc document 691 * @param baseUrl the page url without anchor 692 * @param category category label 693 * @return list of per-property entries; empty if none found 694 */ 695 private static List<SearchIndexEntry> extractPropertyEntries(Document doc, 696 String baseUrl, 697 String category) { 698 699 final List<SearchIndexEntry> propertyEntries = new ArrayList<>(); 700 final NodeList bodies = doc.getElementsByTagName(BODY); 701 if (bodies.getLength() != 0) { 702 final Element body = (Element) bodies.item(0); 703 final NodeList sections = body.getElementsByTagName(SECTION); 704 705 for (int sectionIdx = 0; sectionIdx < sections.getLength(); sectionIdx++) { 706 final Element section = (Element) sections.item(sectionIdx); 707 final Element propertiesSubsection = 708 findSubsectionByPrefix(section, PROPERTIES_FRAGMENT); 709 710 if (propertiesSubsection != null) { 711 final String checkName = section.getAttribute(NAME_ATTR).trim(); 712 extractPropertiesFromRows(propertiesSubsection, checkName, baseUrl, 713 category, propertyEntries); 714 } 715 } 716 } 717 718 return propertyEntries; 719 } 720 721 /** 722 * Extracts property entries from table rows and adds them to the list. 
723 * 724 * @param propertiesSubsection the properties subsection element 725 * @param checkName the check name 726 * @param baseUrl the page url without anchor 727 * @param category category label 728 * @param propertyEntries the list to add entries to 729 */ 730 private static void extractPropertiesFromRows(Element propertiesSubsection, 731 String checkName, 732 String baseUrl, 733 String category, 734 List<SearchIndexEntry> propertyEntries) { 735 final NodeList rows = propertiesSubsection.getElementsByTagName("tr"); 736 737 for (int rowIdx = 1; rowIdx < rows.getLength(); rowIdx++) { 738 final Element row = (Element) rows.item(rowIdx); 739 final NodeList cells = row.getElementsByTagName("td"); 740 if (cells.getLength() >= 2) { 741 processPropertyRow(cells, checkName, baseUrl, category, propertyEntries); 742 } 743 } 744 } 745 746 /** 747 * Processes a single property row and adds an entry if valid. 748 * 749 * @param cells the table cells 750 * @param checkName the check name 751 * @param baseUrl the page url without anchor 752 * @param category category label 753 * @param propertyEntries the list to add entries to 754 */ 755 private static void processPropertyRow(NodeList cells, 756 String checkName, 757 String baseUrl, 758 String category, 759 List<SearchIndexEntry> propertyEntries) { 760 final String propName = WHITESPACE 761 .matcher(cells.item(0).getTextContent()) 762 .replaceAll(SPACE).trim(); 763 764 if (!propName.isEmpty()) { 765 final String propDesc = WHITESPACE 766 .matcher(cells.item(1).getTextContent()) 767 .replaceAll(SPACE).trim(); 768 769 final String title = checkName + TITLE_SEPARATOR + propName; 770 final String url = baseUrl + ANCHOR_SEPARATOR + propName; 771 final String description = truncate(propDesc, MAX_DESCRIPTION_LENGTH); 772 final String keywords = extractKeywordsFromText( 773 checkName + SPACE + propName + SPACE + propDesc); 774 775 propertyEntries.add(new SearchIndexEntry( 776 title, url, category, PROPERTY_TYPE, 777 description, 
keywords)); 778 } 779 } 780 781 /** 782 * Adds an entry to the output list only if its URL has not been seen 783 * before. This is a secondary guard that catches any duplicates that 784 * slip through the primary filter (only processing plain {@code .xml} 785 * files), e.g. if a check has the same example paragraph id repeated 786 * across two sections. 787 * 788 * @param entry the entry to conditionally add 789 */ 790 private void addIfNew(SearchIndexEntry entry) { 791 if (seenUrls.add(entry.url())) { 792 entries.add(entry); 793 } 794 } 795 796 /** 797 * Finds a subsection within a section whose lowercased name contains the 798 * given fragment (e.g. "examples" or "propert" to match "Properties"). 799 * 800 * @param section the section to search 801 * @param fragment lowercase fragment to match against the subsection name 802 * @return the matching subsection element, or {@code null} if not found 803 */ 804 private static Element findSubsectionByPrefix(Element section, String fragment) { 805 final NodeList subsections = section.getElementsByTagName(SUBSECTION); 806 Element result = null; 807 for (int index = 0; index < subsections.getLength(); index++) { 808 final Element sub = (Element) subsections.item(index); 809 if (sub.getAttribute(NAME_ATTR).trim() 810 .toLowerCase(Locale.ROOT).contains(fragment)) { 811 result = sub; 812 break; 813 } 814 } 815 return result; 816 } 817 818 /** 819 * Parses the XML file into a Document with external entity resolution 820 * disabled for security. 
821 * 822 * @param xmlFile the XDoc source file 823 * @return the parsed Document 824 * @throws ParserConfigurationException on XML parser setup failure 825 * @throws SAXException on XML parse error 826 * @throws IOException on file read failure 827 */ 828 private static Document parseXml(File xmlFile) 829 throws ParserConfigurationException, SAXException, IOException { 830 final DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); 831 factory.setFeature(EXTERNAL_GENERAL_ENTITIES, false); 832 factory.setFeature(EXTERNAL_PARAMETER_ENTITIES, false); 833 834 final DocumentBuilder builder = factory.newDocumentBuilder(); 835 builder.setErrorHandler(null); 836 837 final Document doc = builder.parse(xmlFile); 838 doc.getDocumentElement().normalize(); 839 return doc; 840 } 841 842 /** 843 * Extracts the document title from the {@code <title>} element, falling 844 * back to the first non-empty, non-"Content" section name, and finally 845 * to a capitalised version of the file name. 
846 * 847 * @param doc the document 848 * @param xmlFile the source file 849 * @param sections the list of sections 850 * @return the title string, never empty 851 */ 852 private static String extractTitle(Document doc, File xmlFile, NodeList sections) { 853 final NodeList titles = doc.getElementsByTagName(TITLE); 854 String title = ""; 855 if (titles.getLength() > 0) { 856 title = titles.item(0).getTextContent().trim(); 857 } 858 859 if ((title.isEmpty() || CONTENT.equalsIgnoreCase(title)) 860 && sections.getLength() > 0) { 861 final String firstSection = 862 ((Element) sections.item(0)).getAttribute(NAME_ATTR).trim(); 863 if (!firstSection.isEmpty() && !CONTENT.equalsIgnoreCase(firstSection)) { 864 title = firstSection; 865 } 866 } 867 868 if (title.isEmpty() || CONTENT.equalsIgnoreCase(title)) { 869 final String name = 870 xmlFile.getName().replaceFirst(DOC_EXTENSION.pattern(), ""); 871 title = capitalise(name.replace('_', ' ')); 872 } 873 return title; 874 } 875 876 /** 877 * Aggregates description from sections, taking the first non-empty 878 * Description subsection found across all sections in the document. 879 * 880 * @param sections list of sections 881 * @return description string, possibly empty 882 */ 883 private static String extractAggregateDescription(NodeList sections) { 884 String description = ""; 885 for (int index = 0; index < sections.getLength(); index++) { 886 description = extractDescription((Element) sections.item(index)); 887 if (!description.isEmpty()) { 888 break; 889 } 890 } 891 return description; 892 } 893 894 /** 895 * Aggregates keywords from sections using all section text so that the 896 * main check entry is discoverable by any term in the document. 
897 * 898 * @param title the document title 899 * @param sections list of sections 900 * @return keywords string 901 */ 902 private static String extractAggregateKeywords(String title, NodeList sections) { 903 final StringBuilder keywordSource = new StringBuilder(title); 904 for (int index = 0; index < sections.getLength(); index++) { 905 final Element section = (Element) sections.item(index); 906 keywordSource.append(SPACE_CHAR) 907 .append(section.getAttribute(NAME_ATTR)) 908 .append(SPACE_CHAR) 909 .append(section.getTextContent()); 910 } 911 return extractKeywordsFromText(keywordSource.toString()); 912 } 913 914 /** 915 * Extracts the first sentence of the Description subsection. 916 * Returns an empty string if no Description subsection is found. 917 * 918 * @param section the {@code <section>} element to search 919 * @return first sentence of the description, or empty string 920 */ 921 private static String extractDescription(Element section) { 922 final Element sub = findSubsectionByPrefix(section, DESCRIPTION); 923 String result = ""; 924 if (sub != null) { 925 final String text = WHITESPACE.matcher(sub.getTextContent()) 926 .replaceAll(SPACE).trim(); 927 result = extractFirstSentenceOrTruncated(text); 928 } 929 return result; 930 } 931 932 /** 933 * Derives a fallback page title from the document's {@code <title>} 934 * element or, failing that, from the filename. 
935 * 936 * @param doc the parsed document 937 * @param xmlFile the source file 938 * @return a non-empty title string 939 */ 940 private static String derivePageTitle(Document doc, File xmlFile) { 941 final NodeList titles = doc.getElementsByTagName(TITLE); 942 String title = ""; 943 if (titles.getLength() > 0) { 944 title = titles.item(0).getTextContent().trim(); 945 } 946 if (title.isEmpty()) { 947 final String name = 948 xmlFile.getName().replaceFirst(DOC_EXTENSION.pattern(), ""); 949 title = capitalise(name.replace('_', ' ')); 950 } 951 return title; 952 } 953 954 /** 955 * Disambiguates a section title when it is a generic, structurally 956 * repeated header (see {@link #GENERIC_SECTION_NAMES}). 957 * Non-generic section names are returned unchanged. 958 * 959 * @param sectionName the raw section name 960 * @param pageTitle the owning page's own title 961 * @return either {@code sectionName} unchanged, or 962 * {@code "<pageTitle>: <sectionName>"} if generic 963 */ 964 private static String disambiguateTitle(String sectionName, String pageTitle) { 965 final String result; 966 if (GENERIC_SECTION_NAMES.contains(sectionName.toLowerCase(Locale.ROOT))) { 967 result = pageTitle + TITLE_SEPARATOR + sectionName; 968 } 969 else { 970 result = sectionName; 971 } 972 return result; 973 } 974 975 /** 976 * Converts a Doxia {@code <section name="...">} value into the anchor id 977 * Doxia generates for it in the rendered HTML by replacing runs of 978 * whitespace with single underscores. 
979 * 980 * @param sectionName the raw {@code name} attribute value 981 * @return the anchor id Doxia would render for this section name 982 */ 983 private static String doxiaAnchorFor(String sectionName) { 984 return WHITESPACE.matcher(sectionName.trim()).replaceAll("_"); 985 } 986 987 /** 988 * Returns the first sentence of the given text (up to and including the 989 * first period), or the text truncated to {@link #MAX_DESCRIPTION_LENGTH} 990 * with an ellipsis if no period is found within range. 991 * 992 * @param text the source text, already whitespace-normalised 993 * @return first sentence or truncated text 994 */ 995 private static String extractFirstSentenceOrTruncated(String text) { 996 final String result; 997 final int dot = text.indexOf('.'); 998 if (dot > 0) { 999 result = text.substring(0, dot + 1).trim(); 1000 } 1001 else { 1002 result = truncate(text, MAX_DESCRIPTION_LENGTH); 1003 } 1004 return result; 1005 } 1006 1007 /** 1008 * Truncates text to the given max length, appending an ellipsis if 1009 * truncation occurred. 1010 * 1011 * @param text the text to truncate 1012 * @param maxLength maximum length before truncation 1013 * @return original text if short enough, otherwise truncated with ellipsis 1014 */ 1015 private static String truncate(String text, int maxLength) { 1016 final String result; 1017 if (text.length() > maxLength) { 1018 result = text.substring(0, maxLength) + ELLIPSIS; 1019 } 1020 else { 1021 result = text; 1022 } 1023 return result; 1024 } 1025 1026 /** 1027 * Builds the root-relative URL for an XDoc file, without any anchor. 1028 * Always uses forward slashes regardless of OS. 
1029 * 1030 * @param xmlFile the source XDoc file 1031 * @param xdocsDir the xdocs root directory 1032 * @return root-relative URL string with no anchor 1033 */ 1034 private static String buildUrl(File xmlFile, File xdocsDir) { 1035 return xdocsDir.toPath() 1036 .relativize(xmlFile.toPath()) 1037 .toString() 1038 .replace(File.separatorChar, '/') 1039 .replaceFirst(DOC_EXTENSION.pattern(), ".html"); 1040 } 1041 1042 /** 1043 * Resolves the correct URL for a general page file. For {@code config_<category>.xml} files 1044 * that redirect to check category pages, maps to {@code checks/<category>/index.html} instead 1045 * of the file path. 1046 * 1047 * @param xmlFile the source XDoc file 1048 * @param xdocsDir the xdocs root directory 1049 * @return the resolved URL 1050 */ 1051 private static String resolvePageUrl(File xmlFile, File xdocsDir) { 1052 String url = buildUrl(xmlFile, xdocsDir); 1053 final Matcher matcher = CONFIG_CATEGORY.matcher(xmlFile.getName()); 1054 if (matcher.find()) { 1055 final String category = matcher.group(1); 1056 if (CATEGORY_MAP.containsKey(category)) { 1057 url = "checks/" + category + "/index.html"; 1058 } 1059 } 1060 return url; 1061 } 1062 1063 /** 1064 * Extracts keywords from free-form text by splitting on non-word 1065 * characters and filtering short and stop words. 1066 * 1067 * @param text input text 1068 * @return comma-separated keyword string (up to {@link #MAX_KEYWORDS} words) 1069 */ 1070 private static String extractKeywordsFromText(String text) { 1071 String result = ""; 1072 if (text != null && !text.isEmpty()) { 1073 result = NON_ALPHANUMERIC.splitAsStream(text.toLowerCase(Locale.ROOT)) 1074 .filter(word -> { 1075 return word.length() >= MIN_WORD_LENGTH 1076 && !STOP_WORDS.contains(word); 1077 }) 1078 .distinct() 1079 .limit(MAX_KEYWORDS) 1080 .collect(Collectors.joining(COMMA_STR)); 1081 } 1082 return result; 1083 } 1084 1085 /** 1086 * Writes all index entries to the output file. 
1087 * 1088 * @param indexEntries the list of entries to serialise 1089 * @param outputFilePath the full path to the output file 1090 * @throws IOException on file write failure 1091 */ 1092 private void writeJson(List<SearchIndexEntry> indexEntries, Path outputFilePath) 1093 throws IOException { 1094 1095 final Path outputPath = outputFilePath.getParent(); 1096 if (outputPath != null) { 1097 Files.createDirectories(outputPath); 1098 } 1099 1100 try (PrintWriter writer = new PrintWriter(Files.newBufferedWriter( 1101 outputFilePath, StandardCharsets.UTF_8))) { 1102 writer.println("["); 1103 1104 final int size = indexEntries.size(); 1105 for (int index = 0; index < size; index++) { 1106 final String comma; 1107 if (index < size - 1) { 1108 comma = COMMA_STR; 1109 } 1110 else { 1111 comma = ""; 1112 } 1113 writer.println(" " + indexEntries.get(index).toJson() + comma); 1114 } 1115 writer.println("]"); 1116 } 1117 1118 if (logger.isLoggable(Level.INFO)) { 1119 logger.log(Level.INFO, 1120 "[SearchIndex] Written: {0}", outputFilePath.toAbsolutePath()); 1121 } 1122 } 1123 1124 /** 1125 * Capitalises the first character of a string. 1126 * 1127 * @param input the string to capitalise 1128 * @return string with first character uppercased, or input unchanged if 1129 * empty 1130 */ 1131 private static String capitalise(String input) { 1132 String result = input; 1133 if (input != null && !input.isEmpty()) { 1134 result = Character.toUpperCase(input.charAt(0)) + input.substring(1); 1135 } 1136 return result; 1137 } 1138}