@@ -900,6 +900,7 @@ private static String extractRelevantJavaDocContent(org.eclipse.jdt.core.IType t
900900
901901 // Clean Javadoc comment for processing
902902 String cleanedJavadoc = cleanJavadocComment (rawJavadoc );
903+ cleanedJavadoc = removeHtmlTags (cleanedJavadoc );
903904 cleanedJavadoc = convertHtmlEntities (cleanedJavadoc );
904905
905906 // === High Priority: Extract class description text (first paragraph) ===
@@ -1036,6 +1037,29 @@ private static String convertHtmlEntities(String text) {
10361037 .replace ("–" , "-" );
10371038 }
10381039
1040+ /**
1041+ * Remove all HTML tags from text, keeping only plain text content.
1042+ * Preserves line breaks for block-level tags like <p>, <br>, <div>.
1043+ */
1044+ private static String removeHtmlTags (String text ) {
1045+ if (text == null || text .isEmpty ()) {
1046+ return text ;
1047+ }
1048+
1049+ // Replace block-level tags with line breaks
1050+ text = text .replaceAll ("(?i)</(p|div|li)>|<br\\ s*/?>|<p[^>]*>" , "\n " );
1051+
1052+ // Remove all remaining HTML tags
1053+ text = text .replaceAll ("<[^>]+>" , "" );
1054+
1055+ // Clean up whitespace: collapse spaces, trim lines, limit line breaks
1056+ text = text .replaceAll ("[ \\ t]+" , " " )
1057+ .replaceAll (" *\\ n *" , "\n " )
1058+ .replaceAll ("\\ n{3,}" , "\n \n " );
1059+
1060+ return text .trim ();
1061+ }
1062+
10391063 /**
10401064 * Extract method JavaDoc content directly for LLM consumption.
10411065 * Returns cleaned JavaDoc without artificial truncation - let LLM understand the full context.
@@ -1056,6 +1080,7 @@ private static String extractMethodJavaDocSummary(IMethod method) {
10561080
10571081 // Just clean and return - let LLM understand the full context
10581082 String cleaned = cleanJavadocComment (rawJavadoc );
1083+ cleaned = removeHtmlTags (cleaned );
10591084 return convertHtmlEntities (cleaned );
10601085
10611086 } catch (Exception e ) {
@@ -1176,6 +1201,7 @@ private static String extractFieldJavaDocSummary(org.eclipse.jdt.core.IField fie
11761201
11771202 // Just clean and return - let LLM understand the full context
11781203 String cleaned = cleanJavadocComment (rawJavadoc );
1204+ cleaned = removeHtmlTags (cleaned );
11791205 return convertHtmlEntities (cleaned );
11801206
11811207 } catch (Exception e ) {
0 commit comments